
#pragma once

#include "namespace_config.h"

#include <cuda.h>
#include <vector>

namespace FLASH_NAMESPACE {
constexpr int TOTAL_DIM = 0;
constexpr int H_DIM = 1;
constexpr int D_DIM = 2;
typedef int64_t index_t;

////////////////////////////////////////////////////////////////////////////////////////////////////

struct QKV_params {
    // The QKV matrices.
    void *__restrict__ q_ptr;   // Query tensor [batch_size, num_heads, query_len, head_dim]
    void *__restrict__ k_ptr;   // Key tensor [batch_size, num_kv_heads, key_len, head_dim]
    void *__restrict__ v_ptr;   // Value tensor [batch_size, num_kv_heads, key_len, head_dim]

    // The stride between rows of the Q, K and V matrices.
    index_t q_batch_stride;     // Stride between batches of Q
    index_t k_batch_stride;     // Stride between batches of K
    index_t v_batch_stride;     // Stride between batches of V
    index_t q_row_stride;       // Stride between rows of Q
    index_t k_row_stride;       // Stride between rows of K
    index_t v_row_stride;       // Stride between rows of V
    index_t q_head_stride;      // Stride between heads of Q
    index_t k_head_stride;      // Stride between heads of K
    index_t v_head_stride;      // Stride between heads of V

    // The number of heads.
    int h, h_k;
    // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be
    // different from nheads (query).
    int h_h_k_ratio;    // precompute h / h_k,
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Mask_params {
    void * __restrict__ mask_ptr;       // Attention mask tensor [batch_size, num_mask_heads, query_len, key_len]

    // The stride of the attention mask tensors.
    index_t mask_batch_stride;          // Stride between batches of attention mask
    index_t mask_head_stride;           // Stride between heads of attention mask
    index_t mask_row_stride;            // Stride between rows of attention mask

    // The number of heads in the mask.
    int h_mask;
    int h_h_mask_ratio; // precompute h / h_mask

    bool has_mask;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Bias_params {
    void *__restrict__ bias_ptr;        // Attention bias tensor [batch_size, num_bias_heads, query_len, key_len]

    // The stride of the attention bias tensor.
    index_t bias_batch_stride;          // Stride between batches of attention bias
    index_t bias_head_stride;           // Stride between heads of attention bias
    index_t bias_row_stride;            // Stride between rows of attention bias

    // The number of heads in the bias.
    int h_bias;
    int h_h_bias_ratio; // precompute h / h_bias

    bool has_bias;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Flash_fwd_params : public QKV_params, public Mask_params, public Bias_params {

    // The O matrix.
    void * __restrict__ o_ptr;
    void * __restrict__ oaccum_ptr;

    // The stride between rows of O.
    index_t o_batch_stride;
    index_t o_row_stride;
    index_t o_head_stride;

    // The pointer to the P matrix.
    void * __restrict__ p_ptr;

    // The pointer to the softmax sum.
    void * __restrict__ softmax_lse_ptr;
    void * __restrict__ softmax_lseaccum_ptr;

    // The dimensions.
    int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, total_q, total_k;

    // The scaling factors for the kernel.
    float scale_softmax;
    float scale_softmax_log2;
    float softcap;

    // array of length b+1 holding starting offset of each sequence.
    int * __restrict__ cu_seqlens_q;
    int * __restrict__ cu_seqlens_k;
    int * __restrict__ leftpad_k;

    // If provided, the actual length of each k sequence.
    int * __restrict__ seqused_k;

    // TODO: block mask for less memory usage
    int *__restrict__ blockmask;

    // The K_new and V_new matrices.
    void * __restrict__ knew_ptr;
    void * __restrict__ vnew_ptr;

    // The stride between rows of the Q, K and V matrices.
    index_t knew_batch_stride;
    index_t vnew_batch_stride;
    index_t knew_row_stride;
    index_t vnew_row_stride;
    index_t knew_head_stride;
    index_t vnew_head_stride;

    // The cos and sin matrices for rotary embedding.
    void * __restrict__ rotary_cos_ptr;
    void * __restrict__ rotary_sin_ptr;

    // The indices to index into the KV cache.
    int * __restrict__ cache_batch_idx;

    // Paged KV cache
    int * __restrict__ block_table;
    index_t block_table_batch_stride;
    int page_block_size;

    bool is_bf16;
    bool is_causal;

    // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
    // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
    bool is_seqlens_k_cumulative;

    int num_splits;  // For split-KV version

    bool unpadded_lse;  // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q].
    bool seqlenq_ngroups_swapped;  // q has been transposed from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d).
};

////////////////////////////////////////////////////////////////////////////////////////////////////

struct Flash_bwd_params : public Flash_fwd_params {

    // The dO and dQKV and dBias matrices.
    void *__restrict__ do_ptr;
    void *__restrict__ dq_ptr;
    void *__restrict__ dk_ptr;
    void *__restrict__ dv_ptr;
    void *__restrict__ dbias_ptr;

    // To accumulate dQ
    void *__restrict__ dq_accum_ptr;
    void *__restrict__ dk_accum_ptr;
    void *__restrict__ dv_accum_ptr;

    // // To accumulate dK and dV in case we're splitting the bwd along seqlen_q
    // dimension void *__restrict__ dk_accum_ptr; void *__restrict__
    // dv_accum_ptr;

    // The stride between rows of the dO, dQ, dK and dV matrices.
    // TD [2022-04-16]: We're using 32-bit indexing to save registers.
    // The code probably won't work for arrays larger than 2GB.
    index_t do_batch_stride;
    index_t do_row_stride;
    index_t do_head_stride;
    index_t dq_batch_stride;
    index_t dk_batch_stride;
    index_t dv_batch_stride;
    index_t dq_row_stride;
    index_t dk_row_stride;
    index_t dv_row_stride;
    index_t dq_head_stride;
    index_t dk_head_stride;
    index_t dv_head_stride;
    index_t dbias_batch_stride;
    index_t dbias_head_stride;
    index_t dbias_row_stride;

    // The pointer to the softmax d sum.
    void *__restrict__ dsoftmax_sum;

    bool deterministic;
    index_t dq_accum_split_stride;
};

////////////////////////////////////////////////////////////////////////////////////////////////////

template<typename T, int Headdim, bool Is_causal, bool Has_mask, bool Has_bias> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
template<typename T, int Headdim, bool Is_causal, bool Has_mask, bool Has_bias> void run_mha_fwd_splitkv_dispatch(Flash_fwd_params &params, cudaStream_t stream);

template<typename T, int Headdim, bool Is_causal, bool Has_mask, bool Has_bias> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);

}  // namespace FLASH_NAMESPACE
