# qwen_flash_attn2_forward_PyramidKV
# qwen_attn_forward_PyramidKV
# qwen_sdpa_attn_forward_PyramidKV

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List, Optional, Tuple, Union
import warnings
from transformers.cache_utils import Cache, DynamicCache, StaticCache 
from transformers.models.qwen2.modeling_qwen2 import (
    apply_rotary_pos_emb,
    repeat_kv,
)
from transformers.modeling_outputs import BaseModelOutputWithPast
from transformers.utils import (
    logging,
    is_flash_attn_greater_or_equal_2_10,
)
from transformers.modeling_flash_attention_utils import _flash_attention_forward 

from pyramidkv.pyramidkv_utils import init_pyramidkv, init_snapkv, init_CAM, init_H2O, init_StreamingLLM, init_l2norm, init_adakv, init_headkv
import math
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
from pyramidkv.pyramidkv_utils import DynamicCacheSplitHeadFlatten

# Module-level logger created through the `logging` helper imported from `transformers.utils`.
logger = logging.get_logger(__name__)

def qwen_flash_attn2_forward_PyramidKV(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    cache_position: Optional[torch.LongTensor] = None,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """
    Qwen2 Flash Attention 2 forward pass with PyramidKV integration.

    When the freshly projected keys cover the whole tracked sequence length
    (i.e. nothing was counted before this call), the keys/values are passed to
    ``self.kv_cluster.update_kv`` and the tensors it returns are stored in the
    cache, while attention for the current call is still computed over the
    full keys/values. Otherwise the new keys/values are appended to the cache
    and attention runs over what the cache returns.

    Args:
        hidden_states: Input of shape ``(batch, q_len, hidden)``.
        attention_mask: Optional mask forwarded to ``update_kv`` and to the
            flash-attention helper.
        position_ids: Optional position indices used for RoPE.
        past_key_value: Optional cache object; a ``StaticCache`` is rejected.
        output_attentions: Ignored; this implementation never returns weights.
        use_cache: Unused in this function body.
        cache_position: Forwarded to the cache through ``cache_kwargs``.
        **kwargs: Only the deprecated ``padding_mask`` key is consumed.

    Returns:
        ``(attn_output, None, past_key_value)``.

    Raises:
        ValueError: If ``past_key_value`` is a ``StaticCache``, if a cache is
            given but ``self.layer_idx`` is ``None``, or if the sliding-window
            slice of the cached keys has an unexpected length.
    """
    if isinstance(past_key_value, StaticCache):
        raise ValueError(
            "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
            "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
        )

    # Called on every forward; sets up the PyramidKV state on `self`
    # (this function later relies on `self.kv_cluster`).
    init_pyramidkv(self, num_hidden_layers=self.config.num_hidden_layers)

    if "padding_mask" in kwargs:
        warnings.warn(
            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
        )
        attention_mask = kwargs.pop("padding_mask")

    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # (batch, seq, heads * dim) -> (batch, heads, seq, dim)
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        # Prefer the length tracked on the module (the cache itself may hold
        # fewer entries than tokens seen); fall back to the cache otherwise.
        tracked_len = getattr(self, "kv_seq_len", 0)
        if tracked_len != 0:
            kv_seq_len += tracked_len
        else:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    # Apply RoPE; the table must cover the largest position index in use.
    rotary_seq_len = (
        max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
    )

    cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    # Repeat k/v heads before calling kv_cluster.update_kv
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if past_key_value is not None:
        # Activate slicing cache only if the config has a value `sliding_windows` attribute
        cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
        if (
            getattr(self.config, "sliding_window", None) is not None
            and kv_seq_len > self.config.sliding_window
            and cache_has_contents
        ):
            slicing_tokens = 1 - self.config.sliding_window

            # NOTE(review): the sliced keys are only used for the shape check
            # below; they are not written back to the cache in this function.
            past_key = past_key_value[self.layer_idx][0]
            past_key = past_key[:, :, slicing_tokens:, :].contiguous()

            if past_key.shape[-2] != self.config.sliding_window - 1:
                raise ValueError(
                    f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                    f" {past_key.shape}"
                )

            if attention_mask is not None:
                attention_mask = attention_mask[:, slicing_tokens:]
                attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)

        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}

        # PyramidKV update
        if key_states.shape[-2] == kv_seq_len:
            # Nothing was counted before this call: treat it as the prompt pass.
            self.kv_seq_len = kv_seq_len
            key_states_compress, value_states_compress = self.kv_cluster.update_kv(
                key_states, query_states, value_states, attention_mask, self.num_key_value_groups
            )
            # Store the tensors returned by update_kv, but keep the full
            # key/value states for this call's attention: q_len,
            # attention_mask and the causal flag all describe the full
            # sequence, and the eager/SDPA variants in this file do the same.
            past_key_value.update(
                key_states_compress, value_states_compress, self.layer_idx, cache_kwargs
            )
        else:
            self.kv_seq_len += q_len
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )

        # Overwrite the cache's token counter with the length tracked here.
        past_key_value._seen_tokens = self.kv_seq_len

    # Flash attention expects (batch, seq, heads, dim)
    query_states_for_attn = query_states.transpose(1, 2)
    key_states_for_attn = key_states.transpose(1, 2)
    value_states_for_attn = value_states.transpose(1, 2)

    dropout_rate = self.attention_dropout if self.training else 0.0

    # Inputs that arrive in float32 are cast back to a lower-precision dtype
    # before the flash-attention call.
    input_dtype = query_states_for_attn.dtype
    if input_dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        elif hasattr(self.config, "_pre_quantization_dtype"):
            target_dtype = self.config._pre_quantization_dtype
        else:
            target_dtype = self.q_proj.weight.dtype

        logger.warning_once(
            f"The input hidden states seems to be silently casted in float32, this might be related to"
            f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
            f" {target_dtype}."
        )

        query_states_for_attn = query_states_for_attn.to(target_dtype)
        key_states_for_attn = key_states_for_attn.to(target_dtype)
        value_states_for_attn = value_states_for_attn.to(target_dtype)

    if (
        self.config.use_sliding_window
        and getattr(self.config, "sliding_window", None) is not None
        and self.layer_idx >= self.config.max_window_layers
    ):
        sliding_window = self.config.sliding_window
    else:
        sliding_window = None

    # Call the Flash Attention kernel
    attn_output = _flash_attention_forward(
        query_states_for_attn,
        key_states_for_attn,
        value_states_for_attn,
        attention_mask,
        q_len,
        position_ids=position_ids,
        dropout=dropout_rate,
        sliding_window=sliding_window,
        is_causal=self.is_causal,
        use_top_left_mask=self._flash_attn_uses_top_left_mask,
    )

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
    attn_output = self.o_proj(attn_output)

    # Attention weights are never returned by this implementation.
    return attn_output, None, past_key_value


def qwen_attn_forward_PyramidKV(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """
    Eager (matmul + softmax) Qwen2 attention forward with PyramidKV caching.

    When a cache is supplied and the new keys cover the whole tracked length,
    the keys/values go through ``self.kv_cluster.update_kv`` and the returned
    tensors are stored in the cache, while this call still attends over the
    full keys/values. Otherwise the new keys/values are appended to the cache
    and attention uses what the cache returns.

    Returns:
        ``(attn_output, attn_weights or None, past_key_value)``; the weights
        are returned only when ``output_attentions`` is truthy.

    Raises:
        ValueError: If a cache is given but ``self.layer_idx`` is ``None``, or
            if the attention output has an unexpected shape.
    """
    bsz, q_len, _ = hidden_states.size()

    init_pyramidkv(self, num_hidden_layers=self.config.num_hidden_layers)

    has_cache = past_key_value is not None

    # Project, then go from (batch, seq, heads * dim) to (batch, heads, seq, dim).
    q_shape = (bsz, q_len, self.num_heads, self.head_dim)
    kv_shape = (bsz, q_len, self.num_key_value_heads, self.head_dim)
    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)
    query_states = query_states.view(*q_shape).transpose(1, 2)
    key_states = key_states.view(*kv_shape).transpose(1, 2)
    value_states = value_states.view(*kv_shape).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if has_cache:
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        # A non-zero length tracked on the module wins over the cache's own count.
        tracked_len = getattr(self, "kv_seq_len", 0)
        if tracked_len != 0:
            kv_seq_len += tracked_len
        else:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    if has_cache:
        # sin and cos are specific to RoPE models; cache_position needed for the static cache
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}

        if key_states.shape[-2] != kv_seq_len:
            # Earlier tokens exist: append and attend over the cached tensors.
            self.kv_seq_len += q_len
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )
        else:
            # Nothing counted before this call: store what update_kv returns,
            # keep the full key/value states for the attention below.
            self.kv_seq_len = kv_seq_len
            key_states_compress, value_states_compress = self.kv_cluster.update_kv(
                key_states, query_states, value_states, attention_mask, self.num_key_value_groups
            )
            past_key_value.update(key_states_compress, value_states_compress, self.layer_idx, cache_kwargs)

        # Overwrite the cache's token counter with the length tracked here.
        past_key_value._seen_tokens = self.kv_seq_len

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

    if attention_mask is not None:
        # The mask may be longer than the keys actually attended to; trim it.
        attn_weights = attn_weights + attention_mask[:, :, :, : key_states.shape[-2]]

    # Softmax in fp32, then return to the query dtype.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32)
    attn_weights = attn_weights.to(query_states.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
    attn_output = torch.matmul(attn_weights, value_states)

    expected_size = (bsz, self.num_heads, q_len, self.head_dim)
    if attn_output.size() != expected_size:
        raise ValueError(
            f"`attn_output` should be of size {expected_size}, but is"
            f" {attn_output.size()}"
        )

    # (batch, heads, seq, dim) -> (batch, seq, hidden)
    attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)

    return attn_output, (attn_weights if output_attentions else None), past_key_value









# def qwen_attn_forward_PyramidKV(
#     self,
#     hidden_states: torch.Tensor,
#     attention_mask: Optional[torch.Tensor] = None,
#     position_ids: Optional[torch.LongTensor] = None,
#     past_key_value: Optional[Cache] = None,
#     output_attentions: bool = False,
#     use_cache: bool = False,
#     cache_position: Optional[torch.LongTensor] = None,
#     **kwargs, 
# ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
#     """
#     Qwen2 standard attention forward pass with PyramidKV integration.
#     """
#     bsz, q_len, _ = hidden_states.size()

#     init_pyramidkv(self, num_hidden_layers=self.config.num_hidden_layers)

#     query_states = self.q_proj(hidden_states)
#     key_states = self.k_proj(hidden_states)
#     value_states = self.v_proj(hidden_states)

#     query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
#     key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
#     value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

#     # kv_seq_len = key_states.shape[-2]
#     # if past_key_value is not None:
#     #     if self.layer_idx is None:
#     #         raise ValueError(
#     #             f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
#     #             "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
#     #             "with a layer index."
#     #         )
#     #     if hasattr(self, "kv_seq_len"): 
#     #         if self.kv_seq_len != 0:
#     #             kv_seq_len += self.kv_seq_len
#     #         else:
#     #             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
#     #     else:
#     #         kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

#     # rotary_seq_len = (
#     #     max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
#     # )

#     # cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
#     # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    



#     kv_seq_len = key_states.shape[-2]

#     if past_key_value is not None:
#         if hasattr(self, "kv_seq_len"): #[SnapKV] add kv_seq_len
#             if self.kv_seq_len != 0:
#                 kv_seq_len += self.kv_seq_len
#             else:
#                 kv_seq_len += cache_position[0]
#         else:
#             kv_seq_len += cache_position[0]

#     cos, sin = self.rotary_emb(value_states, position_ids)
#     query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)






#     # Repeat k/v heads before calling kv_cluster.update_kv
#     key_states = repeat_kv(key_states, self.num_key_value_groups)
#     value_states = repeat_kv(value_states, self.num_key_value_groups)

#     if past_key_value is not None:
#         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}

#         if key_states.shape[-2] == kv_seq_len:
#             self.kv_seq_len = kv_seq_len
#             key_states_compress, value_states_compress = self.kv_cluster.update_kv(
#                 key_states, query_states, value_states, attention_mask, self.num_key_value_groups
#             )
#             key_states, value_states = past_key_value.update(
#                 key_states_compress, value_states_compress, self.layer_idx, cache_kwargs
#             )
#         else:
#             self.kv_seq_len += q_len
#             key_states, value_states = past_key_value.update(
#                 key_states, value_states, self.layer_idx, cache_kwargs
#             )
#         past_key_value._seen_tokens = self.kv_seq_len

#     attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

#     if attention_mask is not None:
#         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
#         attn_weights = attn_weights + causal_mask

#     # upcast attention to fp32
#     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
#     attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
#     attn_output = torch.matmul(attn_weights, value_states)

#     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
#         raise ValueError(
#             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
#             f" {attn_output.size()}"
#         )

#     attn_output = attn_output.transpose(1, 2).contiguous()
#     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

#     attn_output = self.o_proj(attn_output)

#     if not output_attentions:
#         attn_weights = None

#     return attn_output, attn_weights, past_key_value


def qwen_sdpa_attn_forward_PyramidKV(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    cache_position: Optional[torch.LongTensor] = None,
    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """
    Qwen2 SDPA attention forward pass with PyramidKV integration.

    When a cache is supplied and the new keys cover the whole tracked length,
    the keys/values go through ``self.kv_cluster.update_kv`` and the returned
    tensors are stored in the cache, while this call still attends over the
    full keys/values. Otherwise the new keys/values are appended to the cache
    and attention uses what the cache returns.

    Args:
        hidden_states: Input of shape ``(batch, q_len, hidden)``.
        attention_mask: Optional 4-D additive mask; sliced on its last axis to
            the number of keys attended to.
        position_ids: Used to compute RoPE when ``position_embeddings`` is
            not given.
        past_key_value: Optional cache object.
        output_attentions: When truthy, the call is delegated to
            ``Qwen2Attention.forward`` because SDPA cannot return weights.
        use_cache: Only forwarded on the ``output_attentions`` fallback path.
        cache_position: Forwarded to the cache through ``cache_kwargs``.
        position_embeddings: Optional precomputed ``(cos, sin)`` pair.

    Returns:
        ``(attn_output, None, past_key_value)`` on the SDPA path, or whatever
        ``Qwen2Attention.forward`` returns on the fallback path.

    Raises:
        ValueError: If a cache is given but ``self.layer_idx`` is ``None``.
    """
    if output_attentions:
        # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
        logger.warning_once(
            "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
            'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
        )
        # Zero-argument `super()` only works inside a class body; this function
        # lives at module level, so the base-class forward is called explicitly.
        from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention

        return Qwen2Attention.forward(
            self,
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

    # Called on every forward; sets up the PyramidKV state on `self`
    # (this function later relies on `self.kv_cluster`).
    init_pyramidkv(self, num_hidden_layers=self.config.num_hidden_layers)

    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # (batch, seq, heads * dim) -> (batch, heads, seq, dim)
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]

    if past_key_value is not None:
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        # Prefer the length tracked on the module (the cache itself may hold
        # fewer entries than tokens seen); fall back to the cache otherwise.
        tracked_len = getattr(self, "kv_seq_len", 0)
        if tracked_len != 0:
            kv_seq_len += tracked_len
        else:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    # RoPE: use externally supplied (cos, sin) when available, otherwise
    # compute them from `position_ids` (transformers 4.45 calling convention).
    if position_embeddings is None:
        logger.warning_once(
            "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
            "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
            "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
            "removed and `position_embeddings` will be mandatory."
        )
        cos, sin = self.rotary_emb(value_states, position_ids)
    else:
        cos, sin = position_embeddings

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    # Repeat k/v heads before calling kv_cluster.update_kv
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if past_key_value is not None:
        # sin and cos are specific to RoPE models; cache_position needed for the static cache
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        if key_states.shape[-2] == kv_seq_len:
            # Nothing counted before this call: store what update_kv returns,
            # keep the full key/value states for the attention below.
            self.kv_seq_len = kv_seq_len
            key_states_compress, value_states_compress = self.kv_cluster.update_kv(
                key_states, query_states, value_states, attention_mask, self.num_key_value_groups
            )
            past_key_value.update(key_states_compress, value_states_compress, self.layer_idx, cache_kwargs)
        else:
            self.kv_seq_len += q_len
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )
        # Overwrite the cache's token counter with the length tracked here.
        past_key_value._seen_tokens = self.kv_seq_len

    causal_mask = attention_mask
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

    # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
    # Reference: https://github.com/pytorch/pytorch/issues/112577.
    if query_states.device.type == "cuda" and attention_mask is not None:
        query_states = query_states.contiguous()
        key_states = key_states.contiguous()
        value_states = value_states.contiguous()

    # `is_causal` is computed as a separate statement (not inline in the SDPA call) to support
    # torch.compile's dynamic shapes and full graph options.
    # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not
    # create a causal mask in case q_len == 1.
    is_causal = causal_mask is None and q_len > 1

    attn_output = torch.nn.functional.scaled_dot_product_attention(
        query_states,
        key_states,
        value_states,
        attn_mask=causal_mask,
        dropout_p=self.attention_dropout if self.training else 0.0,
        is_causal=is_causal,
    )

    # (batch, heads, seq, dim) -> (batch, seq, hidden)
    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.view(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value





# def qwen_sdpa_attn_forward_PyramidKV(
#     self,
#     hidden_states: torch.Tensor,
#     attention_mask: Optional[torch.Tensor] = None,
#     position_ids: Optional[torch.LongTensor] = None,
#     past_key_value: Optional[Cache] = None,
#     output_attentions: bool = False,
#     use_cache: bool = False,
#     cache_position: Optional[torch.LongTensor] = None,
#     **kwargs, 
# ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
#     """
#     Qwen2 SDPA attention forward pass with PyramidKV integration.
#     """
#     if output_attentions:
#         logger.warning_once(
#             "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
#             'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
#         )
#         return super().forward(
#             hidden_states=hidden_states,
#             attention_mask=attention_mask,
#             position_ids=position_ids,
#             past_key_value=past_key_value,
#             output_attentions=output_attentions,
#             use_cache=use_cache,
#             cache_position=cache_position,
#         )

#     init_pyramidkv(self, num_hidden_layers=self.config.num_hidden_layers)

#     bsz, q_len, _ = hidden_states.size()

#     query_states = self.q_proj(hidden_states)
#     key_states = self.k_proj(hidden_states)
#     value_states = self.v_proj(hidden_states)

#     query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
#     key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
#     value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    
#     kv_seq_len = key_states.shape[-2]
#     if past_key_value is not None:
#         if self.layer_idx is None:
#             raise ValueError(
#                 f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
#                 "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
#                 "with a layer index."
#             )
#         if hasattr(self, "kv_seq_len"): 
#             if self.kv_seq_len != 0:
#                 kv_seq_len += self.kv_seq_len
#             else:
#                 kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
#         else:
#             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

#     rotary_seq_len = (
#         max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
#     )

#     cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
#     query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    
#     # Repeat k/v heads before calling kv_cluster.update_kv
#     key_states = repeat_kv(key_states, self.num_key_value_groups)
#     value_states = repeat_kv(value_states, self.num_key_value_groups)

#     if past_key_value is not None:
#         cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        
#         if key_states.shape[-2] == kv_seq_len:
#             self.kv_seq_len = kv_seq_len
#             key_states_compress, value_states_compress = self.kv_cluster.update_kv(
#                 key_states, query_states, value_states, attention_mask, self.num_key_value_groups
#             )
#             key_states, value_states = past_key_value.update(
#                 key_states_compress, value_states_compress, self.layer_idx, cache_kwargs
#             )
#         else:
#             self.kv_seq_len += q_len
#             key_states, value_states = past_key_value.update(
#                 key_states, value_states, self.layer_idx, cache_kwargs
#             )
#         past_key_value._seen_tokens = self.kv_seq_len

#     causal_mask = attention_mask
#     if attention_mask is not None:
#         causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

#     if query_states.device.type == "cuda" and causal_mask is not None:
#         query_states = query_states.contiguous()
#         key_states = key_states.contiguous()
#         value_states = value_states.contiguous()

#     is_causal = True if causal_mask is None and q_len > 1 else False

#     attn_output = torch.nn.functional.scaled_dot_product_attention(
#         query_states,
#         key_states,
#         value_states,
#         attn_mask=causal_mask,
#         dropout_p=self.attention_dropout if self.training else 0.0,
#         is_causal=is_causal,
#     )

#     attn_output = attn_output.transpose(1, 2).contiguous()
#     attn_output = attn_output.view(bsz, q_len, self.hidden_size)

#     attn_output = self.o_proj(attn_output)

#     return attn_output, None, past_key_value




def _mean_query_over_kv_groups(query_states: torch.Tensor, num_kv_heads: int) -> torch.Tensor:
    """Average the query heads that share a key/value head (GQA).

    The head axis (dim 1) of ``query_states`` is split into
    ``(num_kv_heads, heads_per_kv)`` and the mean is taken over
    ``heads_per_kv``, so the result has ``num_kv_heads`` heads.
    """
    bsz, num_q_heads, q_len, head_dim = query_states.shape
    heads_per_kv = num_q_heads // num_kv_heads
    return query_states.view(bsz, num_kv_heads, heads_per_kv, q_len, head_dim).mean(dim=2)


def qwen_flash_attn2_forward_SnapKV(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional["Cache"] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    cache_position: Optional[torch.LongTensor] = None,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """
    Flash-Attention-2 forward pass with SnapKV cache handling.

    Flow:
      1. The *un-repeated* key/value heads are handed to the SnapKV cluster
         (optional ``update_think`` hook, then ``update_kv``).
      2. ``repeat_kv`` is applied only after the cache has been updated.
      3. The rotary length is derived from ``position_ids`` (largest last
         position in the batch, plus one) when they are given; otherwise it is
         ``q_len`` plus the previously tracked ``self.kv_seq_len``.

    Args:
        hidden_states: Input of shape ``(batch, q_len, hidden)``.
        attention_mask: Mask forwarded to the KV cluster and to
            ``_flash_attention_forward``.
        position_ids: Token positions used for the rotary embedding.
        past_key_value: KV cache; a ``StaticCache`` is rejected.
        output_attentions: Ignored; this path never returns attention weights.
        use_cache: Unused here; kept for interface compatibility.
        cache_position: Forwarded to ``past_key_value.update`` via cache kwargs.
        **kwargs: A legacy ``padding_mask`` entry, if present, replaces
            ``attention_mask``.

    Returns:
        ``(attn_output, None, past_key_value)``.

    Raises:
        ValueError: If ``past_key_value`` is a ``StaticCache``.
    """
    if isinstance(past_key_value, StaticCache):
        raise ValueError(
            "`static` cache is incompatible with Flash‑Attn2; "
            "switch to SDPA or remove attn_implementation override."
        )

    # Presumably attaches `self.kv_cluster` and related attributes — see init_snapkv.
    init_snapkv(self)

    if "padding_mask" in kwargs:  # backwards compatibility with the old argument name
        warnings.warn("`padding_mask` 已弃用，请改用 `attention_mask`")
        attention_mask = kwargs.pop("padding_mask")

    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # (batch, heads, q_len, head_dim); the K/V tensors keep `num_key_value_heads` heads (not repeated yet).
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states_u = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states_u = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    if position_ids is not None:
        rotary_seq_len = int(position_ids[:, -1].max().item()) + 1
    else:
        prev_len = getattr(self, "kv_seq_len", 0)
        rotary_seq_len = q_len + (prev_len if past_key_value is not None else 0)

    cos, sin = self.rotary_emb(value_states_u, seq_len=rotary_seq_len)
    query_states_r, key_states_r_u = apply_rotary_pos_emb(
        query_states, key_states_u, cos, sin, position_ids
    )

    num_kv_heads = key_states_r_u.shape[1]

    # Optional "think" hook of the KV cluster; it receives the group-averaged query
    # so that the query head count matches the un-repeated K/V head count.
    if hasattr(self, "kv_cluster") and hasattr(self.kv_cluster, "update_think"):
        query_states_gqa = _mean_query_over_kv_groups(query_states_r, num_kv_heads)
        self.kv_cluster.update_think(
            key_states_r_u,
            query_states_gqa,
            value_states_u,
            attention_mask,
            self.num_key_value_groups,
        )
    else:
        query_states_gqa = None

    final_key_states = key_states_r_u
    final_value_states = value_states_u

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}

        # NOTE(review): key_states_r_u is built from a tensor viewed with length q_len, so
        # unless apply_rotary_pos_emb changes the sequence dimension this condition always
        # holds and the `else` branch never runs — confirm the intended prefill/decode test.
        if key_states_r_u.shape[-2] == q_len:
            self.kv_seq_len = getattr(self, "kv_seq_len", 0) + q_len
            if query_states_gqa is None:
                # The think hook was skipped, so the averaged query has not been built yet.
                query_states_gqa = _mean_query_over_kv_groups(query_states_r, num_kv_heads)

            key_states_kv, value_states_kv = self.kv_cluster.update_kv(
                key_states_r_u,
                query_states_gqa,
                value_states_u,
                attention_mask,
                self.num_key_value_groups,
            )
            final_key_states, final_value_states = past_key_value.update(
                key_states_kv, value_states_kv, self.layer_idx, cache_kwargs
            )
        else:
            self.kv_seq_len += q_len
            final_key_states, final_value_states = past_key_value.update(
                key_states_r_u, value_states_u, self.layer_idx, cache_kwargs
            )
        # Record the tracked token count on the cache object.
        past_key_value._seen_tokens = self.kv_seq_len
    else:
        # No cache in use: just remember the current length.
        self.kv_seq_len = q_len

    # Expand K/V heads to the query head count only now, after the cache update.
    k_rep = repeat_kv(final_key_states, self.num_key_value_groups)
    v_rep = repeat_kv(final_value_states, self.num_key_value_groups)

    # Heads and sequence axes are swapped back before the flash-attention call.
    q_attn = query_states_r.transpose(1, 2)
    k_attn = k_rep.transpose(1, 2)
    v_attn = v_rep.transpose(1, 2)

    # fp32 inputs are cast down to the autocast dtype, or else to the
    # pre-quantization / projection-weight dtype.
    if q_attn.dtype == torch.float32:
        tgt = (
            torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled()
            else getattr(self.config, "_pre_quantization_dtype", self.q_proj.weight.dtype)
        )
        q_attn, k_attn, v_attn = q_attn.to(tgt), k_attn.to(tgt), v_attn.to(tgt)

    dropout_rate = self.attention_dropout if self.training else 0.0
    sliding_window = (
        self.config.sliding_window
        if self.config.use_sliding_window and getattr(self.config, "sliding_window", None) is not None
        and self.layer_idx >= self.config.max_window_layers else None
    )

    attn_output = _flash_attention_forward(
        q_attn,
        k_attn,
        v_attn,
        attention_mask,
        q_len,
        position_ids=position_ids,
        dropout=dropout_rate,
        sliding_window=sliding_window,
        is_causal=self.is_causal,
        use_top_left_mask=self._flash_attn_uses_top_left_mask,
    )

    attn_output = self.o_proj(attn_output.reshape(bsz, q_len, self.hidden_size).contiguous())

    # This path does not produce attention weights; None is returned as a placeholder.
    attn_weights = None

    return attn_output, attn_weights, past_key_value



# def qwen_attn_forward_SnapKV(
#     self,
#     hidden_states: torch.Tensor,
#     attention_mask: Optional[torch.Tensor] = None,
#     position_ids: Optional[torch.LongTensor] = None,
#     past_key_value: Optional["Cache"] = None,
#     output_attentions: bool = False,
#     use_cache: bool = False,
#     cache_position: Optional[torch.LongTensor] = None,
#     **kwargs,
#     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
#     """MatMul‑Attention + SnapKV 实现"""

#     bsz, q_len, _ = hidden_states.size()
#     init_snapkv(self)

#     query = self.q_proj(hidden_states)
#     key   = self.k_proj(hidden_states)
#     value = self.v_proj(hidden_states)

#     query = query.view(bsz, q_len, self.num_heads,       self.head_dim).transpose(1, 2)
#     key_u = key  .view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
#     val_u = value.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

#     if position_ids is not None:
#         rotary_seq_len = int(position_ids[:, -1].max().item()) + 1
#     else:
#         prev = getattr(self, "kv_seq_len", 0)
#         rotary_seq_len = q_len + (prev if past_key_value is not None else 0)

#     cos, sin = self.rotary_emb(val_u, seq_len=rotary_seq_len)
#     query_r, key_r_u = apply_rotary_pos_emb(query, key_u, cos, sin, position_ids)

#     # SnapKV Think 
#     if hasattr(self, "kv_cluster") and hasattr(self.kv_cluster, "update_think"):
#         bsz_, q_heads, _, hd = query_r.shape
#         kv_heads = key_r_u.shape[1]
#         hpk = q_heads // kv_heads
#         query_gqa = query_r.view(bsz_, kv_heads, hpk, q_len, hd).mean(dim=2)
#         self.kv_cluster.update_think(
#             key_r_u, query_gqa, val_u, attention_mask, self.num_key_value_groups
#         )
#     else:
#         query_gqa = None

#     # KV 更新 
#     final_k, final_v = key_r_u, val_u
#     if past_key_value is not None:
#         cache_kwargs = {"sin": cos, "cos": sin, "cache_position": cache_position}
#         if key_r_u.shape[-2] == q_len:
#             self.kv_seq_len = getattr(self, "kv_seq_len", 0) + q_len
#             if query_gqa is None:
#                 bsz_, qh, _, hd = query_r.shape
#                 kvh = key_r_u.shape[1]
#                 query_gqa = query_r.view(bsz_, kvh, qh//kvh, q_len, hd).mean(dim=2)
#             comp_k, comp_v = self.kv_cluster.update_kv(
#                 key_r_u, query_gqa, val_u, attention_mask, self.num_key_value_groups
#             )
#             final_k, final_v = past_key_value.update(
#                 comp_k, comp_v, self.layer_idx, cache_kwargs
#             )
#         else:
#             self.kv_seq_len += q_len
#             final_k, final_v = past_key_value.update(
#                 key_r_u, val_u, self.layer_idx, cache_kwargs
#             )
#         past_key_value._seen_tokens = self.kv_seq_len
#     else:
#         self.kv_seq_len = q_len

#     k_rep = repeat_kv(final_k, self.num_key_value_groups)
#     v_rep = repeat_kv(final_v, self.num_key_value_groups)

#     attn_w = torch.matmul(query_r, k_rep.transpose(2, 3)) / math.sqrt(self.head_dim)
#     if attention_mask is not None:
#         attn_w += attention_mask[:, :, :, : k_rep.shape[-2]]
#     attn_w = nn.functional.softmax(attn_w, dim=-1, dtype=torch.float32).to(query.dtype)
#     attn_w = nn.functional.dropout(attn_w, p=self.attention_dropout, training=self.training)
#     attn_out = torch.matmul(attn_w, v_rep)

#     # 检查形状
#     if attn_out.size() != (bsz, self.num_heads, q_len, self.head_dim):
#         raise ValueError("attn_out 形状不符", attn_out.shape)

#     attn_out = attn_out.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
#     attn_out = self.o_proj(attn_out)

#     # Flash‑Attn2 无权重，此处返回 None 以占位
#     if not output_attentions:
#         attn_w = None

#     return attn_out, attn_w, past_key_value

def qwen_attn_forward_SnapKV(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Eager (matmul + softmax) attention forward with SnapKV cache handling.

    When the current keys cover the whole sequence (``key_states.shape[-2] ==
    kv_seq_len``), ``self.kv_cluster.update_kv`` is called and its result is
    written to the cache, while attention for this call still uses the full,
    un-compressed keys/values. Otherwise the new keys/values are appended to
    the cache and attention runs over what the cache returns.

    Returns:
        ``(attn_output, attn_weights or None, past_key_value)``; the weights
        are returned only when ``output_attentions`` is truthy.

    Raises:
        ValueError: If a cache is given but ``self.layer_idx`` is ``None``, or
            if the attention output does not have the expected shape.
    """
    batch, q_len, _ = hidden_states.size()
    # NOTE (original author, translated): in theory each round after the first should add
    # exactly one token, so it is unclear where the 65 seen on the second call comes from;
    # the attention size itself is correct (one larger than the previous round).
    init_snapkv(self)

    def _split_heads(projected: torch.Tensor, n_heads: int) -> torch.Tensor:
        # (batch, q_len, n_heads * head_dim) -> (batch, n_heads, q_len, head_dim)
        return projected.view(batch, q_len, n_heads, self.head_dim).transpose(1, 2)

    query_states = _split_heads(self.q_proj(hidden_states), self.num_heads)
    key_states = _split_heads(self.k_proj(hidden_states), self.num_key_value_heads)
    value_states = _split_heads(self.v_proj(hidden_states), self.num_key_value_heads)

    using_cache = past_key_value is not None

    kv_seq_len = key_states.shape[-2]
    if using_cache:
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        # Prefer the length tracked on the module; fall back to the cache when it is
        # missing or zero.
        tracked_len = getattr(self, "kv_seq_len", 0)
        if tracked_len != 0:
            kv_seq_len += tracked_len
        else:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    # Expand K/V heads to the query head count (no-op when they already match).
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if using_cache:
        # sin/cos are RoPE-specific; cache_position is only needed by static caches.
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        if key_states.shape[-2] == kv_seq_len:
            # Keys span the whole sequence: store the cluster's selection in the cache,
            # but keep attending over the full keys/values below.
            self.kv_seq_len = kv_seq_len
            compressed_k, compressed_v = self.kv_cluster.update_kv(
                key_states, query_states, value_states, attention_mask, self.num_key_value_groups
            )
            past_key_value.update(compressed_k, compressed_v, self.layer_idx, cache_kwargs)
        else:
            self.kv_seq_len += q_len
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )
        past_key_value._seen_tokens = self.kv_seq_len

    scores = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

    # The upstream shape check on the attention weights is intentionally absent: kv_seq_len
    # here need not equal the (possibly compressed) key length actually attended over.

    if attention_mask is not None:
        # Whatever the mask length, slice it to the key length in use.
        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]

    # Softmax is computed in fp32, then cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query_states.dtype)
    probs = nn.functional.dropout(probs, p=self.attention_dropout, training=self.training)
    attn_output = torch.matmul(probs, value_states)

    expected_shape = (batch, self.num_heads, q_len, self.head_dim)
    if attn_output.size() != expected_shape:
        raise ValueError(
            f"`attn_output` should be of size {expected_shape}, but is"
            f" {attn_output.size()}"
        )

    attn_output = attn_output.transpose(1, 2).contiguous().reshape(batch, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)

    return attn_output, (probs if output_attentions else None), past_key_value


def qwen_sdpa_attn_forward_SnapKV(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    cache_position: Optional[torch.LongTensor] = None,
    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """SDPA attention forward with SnapKV cache handling.

    When the current keys cover the whole sequence (``key_states.shape[-2] ==
    kv_seq_len``), ``self.kv_cluster.update_kv`` is called and its result is
    written to the cache, while attention for this call still uses the full
    keys/values. Otherwise the new keys/values are appended to the cache and
    attention runs over what the cache returns.

    Args:
        hidden_states: Input of shape ``(batch, q_len, hidden)``.
        attention_mask: Additive mask; sliced to the key length before use.
        position_ids: Used to compute RoPE only when ``position_embeddings``
            is not supplied.
        past_key_value: KV cache, or ``None``.
        output_attentions: If truthy, the call is delegated to the parent
            class' ``forward`` because SDPA cannot return attention weights.
        use_cache: Only forwarded to the fallback path.
        cache_position: Forwarded to ``past_key_value.update`` via cache kwargs.
        position_embeddings: Pre-computed ``(cos, sin)`` pair.

    Returns:
        ``(attn_output, None, past_key_value)`` on the SDPA path.
    """
    if output_attentions:
        # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
        logger.warning_once(
            "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
            'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
        )
        # This function lives at module level, so zero-argument `super()` would raise
        # "RuntimeError: super(): __class__ cell not found". Use the explicit form.
        # NOTE(review): assumes `type(self)` is the class this function is patched onto;
        # a subclass instance would recurse here — confirm against the monkeypatch site.
        return super(type(self), self).forward(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )

    init_snapkv(self)

    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # (batch, heads, q_len, head_dim)
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        # Prefer the length tracked on the module; fall back to the cache when it is
        # missing or zero.
        tracked_len = getattr(self, "kv_seq_len", 0)
        if tracked_len != 0:
            kv_seq_len += tracked_len
        else:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    # transformers 4.44.2 computed RoPE here via
    #   self.rotary_emb(value_states, seq_len=kv_seq_len)
    # From 4.45.0 on, (cos, sin) are normally passed in as `position_embeddings`.
    if position_embeddings is None:
        logger.warning_once(
            "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
            "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
            "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
            "removed and `position_embeddings` will be mandatory."
        )
        cos, sin = self.rotary_emb(value_states, position_ids)
    else:
        cos, sin = position_embeddings

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    # Expand K/V heads to the query head count before the KV cluster sees them.
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if past_key_value is not None:
        # Core of the SnapKV integration.
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        if key_states.shape[-2] == kv_seq_len:
            # Keys span the whole sequence: store the cluster's selection in the cache,
            # but keep attending over the full keys/values below.
            self.kv_seq_len = kv_seq_len
            key_states_compress, value_states_compress = self.kv_cluster.update_kv(
                key_states, query_states, value_states, attention_mask, self.num_key_value_groups
            )
            past_key_value.update(key_states_compress, value_states_compress, self.layer_idx, cache_kwargs)
        else:
            self.kv_seq_len += q_len
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )
        past_key_value._seen_tokens = self.kv_seq_len

    causal_mask = attention_mask
    if attention_mask is not None:  # whatever the mask length, slice it to the key length
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

    # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
    # Reference: https://github.com/pytorch/pytorch/issues/112577.
    if query_states.device.type == "cuda" and attention_mask is not None:
        query_states = query_states.contiguous()
        key_states = key_states.contiguous()
        value_states = value_states.contiguous()

    # `is_causal` is computed as a separate statement (not inline in the SDPA call) to support
    # torch.compile's dynamic shapes and full-graph options. The q_len > 1 condition matches
    # AttentionMaskConverter.to_causal_4d, which creates no causal mask when q_len == 1.
    is_causal = True if causal_mask is None and q_len > 1 else False

    attn_output = torch.nn.functional.scaled_dot_product_attention(
        query_states,
        key_states,
        value_states,
        attn_mask=causal_mask,
        dropout_p=self.attention_dropout if self.training else 0.0,
        is_causal=is_causal,
    )

    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.view(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value



# def qwen_sdpa_attn_forward_SnapKV(
#     self,
#     hidden_states: torch.Tensor,
#     attention_mask: Optional[torch.Tensor] = None,
#     position_ids: Optional[torch.LongTensor] = None,
#     past_key_value: Optional["Cache"] = None,
#     output_attentions: bool = False,
#     use_cache: bool = False,
#     cache_position: Optional[torch.LongTensor] = None,
#     **kwargs,
#     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
#     # ---- SDPA 不支持 output_attentions=True ----
#     if output_attentions:
#         logger.warning_once(
#             "torch scaled_dot_product_attention 无 output_attentions=True; 回退到 eager 路径。"
#         )
#         return super().forward(
#             hidden_states=hidden_states,
#             attention_mask=attention_mask,
#             position_ids=position_ids,
#             past_key_value=past_key_value,
#             output_attentions=output_attentions,
#             use_cache=use_cache,
#             cache_position=cache_position,
#         )

#     init_snapkv(self)
#     bsz, q_len, _ = hidden_states.size()

#     q = self.q_proj(hidden_states)
#     k = self.k_proj(hidden_states)
#     v = self.v_proj(hidden_states)

#     q = q.view(bsz, q_len, self.num_heads,       self.head_dim).transpose(1, 2)
#     k_u = k.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
#     v_u = v.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)


#     if position_ids is not None:
#         rotary_seq_len = int(position_ids[:, -1].max().item()) + 1
#     else:
#         rotary_seq_len = q_len + (getattr(self, "kv_seq_len", 0) if past_key_value is not None else 0)

#     cos, sin = self.rotary_emb(v_u, seq_len=rotary_seq_len)
#     q_r, k_r_u = apply_rotary_pos_emb(q, k_u, cos, sin, position_ids)

#     #  SnapKV Think 
#     if hasattr(self, "kv_cluster") and hasattr(self.kv_cluster, "update_think"):
#         bsz_, qh, _, hd = q_r.shape
#         kvh = k_r_u.shape[1]
#         hpk = qh // kvh
#         q_gqa = q_r.view(bsz_, kvh, hpk, q_len, hd).mean(dim=2)
#         self.kv_cluster.update_think(k_r_u, q_gqa, v_u, attention_mask, self.num_key_value_groups)
#     else:
#         q_gqa = None

#     final_k, final_v = k_r_u, v_u
#     if past_key_value is not None:
#         cache_kwargs = {"sin": cos, "cos": sin, "cache_position": cache_position}
#         if k_r_u.shape[-2] == q_len:
#             self.kv_seq_len = getattr(self, "kv_seq_len", 0) + q_len
#             if q_gqa is None:
#                 bsz_, qh, _, hd = q_r.shape
#                 kvh = k_r_u.shape[1]
#                 q_gqa = q_r.view(bsz_, kvh, qh//kvh, q_len, hd).mean(dim=2)
#             comp_k, comp_v = self.kv_cluster.update_kv(k_r_u, q_gqa, v_u, attention_mask, self.num_key_value_groups)
#             final_k, final_v = past_key_value.update(comp_k, comp_v, self.layer_idx, cache_kwargs)
#         else:
#             self.kv_seq_len += q_len
#             final_k, final_v = past_key_value.update(k_r_u, v_u, self.layer_idx, cache_kwargs)
#         past_key_value._seen_tokens = self.kv_seq_len
#     else:
#         self.kv_seq_len = q_len

#     k_rep = repeat_kv(final_k, self.num_key_value_groups)
#     v_rep = repeat_kv(final_v, self.num_key_value_groups)

#     causal_mask = attention_mask
#     if attention_mask is not None:
#         causal_mask = causal_mask[:, :, :, : k_rep.shape[-2]]
#     if q_r.device.type == "cuda" and causal_mask is not None:
#         q_r, k_rep, v_rep = q_r.contiguous(), k_rep.contiguous(), v_rep.contiguous()
#     is_causal = causal_mask is None and q_len > 1

#     attn_out = torch.nn.functional.scaled_dot_product_attention(
#         q_r, k_rep, v_rep, attn_mask=causal_mask,
#         dropout_p=self.attention_dropout if self.training else 0.0,
#         is_causal=is_causal,
#     )

#     attn_out = attn_out.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
#     attn_out = self.o_proj(attn_out)

#     return attn_out, None, past_key_value


























