import math

import torch
import torch.nn.functional as F
from torch import nn
import torch.utils.checkpoint
import matplotlib.pyplot as plt
import json
import transformers
import os
from transformers.cache_utils import Cache, DynamicCache, StaticCache
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from .LA_SA_Cache import DynamicCache_LA_SA, DynamicCache_LA_SA_1, DynamicCache_LA_SA_2
from transformers.utils import (
    add_start_docstrings_to_model_forward,
    logging,
)
from transformers.utils import is_flash_attn_greater_or_equal_2_10

from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2FlashAttention2
from typing import List, Optional, Tuple, Union
from transformers.modeling_flash_attention_utils import _flash_attention_forward
from transformers.modeling_outputs import BaseModelOutputWithPast
logger = logging.get_logger(__name__)

class Qwen2RotaryEmbedding(nn.Module):
    """Rotary position embedding table with a cos/sin cache that grows on demand.

    Args:
        dim: Size of the rotary dimension; frequencies are built for every
            second index in ``[0, dim)``.
        max_position_embeddings: Number of positions pre-computed at
            construction time.
        base: Base of the geometric frequency progression.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the ``cos_cached`` / ``sin_cached`` buffers for ``seq_len`` positions."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the first ``seq_len`` rows of the cos and sin tables, cast to ``x.dtype``.

        Args:
            x: Tensor laid out as ``[bs, num_attention_heads, seq_len, head_size]``;
                only its dtype, device and (when ``seq_len`` is omitted) its
                second-to-last dimension are used.
            seq_len: Number of positions required. Defaults to ``x.shape[-2]``.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``(seq_len, dim)``.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # The signature advertises `None` as the default; previously that
            # crashed on the comparison below. Fall back to the sequence axis of x.
            seq_len = x.shape[-2]

        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )

def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Rotate the query and key tensors with Rotary Position Embedding.

    Args:
        q (`torch.Tensor`): Query tensor.
        k (`torch.Tensor`): Key tensor.
        cos (`torch.Tensor`): Cosine table, indexed by position along its first axis.
        sin (`torch.Tensor`): Sine table, indexed by position along its first axis.
        position_ids (`torch.Tensor`):
            Positions of the tokens held in `q` and `k`; used to gather rows from `cos` and `sin`
            (e.g. offset positions when decoding with a KV cache).
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Axis inserted into the gathered `cos`/`sin` rows so that they broadcast against `q` and `k`.
            Use 1 when `q`/`k` are `[batch_size, heads, seq_len, head_dim]` and 2 when they are
            `[batch_size, seq_len, heads, head_dim]`.
    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors.
    """
    cos_rows = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin_rows = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # x * cos + rotate_half(x) * sin, applied identically to q and k.
        return (states * cos_rows) + (rotate_half(states) * sin_rows)

    return _rotate(q), _rotate(k)

def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    return torch.cat((-back, front), dim=-1)

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head axis, matching
    torch.repeat_interleave(x, dim=1, repeats=n_rep): (batch, num_key_value_heads, seqlen, head_dim)
    becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim). The input is returned as-is when n_rep is 1.
    """
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it into the heads.
        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
        return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
    return hidden_states

class EagerAttention_SLAKE(nn.Module):
    """Attention kernel that also maintains the eviction bookkeeping of a SLAKE KV cache.

    Every call first refreshes the per-layer eviction scores stored on the
    cache object (a prefill pass the first time a layer is seen, a decode pass
    afterwards) and then produces the attention output, either through
    ``_flash_attention_forward`` or, for single-token steps on layers whose
    cache reports ``lin_cached``, through a combined softmax + linear-cache path.

    Args:
        cache_size: Per-layer cache budget appended to the cache's
            ``layer_budget`` list during prefill.
        window_size: Number of trailing query/key positions used as the
            observation window.
        linear_cache_size: Value appended to the cache's ``linear_cache_size``
            list during prefill.
        alpha: Exponent applied to the approximation-error term of the layer
            preference score.
        beta: Exponent applied to the attention-entropy term of the layer
            preference score.
    """

    def __init__(self, cache_size, window_size, linear_cache_size, alpha, beta):
        super().__init__()
        self.window_size = window_size
        self.cache_size = cache_size  # original note: self.cache + self.window_size
        self.gamma = 200  # weight of the variance term in the prefill eviction score
        self.alpha = alpha
        self.beta = beta
        self.prefill = True  # not read inside this class; kept for external users
        self.linear_cache_size = linear_cache_size

    @staticmethod
    def _report_non_finite(name, tensor):
        """Print a diagnostic line when ``tensor`` contains NaN or Inf values.

        NOTE(review): the check reads the result back on the host, which
        presumably forces a device synchronisation per call on GPU.
        """
        if not torch.isfinite(tensor).all():
            print(f"{name} contains non-finite values (NaN or Inf)!")

    def forward(
        self,
        module: nn.Module,
        query: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional["DynamicCache_LA_SA_2"] = None,
        layer_idx=None,
        **kwargs,
    ):
        """Update the cache's eviction state for ``layer_idx`` and compute attention.

        Args:
            module: Attention layer providing ``head_dim``, ``num_heads``,
                ``num_key_value_heads`` and ``num_key_value_groups``.
            query: 4-D tensor unpacked as ``(bsz, heads, q_len, head_dim)``.
            key_states: 4-D key tensor; its last two axes are transposed for the
                score matmul, so it is expected head-first like ``query``.
            value_states: Value tensor aligned with ``key_states``.
            attention_mask: Accepted for interface compatibility; not used.
            past_key_value: SLAKE cache object. Required despite the ``None`` default.
            layer_idx: Index of the layer inside the cache's per-layer lists.

        Returns:
            ``(attn_output, None)`` where ``attn_output`` has shape
            ``(bsz, q_len, num_heads, head_dim)``.

        Raises:
            ValueError: If ``past_key_value`` is ``None`` or the attention
                output has an unexpected shape.
        """
        if past_key_value is None:
            # Every path below dereferences the cache; fail with a clear message
            # instead of an AttributeError on `None`.
            raise ValueError(
                "EagerAttention_SLAKE requires a SLAKE cache instance as `past_key_value`, but got None."
            )

        torch.backends.cuda.enable_flash_sdp(True)

        bsz, _, q_len, _ = query.shape

        if past_key_value.prefill_stage[layer_idx]:
            self._score_prefill(module, query, key_states, value_states, past_key_value, layer_idx)
        else:
            self._score_decode(module, query, key_states, past_key_value, layer_idx)

        if past_key_value.lin_cached[layer_idx] and q_len <= 1:
            attn_output = self._hybrid_attention(module, query, past_key_value, layer_idx)
        else:
            attn_output = _flash_attention_forward(
                query_states=query.transpose(1, 2),
                key_states=key_states.transpose(1, 2),
                value_states=value_states.transpose(1, 2),
                attention_mask=None,
                query_length=q_len,
                is_causal=True,
            ).transpose(1, 2)

        if attn_output.size() != (bsz, module.num_heads, q_len, module.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, module.num_heads, q_len, module.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        return attn_output, None

    def _score_prefill(self, module, query, key_states, value_states, past_key_value, layer_idx):
        """Compute prefill eviction scores and the layer preference score, then store them on the cache."""
        bsz, _, q_len, _ = query.shape
        window = self.window_size

        # Logits of the last `window` queries against every key.
        # NOTE(review): the in-place mask addition below assumes q_len >= window_size — confirm callers.
        logits = torch.matmul(query[:, :, -window:, :], key_states.transpose(2, 3)) / math.sqrt(module.head_dim)

        # Causal mask for the window-by-window corner: 0 on/below the diagonal, dtype-min above it.
        causal = torch.full((window, window), torch.finfo(logits.dtype).min, device=logits.device)
        positions = torch.arange(causal.size(-1), device=logits.device)
        causal.masked_fill_(positions < (positions + 1).view(causal.size(-1), 1), 0)
        linear_mask = (causal == 0)[None, None, :, :]

        logits[:, :, -window:, -window:] += causal[None, None, :, :]
        logits = logits[:, :, -window:, :]

        # Exact softmax over all keys; reused for the eviction score and as the
        # reference distribution (the original computed this same softmax twice).
        probs = nn.functional.softmax(logits, dim=-1, dtype=torch.float32).to(query.dtype)

        disp = self.calculate_entropy(probs)

        # Eviction score: mean over the window queries plus gamma times their variance,
        # restricted to keys outside the window, smoothed, then averaged per KV head.
        evict_score = probs.mean(dim=-2) + self.gamma * probs.var(dim=-2)
        evict_score = evict_score[:, :, :-window]
        evict_score = F.avg_pool1d(evict_score, kernel_size=5, padding=5 // 2, stride=1)
        evict_score = evict_score.reshape(bsz, module.num_key_value_heads, module.num_key_value_groups, -1)
        evict_score = evict_score.mean(dim=-2)

        past_key_value.evict_scores.append(evict_score)

        _, _, disp_n = evict_score.shape

        # Broadcast the per-KV-head score back to every attention head.
        per_head_score = repeat_kv(evict_score.unsqueeze(3), module.num_key_value_groups).squeeze(3)

        # LA_SA edits `logits` in place; it is not read again afterwards.
        p_hat = self.LA_SA(per_head_score.unsqueeze(2), logits, linear_mask, disp_n, self.linear_cache_size)
        self._report_non_finite("att_LA_SA", p_hat)

        # Renormalise the exact (p_t) and approximate (p_a) distributions.
        eps = 1e-6
        p_t = probs.clamp_min(eps)
        p_t = p_t / p_t.sum(dim=-1, keepdim=True).clamp_min(eps)
        p_a = p_hat.clamp_min(eps)
        p_a = p_a / p_a.sum(dim=-1, keepdim=True).clamp_min(eps)
        self._report_non_finite("p_t", p_t)
        self._report_non_finite("p_a", p_a)

        # Total-variation distance between the two distributions, per window query.
        tv = 0.5 * (p_t - p_a).abs().sum(dim=-1)
        self._report_non_finite("tv", tv)

        # Value-norm weights, normalised over the key axis.
        vnorm = value_states.norm(dim=-1)
        self._report_non_finite("vnorm1", vnorm)
        vnorm = (vnorm / vnorm.sum(dim=-1, keepdim=True)).clamp_min(eps).unsqueeze(2)
        self._report_non_finite("vnorm2", vnorm)

        value_effect = (p_t - p_a).abs() * vnorm
        value_effect = value_effect.sum(dim=-1).clamp_min(eps)
        self._report_non_finite("value_effect1", value_effect)
        value_effect = value_effect ** 0.1
        self._report_non_finite("value_effect2", value_effect)

        score_bhw = (tv * value_effect).mean()
        self._report_non_finite("score_bhw", score_bhw)
        pref_score = ((score_bhw ** self.alpha) * (disp ** self.beta)).to(torch.float32).cpu().numpy()

        past_key_value.pref_scores.append(pref_score)
        past_key_value.layer_budget.append(self.cache_size)
        past_key_value.window_size.append(self.window_size)
        past_key_value.linear_cache_size.append(self.linear_cache_size)
        past_key_value.budget_update(layer_idx, q_len)

        for i in range(layer_idx + 1):
            past_key_value.update_index_prefill(i, layer_idx)

        past_key_value.prefill_stage[layer_idx] = False

    def _score_decode(self, module, query, key_states, past_key_value, layer_idx):
        """Recompute the eviction scores of ``layer_idx`` from the current queries and update the cache index."""
        self._report_non_finite("query", query)
        self._report_non_finite("key_states", key_states)

        bsz = query.shape[0]
        attn_score = torch.matmul(query, key_states.transpose(2, 3))
        attn_score = attn_score / math.sqrt(module.head_dim)
        attn_score = nn.functional.softmax(attn_score, dim=-1, dtype=torch.float32).to(query.dtype)
        # Keys inside the trailing window are excluded from scoring.
        attn_score = attn_score[:, :, :, 0:-self.window_size].mean(dim=-2)
        attn_score = attn_score.reshape(bsz, module.num_key_value_heads, module.num_key_value_groups, -1)
        attn_score = attn_score.mean(dim=-2)
        attn_score = F.avg_pool1d(attn_score, kernel_size=5, padding=5 // 2, stride=1)
        past_key_value.evict_scores[layer_idx] = attn_score
        past_key_value.update_index(layer_idx)

    def _hybrid_attention(self, module, query, past_key_value, layer_idx):
        """Combine softmax attention over retained keys with the cache's linear summary of the other tokens."""
        linear_cache, k_sum, v_sum, seen_tokens, key_states, value_states = past_key_value.linear_cache(layer_idx)

        groups = module.num_key_value_groups
        key_states = repeat_kv(key_states, groups)
        value_states = repeat_kv(value_states, groups)
        linear_cache = repeat_kv(linear_cache, groups)
        k_sum = repeat_kv(k_sum, groups)
        v_sum = repeat_kv(v_sum, groups)

        # Tokens seen beyond this layer's budget; presumably the count folded into the linear cache — TODO confirm.
        la_len = seen_tokens - past_key_value.layer_budget[layer_idx]

        attn_weights = torch.matmul(query, key_states.transpose(2, 3)) / math.sqrt(module.head_dim)
        att_max = torch.max(attn_weights, dim=-1, keepdim=True).values
        attn_weights = torch.exp(attn_weights - att_max)
        attn_sum = torch.sum(attn_weights, dim=-1, keepdim=True)

        att_lin = torch.matmul(query, linear_cache) / math.sqrt(module.head_dim)
        att_mean = torch.matmul(query, k_sum.transpose(2, 3)) / (la_len * math.sqrt(module.head_dim))
        lin_e = torch.exp(att_mean - att_max)
        att_lin = (att_lin + torch.matmul((1 - att_mean), v_sum)) * lin_e

        # Normalise both contributions with a shared denominator.
        attn_weights = torch.matmul(attn_weights, value_states)
        return (attn_weights + att_lin) / (attn_sum + lin_e * la_len)

    def LA_SA(self, attn_score, tmp_attn_weights_back, linear_mask, disp_n, linear_cache_len):
        """Approximate the window attention distribution with top-k softmax plus a linearised remainder.

        The ``un_evict_len`` highest-scoring non-window keys keep exact
        (exponentiated) weights; every other visible key gets a first-order
        expansion of ``exp`` around the scaled sum of its logits.

        Args:
            attn_score: Eviction scores; top-k is taken over the last axis and
                the resulting mask is broadcast against the non-window columns.
            tmp_attn_weights_back: Pre-softmax logits of the window queries.
                Modified in place.
            linear_mask: Boolean mask for the window-by-window corner; entries
                that are ``False`` are zeroed.
            disp_n: Number of non-window key positions.
            linear_cache_len: Unused.

        Returns:
            Tensor shaped like ``tmp_attn_weights_back`` holding the normalised approximation.
        """
        if (self.cache_size - self.window_size) < (disp_n / 2):
            un_evict_len = self.cache_size - self.window_size
            Linear_len = disp_n - self.cache_size
        else:
            # Budget covers at least half of the sequence: split it evenly.
            Linear_len = int(disp_n / 2)
            un_evict_len = disp_n - Linear_len

        topk_indices = attn_score.topk(un_evict_len, dim=-1).indices
        mask = torch.zeros_like(attn_score, dtype=torch.bool)
        mask.scatter_(dim=-1, index=topk_indices, value=True)

        # Softmax part: hide every non-window key that is not in the top-k.
        tmp_attn_weights_soft = tmp_attn_weights_back.clone()
        tmp_attn_weights_soft[:, :, :, :-self.window_size].masked_fill_(~mask, float("-inf"))

        att_max = torch.max(tmp_attn_weights_soft, dim=-1, keepdim=True).values
        self._report_non_finite("att_max", att_max)
        att_soft = torch.exp(tmp_attn_weights_soft - att_max)
        self._report_non_finite("att_soft1", att_soft)
        att_soft_sum = torch.sum(att_soft, dim=-1, keepdim=True) + 1e-12
        self._report_non_finite("att_soft_sum", att_soft_sum)

        # Linear part: zero the logits of retained keys and of masked window entries (in place).
        tmp_attn_weights_back[:, :, :, :-self.window_size] = tmp_attn_weights_back[:, :, :,
                                                             :-self.window_size].masked_fill_(mask, 0)
        tmp_attn_weights_back[:, :, :, -self.window_size:] = tmp_attn_weights_back[:, :, :,
                                                             -self.window_size:].masked_fill_(~linear_mask, 0)

        att_lin = tmp_attn_weights_back
        self._report_non_finite("att_lin", att_lin)
        att_lin_avg = torch.sum(att_lin, dim=-1, keepdim=True)
        self._report_non_finite("att_lin_avg", att_lin_avg)

        # Every entry equals Linear_len; created on the same device as the sums.
        # NOTE(review): Linear_len <= 0 would make this division ill-defined — confirm it cannot occur.
        scale = torch.full(
            (1, 1, self.window_size, 1),
            fill_value=Linear_len,
            device=att_lin_avg.device,
        )

        att_lin_avg = att_lin_avg / scale
        att_lin_avg = torch.clamp(att_lin_avg, max=torch.max(att_max))
        self._report_non_finite("att_lin_avg_scale", att_lin_avg)

        # exp(avg - max) * (1 + x - avg): first-order expansion of exp(x - max) around avg.
        att_lin = torch.exp(att_lin_avg - att_max) * (1 + att_lin - att_lin_avg)
        att_lin = torch.clamp(att_lin, min=0)
        self._report_non_finite("att_lin2", att_lin)

        att_lin[:, :, :, :-self.window_size] = att_lin[:, :, :, :-self.window_size].masked_fill_(mask, 0)
        att_lin[:, :, :, -self.window_size:] = att_lin[:, :, :, -self.window_size:].masked_fill_(~linear_mask, 0)
        self._report_non_finite("att_lin3", att_lin)
        att_lin_sum = torch.sum(att_lin, dim=-1, keepdim=True)
        self._report_non_finite("att_lin_sum", att_lin_sum)

        att = att_soft + att_lin
        self._report_non_finite("att", att)
        att_LA_SA = att / ((att_soft_sum + att_lin_sum) + 1e-12)
        self._report_non_finite("att_LA_SA", att_LA_SA)
        return att_LA_SA

    def calculate_entropy(self, attention_scores):
        """Return ``-sum(p * log(p + 1e-10))`` over all elements as a float32 scalar tensor."""
        attention_scores = attention_scores.to(torch.float32)
        entropy = -torch.sum(attention_scores * torch.log(attention_scores + 1e-10))
        entropy = entropy.to(dtype=torch.float32)
        return entropy

class Qwen2Attention_SLAKE(Qwen2FlashAttention2):
    """
    Qwen2 attention layer that projects the hidden states, applies rotary position embeddings, updates the KV
    cache and hands the attention computation to an `EagerAttention_SLAKE` kernel.
    """
    def __init__(self, config, layer_idx: int, cache_size, window_size, linear_cache_size, alpha, beta):
        super().__init__(config, layer_idx)

        # SLAKE hyper-parameters, mirrored on the layer and forwarded to the kernel.
        self.cache_size = cache_size
        self.window_size = window_size
        self.linear_cache_size = linear_cache_size
        self.alpha = alpha
        self.beta = beta
        self.EagerAttention_LA_SA = EagerAttention_SLAKE(
            cache_size=self.cache_size,
            window_size=self.window_size,
            linear_cache_size=self.linear_cache_size,
            alpha=self.alpha,
            beta=self.beta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
    ):
        """
        Run one attention step.

        Returns a tuple `(attn_output, attn_weights, past_key_value)`; `attn_weights` is whatever the kernel
        returned when `output_attentions` is set and `None` otherwise.
        """
        bsz, q_len, _ = hidden_states.size()

        # Project, then split into heads: (bsz, heads, q_len, head_dim).
        q_proj_out = self.q_proj(hidden_states)
        k_proj_out = self.k_proj(hidden_states)
        v_proj_out = self.v_proj(hidden_states)

        query_states = q_proj_out.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = k_proj_out.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = v_proj_out.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        has_cache = past_key_value is not None

        kv_seq_len = key_states.shape[-2]
        if has_cache:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        # Inputs may be padded, so the absolute length is bounded by the largest position id.
        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if has_cache:
            # Cache slicing is only considered when the config defines `sliding_window`.
            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
            window_configured = getattr(self.config, "sliding_window", None) is not None
            if window_configured and kv_seq_len > self.config.sliding_window and cache_has_contents:
                slicing_tokens = 1 - self.config.sliding_window

                past_key = past_key_value[self.layer_idx][0]
                past_value = past_key_value[self.layer_idx][1]

                # The slices are validated below but not written back to the cache.
                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
                past_value = past_value[:, :, slicing_tokens:, :].contiguous()

                if past_key.shape[-2] != self.config.sliding_window - 1:
                    raise ValueError(
                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
                        f" {past_key.shape}"
                    )

                if attention_mask is not None:
                    trimmed = attention_mask[:, slicing_tokens:]
                    attention_mask = torch.cat([trimmed, torch.ones_like(trimmed[:, -1:])], dim=-1)

            # RoPE models pass their tables along with the cache position.
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand K/V heads when there are fewer KV heads than attention heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        # Computed as in the parent implementation; the SLAKE kernel does not consume it.
        dropout_rate = self.attention_dropout if self.training else 0.0

        # Layer norms are often kept in float32 (e.g. with PEFT), which silently upcasts the hidden
        # states; cast the projections back to the compute dtype before attention.
        if query_states.dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                # Quantized model: use the dtype recorded before quantization.
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output, attn_weights = self.EagerAttention_LA_SA(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            past_key_value,
            self.layer_idx,
        )

        merged = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(merged)

        return attn_output, (attn_weights if output_attentions else None), past_key_value


# Argument documentation passed to `add_start_docstrings_to_model_forward` for `Qwen2_SLAKE_forward`.
# This is a runtime string that ends up in the generated docstring, so its text is left untouched.
QWEN2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance;
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
            of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
def Qwen2_SLAKE_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    # Replacement for the Qwen2 model forward pass that makes sure the KV cache handed to the decoder
    # layers is a `DynamicCache_LA_SA_2`. Arguments are described by `QWEN2_INPUTS_DOCSTRING`.
    # Returns a `BaseModelOutputWithPast`, or a tuple of its non-None fields when `return_dict` is false.
    # Raises ValueError unless exactly one of `input_ids` / `inputs_embeds` is given.
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
        )

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

    if (
        use_cache and not isinstance(past_key_values, Cache) and not self.training
    ):  # kept for BC (non `Cache` `past_key_values` inputs)
        legacy_input_given = past_key_values is not None
        past_key_values = DynamicCache_LA_SA_2.from_legacy_cache(past_key_values, self.config.num_hidden_layers)
        if legacy_input_given:
            # Only warn when a legacy tuple was really passed; previously the message about
            # "passing `past_key_values` as a tuple" was also emitted when nothing was passed.
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
            )
    elif use_cache and not isinstance(past_key_values, DynamicCache_LA_SA_2):
        # Any other cache type is replaced by a fresh SLAKE cache.
        past_key_values = DynamicCache_LA_SA_2(self.config.num_hidden_layers)

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    if cache_position is None:
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        cache_position = torch.arange(
            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
        )
    if position_ids is None:
        position_ids = cache_position.unsqueeze(0)

    causal_mask = self._update_causal_mask(
        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
    )

    hidden_states = inputs_embeds

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = None

    for decoder_layer in self.layers:
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if self.gradient_checkpointing and self.training:
            layer_outputs = self._gradient_checkpointing_func(
                decoder_layer.__call__,
                hidden_states,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache = layer_outputs[2 if output_attentions else 1]

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    # The cache is always returned as the object produced by the layers. The previous code kept a
    # `use_legacy_cache` flag that was never set to True (the BC branch assigned a differently named
    # variable), so the `to_legacy_cache()` conversion was unreachable; the dead branch is removed.
    # NOTE(review): confirm that callers passing legacy tuples should receive the cache object back.
    next_cache = next_decoder_cache if use_cache else None

    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )

# Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask