# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Qwen2-VL model."""

from copy import deepcopy, copy
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn import CrossEntropyLoss, LayerNorm

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    AttentionMaskConverter,
)
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    ModelOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from .configuration_qwen2_vl import Qwen2VLConfig, Qwen2VLVisionConfig

if is_flash_attn_2_available():
    from flash_attn import flash_attn_varlen_func

    from ...modeling_flash_attention_utils import _flash_attention_forward, _upad_input, pad_input
else:
    flash_attn_varlen_func = None
import time

# Module-level logger obtained from the library's logging utilities; used below via `warning_once`.
logger = logging.get_logger(__name__)

# Name of the configuration class as a string.
# NOTE(review): presumably consumed by docstring decorators later in the file — confirm against usage.
_CONFIG_FOR_DOC = "Qwen2VLConfig"


@dataclass
class Qwen2VLCausalLMOutputWithPast(ModelOutput):
    """
    Output container for the Qwen2VL causal language model (autoregressive) head.

    Every field defaults to `None`; which ones are populated depends on the arguments given to the model.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    rope_deltas: Optional[torch.LongTensor] = None


class Qwen2VLRotaryEmbedding(nn.Module):
    """
    Rotary position embedding that produces one cos/sin table per multimodal position axis.

    `forward` expands the inverse frequencies to a leading dimension of 3 and multiplies them with the
    3-row `position_ids`, so the returned `cos`/`sin` carry one slice per axis.

    The module can be configured either from a model `config` (preferred) or, for backward compatibility,
    from the individual keyword arguments (`dim`, `max_position_embeddings`, `base`, `scaling_factor`,
    `rope_type`). The initial `inv_freq` and the attention scaling factor are obtained from
    `ROPE_INIT_FUNCTIONS[rope_type]`.
    """

    def __init__(
        self,
        dim=None,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        rope_type="default",
        config: Optional[Qwen2VLConfig] = None,
    ):
        super().__init__()
        # TODO (joao): remove the `if` below, only used for BC
        self.rope_kwargs = {}
        if config is None:
            logger.warning_once(
                "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the "
                "`config` argument. All other arguments will be removed in v4.46"
            )
            # Legacy path: forward the explicit arguments to the rope init function.
            self.rope_kwargs = {
                "rope_type": rope_type,
                "factor": scaling_factor,
                "dim": dim,
                "base": base,
                "max_position_embeddings": max_position_embeddings,
            }
            self.rope_type = rope_type
            self.max_seq_len_cached = max_position_embeddings
            self.original_max_seq_len = max_position_embeddings
        else:
            # BC: "rope_type" was originally "type"
            if config.rope_scaling is not None:
                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
            else:
                self.rope_type = "default"
            self.max_seq_len_cached = config.max_position_embeddings
            self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Plain attribute (not a registered buffer) holding the frequencies computed at construction time;
        # used to restore `inv_freq` after a dynamic update.
        self.original_inv_freq = self.inv_freq

    def _dynamic_frequency_update(self, position_ids, device):
        """
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        """
        seq_len = torch.max(position_ids) + 1
        if seq_len > self.max_seq_len_cached:  # growth
            inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, device, seq_len=seq_len, **self.rope_kwargs
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
            self.max_seq_len_cached = seq_len

        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
            # `original_inv_freq` is not a registered buffer, so `module.to(...)` does not move it. Move it to
            # the current device explicitly, otherwise the restored `inv_freq` could live on a stale device.
            self.original_inv_freq = self.original_inv_freq.to(device)
            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
            self.max_seq_len_cached = self.original_max_seq_len

    @torch.no_grad()
    def forward(self, x, position_ids):
        """
        Compute the rotary `cos` and `sin` tables for `position_ids`.

        Args:
            x (`torch.Tensor`): Only its `device` and `dtype` are read.
            position_ids (`torch.Tensor`): Position indices with a leading dimension of 3 (one row per axis).

        Returns:
            `Tuple[torch.Tensor, torch.Tensor]`: `cos` and `sin`, scaled by `attention_scaling` and cast to
            `x.dtype`.
        """
        if "dynamic" in self.rope_type:
            self._dynamic_frequency_update(position_ids, device=x.device)

        # Core RoPE block. In contrast to other models, Qwen2_VL has different position ids for thw grids
        # So we expand the inv_freq to shape (3, ...)
        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
        device_type = x.device.type
        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos()
            sin = emb.sin()

        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
        cos = cos * self.attention_scaling
        sin = sin * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Split the last dimension in two halves `(a, b)` and return `(-b, a)` concatenated."""
    split_at = x.shape[-1] // 2
    front, back = x[..., :split_at], x[..., split_at:]
    return torch.cat((-back, front), dim=-1)


def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
    """Rotate query and key tensors with multimodal (sectioned) rotary position embeddings
    (https://qwenlm.github.io/blog/qwen2-vl/).

    Explanation:
        Multimodal 3D rotary position embedding extends the usual 1D rotary position embedding. The input
        sequence may contain vision (image / video) embeddings and text embeddings, or text embeddings only.
        For the vision part, rotary position embedding is applied separately along the temporal, height and
        width axes: the channel dimension is split into sections and each section takes its values from one
        of the three axes. For the text part the three position indices are identical, so the result matches
        ordinary 1D rotary position embedding.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding; indexed along its first dimension with
            `i % 3`, i.e. one slice per axis.
        sin (`torch.Tensor`): The sine part of the rotary embedding, laid out like `cos`.
        mrope_section (`List(int)`):
            Channel widths of the temporal, height and width sections. The list is repeated twice before
            splitting the last dimension of `cos` / `sin`.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into the merged `cos` / `sin` so that they broadcast against `q` and `k`. Use 1
            when `q` and `k` are `[batch_size, heads, seq_len, head_dim]` and 2 when they are
            `[batch_size, seq_len, heads, head_dim]`.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    doubled_sections = mrope_section * 2

    def _merge_axes(table):
        # Chunk `i` of the channel dimension is taken from axis `i % 3` of the table.
        chunks = table.split(doubled_sections, dim=-1)
        selected = [chunk[index % 3] for index, chunk in enumerate(chunks)]
        return torch.cat(selected, dim=-1).unsqueeze(unsqueeze_dim)

    cos = _merge_axes(cos)
    sin = _merge_axes(sin)

    rotated_q = (q * cos) + (rotate_half(q) * sin)
    rotated_k = (k * cos) + (rotate_half(k) * sin)
    return rotated_q, rotated_k


def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """
    Apply rotary position embedding to `tensor` in float32 and return it in its original dtype.

    Args:
        tensor (`torch.Tensor`): Tensor to rotate; callers in this file pass `(1, seq, heads, head_dim)`.
        freqs (`torch.Tensor`): Rotation angles; their cos/sin are repeated twice along the last dimension.
            NOTE(review): assumed to be `(seq, head_dim // 2)` — confirm against the caller.

    Returns:
        `torch.Tensor`: The rotated tensor, cast back to the dtype `tensor` had on entry.
    """
    input_dtype = tensor.dtype
    tensor_fp32 = tensor.float()

    def _broadcastable(table):
        # Insert a singleton dim at index 1, duplicate the last dim, then add a leading dim.
        return table.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()

    cos_table = _broadcastable(freqs.cos())
    sin_table = _broadcastable(freqs.sin())
    rotated = (tensor_fp32 * cos_table) + (rotate_half(tensor_fp32) * sin_table)
    return rotated.to(input_dtype)


class VisionRotaryEmbedding(nn.Module):
    """Produces rotary angles `position * inv_freq` for positions `0 .. seqlen - 1`."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        # One inverse frequency per even channel index: theta ** (-i / dim) for i = 0, 2, 4, ...
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        self.register_buffer("inv_freq", 1.0 / (theta**exponents), persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        """Return the `(seqlen, len(inv_freq))` outer product of positions and inverse frequencies."""
        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        return torch.outer(positions, self.inv_freq)


class PatchEmbed(nn.Module):
    """
    Projects flattened patches to `embed_dim` features with a bias-free 3D convolution whose kernel and
    stride both equal `(temporal_patch_size, patch_size, patch_size)`.
    """

    def __init__(
        self,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        in_channels: int = 3,
        embed_dim: int = 1152,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        # Kernel == stride, so each patch is consumed by exactly one convolution window.
        patch_shape = [temporal_patch_size, patch_size, patch_size]
        self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=patch_shape, stride=patch_shape, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Reshape the input into patches, cast to the projection dtype and return `(num_patches, embed_dim)`."""
        patches = hidden_states.view(
            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
        )
        patches = patches.to(dtype=self.proj.weight.dtype)
        projected = self.proj(patches)
        return projected.view(-1, self.embed_dim)


class PatchMerger(nn.Module):
    """
    Layer-normalizes `context_dim` features, regroups `spatial_merge_size**2` of them into one row and maps
    that row to `dim` features with a Linear -> GELU -> Linear stack.
    """

    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
        super().__init__()
        merged_width = context_dim * (spatial_merge_size**2)
        self.hidden_size = merged_width
        self.ln_q = LayerNorm(context_dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(merged_width, merged_width),
            nn.GELU(),
            nn.Linear(merged_width, dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize `x`, view it as rows of `hidden_size` features and run the MLP."""
        normalized = self.ln_q(x)
        merged = normalized.view(-1, self.hidden_size)
        return self.mlp(merged)


class VisionMlp(nn.Module):
    """Two-layer feed-forward block: Linear(dim -> hidden_dim), activation, Linear(hidden_dim -> dim)."""

    def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden_dim)
        # Activation is looked up by name in the library's activation registry.
        self.act = ACT2FN[hidden_act]
        self.fc2 = nn.Linear(hidden_dim, dim)

    def forward(self, x) -> torch.Tensor:
        """Return `fc2(act(fc1(x)))`."""
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        return self.fc2(hidden)


class VisionAttention(nn.Module):
    """
    Eager (matmul + softmax) self-attention over a packed sequence.

    `cu_seqlens` holds cumulative boundaries; tokens only attend to tokens inside the same
    `[cu_seqlens[i - 1], cu_seqlens[i])` segment.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.Tensor`): Packed tokens; the first dimension is the sequence length.
            cu_seqlens (`torch.Tensor`): Cumulative segment boundaries used to build the block-diagonal mask.
            rotary_pos_emb (`torch.Tensor`): Rotary angles applied to queries and keys.

        Returns:
            `torch.Tensor`: Attention output of shape `(seq_length, -1)` after the output projection.
        """
        seq_length = hidden_states.shape[0]
        fused_qkv = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused_qkv.permute(1, 0, 2, 3).unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Start fully masked (dtype minimum everywhere), then open one square block per segment.
        attention_mask = torch.full(
            [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
        )
        for seg_start, seg_end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            attention_mask[..., seg_start:seg_end, seg_start:seg_end] = 0

        # (seq, heads, head_dim) -> (heads, seq, head_dim)
        q, k, v = (t.transpose(0, 1) for t in (q, k, v))
        scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
        scores = scores + attention_mask
        # Softmax is computed in float32 and cast back to the query dtype.
        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)
        context = torch.matmul(probs, v).transpose(0, 1).reshape(seq_length, -1)
        return self.proj(context)


class VisionFlashAttention2(nn.Module):
    """
    Self-attention over a packed sequence computed with `flash_attn_varlen_func`.

    Segment boundaries are passed to the kernel through `cu_seqlens`, so no explicit mask is built.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.Tensor`): Packed tokens; the first dimension is the sequence length.
            cu_seqlens (`torch.Tensor`): Cumulative segment boundaries, used for both queries and keys.
            rotary_pos_emb (`torch.Tensor`): Rotary angles applied to queries and keys.

        Returns:
            `torch.Tensor`: Attention output of shape `(seq_length, -1)` after the output projection.
        """
        seq_length = hidden_states.shape[0]
        fused_qkv = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused_qkv.permute(1, 0, 2, 3).unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Longest segment length, as a Python number, required by the varlen kernel.
        segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
        max_seqlen = segment_lengths.max().item()
        context = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen)
        context = context.reshape(seq_length, -1)
        return self.proj(context)


class VisionSdpaAttention(nn.Module):
    """
    Self-attention over a packed sequence computed with `torch.nn.functional.scaled_dot_product_attention`.

    A boolean block-diagonal mask restricts attention to tokens inside the same `cu_seqlens` segment.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)

    def forward(
        self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.Tensor`): Packed tokens; the first dimension is the sequence length.
            cu_seqlens (`torch.Tensor`): Cumulative segment boundaries used to build the boolean mask.
            rotary_pos_emb (`torch.Tensor`): Rotary angles applied to queries and keys.

        Returns:
            `torch.Tensor`: Attention output of shape `(seq_length, -1)` after the output projection.
        """
        seq_length = hidden_states.shape[0]
        fused_qkv = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused_qkv.permute(1, 0, 2, 3).unbind(0)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # True marks positions that may be attended to; everything outside the segment blocks stays False.
        attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
        for seg_start, seg_end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            attention_mask[..., seg_start:seg_end, seg_start:seg_end] = True

        # (seq, heads, head_dim) -> (heads, seq, head_dim)
        q, k, v = (t.transpose(0, 1) for t in (q, k, v))
        context = F.scaled_dot_product_attention(q, k, v, attn_mask=attention_mask, dropout_p=0.0)
        context = context.transpose(0, 1).reshape(seq_length, -1)
        return self.proj(context)


# The "eager" and "moba" settings run the vision tower with the flash-attention kernel when the optional
# `flash_attn` package is importable (`flash_attn_varlen_func` is then not None). Without it,
# `VisionFlashAttention2.forward` would call `None`, so fall back to the pure-PyTorch `VisionAttention`.
_VISION_DEFAULT_ATTENTION = VisionFlashAttention2 if flash_attn_varlen_func is not None else VisionAttention

# Maps an attention-implementation name to the vision attention class used by `Qwen2VLVisionBlock`.
QWEN2_VL_VISION_ATTENTION_CLASSES = {
    "eager": _VISION_DEFAULT_ATTENTION,
    "flash_attention_2": VisionFlashAttention2,
    "sdpa": VisionSdpaAttention,
    "moba": _VISION_DEFAULT_ATTENTION,
}


class Qwen2VLVisionBlock(nn.Module):
    """
    Pre-norm vision transformer block: `x + attn(norm1(x))` followed by `x + mlp(norm2(x))`.

    The attention class is chosen from `QWEN2_VL_VISION_ATTENTION_CLASSES` by `attn_implementation`.
    """

    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
        super().__init__()
        self.norm1 = LayerNorm(config.embed_dim, eps=1e-6)
        self.norm2 = LayerNorm(config.embed_dim, eps=1e-6)
        # Feed-forward width is the embedding width scaled by `config.mlp_ratio`, truncated to an int.
        ffn_width = int(config.embed_dim * config.mlp_ratio)

        attention_cls = QWEN2_VL_VISION_ATTENTION_CLASSES[attn_implementation]
        self.attn = attention_cls(config.embed_dim, num_heads=config.num_heads)
        self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=ffn_width, hidden_act=config.hidden_act)

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
        """Apply the attention and feed-forward sub-layers, each with a residual connection."""
        attn_out = self.attn(self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
        hidden_states = hidden_states + attn_out
        mlp_out = self.mlp(self.norm2(hidden_states))
        return hidden_states + mlp_out


# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm
class Qwen2RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable per-channel scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Qwen2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize in float32, cast back to the input dtype, then multiply by `weight`."""
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        weight_shape = tuple(self.weight.shape)
        return f"{weight_shape}, eps={self.variance_epsilon}"


# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2MLP
class Qwen2MLP(nn.Module):
    """Gated feed-forward block: `down_proj(act(gate_proj(x)) * up_proj(x))`, all projections bias-free."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        # Activation is looked up by name in the library's activation registry.
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_state):
        """Gate the up-projection with the activated gate projection, then project back down."""
        gate = self.act_fn(self.gate_proj(hidden_state))
        up = self.up_proj(hidden_state)
        return self.down_proj(gate * up)


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head dimension, matching
    torch.repeat_interleave(x, dim=1, repeats=n_rep). The input goes from (batch, num_key_value_heads,
    seqlen, head_dim) to (batch, num_key_value_heads * n_rep, seqlen, head_dim). With `n_rep == 1` the
    input tensor is returned as is.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)


class Qwen2VLAttention(nn.Module):
    """
    Eager multi-headed attention with multimodal rotary position embeddings.

    Queries, keys and values are linear projections of the hidden states; keys/values use
    `num_key_value_heads` heads and are repeated to `num_attention_heads` with `repeat_kv`. An optional
    `custom_kv` cache can be supplied at call time; its per-layer keys/values are prepended to the
    sequence as an always-visible prefix.
    """

    def __init__(self, config: Qwen2VLConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.rope_scaling = config.rope_scaling

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        # Built through the legacy (config-less) constructor path; `forward` below takes cos/sin from
        # `position_embeddings` instead of calling this module.
        self.rotary_emb = Qwen2VLRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        custom_kv: Optional[Cache] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Args:
            hidden_states (`torch.Tensor`): Input of shape `(batch, q_len, hidden_size)`.
            attention_mask (`torch.Tensor`, *optional*): Additive 4D mask; it is sliced on its last dimension
                to the key length and added to the attention logits.
            position_ids (`torch.LongTensor`, *optional*): Unused by this implementation.
            past_key_value (`Cache`, *optional*): Cache updated with this layer's keys/values.
            output_attentions (`bool`): Whether to return the attention probabilities.
            use_cache (`bool`): Unused by this implementation.
            cache_position (`torch.LongTensor`, *optional*): Forwarded to the cache update.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`): `(cos, sin)` rotary tables; required.
            custom_kv (`Cache`, *optional*): Extra cache whose `key_cache[layer_idx]` / `value_cache[layer_idx]`
                are expanded over the batch and prepended to the keys/values.

        Returns:
            `(attn_output, attn_weights or None, past_key_value)`.

        Raises:
            ValueError: If the attention output does not have shape `(batch, num_heads, q_len, head_dim)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        if custom_kv is not None:
            # Prepend the custom prefix after the regular cache update, so the prefix is never written
            # into `past_key_value`.
            custom_k = custom_kv.key_cache[self.layer_idx].expand(bsz, -1, -1, -1)
            custom_v = custom_kv.value_cache[self.layer_idx].expand(bsz, -1, -1, -1)
            key_states = torch.cat([custom_k, key_states], dim=-2).contiguous()
            value_states = torch.cat([custom_v, value_states], dim=-2).contiguous()
            if attention_mask is not None:
                # Zero additive bias => every query may attend to every prefix position. Created with the
                # mask's own leading dims, dtype and device so the concatenation is always well-formed.
                prefix_attn = attention_mask.new_zeros((*attention_mask.shape[:-1], custom_k.shape[-2]))
                attention_mask = torch.cat([prefix_attn, attention_mask], dim=-1)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # Fix precision issues in Qwen2-VL float16 inference: logits can overflow to +/-inf, which turns
        # into NaN after softmax when a whole row is -inf. Clamp to the finite range instead of zeroing:
        # a zero logit would un-mask masked positions and would demote a +inf (dominant) logit.
        if query_states.dtype == torch.float16:
            finite_range = torch.finfo(attn_weights.dtype)
            attn_weights = torch.clamp(attn_weights, min=finite_range.min, max=finite_range.max)

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, -1)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights, past_key_value


class Qwen2VLFlashAttention2(Qwen2VLAttention):
    """
    Qwen2VL attention module backed by the flash-attention kernel.

    The projections and weights are inherited unchanged from `Qwen2VLAttention`; only the forward pass differs, as it
    hands the projected states to `_flash_attention_forward`. A sliding window is forwarded to the kernel only when
    `config.use_sliding_window` is set, `config.sliding_window` is not None and
    `self.layer_idx >= config.max_window_layers`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        custom_kv: Optional[Cache] = None,
    ):
        """
        Compute self-attention with the flash-attention kernel.

        Args:
            hidden_states: input of shape `(batch, q_len, hidden)`; it is unpacked into exactly these three dims.
            attention_mask: passed through to `_flash_attention_forward` unchanged.
            position_ids: only forwarded to the eager fallback.
            past_key_value: optional cache; when given, `update(...)` is called for `self.layer_idx` and its
                returned key/value states are used.
            output_attentions: when True, the call is delegated to the parent (manual) implementation because this
                path never materialises attention weights.
            use_cache: only forwarded to the eager fallback.
            cache_position: forwarded to the cache update and to the eager fallback.
            position_embeddings: `(cos, sin)` pair consumed by `apply_multimodal_rotary_pos_emb`.
            custom_kv: optional object exposing per-layer `key_cache` / `value_cache`; the entries for
                `self.layer_idx` are prepended to the keys/values along the sequence dimension.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is always `None` on the flash path.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "Qwen2VLModel is using Qwen2VLFlashAttention2, but flash attention does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            # Forward `custom_kv` as well, otherwise the fallback would attend without the injected prefix.
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                custom_kv=custom_kv,
            )
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
        if custom_kv is not None:
            # Broadcast the stored prefix over the batch (as the SDPA module does) so a batch-1 prefix can be
            # shared by every sequence; this is a no-op when the batch sizes already match.
            prefix_k = custom_kv.key_cache[self.layer_idx].expand(bsz, -1, -1, -1)
            prefix_v = custom_kv.value_cache[self.layer_idx].expand(bsz, -1, -1, -1)
            key_states = torch.cat([prefix_k, key_states], dim=-2).contiguous()
            value_states = torch.cat([prefix_v, value_states], dim=-2).contiguous()
            # NOTE(review): unlike the SDPA module, `attention_mask` is not extended to cover the prefix here —
            # confirm that padded batches behave as intended on this path.
        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the layout expected by Flash Attention: (batch, seq, heads, head_dim)
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window
        else:
            sliding_window = None

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            dropout=dropout_rate,
            sliding_window=sliding_window,
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        # `output_attentions=True` returned early above, so no attention weights exist on this path.
        return attn_output, None, past_key_value

# class Qwen2VLSdpaAttention(Qwen2VLAttention):
#     """
#     Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
#     `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
#     SDPA API.
#     """

#     # Adapted from Qwen2Attention.forward
#     def forward(
#         self,
#         hidden_states: torch.Tensor,
#         attention_mask: Optional[torch.Tensor] = None,
#         position_ids: Optional[torch.LongTensor] = None,
#         past_key_value: Optional[Cache] = None,
#         output_attentions: bool = False,
#         use_cache: bool = False,
#         cache_position: Optional[torch.LongTensor] = None,
#         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
#         custom_kv: Optional[Cache] = None,
#     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

#         bsz, q_len, _ = hidden_states.size()

#         query_states = self.q_proj(hidden_states)
#         key_states = self.k_proj(hidden_states)
#         value_states = self.v_proj(hidden_states)

#         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
#         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
#         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

#         cos, sin = position_embeddings
#         query_states, key_states = apply_multimodal_rotary_pos_emb(
#             query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
#         )

#         if past_key_value is not None:
#             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
#             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
#         if custom_kv is not None:
#             bs = query_states.shape[0]
#             custom_k = custom_kv.key_cache[self.layer_idx].expand(bs, -1, -1, -1)
#             custom_v = custom_kv.value_cache[self.layer_idx].expand(bs, -1, -1, -1)
#             key_states = torch.cat([custom_k, key_states], dim=-2).contiguous()
#             value_states = torch.cat([custom_v, value_states], dim=-2).contiguous()
#             prefix_attn = torch.zeros((bs, 1, attention_mask.shape[-2], custom_k.shape[-2])).to(attention_mask.device).type_as(attention_mask)
#             pre_len = custom_kv.pre_lens[self.layer_idx] if len(custom_kv.pre_lens)>0 else 0
#             attention_mask = torch.cat([prefix_attn, attention_mask], dim=-1)
#             # implement batch decode attn masks for left padding
#         else:
#             pre_len = None
        
#         # bs, n_k, seq_len, head_dim = key_states[:,:,pre_len:].shape
#         key_states = repeat_kv(key_states, self.num_key_value_groups)
#         value_states = repeat_kv(value_states, self.num_key_value_groups)
#         if output_attentions:
#             attn_weights = torch.matmul(query_states, key_states[:,:,pre_len:].transpose(2, 3)) / math.sqrt(self.head_dim)
#             if attention_mask is not None:  # no matter the length, we just slice it
#                 causal_mask = attention_mask[:, :, :, pre_len: key_states.shape[-2]]
#                 attn_weights = attn_weights + causal_mask
#             if query_states.dtype == torch.float16:
#                 attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)
#             attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.bfloat16)#.view(bs, n_k, self.num_key_value_groups, -1, seq_len).sum(dim=-3)
        
#         causal_mask = attention_mask
#         if attention_mask is not None:  # no matter the length, we just slice it
#             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

#         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
#         # Reference: https://github.com/pytorch/pytorch/issues/112577.
#         if query_states.device.type == "cuda" and attention_mask is not None:
#             query_states = query_states.contiguous()
#             key_states = key_states.contiguous()
#             value_states = value_states.contiguous()

#         # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
#         # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
#         # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
#         is_causal = True if causal_mask is None and q_len > 1 else False

#         attn_output = torch.nn.functional.scaled_dot_product_attention(
#             query_states,
#             key_states,
#             value_states,
#             attn_mask=causal_mask,
#             dropout_p=self.attention_dropout if self.training else 0.0,
#             is_causal=is_causal,
#         )

#         attn_output = attn_output.transpose(1, 2).contiguous()
#         attn_output = attn_output.view(bsz, q_len, self.hidden_size)

#         attn_output = self.o_proj(attn_output)
#         if output_attentions:
#             return attn_output, attn_weights, past_key_value
#         return attn_output, None, past_key_value 
    
# class Qwen2VLSdpaAttention(Qwen2VLAttention):
#     """
#     Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
#     `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
#     SDPA API.
#     """

#     # Adapted from Qwen2Attention.forward
#     def forward(
#         self,
#         hidden_states: torch.Tensor,
#         attention_mask: Optional[torch.Tensor] = None,
#         position_ids: Optional[torch.LongTensor] = None,
#         past_key_value: Optional[Cache] = None,
#         output_attentions: bool = False,
#         use_cache: bool = False,
#         cache_position: Optional[torch.LongTensor] = None,
#         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
#         custom_kv: Optional[Cache] = None,
#     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
#         # if output_attentions:
#         #     # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
#         #     logger.warning_once(
#         #         "Qwen2VLModel is using Qwen2VLSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
#         #         'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
#         #     )
#         #     return super().forward(
#         #         hidden_states=hidden_states,
#         #         attention_mask=attention_mask,
#         #         position_ids=position_ids,
#         #         past_key_value=past_key_value,
#         #         output_attentions=output_attentions,
#         #         use_cache=use_cache,
#         #         cache_position=cache_position,
#         #         position_embeddings=position_embeddings,
#         #         custom_kv=custom_kv,
#         #     )

#         bsz, q_len, _ = hidden_states.size()

#         query_states = self.q_proj(hidden_states)
#         key_states = self.k_proj(hidden_states)
#         value_states = self.v_proj(hidden_states)

#         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
#         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
#         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

#         cos, sin = position_embeddings
#         query_states, key_states = apply_multimodal_rotary_pos_emb(
#             query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
#         )

#         if past_key_value is not None:
#             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
#             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
#         if custom_kv is not None:
#             bs = query_states.shape[0]
#             custom_k = custom_kv.key_cache[self.layer_idx].expand(bs, -1, -1, -1)
#             custom_v = custom_kv.value_cache[self.layer_idx].expand(bs, -1, -1, -1)
#             key_states = torch.cat([custom_k, key_states], dim=-2).contiguous()
#             value_states = torch.cat([custom_v, value_states], dim=-2).contiguous()
#             prefix_attn = torch.zeros((bs, 1, attention_mask.shape[-2], custom_k.shape[-2])).to(attention_mask.device).type_as(attention_mask)
#             pre_len = custom_kv.pre_lens[self.layer_idx] if len(custom_kv.pre_lens)>0 else 0
#             attention_mask = torch.cat([prefix_attn, attention_mask], dim=-1)
#             # implement batch decode attn masks for left padding
#         else:
#             pre_len = None
        
#         # bs, n_k, seq_len, head_dim = key_states[:,:,pre_len:].shape
#         key_states = repeat_kv(key_states, self.num_key_value_groups)
#         value_states = repeat_kv(value_states, self.num_key_value_groups)
#         if output_attentions:
#             attn_weights = torch.matmul(query_states, key_states[:,:,pre_len:].transpose(2, 3)) / math.sqrt(self.head_dim)
#             if attention_mask is not None:  # no matter the length, we just slice it
#                 causal_mask = attention_mask[:, :, :, pre_len: key_states.shape[-2]]
#                 attn_weights = attn_weights + causal_mask
#             if query_states.dtype == torch.float16:
#                 attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)
#             attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.bfloat16)#.view(bs, n_k, self.num_key_value_groups, -1, seq_len).sum(dim=-3)
        
#         causal_mask = attention_mask
#         if attention_mask is not None:  # no matter the length, we just slice it
#             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

#         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
#         # Reference: https://github.com/pytorch/pytorch/issues/112577.
#         if query_states.device.type == "cuda" and attention_mask is not None:
#             query_states = query_states.contiguous()
#             key_states = key_states.contiguous()
#             value_states = value_states.contiguous()

#         # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
#         # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
#         # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
#         is_causal = True if causal_mask is None and q_len > 1 else False

#         attn_output = torch.nn.functional.scaled_dot_product_attention(
#             query_states,
#             key_states,
#             value_states,
#             attn_mask=causal_mask,
#             dropout_p=self.attention_dropout if self.training else 0.0,
#             is_causal=is_causal,
#         )

#         attn_output = attn_output.transpose(1, 2).contiguous()
#         attn_output = attn_output.view(bsz, q_len, self.hidden_size)

#         attn_output = self.o_proj(attn_output)
#         if output_attentions:
#             return attn_output, attn_weights, past_key_value
#         return attn_output, None, past_key_value 

class Qwen2VLSdpaAttention(Qwen2VLAttention):
    """
    Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Qwen2VLAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt
    to the SDPA API and to optionally prepend an externally supplied key/value prefix (`custom_kv`).
    """

    # Adapted from Qwen2Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        custom_kv: Optional[Cache] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Compute self-attention through `torch.nn.functional.scaled_dot_product_attention`.

        Args:
            hidden_states: input of shape `(batch, q_len, hidden)`; it is unpacked into exactly these three dims.
            attention_mask: optional 4D mask (it is indexed along four dims) that is added to the attention scores
                and handed to SDPA as `attn_mask`. `None` means no explicit mask.
            position_ids: unused in this method.
            past_key_value: optional cache; when given, `update(...)` is called for `self.layer_idx` and its
                returned key/value states are used.
            output_attentions: when True, attention probabilities are additionally computed with an explicit
                matmul + softmax and returned; the attention output itself always comes from SDPA.
            use_cache: unused in this method.
            cache_position: forwarded to the cache update.
            position_embeddings: `(cos, sin)` pair consumed by `apply_multimodal_rotary_pos_emb`.
            custom_kv: optional object exposing per-layer `key_cache`, `value_cache` and `pre_lens`. The cached
                keys/values for `self.layer_idx` are broadcast over the batch and prepended to the keys/values.
                `pre_lens[self.layer_idx]` (0 when `pre_lens` is empty) is the number of leading key positions left
                out of the returned attention weights.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless `output_attentions`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_multimodal_rotary_pos_emb(
            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
        )

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # `None` slices from the start, i.e. nothing is dropped when there is no prefix.
        pre_len = None
        if custom_kv is not None:
            if attention_mask is None and q_len > 1:
                # Without a mask the SDPA call below would use `is_causal=True`. Once a prefix is prepended the
                # score matrix is no longer square and that built-in causal mask is aligned top-left, which would
                # hide the most recent keys. Build the bottom-right aligned additive causal mask explicitly.
                kv_len = key_states.shape[-2]
                device = query_states.device
                query_pos = torch.arange(kv_len - q_len, kv_len, device=device).unsqueeze(-1)
                key_pos = torch.arange(kv_len, device=device)
                attention_mask = torch.zeros((q_len, kv_len), dtype=query_states.dtype, device=device)
                attention_mask = attention_mask.masked_fill(
                    key_pos > query_pos, torch.finfo(query_states.dtype).min
                )
                attention_mask = attention_mask[None, None].expand(bsz, 1, -1, -1)

            custom_k = custom_kv.key_cache[self.layer_idx].expand(bsz, -1, -1, -1)
            custom_v = custom_kv.value_cache[self.layer_idx].expand(bsz, -1, -1, -1)
            key_states = torch.cat([custom_k, key_states], dim=-2).contiguous()
            value_states = torch.cat([custom_v, value_states], dim=-2).contiguous()

            if attention_mask is not None:
                # Zero bias over the prefix columns leaves every prefix position visible to every query. Allocate it
                # directly with the mask's dtype/device instead of on CPU followed by a transfer and a cast.
                prefix_attn = attention_mask.new_zeros((bsz, 1, attention_mask.shape[-2], custom_k.shape[-2]))
                attention_mask = torch.cat([prefix_attn, attention_mask], dim=-1)
            # With `q_len == 1` and no mask nothing is needed: the single query may see every key.

            pre_len = custom_kv.pre_lens[self.layer_idx] if len(custom_kv.pre_lens) > 0 else 0

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = None
        if output_attentions:
            # SDPA does not expose probabilities, so recompute them manually over the keys from `pre_len` onwards.
            attn_weights = torch.matmul(query_states, key_states[:, :, pre_len:].transpose(2, 3)) / math.sqrt(
                self.head_dim
            )
            if attention_mask is not None:  # no matter the length, we just slice it
                attn_weights = attn_weights + attention_mask[:, :, :, pre_len : key_states.shape[-2]]
            if query_states.dtype == torch.float16:
                # Infinite scores are replaced by zero before the softmax (kept as in the float16 workaround
                # used by the eager path).
                attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)
            # The returned probabilities are computed in bfloat16.
            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.bfloat16)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)
        # `attn_weights` is still None here unless `output_attentions` was requested.
        return attn_output, attn_weights, past_key_value

# Registry of attention implementations. `Qwen2VLDecoderLayer` indexes this mapping with
# `config._attn_implementation` to pick the attention module class it instantiates.
QWEN2_VL_ATTENTION_CLASSES = {
    "eager": Qwen2VLAttention,
    "flash_attention_2": Qwen2VLFlashAttention2,
    "sdpa": Qwen2VLSdpaAttention,
}


class Qwen2VLDecoderLayer(nn.Module):
    """
    A single decoder layer: RMS-normed self-attention followed by an RMS-normed MLP, each wrapped in a residual
    connection. The attention module class is looked up in `QWEN2_VL_ATTENTION_CLASSES` by
    `config._attn_implementation`.
    """

    def __init__(self, config: Qwen2VLConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # `Qwen2VLFlashAttention2` forwards `sliding_window` to its kernel, whereas the SDPA module in this file has
        # no sliding-window handling of its own. The warning therefore belongs to the implementations *outside*
        # this list, not to the ones in it.
        # NOTE(review): "moba" is kept grouped with flash attention as in the original list, but it has no entry
        # in `QWEN2_VL_ATTENTION_CLASSES` — confirm whether it should be listed at all.
        if config.use_sliding_window and config._attn_implementation not in ("flash_attention_2", "moba"):
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )
        self.self_attn = QWEN2_VL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        self.mlp = Qwen2MLP(config)
        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        custom_kv: Optional[Tuple[torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            position_ids (`torch.LongTensor`, *optional*):
                Forwarded unchanged to the attention module.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            custom_kv (*optional*):
                Extra key/value prefix container, forwarded unchanged to the attention module.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model

        Returns:
            A tuple starting with the layer output, followed by the attention weights when `output_attentions` is
            set, followed by the present key/value cache when `use_cache` is set.
        """

        # Self Attention (pre-norm, residual added afterwards)
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            custom_kv=custom_kv,
        )
        hidden_states = residual + hidden_states

        # Fully Connected (pre-norm, residual added afterwards)
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


# Shared docstring preamble; it is passed to `add_start_docstrings` when decorating `Qwen2VLPreTrainedModel`.
QWEN2VL_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Qwen2VLConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Qwen2VL Model outputting raw hidden-states without any specific head on top.",
    QWEN2VL_START_DOCSTRING,
)
class Qwen2VLPreTrainedModel(PreTrainedModel):
    # Class-level settings read by the `PreTrainedModel` machinery. No class docstring is written here on
    # purpose: the decorator above supplies it.
    config_class = Qwen2VLConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        """
        Initialise the parameters of a single sub-module in place.

        Weights of `nn.Linear`, `nn.Conv3d` and `nn.Embedding` modules are drawn from a normal distribution with
        mean 0 and standard deviation `config.initializer_range`. Biases (when present) and the embedding row at
        `padding_idx` (when set) are zeroed. Any other module type is left untouched.
        """
        init_std = self.config.initializer_range

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
            return

        if not isinstance(module, (nn.Linear, nn.Conv3d)):
            return

        module.weight.data.normal_(mean=0.0, std=init_std)
        if module.bias is not None:
            module.bias.data.zero_()


class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
    """
    Vision tower: a patch embedding, a rotary position embedding, `config.depth` vision blocks and a patch merger.
    """

    config_class = Qwen2VLVisionConfig
    _no_split_modules = ["Qwen2VLVisionBlock"]

    def __init__(self, config) -> None:
        super().__init__(config)
        self.spatial_merge_size = config.spatial_merge_size

        self.patch_embed = PatchEmbed(
            patch_size=config.patch_size,
            temporal_patch_size=config.temporal_patch_size,
            in_channels=config.in_channels,
            embed_dim=config.embed_dim,
        )

        # The rotary embedding is built with half of the per-head dimension.
        per_head_dim = config.embed_dim // config.num_heads
        self.rotary_pos_emb = VisionRotaryEmbedding(per_head_dim // 2)

        vision_blocks = []
        for _ in range(config.depth):
            vision_blocks.append(Qwen2VLVisionBlock(config, config._attn_implementation))
        self.blocks = nn.ModuleList(vision_blocks)

        self.merger = PatchMerger(
            dim=config.hidden_size,
            context_dim=config.embed_dim,
            spatial_merge_size=config.spatial_merge_size,
        )
        self.gradient_checkpointing = False

    def get_dtype(self) -> torch.dtype:
        """Return the dtype of the first block's `mlp.fc2` weight."""
        reference_weight = self.blocks[0].mlp.fc2.weight
        return reference_weight.dtype

    def get_device(self) -> torch.device:
        """Return the device of the first block's `mlp.fc2` weight."""
        reference_weight = self.blocks[0].mlp.fc2.weight
        return reference_weight.device

    def rot_pos_emb(self, grid_thw):
        """
        Look up the rotary embedding of every patch described by `grid_thw`.

        For each `(t, h, w)` row, a (row index, column index) pair is produced for every cell of the `h x w` grid.
        The pairs are ordered so that the cells of one `spatial_merge_size x spatial_merge_size` window are
        contiguous, and the whole list is repeated `t` times.

        Args:
            grid_thw (`torch.Tensor`):
                One `(t, h, w)` row per image or video.

        Returns:
            `torch.Tensor`: the rows of the rotary table selected by the position pairs, flattened from dim 1 on.
        """
        merge = self.spatial_merge_size
        ids_per_grid = []
        for t, h, w in grid_thw:
            row_index = torch.arange(h).unsqueeze(1).expand(-1, w)
            col_index = torch.arange(w).unsqueeze(0).expand(h, -1)
            # Regroup each index plane window by window before flattening it.
            window_ordered = [
                plane.reshape(h // merge, merge, w // merge, merge).permute(0, 2, 1, 3).flatten()
                for plane in (row_index, col_index)
            ]
            ids_per_grid.append(torch.stack(window_ordered, dim=-1).repeat(t, 1))
        pos_ids = torch.cat(ids_per_grid, dim=0)

        # The table only needs to cover the largest height/width seen in this call.
        rotary_table = self.rotary_pos_emb(grid_thw[:, 1:].max())
        return rotary_table[pos_ids].flatten(1)

    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
        """
        Encode patches and merge them.

        Args:
            hidden_states (`torch.Tensor`):
                Input handed to the patch embedding.
            grid_thw (`torch.Tensor`):
                One `(t, h, w)` row per image or video.

        Returns:
            `torch.Tensor`: the output of the patch merger applied to the last block's hidden states.
        """
        hidden_states = self.patch_embed(hidden_states)
        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        # Select dtype based on the following factors:
        #  - FA2 requires that cu_seqlens_q must have dtype int32
        #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
        # See https://github.com/huggingface/transformers/pull/34852 for more information
        seqlens_dtype = grid_thw.dtype if torch.jit.is_tracing() else torch.int32
        # One entry of length h * w per temporal step, accumulated and prefixed with 0.
        frame_lengths = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
        cu_seqlens = frame_lengths.cumsum(dim=0, dtype=seqlens_dtype)
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        checkpoint_blocks = self.gradient_checkpointing and self.training
        for blk in self.blocks:
            if checkpoint_blocks:
                hidden_states = self._gradient_checkpointing_func(
                    blk.__call__, hidden_states, cu_seqlens, rotary_pos_emb
                )
            else:
                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)

        return self.merger(hidden_states)


@add_start_docstrings(
    "The bare Qwen2VL Model outputting raw hidden-states without any specific head on top.",
    QWEN2VL_START_DOCSTRING,
)
class Qwen2VLModel(Qwen2VLPreTrainedModel):
    def __init__(self, config: Qwen2VLConfig):
        """
        Build the decoder: token embedding, `config.num_hidden_layers` decoder layers, final RMS norm and the
        rotary embedding module.

        Args:
            config ([`Qwen2VLConfig`]):
                Model configuration; sizes, padding id and attention implementation are read from it.
        """
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        # The padding index is handed to nn.Embedding as its `padding_idx`.
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        # Each layer receives its own index in the stack.
        self.layers = nn.ModuleList(
            [Qwen2VLDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self._attn_implementation = config._attn_implementation
        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Qwen2VLRotaryEmbedding(config=config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module (`self.embed_tokens`)."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module (`self.embed_tokens`) with `value`."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset: Optional[torch.Tensor] = None,
        start_layer_num: int = 0,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Run the decoder layers over the given tokens or embeddings.

        Args:
            input_ids (`torch.LongTensor`, *optional*):
                Token ids; embedded with `self.embed_tokens`. Exactly one of `input_ids` / `inputs_embeds` must be
                given.
            attention_mask (`torch.Tensor`, *optional*):
                Mask handed to `_update_causal_mask`.
            position_ids (`torch.LongTensor`, *optional*):
                Position ids. When omitted they are derived from `cache_position` and broadcast to a leading
                dimension of 3 (temporal, height, width). A 2D tensor is broadcast to that 3D layout as well.
            past_key_values (*optional*):
                Cache object. A `DynamicCache` is created when `use_cache` is set, none is given and the model is
                not being traced.
            inputs_embeds (`torch.FloatTensor`, *optional*):
                Pre-computed embeddings used instead of `input_ids`.
            use_cache, output_attentions, output_hidden_states, return_dict (`bool`, *optional*):
                Fall back to the corresponding `config` values when `None`.
            cache_position (`torch.LongTensor`, *optional*):
                Positions of the current tokens; defaults to a range starting at the cached sequence length.
            custom_kv (*optional*):
                Forwarded unchanged to every decoder layer.
            custom_kv_pos_offset (`torch.Tensor`, *optional*):
                Added to `position_ids` when `custom_kv` is given. Ignored when `None`.
            start_layer_num (`int`, defaults to 0):
                Index of the first decoder layer to run; earlier layers are skipped.

        Returns:
            `BaseModelOutputWithPast`, or a tuple of its non-`None` fields when `return_dict` is false.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds` are provided.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        # torch.jit.trace() doesn't support cache objects in the output
        if use_cache and past_key_values is None and not torch.jit.is_tracing():
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        # the hard coded `3` is for temporal, height and width.
        if position_ids is None:
            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
        elif position_ids.dim() == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)

        # Shift the positions only when an offset was actually supplied; `custom_kv` without an offset used to
        # fail with a TypeError on `position_ids + None`.
        if custom_kv is not None and custom_kv_pos_offset is not None:
            position_ids = position_ids + custom_kv_pos_offset

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers[start_layer_num:]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Checkpointing passes everything positionally, so the order must match the layer's signature.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                    custom_kv,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    custom_kv=custom_kv,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache sits after the attention weights when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
    
    # Adapted from transformers.models.phi3.modeling_phi3.Phi3Model._update_causal_mask
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        """
        Pick the attention mask handed to the decoder layers for the configured attention implementation.

        - `flash_attention_2`: the caller's mask is returned only if it contains a zero, otherwise `None`.
        - `eager`: the caller's mask is returned untouched.
        - anything else (SDPA): a 4D causal mask is built with
          `_prepare_4d_causal_attention_mask_with_cache_position`.

        Args:
            attention_mask (`torch.Tensor`):
                Mask provided by the caller; may be `None`.
            input_tensor (`torch.Tensor`):
                Input embeddings; supplies dtype, device, batch size and sequence length.
            cache_position (`torch.Tensor`):
                Positions of the current tokens in the sequence.
            past_key_values (`Cache`):
                Cache in use; may be `None`.
            output_attentions (`bool`):
                Whether attention weights were requested.

        Returns:
            The mask to forward to the decoder layers, or `None`.
        """
        attn_impl = self.config._attn_implementation

        if attn_impl == "flash_attention_2":
            contains_masked_token = attention_mask is not None and 0.0 in attention_mask
            return attention_mask if contains_masked_token else None

        # NOTE(review): no 4D expansion happens for eager; presumably the eager attention used with this model
        # consumes the raw mask itself — confirm against the attention classes.
        if attn_impl == "eager":
            return attention_mask

        if past_key_values is None:
            past_seen_tokens = 0
        else:
            past_seen_tokens = past_key_values.get_seq_length()
        has_fixed_size_cache = isinstance(past_key_values, (StaticCache, SlidingWindowCache))

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]

        if has_fixed_size_cache:
            # SlidingWindowCache or StaticCache: the mask spans the full cache.
            target_length = past_key_values.get_max_cache_shape()
        elif isinstance(attention_mask, torch.Tensor):
            # DynamicCache or no cache, with a caller-provided mask.
            target_length = attention_mask.shape[-1]
        else:
            target_length = past_seen_tokens + sequence_length + 1

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        must_unmask_empty_rows = (
            attn_impl == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        )
        if must_unmask_empty_rows:
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    # Adapted from transformers.models.mistral.modeling_mistral.MistralModel._prepare_4d_causal_attention_mask_with_cache_position with Mistral->Qwen2VL
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        config: Qwen2VLConfig,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
                May be `None`, in which case only the causal (and sliding-window) structure is encoded.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
            config (`Qwen2VLConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate

        Returns:
            `torch.Tensor`: the 4D mask; masked positions hold the minimum value of `dtype`, attended ones hold 0.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            return attention_mask

        min_dtype = torch.finfo(dtype).min
        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        # True where a key position lies strictly after the query position, i.e. must stay masked.
        diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        if config.sliding_window is not None:
            # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
            # the check is needed to verify is current checkpoint was trained with sliding window or not
            if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                sliding_attend_mask = torch.arange(target_length, device=device) <= (
                    cache_position.reshape(-1, 1) - config.sliding_window
                )
                diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
        # Multiplying by the boolean mask zeroes every position that may be attended to.
        causal_mask *= diagonal_attend_mask
        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.shape[-1] > target_length:
                attention_mask = attention_mask[:, :target_length]
            mask_length = attention_mask.shape[-1]
            # Align devices first: the 2D mask may live on a different device than the mask being built
            # (e.g. CPU mask with the embeddings on an accelerator), which would make the addition fail.
            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                causal_mask.device
            )
            # A sum of 0 means "causally visible (0) but padded (0)": those positions get masked as well.
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                padding_mask, min_dtype
            )
        return causal_mask


QWEN2_VL_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)`):
            The tensors corresponding to the input images. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`Qwen2VLImageProcessor.__call__`] for details. [`Qwen2VLProcessor`] uses
            [`Qwen2VLImageProcessor`] for processing images.
        pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
            The tensors corresponding to the input videos. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`Qwen2VLImageProcessor.__call__`] for details. [`Qwen2VLProcessor`] uses
            [`Qwen2VLImageProcessor`] for processing videos.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
"""


class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """
        Build the vision tower, the language model and the LM head from `config`.

        Args:
            config: Model configuration; `config.vision_config` configures the vision tower, while
                `hidden_size` / `vocab_size` size the LM head.
        """
        super().__init__(config)
        self.visual = Qwen2VisionTransformerPretrainedModel._from_config(config.vision_config)
        self.model = Qwen2VLModel(config)
        self.vocab_size = config.vocab_size
        # Bias-free projection from hidden states to vocabulary logits.
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.rope_deltas = None  # cache rope_deltas here
        # NOTE(review): not used in the visible part of the file; presumably filled in by `forward` — confirm.
        self.history_remain_ratios = None
        # Initialize weights and apply final processing
        self.post_init()
        # NOTE(review): plain counters starting at zero; their consumers are outside this view — confirm usage.
        self.all_ori_num = 0
        self.all_cur_num = 0

    def get_input_embeddings(self):
        """Return the token embedding module of the wrapped language model (`self.model.embed_tokens`)."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped language model with `value`."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the output projection (`self.lm_head`)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the output projection (`self.lm_head`) with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped language model (`self.model`) with `decoder`."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped language model (`self.model`)."""
        return self.model

    def get_rope_index(
        self,
        input_ids: torch.LongTensor,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.

        Explanation:
            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.

            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
            Examples:
                input_ids: [T T T T T], here T is for text.
                temporal position_ids: [0, 1, 2, 3, 4]
                height position_ids: [0, 1, 2, 3, 4]
                width position_ids: [0, 1, 2, 3, 4]

            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
            and 1D rotary position embedding for text part.
            Examples:
                Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
                vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
                text temporal position_ids: [3, 4, 5, 6, 7]
                text height position_ids: [3, 4, 5, 6, 7]
                text width position_ids: [3, 4, 5, 6, 7]
                Here we calculate the text start position_ids as the max vision position_ids plus 1.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it. May be `None` when `attention_mask` is given and no vision grid is passed.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`)
        """
        spatial_merge_size = self.config.vision_config.spatial_merge_size
        image_token_id = self.config.image_token_id
        video_token_id = self.config.video_token_id
        vision_start_token_id = self.config.vision_start_token_id

        has_vision_grid = image_grid_thw is not None or video_grid_thw is not None
        if input_ids is not None and has_vision_grid:
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)
            # Padded slots keep the fill value 1; real tokens are overwritten row by row below.
            position_ids = torch.ones(
                3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device
            )
            mrope_position_deltas = []
            # Grids are consumed in order across the whole batch, so the cursors live outside the row loop.
            image_index, video_index = 0, 0
            for i, padded_row in enumerate(input_ids):
                row_ids = padded_row[attention_mask[i] == 1]
                # The token right after each vision-start marker tells whether an image or a video follows.
                vision_start_indices = torch.argwhere(row_ids == vision_start_token_id).squeeze(1)
                vision_tokens = row_ids[vision_start_indices + 1]
                image_nums = int((vision_tokens == image_token_id).sum())
                video_nums = int((vision_tokens == video_token_id).sum())
                input_tokens = row_ids.tolist()
                llm_pos_ids_list: list = []
                st = 0
                remain_images, remain_videos = image_nums, video_nums
                for _ in range(image_nums + video_nums):
                    # `len(input_tokens) + 1` acts as "not found" so the other modality wins the comparison.
                    if remain_images > 0 and image_token_id in input_tokens:
                        ed_image = input_tokens.index(image_token_id, st)
                    else:
                        ed_image = len(input_tokens) + 1
                    if remain_videos > 0 and video_token_id in input_tokens:
                        ed_video = input_tokens.index(video_token_id, st)
                    else:
                        ed_video = len(input_tokens) + 1
                    if ed_image < ed_video:
                        t, h, w = image_grid_thw[image_index]
                        image_index += 1
                        remain_images -= 1
                        ed = ed_image
                    else:
                        t, h, w = video_grid_thw[video_index]
                        video_index += 1
                        remain_videos -= 1
                        ed = ed_video
                    llm_grid_t, llm_grid_h, llm_grid_w = (
                        t.item(),
                        h.item() // spatial_merge_size,
                        w.item() // spatial_merge_size,
                    )
                    text_len = ed - st

                    # Text before the vision segment: the same running index on all three axes.
                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

                    # Vision segment: separate temporal / height / width indices, offset past the text.
                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w

                # Trailing text after the last vision segment (or the whole row when it has none).
                if st < len(input_tokens):
                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                    text_len = len(input_tokens) - st
                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
                # Delta between the next rope position and the padded row length.
                mrope_position_deltas.append(llm_positions.max() + 1 - len(input_ids[i]))
            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
            return position_ids, mrope_position_deltas

        if attention_mask is not None:
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            # `input_ids` may be None here (e.g. the caller only has embeddings); fall back to the mask's device
            # instead of failing on `None.device`.
            target_device = input_ids.device if input_ids is not None else attention_mask.device
            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(target_device)
            max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
            mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
        else:
            position_ids = (
                torch.arange(input_ids.shape[1], device=input_ids.device)
                .view(1, 1, -1)
                .expand(3, input_ids.shape[0], -1)
            )
            mrope_position_deltas = torch.zeros(
                [input_ids.shape[0], 1],
                device=input_ids.device,
                dtype=input_ids.dtype,
            )

        return position_ids, mrope_position_deltas

    @add_start_docstrings_to_model_forward(QWEN2_VL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Qwen2VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset: Optional[List[torch.FloatTensor]] = None,
        loss_reduction: Optional[str] = "mean",
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            rope_deltas (`torch.LongTensor`, *optional*):
                Not read by this method; the value cached on `self.rope_deltas` is used instead.
            custom_kv (*optional*):
                Forwarded unchanged to the decoder (`self.model`).
            custom_kv_pos_offset (*optional*):
                Forwarded unchanged to the decoder (`self.model`).
            loss_reduction (`str`, *optional*, defaults to `"mean"`):
                Value of the `reduction` argument of `CrossEntropyLoss` used when `labels` is given.

        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

        >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                if video_grid_thw is None:
                    raise ValueError("`video_grid_thw` must be provided together with `pixel_values_videos`.")
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                # The vision tower is run on short temporal clips instead of on the whole video at once.
                # Every video (row of `video_grid_thw`) is walked independently so that batches holding
                # several videos, possibly with different grids, consume exactly their own patches.
                clip_len = 4  # temporal grid steps encoded per call to the vision tower
                video_embeds = []
                patch_start = 0
                for grid_t, grid_h, grid_w in video_grid_thw.tolist():
                    for clip_start in range(0, grid_t, clip_len):
                        # the last clip of a video may be shorter than `clip_len`
                        clip_t = min(clip_len, grid_t - clip_start)
                        patch_end = patch_start + clip_t * grid_h * grid_w
                        # same dtype/device as `video_grid_thw`, one row describing this clip only
                        clip_grid_thw = video_grid_thw.new_tensor([[clip_t, grid_h, grid_w]])
                        video_embeds.append(
                            self.visual(pixel_values_videos[patch_start:patch_end], grid_thw=clip_grid_thw)
                        )
                        patch_start = patch_end
                video_embeds = torch.cat(video_embeds, dim=0)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None:
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids, image_grid_thw, video_grid_thw, attention_mask
                )
                self.rope_deltas = rope_deltas
            # then use the prev pre-calculated rope-deltas to get the correct position ids
            else:
                batch_size, seq_length, _ = inputs_embeds.shape
                delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                if cache_position is not None:  # otherwise `delta` is an int `0`
                    # keep the offset on the same device as the positions it is added to
                    delta = delta.to(position_ids.device)
                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
                position_ids = position_ids.add(delta)
                # the same positions are used for all three rotary sections
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Upcast to float if we need to compute the loss to avoid potential precision issues
            logits = logits.float()
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(reduction=loss_reduction)
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return Qwen2VLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            rope_deltas=self.rope_deltas,
        )

    @torch.no_grad()
    def fastgen(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
        img_chunk_num = 10,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """
        Pre-fill one context chunk and append a per-head pruned copy of its KV cache to `custom_kv`.

        Steps performed:

        1. Embed `input_ids` (images are encoded `img_chunk_num` at a time, videos in one call) and run the
           decoder with `use_cache=True` to obtain the KV cache of this chunk.
        2. Run the decoder a second time on the `icl_*` inputs with the chunk's KV cache passed as `custom_kv`
           and `output_attentions=True`.
        3. Average the attention rows selected by `icl_labels > -100`, pool the query heads of each KV head and
           obtain one score per (layer, KV head, input position).
        4. For each layer and KV head try four position masks in order (special tokens; + punctuation;
           special + top-30% scored positions; + the most recent 30% of positions). The first mask whose
           positions gather a score sum above 0.6 decides which key/value entries are kept; if none
           qualifies the whole head is kept.
        5. Update the running counters `self.all_ori_num` / `self.all_cur_num` and print them.

        Args:
            input_ids: Token ids of the chunk. Required in practice: the token masks and the counters are
                derived from it even when `inputs_embeds` is supplied.
            labels, use_cache, output_attentions, rope_deltas, icl_pixel_values_videos: Accepted for
                signature compatibility; not read by this method.
            return_dict: Must resolve to `True`; the decoder outputs are accessed by attribute.
            icl_labels: Positions with a value greater than -100 select the attention rows used for scoring.
                NOTE(review): the indexing `icl_label_mask[:, 0]` / `[:, 1]` needs `icl_labels.squeeze()` to
                stay 2-D — confirm the shape the callers pass.
            icl_pixel_values: When given, the image embeddings computed for `pixel_values` are reused for the
                ICL sequence; if none were computed, `icl_pixel_values` is encoded with `icl_image_grid_thw`.
            custom_kv: List with one cache object per KV head, or `None` to start a fresh list of
                `DynamicCache` instances. The cache objects must provide `merge_layer_kv`.
            custom_kv_pos_offset: Running position offset; advanced by `position_ids.max() + 1`.
            img_chunk_num: Number of images encoded per call to the vision tower.

        Returns:
            `(custom_kv, custom_kv_pos_offset, ratio)` where `ratio` is `self.all_cur_num / self.all_ori_num`.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Stays None unless images are encoded for the main input below; the ICL branch checks it.
        image_embeds = None
        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                # Encode the images a few at a time rather than in a single vision-tower call.
                pixel_st = 0
                image_embed_chunks = []
                for chunk_start in range(0, len(image_grid_thw), img_chunk_num):
                    chunk_grid_thw = image_grid_thw[chunk_start:chunk_start + img_chunk_num]
                    # number of patch rows belonging to the images of this chunk
                    pixel_length = int(chunk_grid_thw.prod(dim=-1).sum())
                    image_embed_chunks.append(
                        self.visual(pixel_values[pixel_st:pixel_st + pixel_length], grid_thw=chunk_grid_thw)
                    )
                    pixel_st += pixel_length
                image_embeds = torch.cat(image_embed_chunks, dim=0)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # unlike `forward`, the RoPE index is recomputed on every call
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            self.rope_deltas = rope_deltas

        # forward to get the kv cache of current chunk
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )

        # advance the running position offset past the positions used by this chunk
        custom_kv_pos_offset += (position_ids.max()+1)
        cur_kv = outputs.past_key_values
        self.all_ori_num += input_ids.shape[-1]

        if icl_inputs_embeds is None:
            # Ensure icl_input_ids is long type for embedding lookup
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                if image_embeds is None:
                    # No embeddings were produced for the main input (no `pixel_values`, or
                    # `inputs_embeds` was supplied), so encode the ICL images directly.
                    icl_image_embeds = self.visual(
                        icl_pixel_values.type(self.visual.get_dtype()), grid_thw=icl_image_grid_thw
                    )
                else:
                    # reuse the embeddings already computed for the main input
                    icl_image_embeds = image_embeds
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                icl_image_mask = (
                    (icl_input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            # the rope deltas of the ICL sequence are not needed
            icl_position_ids, _ = self.get_rope_index(
                icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask
            )

        # demonstration examples as suffix to get attention scores
        # NOTE(review): hidden states are requested but not consumed here; flag kept as in the original call.
        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=True,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=outputs.past_key_values,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        icl_attentions = icl_output.attentions

        num_layers = len(self.model.layers)
        # 4 was the hard-coded value; prefer the configured number of KV heads when it is available
        kv_head_num = getattr(self.config, "num_key_value_heads", 4)
        # indices of the ICL positions that carry a label (do not need shift for loss)
        icl_label_mask = torch.nonzero(icl_labels.squeeze()>-100).squeeze()

        # NOTE(review): hard-coded vocabulary ids; they look like chat-template / special tokens of the
        # Qwen2 tokenizer — confirm against the tokenizer actually used.
        special_mask = torch.isin(input_ids, torch.tensor([151643, 151644, 151645, 151652, 151653, 872, 8948, 77091, 2610, 525, 264, 10950, 17847, 198], dtype=torch.int32, device=input_ids.device))
        # NOTE(review): presumably punctuation token ids — confirm against the tokenizer.
        punctu_mask = torch.isin(input_ids, torch.tensor([11, 13, 30, 0], dtype=torch.int32, device=input_ids.device))
        second_mask = special_mask|punctu_mask

        # most recent 30% of the chunk; skipped when that rounds down to zero positions, because a
        # `[-0:]` slice would otherwise select the whole sequence
        locality_mask = torch.zeros_like(input_ids, dtype=torch.bool)
        keep_num = int(0.3*input_ids.shape[-1])
        if keep_num > 0:
            locality_mask[:, -keep_num:] = True

        pooled_IQA_sigs = []
        for attn_ in icl_attentions:
            # mean over the labelled query positions, then mean over the query heads sharing a KV head
            IQA_sig_scores = attn_.permute(0,2,1,3)[icl_label_mask[:,0], icl_label_mask[:,1]].mean(dim=0, keepdim=True)
            IQA_sig_scores = IQA_sig_scores.reshape(1, kv_head_num, -1, input_ids.shape[1]).mean(dim=2)
            pooled_IQA_sigs.append(IQA_sig_scores)
        pooled_IQA_sigs = torch.stack(pooled_IQA_sigs)

        # mark the top 30% highest-scored positions of every (layer, KV head)
        topk_indices = pooled_IQA_sigs.topk(dim=-1, k=keep_num)[1]
        freq_mask = torch.zeros_like(pooled_IQA_sigs, dtype=torch.bool)
        freq_mask.scatter_(dim=-1, index=topk_indices, value=True)

        if custom_kv is None:
            custom_kv = [DynamicCache() for _ in range(kv_head_num)]
        all_num = 0
        for layer_id in range(num_layers):
            layer_keys = cur_kv.key_cache[layer_id]
            layer_values = cur_kv.value_cache[layer_id]
            for head_id in range(kv_head_num):
                head_scores = pooled_IQA_sigs[layer_id, :, head_id]
                head_freq_mask = freq_mask[layer_id, 0, head_id, :]
                # four strategies, tried from the most to the least aggressive pruning
                iter_masks = [
                    special_mask,
                    second_mask,
                    special_mask|head_freq_mask,
                    special_mask|head_freq_mask|locality_mask,
                ]
                # default when no strategy qualifies: keep the complete head
                key_state = layer_keys[:, head_id:head_id+1]
                value_state = layer_values[:, head_id:head_id+1]
                for mask in iter_masks:
                    if head_scores[mask].sum()>0.6:
                        key_state = layer_keys[:, head_id][mask].unsqueeze(0).unsqueeze(0)
                        value_state = layer_values[:, head_id][mask].unsqueeze(0).unsqueeze(0)
                        break
                if len(custom_kv[head_id].key_cache) <= layer_id:
                    # first entry for this layer in this head's cache
                    custom_kv[head_id].update(key_state, value_state, layer_id)
                else:
                    custom_kv[head_id].merge_layer_kv(key_state, value_state, layer_id)
                all_num+=key_state.shape[-2]
        torch.cuda.empty_cache()
        # average number of kept positions per (layer, KV head)
        self.all_cur_num += all_num/(num_layers*kv_head_num)
        print(self.all_cur_num, self.all_ori_num)
        return custom_kv, custom_kv_pos_offset, self.all_cur_num/self.all_ori_num
    
    @torch.no_grad()
    def pyramid_infer(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
        img_chunk_num = 10,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                # image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                pixel_st = 0
                image_embeds = []
                chunk_iters = len(image_grid_thw)//img_chunk_num + int(len(image_grid_thw)%img_chunk_num>0)
                for chunk_iter in range(chunk_iters):
                    pixel_length = image_grid_thw[chunk_iter*img_chunk_num:(chunk_iter+1)*img_chunk_num].prod(dim=-1).sum()
                    sub_img_embeds = self.visual(pixel_values[pixel_st:pixel_st+pixel_length], 
                                                 grid_thw=image_grid_thw[chunk_iter*img_chunk_num:(chunk_iter+1)*img_chunk_num]
                                                 )
                    pixel_st += pixel_length
                    image_embeds.append(sub_img_embeds)
                image_embeds = torch.cat(image_embeds, dim=0)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
        
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            self.rope_deltas = rope_deltas
        
        # forward to get the kv cache of current chunk
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        
        # update position ids and merge the privious kv cache with kv cache of the current chunk
        custom_kv_pos_offset += (position_ids.max()+1)
        cur_kv = outputs.past_key_values
        if custom_kv is None:
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values)
    
        if icl_inputs_embeds is None:
            # Ensure icl_input_ids is long type for embedding lookup
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                # icl_pixel_values = icl_pixel_values.type(self.visual.get_dtype())
                icl_image_embeds = image_embeds #self.visual(icl_pixel_values, grid_thw=icl_image_grid_thw)
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                icl_image_mask = (
                    (icl_input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            icl_position_ids, icl_rope_deltas = self.get_rope_index(
                icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask
            )
        
        
        # demonstration examples as suffix to get attention scores and hidden states
        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=True,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        select_ori_logits = self.lm_head(icl_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)
        icl_tmp_hidden_states= icl_output.hidden_states
        icl_attentions = icl_output.attentions
        # torch.cuda.empty_cache()
        del icl_output
        num_layers = len(self.model.layers)
        label_mask = torch.nonzero(labels.squeeze()>-100).squeeze()
        icl_label_mask = torch.nonzero(icl_labels.squeeze()>-100).squeeze() #(labels>-100).squeeze(0) # do not need shift for loss
        
        # process the attention scores, unmask the previous kv cache and important textual answer tokens
        head_dim = custom_kv.key_cache[0].shape[-1]
        kv_head_num = custom_kv.key_cache[0].shape[1]
        pooled_IQA_sigs = []
        for ind, attn_ in enumerate(icl_attentions):
            attn_ = attn_[..., :-icl_labels.shape[-1]]
            observe_attn_mask = icl_label_mask
            # IQA_sig_scores = attn_.sum(dim=-2, keepdim=True)
            IQA_sig_scores = attn_.permute(0,2,1,3)[observe_attn_mask[:,0], observe_attn_mask[:,1]].sum(dim=0, keepdim=True)
            # group kv
            IQA_sig_scores = IQA_sig_scores.reshape(1, kv_head_num, -1, IQA_sig_scores.shape[-1]).sum(dim=-2)
            # pooled_IQA_sig = F.avg_pool1d(IQA_sig_scores, kernel_size=5, padding=2, stride=1)
            # unmask the label tokens and previouly pruned tokens
            IQA_sig_scores[:,:, :4] += 100.0
            pooled_IQA_sigs.append(IQA_sig_scores)
        # del IQA_sig_scores, pooled_IQA_sig, icl_attentions, outputs, icl_output
        # torch.cuda.empty_cache()
        # ratio = 0.141
        # k_final = ratio*input_ids.shape[-1]//20
        # k0 = 2*ratio*input_ids.shape[-1] - k_final
        # decrease_order = [int(k0-i*(k0-k_final)/(num_layers-1)) for i in range(num_layers)]
        prefill_decay_ratio = 0.01
        schedule_prefill_decay_ratios = [(0.4 - prefill_decay_ratio) * (math.cos(math.pi * idx / 28) + 1) / 2 + prefill_decay_ratio for idx in range(28)] 
        decrease_order = [max(64, int(schedule_prefill_decay_ratios[i]*input_ids.shape[-1])) for i in range(num_layers)]
        
        # outer_kv = custom_kv

        all_token_num = 0
        for layer_id in range(num_layers):
            inner_kv = deepcopy(custom_kv)
            pre_len = custom_kv.pre_lens[layer_id] if len(custom_kv.pre_lens)>0 else 0
            remain_num = cur_kv.value_cache[layer_id].shape[-2]
            
        
            remain_index = pooled_IQA_sigs[layer_id].topk(decrease_order[layer_id], dim=-1)[1] #torch.sort(pooled_IQA_sigs[layer_id].topk(cur_remain_num, dim=-1)[1], dim=-1)[0]
            
            remain_index = remain_index.unsqueeze(-1).expand(-1, -1, -1, head_dim)  
            custom_kv.key_cache[layer_id] = torch.cat([
                custom_kv.key_cache[layer_id][:,:,:pre_len],
                torch.gather(cur_kv.key_cache[layer_id], dim=2, index=remain_index).contiguous()
            ], dim=2
            )
            custom_kv.value_cache[layer_id] = torch.cat([
                custom_kv.value_cache[layer_id][:,:,:pre_len],
                torch.gather(cur_kv.value_cache[layer_id], dim=2, index=remain_index).contiguous()
            ], dim=2
            )
            print(custom_kv.value_cache[layer_id].shape[-2])
            all_token_num += custom_kv.value_cache[layer_id].shape[-2]
            
        torch.cuda.empty_cache()
        # print("total_token_num: ", custom_kv.get_past_seq_len(), all_token_num/28)
        return inner_kv, custom_kv_pos_offset, all_token_num/28
    
    @torch.inference_mode()
    def context_organize_pre(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
        img_chunk_num = 10,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """Prefill one context chunk and prune its KV cache layer by layer.

        The method works in four stages:

        1. Embed ``input_ids`` (scattering image / video features into their
           placeholder positions) and run the decoder with ``use_cache=True``
           to obtain the KV cache of the current chunk, which is then merged
           into ``custom_kv``.
        2. Run the ICL ("demonstration") sequence against the merged cache with
           attentions and hidden states enabled, and keep the ``lm_head``
           logits at positions where ``icl_labels > -100`` as the reference.
        3. Turn each layer's attention into a per-KV-head score for every
           cached position (summed over the labelled ICL query positions,
           smoothed with a width-5 average pool); positions where
           ``labels > -100`` get ``+100`` so they are always ranked first.
        4. For every layer, from the last to the first, try keeping the top
           1/10, 1/5 and 1/2 of ``input_ids.shape[-1]`` positions. The first
           budget whose Jensen-Shannon divergence to the reference logits is
           ``<= 0.005`` is accepted; otherwise the layer keeps its full cache.

        Args:
            input_ids, attention_mask, position_ids, inputs_embeds, labels,
            pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw:
                Inputs of the chunk being cached. ``labels > -100`` marks the
                positions treated as "answer" tokens.
            past_key_values, cache_position, return_dict, output_attentions,
            output_hidden_states:
                Forwarded to ``self.model``.
            use_cache, rope_deltas, icl_pixel_values_videos:
                Accepted but not read by this method.
            icl_input_ids, icl_inputs_embeds, icl_attention_mask,
            icl_position_ids, icl_labels, icl_image_grid_thw,
            icl_video_grid_thw:
                Inputs of the ICL sequence used to score and validate pruning.
            icl_pixel_values:
                Only used as a flag: when it is not ``None`` the image features
                computed for the chunk are reused for the ICL sequence.
            custom_kv:
                Cache accumulated from earlier chunks, or ``None`` for the
                first chunk. It is modified in place.
            custom_kv_pos_offset:
                Position offset forwarded to ``self.model``; advanced by
                ``position_ids.max() + 1`` after the chunk prefill.
            img_chunk_num:
                Number of images sent through ``self.visual`` per call.

        Returns:
            A 9-tuple ``(custom_kv, custom_kv_pos_offset, avg_tokens_per_layer,
            all_img_num, all_ques_num, all_ans_num, all_img_keep,
            all_ques_keep, all_ans_keep)``. The ``*_num`` entries accumulate
            pruned image / question / answer positions over layers and the
            ``*_keep`` entries the kept ones, each averaged over KV heads.

        Raises:
            ValueError: If the number of image (or video) placeholder tokens
                does not match the number of visual features, or if
                ``icl_pixel_values`` is given but no chunk image features are
                available to reuse.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Stays None unless chunk images are encoded below; the ICL branch
        # checks this instead of tripping over an unbound local.
        image_embeds = None

        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                # Encode the images img_chunk_num at a time; rows of
                # pixel_values are consumed in the same order as image_grid_thw.
                pixel_st = 0
                image_embed_chunks = []
                for grid_st in range(0, len(image_grid_thw), img_chunk_num):
                    sub_grid_thw = image_grid_thw[grid_st:grid_st + img_chunk_num]
                    pixel_length = int(sub_grid_thw.prod(dim=-1).sum())
                    image_embed_chunks.append(
                        self.visual(pixel_values[pixel_st:pixel_st + pixel_length], grid_thw=sub_grid_thw)
                    )
                    pixel_st += pixel_length
                image_embeds = torch.cat(image_embed_chunks, dim=0)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            self.rope_deltas = rope_deltas

        # forward to get the kv cache of current chunk
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )

        # update position ids and merge the previous kv cache with the kv cache of the current chunk
        custom_kv_pos_offset += (position_ids.max() + 1)
        cur_kv = outputs.past_key_values
        if custom_kv is None:
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values)

        if icl_inputs_embeds is None:
            # Ensure icl_input_ids is long type for embedding lookup
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                # The ICL sequence reuses the chunk's image features rather
                # than running the vision tower a second time.
                if image_embeds is None:
                    raise ValueError(
                        "icl_pixel_values was provided but no image features are available to reuse; "
                        "pass input_ids together with pixel_values so the chunk images are encoded."
                    )
                icl_image_embeds = image_embeds
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                icl_image_mask = (
                    (icl_input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            icl_position_ids, _ = self.get_rope_index(
                icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask
            )

        # demonstration examples as suffix to get attention scores and hidden states
        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=True,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        # Reference ("teacher") logits at the labelled ICL positions, computed with the unpruned cache.
        select_ori_logits = self.lm_head(icl_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)
        icl_tmp_hidden_states = icl_output.hidden_states
        icl_attentions = icl_output.attentions

        num_layers = len(self.model.layers)
        label_mask = torch.nonzero(labels.squeeze()>-100).squeeze()
        # Rows of (index, index) pairs; the [:, 0] / [:, 1] indexing below needs
        # icl_labels to stay 2-D after squeeze() — presumably an ICL batch > 1, TODO confirm.
        icl_label_mask = torch.nonzero(icl_labels.squeeze()>-100).squeeze()  # do not need shift for loss

        # process the attention scores, unmask the previous kv cache and important textual answer tokens
        head_dim = custom_kv.key_cache[0].shape[-1]
        kv_head_num = custom_kv.key_cache[0].shape[1]
        pooled_IQA_sigs = []

        for attn_ in icl_attentions:
            # Drop the trailing icl_labels.shape[-1] key columns (presumably the
            # ICL tokens' own keys) so only cached positions are scored.
            attn_ = attn_[..., :-icl_labels.shape[-1]]
            # Sum the attention rows of the labelled ICL query positions.
            IQA_sig_scores = attn_.permute(0,2,1,3)[icl_label_mask[:,0], icl_label_mask[:,1]].sum(dim=0, keepdim=True)
            # group kv: fold the query heads that share a KV head into one score
            IQA_sig_scores = IQA_sig_scores.reshape(1, kv_head_num, -1, IQA_sig_scores.shape[-1]).sum(dim=-2)
            pooled_IQA_sig = F.avg_pool1d(IQA_sig_scores, kernel_size=5, padding=2, stride=1)
            # unmask the label tokens: +100 keeps them ahead of everything else in topk
            # NOTE(review): label_mask indexes the key axis directly — confirm the alignment
            # still holds when custom_kv already contains earlier chunks.
            pooled_IQA_sig = pooled_IQA_sig.permute(2, 0, 1)
            pooled_IQA_sig[label_mask] += 100.0
            pooled_IQA_sig = pooled_IQA_sig.permute(1, 2, 0).contiguous()
            pooled_IQA_sigs.append(pooled_IQA_sig)

        all_token_num = 0
        # Largest JS divergence (vs. the unpruned logits) at which a pruned layer is accepted.
        delta_L_max = 0.005
        # Candidate per-layer budgets, tried from the most to the least aggressive.
        remain_nums = [int(input_ids.shape[-1]*r) for r in [1/10, 1/5, 1/2]]

        # masks used only for the pruned / kept statistics per token category
        img_mask = (input_ids == self.config.image_token_id).squeeze()
        ans_mask = labels.squeeze()>-100
        question_mask = torch.ones_like(labels.squeeze())
        question_mask[ans_mask] = 0
        question_mask[img_mask] = 0
        question_mask = question_mask>0
        all_img_num, all_ques_num, all_ans_num, all_img_keep, all_ques_keep, all_ans_keep = 0, 0, 0, 0, 0, 0

        # layer kv cache compression, from the last layer down to the first
        for layer_id in range(num_layers-1, -1, -1):
            pre_len = custom_kv.pre_lens[layer_id] if len(custom_kv.pre_lens)>0 else 0
            remain_num = cur_kv.value_cache[layer_id].shape[-2]
            # Full cache of this layer, restored whenever a candidate budget is rejected.
            key_backup = custom_kv.key_cache[layer_id].clone()
            value_backup = custom_kv.value_cache[layer_id].clone()
            for cur_remain_num in remain_nums:
                # cnt_mask: 1 where a position is pruned; keep_mask: 1 where it is kept.
                cnt_mask = torch.ones_like(pooled_IQA_sigs[layer_id])
                keep_mask = torch.zeros_like(pooled_IQA_sigs[layer_id])
                # Defaults reported when this candidate is rejected: nothing pruned, everything kept.
                img_tokens_num = 0
                question_tokens_num = 0
                ans_tokens_num = 0
                img_keep = cnt_mask.squeeze().transpose(1,0)[img_mask].sum()/kv_head_num
                question_keep = cnt_mask.squeeze().transpose(1,0)[question_mask].sum()/kv_head_num
                ans_keep = cnt_mask.squeeze().transpose(1,0)[ans_mask].sum()/kv_head_num

                remain_index = pooled_IQA_sigs[layer_id].topk(cur_remain_num, dim=-1)[1]

                fill_values = torch.zeros_like(cnt_mask)
                fill_values1 = torch.ones_like(cnt_mask)
                cnt_mask.scatter_(dim=-1, index=remain_index, src=fill_values)
                keep_mask.scatter_(dim=-1, index=remain_index, src=fill_values1)

                # Broadcast the kept indices over head_dim so they can drive torch.gather.
                remain_index = remain_index.unsqueeze(-1).expand(-1, -1, -1, head_dim)
                custom_kv.key_cache[layer_id] = torch.cat([
                    key_backup[:, :, :pre_len],
                    torch.gather(cur_kv.key_cache[layer_id], dim=2, index=remain_index)
                ], dim=2)
                custom_kv.value_cache[layer_id] = torch.cat([
                    value_backup[:, :, :pre_len],
                    torch.gather(cur_kv.value_cache[layer_id], dim=2, index=remain_index)
                ], dim=2)
                # Re-run the decoder from this layer on the stored hidden states
                # to see how the pruned cache changes the label logits.
                current_output = self.model(
                    input_ids=None,
                    start_layer_num=layer_id,
                    position_ids=icl_position_ids,
                    attention_mask=icl_attention_mask,
                    past_key_values=past_key_values,
                    inputs_embeds=icl_tmp_hidden_states[layer_id],
                    use_cache=False,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    cache_position=cache_position,
                    custom_kv=custom_kv,
                    custom_kv_pos_offset=custom_kv_pos_offset,
                )
                select_new_logits = self.lm_head(current_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)

                # JS Divergence: 0.5 * (KL(student || M) + KL(teacher || M)), with M the mixture
                stu_logits = F.softmax(select_new_logits, dim=-1, dtype=torch.bfloat16)
                tea_logits = F.softmax(select_ori_logits, dim=-1, dtype=torch.bfloat16)
                M = (0.5*(stu_logits+tea_logits)).log()
                js_loss = 0.5*(F.kl_div(M, (stu_logits).log(), log_target=True, reduction="batchmean") +
                               F.kl_div(M, (tea_logits).log(), log_target=True, reduction="batchmean"))

                if js_loss <= delta_L_max:
                    # Accept the smallest budget that stays within tolerance.
                    remain_num = cur_remain_num

                    cnt_mask = cnt_mask.squeeze().transpose(1,0)
                    img_tokens_num = cnt_mask[img_mask].sum()/kv_head_num
                    question_tokens_num = cnt_mask[question_mask].sum()/kv_head_num
                    ans_tokens_num = cnt_mask[ans_mask].sum()/kv_head_num
                    keep_mask = keep_mask.squeeze().transpose(1,0)
                    img_keep = keep_mask[img_mask].sum()/kv_head_num
                    question_keep = keep_mask[question_mask].sum()/kv_head_num
                    ans_keep = keep_mask[ans_mask].sum()/kv_head_num
                    break
                else:
                    # Rejected: put the full cache back before trying a larger budget.
                    custom_kv.key_cache[layer_id] = key_backup
                    custom_kv.value_cache[layer_id] = value_backup
            all_token_num += remain_num+pre_len
            all_img_num += img_tokens_num
            all_ques_num += question_tokens_num
            all_ans_num += ans_tokens_num
            all_img_keep += img_keep
            all_ques_keep += question_keep
            all_ans_keep += ans_keep

        # Average over the model's actual layer count rather than a fixed 28.
        avg_token_num = all_token_num/num_layers
        print("total_token_num: ", custom_kv.get_past_seq_len(), avg_token_num)
        return custom_kv, custom_kv_pos_offset, avg_token_num, all_img_num, all_ques_num, all_ans_num, all_img_keep, all_ques_keep, all_ans_keep

    @torch.inference_mode()
    def context_organize_tasm(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset: int = 0,
        img_chunk_num: int = 10,
        tasm_compressor = None,  # TASM compressor instance
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """
        TASM: Task-Aware Structured Memory compression for KV cache.
        
        This method implements three key innovations over EMLoC:
        
        Innovation 1 - Task Vector Guided Compression:
            Instead of using attention-based importance (β = Σα_ans), we extract
            the Q→A transformation direction to capture task-level patterns.
            Importance = projection of KV onto task vector direction.
            
        Innovation 2 - Semantic-Aware Token Merging:
            Instead of hard top-k pruning that destroys spatial semantics,
            we use soft bipartite matching to merge low-importance tokens
            into high-importance ones, preserving the manifold structure.
            
        Innovation 3 - Dynamic Memory Retrieval:
            Instead of static memory M, we store tokens in two levels:
            - Core Memory (10%): Essential task skeleton, always active
            - Latent Bank (20%): Secondary details, retrieved on demand
            Dynamic retrieval based on JS divergence of current query.
        
        Target: ICML 2026
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # ========== Step 1: Process input embeddings ==========
        if inputs_embeds is None:
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                pixel_st = 0
                image_embeds = []
                chunk_iters = len(image_grid_thw)//img_chunk_num + int(len(image_grid_thw)%img_chunk_num>0)
                for chunk_iter in range(chunk_iters):
                    pixel_length = image_grid_thw[chunk_iter*img_chunk_num:(chunk_iter+1)*img_chunk_num].prod(dim=-1).sum()
                    sub_img_embeds = self.visual(pixel_values[pixel_st:pixel_st+pixel_length], 
                                                 grid_thw=image_grid_thw[chunk_iter*img_chunk_num:(chunk_iter+1)*img_chunk_num])
                    pixel_st += pixel_length
                    image_embeds.append(sub_img_embeds)
                image_embeds = torch.cat(image_embeds, dim=0)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}")
                image_mask = ((input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device))
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}")
                video_mask = ((input_ids == self.config.video_token_id).unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device))
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # Calculate RoPE position IDs
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            position_ids, rope_deltas = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)
            self.rope_deltas = rope_deltas
        
        # ========== Step 2: Forward pass to get KV cache ==========
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        
        custom_kv_pos_offset += (position_ids.max()+1)
        cur_kv = outputs.past_key_values
        if custom_kv is None:
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values)
    
        # ========== Step 3: Process ICL examples for task vector extraction ==========
        if icl_inputs_embeds is None:
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                icl_image_embeds = image_embeds
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}")
                icl_image_mask = ((icl_input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(icl_inputs_embeds).to(icl_inputs_embeds.device))
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
                
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            icl_position_ids, icl_rope_deltas = self.get_rope_index(icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask)
        
        # ========== Step 4: Get attention scores and hidden states from ICL ==========
        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=True,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        
        select_ori_logits = self.lm_head(icl_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)
        icl_tmp_hidden_states = icl_output.hidden_states
        icl_attentions = icl_output.attentions
        
        num_layers = len(self.model.layers)
        label_mask = torch.nonzero(labels.squeeze()>-100).squeeze()
        icl_label_mask = torch.nonzero(icl_labels.squeeze()>-100).squeeze()
        
        head_dim = custom_kv.key_cache[0].shape[-1]
        kv_head_num = custom_kv.key_cache[0].shape[1]
        
        # ========== TASM Innovation 1: Extract Task Vector ==========
        task_vector_extracted = False
        if tasm_compressor is not None:
            # Extract Q and A hidden states for task vector computation
            # Use the ICL hidden states to compute task direction
            # v19: Also consider image tokens for VQA tasks
            try:
                for layer_idx in range(num_layers):
                    hidden_state = icl_tmp_hidden_states[layer_idx]  # [batch, seq_len, hidden_size]
                    
                    # Handle different icl_labels shapes
                    if icl_labels.dim() == 1:
                        icl_labels_flat = icl_labels
                    else:
                        # Use first batch element for task vector extraction
                        icl_labels_flat = icl_labels[0]
                    
                    # Find question, answer, and image boundaries
                    q_mask = icl_labels_flat == -100
                    a_mask = icl_labels_flat > -100
                    
                    if q_mask.any() and a_mask.any():
                        # Use first batch element
                        h = hidden_state[0] if hidden_state.dim() == 3 else hidden_state  # [seq_len, hidden_size]
                        q_indices = torch.where(q_mask)[0]
                        a_indices = torch.where(a_mask)[0]
                        
                        if len(q_indices) > 0 and len(a_indices) > 0:
                            q_hidden = h[q_indices].unsqueeze(0)  # [1, q_len, hidden_size]
                            a_hidden = h[a_indices].unsqueeze(0)  # [1, a_len, hidden_size]
                            
                            # v19: Weight question hidden states by attention to answer
                            # This helps capture which parts of the question are most relevant
                            # For now, use simple mean pooling (same as before)
                            
                            # Extract task vector for this layer
                            tasm_compressor.task_vector.extract_from_qa_pairs(
                                q_hidden, a_hidden, layer_idx
                            )
                            task_vector_extracted = True
            except Exception as e:
                # If task vector extraction fails, continue without it
                print(f"[TASM] Task vector extraction skipped: {e}")
        
        if task_vector_extracted:
            print(f"[TASM] Task vector extracted for {len(tasm_compressor.task_vector.task_vectors)} layers")
        
        # ========== Step 5: Compute importance scores ==========
        # Token type masks
        img_mask = (input_ids == self.config.image_token_id).squeeze()
        ans_mask = labels.squeeze() > -100
        question_mask = torch.ones_like(labels.squeeze())
        question_mask[ans_mask] = 0
        question_mask[img_mask] = 0
        question_mask = question_mask > 0
        
        # Compute attention-based importance (v9 simple pooling - best so far)
        pooled_IQA_sigs = []
        for ind, attn_ in enumerate(icl_attentions):
            attn_ = attn_[..., :-icl_labels.shape[-1]]
            observe_attn_mask = icl_label_mask
            IQA_sig_scores = attn_.permute(0,2,1,3)[observe_attn_mask[:,0], observe_attn_mask[:,1]].sum(dim=0, keepdim=True)
            IQA_sig_scores = IQA_sig_scores.reshape(1, kv_head_num, -1, IQA_sig_scores.shape[-1]).sum(dim=-2)
            pooled_IQA_sig = F.avg_pool1d(IQA_sig_scores, kernel_size=5, padding=2, stride=1)
            pooled_IQA_sig = pooled_IQA_sig.permute(2, 0, 1)
            pooled_IQA_sig[label_mask] += 100.0
            pooled_IQA_sig = pooled_IQA_sig.permute(1, 2, 0).contiguous()
            pooled_IQA_sigs.append(pooled_IQA_sig)
        
        # ========== Step 6: TASM Layer-wise Compression ==========
        all_token_num = 0
        
        # TASM uses different compression ratios
        if tasm_compressor is not None:
            core_ratio = tasm_compressor.config.core_ratio
            latent_ratio = tasm_compressor.config.latent_ratio
            target_ratio = tasm_compressor.config.target_compression_ratio
            task_vector_weight = tasm_compressor.config.task_vector_weight
            # v9 (best): Use EMLoC's original JS threshold
            delta_L_max = 0.005
        else:
            core_ratio = 0.1
            latent_ratio = 0.3
            target_ratio = 0.2
            task_vector_weight = 0.7
            delta_L_max = 0.004  # v12: stricter
        
        # Calculate remain numbers based on TASM target_compression_ratio
        # v9 (best): Original ratios
        base_ratios = [0.1, 0.2, 0.5]
        base_remain_nums = [int(input_ids.shape[-1] * r) for r in base_ratios]
        
        all_img_num, all_ques_num, all_ans_num = 0, 0, 0
        all_img_keep, all_ques_keep, all_ans_keep = 0, 0, 0
        
        for layer_id in range(num_layers-1, -1, -1):
            pre_len = custom_kv.pre_lens[layer_id] if len(custom_kv.pre_lens) > 0 else 0
            remain_num = cur_kv.value_cache[layer_id].shape[-2]
            key_backup = custom_kv.key_cache[layer_id].clone()
            value_backup = custom_kv.value_cache[layer_id].clone()
            
            # ========== v9 Innovation: Layer-Adaptive Compression ==========
            # Deep layers (high layer_id) keep more tokens for semantic preservation
            # Shallow layers can be compressed more aggressively
            layer_ratio = layer_id / max(num_layers - 1, 1)
            # v9 (best): Scale factor: 0.9 for layer 0, 1.1 for layer 27
            layer_scale = 0.9 + 0.2 * layer_ratio
            remain_nums = [int(r * layer_scale) for r in base_remain_nums]
            
            # ========== TASM Innovation 1: Task Vector Importance ==========
            if tasm_compressor is not None:
                # Compute task-vector based importance with layer-adaptive weighting
                # v11: Also pass visual mask for visual token boosting
                task_importance = tasm_compressor.task_vector.compute_importance_scores(
                    key_states=cur_kv.key_cache[layer_id],
                    value_states=cur_kv.value_cache[layer_id],
                    layer_idx=layer_id,
                    attention_scores=pooled_IQA_sigs[layer_id],
                    num_layers=num_layers,
                    visual_mask=img_mask,
                )
                
                # Use the layer-adaptive importance directly (no manual weighting needed)
                combined_importance = task_importance
            else:
                combined_importance = pooled_IQA_sigs[layer_id]
            
            for cur_remain_num in remain_nums:
                cnt_mask = torch.ones_like(combined_importance)
                keep_mask = torch.zeros_like(combined_importance)
                img_tokens_num = 0
                question_tokens_num = 0
                ans_tokens_num = 0
                img_keep = cnt_mask.squeeze().transpose(1,0)[img_mask].sum()/kv_head_num
                question_keep = cnt_mask.squeeze().transpose(1,0)[question_mask].sum()/kv_head_num
                ans_keep = cnt_mask.squeeze().transpose(1,0)[ans_mask].sum()/kv_head_num
                
                # ========== TASM Innovation 2: Semantic Token Merging ==========
                if tasm_compressor is not None and tasm_compressor.config.enable_merging:
                    # Use semantic merging instead of hard top-k
                    merged_k, merged_v, merge_map = tasm_compressor.merger.merge_tokens(
                        key_states=cur_kv.key_cache[layer_id],
                        value_states=cur_kv.value_cache[layer_id],
                        importance_scores=combined_importance,
                        num_keep=cur_remain_num,
                        visual_mask=img_mask.unsqueeze(0) if img_mask.dim() == 1 else img_mask,
                        protected_mask=ans_mask.unsqueeze(0) if ans_mask.dim() == 1 else ans_mask,
                    )
                    
                    # Update KV cache with merged tokens
                    custom_kv.key_cache[layer_id] = torch.cat([
                        key_backup[:, :, :pre_len],
                        merged_k
                    ], dim=2)
                    custom_kv.value_cache[layer_id] = torch.cat([
                        value_backup[:, :, :pre_len],
                        merged_v
                    ], dim=2)
                else:
                    # Fallback to hard top-k (EMLoC style)
                    remain_index = combined_importance.topk(cur_remain_num, dim=-1)[1]
                    
                    fill_values = torch.zeros_like(cnt_mask)
                    fill_values1 = torch.ones_like(cnt_mask)
                    cnt_mask.scatter_(dim=-1, index=remain_index, src=fill_values)
                    keep_mask.scatter_(dim=-1, index=remain_index, src=fill_values1)
                    
                    remain_index = remain_index.unsqueeze(-1).expand(-1, -1, -1, head_dim)
                    custom_kv.key_cache[layer_id] = torch.cat([
                        key_backup[:, :, :pre_len],
                        torch.gather(cur_kv.key_cache[layer_id], dim=2, index=remain_index)
                    ], dim=2)
                    custom_kv.value_cache[layer_id] = torch.cat([
                        value_backup[:, :, :pre_len],
                        torch.gather(cur_kv.value_cache[layer_id], dim=2, index=remain_index)
                    ], dim=2)
                
                # Validate compression with JS divergence
                current_output = self.model(
                    input_ids=None,
                    start_layer_num=layer_id,
                    position_ids=icl_position_ids,
                    attention_mask=icl_attention_mask,
                    past_key_values=past_key_values,
                    inputs_embeds=icl_tmp_hidden_states[layer_id],
                    use_cache=False,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    cache_position=cache_position,
                    custom_kv=custom_kv,
                    custom_kv_pos_offset=custom_kv_pos_offset,
                )
                select_new_logits = self.lm_head(current_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)
                
                # JS Divergence check
                stu_logits = F.softmax(select_new_logits, dim=-1, dtype=torch.bfloat16)
                tea_logits = F.softmax(select_ori_logits, dim=-1, dtype=torch.bfloat16)
                M = (0.5*(stu_logits+tea_logits)).log()
                js_loss = 0.5*(F.kl_div(M, (stu_logits).log(), log_target=True, reduction="batchmean") + 
                               F.kl_div(M, (tea_logits).log(), log_target=True, reduction="batchmean"))
                
                if js_loss <= delta_L_max:
                    remain_num = cur_remain_num
                    
                    # ========== TASM Innovation 3: Store in Hierarchical Memory ==========
                    if tasm_compressor is not None and tasm_compressor.config.enable_dynamic_retrieval:
                        # Get the compressed KV states
                        compressed_k = custom_kv.key_cache[layer_id][:, :, pre_len:]
                        compressed_v = custom_kv.value_cache[layer_id][:, :, pre_len:]
                        compressed_seq_len = compressed_k.shape[2]
                        
                        # Create importance scores matching compressed sequence length
                        # For compressed tokens, use uniform importance since they've already been selected
                        compressed_importance = torch.ones(
                            compressed_k.shape[0], compressed_k.shape[1], compressed_seq_len,
                            device=compressed_k.device, dtype=compressed_k.dtype
                        )
                        
                        # Store compressed KV in hierarchical memory
                        tasm_compressor.memory.store(
                            layer_idx=layer_id,
                            key_states=compressed_k,
                            value_states=compressed_v,
                            importance_scores=compressed_importance,
                        )
                    
                    # Update statistics
                    if not (tasm_compressor is not None and tasm_compressor.config.enable_merging):
                        cnt_mask = cnt_mask.squeeze().transpose(1,0)
                        img_tokens_num = cnt_mask[img_mask].sum()/kv_head_num
                        question_tokens_num = cnt_mask[question_mask].sum()/kv_head_num
                        ans_tokens_num = cnt_mask[ans_mask].sum()/kv_head_num
                        keep_mask = keep_mask.squeeze().transpose(1,0)
                        img_keep = keep_mask[img_mask].sum()/kv_head_num
                        question_keep = keep_mask[question_mask].sum()/kv_head_num
                        ans_keep = keep_mask[ans_mask].sum()/kv_head_num
                    break
                else:
                    custom_kv.key_cache[layer_id] = key_backup
                    custom_kv.value_cache[layer_id] = value_backup
            
            all_token_num += remain_num + pre_len
            all_img_num += img_tokens_num
            all_ques_num += question_tokens_num
            all_ans_num += ans_tokens_num
            all_img_keep += img_keep
            all_ques_keep += question_keep
            all_ans_keep += ans_keep
        
        # Calculate compression ratio
        # Use the actual KV cache length for comparison
        final_kv_len = custom_kv.get_past_seq_len()
        avg_tokens_per_layer = all_token_num / num_layers
        
        # The original length should be based on cur_kv (before compression)
        # which represents the full context length
        original_seq_len = cur_kv.key_cache[0].shape[2] if cur_kv.key_cache[0] is not None else input_ids.shape[-1]
        compression_ratio = 1.0 - (avg_tokens_per_layer / original_seq_len) if original_seq_len > 0 else 0.0
        
        print(f"[TASM] total_token_num: {final_kv_len}, avg: {avg_tokens_per_layer:.1f}, compression_ratio: {compression_ratio:.3f}")
        
        return custom_kv, custom_kv_pos_offset, all_token_num/num_layers, all_img_num, all_ques_num, all_ans_num, all_img_keep, all_ques_keep, all_ans_keep
    
    def compute_progressive_attention_scores(
        self,
        icl_attentions: List[torch.Tensor],
        icl_label_mask: torch.Tensor,
        icl_labels: torch.Tensor,
        kv_head_num: int,
        label_mask: torch.Tensor,
        num_layers: int,
        img_mask: torch.Tensor = None,
        ans_mask: torch.Tensor = None,
        use_token_type_weighting: bool = True,
    ) -> List[torch.Tensor]:
        """
        Build one per-layer importance score tensor from the ICL probe attentions.

        For every attention tensor in ``icl_attentions`` the method:

        1. drops the trailing ``icl_labels.shape[-1]`` key columns,
        2. sums the attention rows selected by ``icl_label_mask`` and folds the
           heads into ``kv_head_num`` groups,
        3. blends average pools of width 3, 5 and 7 (weights 0.3 / 0.5 / 0.2),
        4. mixes in the previous layer's blended score with weight 0.3
           (an exponential moving average across layers),
        5. optionally scales the positions in ``img_mask`` by 1.2 and those in
           ``ans_mask`` by 1.5,
        6. adds 100.0 at ``label_mask`` so those positions always rank first.

        Args:
            icl_attentions: Per-layer attention tensors, indexed here as
                ``(batch, heads, query, key)``.
            icl_label_mask: Integer tensor with two columns; column 0 indexes
                the batch axis and column 1 the query axis of the rows to sum.
            icl_labels: Only ``shape[-1]`` is read (number of trailing key
                columns to discard).
            kv_head_num: Number of head groups in the output.
            label_mask: Index (or mask) over key positions that receive the
                +100.0 protection bonus.
            num_layers: Accepted for interface compatibility; not read here.
            img_mask: Mask over key positions scaled by 1.2.
            ans_mask: Mask over key positions scaled by 1.5.
            use_token_type_weighting: Enables step 5; the scaling is applied
                only when both ``img_mask`` and ``ans_mask`` are provided.

        Returns:
            A list with one contiguous ``(1, kv_head_num, key_len)`` tensor per
            entry of ``icl_attentions``.
        """
        layer_scores = []
        carried_score = None
        ema_weight = 0.3  # share of the previous layer's score mixed into this one

        for layer_attn in icl_attentions:
            # Discard the key columns that belong to the ICL probe itself.
            context_attn = layer_attn[..., :-icl_labels.shape[-1]]

            # Bring the query axis next to batch so (batch, query) pairs can be gathered.
            query_major = context_attn.permute(0, 2, 1, 3)
            selected_rows = query_major[icl_label_mask[:, 0], icl_label_mask[:, 1]]
            head_totals = selected_rows.sum(dim=0, keepdim=True)

            # Fold attention heads into their KV-head groups.
            grouped = head_totals.reshape(1, kv_head_num, -1, head_totals.shape[-1])
            kv_scores = grouped.sum(dim=-2)

            # Multi-scale smoothing along the key axis.
            pooled = {
                width: F.avg_pool1d(kv_scores, kernel_size=width, padding=width // 2, stride=1)
                for width in (3, 5, 7)
            }
            blended = 0.5 * pooled[5] + 0.3 * pooled[3] + 0.2 * pooled[7]

            # Cross-layer smoothing; the carried value excludes the token-type
            # scaling and the label bonus applied below.
            if carried_score is not None:
                blended = (1 - ema_weight) * blended + ema_weight * carried_score
            carried_score = blended.clone()

            # Key axis first so position masks can index it directly (this is a view).
            token_major = blended.permute(2, 0, 1)

            if use_token_type_weighting and img_mask is not None and ans_mask is not None:
                token_major[img_mask] *= 1.2
                token_major[ans_mask] *= 1.5

            # Guarantee the labelled positions survive any top-k selection.
            token_major[label_mask] += 100.0

            layer_scores.append(token_major.permute(1, 2, 0).contiguous())

        return layer_scores

    @torch.inference_mode()
    def context_organize_pre_improved(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset: int = 0,
        img_chunk_num: int = 10,
        # Tuning switches specific to this "improved" variant
        adaptive_threshold: bool = True,
        use_layer_wise_ratio: bool = True,
        use_token_type_weighting: bool = True,
    ) -> Union[Tuple, "Qwen2VLCausalLMOutputWithPast"]:
        """
        Encode a context chunk, append its KV cache to ``custom_kv`` and prune it
        layer by layer, validating each pruning step with an ICL probe.

        Procedure:

        1. Embed the context (text plus optional image / video features) and run
           the decoder with ``use_cache=True``; the resulting cache becomes
           ``custom_kv`` or is merged into it via ``merge_other_kv``.
        2. Run the ICL probe against that cache with attentions and hidden
           states enabled; its logits at ``icl_labels > -100`` are the reference.
        3. Score key positions with ``compute_progressive_attention_scores``.
        4. From the last layer down to layer 0, try a list of increasing token
           budgets. For each budget keep the top-scoring positions, recompute
           the probe from that layer (``start_layer_num``) and accept the first
           budget whose Jensen-Shannon divergence from the reference is within
           the layer's threshold. If none passes, the layer's cache is restored.

        Budgets are ``int(input_ids.shape[-1] * r)`` for ``r`` in
        ``[1/20, 1/10, 1/5, 1/3, 1/2]``. With ``use_layer_wise_ratio`` each ``r``
        is multiplied by a factor rising linearly from 1.0 (layer 0) to 1.5 (last
        layer) and capped at 1.0. With ``adaptive_threshold`` the JS threshold
        falls linearly from 0.007 (layer 0) to 0.003 (last layer); otherwise it
        is 0.005 everywhere.

        NOTE(review): the original comments said shallow layers should keep more
        tokens, but the factor above gives *deeper* layers the larger budgets.
        The numeric behaviour is left untouched here; confirm the intent.

        Args:
            input_ids, labels, icl_labels: Required. ``labels`` / ``icl_labels``
                mark answer positions with values ``> -100``.
            attention_mask, position_ids, past_key_values, cache_position:
                Forwarded to the decoder. ``position_ids`` is derived with
                ``get_rope_index`` when omitted and the mask is absent or 2-D.
            inputs_embeds: Pre-computed context embeddings; when given, no
                vision encoding of the context happens in this call.
            pixel_values, image_grid_thw: Context images, encoded in groups of
                ``img_chunk_num`` images.
            pixel_values_videos, video_grid_thw: Context videos.
            icl_*: The same inputs for the ICL probe. When ``icl_pixel_values``
                is set, the context image features computed in this call are
                reused for the probe; if none were computed, ``icl_pixel_values``
                is encoded with ``icl_image_grid_thw`` instead.
            custom_kv: Cache accumulated from earlier chunks (mutated in place),
                or ``None`` to start from this chunk's cache.
            custom_kv_pos_offset: Position offset passed to the decoder; it is
                advanced by ``position_ids.max() + 1`` after the context pass.
            use_token_type_weighting: Forwarded to
                ``compute_progressive_attention_scores``.
            use_cache, output_attentions, return_dict, rope_deltas,
            icl_pixel_values_videos: Accepted for signature compatibility; their
                values do not influence the result.

        Returns:
            A 9-tuple ``(custom_kv, custom_kv_pos_offset, avg_tokens_per_layer,
            all_img_num, all_ques_num, all_ans_num, all_img_keep, all_ques_keep,
            all_ans_keep)``. The ``*_num`` entries accumulate dropped and the
            ``*_keep`` entries kept image / question / answer positions, counted
            only for layers where a budget was accepted.

        Raises:
            ValueError: If a required input is missing, or if the number of
                image / video placeholder tokens differs from the number of
                encoded features.

        Side effects:
            Sets ``self.rope_deltas`` when position ids are computed here and
            prints a one-line summary.
        """
        # Fail before any forward pass instead of with an opaque error afterwards.
        if input_ids is None or labels is None or icl_labels is None:
            raise ValueError(
                "context_organize_pre_improved requires `input_ids`, `labels` and `icl_labels`."
            )

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # ========== Context embeddings (text + image / video features) ==========
        # Stays None unless context images are encoded below; checked by the ICL branch.
        image_embeds = None
        if inputs_embeds is None:
            # Embedding lookup needs int64 ids.
            if input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                pixel_st = 0
                image_chunks = []
                # Encode `img_chunk_num` images per vision-tower call.
                chunk_iters = len(image_grid_thw) // img_chunk_num + int(len(image_grid_thw) % img_chunk_num > 0)
                for chunk_iter in range(chunk_iters):
                    grid_chunk = image_grid_thw[chunk_iter * img_chunk_num:(chunk_iter + 1) * img_chunk_num]
                    pixel_length = grid_chunk.prod(dim=-1).sum()
                    image_chunks.append(
                        self.visual(pixel_values[pixel_st:pixel_st + pixel_length], grid_thw=grid_chunk)
                    )
                    pixel_st += pixel_length
                image_embeds = torch.cat(image_chunks, dim=0)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}")
                image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}")
                video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
            position_ids, rope_deltas = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)
            self.rope_deltas = rope_deltas

        # ========== Context forward pass: produces this chunk's KV cache ==========
        # return_dict is forced on: the outputs are read by attribute below and
        # this method returns its own tuple regardless of the caller's setting.
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )

        custom_kv_pos_offset += (position_ids.max() + 1)
        cur_kv = outputs.past_key_values
        if custom_kv is None:
            # First chunk: custom_kv and cur_kv are the same object from here on.
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values)

        # ========== ICL probe embeddings ==========
        if icl_inputs_embeds is None:
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                if image_embeds is None:
                    # No context image features were produced in this call (e.g.
                    # `inputs_embeds` was supplied), so encode the probe's own images.
                    icl_image_embeds = self.visual(
                        icl_pixel_values.type(self.visual.get_dtype()), grid_thw=icl_image_grid_thw
                    )
                else:
                    # The probe reuses the context image features.
                    icl_image_embeds = image_embeds
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}")
                icl_image_mask = (icl_input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(icl_inputs_embeds).to(icl_inputs_embeds.device)
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)

        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            icl_position_ids, _ = self.get_rope_index(icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask)

        # ========== ICL probe forward: reference logits, attentions, hidden states ==========
        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=True,
            return_dict=True,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )

        icl_answer_mask = icl_labels > -100
        select_ori_logits = self.lm_head(icl_output[0][icl_answer_mask]).view(-1, self.config.vocab_size).to(torch.bfloat16)
        icl_tmp_hidden_states = icl_output.hidden_states
        icl_attentions = icl_output.attentions

        num_layers = len(self.model.layers)
        label_mask = torch.nonzero(labels.squeeze() > -100).squeeze()
        icl_label_mask = torch.nonzero(icl_labels.squeeze() > -100).squeeze()

        head_dim = custom_kv.key_cache[0].shape[-1]
        kv_head_num = custom_kv.key_cache[0].shape[1]

        # ========== Token-type masks over the context positions ==========
        img_mask = (input_ids == self.config.image_token_id).squeeze()
        ans_mask = labels.squeeze() > -100
        # "Question" = everything that is neither an image token nor an answer token.
        question_mask = torch.ones_like(labels.squeeze(), dtype=torch.bool)
        question_mask[ans_mask] = False
        question_mask[img_mask] = False

        # ========== Per-layer importance scores ==========
        pooled_IQA_sigs = self.compute_progressive_attention_scores(
            icl_attentions=icl_attentions,
            icl_label_mask=icl_label_mask,
            icl_labels=icl_labels,
            kv_head_num=kv_head_num,
            label_mask=label_mask,
            num_layers=num_layers,
            img_mask=img_mask,
            ans_mask=ans_mask,
            use_token_type_weighting=use_token_type_weighting,
        )

        # ========== Per-layer schedules ==========
        all_token_num = 0

        # Candidate keep ratios, tried from most to least aggressive.
        base_remain_ratios = [1/20, 1/10, 1/5, 1/3, 1/2]

        # Guard the linear schedules against a single-layer model (0 / 0).
        depth_span = max(num_layers - 1, 1)

        # JS threshold per layer: 0.007 at layer 0 falling to 0.003 at the last layer.
        if adaptive_threshold:
            delta_L_max_per_layer = [0.003 + 0.004 * (num_layers - 1 - i) / depth_span for i in range(num_layers)]
        else:
            delta_L_max_per_layer = [0.005] * num_layers

        # Keep-ratio multiplier per layer: 1.0 at layer 0 rising to 1.5 at the last layer.
        if use_layer_wise_ratio:
            layer_ratio_factors = [1.0 + 0.5 * i / depth_span for i in range(num_layers)]
        else:
            layer_ratio_factors = [1.0] * num_layers

        # The reference distribution is the same for every trial, so build it once.
        js_eps = 1e-10
        tea_logits = F.softmax(select_ori_logits, dim=-1, dtype=torch.bfloat16)
        tea_log = torch.log(tea_logits + js_eps)

        # Running statistics over accepted layers.
        all_img_num, all_ques_num, all_ans_num = 0, 0, 0
        all_img_keep, all_ques_keep, all_ans_keep = 0, 0, 0

        for layer_id in range(num_layers - 1, -1, -1):
            pre_len = custom_kv.pre_lens[layer_id] if len(custom_kv.pre_lens) > 0 else 0
            # Falls back to the uncompressed chunk length if no budget is accepted.
            remain_num = cur_kv.value_cache[layer_id].shape[-2]
            key_backup = custom_kv.key_cache[layer_id].clone()
            value_backup = custom_kv.value_cache[layer_id].clone()

            layer_scores = pooled_IQA_sigs[layer_id]
            delta_L_max = delta_L_max_per_layer[layer_id]

            layer_remain_ratios = [min(1.0, r * layer_ratio_factors[layer_id]) for r in base_remain_ratios]
            remain_nums = [int(input_ids.shape[-1] * r) for r in layer_remain_ratios]

            for cur_remain_num in remain_nums:
                kept_index = layer_scores.topk(cur_remain_num, dim=-1)[1]
                gather_index = kept_index.unsqueeze(-1).expand(-1, -1, -1, head_dim)

                # Trial cache for this layer: untouched prefix + selected positions.
                custom_kv.key_cache[layer_id] = torch.cat([
                    key_backup[:, :, :pre_len],
                    torch.gather(cur_kv.key_cache[layer_id], dim=2, index=gather_index)
                ], dim=2)
                custom_kv.value_cache[layer_id] = torch.cat([
                    value_backup[:, :, :pre_len],
                    torch.gather(cur_kv.value_cache[layer_id], dim=2, index=gather_index)
                ], dim=2)

                # Re-run the probe from this layer using its stored input hidden state.
                current_output = self.model(
                    input_ids=None,
                    start_layer_num=layer_id,
                    position_ids=icl_position_ids,
                    attention_mask=icl_attention_mask,
                    past_key_values=past_key_values,
                    inputs_embeds=icl_tmp_hidden_states[layer_id],
                    use_cache=False,
                    output_attentions=False,
                    output_hidden_states=False,
                    return_dict=True,
                    cache_position=cache_position,
                    custom_kv=custom_kv,
                    custom_kv_pos_offset=custom_kv_pos_offset,
                )

                select_new_logits = self.lm_head(current_output[0][icl_answer_mask]).view(-1, self.config.vocab_size).to(torch.bfloat16)

                # Jensen-Shannon divergence between pruned and reference predictions.
                # The mixture gets the same epsilon as its components; otherwise an
                # entry that underflows to 0 in both gives log(0) and an infinite loss.
                stu_logits = F.softmax(select_new_logits, dim=-1, dtype=torch.bfloat16)
                M = (0.5 * (stu_logits + tea_logits) + js_eps).log()
                js_loss = 0.5 * (F.kl_div(M, torch.log(stu_logits + js_eps), log_target=True, reduction="batchmean") +
                                 F.kl_div(M, tea_log, log_target=True, reduction="batchmean"))

                if js_loss <= delta_L_max:
                    remain_num = cur_remain_num

                    # 1 marks a kept position in keep_mask and a dropped one in cnt_mask.
                    keep_mask = torch.zeros_like(layer_scores).scatter_(
                        dim=-1, index=kept_index, src=torch.ones_like(layer_scores)
                    )
                    cnt_mask = torch.ones_like(layer_scores).scatter_(
                        dim=-1, index=kept_index, src=torch.zeros_like(layer_scores)
                    )
                    cnt_mask_t = cnt_mask.squeeze().transpose(1, 0)
                    keep_mask_t = keep_mask.squeeze().transpose(1, 0)

                    # Per-type counts, averaged over KV heads.
                    all_img_num += cnt_mask_t[img_mask].sum() / kv_head_num
                    all_ques_num += cnt_mask_t[question_mask].sum() / kv_head_num
                    all_ans_num += cnt_mask_t[ans_mask].sum() / kv_head_num
                    all_img_keep += keep_mask_t[img_mask].sum() / kv_head_num
                    all_ques_keep += keep_mask_t[question_mask].sum() / kv_head_num
                    all_ans_keep += keep_mask_t[ans_mask].sum() / kv_head_num
                    break
                else:
                    # Budget too small for this layer: undo the trial.
                    custom_kv.key_cache[layer_id] = key_backup
                    custom_kv.value_cache[layer_id] = value_backup

            all_token_num += remain_num + pre_len

        print("total_token_num:", custom_kv.get_past_seq_len(), all_token_num / num_layers)
        return custom_kv, custom_kv_pos_offset, all_token_num / num_layers, all_img_num, all_ques_num, all_ans_num, all_img_keep, all_ques_keep, all_ans_keep
    
    @torch.no_grad()
    def context_organize_bottom_up(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
        img_chunk_num = 10,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """Compress the KV cache of the current chunk layer by layer, first layer first.

        The chunk (`input_ids` plus optional image/video inputs) is run through the
        language model once with `use_cache=True`; the resulting cache is merged into
        `custom_kv`. The probe sequence (the `icl_*` arguments) is then run against the
        merged cache to obtain reference logits at the positions where
        `icl_labels > -100`, together with the per-layer attention maps.

        For every layer, budgets of 1/10, 1/5 and 1/2 of the chunk length are tried in
        that order: the highest-scoring cache positions (probe attention, summed per KV
        head and average-pooled) are kept, the model is re-run from that layer, and the
        first budget whose Jensen-Shannon divergence to the reference distribution is
        at most 0.005 is written back into `custom_kv`. A layer for which no budget
        qualifies keeps its full cache.

        Args:
            input_ids, attention_mask, position_ids, inputs_embeds, pixel_values,
            pixel_values_videos, image_grid_thw, video_grid_thw:
                Inputs describing the current chunk. `input_ids` is required because
                its last dimension defines the candidate budgets.
            labels: Chunk labels; positions with `labels > -100` get a +100 score bonus
                so they are selected first.
            past_key_values, cache_position, return_dict: Forwarded to `self.model`.
            output_attentions, output_hidden_states: Default to the config values when
                `None`; forwarded to some of the `self.model` calls.
            use_cache, rope_deltas, icl_pixel_values_videos: Accepted for signature
                compatibility; not read by this method.
            icl_input_ids, icl_inputs_embeds, icl_attention_mask, icl_position_ids,
            icl_labels, icl_image_grid_thw, icl_video_grid_thw:
                Inputs describing the probe sequence. The way `icl_labels` is indexed
                requires it to keep two dimensions after `squeeze()`.
            icl_pixel_values: When not `None`, image placeholders of the probe are
                filled with the chunk's image features if those were computed here,
                otherwise with features encoded from `icl_pixel_values`.
            custom_kv: Cache object accumulated over previous chunks (must expose
                `key_cache`, `value_cache`, `pre_lens` and `merge_other_kv`), or `None`
                for the first chunk. Modified in place.
            custom_kv_pos_offset: Position offset passed to `self.model`; advanced by
                `position_ids.max() + 1` after the chunk is encoded.
            img_chunk_num: Number of images encoded per vision-tower call.

        Returns:
            A tuple `(custom_kv, custom_kv_pos_offset, avg_tokens)` where `avg_tokens`
            is the mean over layers of `remain_num + pre_len`.

        Raises:
            ValueError: If the number of image/video placeholder tokens does not match
                the number of encoded visual features.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Image features of the current chunk. Stays None when nothing is encoded here
        # (no `pixel_values`, or the caller supplied `inputs_embeds` directly).
        image_embeds = None
        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                # Encode the images `img_chunk_num` at a time instead of in a single
                # vision-tower call.
                pixel_st = 0
                image_embed_chunks = []
                for chunk_start in range(0, len(image_grid_thw), img_chunk_num):
                    chunk_grid_thw = image_grid_thw[chunk_start:chunk_start + img_chunk_num]
                    # Rows of `pixel_values` consumed by this group: sum of t*h*w.
                    pixel_length = chunk_grid_thw.prod(dim=-1).sum()
                    sub_img_embeds = self.visual(
                        pixel_values[pixel_st:pixel_st + pixel_length],
                        grid_thw=chunk_grid_thw,
                    )
                    pixel_st += pixel_length
                    image_embed_chunks.append(sub_img_embeds)
                image_embeds = torch.cat(image_embed_chunks, dim=0)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            self.rope_deltas = rope_deltas

        # forward to get the kv cache of current chunk
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )

        # update position ids and merge the previous kv cache with the kv cache of the current chunk
        custom_kv_pos_offset += (position_ids.max()+1)
        cur_kv = outputs.past_key_values
        if custom_kv is None:
            # First chunk: `custom_kv` and `cur_kv` are the same object from here on.
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values)

        if icl_inputs_embeds is None:
            # Ensure icl_input_ids is long type for embedding lookup
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                if image_embeds is not None:
                    # Reuse the chunk's image features rather than re-encoding.
                    icl_image_embeds = image_embeds
                else:
                    # No chunk features are available (previously an unbound-variable
                    # crash): encode the probe images directly.
                    icl_pixel_values = icl_pixel_values.type(self.visual.get_dtype())
                    icl_image_embeds = self.visual(icl_pixel_values, grid_thw=icl_image_grid_thw)
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                icl_image_mask = (
                    (icl_input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            icl_position_ids, _ = self.get_rope_index(
                icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask
            )

        # demonstration examples as suffix to get attention scores and reference logits
        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=False,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        # Reference ("teacher") logits at the supervised probe positions.
        select_ori_logits = self.lm_head(icl_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)
        icl_attentions = icl_output.attentions

        num_layers = len(self.model.layers)
        # Indices of supervised positions inside the chunk.
        label_mask = torch.nonzero(labels.squeeze()>-100).squeeze()
        # (row, column) index pairs of supervised probe positions; do not need shift for loss
        icl_label_mask = torch.nonzero(icl_labels.squeeze()>-100).squeeze()

        # process the attention scores, unmask the previous kv cache and important textual answer tokens
        head_dim = custom_kv.key_cache[0].shape[-1]
        kv_head_num = custom_kv.key_cache[0].shape[1]
        pooled_IQA_sigs = []
        for attn_ in icl_attentions:
            # Drop the trailing key columns that belong to the probe itself.
            attn_ = attn_[..., :-icl_labels.shape[-1]]
            observe_attn_mask = icl_label_mask
            # Sum the attention rows of all supervised probe positions.
            IQA_sig_scores = attn_.permute(0,2,1,3)[observe_attn_mask[:,0], observe_attn_mask[:,1]].sum(dim=0, keepdim=True)
            # group kv: fold the query heads onto their KV head
            IQA_sig_scores = IQA_sig_scores.reshape(1, kv_head_num, -1, IQA_sig_scores.shape[-1]).sum(dim=-2)
            # Smooth along the key axis so neighbours of a salient position also score.
            pooled_IQA_sig = F.avg_pool1d(IQA_sig_scores, kernel_size=5, padding=2, stride=1)
            # unmask the label tokens and previously pruned tokens
            # NOTE(review): `label_mask` indexes positions of `labels`, while the key
            # axis here is whatever the attention map covers - confirm they line up
            # when `custom_kv` already holds earlier chunks.
            pooled_IQA_sig = pooled_IQA_sig.permute(2, 0, 1)
            pooled_IQA_sig[label_mask] += 100.0
            pooled_IQA_sig = pooled_IQA_sig.permute(1, 2, 0)
            pooled_IQA_sigs.append(pooled_IQA_sig)

        all_token_num = 0
        # Largest JS divergence to the reference logits that still counts as lossless.
        delta_L_max =  0.005
        # Candidate budgets, smallest first, as fractions of the chunk length.
        remain_nums =[int(input_ids.shape[-1]*r) for r in [1/10, 1/5, 1/2]]

        # layer kv cache compression
        # Hidden state entering the layer currently being compressed.
        icl_pre_hidden = icl_inputs_embeds
        for layer_id in range(num_layers):
            # Trial copy; only its `layer_id` entry is rewritten below.
            inner_kv = deepcopy(custom_kv)
            pre_len = custom_kv.pre_lens[layer_id] if len(custom_kv.pre_lens)>0 else 0
            # Default when no budget is accepted: the chunk keeps its full length.
            remain_num = cur_kv.value_cache[layer_id].shape[-2]

            for cur_remain_num in remain_nums:
                # Top-k positions per KV head, broadcast over the head dimension for gather.
                remain_index = pooled_IQA_sigs[layer_id].topk(cur_remain_num, dim=-1)[1]
                remain_index = remain_index.unsqueeze(-1).expand(-1, -1, -1, head_dim)
                # Trial cache = untouched prefix + selected positions of the current chunk.
                inner_kv.key_cache[layer_id] = torch.cat([
                    custom_kv.key_cache[layer_id][:,:,:pre_len],
                    torch.gather(cur_kv.key_cache[layer_id], dim=2, index=remain_index).contiguous()
                ], dim=2
                )
                inner_kv.value_cache[layer_id] = torch.cat([
                    custom_kv.value_cache[layer_id][:,:,:pre_len],
                    torch.gather(cur_kv.value_cache[layer_id], dim=2, index=remain_index).contiguous()
                ], dim=2
                )
                # Re-run the probe from `layer_id` onward against the trial cache.
                current_output = self.model(
                    input_ids=None,
                    start_layer_num=layer_id,
                    position_ids=icl_position_ids,
                    attention_mask=icl_attention_mask,
                    past_key_values=past_key_values,
                    inputs_embeds=icl_pre_hidden,
                    use_cache=False,
                    output_attentions=output_attentions,
                    output_hidden_states=True,
                    return_dict=return_dict,
                    cache_position=cache_position,
                    custom_kv=inner_kv,
                    custom_kv_pos_offset=custom_kv_pos_offset,
                )
                select_new_logits = self.lm_head(current_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)

                # JS Divergence between the trial ("student") and reference ("teacher")
                # distributions. F.kl_div(input, target) computes KL(target || input).
                # NOTE(review): if a probability underflows to zero the logs become
                # -inf and the loss may be NaN; NaN fails the comparison below, so
                # such a budget is rejected.
                stu_logits = F.softmax(select_new_logits, dim=-1, dtype=torch.bfloat16)
                tea_logits = F.softmax(select_ori_logits, dim=-1, dtype=torch.bfloat16)
                M = (0.5*(stu_logits+tea_logits)).log()
                js_loss = 0.5*(F.kl_div(M, torch.log(stu_logits), log_target=True, reduction="batchmean") +
                               F.kl_div(M, torch.log(tea_logits), log_target=True, reduction="batchmean"))

                if js_loss <= delta_L_max:
                    # Smallest acceptable budget found: commit it for this layer.
                    remain_num = cur_remain_num
                    custom_kv.key_cache[layer_id] = inner_kv.key_cache[layer_id]
                    custom_kv.value_cache[layer_id] = inner_kv.value_cache[layer_id]
                    # hidden_states[1] is used as the input of the next layer.
                    icl_pre_hidden = current_output.hidden_states[1]
                    break
            else:
                # No budget was accepted, so this layer keeps its full cache. The
                # probe's hidden state must still be advanced through this layer;
                # otherwise the next iteration would start at `layer_id + 1` with the
                # input of `layer_id`.
                if layer_id + 1 < num_layers:
                    current_output = self.model(
                        input_ids=None,
                        start_layer_num=layer_id,
                        position_ids=icl_position_ids,
                        attention_mask=icl_attention_mask,
                        past_key_values=past_key_values,
                        inputs_embeds=icl_pre_hidden,
                        use_cache=False,
                        output_attentions=False,
                        output_hidden_states=True,
                        return_dict=return_dict,
                        cache_position=cache_position,
                        custom_kv=custom_kv,
                        custom_kv_pos_offset=custom_kv_pos_offset,
                    )
                    icl_pre_hidden = current_output.hidden_states[1]

            all_token_num+=remain_num+pre_len
        # Drop the large temporaries before asking the allocator to release cached blocks.
        del cur_kv, inner_kv, pooled_IQA_sigs, current_output, stu_logits, tea_logits, M, select_new_logits, select_ori_logits, IQA_sig_scores, pooled_IQA_sig, icl_attentions, outputs, icl_output
        torch.cuda.empty_cache()
        # Average over the model's actual depth rather than a hard-coded layer count.
        return custom_kv, custom_kv_pos_offset, all_token_num/num_layers
    
    
    @torch.inference_mode()
    def context_organize(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
        img_chunk_num = 4,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """Compress the KV cache of the current chunk in groups of layers, last layers first.

        The chunk (`input_ids` plus optional image/video inputs) is run through the
        language model once with `use_cache=True`; the resulting cache is merged into
        `custom_kv`. The probe sequence (the `icl_*` arguments) is then run against the
        merged cache to obtain reference logits at the positions where
        `icl_labels > -100`, the per-layer attention maps and the per-layer hidden
        states.

        Layers are visited from the last one down in groups of two. For each group,
        budgets of 1/10, 1/5 and 1/2 of the chunk length are tried in that order: in
        every layer of the group the highest-scoring cache positions (probe attention,
        summed per KV head and average-pooled) are kept, the model is re-run from the
        lowest layer of the group using the stored hidden state of the reference pass,
        and the first budget whose Jensen-Shannon divergence to the reference
        distribution is at most 0.005 is written back into `custom_kv`. A group for
        which no budget qualifies keeps its full cache.

        Args:
            input_ids, attention_mask, position_ids, inputs_embeds, pixel_values,
            pixel_values_videos, image_grid_thw, video_grid_thw:
                Inputs describing the current chunk. `input_ids` is required because
                its last dimension defines the candidate budgets.
            labels: Chunk labels; positions with `labels > -100` get a +100 score bonus
                so they are selected first.
            past_key_values, cache_position, return_dict: Forwarded to `self.model`.
            output_attentions, output_hidden_states: Default to the config values when
                `None`; forwarded to some of the `self.model` calls.
            use_cache, rope_deltas, icl_pixel_values_videos: Accepted for signature
                compatibility; not read by this method.
            icl_input_ids, icl_inputs_embeds, icl_attention_mask, icl_position_ids,
            icl_labels, icl_image_grid_thw, icl_video_grid_thw:
                Inputs describing the probe sequence. The way `icl_labels` is indexed
                requires it to keep two dimensions after `squeeze()`.
            icl_pixel_values: When not `None`, image placeholders of the probe are
                filled with the chunk's image features if those were computed here,
                otherwise with features encoded from `icl_pixel_values`.
            custom_kv: Cache object accumulated over previous chunks (must expose
                `key_cache`, `value_cache`, `pre_lens` and `merge_other_kv`), or `None`
                for the first chunk. Modified in place.
            custom_kv_pos_offset: Position offset passed to `self.model`; advanced by
                `position_ids.max() + 1` after the chunk is encoded.
            img_chunk_num: Number of images encoded per vision-tower call.

        Returns:
            A tuple `(custom_kv, custom_kv_pos_offset, avg_tokens)` where `avg_tokens`
            is the per-layer mean of the cache lengths recorded for each group.

        Raises:
            ValueError: If the number of image/video placeholder tokens does not match
                the number of encoded visual features.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Image features of the current chunk. Stays None when nothing is encoded here
        # (no `pixel_values`, or the caller supplied `inputs_embeds` directly).
        image_embeds = None
        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                # Encode the images `img_chunk_num` at a time instead of in a single
                # vision-tower call.
                pixel_st = 0
                image_embed_chunks = []
                for chunk_start in range(0, len(image_grid_thw), img_chunk_num):
                    chunk_grid_thw = image_grid_thw[chunk_start:chunk_start + img_chunk_num]
                    # Rows of `pixel_values` consumed by this group: sum of t*h*w.
                    pixel_length = chunk_grid_thw.prod(dim=-1).sum()
                    sub_img_embeds = self.visual(
                        pixel_values[pixel_st:pixel_st + pixel_length],
                        grid_thw=chunk_grid_thw,
                    )
                    pixel_st += pixel_length
                    image_embed_chunks.append(sub_img_embeds)
                image_embeds = torch.cat(image_embed_chunks, dim=0)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            self.rope_deltas = rope_deltas

        # forward to get the kv cache of current chunk
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )

        # update position ids and merge the previous kv cache with the kv cache of the current chunk
        custom_kv_pos_offset += (position_ids.max()+1)
        cur_kv = outputs.past_key_values
        if custom_kv is None:
            # First chunk: `custom_kv` and `cur_kv` are the same object from here on.
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values)

        if icl_inputs_embeds is None:
            # Ensure icl_input_ids is long type for embedding lookup
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                if image_embeds is not None:
                    # Reuse the chunk's image features rather than re-encoding.
                    icl_image_embeds = image_embeds
                else:
                    # No chunk features are available (previously an unbound-variable
                    # crash): encode the probe images directly.
                    icl_pixel_values = icl_pixel_values.type(self.visual.get_dtype())
                    icl_image_embeds = self.visual(icl_pixel_values, grid_thw=icl_image_grid_thw)
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                icl_image_mask = (
                    (icl_input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            icl_position_ids, _ = self.get_rope_index(
                icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask
            )

        # demonstration examples as suffix to get attention scores and hidden states
        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=True,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        # Reference ("teacher") logits at the supervised probe positions.
        select_ori_logits = self.lm_head(icl_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)
        # Hidden states of the reference pass; entry i is used as the input of layer i.
        icl_tmp_hidden_states= icl_output.hidden_states
        icl_attentions = icl_output.attentions

        num_layers = len(self.model.layers)
        # Indices of supervised positions inside the chunk.
        label_mask = torch.nonzero(labels.squeeze()>-100).squeeze()
        # (row, column) index pairs of supervised probe positions; do not need shift for loss
        icl_label_mask = torch.nonzero(icl_labels.squeeze()>-100).squeeze()

        # process the attention scores, unmask the previous kv cache and important textual answer tokens
        head_dim = custom_kv.key_cache[0].shape[-1]
        kv_head_num = custom_kv.key_cache[0].shape[1]
        pooled_IQA_sigs = []

        for attn_ in icl_attentions:
            # Drop the trailing key columns that belong to the probe itself.
            attn_ = attn_[..., :-icl_labels.shape[-1]]
            observe_attn_mask = icl_label_mask
            # Sum the attention rows of all supervised probe positions.
            IQA_sig_scores = attn_.permute(0,2,1,3)[observe_attn_mask[:,0], observe_attn_mask[:,1]].sum(dim=0, keepdim=True)
            # group kv: fold the query heads onto their KV head
            IQA_sig_scores = IQA_sig_scores.reshape(1, kv_head_num, -1, IQA_sig_scores.shape[-1]).sum(dim=-2)
            # Smooth along the key axis so neighbours of a salient position also score.
            pooled_IQA_sig = F.avg_pool1d(IQA_sig_scores, kernel_size=5, padding=2, stride=1)
            # unmask the label tokens and previously pruned tokens
            # NOTE(review): `label_mask` indexes positions of `labels`, while the key
            # axis here is whatever the attention map covers - confirm they line up
            # when `custom_kv` already holds earlier chunks.
            pooled_IQA_sig = pooled_IQA_sig.permute(2, 0, 1)
            pooled_IQA_sig[label_mask] += 100.0
            pooled_IQA_sig = pooled_IQA_sig.permute(1, 2, 0).contiguous()
            pooled_IQA_sigs.append(pooled_IQA_sig)

        # Number of consecutive layers that share one budget decision.
        groups = 2
        all_token_num = 0
        # Largest JS divergence to the reference logits that still counts as lossless.
        delta_L_max =  0.005
        # Candidate budgets, smallest first, as fractions of the chunk length.
        remain_nums =[int(input_ids.shape[-1]*r) for r in [1/10, 1/5, 1/2]]

        for layer_id in range(num_layers-1, -1, -groups):
            # Layers of this group, top-most first. Clamped at layer 0 so that a depth
            # that is not a multiple of `groups` cannot wrap around to negative indices.
            group_layers = list(range(layer_id, max(layer_id - groups, -1), -1))
            start_layer = group_layers[-1]

            # Trial copy; only the entries of this group are rewritten below.
            inner_kv = deepcopy(custom_kv)
            # Cache length in excess of the current chunk length.
            pre_compressed_len = custom_kv.key_cache[layer_id].shape[-2] - input_ids.shape[-1]
            # Default when no budget is accepted: the full cache length.
            remain_num = custom_kv.value_cache[layer_id].shape[-2]
            for cur_remain_num in remain_nums:

                for group_layer in group_layers:
                    # Top-k positions per KV head, broadcast over the head dimension for gather.
                    remain_index = pooled_IQA_sigs[group_layer].topk(cur_remain_num, dim=-1)[1]
                    remain_index = remain_index.unsqueeze(-1).expand(-1, -1, -1, head_dim)
                    pre_len = custom_kv.pre_lens[group_layer] if len(custom_kv.pre_lens)>0 else 0
                    # Trial cache = untouched prefix + selected positions of the current chunk.
                    inner_kv.key_cache[group_layer] = torch.cat([
                        custom_kv.key_cache[group_layer][:,:,:pre_len],
                        torch.gather(cur_kv.key_cache[group_layer], dim=2, index=remain_index).contiguous()
                    ], dim=2
                    )
                    inner_kv.value_cache[group_layer] = torch.cat([
                        custom_kv.value_cache[group_layer][:,:,:pre_len],
                        torch.gather(cur_kv.value_cache[group_layer], dim=2, index=remain_index).contiguous()
                    ], dim=2
                    )
                # Re-run the probe from the lowest layer of the group. Layers below the
                # group are still uncompressed, so the stored hidden state of the
                # reference pass is reused as the input.
                current_output = self.model(
                    input_ids=None,
                    start_layer_num=start_layer,
                    position_ids=icl_position_ids,
                    attention_mask=icl_attention_mask,
                    past_key_values=past_key_values,
                    inputs_embeds=icl_tmp_hidden_states[start_layer],
                    use_cache=False,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    cache_position=cache_position,
                    custom_kv=inner_kv,
                    custom_kv_pos_offset=custom_kv_pos_offset,
                )

                select_new_logits = self.lm_head(current_output[0][icl_labels>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)

                # JS Divergence between the trial ("student") and reference ("teacher")
                # distributions. F.kl_div(input, target) computes KL(target || input).
                # NOTE(review): if a probability underflows to zero the logs become
                # -inf and the loss may be NaN; NaN fails the comparison below, so
                # such a budget is rejected.
                stu_logits = F.softmax(select_new_logits, dim=-1)
                tea_logits = F.softmax(select_ori_logits, dim=-1)
                M = (0.5*(stu_logits+tea_logits)).log()
                js_loss = 0.5*(F.kl_div(M, (stu_logits).log(), log_target=True, reduction="batchmean") +
                               F.kl_div(M, (tea_logits).log(), log_target=True, reduction="batchmean"))
                if js_loss <= delta_L_max:
                    # Smallest acceptable budget found: commit the trial cache. Layers
                    # outside the group are unchanged copies, so replacing the whole
                    # list only alters this group.
                    remain_num = cur_remain_num+pre_compressed_len
                    custom_kv.key_cache = inner_kv.key_cache
                    custom_kv.value_cache = inner_kv.value_cache
                    break

            all_token_num += remain_num*len(group_layers)
        del cur_kv, icl_tmp_hidden_states, inner_kv, pooled_IQA_sigs, current_output, stu_logits, tea_logits, M, select_new_logits, select_ori_logits, IQA_sig_scores, pooled_IQA_sig, icl_attentions, outputs, icl_output
        # Average over the model's actual depth rather than a hard-coded layer count.
        return custom_kv, custom_kv_pos_offset, all_token_num/num_layers
    
    
    @torch.no_grad()
    def video_group_organize(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """Prefill one group of inputs and append its KV cache to ``custom_kv``.

        Token embeddings are looked up from ``input_ids`` (unless ``inputs_embeds`` is
        supplied), image/video placeholder positions are overwritten with vision-tower
        features, and the decoder is run once with ``use_cache=True``. The cache it
        produces becomes ``custom_kv`` when none was passed; otherwise it is merged with
        ``custom_kv.merge_other_kv(..., is_online_video=True)``.

        Args:
            input_ids: Token ids; cast to ``torch.long`` before the embedding lookup.
            attention_mask: Forwarded to the decoder and to ``get_rope_index``.
            position_ids: Position ids. When ``None`` they are derived with
                ``get_rope_index`` (requires ``input_ids`` and a missing or 2D mask).
            past_key_values: Forwarded unchanged to the decoder.
            inputs_embeds: Pre-computed embeddings; skips the embedding/vision stage.
            output_hidden_states: Forwarded to the decoder (defaults to the config value).
            return_dict: Forwarded to the decoder (defaults to the config value). The
                result is read through ``.past_key_values``.
            pixel_values / image_grid_thw: Image patches and their grid sizes.
            pixel_values_videos / video_grid_thw: Video patches and their grid sizes;
                encoded four grid rows at a time.
            cache_position: Forwarded unchanged to the decoder.
            custom_kv: Cache accumulated so far, or ``None`` to start a new one.
            custom_kv_pos_offset: Running position offset. It is advanced here but the
                decoder itself is always called with ``custom_kv_pos_offset=0``.
            labels, use_cache, output_attentions, rope_deltas, icl_*: Accepted for
                signature compatibility; not read by this method.

        Returns:
            ``(custom_kv, custom_kv_pos_offset, seq_len)`` where the offset has been
            increased by ``position_ids.max() + 1`` and ``seq_len`` is the number of
            token positions processed in this call.

        Raises:
            ValueError: If the number of image/video placeholder tokens differs from the
                number of vision features, or if no position ids are available.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                # Encode a few grid rows per vision-tower call instead of all at once.
                vid_sub_bs = 4
                pixel_st = 0
                video_embeds = []
                for grid_st in range(0, len(video_grid_thw), vid_sub_bs):
                    sub_grid_thw = video_grid_thw[grid_st:grid_st + vid_sub_bs]
                    # Rows of pixel_values_videos consumed by this chunk: sum of the grid-size products.
                    pixel_length = int(sub_grid_thw.prod(dim=-1).sum())
                    video_embeds.append(
                        self.visual(pixel_values_videos[pixel_st:pixel_st + pixel_length], grid_thw=sub_grid_thw)
                    )
                    pixel_st += pixel_length
                video_embeds = torch.cat(video_embeds, dim=0)

                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            self.rope_deltas = rope_deltas

        # The offset update below needs concrete position ids; fail early with a clear
        # message instead of an AttributeError on ``None`` after the forward pass.
        if position_ids is None:
            raise ValueError(
                "position_ids could not be derived; pass `position_ids` explicitly when calling with "
                "`inputs_embeds` only or with a 4D `attention_mask`."
            )

        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=0,
        )
        custom_kv_pos_offset += (position_ids.max()+1) # for icl
        if custom_kv is None:
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values, is_online_video=True)

        # ``input_ids`` is None when the caller passed ``inputs_embeds`` directly; the
        # sequence axis of the embeddings carries the same length.
        seq_len = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        return custom_kv, custom_kv_pos_offset, seq_len
    
    # @torch.compile
    def clone_kv_cache(self, old_cache: DynamicCache) -> DynamicCache:
        """Build a new ``DynamicCache`` holding cloned key/value tensors of ``old_cache``.

        ``_seen_tokens`` and ``pre_ratios`` are carried over by plain assignment (so
        ``pre_ratios`` refers to the same object as in ``old_cache``), while ``pre_lens``
        is duplicated through its own ``.copy()`` method.
        """
        duplicate = DynamicCache()
        duplicate._seen_tokens = old_cache._seen_tokens
        # Clone every per-layer tensor so later edits to the copy leave the source intact.
        for attr_name in ("key_cache", "value_cache"):
            setattr(duplicate, attr_name, [layer_tensor.clone() for layer_tensor in getattr(old_cache, attr_name)])
        # If pre_lens ever stores tensors, switch to [p.clone() for p in old_cache.pre_lens].
        duplicate.pre_lens = old_cache.pre_lens.copy()
        duplicate.pre_ratios = old_cache.pre_ratios
        return duplicate
    
    # @torch.compile
    def compute_js_div(self, observe_tokens, select_ori_logits):
        """Jensen-Shannon divergence between new and reference next-token distributions.

        ``observe_tokens`` is projected through ``self.lm_head`` and flattened to rows of
        ``self.config.vocab_size`` logits; ``select_ori_logits`` supplies the reference
        logits with the same row layout.

        The divergence is evaluated in float32 log-space. Taking ``softmax`` first and
        ``log`` afterwards turns any probability that underflows to 0 into ``-inf``,
        and ``kl_div(..., log_target=True)`` then yields ``0 * (-inf - -inf) = NaN``;
        a NaN result makes every ``js_loss <= threshold`` comparison silently False.

        Args:
            observe_tokens: Hidden states fed to ``self.lm_head``.
            select_ori_logits: Reference logits, reshaped by the caller to
                ``(-1, vocab_size)``.

        Returns:
            Scalar bfloat16 tensor: the JS divergence with ``batchmean`` reduction
            (sum over all elements divided by the number of rows).
        """
        # Round the new logits to bfloat16 exactly as before, so both sides of the
        # comparison are quantised the same way when the caller passes bf16 reference logits.
        select_new_logits = self.lm_head(observe_tokens).reshape(-1, self.config.vocab_size).to(torch.bfloat16)

        stu_log_probs = F.log_softmax(select_new_logits.float(), dim=-1)
        tea_log_probs = F.log_softmax(select_ori_logits.float(), dim=-1)
        # log(0.5 * (p + q)) computed without leaving log-space.
        mix_log_probs = torch.logaddexp(stu_log_probs, tea_log_probs) - math.log(2.0)

        js_loss = 0.5 * (
            F.kl_div(mix_log_probs, stu_log_probs, log_target=True, reduction="batchmean")
            + F.kl_div(mix_log_probs, tea_log_probs, log_target=True, reduction="batchmean")
        )
        return js_loss.to(torch.bfloat16)
    
    @torch.inference_mode()
    def video_context_organize(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """Prefill a context, then prune its KV cache guided by an ICL probe prompt.

        Steps performed:

        1. The context (``input_ids`` plus optional image/video pixels) is embedded and
           run through the decoder with ``use_cache=True``; the resulting cache becomes
           or is merged into ``custom_kv`` (``is_online_video=False``).
        2. The probe prompt (``icl_*`` arguments) is run on top of that cache with
           attentions and hidden states returned.
        3. Attention paid to the cached positions is averaged over the query heads that
           share a KV head, summed over query positions and smoothed with a width-5
           average pool, giving one importance score per (layer, batch, KV head, position).
        4. Layers are visited from the top in groups of 7. For each group the keep
           ratios 2%, 5% and 10% of the context length are tried in order: the top
           scoring positions are gathered, the decoder is re-run from the group's first
           layer on the stored probe hidden states, and the first ratio whose JS
           divergence (last 16 probe positions, see ``compute_js_div``) is at most 0.05
           is committed to the cache.

        Args:
            input_ids, attention_mask, position_ids, inputs_embeds, pixel_values,
            pixel_values_videos, image_grid_thw, video_grid_thw: Context inputs.
                Video patches are encoded four temporal steps at a time using row 0 of
                ``video_grid_thw`` (NOTE(review): this looks like it assumes a single
                video per call - confirm against callers).
            icl_input_ids, icl_inputs_embeds, icl_attention_mask, icl_position_ids,
            icl_pixel_values, icl_pixel_values_videos, icl_image_grid_thw,
            icl_video_grid_thw: Probe prompt inputs. Derived probe position ids are
                shifted by ``position_ids.max() + 1``.
            past_key_values, cache_position, output_hidden_states, return_dict:
                Forwarded to the decoder calls.
            custom_kv: Cache accumulated so far, or ``None``.
            custom_kv_pos_offset: Running position offset; advanced by
                ``position_ids.max() + 1``.
            labels, use_cache, output_attentions, rope_deltas, icl_labels: Accepted for
                signature compatibility; not read by this method.

        Returns:
            ``(kv, custom_kv_pos_offset, avg_tokens_per_layer, vision_seconds)`` where
            ``kv`` is the pruned cache, ``avg_tokens_per_layer`` is the retained cache
            length averaged over layers, and ``vision_seconds`` is the wall-clock time
            spent encoding the context's images/videos (``0.0`` when ``inputs_embeds``
            was supplied).

        Raises:
            ValueError: If placeholder-token and vision-feature counts disagree, or if
                no context position ids are available.
        """
        # Imported locally so this method does not rely on a module-level ``import time``.
        import time

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Always defined, so the return statement is valid on every path (previously the
        # timing variables only existed when video pixels were encoded).
        vision_elapsed = 0.0

        if inputs_embeds is None:
            # Ensure input_ids is long type for embedding lookup
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)
            v_start = time.time()
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                # Split the clip along column 0 of the grid into pieces of ``sub_bs``
                # steps so the vision tower never sees the whole video at once.
                sub_bs = 4
                total_t = int(video_grid_thw[0, 0])
                tail_t = total_t % sub_bs
                vclip_num = total_t // sub_bs + int(tail_t > 0)
                split_video_grid_thw = video_grid_thw.clone()
                split_video_grid_thw[:, 0] = sub_bs
                st_ind = 0
                video_embeds = []
                for iter_num in range(vclip_num):
                    if iter_num == vclip_num - 1 and tail_t > 0:
                        # Last piece is shorter when the length is not a multiple of sub_bs.
                        split_video_grid_thw[:, 0] = tail_t
                    end_ind = st_ind + int(split_video_grid_thw[0].prod(dim=-1))
                    video_embeds.append(self.visual(pixel_values_videos[st_ind:end_ind], split_video_grid_thw))
                    st_ind = end_ind
                video_embeds = torch.cat(video_embeds, dim=0)

                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
            vision_elapsed = time.time() - v_start
            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            self.rope_deltas = rope_deltas

        # Both the offset update and the probe position shift need concrete position ids.
        if position_ids is None:
            raise ValueError(
                "position_ids could not be derived; pass `position_ids` explicitly when calling with "
                "`inputs_embeds` only or with a 4D `attention_mask`."
            )

        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=0,
        )
        custom_kv_pos_offset += (position_ids.max()+1) # for icl
        if custom_kv is None:
            custom_kv = outputs.past_key_values
        else:
            custom_kv.merge_other_kv(outputs.past_key_values, is_online_video=False)

        # ICL example forward
        if icl_inputs_embeds is None:
            # Ensure icl_input_ids is long type for embedding lookup
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                icl_pixel_values = icl_pixel_values.type(self.visual.get_dtype())
                icl_image_embeds = self.visual(icl_pixel_values, grid_thw=icl_image_grid_thw)
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                icl_image_mask = (
                    (icl_input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_pixel_values_videos is not None: # unused code
                icl_pixel_values_videos = icl_pixel_values_videos.type(self.visual.get_dtype())
                icl_video_embeds = self.visual(icl_pixel_values_videos, grid_thw=icl_video_grid_thw)
                n_video_tokens = (icl_input_ids == self.config.video_token_id).sum().item()
                n_video_features = icl_video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                icl_video_mask = (
                    (icl_input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_video_embeds = icl_video_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_video_mask, icl_video_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            icl_position_ids, _ = self.get_rope_index(
                icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask
            )
            # The probe follows the context, so its positions start after the context's last one.
            icl_position_ids = icl_position_ids+(position_ids.max()+1)

        icl_output = self.model(
            input_ids=None,
            position_ids=icl_position_ids,
            attention_mask=icl_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=icl_inputs_embeds,
            use_cache=False,
            output_attentions=True,
            output_hidden_states=True,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=0,
        )
        out_attentions = icl_output.attentions
        icl_hidden_states = icl_output[0]
        icl_tmp_hidden_states = icl_output.hidden_states

        num_layers = len(out_attentions)
        head_dim = custom_kv.key_cache[0].shape[-1]
        kv_head_num = custom_kv.key_cache[0].shape[1]
        icl_len = icl_inputs_embeds.shape[1]

        # Stack the per-layer attentions, drop the probe's own key columns, average the
        # query heads belonging to each KV head and sum over the probe's query positions.
        # ``(kv_head_num, -1)`` equals the former hard-coded ``(4, 7)`` split for
        # 28 query heads / 4 KV heads.
        pooled_IQA_sigs = (
            torch.stack(list(out_attentions))[..., :-icl_len]
            .unflatten(2, (kv_head_num, -1))
            .mean(dim=-3)
            .sum(dim=-2)
        )
        # Smooth the scores along the position axis, then restore the leading axes.
        pooled_IQA_sigs = F.avg_pool1d(
            pooled_IQA_sigs.flatten(0, 2), kernel_size=5, padding=2, stride=1
        ).unflatten(0, (num_layers, -1, kv_head_num))

        # Length of the context processed in this call; falls back to the embeddings
        # when the caller did not pass token ids.
        context_len = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        delta_L_max = 0.05  # largest JS divergence accepted for a pruned group
        remain_ratios = (0.02, 0.05, 0.10)  # candidate keep ratios, most aggressive first
        # Layers pruned together per decision.
        # NOTE(review): the loop below assumes num_layers is a multiple of ``groups``.
        groups = 7

        outer_kv = custom_kv

        # Reference logits for the last 16 probe positions, computed on the unpruned cache.
        select_ori_logits = self.lm_head(icl_hidden_states[:,-16:]).reshape(-1, self.config.vocab_size).to(torch.bfloat16).contiguous()

        all_token_num = 0
        for layer_id in range(num_layers-1, -1, -groups):
            inner_kv = self.clone_kv_cache(outer_kv)
            # Without an accepted ratio the group keeps its full cache length.
            remain_num = outer_kv.key_cache[layer_id].shape[-2]
            pre_compressed_len = remain_num - context_len
            first_layer = layer_id-groups+1
            for cur_remain_ratio in remain_ratios:
                cur_remain_num = int(cur_remain_ratio*context_len)
                for group_id in range(groups):
                    cur_layer = layer_id-group_id
                    remain_index = pooled_IQA_sigs[cur_layer].topk(cur_remain_num, dim=-1)[1]
                    remain_index = remain_index.unsqueeze(-1).expand(-1, -1, -1, head_dim)
                    pre_len = outer_kv.pre_lens[cur_layer] if len(outer_kv.pre_lens)>0 else 0

                    # Keep the first ``pre_len`` entries untouched and gather the selected
                    # positions from the remainder.
                    inner_kv.key_cache[cur_layer] = torch.cat([
                        outer_kv.key_cache[cur_layer][:,:,:pre_len],
                        torch.gather(outer_kv.key_cache[cur_layer][:,:,pre_len:], dim=2, index=remain_index).contiguous()
                    ], dim=2
                    )
                    inner_kv.value_cache[cur_layer] = torch.cat([
                        outer_kv.value_cache[cur_layer][:,:,:pre_len],
                        torch.gather(outer_kv.value_cache[cur_layer][:,:,pre_len:], dim=2, index=remain_index).contiguous()
                    ], dim=2
                    )

                # Re-run the probe from the first layer of this group, feeding the hidden
                # states recorded at that depth during the unpruned probe pass.
                current_output = self.model(
                    input_ids=None,
                    start_layer_num=first_layer,
                    position_ids=icl_position_ids,
                    attention_mask=icl_attention_mask,
                    past_key_values=past_key_values,
                    inputs_embeds=icl_tmp_hidden_states[first_layer],
                    use_cache=False,
                    output_attentions=False,
                    output_hidden_states=output_hidden_states,
                    return_dict=return_dict,
                    cache_position=cache_position,
                    custom_kv=inner_kv,
                    custom_kv_pos_offset=0,
                )
                js_loss = self.compute_js_div(current_output[0][:,-16:], select_ori_logits)
                if js_loss <= delta_L_max:
                    # Accept the most aggressive ratio that stays within the budget.
                    remain_num = cur_remain_num + pre_compressed_len
                    outer_kv.key_cache = inner_kv.key_cache
                    outer_kv.value_cache = inner_kv.value_cache
                    break
            all_token_num += remain_num*groups

        # Average retained length per layer (formerly divided by a hard-coded 28).
        return outer_kv, custom_kv_pos_offset, all_token_num/num_layers, vision_elapsed
    
    def forward_context(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        """Prefill a context segment and fold its KV cache into ``custom_kv``.

        When ``inputs_embeds`` is not given, ``input_ids`` are embedded and the image
        and video placeholder positions are filled with vision-tower features (images
        encoded 20 grid rows per call, videos 4). The decoder then runs once with
        ``use_cache=True`` and the incoming ``custom_kv_pos_offset``; its cache either
        becomes ``custom_kv`` or is merged through ``custom_kv.merge_other_kv``.

        Returns:
            ``(custom_kv, custom_kv_pos_offset)`` with the offset increased by
            ``position_ids.max() + 1``.

        Raises:
            ValueError: If placeholder-token and vision-feature counts disagree.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        def _encode_in_chunks(pixels, grid_thw, rows_per_chunk):
            """Run the vision tower over ``rows_per_chunk`` grid rows at a time and concatenate."""
            pieces = []
            cursor = 0
            for first_row in range(0, len(grid_thw), rows_per_chunk):
                grid_rows = grid_thw[first_row:first_row + rows_per_chunk]
                # Number of pixel rows belonging to these grid entries.
                span = grid_rows.prod(dim=-1).sum()
                pieces.append(self.visual(pixels[cursor:cursor+span], grid_thw=grid_rows))
                cursor += span
            return torch.cat(pieces, dim=0)

        if inputs_embeds is None:
            # Embedding lookup needs long indices.
            if input_ids is not None and input_ids.dtype != torch.long:
                input_ids = input_ids.long()
            inputs_embeds = self.model.embed_tokens(input_ids)

            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                image_embeds = _encode_in_chunks(pixel_values, image_grid_thw, 20)
                is_image_token = input_ids == self.config.image_token_id
                n_image_tokens = is_image_token.sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = is_image_token.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = _encode_in_chunks(pixel_values_videos, video_grid_thw, 4)
                is_video_token = input_ids == self.config.video_token_id
                n_video_tokens = is_video_token.sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = is_video_token.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # RoPE indices can only be derived from token ids and a missing or 2D mask
        # (a 4D mask does not allow computing rope deltas).
        if position_ids is None and input_ids is not None:
            if attention_mask is None or attention_mask.ndim == 2:
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids, image_grid_thw, video_grid_thw, attention_mask
                )
                self.rope_deltas = rope_deltas

        # Prefill pass that produces the cache for this segment.
        outputs = self.model.forward(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=True,
            output_attentions=False,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            custom_kv=custom_kv,
            custom_kv_pos_offset=custom_kv_pos_offset,
        )
        custom_kv_pos_offset += (position_ids.max()+1)

        segment_kv = outputs.past_key_values
        if custom_kv is None:
            custom_kv = segment_kv
        else:
            custom_kv.merge_other_kv(segment_kv)
        torch.cuda.empty_cache()
        return custom_kv, custom_kv_pos_offset
    
    
    @torch.no_grad()
    def js_loss_upper_bound(
        self,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        icl_input_ids: torch.LongTensor = None,
        icl_inputs_embeds: Optional[torch.FloatTensor] = None,
        icl_attention_mask: Optional[torch.Tensor] = None,
        icl_position_ids: Optional[torch.LongTensor] = None,
        icl_labels: Optional[torch.LongTensor] = None,
        icl_pixel_values: Optional[torch.Tensor] = None,
        icl_pixel_values_videos: Optional[torch.FloatTensor] = None,
        icl_image_grid_thw: Optional[torch.LongTensor] = None,
        icl_video_grid_thw: Optional[torch.LongTensor] = None,
        custom_kv: Optional[List[torch.FloatTensor]] = None,
        custom_kv_pos_offset = 0,
        full_custom_kv: Optional[List[torch.FloatTensor]] = None,
        full_custom_kv_pos_offset = 0,
        img_chunk_num = 10,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if icl_inputs_embeds is None:
            # Ensure icl_input_ids is long type for embedding lookup
            if icl_input_ids is not None and icl_input_ids.dtype != torch.long:
                icl_input_ids = icl_input_ids.long()
            icl_inputs_embeds = self.model.embed_tokens(icl_input_ids)
            if icl_pixel_values is not None:
                icl_pixel_values = icl_pixel_values.type(self.visual.get_dtype())
                pixel_st = 0
                img_chunk_num = 8
                image_embeds = []
                chunk_iters = len(icl_image_grid_thw)//img_chunk_num + int(len(icl_image_grid_thw)%img_chunk_num>0)
                for chunk_iter in range(chunk_iters):
                    pixel_length = icl_image_grid_thw[chunk_iter*img_chunk_num:(chunk_iter+1)*img_chunk_num].prod(dim=-1).sum()
                    sub_img_embeds = self.visual(icl_pixel_values[pixel_st:pixel_st+pixel_length], 
                                                 grid_thw=icl_image_grid_thw[chunk_iter*img_chunk_num:(chunk_iter+1)*img_chunk_num]
                                                 )
                    pixel_st += pixel_length
                    image_embeds.append(sub_img_embeds)
                icl_image_embeds = torch.cat(image_embeds, dim=0)
                
                # icl_image_embeds = self.visual(icl_pixel_values, grid_thw=icl_image_grid_thw)
                n_image_tokens = (icl_input_ids == self.config.image_token_id).sum().item()
                n_image_features = icl_image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                icl_image_mask = (
                    (icl_input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(icl_inputs_embeds)
                    .to(icl_inputs_embeds.device)
                )
                icl_image_embeds = icl_image_embeds.to(icl_inputs_embeds.device, icl_inputs_embeds.dtype)
                icl_inputs_embeds = icl_inputs_embeds.masked_scatter(icl_image_mask, icl_image_embeds)
            if icl_attention_mask is not None:
                icl_attention_mask = icl_attention_mask.to(icl_inputs_embeds.device)
        if icl_position_ids is None and icl_input_ids is not None and (icl_attention_mask is None or icl_attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            icl_position_ids, icl_rope_deltas = self.get_rope_index(
                icl_input_ids, icl_image_grid_thw, icl_video_grid_thw, icl_attention_mask
            )
        chunk_number = 10
        js_loss_final = 0
        chunk_iters = icl_input_ids.shape[0]//chunk_number + int(icl_input_ids.shape[0]%chunk_number>0)
        for chunk_iter in range(chunk_iters):
            st_ = chunk_iter*chunk_number
            end_ = (chunk_iter+1)*chunk_number
            icl_output = self.model(
                input_ids=None,
                position_ids=icl_position_ids[:, st_:end_],
                attention_mask=icl_attention_mask[st_:end_],
                past_key_values=None,
                inputs_embeds=icl_inputs_embeds[st_:end_],
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False,
                return_dict=return_dict,
                cache_position=cache_position,
                custom_kv=custom_kv,
                custom_kv_pos_offset=custom_kv_pos_offset,
            )
            icl_hidden_states = icl_output[0][icl_labels[st_:end_]>-100]
            select_ori_logits = self.lm_head(icl_hidden_states).view(-1, self.config.vocab_size).to(torch.bfloat16)
            # icl_tmp_hidden_states= icl_output.hidden_states
            icl_output_full = self.model(
                input_ids=None,
                position_ids=icl_position_ids[:, st_:end_],
                attention_mask=icl_attention_mask[st_:end_],
                past_key_values=None,
                inputs_embeds=icl_inputs_embeds[st_:end_],
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False,
                return_dict=return_dict,
                cache_position=cache_position,
                custom_kv=full_custom_kv,
                custom_kv_pos_offset=full_custom_kv_pos_offset,
            )
            full_ori_logits = self.lm_head(icl_output_full[0][icl_labels[st_:end_]>-100]).view(-1, self.config.vocab_size).to(torch.bfloat16)
            stu_logits = F.softmax(select_ori_logits, dim=-1, dtype=torch.bfloat16)
            tea_logits = F.softmax(full_ori_logits, dim=-1, dtype=torch.bfloat16)
            M = (0.5*(stu_logits+tea_logits)).log()
            js_loss = 0.5*(F.kl_div(M, torch.log(stu_logits), log_target=True, reduction="sum") + 
                            F.kl_div(M, torch.log(tea_logits), log_target=True, reduction="sum"))
            # del current_output, stu_logits, tea_logits, M, select_new_logits
            js_loss_final+=js_loss
        js_loss_final = js_loss_final/(icl_labels>-100).sum()
        print("js_loss between chunk and full:", js_loss_final)
        return js_loss_final
    
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        pixel_values=None,
        pixel_values_videos=None,
        image_grid_thw=None,
        video_grid_thw=None,
        custom_kv=None,
        custom_kv_pos_offset=None,
        **kwargs,
    ):
        """Assemble the keyword arguments handed to the model for one generation step.

        Overrides the generic implementation because image/video pixel inputs must not be
        forwarded on every step.

        Behaviour, as implemented below:

        * When `past_key_values` is given, `input_ids` is cut down to the entries selected by
          `cache_position` (or to the trailing `cache_position.shape[0]` tokens when
          `inputs_embeds` is supplied). If `input_ids` already has that length it is left alone.
        * `pixel_values` and `pixel_values_videos` are replaced by `None` whenever
          `cache_position[0]` is non-zero.
        * `inputs_embeds` is only used when `cache_position[0] == 0`; in every other case
          `input_ids` is used and `inputs_embeds` is set to `None`.
        * If `past_key_values` is a `StaticCache` and `attention_mask` is 2-D, the mask is
          replaced by the result of
          `self.model._prepare_4d_causal_attention_mask_with_cache_position`.
        * `position_ids`, `use_cache`, the grid tensors, `custom_kv` and `custom_kv_pos_offset`
          are passed through untouched. Extra `**kwargs` are accepted and ignored.

        Returns:
            `dict`: the model inputs for this step.
        """
        # Keep only the not-yet-processed tokens when a cache is in play.
        if past_key_values is not None:
            if inputs_embeds is not None:
                # With embeddings supplied, `input_ids` may be missing entries: take the tail.
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:
                input_ids = input_ids[:, cache_position]
            # Remaining case: the generation method already sliced `input_ids` -- nothing to do.

        is_first_step = cache_position[0] == 0

        # Pixel inputs are only forwarded while `cache_position` starts at zero.
        if not is_first_step:
            pixel_values = pixel_values_videos = None

        # Caller-provided embeddings are honoured on the first step only.
        use_embeds = inputs_embeds is not None and bool(is_first_step)
        model_inputs = (
            {"inputs_embeds": inputs_embeds, "input_ids": None}
            if use_embeds
            else {"input_ids": input_ids, "inputs_embeds": None}
        )

        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
            # Batch/sequence sizes and device come from whichever input is actually being fed.
            if use_embeds:
                batch_size, sequence_length, _ = inputs_embeds.shape
                device = inputs_embeds.device
            else:
                batch_size, sequence_length = input_ids.shape
                device = input_ids.device

            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
                attention_mask,
                sequence_length=sequence_length,
                target_length=past_key_values.get_max_cache_shape(),
                dtype=self.lm_head.weight.dtype,
                device=device,
                cache_position=cache_position,
                batch_size=batch_size,
                config=self.config,
                past_key_values=past_key_values,
            )

        return {
            **model_inputs,
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": use_cache,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "pixel_values_videos": pixel_values_videos,
            "image_grid_thw": image_grid_thw,
            "video_grid_thw": video_grid_thw,
            "cache_position": cache_position,
            "custom_kv": custom_kv,
            "custom_kv_pos_offset": custom_kv_pos_offset,
        }
