#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/internvl/modular_internvl.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_internvl.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cv2
import numpy as np
import collections.abc
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.nn as nn
from modules.movement_cross_attn import MovementCrossAttn , global_correlation_softmax
import torch.nn.functional as F

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.generation import GenerationMixin
from transformers.integrations import use_kernel_forward_from_hub
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_int
from transformers.utils.generic import check_model_inputs
from transformers.models.auto import AutoModel
from internvl.configuration_internvl import InternVLConfig, InternVLVisionConfig


@use_kernel_forward_from_hub("RMSNorm")
class InternVLVisionRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Root-mean-square normalization over the last dimension with a learnable per-channel scale
        (no mean subtraction, no bias). InternVLVisionRMSNorm is equivalent to T5LayerNorm.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        # The statistics are computed in float32; the result is cast back to the input dtype before scaling.
        original_dtype = hidden_states.dtype
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        weight_shape = tuple(self.weight.shape)
        return f"{weight_shape}, eps={self.variance_epsilon}"


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """
    Plain (non-fused) scaled dot-product attention.

    Args:
        module: Module on whose behalf attention runs; only `module.training` is read (to gate dropout).
        query, key, value: Tensors whose last two dimensions are `(sequence, head_dim)`; `key` is transposed
            on dimensions 2 and 3, so 4-D inputs are expected.
        attention_mask: Optional additive mask. It is cropped along its last dimension to the number of keys
            and added to the scaled scores.
        scaling: Factor applied to the raw `query @ key^T` scores.
        dropout: Dropout probability applied to the attention weights.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        `(attn_output, attn_weights)`, where `attn_output` has dimensions 1 and 2 swapped relative to
        `attn_weights @ value` and is made contiguous.
    """
    scores = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        num_keys = key.shape[-2]
        scores = scores + attention_mask[:, :, :, :num_keys]

    # No upcasting of the attention weights to float32 in this implementation
    probabilities = nn.functional.softmax(scores, dim=-1)
    attn_weights = nn.functional.dropout(probabilities, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value).transpose(1, 2).contiguous()

    return attn_output, attn_weights


class InternVLVisionAttention(nn.Module):
    """Attention Class for InternVL Vision Encoder"""

    def __init__(self, config: InternVLVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim, leftover = divmod(self.embed_dim, self.num_heads)
        if leftover != 0:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout

        # Needed for flash attention
        self.is_causal = False

        inner_dim = self.num_heads * self.head_dim
        use_bias = config.attention_bias
        self.q_proj = nn.Linear(self.embed_dim, inner_dim, bias=use_bias)
        self.k_proj = nn.Linear(self.embed_dim, inner_dim, bias=use_bias)
        self.v_proj = nn.Linear(self.embed_dim, inner_dim, bias=use_bias)
        self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)

        # Output dropout is only instantiated when its probability is non-zero.
        if config.projection_dropout > 0:
            self.projection_dropout = nn.Dropout(config.projection_dropout)
        else:
            self.projection_dropout = nn.Identity()

        # Optional RMS normalization of queries and keys, applied over the full embedding width
        # (i.e. before the tensors are split into heads).
        if config.use_qk_norm:
            self.q_norm = InternVLVisionRMSNorm(self.embed_dim)
            self.k_norm = InternVLVisionRMSNorm(self.embed_dim)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        """
        Self-attention over `hidden_states` of shape `(batch_size, seq_len, embed_dim)`.

        Returns `(output, attn_weights)`: `output` has the input shape; `attn_weights` is whatever the
        selected attention backend returns as its second value.
        """
        batch_size, seq_len, _ = hidden_states.size()
        per_head_shape = (batch_size, seq_len, self.num_heads, self.head_dim)

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq, embed) -> (batch, heads, seq, head_dim)
        query_states = self.q_norm(query_states).reshape(*per_head_shape).transpose(1, 2)
        key_states = self.k_norm(key_states).reshape(*per_head_shape).transpose(1, 2)
        value_states = value_states.view(*per_head_shape).transpose(1, 2)

        # Use the local eager implementation unless the config selects another registered backend.
        backend_name = self.config._attn_implementation
        attention_interface: Callable = (
            eager_attention_forward if backend_name == "eager" else ALL_ATTENTION_FUNCTIONS[backend_name]
        )

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scale,
            is_causal=False,
            **kwargs,
        )
        merged = attn_output.reshape(batch_size, seq_len, self.embed_dim)

        output = self.projection_dropout(self.projection_layer(merged))

        return output, attn_weights


# Output container for the vision tower. The class declares no fields of its own: its body only carries
# the `pooler_output` description below on top of what `BaseModelOutputWithPooling` provides.
# NOTE(review): `InternVLVisionModel.forward` in this file builds this object without passing
# `pooler_output`, so that field keeps its default there.
@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`InternVLVisionModel`].
    """
)
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """


class InternVLVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.

    `config.image_size` and `config.patch_size` may each be a `(height, width)` pair or a single int, which is
    interpreted as a square size.
    """

    def __init__(self, config):
        super().__init__()
        image_size = self._as_pair(config.image_size)
        patch_size = self._as_pair(config.patch_size)
        num_channels, hidden_size = config.num_channels, config.hidden_size

        # Number of whole patches that fit along (height, width) of the configured image size.
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        num_patches = patch_shape[0] * patch_shape[1]
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        # Non-overlapping patches: the convolution stride equals its kernel size.
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    @staticmethod
    def _as_pair(size):
        """Return `size` unchanged when it is already iterable, otherwise expand the scalar to `(size, size)`."""
        if isinstance(size, collections.abc.Iterable):
            return size
        return (size, size)

    def forward(self, pixel_values: torch.Tensor) -> tuple[torch.Tensor, tuple[int, int]]:
        """
        Project an image batch to a sequence of patch embeddings.

        Args:
            pixel_values: Tensor of shape `(batch_size, num_channels, height, width)`.

        Returns:
            `(embeddings, (patch_height, patch_width))` where `embeddings` has shape
            `(batch_size, patch_height * patch_width, hidden_size)` and the pair is the patch grid actually
            produced for this input (it can differ from `self.patch_shape` for other input resolutions).

        Raises:
            ValueError: If the channel dimension does not match `config.num_channels`, or if `pixel_values`
                is not 4-dimensional (raised by the shape unpacking).
        """
        # Unpacking four values also enforces that the input is 4-D.
        _, num_channels, _, _ = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        # (batch, hidden, grid_h, grid_w) -> (batch, grid_h * grid_w, hidden)
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


# Based on timm implementation, which can be found here:
# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
class InternVLVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    The CLS token is prepended to the patch sequence, so the returned sequence has one more token than there
    are patches. `mask_token` is `None` unless `config.use_mask_token` is set, and `position_embeddings` is
    `None` unless `config.use_absolute_position_embeddings` is set.
    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        # A scalar image size is expanded to a square (size, size) pair.
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            # One learned position per patch plus one for the CLS token.
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211

        Args:
            embeddings: Token embeddings including the CLS token, shape `(batch_size, num_patches + 1, dim)`.
            height: Input image height in pixels.
            width: Input image width in pixels.

        Returns:
            Position embeddings of shape `(1, 1 + new_height * new_width, dim)` where the new grid is
            `height // patch_size[0]` by `width // patch_size[1]`, or the stored embeddings unchanged when no
            interpolation is required.
        """

        # Both counts exclude the CLS position.
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        # The CLS position is kept as-is; only the patch positions are resampled.
        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        # The stored patch positions are reshaped to a square grid, so their count must be a perfect square.
        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        # Channels-first layout for `interpolate`.
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        # Back to (1, new_height * new_width, dim).
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> tuple[torch.Tensor, tuple[int, int]]:
        """
        Embed an image batch into a token sequence.

        Args:
            pixel_values: Tensor of shape `(batch_size, num_channels, height, width)`.
            bool_masked_pos: Optional per-patch mask; where it is set, the patch embedding is replaced by the
                learned mask token. Requires the module to have been built with `config.use_mask_token`,
                because `self.mask_token` is `None` otherwise.

        Returns:
            `(embeddings, (patch_height, patch_width))`, where `embeddings` has the CLS token at index 0 of
            the sequence dimension.
        """
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # Absolute position embeddings are optional and are resized to the input resolution when needed.
        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class InternVLVisionMLP(nn.Module):
    """Feed-forward block: `hidden_size -> intermediate_size -> hidden_size` with `config.hidden_act` in between."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        model_dim, inner_dim = config.hidden_size, config.intermediate_size
        self.fc1 = nn.Linear(model_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, model_dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        expanded = self.activation_fn(self.fc1(hidden_states))
        return self.fc2(expanded)


# Maps `config.norm_type` to the normalization class instantiated by `InternVLVisionLayer`.
NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}


class InternVLVisionLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation.

    Pre-norm transformer block: `x + lambda_1 * attn(norm(x))` followed by
    `x + lambda_2 * dropout(mlp(norm(x)))`, where `lambda_1` / `lambda_2` are learnable per-channel scales
    initialised to `config.layer_scale_init_value`.
    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InternVLVisionAttention(config)
        self.mlp = InternVLVisionMLP(config)
        # InternVL uses different layernorm implementations for different models
        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """
        Apply the block to `hidden_states` and return a tensor of the same shape.

        The attention weights produced by the attention sub-module are discarded here; a single tensor
        (not a tuple) is returned.
        """
        attention_output, _ = self.attention(
            self.layernorm_before(hidden_states),  # in InternVLVision, layernorm is applied before self-attention
        )

        attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in InternVLVision, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.mlp(layer_output)
        layer_output = self.dropout(layer_output)

        # `lambda_2` is always created in `__init__`; the guard only matters if it is later set to None.
        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = layer_output + hidden_states

        return layer_output


class InternVLVisionEncoder(nn.Module):
    """
    Stack of `InternVLVisionLayer` blocks with `MovementCrossAttn` (MCA) blocks interleaved after selected
    layers.

    The input batch is interpreted as consecutive groups of `num_frames` frames. After each layer listed in
    `MCA_positions`, every group is split into (frame t, frame t+1) pairs which are passed to the matching
    MCA block; its output is added back to the layer output as a residual.
    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([InternVLVisionLayer(config) for _ in range(config.num_hidden_layers)])

        # 0-based indices of the transformer layers after which an MCA block runs. Positions at or beyond
        # `config.num_hidden_layers` are simply never reached.
        self.MCA_positions = [6, 12, 18, 22]
        self.num_MCA_layers = len(self.MCA_positions)
        self.MCA_layers = nn.ModuleList(
            [MovementCrossAttn(feature_channels=config.hidden_size) for _ in range(self.num_MCA_layers)]
        )
        # Number of frames per clip assumed when regrouping the flattened batch dimension.
        self.num_frames = 30

        self.gradient_checkpointing = False

    @check_model_inputs
    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> Union[tuple, BaseModelOutput]:
        """
        Run all layers (and interleaved MCA blocks) over `hidden_states`.

        Args:
            hidden_states: Tensor of shape `(num_clips * num_frames, seq_len, hidden_size)`.

        Returns:
            `BaseModelOutput` whose `last_hidden_state` is the final hidden states.

        Raises:
            ValueError: If an MCA block is reached and the batch dimension is not a multiple of
                `self.num_frames`.
        """
        mca_idx = 0

        for layer_idx, layer_module in enumerate(self.layer):
            hidden_states = layer_module(hidden_states)

            # Skip unless this layer is the next scheduled MCA position.
            if mca_idx >= len(self.MCA_positions) or layer_idx != self.MCA_positions[mca_idx]:
                continue

            residual = hidden_states

            num_frames = self.num_frames
            total_frames, seq_len, dim = hidden_states.shape
            if total_frames % num_frames != 0:
                raise ValueError(
                    f"InternVLVisionEncoder expects the batch dimension to be a multiple of "
                    f"num_frames={num_frames}, got {total_frames}."
                )
            num_clips = total_frames // num_frames
            clips = hidden_states.view(num_clips, num_frames, seq_len, dim)

            # Pair every frame with its successor inside the same clip: (num_clips * (num_frames - 1), seq, dim).
            num_pairs = num_clips * (num_frames - 1)
            f_t = clips[:, :-1].reshape(num_pairs, seq_len, dim)
            f_next = clips[:, 1:].reshape(num_pairs, seq_len, dim)

            # MCA blocks are called directly (they are not wrapped in gradient checkpointing).
            # NOTE(review): the MCA output is assumed to be addable to `residual`
            # (shape `(num_clips * num_frames, seq_len, dim)`) — confirm against MovementCrossAttn.
            hidden_states = self.MCA_layers[mca_idx](f_t, f_next, num_clips, num_frames, seq_len, dim)

            hidden_states = hidden_states + residual
            mca_idx += 1

        return BaseModelOutput(
            last_hidden_state=hidden_states,
        )


@auto_docstring
class InternVLVisionPreTrainedModel(PreTrainedModel):
    config: InternVLVisionConfig
    base_model_prefix = "internvl_vision"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["InternVLVisionLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    # Module classes whose outputs are collected as `hidden_states` / `attentions`.
    _can_record_outputs = {
        "hidden_states": InternVLVisionLayer,
        "attentions": InternVLVisionAttention,
    }

    def _init_weights(self, module):
        """Run the parent initialisation, then reset the vision-specific parameters.

        Embedding tokens/positions are zeroed and the layer-scale vectors are filled with
        `config.layer_scale_init_value`.
        """
        super()._init_weights(module)

        if isinstance(module, InternVLVisionEmbeddings):
            module.cls_token.data.zero_()
            # These two parameters only exist when enabled in the config.
            for optional_param in (module.mask_token, module.position_embeddings):
                if optional_param is not None:
                    optional_param.data.zero_()
            return

        if isinstance(module, InternVLVisionLayer):
            scale_init = self.config.layer_scale_init_value
            module.lambda_1.data.fill_(scale_init)
            module.lambda_2.data.fill_(scale_init)


@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
    def __init__(self, config: InternVLVisionConfig, use_emph: bool = False) -> None:
        super().__init__(config)
        self.config = config

        self.embeddings = InternVLVisionEmbeddings(config)
        self.encoder = InternVLVisionEncoder(config)

        # Selects the emphasis variant of the movement-vector loss (and its larger coefficient) in `forward`.
        self.use_emph = use_emph
        # Number of frames per clip assumed when pairing consecutive frames for flow estimation.
        self.num_frames = 30

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def masked_optical_flow_loss(
        self, pred, target, pred_visibility, inner_mask, use_mask=False, use_emph=False, magnitude_threshold=384
    ):
        """
        L1 loss between predicted and target flow vectors, restricted to a validity mask.

        Args:
            pred: Predicted flow; the last dimension holds the vector components
                (`forward` passes shape `(num_pairs, num_patches, 2)`).
            target: Target flow with the same shape as `pred`.
            pred_visibility: Boolean mask over `pred.shape[:-1]`. Used as the validity mask when
                `use_mask=True`; ignored otherwise.
            inner_mask: Optional boolean mask over `pred.shape[:-1]`, only used when `use_emph=True`.
            use_mask: If True, the validity mask is `pred_visibility`. If False, a vector is valid when both
                its predicted and its target magnitude are `<= magnitude_threshold`.
            use_emph: If True (and `inner_mask` is given), only valid vectors inside `inner_mask` contribute
                to the sum, while the mean is still taken over all valid vectors.
            magnitude_threshold: Magnitude limit used when `use_mask=False`.

        Returns:
            Scalar loss tensor. When no vector is valid, a zero that stays attached to `pred`'s autograd
            graph is returned.
        """
        if use_mask:
            valid = pred_visibility
        else:
            pred_mag = torch.norm(pred, dim=-1)
            target_mag = torch.norm(target, dim=-1)
            valid = (pred_mag <= magnitude_threshold) & (target_mag <= magnitude_threshold)

        if not valid.any():
            # Keep the zero connected to `pred` so that `backward()` still reaches the model parameters
            # (a freshly created leaf tensor would produce no gradients for them).
            return pred.sum() * 0.0

        masked_pred = pred[valid]  # [N, components]
        masked_target = target[valid]  # [N, components]

        if use_emph and inner_mask is not None:
            # Weight 1.0 for valid vectors inside `inner_mask`, 0.0 for all other valid vectors.
            # NOTE(review): an earlier comment described "2x for inner_mask, 1x for others"; the weights
            # actually applied are 1 / 0 and are kept as-is. Confirm which weighting is intended.
            weights = (inner_mask & valid)[valid].to(device=pred.device, dtype=torch.float32)  # [N]

            # Per-vector L1 error, averaged over the vector components.
            per_vector_loss = torch.abs(masked_pred - masked_target).mean(dim=-1)  # [N]
            return (per_vector_loss * weights).mean()

        # Standard L1 loss without emphasis
        return F.l1_loss(masked_pred, masked_target, reduction="mean")

    def _estimate_patch_flow(self, sequence_output: torch.Tensor) -> torch.Tensor:
        """
        Estimate per-patch flow between consecutive frames with `global_correlation_softmax`.

        The CLS token is dropped, the batch is regrouped into clips of `self.num_frames` frames, and every
        frame is paired with its successor inside the same clip.

        Returns:
            Tensor of shape `(num_clips * (num_frames - 1), num_patches, 2)`.

        Raises:
            ValueError: If the batch dimension is not a multiple of `self.num_frames`.
        """
        patch_tokens = sequence_output[:, 1:, :]
        total_frames, num_patches, dim = patch_tokens.shape

        num_frames = self.num_frames
        if total_frames % num_frames != 0:
            raise ValueError(
                f"InternVLVisionModel expects the batch dimension to be a multiple of "
                f"num_frames={num_frames} to compute the movement-vector loss, got {total_frames}."
            )
        num_clips = total_frames // num_frames
        num_pairs = num_clips * (num_frames - 1)

        clips = patch_tokens.reshape(num_clips, num_frames, num_patches, dim)
        f_t = clips[:, :-1].reshape(num_pairs, num_patches, dim)
        f_next = clips[:, 1:].reshape(num_pairs, num_patches, dim)

        flow, _ = global_correlation_softmax(f_t, f_next)
        # Move the 2 flow components last and flatten the spatial grid back to a patch sequence.
        return flow.permute(0, 2, 3, 1).reshape(num_pairs, num_patches, 2)

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        movement_vectors: Optional[torch.FloatTensor] = None,
        pred_visibility: Optional[torch.BoolTensor] = None,
        inner_mask: Optional[torch.BoolTensor] = None,
    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        movement_vectors (`torch.FloatTensor` of shape `(num_clips * (num_frames - 1), num_patches, 2)`, *optional*):
            Target movement vectors between consecutive frames. When provided, the auxiliary movement-vector
            loss is computed and returned.
        pred_visibility (`torch.BoolTensor` of shape `(num_clips * (num_frames - 1), num_patches)`, *optional*):
            Mask of the vectors that contribute to the loss. When omitted, vectors are selected by magnitude
            instead.
        inner_mask (`torch.BoolTensor` of shape `(num_clips * (num_frames - 1), num_patches)`, *optional*):
            Region used by the emphasis variant of the loss (only read when the model was built with
            `use_emph=True`).
        """
        # Always returns a 2-tuple `(model_output, movement_vectors_loss)`; the second element is an empty
        # list when `movement_vectors` is not given and a one-element list holding the scaled loss otherwise.
        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(embedding_output)

        sequence_output = self.layernorm(encoder_outputs[0])

        movement_vectors_loss = []
        if movement_vectors is not None:
            flow = self._estimate_patch_flow(sequence_output)

            coeff = 2.0 if self.use_emph else 0.1
            loss_rec = coeff * self.masked_optical_flow_loss(
                flow,
                movement_vectors,
                pred_visibility,
                inner_mask=inner_mask,
                # Without a visibility mask, fall back to magnitude-based masking.
                use_mask=pred_visibility is not None,
                use_emph=self.use_emph,
            )
            movement_vectors_loss = [loss_rec]

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        ), movement_vectors_loss


# Common base class for the composite (vision + text) InternVL models in this file. It only declares the
# config type and class-level flags; see the transformers `PreTrainedModel` documentation for how each
# flag is interpreted.
@auto_docstring
class InternVLPreTrainedModel(PreTrainedModel):
    config: InternVLConfig
    # Empty prefix: no base-model attribute name is declared for this class.
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

    _supports_flash_attn = True
    _supports_sdpa = True

    _can_compile_fullgraph = True
    _supports_flex_attn = True
    _supports_attention_backend = True


class InternVLMultiModalProjector(nn.Module):
    """LayerNorm followed by a two-layer MLP that maps vision features to the text hidden size.

    The expected input width is `vision_config.hidden_size * int(1 / downsample_ratio) ** 2`.
    """

    def __init__(self, config: InternVLConfig):
        super().__init__()
        vision_width = config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2
        text_width = config.text_config.hidden_size

        self.layer_norm = nn.LayerNorm(vision_width)
        self.linear_1 = nn.Linear(vision_width, text_width)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(text_width, text_width)

    def forward(self, image_features):
        projected = self.linear_1(self.layer_norm(image_features))
        return self.linear_2(self.act(projected))


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for InternVL outputs, with hidden states and attentions.
    """
)
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None
    # This field is not described in the docstring above.
    # NOTE(review): presumably carries the auxiliary movement-vector loss produced by the vision tower.
    # `InternVLVisionModel.forward` returns that loss wrapped in a list, so confirm that the annotated
    # type matches what is actually stored here.
    movement_vectors_loss: Optional[torch.FloatTensor] = None



@auto_docstring(
    custom_intro="""
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class InternVLModel(InternVLPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: InternVLConfig, use_emph=False):
        r"""
        use_emph (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to `InternVLVisionModel` when the vision tower is built.
        """
        super().__init__(config)
        self.vision_tower = InternVLVisionModel(config.vision_config, use_emph=use_emph)

        self.multi_modal_projector = InternVLMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embedding module of the language model."""
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Replace the input embedding module of the language model."""
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        """Replace the language model used as decoder."""
        self.language_model = decoder

    def get_decoder(self):
        """Return the language model used as decoder."""
        return self.language_model

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        movement_vectors: Optional[torch.FloatTensor] = None,
        pred_visibility: Optional[torch.BoolTensor] = None,
        inner_mask: Optional[torch.BoolTensor] = None,
        **kwargs,
    ):
        """
        Obtains image hidden states from the vision tower and applies the multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`, *optional*):
                Layer index to extract features from. `-1` selects the last hidden state. Defaults to
                `config.vision_feature_layer`.
            vision_feature_select_strategy (`str`, *optional*):
                When equal to `"default"`, the first token of the vision sequence is dropped. Defaults to
                `config.vision_feature_select_strategy`.
            movement_vectors (`torch.FloatTensor`, *optional*):
                Forwarded unchanged to the vision tower.
            pred_visibility (`torch.BoolTensor`, *optional*):
                Forwarded unchanged to the vision tower.
            inner_mask (`torch.BoolTensor`, *optional*):
                Forwarded unchanged to the vision tower.

        Returns:
            `tuple`: `(vision_features, movement_vectors_loss)` where `vision_features` is the projected image
            feature tensor of shape `(num_images, image_length, embed_dim)` and `movement_vectors_loss` is the
            second value returned by the vision tower.

        Raises:
            ValueError: If an intermediate layer is requested but the vision tower returned no hidden states.
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )
        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility

        downsample_ratio = self.config.downsample_ratio

        # The vision tower is called once for every layer choice so that `movement_vectors_loss` is always
        # defined. Previously the non-default branch referenced a non-existent `self.vision_model` and left the
        # loss unbound.
        vision_outputs, movement_vectors_loss = self.vision_tower(
            pixel_values=pixel_values,
            movement_vectors=movement_vectors,
            pred_visibility=pred_visibility,
            inner_mask=inner_mask,
        )
        if vision_feature_layer == -1:
            vision_features = vision_outputs.last_hidden_state
        else:
            # NOTE(review): assumes the tower populates `hidden_states` (e.g. via its config) - confirm.
            all_hidden_states = vision_outputs.hidden_states
            if all_hidden_states is None:
                raise ValueError(
                    "The vision tower did not return hidden states, so `vision_feature_layer` other than -1 "
                    "cannot be used."
                )
            vision_features = all_hidden_states[vision_feature_layer]

        if vision_feature_select_strategy == "default":
            # Drop the first token of the sequence.
            vision_features = vision_features[:, 1:, :]

        # The token sequence is laid out as a square grid of side sqrt(num_tokens).
        batch_size, num_tokens = vision_features.shape[0], vision_features.shape[1]
        feature_size = int(num_tokens**0.5)

        # Reshape tensor to spatial dimensions
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)

        # Apply downsampling using pixel shuffle
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor to prepare for projection
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])

        # Project features through multi-modal projector
        vision_features = self.multi_modal_projector(vision_features)
        return vision_features, movement_vectors_loss

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            # Without token ids, locate placeholders by comparing against the embedding of the image token.
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        cache_position: Optional[torch.LongTensor] = None,
        movement_vectors: Optional[torch.FloatTensor] = None,
        pred_visibility: Optional[torch.BoolTensor] = None,
        inner_mask: Optional[torch.BoolTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InternVLModelOutputWithPast]:
        r"""
        movement_vectors (`torch.FloatTensor`, *optional*):
            Forwarded unchanged to the vision tower through `get_image_features`. Ignored when `pixel_values` is
            `None`.
        pred_visibility (`torch.BoolTensor`, *optional*):
            Forwarded unchanged to the vision tower through `get_image_features`. Ignored when `pixel_values` is
            `None`.
        inner_mask (`torch.BoolTensor`, *optional*):
            Forwarded unchanged to the vision tower through `get_image_features`. Ignored when `pixel_values` is
            `None`.
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        # Both stay `None` for text-only calls.
        image_features = None
        movement_vectors_loss = None

        if pixel_values is not None:
            image_features, movement_vectors_loss = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                movement_vectors=movement_vectors,
                pred_visibility=pred_visibility,
                inner_mask=inner_mask,
            )

            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            # Overwrite the placeholder token embeddings with the projected image features.
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        return InternVLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features,
            movement_vectors_loss=movement_vectors_loss,
        )

    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
        """Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).

        Raises:
            ValueError: If `height` or `width` multiplied by `scale_factor` is not a whole number.
        """
        batch_size, width, height, channels = vision_features.size()

        scaled_height = height * scale_factor
        scaled_width = width * scale_factor
        # `height % scale_factor` is always 0 for a fractional factor such as 0.5, so on its own it never
        # rejected anything and bad shapes surfaced later as an opaque `view` error. Also require the scaled
        # sizes to be whole numbers.
        if (
            height % scale_factor != 0
            or width % scale_factor != 0
            or scaled_height != int(scaled_height)
            or scaled_width != int(scaled_width)
        ):
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        scaled_height = int(scaled_height)
        scaled_width = int(scaled_width)

        # Reshape to allow downsampling
        vision_features = vision_features.view(batch_size, width, scaled_height, int(channels / scale_factor))
        # Permute dimensions to align downsampled axis correctly
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        # Reshape to achieve final downsampled dimensions
        vision_features = vision_features.view(
            batch_size, scaled_height, scaled_width, int(channels / (scale_factor**2))
        )

        # Swap height and width back for proper orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for InternVL causal language model (or autoregressive) outputs.
    """
)
class InternVLCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor`, *optional*):
        Total loss set by `InternVLForConditionalGeneration.forward`: `movement_vectors_loss + cap_loss` when both
        are available, otherwise whichever of the two is not `None`.
    cap_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) computed from `logits` and `labels`.
    movement_vectors_loss (`torch.FloatTensor`, *optional*):
        First element of the `movement_vectors_loss` reported by the underlying `InternVLModel`. Only set when
        `movement_vectors` is passed to the forward call.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    # NOTE(review): the two fields below are declared before `logits`, so positional/tuple access places them
    # ahead of `logits` whenever they are set - confirm callers access fields by name.
    cap_loss: Optional[torch.FloatTensor] = None
    movement_vectors_loss: Optional[torch.FloatTensor] = None

    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


@auto_docstring(
    custom_intro="""
    The INTERNVL model which consists of a vision backbone and a language model.
    """
)
class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: InternVLConfig, use_emph=False):
        r"""
        use_emph (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to `InternVLModel`.
        """
        super().__init__(config)
        self.model = InternVLModel(config, use_emph=use_emph)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embedding module of the wrapped `InternVLModel`."""
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        """Replace the input embedding module of the wrapped `InternVLModel`."""
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        """Return the language modeling head."""
        return self.lm_head

    def set_decoder(self, decoder):
        """Replace the decoder of the wrapped `InternVLModel`."""
        self.model.set_decoder(decoder)

    def get_decoder(self):
        """Return the decoder of the wrapped `InternVLModel`."""
        return self.model.get_decoder()

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        """
        Delegate to `InternVLModel.get_image_features` and return its result unchanged.

        Extra keyword arguments (for example `movement_vectors`, `pred_visibility` and `inner_mask`) are passed
        through as-is. The wrapped method returns a `(vision_features, movement_vectors_loss)` tuple.
        """
        return self.model.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            **kwargs,
        )

    # Make modules available through conditional class for BC
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        movement_vectors: Optional[torch.FloatTensor] = None,
        pred_visibility: Optional[torch.BoolTensor] = None,
        inner_mask: Optional[torch.BoolTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InternVLCausalLMOutputWithPast]:
        r"""
        movement_vectors (`torch.FloatTensor`, *optional*):
            Forwarded unchanged to `InternVLModel`. When provided, the first element of the
            `movement_vectors_loss` reported by the model is added to the returned `loss`.
        pred_visibility (`torch.BoolTensor`, *optional*):
            Forwarded unchanged to `InternVLModel`.
        inner_mask (`torch.BoolTensor`, *optional*):
            Forwarded unchanged to `InternVLModel`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```"""
        if pixel_values is not None and pixel_values.dim() > 4:
            # Merge the two leading dimensions so the vision tower receives a 4D batch. `flatten` also handles
            # non-contiguous inputs, on which `view` would raise.
            # NOTE(review): presumably the leading dims are (batch, images_per_sample) - confirm with the caller.
            pixel_values = pixel_values.flatten(0, 1)

        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            cache_position=cache_position,
            image_sizes=image_sizes,
            movement_vectors=movement_vectors,
            pred_visibility=pred_visibility,
            inner_mask=inner_mask,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        cap_loss = None
        if labels is not None:
            cap_loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        movement_vectors_loss = None
        tower_loss = outputs.movement_vectors_loss
        # Test for presence explicitly: relying on truthiness raises for multi-element tensors and silently
        # discards a loss whose value is exactly zero.
        if movement_vectors is not None and tower_loss is not None and len(tower_loss) > 0:
            movement_vectors_loss = tower_loss[0]

        # Sum the two losses when both exist, otherwise use whichever is available (possibly `None`).
        if cap_loss is None:
            loss = movement_vectors_loss
        elif movement_vectors_loss is None:
            loss = cap_loss
        else:
            loss = movement_vectors_loss + cap_loss

        return InternVLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            cap_loss=cap_loss,
            movement_vectors_loss=movement_vectors_loss,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
            # Otherwise we need pixel values to be passed to model
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


# Names exported by `from <this module> import *`.
__all__ = [
    "InternVLVisionPreTrainedModel",
    "InternVLVisionModel",
    "InternVLPreTrainedModel",
    "InternVLModel",
    "InternVLForConditionalGeneration",
]