from typing import Tuple, Optional, List, Union 
import torch 
from transformers.utils import logging
from models.loss_function import isotropy_loss
from models.FocalInfoNCELoss import FocalInfoNCELoss
from models.DiHTLoss import DiHTLoss
logger = logging.get_logger(__name__)
from models.latent_attention_block import LatentAttentionBlock
from transformers import AutoProcessor, AutoModel, AutoModelForCausalLM, Qwen2VLForConditionalGeneration, PreTrainedTokenizer
from models.qwen2_vl_bidirectional_atten import BiQwen2VLForConditionalGeneration
from torch import nn 
import torch.distributed as dist
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast
import torch.nn.functional as F
from utils import (
    rank0_print, find_all_linear_names, safe_save_model_for_hf_trainer,
    get_peft_state_maybe_zero_3, TrainerWithCustomSampler
)

class Similarity(nn.Module):
    """
    Dot product or cosine similarity
    """
    def __init__(self, temp=0.07):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)
    def forward(self, x, y):
        return self.cos(x, y) / self.temp

# 在保存模型时，只有被初始化并且不是 None 的属性才会被保存
# 定义一个继承自 BiQwen2VLForConditionalGeneration 的类，用于 Qwen2-VL 模型的微调
class Qwen2VLRetFinetuneForConditionalGeneration(BiQwen2VLForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        self.mean_pooling = True                 # 是否使用全局平局池化
        self.use_bi_atten = True                 # 是否使用双向注意
        self.use_latent_atten = False            # 是否使用潜在注意力模块
        self.use_instruction_mask = False        # 是否使用指令 mask
        self.use_bi_loss= False                  # 是否使用双向损失, 默认不使用
        self.use_isotropy_loss = False           # 是否使用同构损失, 默认不使用
        self.use_self_attent_pooling = False     # 是否使用自注意力池化，默认不使用，自注意力池化和全局平均池化互斥

        # 自注意力池化和全局平均池化互斥
        assert sum([self.use_self_attent_pooling,self.mean_pooling]) <= 1, \
            "自注意力池化和全局平均池化互斥, 不能同时选择。"
        
        if self.use_isotropy_loss:
            self.lambda_iso = 5e2                # 同构损失的权重, lambda_iso: float = 5e1 默认
            rank0_print("同构损失的权重: ", self.lambda_iso)
        
        self.use_cross_entropy_loss = True  # 是否使用交叉熵损失
        self.use_focal_infonce_loss = False # 是否使用焦点损失
        self.use_diht_loss = False          # 是否使用 DIHT 损失
        
        # 确保只有一个损失函数被启用, 如果启用多个损失函数，则抛出异常，默认使用交叉熵损失
        # 如果使用非交叉熵损失，则需要在训练时指定 use_cross_entropy_loss=False
        assert sum([self.use_cross_entropy_loss, self.use_focal_infonce_loss, self.use_diht_loss]) == 1, \
            "Only one loss function can be set to True."
        self.loss_fct = nn.CrossEntropyLoss() # 默认使用交叉熵损失

        # 定义双向损失的权重
        if self.use_bi_loss:
            self.querytocand = 0.5
            self.candtoquery = 0.5
            rank0_print("双向损失的权重: ", self.querytocand, self.candtoquery)
    
    # 验证有且仅有一个损失函数被启用
    def _initialize_loss_functions(self):
        # 定义参与校验的损失函数列表
        LOSS_FUNCTIONS = [
            self.use_cross_entropy_loss,
            self.use_focal_infonce_loss,
            self.use_diht_loss
        ]
        # 确保只有一个损失函数被启用
        assert sum(LOSS_FUNCTIONS) == 1, "Only one loss function can be set to True."
        if self.use_cross_entropy_loss:
            self.loss_fct = nn.CrossEntropyLoss()
        elif self.use_focal_infonce_loss:
            self.loss_fct = FocalInfoNCELoss()
        elif self.use_diht_loss:
            self.loss_fct = DiHTLoss()
        else:
            raise ValueError("No loss function is set.")
    
    def _initialize_latent_attention(self):
        """
        初始化潜在注意力模块
        """
        assert hasattr(self.config, "hidden_size"), "hidden_size 属性不存在于配置中"
        self.latent_dim_scale = 1  # 如果使用潜在注意力模块，请指定 latent_dim_scale = latent_dim/hidden_dim，默认使用 1
        hidden_dim = self.config.hidden_size
        latent_dim = int(hidden_dim * self.latent_dim_scale)
        
        self.latent_attention = LatentAttentionBlock(latent_dim=latent_dim, hidden_dim=hidden_dim)
        rank0_print("LatentAttentionBlock 初始化完成")
        rank0_print(f"潜在注意力模块尺寸: {self.latent_attention.latent_array.size()}")

    
    # 确认一下这个函数到底在哪里使用了 ？？？
    def get_features(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        instruction_mask: Optional[torch.Tensor] = None, # 指令 mask
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.get_dtype())
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
                image_mask = input_ids == self.config.image_token_id
                if self.training:
                    inputs_embeds = inputs_embeds.clone()
                inputs_embeds[image_mask] = image_embeds
            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
                video_mask = input_ids == self.config.video_token_id
                inputs_embeds[video_mask] = video_embeds
            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        if self.use_latent_atten: # 如果使用潜在注意力模块
            hidden_states = self.latent_attention(hidden_states)
        
        # 如果使用指令 mask ---------------------------------------------------------
        if self.use_instruction_mask:
            if labels.shape != instruction_mask.shape:
                raise ValueError("labels 和 instruction_mask 的维度不匹配。")
            else:
                # 将 instruction_mask 不为 0 的地方对应的 labels 位置设置成 -100
                labels[instruction_mask != 0] = -100
        # -------------------------------------------------------------------------

        # 平均池化
        if self.mean_pooling:
            embed_features = self._global_mean_pool(hidden_states, labels)
        else:
            embed_index = self.config.emb_token_id # 用于提取特征的 token, 原来是 self.config.emb_token_ids[0]
            embed_indices = torch.argmax((labels == embed_index).int(), dim=1)
            embed_features = hidden_states[torch.arange(len(embed_indices)), embed_indices - 1] # (batch_size, embed_dim)
        
        return embed_features 
    
# """
# 这是 Qwen2VL7BDataCollator 返回的数据结构
# return dict(
#     input_ids=input_ids,
#     attention_mask=attention_mask,
#     pixel_values=pixel_values,
#     image_grid_thw=image_grid_thw,
#     labels=labels,
#     has_hard_negative=has_hard_negative)
# """
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        instruction_mask: Optional[torch.Tensor] = None, # 指令 mask
        inference=False,
        has_hard_negative=False, # 是否使用 hard negative
        has_modality_hard_negative=False, # 是否使用 modality hard negative
        qids=None,
        dids=None,
        ids=None,
    ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Returns:
        Example:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
        >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # set mini_batch to 32
        mini_batch_size = 32 
        input_ids_list = torch.split(input_ids, mini_batch_size)
        attention_mask_list = torch.split(attention_mask, mini_batch_size)
        if image_grid_thw is not None:
            cumsum_pixel_values = torch.cumsum(image_grid_thw[:, 1] * image_grid_thw[:, 2], dim=-1) 
            zero_tensor = torch.tensor([0], device=cumsum_pixel_values.device) # be convinient for extracting batch_pixel_values
            cumsum_pixel_values = torch.cat((zero_tensor, cumsum_pixel_values))
            image_nums = 0
        
        all_hidden_states = []

        for i in range(len(input_ids_list)):
            if inputs_embeds is None:
                batch_inputs_embeds = self.model.embed_tokens(input_ids_list[i])
                if pixel_values is not None:
                    image_mask = input_ids_list[i] == self.config.image_token_id
                    current_image_num = torch.sum(torch.any(image_mask, dim=-1)).cpu().item()
                    if current_image_num != 0:
                        batch_pixel_values = pixel_values[cumsum_pixel_values[image_nums] : cumsum_pixel_values[image_nums + current_image_num]]
                        batch_pixel_values = batch_pixel_values.type(self.visual.get_dtype())
                        batch_image_embeds = self.visual(batch_pixel_values, grid_thw=image_grid_thw[image_nums:image_nums + current_image_num]).to(batch_inputs_embeds.device)
                        image_nums = image_nums + current_image_num
                        if self.training:
                            batch_inputs_embeds = batch_inputs_embeds.clone()
                        batch_inputs_embeds[image_mask] = batch_image_embeds
                if pixel_values_videos is not None:
                    pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
                    video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
                    video_mask = input_ids == self.config.video_token_id
                    inputs_embeds[video_mask] = video_embeds
                if attention_mask is not None:
                    batch_attention_mask = attention_mask_list[i].to(batch_inputs_embeds.device)        
            outputs = self.model(
                input_ids=None,
                position_ids=position_ids,
                attention_mask=batch_attention_mask,
                past_key_values=past_key_values,
                inputs_embeds=batch_inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            hidden_states = outputs[0]
            all_hidden_states.append(hidden_states)
                
        # 将所有的 hidden_states 拼接在一起-----------------------------------------------
        hidden_states = torch.cat(all_hidden_states)

        # 根据 has_hard_negative 和 has_modality_hard_negative 确定 query 的 batch_size
        if has_hard_negative and has_modality_hard_negative and not inference:
            batch_size = len(hidden_states) // 4
        elif (has_hard_negative or has_modality_hard_negative) and not inference:
            batch_size = len(hidden_states) // 3
        elif not inference:
            batch_size = len(hidden_states) // 2
        elif inference:
            batch_size = len(hidden_states)
        if inference:
            assert batch_size == len(hidden_states)
        # ---------------------------------------------------------------------------

        # 如果使用潜在注意力模块
        if self.use_latent_atten:
            assert self.latent_attention is not None, "LatentAttentionBlock is not initialized"
            hidden_states = self.latent_attention(hidden_states)

        # 如果使用指令 mask ---------------------------------------------------------
        if self.use_instruction_mask:
            if labels.shape != instruction_mask.shape:
                raise ValueError("labels 和 instruction_mask 的维度不匹配。")
            else:
                # 将 instruction_mask 不为 0 的地方对应的 labels 位置设置成 -100
                labels[instruction_mask != 0] = -100
        # -------------------------------------------------------------------------
        # 平均池化
        if self.mean_pooling:
            embed_features = self._global_mean_pool(hidden_states, labels)
        else:
            embed_index = self.config.emb_token_id # 用于提取特征的 token, 原来是 self.config.emb_token_ids[0]
            embed_indices = torch.argmax((labels == embed_index).int(), dim=1) 
            embed_features = hidden_states[torch.arange(len(embed_indices)), embed_indices - 1] # (batch_size, embed_dim)
        
        # # -------------------------------------------------------
        if inference:
            if ids is not None:
                return embed_features, ids 
            elif qids is not None or dids is not None:
                return embed_features, qids, dids 
            return embed_features 
        # # -------------------------------------------------------
        if has_hard_negative and has_modality_hard_negative:
            embed1, embed2, embed3, embed4 = embed_features[:batch_size], embed_features[batch_size:2*batch_size], embed_features[2*batch_size:3*batch_size], embed_features[3*batch_size:]
        
        elif has_hard_negative or has_modality_hard_negative:
            embed1, embed2, embed3 = embed_features[:batch_size], embed_features[batch_size:2*batch_size], embed_features[2*batch_size:]
        else:
            embed1, embed2 = embed_features[:batch_size], embed_features[batch_size:]
        if dist.is_initialized():
            # Dummy vectors for allgather
            # 如果使用 hard negative 或者 modality hard negative，处理 embed3
            if has_hard_negative or has_modality_hard_negative:
                embed3_list = [torch.zeros_like(embed3) for _ in range(dist.get_world_size())]
                dist.all_gather(tensor_list=embed3_list, tensor=embed3.contiguous())
                embed3_list[dist.get_rank()] = embed3 
                embed3 = torch.cat(embed3_list, 0)
            # 如果使用 hard negative 和 modality hard negative，处理 embed4
            if has_hard_negative and has_modality_hard_negative:
                embed4_list = [torch.zeros_like(embed4) for _ in range(dist.get_world_size())]
                dist.all_gather(tensor_list=embed4_list, tensor=embed4.contiguous())
                embed4_list[dist.get_rank()] = embed4
                embed4 = torch.cat(embed4_list, 0)
            
            # Dummy vectors for allgather
            embed1_list = [torch.zeros_like(embed1) for _ in range(dist.get_world_size())]
            embed2_list = [torch.zeros_like(embed2) for _ in range(dist.get_world_size())]
            # Allgather
            dist.all_gather(tensor_list=embed1_list, tensor=embed1.contiguous())
            dist.all_gather(tensor_list=embed2_list, tensor=embed2.contiguous())

            # Since allgather results do not have gradients, we replace the
            # current process's corresponding embeddings with original tensors
            embed1_list[dist.get_rank()] = embed1
            embed2_list[dist.get_rank()] = embed2
            # Get full batch embeddings: (bs x N, hidden)
            embed1 = torch.cat(embed1_list, 0)
            embed2 = torch.cat(embed2_list, 0)
        
        sim = Similarity(temp=0.05)
        # add normalization
        embed1 = F.normalize(embed1, dim=-1)
        embed2 = F.normalize(embed2, dim=-1)
        # 计算相似度矩阵 (显存优化后)
        # cos_sim = sim(embed1.unsqueeze(1), embed2.unsqueeze(0)) # (B, B) 逐个元素相乘
        # cos_sim = torch.matmul(embed1, embed2.T) / sim.temp  # (B, B) 矩阵乘法
        cos_sim = (embed1 @ embed2.T) # / sim.temp (B, B) 矩阵乘法
        
        if self.use_bi_loss: # 如果使用双向损失
            inverse_cos_sim = cos_sim.T  # 直接转置即可
        
        if self.use_isotropy_loss: # 如果使用同构损失
            loss_isotropy = isotropy_loss(embed1,lambda_iso = self.lambda_iso) + isotropy_loss(embed2,lambda_iso = self.lambda_iso)
        
        # 如果使用 hard negative 或者 modality hard negative，处理 embed3
        if has_hard_negative or has_modality_hard_negative:
            embed3 = F.normalize(embed3, dim=-1)
            # embed1_embed3_cos = sim(embed1.unsqueeze(1), embed3.unsqueeze(0))
            # embed1_embed3_cos = torch.matmul(embed1, embed3.T) / sim.temp
            embed1_embed3_cos = (embed1 @ embed3.T) # / sim.temp
            cos_sim = torch.cat([cos_sim, embed1_embed3_cos], 1)

            if self.use_bi_loss: # 如果使用双向损失
                embed2_embed3_cos = (embed2 @ embed3.T) # / sim.temp
                inverse_cos_sim = torch.cat([inverse_cos_sim, embed2_embed3_cos], 1)
            
            if self.use_isotropy_loss: # 如果使用同构损失
                loss_isotropy += isotropy_loss(embed3,lambda_iso = self.lambda_iso)

            
        
        # 如果使用 hard negative 和 modality hard negative，处理 embed4
        if has_hard_negative and has_modality_hard_negative:
            embed4 = F.normalize(embed4, dim=-1)
            # embed1_embed4_cos = sim(embed1.unsqueeze(1), embed4.unsqueeze(0))
            # embed1_embed4_cos = torch.matmul(embed1, embed4.T) / sim.temp
            embed1_embed4_cos = (embed1 @ embed4.T) # / sim.temp
            cos_sim = torch.cat([cos_sim, embed1_embed4_cos], 1)
            
            if self.use_bi_loss: # 如果使用双向损失
                embed2_embed4_cos = (embed2 @ embed4.T) # / sim.temp
                inverse_cos_sim = torch.cat([inverse_cos_sim, embed2_embed4_cos], 1)
            
            if self.use_isotropy_loss: # 如果使用同构损失
                loss_isotropy += isotropy_loss(embed4,lambda_iso = self.lambda_iso)
        
        nce_labels = torch.arange(cos_sim.size(0)).long().to(cos_sim.device)
        rank0_print(f"nce_labels: {nce_labels}")
        rank0_print(f"cos_sim 最原始的相似的矩阵: {cos_sim}")
        
        if self.use_cross_entropy_loss: # 如果使用交叉熵损失,手动除以温度参数
            cos_sim = cos_sim / sim.temp
            rank0_print(f"cos_sim 除以温度参数: {cos_sim}")
        
        loss = self.loss_fct(cos_sim, nce_labels)  # (1,) 计算损失

        if self.use_bi_loss: # 如果使用双向损失
            if self.use_cross_entropy_loss: # 如果使用交叉熵损失
                rank0_print(f"inverse_cos_sim : {inverse_cos_sim}")
                inverse_cos_sim = inverse_cos_sim / sim.temp
                rank0_print(f"inverse_cos_sim 除以温度参数: {inverse_cos_sim}")
            inverse_loss = self.loss_fct(inverse_cos_sim, nce_labels)
            rank0_print(f"正向损失: {loss}")
            rank0_print(f"反向损失: {inverse_loss}")
            loss = self.querytocand * loss + self.candtoquery * inverse_loss
            rank0_print(f"双向损失: {loss}")
        
        
        if self.use_isotropy_loss: # 如果使用同构损失
            if has_hard_negative and has_modality_hard_negative:
                loss_isotropy = loss_isotropy / 4
            elif has_hard_negative or has_modality_hard_negative:
                loss_isotropy = loss_isotropy / 3
            else:
                loss_isotropy = loss_isotropy / 2
            rank0_print(f"同构损失: {loss_isotropy}")
            rank0_print(f"nce 损失: {loss}")
            loss += loss_isotropy
            
            # # 检查梯度回传路径
            # rank0_print(embed1.requires_grad)  # True
            # rank0_print(isotropy_loss(embed1).grad_fn)  # <MulBackward0 object at 0x7f1a8c39c340>
            
        return SequenceClassifierOutput(loss=loss)





    # # 确认一下这个函数到底在哪里使用了 ？？？
    # def get_features(
    #     self,
    #     input_ids: torch.LongTensor = None,
    #     attention_mask: Optional[torch.Tensor] = None,
    #     position_ids: Optional[torch.LongTensor] = None,
    #     past_key_values: Optional[List[torch.FloatTensor]] = None,
    #     inputs_embeds: Optional[torch.FloatTensor] = None,
    #     labels: Optional[torch.LongTensor] = None,
    #     use_cache: Optional[bool] = None,
    #     output_attentions: Optional[bool] = None,
    #     output_hidden_states: Optional[bool] = None,
    #     return_dict: Optional[bool] = None,
    #     pixel_values: Optional[torch.Tensor] = None,
    #     pixel_values_videos: Optional[torch.FloatTensor] = None,
    #     image_grid_thw: Optional[torch.LongTensor] = None,
    #     video_grid_thw: Optional[torch.LongTensor] = None,
    #     rope_deltas: Optional[torch.LongTensor] = None,
    #     instruction_mask: Optional[torch.Tensor] = None, # 指令 mask
    #     feature_list: Optional[List[torch.Tensor]] = None, # 特征约束
    #     scores_list: Optional[List[float]] = None  # 如果存储的是浮点数分数
    # ):
    #     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    #     output_hidden_states = (
    #         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    #     )
    #     return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    #     if inputs_embeds is None:
    #         inputs_embeds = self.model.embed_tokens(input_ids)
    #         if pixel_values is not None:
    #             pixel_values = pixel_values.type(self.visual.get_dtype())
    #             image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
    #             image_mask = input_ids == self.config.image_token_id
    #             if self.training:
    #                 inputs_embeds = inputs_embeds.clone()
    #             inputs_embeds[image_mask] = image_embeds
    #         if pixel_values_videos is not None:
    #             pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
    #             video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
    #             video_mask = input_ids == self.config.video_token_id
    #             inputs_embeds[video_mask] = video_embeds
    #         if attention_mask is not None:
    #             attention_mask = attention_mask.to(inputs_embeds.device)

    #     outputs = self.model(
    #         input_ids=None,
    #         position_ids=position_ids,
    #         attention_mask=attention_mask,
    #         past_key_values=past_key_values,
    #         inputs_embeds=inputs_embeds,
    #         use_cache=use_cache,
    #         output_attentions=output_attentions,
    #         output_hidden_states=output_hidden_states,
    #         return_dict=return_dict,
    #     )

    #     hidden_states = outputs[0]
    #     if self.use_latent_atten: # 如果使用潜在注意力模块
    #         hidden_states = self.latent_attention(hidden_states)
        
    #     # 如果使用指令 mask ---------------------------------------------------------
    #     if self.use_instruction_mask:
    #         if labels.shape != instruction_mask.shape:
    #             raise ValueError("labels 和 instruction_mask 的维度不匹配。")
    #         else:
    #             # 将 instruction_mask 不为 0 的地方对应的 labels 位置设置成 -100
    #             labels[instruction_mask != 0] = -100
    #     # -------------------------------------------------------------------------

    #     # 平均池化
    #     if self.mean_pooling:
    #         embed_features = self._global_mean_pool(hidden_states, labels)
    #     else:
    #         embed_index = self.config.emb_token_id # 用于提取特征的 token, 原来是 self.config.emb_token_ids[0]
    #         embed_indices = torch.argmax((labels == embed_index).int(), dim=1)
    #         embed_features = hidden_states[torch.arange(len(embed_indices)), embed_indices - 1] # (batch_size, embed_dim)
        
    #     return embed_features 