from abc import ABC, abstractmethod

import torch
import torch.nn as nn

from .multimodal_encoder.builder import build_vision_tower
from .multimodal_projector.builder import build_vision_projector

from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN


class LlavaMetaModel:
    """Mixin that attaches a vision tower and a multimodal projector to a model.

    ``__init__`` forwards ``config`` to the next class in the MRO and
    ``initialize_vision_modules`` reads and writes ``self.config``, so this
    class is meant to be combined with a base class that accepts a config
    and exposes it as ``self.config``.
    """

    def __init__(self, config):
        """Initialise the next class in the MRO, then build vision modules if configured.

        Args:
            config: Model configuration. When it has an ``mm_vision_tower``
                attribute, the vision tower is built with ``delay_load=True``
                and the projector is built from the same config.
        """
        super().__init__(config)

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

    def get_vision_tower(self):
        """Return the vision tower, or ``None`` when none is attached.

        ``initialize_vision_modules`` may store the tower wrapped in a
        one-element list; in that case the wrapped element is returned.
        """
        vision_tower = getattr(self, 'vision_tower', None)
        if isinstance(vision_tower, list):
            vision_tower = vision_tower[0]
        return vision_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Build or load the vision tower and projector and record settings on ``self.config``.

        Args:
            model_args: Object providing ``vision_tower``,
                ``mm_vision_select_layer``, ``mm_vision_select_feature`` and
                ``pretrain_mm_mlp_adapter``; ``mm_projector_type`` is optional
                and defaults to ``'linear'``.
            fsdp: Optional sized value. When non-empty, a newly built tower is
                stored inside a one-element list instead of directly on
                ``self``.
                NOTE(review): presumably this keeps the tower from being
                registered as a submodule under FSDP - confirm.
        """
        vision_tower_name = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower_name

        # get_vision_tower() already unwraps the list form, so an existing
        # tower is found no matter how it was stored and regardless of
        # whether ``fsdp`` is passed on this call.
        vision_tower = self.get_vision_tower()
        if vision_tower is None:
            vision_tower = build_vision_tower(model_args)
            if fsdp is not None and len(fsdp) > 0:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            # NOTE: torch.load unpickles the file; only point this at
            # checkpoints from a trusted source.
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')

            # Keep only projector entries and strip everything up to and
            # including the "mm_projector." prefix. Filtering on the dotted
            # prefix guarantees the split below always has a second element.
            prefix = 'mm_projector.'
            projector_state = {
                k.split(prefix)[1]: v
                for k, v in mm_projector_weights.items()
                if prefix in k
            }
            self.mm_projector.load_state_dict(projector_state)


class LlavaMetaForCausalLM(ABC):
    """Abstract mixin that adds image handling to a causal language model.

    Subclasses implement ``get_model``. The object it returns must provide
    ``get_vision_tower()``, ``mm_projector`` and ``embed_tokens``, which the
    methods below call. The methods also use ``self.config``, ``self.device``,
    ``self.resize_token_embeddings``, ``self.get_input_embeddings`` and
    ``self.get_output_embeddings``, none of which are defined here, so the
    concrete class has to supply them.
    """

    @abstractmethod
    def get_model(self):
        """Return the inner model that owns the vision tower, projector and token embeddings."""
        pass

    def get_vision_tower(self):
        """Return the vision tower of the inner model (delegates to ``get_model()``)."""
        return self.get_model().get_vision_tower()

    def encode_images(self, images):
        """Run ``images`` through the vision tower and then through the multimodal projector."""
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm_projector(image_features)
        return image_features

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, images
    ):
        """Replace image placeholder tokens with projected image features.

        Each position in ``input_ids`` equal to ``IMAGE_TOKEN_INDEX`` is
        replaced by the feature rows of the next image. Labels (when given)
        get ``IGNORE_INDEX`` over the inserted rows, and the attention mask
        (when given) is extended to the new sequence length.

        Args:
            input_ids: Batched token ids; indexed as ``input_ids.shape[1]``
                and iterated per sample.
            attention_mask: Batched mask or ``None``.
            past_key_values: Cache passed through unchanged; only inspected
                on the single-token early-return path.
            labels: Batched labels or ``None``.
            images: A list of tensors, a 5-D tensor, or a tensor handed to
                ``encode_images`` as is; ``None`` disables image handling.

        Returns:
            A 7-tuple ``(input_ids, attention_mask, past_key_values,
            inputs_embeds, labels, img_seq, pre_prompt_length_list)``.
            On the early-return path ``input_ids`` is passed through,
            ``inputs_embeds`` is ``None``, ``img_seq`` is ``0`` and the list
            is empty. Otherwise ``input_ids`` is ``None``, ``inputs_embeds``
            is the stacked embedding tensor, ``img_seq`` is
            ``image_features[0].shape[0]`` and the list holds, for every
            sample with an image token, the index of its first image token.

        Side effects:
            Stores ``img_seq`` and ``pre_prompt_length_list`` on ``self``.
        """
        vision_tower = self.get_vision_tower()
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
                # Single-token step with a cache: rebuild the mask as all
                # ones, one longer than size(2) of the last cached tensor.
                attention_mask = torch.ones(
                    (attention_mask.shape[0], past_key_values[-1][-1].size(2) + 1),
                    dtype=attention_mask.dtype,
                    device=attention_mask.device,
                )
            self.img_seq = 0
            self.pre_prompt_length_list = []
            return input_ids, attention_mask, past_key_values, None, labels, self.img_seq, self.pre_prompt_length_list

        if isinstance(images, list) or images.ndim == 5:
            # Encode all images in one call, then split back per entry and
            # merge each entry's first two dimensions.
            concat_images = torch.cat(list(images), dim=0)
            image_features = self.encode_images(concat_images)
            split_sizes = [image.shape[0] for image in images]
            image_features = torch.split(image_features, split_sizes, dim=0)
            image_features = [x.flatten(0, 1) for x in image_features]
        else:
            image_features = self.encode_images(images)

        embed_tokens = self.get_model().embed_tokens

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_image_idx = 0
        pre_prompt_length_list = []
        for batch_idx, cur_input_ids in enumerate(input_ids):
            if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
                # multimodal LLM, but the current sample is not multimodal
                # FIXME: this is a hacky fix, for deepspeed zero3 to work
                # An empty slice of the image features is concatenated in, and
                # one image-feature entry is consumed for this sample.
                half_len = cur_input_ids.shape[0] // 2
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = embed_tokens(cur_input_ids[:half_len])
                cur_input_embeds_2 = embed_tokens(cur_input_ids[half_len:])
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            # Evaluated once per sample; it does not change inside the loop.
            use_im_start_end = (
                getattr(self.config, 'tune_mm_mlp_adapter', False)
                and getattr(self.config, 'mm_use_im_start_end', False)
            )

            image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
            pre_prompt_length_list.append(image_token_indices[0])
            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape

            # Consume the sample left to right, one image token per pass.
            while image_token_indices.numel() > 0:
                cur_image_features = image_features[cur_image_idx]
                image_token_start = image_token_indices[0]
                if use_im_start_end:
                    # The tokens right before and right after the image token
                    # are embedded separately; only the text before them is
                    # detached from the graph.
                    cur_new_input_embeds.append(embed_tokens(cur_input_ids[:image_token_start-1]).detach())
                    cur_new_input_embeds.append(embed_tokens(cur_input_ids[image_token_start-1:image_token_start]))
                    cur_new_input_embeds.append(cur_image_features)
                    cur_new_input_embeds.append(embed_tokens(cur_input_ids[image_token_start+1:image_token_start+2]))
                    if labels is not None:
                        cur_new_labels.append(cur_labels[:image_token_start])
                        cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                        cur_new_labels.append(cur_labels[image_token_start:image_token_start+1])
                        cur_labels = cur_labels[image_token_start+2:]
                    cur_input_ids = cur_input_ids[image_token_start+2:]
                else:
                    cur_new_input_embeds.append(embed_tokens(cur_input_ids[:image_token_start]))
                    cur_new_input_embeds.append(cur_image_features)
                    if labels is not None:
                        cur_new_labels.append(cur_labels[:image_token_start])
                        cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                        cur_labels = cur_labels[image_token_start+1:]
                    cur_input_ids = cur_input_ids[image_token_start+1:]
                cur_image_idx += 1
                image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]

            # Remaining text after the last image token.
            if cur_input_ids.numel() > 0:
                tail_embeds = embed_tokens(cur_input_ids)
                if use_im_start_end:
                    tail_embeds = tail_embeds.detach()
                cur_new_input_embeds.append(tail_embeds)
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
            new_input_embeds.append(cur_new_input_embeds)
            if labels is not None:
                cur_new_labels = torch.cat(cur_new_labels, dim=0)
                new_labels.append(cur_new_labels)

        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            # Per-sample lengths before padding. The mask is derived from
            # these rather than from the labels, so this branch also works
            # when ``labels`` is None (e.g. batched inference).
            seq_lens = [x.shape[0] for x in new_input_embeds]
            max_len = max(seq_lens)

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                pad = torch.zeros(
                    (max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]),
                    dtype=cur_new_embed.dtype,
                    device=cur_new_embed.device,
                )
                new_input_embeds_align.append(torch.cat((cur_new_embed, pad), dim=0))
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                for cur_new_label in new_labels:
                    pad = torch.full(
                        (max_len - cur_new_label.shape[0],),
                        IGNORE_INDEX,
                        dtype=cur_new_label.dtype,
                        device=cur_new_label.device,
                    )
                    new_labels_align.append(torch.cat((cur_new_label, pad), dim=0))
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                orig_len = input_ids.shape[1]
                new_attention_mask = []
                for cur_attention_mask, cur_len in zip(attention_mask, seq_lens):
                    # Left: attend to the positions gained from image
                    # features. Right: mask out the alignment padding.
                    new_attn_mask_pad_left = torch.full((cur_len - orig_len,), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attn_mask_pad_right = torch.full((max_len - cur_len,), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                    new_attention_mask.append(cur_new_attention_mask)
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_input_embeds.shape[:2]
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        self.img_seq = image_features[0].shape[0]
        self.pre_prompt_length_list = pre_prompt_length_list
        return None, attention_mask, past_key_values, new_input_embeds, new_labels, self.img_seq, self.pre_prompt_length_list

    def initialize_vision_tokenizer(self, model_args, tokenizer):
        """Register multimodal special tokens and set up their embeddings.

        Args:
            model_args: Object providing ``mm_use_im_patch_token``,
                ``mm_use_im_start_end``, ``tune_mm_mlp_adapter`` and
                ``pretrain_mm_mlp_adapter``.
            tokenizer: Tokenizer supporting ``add_tokens`` and ``len()``.

        Raises:
            ValueError: If the pretrained ``model.embed_tokens.weight`` has a
                shape that matches neither the current input embeddings nor
                the number of newly added tokens.
        """
        if model_args.mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

        if model_args.mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

            if num_new_tokens > 0:
                input_embeddings = self.get_input_embeddings().weight.data
                output_embeddings = self.get_output_embeddings().weight.data

                # Initialise the new rows with the mean of the existing rows.
                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)
                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)

                input_embeddings[-num_new_tokens:] = input_embeddings_avg
                output_embeddings[-num_new_tokens:] = output_embeddings_avg

            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if model_args.pretrain_mm_mlp_adapter:
                # NOTE: torch.load unpickles the file; only point this at
                # checkpoints from a trusted source.
                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                assert num_new_tokens == 2
                # Fetched here so the name is always bound in this branch,
                # even if the assert above is stripped by ``-O``.
                input_embeddings = self.get_input_embeddings().weight.data
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
        elif model_args.mm_use_im_patch_token:
            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = False
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

#     def prepare_sparse_inputs_labels_for_multimodal(
#         self, input_ids, position_ids, attention_mask, past_key_values, labels,
#         images, image_sizes=None
#     ):
#         vision_tower = self.get_vision_tower()      # CLIPVisionTower
#         if vision_tower is None or images is None or input_ids.shape[1] == 1:
#             self.model.generate_process_count += 1
#             return input_ids, position_ids, attention_mask, past_key_values, None, labels,self.img_seq,self.token_length_list,self.pre_prompt_length_list

#         if type(images) is list or images.ndim == 5:
#             if type(images) is list:
#                 images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
#             concat_images = torch.cat([image for image in images], dim=0)
#             image_features, key_set = self.encode_images(concat_images)
#             split_sizes = [image.shape[0] for image in images]
#             image_features = torch.split(image_features, split_sizes, dim=0)
#             mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat')
#             image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square')
#             mm_patch_merge_type = 'flat'
#             if mm_patch_merge_type == 'flat':
#                 for i in range(key_set.shape[0]):
#                     key_set[i] += image_features[0].shape[1]*i
#                 image_features = [x.flatten(0, 1) for x in image_features]
#             elif mm_patch_merge_type.startswith('spatial'):
#                 new_image_features = []
#                 for image_idx, image_feature in enumerate(image_features):
#                     if image_feature.shape[0] > 1:
#                         base_image_feature = image_feature[0]
#                         image_feature = image_feature[1:]
#                         height = width = self.get_vision_tower().num_patches_per_side
#                         assert height * width == base_image_feature.shape[0]
#                         if image_aspect_ratio == 'anyres':
#                             num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size)
#                             image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
#                         else:
#                             raise NotImplementedError
#                         if 'unpad' in mm_patch_merge_type:
#                             image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
#                             image_feature = image_feature.flatten(1, 2).flatten(2, 3)
#                             image_feature = unpad_image(image_feature, image_sizes[image_idx])
#                             image_feature = torch.cat((
#                                 image_feature,
#                                 self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
#                             ), dim=-1)
#                             image_feature = image_feature.flatten(1, 2).transpose(0, 1)
#                         else:
#                             image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
#                             image_feature = image_feature.flatten(0, 3)
#                         image_feature = torch.cat((base_image_feature, image_feature), dim=0)
#                     else:
#                         image_feature = image_feature[0]
#                         if 'unpad' in mm_patch_merge_type:
#                             image_feature = torch.cat((
#                                 image_feature,
#                                 self.model.image_newline[None].to(image_feature.device)
#                             ), dim=0)
#                     new_image_features.append(image_feature)
#                 image_features = new_image_features
#             else:
#                 raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
#         else:
#             image_features, key_set = self.encode_images(images) # e.g. [1, 3, 336, 336] becomes [1, 576, 4096]

        
#         # TODO: image start / end is not implemented here to support pretraining.
#         if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
#             raise NotImplementedError

#         # Let's just add dummy tensors if they do not exist,
#         # it is a headache to deal with None all the time.
#         # But it is not ideal, and if you have a better idea,
#         # please open an issue / submit a PR, thanks.
#         _labels = labels
#         _position_ids = position_ids
#         _attention_mask = attention_mask
#         if attention_mask is None:
#             attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
#         else:
#             attention_mask = attention_mask.bool()
#         if position_ids is None:    #  fill position_ids based on the length of input_ids
#             position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
#         if labels is None:
#             labels = torch.full_like(input_ids, IGNORE_INDEX)

#         # remove the padding using attention_mask -- FIXME
#         # decide which positions need to be masked out
#         _input_ids = input_ids
#         input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
#         labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
        
#         new_input_embeds = []   # the big for-loop below concatenates pre_prompt_embedding, image_embedding and question_embedding
#         new_labels = []
#         cur_image_idx = 0
#         pre_prompt_length_list = []      # records which tokens belong to the leading prompt; those tokens are not sparsified
#         for batch_idx, cur_input_ids in enumerate(input_ids):
#             num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
#             if num_images == 0:
#                 cur_image_features = image_features[cur_image_idx]
#                 cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
#                 cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
#                 new_input_embeds.append(cur_input_embeds)
#                 new_labels.append(labels[batch_idx])
#                 cur_image_idx += 1
#                 continue
#             # The input has the form <pre_prompt><image><question>, where <image> acts as a separator; the line below finds the <image> positions, e.g. [-1, 35, 93]
#             image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
#             pre_prompt_length_list.append(image_token_indices[1])
#             cur_input_ids_noim = [] # input ids stored as a list with the <image> separator removed, i.e. [ <pre_prompt> , <question> ]
#             cur_labels = labels[batch_idx]
#             cur_labels_noim = []    # labels stored as a list with the <image> separator removed, i.e. [ <pre_prompt> , <question> ]
#             for i in range(len(image_token_indices) - 1):
#                 cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
#                 cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
#             split_sizes = [x.shape[0] for x in cur_labels_noim] # [35, 57]
#             cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) # i.e. the <image> marker itself is not embedded, torch.Size([92, 4096])
#             cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)  # split the embeddings back into the <pre_prompt> and <question> parts
#             cur_new_input_embeds = [] # after the for-loop becomes [ pre_prompt_embedding , image_embedding , question_embedding ], lengths [35,576,57]
#             cur_new_labels = []     # after the for-loop becomes [ pre_prompt_labels , image_labels , question_labels ], lengths [35,576,57]

#             for i in range(num_images + 1):
#                 cur_new_input_embeds.append(cur_input_embeds_no_im[i])
#                 cur_new_labels.append(cur_labels_noim[i])
#                 if i < num_images:
#                     cur_image_features = image_features[cur_image_idx]
#                     cur_image_idx += 1
#                     cur_new_input_embeds.append(cur_image_features) # the <image> marker was left un-embedded above precisely so that it can be replaced here with the actual image_features
#                     cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
#                     # set every label that corresponds to these image_features to IGNORE_INDEX
#             cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]    

#             cur_new_input_embeds = torch.cat(cur_new_input_embeds)  # concatenate them into a single tensor
#             cur_new_labels = torch.cat(cur_new_labels) # concatenate them into a single tensor

#             new_input_embeds.append(cur_new_input_embeds)
#             new_labels.append(cur_new_labels)

#         # Truncate sequences to max length as image embeddings can make the sequence longer
#         tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
#         if tokenizer_model_max_length is not None:
#             new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
#             new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

#         # Combine them      The code below pads every input in a batch to the same length; e.g. if some inputs are [1,4096] and others are [668,4096], all of them are padded to [668,4096]
#         max_len = max(x.shape[0] for x in new_input_embeds)
#         batch_size = len(new_input_embeds)
#         #  pad labels with IGNORE_INDEX, and pad attention_mask and position_ids with 0
#         new_input_embeds_padded = []
#         new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
#         attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
#         position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
#         token_length_list = []      # this list is used later to mask out the padded tokens
#         for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
#             cur_len = cur_new_embed.shape[0]
#             token_length_list.append(cur_len)
#             if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
#                 new_input_embeds_padded.append(torch.cat((
#                     torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
#                     cur_new_embed
#                 ), dim=0))
#                 if cur_len > 0:
#                     new_labels_padded[i, -cur_len:] = cur_new_labels
#                     attention_mask[i, -cur_len:] = True
#                     position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
#             else:
#                 new_input_embeds_padded.append(torch.cat((
#                     cur_new_embed,
#                     torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
#                 ), dim=0))
#                 if cur_len > 0:
#                     new_labels_padded[i, :cur_len] = cur_new_labels
#                     attention_mask[i, :cur_len] = True
#                     position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

#         new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)  # stack the list elements into one tensor

#         if _labels is None:
#             new_labels = None
#         else:
#             new_labels = new_labels_padded

#         if _attention_mask is None:
#             attention_mask = None
#         else:
#             attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

#         if _position_ids is None:
#             position_ids = None

#         self.img_seq = image_features[0].shape[0]
#         self.key_set = [key_set.shape[0],key_set.flatten()]
#         self.token_length_list = token_length_list
#         self.pre_prompt_length_list = pre_prompt_length_list
#         self.model.init_token_total_shape = max_len       # this value is used to initialize the policy
# # attention_mask.shape：torch.Size([1, 668])，new_input_embeds.shape：torch.Size([1, 668, 4096])，new_labels.shape：torch.Size([1, 668])
#         return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels,image_features[0].shape[0], self.img_seq, pre_prompt_length_list
