# llava/model/multimodal_encoder/clip_encoder.py
# (Fixed version)

import torch
import torch.nn as nn

from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig


class CLIPVisionTower(nn.Module):
    """Wrap a Hugging Face CLIP vision model and expose its hidden states.

    The wrapper loads ``CLIPVisionModel`` and ``CLIPImageProcessor`` from
    ``vision_tower`` and returns either the hidden state of one configured
    layer or a list with the hidden states of several layers.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        """Store the configuration and optionally load the weights.

        Args:
            vision_tower: Identifier handed to the ``from_pretrained`` calls.
            args: Namespace-like object. ``mm_vision_select_layer`` is
                required; ``mm_vision_select_feature`` (default ``'patch'``)
                and ``unfreeze_mm_vision_tower`` (default ``False``) are
                read with ``getattr`` and may be absent.
            delay_load: When true, and the tower is not being unfrozen, only
                the ``CLIPVisionConfig`` is fetched and stored in
                ``cfg_only``; ``load_model`` must be called later.
        """
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
        self.args = args

        # An unfrozen tower is loaded immediately even when delay_load is set.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load the image processor and the vision model.

        If the model is already loaded, a notice is printed and nothing else
        happens.

        Args:
            device_map: Forwarded to ``CLIPVisionModel.from_pretrained``.
        """
        if self.is_loaded:
            print(f'{self.vision_tower_name} is already loaded, `load_model` called again, skipping.')
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)

        # Parameters are trainable only when unfreezing was requested.
        trainable = getattr(self.args, 'unfreeze_mm_vision_tower', False)
        self.vision_tower.requires_grad_(trainable)

        self.is_loaded = True

    def feature_select(self, image_forward_outs, layer_idx=-1):
        """Pick one layer's hidden state and apply the feature-selection mode.

        Args:
            image_forward_outs: Vision model output exposing ``hidden_states``.
            layer_idx: Index into ``hidden_states``. ``None`` or the default
                ``-1`` both mean "use ``self.select_layer``". Because ``-1``
                is reserved as that sentinel, it cannot be used to address
                the last entry directly; pass its non-negative index instead.

        Returns:
            The selected hidden state. In ``'patch'`` mode position 0 along
            dimension 1 is dropped; in ``'cls_patch'`` mode the hidden state
            is returned unchanged.

        Raises:
            ValueError: If ``self.select_feature`` is neither ``'patch'`` nor
                ``'cls_patch'``.
        """
        if layer_idx is None or layer_idx == -1:
            layer_idx = self.select_layer
        image_features = image_forward_outs.hidden_states[layer_idx]

        if self.select_feature == 'patch':
            # Position 0 is presumably the CLS token (the 'cls_patch' mode
            # keeps it) -- TODO confirm against the model's token layout.
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def forward(self, images, output_all_hidden_states=False):
        """Encode images and return features from one layer or from many.

        Args:
            images (torch.Tensor): The input image tensor. It is moved to the
                tower's device and dtype before the forward pass.
            output_all_hidden_states (bool): If True, return a list with one
                feature tensor per ``hidden_states`` entry from index 1
                onward (entry 0 is skipped). If False, return only the
                features of the configured ``select_layer``.

        Returns:
            A tensor, or a list of tensors, cast back to ``images.dtype``.
        """
        image_forward_outs = self.vision_tower(
            images.to(device=self.device, dtype=self.dtype),
            output_hidden_states=True,
        )

        if not output_all_hidden_states:
            return self.feature_select(image_forward_outs).to(images.dtype)

        num_states = len(image_forward_outs.hidden_states)
        return [
            self.feature_select(image_forward_outs, layer_idx=idx).to(images.dtype)
            for idx in range(1, num_states)
        ]

    @property
    def dummy_feature(self):
        """Zero tensor of shape ``(1, hidden_size)`` on the tower's device/dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """dtype reported by the loaded vision model."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device reported by the loaded vision model."""
        return self.vision_tower.device

    @property
    def config(self):
        """Config of the loaded model, or the stand-alone config before loading."""
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        """``hidden_size`` taken from the active config."""
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        """``image_size // patch_size`` from the active config."""
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        """Square of ``num_patches_per_side``."""
        return self.num_patches_per_side ** 2


class CLIPVisionTowerS2(CLIPVisionTower):
    """``CLIPVisionTower`` subclass built with the same constructor arguments."""

    def __init__(self, vision_tower, args, delay_load=False):
        """Delegate construction to ``CLIPVisionTower.__init__``."""
        super().__init__(vision_tower, args, delay_load=delay_load)
