import torch
from typing import Tuple, Callable
from transformers import AutoConfig
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaModel, _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa, Cache, DynamicCache
from transformers.models.llama import LlamaConfig
from transformers.modeling_outputs import BaseModelOutputWithPast
from typing import List, Optional, Tuple, Union


class STARLlamaModel(LlamaModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`].

    On top of the stock [`LlamaModel`] forward pass, this model prunes visual tokens once, right
    before decoder layer `config.K`:

    1. At layer `K - 1` the layer is additionally run twice with `output_attentions=True` to capture
       attention weights: once on the current hidden states and once after the original visual-token
       embeddings have been added back onto the visual-token positions.
    2. At layer `K` those attention maps are turned into one score per visual token, the
       `int(visual_token_num * retain_ratio)` best-scoring visual tokens are kept, and the hidden
       states, attention mask and position ids are reduced to the kept tokens.

    The input sequence is assumed to be laid out as
    `[system prompt (sys_prompt_num tokens) | visual tokens (visual_token_num tokens) | remaining tokens]`.

    Config fields read by this class:
        K: index of the decoder layer in front of which pruning happens.
        retain_ratio: fraction of `visual_token_num` visual tokens to keep.
        visual_token_num: number of visual tokens (falls back to 576 if absent).
        sys_prompt_num (optional): number of tokens preceding the visual tokens (default 35).
        use_text_attention (optional): score visual tokens by the attention they receive from the
            tokens located after the visual span (default False).
        use_sink_visual (optional): score visual tokens by the attention they receive from the
            tokens that 2-means clustering marks as high-attention (default True).

    Notes:
        * Pruning is skipped entirely on the gradient-checkpointing training path.
        * Only the attention maps of the first batch element are used to choose the kept tokens;
          the same indices are then applied to every element of the batch.
        * NOTE(review): the probe passes at layer `K - 1` receive `past_key_values`. If the attention
          layer writes to the cache on every call, that layer's cache entry would be updated more
          than once per forward pass - confirm before relying on `use_cache=True`.

    Args:
        config: LlamaConfig
    """

    # Fallback prompt layout, used when the config does not provide the corresponding field.
    _DEFAULT_SYS_PROMPT_NUM = 35
    _DEFAULT_VISUAL_TOKEN_NUM = 576

    def __init__(self, config: LlamaConfig):
        super().__init__(config)

        # Attention weights captured by the two probe passes at layer K - 1 (see `forward`).
        self.last_attention = None   # probe on the unmodified hidden states
        self.last_attention1 = None  # probe after re-adding the original visual embeddings

    def kmeans_2means(
        self, data: torch.Tensor, max_iters: int = 2000, tol: float = 1e-5
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Split `data` into two clusters with Lloyd's k-means.

        The two initial centroids are drawn at random from the rows of `data`, so the result depends
        on the global torch RNG state.

        Args:
            data: 2-D tensor of shape `(num_points, dim)`.
            max_iters: maximum number of assignment/update rounds.
            tol: absolute tolerance passed to `torch.allclose` for the convergence check.

        Returns:
            `(labels, centroids)`: `labels` holds the cluster index (0 or 1) of each row of `data`,
            `centroids` holds the cluster centres.
        """
        # Advanced indexing returns a copy, so the in-place centroid updates never touch `data`.
        centroids = data[torch.randperm(len(data))[:2]]
        prev_centroids = torch.zeros_like(centroids)
        labels = None

        for _ in range(max_iters):
            # Assignment step: nearest centroid for every point.
            distances = torch.cdist(data, centroids)
            labels = torch.argmin(distances, dim=1)

            # Update step: an empty cluster keeps its previous centroid.
            for i in range(2):
                cluster_points = data[labels == i]
                if len(cluster_points) > 0:
                    centroids[i] = cluster_points.mean(dim=0)

            if torch.allclose(centroids, prev_centroids, atol=tol):
                break

            prev_centroids = centroids.clone()

        if labels is None:
            # max_iters <= 0: still return a valid assignment w.r.t. the initial centroids.
            labels = torch.argmin(torch.cdist(data, centroids), dim=1)

        return labels, centroids

    def _star_select_keep_indices(
        self,
        seq_length: int,
        device: torch.device,
        sys_prompt_num: int,
        visual_token_num: int,
        retain_ratio: float,
    ) -> torch.Tensor:
        """Return the sorted sequence positions that survive visual-token pruning."""
        if self.last_attention is None or self.last_attention1 is None:
            raise RuntimeError(
                "Visual-token pruning needs the attention weights of layer K - 1, but none were "
                "captured. Make sure config.K >= 1 and that the attention implementation returns "
                "attention weights when output_attentions=True."
            )

        visual_end = sys_prompt_num + visual_token_num

        # Average over dim 1 and keep the first batch element only.
        # Assumes attention weights shaped (batch, heads, query, key) - TODO confirm.
        attn_mean = self.last_attention.mean(dim=1)[0]

        use_text_attention = getattr(self.config, "use_text_attention", False)
        use_sink_visual = getattr(self.config, "use_sink_visual", True)

        if use_text_attention:
            # Mean attention from the rows after the visual span to each visual token.
            image_attention_score = attn_mean[visual_end + 1:, sys_prompt_num:visual_end].mean(dim=0)
        elif use_sink_visual:
            attn_mean1 = self.last_attention1.mean(dim=1)[0]

            # Mean attention each position receives (averaged over rows), as a column vector
            # so that it can be clustered as 1-D points.
            attn_mean_avg = attn_mean.mean(dim=0).unsqueeze(1)
            labels, centroids = self.kmeans_2means(attn_mean_avg)

            # Positions in the cluster with the larger centroid are the high-attention ones.
            larger_cluster_idx = torch.argmax(centroids)
            significant_indices = (labels == larger_cluster_idx).nonzero(as_tuple=True)[0]

            # Final attention score: mean attention from those positions to each visual token,
            # read from the second probe.
            image_attention_score = attn_mean1[significant_indices, sys_prompt_num:visual_end].mean(dim=0)
        else:
            # Attention from the last row to each visual token.
            image_attention_score = attn_mean[-1, sys_prompt_num:visual_end]

        num_image_tokens = image_attention_score.shape[0]
        k = int(visual_token_num * retain_ratio)

        if k < 0 or k > num_image_tokens:
            # Budget is not satisfiable (e.g. the sequence holds fewer visual tokens): keep everything.
            return torch.arange(seq_length, device=device)

        # The probe attention may live on another device than the current hidden states.
        top_idx = image_attention_score.topk(k).indices.to(device) + sys_prompt_num

        # Clamp both ranges so that short sequences never produce out-of-range positions.
        prompt_idx = torch.arange(min(sys_prompt_num, seq_length), device=device)  # system prompt tokens
        remaining_idx = torch.arange(min(visual_end, seq_length), seq_length, device=device)  # remaining tokens

        return torch.cat([prompt_idx, top_idx, remaining_idx]).sort().values

    @staticmethod
    def _star_prune_attention_mask(attention_mask: torch.Tensor, keep_indexs: torch.Tensor) -> torch.Tensor:
        """Restrict an attention mask to the kept positions."""
        index = keep_indexs.to(attention_mask.device)
        if attention_mask.dim() == 4:
            # Select the kept rows and the kept columns. Because `keep_indexs` is sorted, a causal
            # mask stays causal, and padding columns stay attached to the right tokens.
            return attention_mask[:, :, index][:, :, :, index]
        # 2-D mask: one entry per token.
        return attention_mask[:, index]

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Run the decoder stack, pruning visual tokens in front of layer `config.K`.

        Exactly one of `input_ids` and `inputs_embeds` must be given. Arguments left as `None` fall
        back to the corresponding `config` value (`use_cache`, `output_attentions`,
        `output_hidden_states`, `use_return_dict`).

        Returns:
            A [`BaseModelOutputWithPast`], or a plain tuple of its non-`None` fields when
            `return_dict` is false. From layer `K` onwards the hidden states (and therefore
            `last_hidden_state`) only contain the kept tokens.

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are provided.
            RuntimeError: if pruning is reached without captured attention weights.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        use_checkpointing = self.gradient_checkpointing and self.training
        if use_checkpointing and use_cache:
            print(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        past_key_values_length = 0
        use_legacy_cache = False
        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # 2d mask is passed through the layers
            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
        elif self._use_sdpa and not output_attentions:
            # output_attentions=True can not be supported when using SDPA, and we fall back on
            # the manual implementation that requires a 4D causal mask in all cases.
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
            )

        # embed positions
        hidden_states = inputs_embeds

        # Prompt layout. Resolved once so that every slice below uses the same boundaries.
        sys_prompt_num = getattr(self.config, "sys_prompt_num", self._DEFAULT_SYS_PROMPT_NUM)
        visual_token_num = getattr(self.config, "visual_token_num", self._DEFAULT_VISUAL_TOKEN_NUM)
        visual_end = sys_prompt_num + visual_token_num

        # Original (layer-0 input) embeddings of the visual tokens; re-added at layer K - 1.
        visual_tokens_origin = hidden_states[:, sys_prompt_num:visual_end, :]

        # Pruning settings are only needed (and only read) on the non-checkpointing path.
        prune_layer = probe_layer = retain_ratio = None
        if not use_checkpointing:
            prune_layer = self.config.K
            probe_layer = prune_layer - 1
            retain_ratio = self.config.retain_ratio

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if use_checkpointing:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_idx = decoder_layer.self_attn.layer_idx

                if layer_idx == prune_layer and seq_length > 1:
                    # Drop the low-scoring visual tokens before this layer runs.
                    keep_indexs = self._star_select_keep_indices(
                        seq_length=seq_length,
                        device=hidden_states.device,
                        sys_prompt_num=sys_prompt_num,
                        visual_token_num=visual_token_num,
                        retain_ratio=retain_ratio,
                    )
                    hidden_states = hidden_states[:, keep_indexs, :]
                    if attention_mask is not None:
                        attention_mask = self._star_prune_attention_mask(attention_mask, keep_indexs)
                    # Kept tokens retain their original positions.
                    position_ids = keep_indexs.unsqueeze(0)

                if layer_idx == probe_layer:
                    # Probe 1: attention weights of this layer on the current hidden states.
                    probe_outputs = decoder_layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        past_key_value=past_key_values,
                        output_attentions=True,
                        use_cache=use_cache,
                    )
                    self.last_attention = probe_outputs[1]

                    # Add the original visual embeddings back onto the visual-token positions.
                    # The modified hidden states also feed the regular pass of this layer below.
                    pad_tensor = torch.zeros_like(hidden_states)
                    pad_tensor[:, sys_prompt_num:visual_end, :] = visual_tokens_origin
                    hidden_states = hidden_states + pad_tensor

                    # Probe 2: attention weights on the modified hidden states.
                    probe_outputs1 = decoder_layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        past_key_value=past_key_values,
                        output_attentions=True,
                        use_cache=use_cache,
                    )
                    self.last_attention1 = probe_outputs1[1]

                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )



