from gettext import ngettext
from turtle import st
from sympy import im
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLCausalLMOutputWithPast, Qwen2_5_VLForConditionalGeneration
from uav_vln.constants import IGNORE_INDEX
from typing import List, Tuple, Union, Optional
from uav_vln.trainer import rank0_print

class QwenVLwithVisionHeadOutputWithPast(Qwen2_5_VLCausalLMOutputWithPast):
    """
    Qwen2.5-VL causal-LM output extended with the extra fields used by the
    vision (pointer) head.

    The three extra values are stored as plain instance attributes after the
    parent constructor has run; every other positional/keyword argument is
    forwarded to the parent class unchanged.

    Args:
        lm_loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
            Language modeling loss.
        pointer_loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
            Vision pointer network loss.
        pointer_scores (`List[torch.FloatTensor]`, *optional*):
            Attention scores from the pointer network, one tensor per batch item.
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*):
            Combined training loss (parent-class field).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores from the language modeling head (parent-class field).
        past_key_values, hidden_states, attentions, rope_deltas:
            Same as parent class.
    """
    def __init__(self, lm_loss=None, pointer_loss=None, pointer_scores=None, *args, **kwargs):
        # Let the parent populate its own fields first.
        super().__init__(*args, **kwargs)

        # Attach the head-specific extras, in declaration order.
        extra_fields = (
            ("lm_loss", lm_loss),
            ("pointer_loss", pointer_loss),
            ("pointer_scores", pointer_scores),
        )
        for field_name, field_value in extra_fields:
            setattr(self, field_name, field_value)


class VisionHead_MultiPatch(nn.Module):
    """Pointer-style head scoring encoder states against decoder query states.

    The encoder states are first contextualised with one self-attention layer
    (residual + LayerNorm), then both encoder and decoder states go through
    their own two-layer MLP projection. The scaled dot product between the
    projected decoder and encoder states gives one logit per
    (query, encoder position) pair.

    Args:
        d_model: Feature size of the incoming hidden states (also the output
            size of both projections and the attention embedding size).
        projection_dim: Hidden width of the two projection MLPs.
        num_attention_heads: Number of heads in the encoder self-attention.
        dropout_rate: Dropout used inside the self-attention and on its
            output before the residual connection.
    """

    def __init__(self, d_model, projection_dim, num_attention_heads=8, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model

        # Note: We omit additional normalization here because Qwen2VL
        # already normalizes hidden states using RMSNorm.
        self.projection_enc = nn.Sequential(
            nn.Linear(d_model, projection_dim),
            nn.GELU(),
            nn.Linear(projection_dim, d_model)
        )
        self.projection_dec = nn.Sequential(
            nn.Linear(d_model, projection_dim),
            nn.GELU(),
            nn.Linear(projection_dim, d_model)
        )

        # Self-attention layer that lets encoder positions exchange context.
        self.self_attention = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=num_attention_heads,
            dropout=dropout_rate,
            batch_first=True
        )

        # Layer normalization and dropout for the residual connection.
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self,
                hidden_state_enc,  # shape: [n_enc, d_model] where n_enc can vary with image size
                hidden_state_dec,  # shape: [n_dec, d_model] there can be multiple queries in one sample
                labels: Optional[torch.Tensor] = None,
                do_single_patch: bool = False,
               ):
        """Score every encoder position for every decoder query.

        Args:
            hidden_state_enc: Encoder states, `[n_enc, d_model]`.
            hidden_state_dec: Decoder/query states, `[n_dec, d_model]`.
            labels: Optional supervision.
                * `do_single_patch=False`: `[n_dec, n_enc]` mask marking the
                  target positions of each query; every row is normalised
                  into a target distribution.
                * `do_single_patch=True`: a target accepted by
                  `F.cross_entropy` for `[n_dec, n_enc]` logits, typically a
                  `LongTensor` of shape `[n_dec]` holding one index per query.
            do_single_patch: Selects the cross-entropy loss instead of the
                KL-divergence loss.

        Returns:
            Tuple `(attn_weights, loss)`: `attn_weights` is the softmax over
            the encoder dimension, `[n_dec, n_enc]`; `loss` is a scalar
            tensor, or `None` when `labels` is `None`.
        """
        # nn.MultiheadAttention (batch_first=True) wants a batch dimension.
        enc_input = hidden_state_enc.unsqueeze(0)
        attn_output, _ = self.self_attention(
            query=enc_input,
            key=enc_input,
            value=enc_input,
            need_weights=False
        )
        # Residual connection + layer normalization, then drop the batch dim.
        hidden_state_enc_ctx = self.layer_norm(enc_input + self.dropout(attn_output))
        hidden_state_enc_ctx = hidden_state_enc_ctx.squeeze(0)  # [n_enc, d_model]

        # Apply the projection networks.
        proj_enc = self.projection_enc(hidden_state_enc_ctx)  # [n_enc, d_model]
        proj_dec = self.projection_dec(hidden_state_dec)  # [n_dec, d_model]

        # Scaled dot-product scores; scaling by sqrt(d_model) keeps the
        # logits in a sane range regardless of the (variable) n_enc.
        scaling = self.d_model ** 0.5
        patch_logits = torch.matmul(proj_dec, proj_enc.transpose(0, 1)) / scaling  # [n_dec, n_enc]

        # Softmax normalization is applied along the encoder dimension.
        attn_weights = F.softmax(patch_logits, dim=-1)

        loss = None
        if labels is not None:
            if do_single_patch:
                # cross_entropy applies log-softmax internally, so it must be
                # fed the raw logits (the previous code referenced an
                # undefined `attn_scores` and raised NameError here).
                loss = F.cross_entropy(patch_logits, labels)
            else:
                epsilon = 1e-8
                labels_float = labels.float()
                # Normalize each row to get a target probability distribution;
                # epsilon guards against all-zero rows.
                target_dist = labels_float / (labels_float.sum(dim=-1, keepdim=True) + epsilon)

                # kl_div expects log-probabilities as its first argument.
                pred_log_probs = F.log_softmax(patch_logits, dim=-1)
                loss = F.kl_div(pred_log_probs, target_dist, reduction='batchmean')

        return attn_weights, loss


class Qwen2_5_VLForConditionalGenerationWithPointer(Qwen2_5_VLForConditionalGeneration):
    def __init__(self, *args, **kwargs):
        """Build the model by delegating entirely to the parent constructor.

        All positional and keyword arguments are forwarded unchanged. No extra
        submodules or attributes are created here at the moment: the pointer
        head and the loss-weight attributes below are commented out.
        """
        super().__init__(*args, **kwargs)
        # Disabled: the multi-patch pointer head and its loss weights.
        # self.multi_patch_pointer_head = VisionHead_MultiPatch(self.config.hidden_size, self.config.hidden_size)
        # self.pointer_loss_weight = kwargs.get("pointer_loss_weight", 1.0)
        # self.lm_loss_weight = kwargs.get("lm_loss_weight", 1.0)
        # NOTE(review): presumably the Hugging Face post-init hook (weight
        # init etc.), kept from when the head above was created here; the
        # parent constructor likely already calls it - TODO confirm whether
        # this second call is still needed.
        self.post_init()
    
    # def reset_loss_weights(self, pointer_loss_weight, lm_loss_weight):
    #     self.pointer_loss_weight = pointer_loss_weight
    #     self.lm_loss_weight = lm_loss_weight
   
    def forward(self,
                input_ids: torch.LongTensor = None, # (batch_size, seq_len)
                attention_mask: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.LongTensor] = None,
                past_key_values: Optional[List[torch.FloatTensor]] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                use_cache: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                pixel_values: Optional[torch.Tensor] = None,
                pixel_values_videos: Optional[torch.FloatTensor] = None,
                image_grid_thw: Optional[torch.LongTensor] = None,
                video_grid_thw: Optional[torch.LongTensor] = None,
                rope_deltas: Optional[torch.LongTensor] = None,
                cache_position: Optional[torch.LongTensor] = None,
                second_per_grid_ts: Optional[torch.Tensor] = None,
                # Grounding
                teacher_map = None,
                neg_map = None,
                image_token_index = None,  #[start_index, end_index]
                text_token_index = None,   #[start_index, end_index]
                image_path = None,
                verbose: bool = False) -> Union[Tuple, QwenVLwithVisionHeadOutputWithPast]:
        
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        
        if verbose:
            rank0_print(f"input_ids: {input_ids.shape}, {input_ids[0][:5]}...")
            rank0_print(f"labels: {labels.shape}, {labels[0][:5]}...")
            rank0_print(f"pixel_values: {pixel_values.shape}")
            rank0_print(f"image_grid_thw: {image_grid_thw.shape}, {image_grid_thw}")
            rank0_print(f"return_dict: {return_dict}")

        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids) # shape: (batch_size, seq_len, d_model)
            if pixel_values is not None:
                pixel_values = pixel_values.type(self.visual.dtype)
                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                n_image_features = image_embeds.shape[0]
                if n_image_tokens != n_image_features:
                    raise ValueError(
                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                    )
                image_mask = (
                    (input_ids == self.config.image_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

            if pixel_values_videos is not None:
                pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
                n_video_features = video_embeds.shape[0]
                if n_video_tokens != n_video_features:
                    raise ValueError(
                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                    )
                video_mask = (
                    (input_ids == self.config.video_token_id)
                    .unsqueeze(-1)
                    .expand_as(inputs_embeds)
                    .to(inputs_embeds.device)
                )
                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)

        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
            # calculate RoPE index once per generation in the pre-fill stage only
            if (
                (cache_position is not None and cache_position[0] == 0)
                or self.rope_deltas is None
                or (past_key_values is None or past_key_values.get_seq_length() == 0)
            ):
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids, image_grid_thw, video_grid_thw, attention_mask
                )
                self.rope_deltas = rope_deltas
            # then use the prev pre-calculated rope-deltas to get the correct position ids
            else:
                batch_size, seq_length, _ = inputs_embeds.shape
                delta = cache_position[0] + self.rope_deltas if cache_position is not None else 0
                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
                if cache_position is not None:  # otherwise `deltas` is an int `0`
                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
                    delta = delta.to(position_ids.device)
                position_ids = position_ids.add(delta)
                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)

        outputs = self.model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=True,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0] # shape: (batch_size, seq_len, d_model)
        logits = self.lm_head(hidden_states)
        
        # ---------------------
        # Get student_map & teacher_map
        # ---------------------
        attn_score_map = outputs['attentions'][0]  
        bs = attn_score_map.size(0)

        student_maps = []
        teacher_maps = []
        neg_maps = []

        for i in range(bs):
            cur_student_map = attn_score_map[
                i,
                :,
                text_token_index[i][0]:text_token_index[i][1] + 1,
                image_token_index[i][0]:image_token_index[i][1] + 1
            ].mean(dim=0)  # [t_len, i_len]
            student_maps.append(cur_student_map)
            
            cur_teacher_map = teacher_map[i][
                text_token_index[i][0]:text_token_index[i][1] + 1,
                image_token_index[i][0]:image_token_index[i][1] + 1
            ]
            teacher_maps.append(cur_teacher_map)

            cur_neg_map = []
            for j in range(len(neg_map[i])):
                cur_neg_map_j = neg_map[i][j]
                cur_neg_map.append(cur_neg_map_j)
            neg_maps.append(cur_neg_map)


        # ---------------------
        # Padding 
        # ---------------------
        max_t_len = max(m.shape[0] for m in student_maps)
        max_i_len = max(m.shape[1] for m in student_maps)

        padded_student = []
        padded_teacher = []
        padded_neg = []
        mask = []

        for s_map, t_map, n_map in zip(student_maps, teacher_maps, neg_maps):
            t_pad = max_t_len - s_map.shape[0]
            i_pad = max_i_len - s_map.shape[1]
            padded_student.append(F.pad(s_map, (0, i_pad, 0, t_pad)))
            padded_teacher.append(F.pad(t_map, (0, i_pad, 0, t_pad)))
            padded_neg.append(torch.stack([F.pad(n, (0, i_pad)) for n in n_map]))
            mask.append(F.pad(torch.ones_like(s_map), (0, i_pad, 0, t_pad)))  # 1 means valid position

        padded_student = torch.stack(padded_student)  # [B, T_max, I_max]
        padded_teacher = torch.stack(padded_teacher)  # [B, T_max, I_max]
        padded_neg = torch.stack(padded_neg)          # [B, neg_num, I_max]
        mask = torch.stack(mask)  # [B, T_max, I_max]

        # ---------------------
        # KL Loss
        # ---------------------

        padded_student = padded_student.float()
        padded_teacher = padded_teacher.float()
        padded_student = padded_student / (padded_student.sum(dim=-1, keepdim=True) + 1e-8)
        padded_teacher = padded_teacher / (padded_teacher.sum(dim=-1, keepdim=True) + 1e-8)
        
        kl_per_elem = F.kl_div(padded_student.log(), padded_teacher, reduction='none')
        kl_loss = (kl_per_elem * mask).sum()

        ###---------------------
        ### InfoNCE Loss
        ###---------------------
        temp = 0.05
        B, T_max, I_max = padded_student.shape
        d = T_max * I_max
        
        padded_student = padded_student.mean(dim=1).float()
        padded_teacher = padded_teacher.mean(dim=1).float()
        padded_neg = padded_neg.float()

        padded_student = padded_student / (padded_student.sum(dim=-1, keepdim=True) + 1e-8)
        padded_teacher = padded_teacher / (padded_teacher.sum(dim=-1, keepdim=True) + 1e-8)
        padded_neg = padded_neg / (padded_neg.sum(dim=-1, keepdim=True) + 1e-8)

        # Flatten
        stu_vecs = padded_student.view(B, -1)  
        pos_vecs = padded_teacher.view(B, -1)  

        # Positive sample similarity
        pos_sims = F.cosine_similarity(stu_vecs, pos_vecs, dim=-1)  

        # Compute negative sample similarity
        neg_sample_num = padded_neg.size(1)  
        neg_vecs = padded_neg.view(B, neg_sample_num, -1)  
        stu_vecs_exp = stu_vecs.unsqueeze(1).expand(-1, neg_sample_num, -1)  
        neg_sims = F.cosine_similarity(stu_vecs_exp, neg_vecs, dim=-1)  

        # Concatenate logits and compute InfoNCE
        infonce_logits = torch.cat([pos_sims.unsqueeze(1), neg_sims], dim=1)  
        infonce_logits = infonce_logits / temp
        infonce_labels = torch.zeros(B, dtype=torch.long, device=padded_student.device)
        infonce_loss = F.cross_entropy(infonce_logits, infonce_labels)


        our_loss = kl_loss * 50 + infonce_loss * 0.2
        
        lm_loss = None
        if labels is not None:
            # Upcast to float if we need to compute the loss to avoid potential precision issues
            logits = logits.float()
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            lm_loss = loss_fct(shift_logits, shift_labels)

        
        total_loss = lm_loss + our_loss

        if return_dict:
            return QwenVLwithVisionHeadOutputWithPast(
                lm_loss=lm_loss,
                loss=total_loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
                rope_deltas=self.rope_deltas,
            )
        else:
            # When labels are provided, parent's forward returns a tuple with loss as the first element.
            if labels is not None:
                # Replace the LM loss with the combined loss.
                output = (lm_loss, pointer_loss, logits, pointer_scores,) + outputs[1:]
                print(f"returning: total_loss, logits, pointer_scores, ...")
                return (total_loss,) + output if total_loss
