# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math
from dataclasses import dataclass, field

import torch
from fairseq import utils
from fairseq.logging import metrics
from fairseq.criterions import FairseqCriterion, register_criterion
from fairseq.dataclass import FairseqDataclass
from omegaconf import II
import torch.nn.functional as F
import pdb

@dataclass
class MyLabelSmoothedCrossEntropyCriterionConfig(FairseqDataclass):
    """Configuration fields for the ``my_label_smoothed_cross_entropy`` criterion.

    Each field's ``metadata["help"]`` string describes the option; the
    comments below only add context that the help text does not carry.
    """

    # Smoothing mass (epsilon); the default of 0.0 disables smoothing.
    label_smoothing: float = field(
        default=0.0,
        metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"},
    )
    # When True, the criterion also logs "n_correct" and "total" counts.
    report_accuracy: bool = field(
        default=False,
        metadata={"help": "report accuracy metric"},
    )
    # Number of leading target positions dropped before the loss is computed.
    ignore_prefix_size: int = field(
        default=0,
        metadata={"help": "Ignore first N tokens"},
    )
    # Not set directly: declared as an omegaconf interpolation (``II``) of the
    # ``optimization.sentence_avg`` config key.
    sentence_avg: bool = II("optimization.sentence_avg")


def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True):
    """Compute label-smoothed negative log-likelihood.

    Args:
        lprobs: log-probabilities whose last dimension indexes the classes.
        target: class indices, either with the same rank as ``lprobs``
            (trailing dimension of size 1) or with one dimension fewer.
        epsilon: smoothing mass; ``0`` makes the loss equal to the plain NLL.
        ignore_index: if not ``None``, positions whose target equals this
            value contribute zero to both returned terms.
        reduce: when true, both terms are summed over all positions.

    Returns:
        A ``(loss, nll_loss)`` tuple. With ``reduce=False`` the tensors keep
        a trailing singleton dimension when ``ignore_index`` is given and
        have it squeezed away otherwise.
    """
    # Align the target rank with lprobs so it can be used as a gather index.
    if target.dim() + 1 == lprobs.dim():
        target = target.unsqueeze(-1)

    # Negative log-probability of the gold class, and of all classes summed.
    gold_term = lprobs.gather(dim=-1, index=target).neg()
    uniform_term = lprobs.sum(dim=-1, keepdim=True).neg()

    if ignore_index is None:
        gold_term = gold_term.squeeze(-1)
        uniform_term = uniform_term.squeeze(-1)
    else:
        ignored = target.eq(ignore_index)
        gold_term = gold_term.masked_fill(ignored, 0.0)
        uniform_term = uniform_term.masked_fill(ignored, 0.0)

    if reduce:
        gold_term, uniform_term = gold_term.sum(), uniform_term.sum()

    # Smoothing mass is spread over the (num_classes - 1) non-gold classes.
    per_class_eps = epsilon / (lprobs.size(-1) - 1)
    smoothed = (1.0 - epsilon - per_class_eps) * gold_term + per_class_eps * uniform_term
    return smoothed, gold_term


@register_criterion(
    "my_label_smoothed_cross_entropy", dataclass=MyLabelSmoothedCrossEntropyCriterionConfig
)
class MyLabelSmoothedCrossEntropyCriterion(FairseqCriterion):
    """Label-smoothed cross entropy plus two masked MSE reconstruction terms.

    The model is expected to return a pair ``(net_output, output)``. When
    ``output["encoder_out_x_mask"]`` is not ``None``, two mean-squared-error
    terms between the masked and unmasked encoder outputs are added to the
    label-smoothed loss.

    NOTE(review): "mam"/"mim" presumably stand for masked audio / masked
    image modeling -- confirm with the model author.
    """

    def __init__(
        self,
        task,
        sentence_avg,
        label_smoothing,
        ignore_prefix_size=0,
        report_accuracy=False,
    ):
        """Store the criterion options and build the two MSE loss modules.

        Args:
            task: task object forwarded to ``FairseqCriterion.__init__``.
            sentence_avg: if true, ``forward`` reports the number of
                sentences as the sample size instead of the token count.
            label_smoothing: epsilon passed to ``label_smoothed_nll_loss``.
            ignore_prefix_size: number of leading target positions dropped
                before computing the loss and accuracy.
            report_accuracy: if true, ``forward`` also logs accuracy counts.
        """
        super().__init__(task)
        self.sentence_avg = sentence_avg
        self.eps = label_smoothing
        self.ignore_prefix_size = ignore_prefix_size
        self.report_accuracy = report_accuracy
        self.mam_mse = torch.nn.MSELoss()
        self.mim_mse = torch.nn.MSELoss()

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output, output = model(**sample["net_input"])
        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)

        # Use an identity check: the value may be a tensor, and `!= None`
        # would go through the tensor's comparison overloads.
        if output["encoder_out_x_mask"] is not None:
            loss = loss + self._masked_reconstruction_loss(output)

        sample_size = (
            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
        )
        # The logged "loss" includes the reconstruction terms; "nll_loss"
        # does not.
        logging_output = {
            "loss": loss.data,
            "nll_loss": nll_loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
        }
        if self.report_accuracy:
            n_correct, total = self.compute_accuracy(model, net_output, sample)
            logging_output["n_correct"] = utils.item(n_correct.data)
            logging_output["total"] = utils.item(total.data)

        return loss, sample_size, logging_output

    def _masked_reconstruction_loss(self, output):
        """Return the sum of the two MSE terms over the masked positions.

        Both encoder outputs are split along dim 0 into a first part of
        ``L // 2`` steps and a second part with the remaining steps; each
        part is compared at the positions selected by its masking index.

        NOTE(review): the first part is presumably the audio stream and the
        second the video stream, and the masking indices presumably arrive
        as (B, T, C) boolean masks (per the original author's comments) --
        confirm against the model. An empty selection would make the MSE
        undefined; verify that the model always masks at least one position.
        """
        encoded = output["encoder_out_x"]
        predicted = output["encoder_out_x_mask"]

        # Unpacking three values keeps the original requirement that the
        # encoder output is 3-D.
        total_len, _, _ = encoded.shape
        first_len = total_len // 2
        split_sizes = [first_len, total_len - first_len]

        e_a, e_v = torch.split(encoded, split_sizes, dim=0)
        p_a, p_v = torch.split(predicted, split_sizes, dim=0)

        # Swap the first two axes of the masking indices so they line up
        # with the layout of the split encoder outputs.
        a_index = output["audio_masking_index"].permute(1, 0, 2)
        v_index = output["video_masking_index"].permute(1, 0, 2)

        mam_loss = self.mam_mse(p_a[a_index], e_a[a_index])
        mim_loss = self.mim_mse(p_v[v_index], e_v[v_index])

        # NOTE: earlier experiments with a KL-divergence term between the two
        # streams and a contrastive audio/video term were left disabled here
        # and are not part of the loss.
        return mam_loss + mim_loss

    def get_lprobs_and_target(self, model, net_output, sample):
        """Return flattened log-probabilities and targets for ``sample``.

        When ``ignore_prefix_size`` is positive, that many leading positions
        along dim 1 are dropped from both tensors first. The result is
        ``lprobs`` reshaped to 2-D (last dimension kept) and ``target``
        reshaped to 1-D.
        """
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        target = model.get_targets(sample, net_output)
        if self.ignore_prefix_size > 0:
            # lprobs: B x T x C
            lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous()
            target = target[:, self.ignore_prefix_size :].contiguous()
        return lprobs.view(-1, lprobs.size(-1)), target.view(-1)

    def compute_loss(self, model, net_output, sample, reduce=True):
        """Return ``(loss, nll_loss)`` from ``label_smoothed_nll_loss``.

        Positions whose target equals ``self.padding_idx`` are ignored.
        """
        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
        loss, nll_loss = label_smoothed_nll_loss(
            lprobs,
            target,
            self.eps,
            ignore_index=self.padding_idx,
            reduce=reduce,
        )
        return loss, nll_loss

    def compute_accuracy(self, model, net_output, sample):
        """Return ``(n_correct, total)`` counted over non-padding targets."""
        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
        mask = target.ne(self.padding_idx)
        n_correct = torch.sum(
            lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))
        )
        total = torch.sum(mask)
        return n_correct, total

    @classmethod
    def reduce_metrics(cls, logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

        # Dividing by log(2) converts the natural-log losses to base 2.
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        metrics.log_scalar(
            "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
        )
        metrics.log_derived(
            "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
        )

        # Accuracy is only reported when some worker logged a "total" count.
        total = utils.item(sum(log.get("total", 0) for log in logging_outputs))
        if total > 0:
            metrics.log_scalar("total", total)
            n_correct = utils.item(
                sum(log.get("n_correct", 0) for log in logging_outputs)
            )
            metrics.log_scalar("n_correct", n_correct)
            metrics.log_derived(
                "accuracy",
                lambda meters: round(
                    meters["n_correct"].sum * 100.0 / meters["total"].sum, 3
                )
                if meters["total"].sum > 0
                else float("nan"),
            )

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True will improve distributed training speed.
        """
        return True