import torch
from torch.nn import functional as F
from torch.nn.modules import loss


class DistributionLoss(loss._Loss):
    """Distillation loss between student logits and teacher probabilities.

    ``model_output`` must be the pre-softmax output (logits) of the student
    network and ``real_output`` the teacher's probability scores; both are
    NxC tensors and every row of ``real_output`` must sum to one.

    Two objectives are available through the ``loss`` argument of
    :meth:`forward`:

    * ``"crossentropy"`` (default): ``-sum_c real[c] * log_softmax(model)[c]``
      per sample. This differs from the KL divergence only by the entropy of
      the teacher distribution, which is constant w.r.t. the student.
    * any other value: element-wise L1 distance between the student's
      softmax probabilities and the teacher's probabilities.

    The result is reduced according to the ``reduction`` configured at
    construction time (``'mean'`` by default, also ``'sum'`` or ``'none'``).
    """

    def forward(self, model_output, real_output, loss="crossentropy"):
        """Compute the distillation loss.

        Args:
            model_output: student logits, shape NxC.
            real_output: teacher probabilities, shape NxC; must not require
                gradients.
            loss: ``"crossentropy"`` selects the soft-label cross-entropy;
                any other value selects the L1 loss on probabilities.
                (Inside this method the name shadows the imported ``loss``
                module; it is kept for backward compatibility.)

        Returns:
            The reduced loss tensor (a scalar for ``'mean'`` / ``'sum'``).

        Raises:
            ValueError: if ``real_output`` requires gradients, or if the
                configured reduction is not one of ``'mean'``, ``'sum'``,
                ``'none'``.
        """
        if real_output.requires_grad:
            raise ValueError("real network output should not require gradients.")

        if loss == "crossentropy":
            # Per-sample dot product of teacher probabilities and student
            # log-probabilities, negated.
            log_prob = F.log_softmax(model_output, dim=1)
            values = -(real_output * log_prob).sum(dim=1)
        else:
            # Compare probabilities with probabilities: the student logits go
            # through softmax (not log_softmax) so both operands live on the
            # same scale and the loss is zero for a perfect match.
            prob = F.softmax(model_output, dim=1)
            values = (prob - real_output).abs()

        return self._reduce(values)

    def _reduce(self, values):
        """Apply the module's configured reduction to ``values``."""
        # Older torch versions may not define ``reduction``; fall back to the
        # historical behaviour of this loss (batch mean).
        reduction = getattr(self, "reduction", "mean")
        if reduction == "mean":
            return values.mean()
        if reduction == "sum":
            return values.sum()
        if reduction == "none":
            return values
        raise ValueError("unsupported reduction: {!r}".format(reduction))
