import torch
import torch.nn.functional as F

import torch.nn as nn



def forward_kl(logits, teacher_logits, attention_mask, temperature=1.0, **_):
    """Masked forward-KL distillation loss, computed as a cross-entropy.

    At every position this evaluates ``-sum_c p_teacher(c) * log p_student(c)``
    with both distributions softened by ``temperature``, then averages the
    result over the positions weighted by ``attention_mask``.  The teacher
    entropy term of the full KL divergence is not included; it is constant
    with respect to the student, so only the reported value differs.

    Args:
        logits: Student logits, expected as ``[batch, seq_len, num_classes]``.
        teacher_logits: Teacher logits with the same shape as ``logits``.
        attention_mask: Per-position weights multiplied into the loss
            (presumably a 0/1 mask shaped ``[batch, seq_len]`` -- confirm
            against the caller).
        temperature: Divisor applied to both sets of logits before the
            softmax.
        **_: Ignored; lets all losses in this module be called with the same
            keyword arguments.

    Returns:
        Scalar tensor with the weighted mean loss.  When ``attention_mask``
        sums to zero the result is ``0`` instead of ``NaN``.
    """
    # Normalise in at least float32 (as the other losses in this module do)
    # so half-precision logits do not lose accuracy over a large class axis;
    # promote_types keeps float64 inputs in float64.
    teacher_probs = F.softmax(
        teacher_logits / temperature,
        dim=-1,
        dtype=torch.promote_types(teacher_logits.dtype, torch.float32),
    )
    student_logprobs = F.log_softmax(
        logits / temperature,
        dim=-1,
        dtype=torch.promote_types(logits.dtype, torch.float32),
    )
    token_loss = -(teacher_probs * student_logprobs).sum(dim=-1)

    # An all-zero mask would otherwise produce 0 / 0 = NaN and poison the
    # optimiser step; substitute 1 only in that case so every other result is
    # unchanged.
    mask_total = attention_mask.sum()
    safe_total = torch.where(
        mask_total == 0, torch.ones_like(mask_total), mask_total
    )
    return (token_loss * attention_mask).sum() / safe_total


def reverse_kl(logits, teacher_logits, labels, **_):
    """Masked reverse KL divergence ``KL(student || teacher)``.

    Per position this sums ``p_student * (log p_student - log p_teacher)``
    over the last axis, skipping every class whose student or teacher logit
    is infinite, and averages over the positions whose label is not ``-100``.

    Args:
        logits: Student logits; the last axis is the class axis.
        teacher_logits: Teacher logits with the same shape as ``logits``.
        labels: Integer labels; positions equal to ``-100`` are excluded.
            Assumed to match the leading dimensions of ``logits`` (both are
            flattened and multiplied element-wise).
        **_: Ignored; lets all losses in this module be called with the same
            keyword arguments.

    Returns:
        Scalar float32 tensor.  When no label is valid the result is ``0``
        instead of ``NaN``.
    """
    student_logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    teacher_logprobs = F.log_softmax(teacher_logits, dim=-1, dtype=torch.float32)
    student_probs = F.softmax(logits, dim=-1, dtype=torch.float32)

    # Classes with an infinite logit yield inf/NaN products (e.g. 0 * -inf);
    # they are zeroed so they contribute nothing to the sum.
    inf_mask = torch.isinf(teacher_logits) | torch.isinf(logits)
    per_class = student_probs * (student_logprobs - teacher_logprobs)
    token_kl = per_class.masked_fill(inf_mask, 0).sum(dim=-1).reshape(-1)

    mask = (labels != -100).int().reshape(-1)
    # clamp keeps an all-ignored batch from dividing 0 by 0; the mask is 0/1,
    # so any batch with at least one valid position is unaffected.
    return (token_kl * mask).sum() / mask.sum().clamp(min=1)


def symmetric_kl(logits, teacher_logits, labels, lambda_=0.5, **_):
    """Convex combination of the forward and reverse KL losses.

    Returns ``(1 - lambda_) * forward_kl + lambda_ * reverse_kl``, both
    averaged over the positions whose label is not ``-100``.

    Args:
        logits: Student logits; the last axis is the class axis.
        teacher_logits: Teacher logits with the same shape as ``logits``.
        labels: Integer labels; positions equal to ``-100`` are excluded.
        lambda_: Weight of the reverse-KL term; the forward term gets
            ``1 - lambda_``.
        **_: Ignored; lets all losses in this module be called with the same
            keyword arguments.

    Returns:
        Scalar tensor with the combined loss.
    """
    # forward_kl expects a per-position weight mask, not labels.  Passing the
    # raw labels would scale every position by its token id (and by -100 on
    # ignored positions), so derive the same validity mask reverse_kl uses.
    valid_positions = (labels != -100).int()
    for_kl = forward_kl(logits, teacher_logits, attention_mask=valid_positions)
    rev_kl = reverse_kl(logits, teacher_logits, labels)
    return (1 - lambda_) * for_kl + lambda_ * rev_kl


def js_distance(logits, teacher_logits, labels, lambda_=0.5, **_):
    """Masked generalised Jensen-Shannon divergence between student and teacher.

    With ``m = lambda_ * p_teacher + (1 - lambda_) * p_student`` the loss is
    ``(1 - lambda_) * KL(p_student || m) + lambda_ * KL(p_teacher || m)``,
    averaged over the positions whose label is not ``-100``.  Classes whose
    student or teacher logit is infinite are skipped.

    Args:
        logits: Student logits; the last axis is the class axis.
        teacher_logits: Teacher logits with the same shape as ``logits``.
        labels: Integer labels; positions equal to ``-100`` are excluded.
            Assumed to match the leading dimensions of ``logits``.
        lambda_: Mixture weight of the teacher distribution.
        **_: Ignored; lets all losses in this module be called with the same
            keyword arguments.

    Returns:
        Scalar float32 tensor.  When no label is valid the result is ``0``
        instead of ``NaN``.
    """
    teacher_logprobs = F.log_softmax(teacher_logits, dim=-1, dtype=torch.float32)
    student_logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    teacher_probs = teacher_logprobs.exp()
    student_probs = student_logprobs.exp()

    # Build log(m) in log space.  Taking log() of the mixed probabilities
    # returns -inf wherever both probabilities underflow to 0 (finite but very
    # negative logits), and 0 * -inf then turns the whole loss into NaN.
    weight = torch.as_tensor(lambda_, dtype=torch.float32, device=logits.device)
    mixed_logprobs = torch.logaddexp(
        torch.log(weight) + teacher_logprobs,
        torch.log1p(-weight) + student_logprobs,
    )

    # Classes with an infinite logit still produce inf/NaN products; zero them.
    inf_mask = torch.isinf(logits) | torch.isinf(teacher_logits)
    mask = (labels != -100).int().reshape(-1)
    # clamp keeps an all-ignored batch from dividing 0 by 0.
    valid = mask.sum().clamp(min=1)

    student_kl = (student_probs * (student_logprobs - mixed_logprobs)).masked_fill(inf_mask, 0)
    teacher_kl = (teacher_probs * (teacher_logprobs - mixed_logprobs)).masked_fill(inf_mask, 0)
    student_term = (student_kl.sum(dim=-1).reshape(-1) * mask).sum() / valid
    teacher_term = (teacher_kl.sum(dim=-1).reshape(-1) * mask).sum() / valid
    return (1 - lambda_) * student_term + lambda_ * teacher_term


def tv_distance(logits, teacher_logits, labels, **_):
    """Masked total-variation distance between student and teacher.

    Per position this is ``0.5 * sum_c |p_teacher(c) - p_student(c)|``,
    skipping classes whose student or teacher logit is infinite, averaged
    over the positions whose label is not ``-100``.

    Args:
        logits: Student logits; the last axis is the class axis.
        teacher_logits: Teacher logits with the same shape as ``logits``.
        labels: Integer labels; positions equal to ``-100`` are excluded.
            Assumed to match the leading dimensions of ``logits``.
        **_: Ignored; lets all losses in this module be called with the same
            keyword arguments.

    Returns:
        Scalar float32 tensor.  When no label is valid the result is ``0``
        instead of ``NaN``.
    """
    teacher_probs = F.softmax(teacher_logits, dim=-1, dtype=torch.float32)
    student_probs = F.softmax(logits, dim=-1, dtype=torch.float32)

    inf_mask = torch.isinf(logits) | torch.isinf(teacher_logits)
    abs_diff = (teacher_probs - student_probs).abs().masked_fill(inf_mask, 0)
    token_tv = 0.5 * abs_diff.sum(dim=-1).reshape(-1)

    mask = (labels != -100).int().reshape(-1)
    # clamp keeps an all-ignored batch from dividing 0 by 0; the mask is 0/1,
    # so any batch with at least one valid position is unaffected.
    return (token_tv * mask).sum() / mask.sum().clamp(min=1)


def skewed_forward_kl(logits, teacher_logits, labels, lambda_=0.1, **_):
    """Masked skewed forward-KL loss, computed as a cross-entropy.

    With ``m = lambda_ * p_teacher + (1 - lambda_) * p_student`` this
    evaluates ``-sum_c p_teacher(c) * log m(c)`` per position, skipping
    classes whose student or teacher logit is infinite, and averages over the
    positions whose label is not ``-100``.  The teacher-entropy term of the
    full KL divergence is not included.

    Args:
        logits: Student logits; the last axis is the class axis.
        teacher_logits: Teacher logits with the same shape as ``logits``.
        labels: Integer labels; positions equal to ``-100`` are excluded.
            Assumed to match the leading dimensions of ``logits``.
        lambda_: Share of the teacher distribution mixed into the target of
            the logarithm.
        **_: Ignored; lets all losses in this module be called with the same
            keyword arguments.

    Returns:
        Scalar float32 tensor.  When no label is valid the result is ``0``
        instead of ``NaN``.
    """
    teacher_logprobs = F.log_softmax(teacher_logits, dim=-1, dtype=torch.float32)
    student_logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    teacher_probs = teacher_logprobs.exp()

    # Build log(m) in log space.  Taking log() of the mixed probabilities
    # returns -inf wherever both probabilities underflow to 0 (finite but very
    # negative logits), and 0 * -inf then turns the whole loss into NaN.
    weight = torch.as_tensor(lambda_, dtype=torch.float32, device=logits.device)
    mixed_logprobs = torch.logaddexp(
        torch.log(weight) + teacher_logprobs,
        torch.log1p(-weight) + student_logprobs,
    )

    # Classes with an infinite logit still produce inf/NaN products; zero them.
    inf_mask = torch.isinf(logits) | torch.isinf(teacher_logits)
    weighted = (teacher_probs * mixed_logprobs).masked_fill(inf_mask, 0)
    token_loss = -weighted.sum(dim=-1).reshape(-1)

    mask = (labels != -100).int().reshape(-1)
    # clamp keeps an all-ignored batch from dividing 0 by 0.
    return (token_loss * mask).sum() / mask.sum().clamp(min=1)


def skewed_reverse_kl(logits, teacher_logits, labels, lambda_=0.1, **_):
    """Masked skewed reverse KL divergence ``KL(student || mixture)``.

    With ``m = (1 - lambda_) * p_teacher + lambda_ * p_student`` this
    evaluates ``sum_c p_student(c) * (log p_student(c) - log m(c))`` per
    position, skipping classes whose student or teacher logit is infinite,
    and averages over the positions whose label is not ``-100``.

    Args:
        logits: Student logits; the last axis is the class axis.
        teacher_logits: Teacher logits with the same shape as ``logits``.
        labels: Integer labels; positions equal to ``-100`` are excluded.
            Assumed to match the leading dimensions of ``logits``.
        lambda_: Share of the student distribution mixed into the reference
            distribution.
        **_: Ignored; lets all losses in this module be called with the same
            keyword arguments.

    Returns:
        Scalar float32 tensor.  When no label is valid the result is ``0``
        instead of ``NaN``.
    """
    teacher_logprobs = F.log_softmax(teacher_logits, dim=-1, dtype=torch.float32)
    student_logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    student_probs = student_logprobs.exp()

    # Build log(m) in log space.  Taking log() of the mixed probabilities
    # returns -inf wherever both probabilities underflow to 0 (finite but very
    # negative logits), and 0 * -inf then turns the whole loss into NaN.
    weight = torch.as_tensor(lambda_, dtype=torch.float32, device=logits.device)
    mixed_logprobs = torch.logaddexp(
        torch.log1p(-weight) + teacher_logprobs,
        torch.log(weight) + student_logprobs,
    )

    # Classes with an infinite logit still produce inf/NaN products; zero them.
    inf_mask = torch.isinf(logits) | torch.isinf(teacher_logits)
    per_class = (student_probs * (student_logprobs - mixed_logprobs)).masked_fill(inf_mask, 0)
    token_kl = per_class.sum(dim=-1).reshape(-1)

    mask = (labels != -100).int().reshape(-1)
    # clamp keeps an all-ignored batch from dividing 0 by 0.
    return (token_kl * mask).sum() / mask.sum().clamp(min=1)


# Registry of the distillation losses defined above, keyed by the short name
# that `get_loss_fn` accepts.
_LOSS_FN_MAP = {
    "kl": forward_kl,
    "reverse_kl": reverse_kl,
    "symmetric_kl": symmetric_kl,
    "jsd": js_distance,
    "tvd": tv_distance,
    "skew_kl": skewed_forward_kl,
    "skew_reverse_kl": skewed_reverse_kl,
}


def get_loss_fn(name):
    """Return the distillation loss function registered under ``name``.

    Args:
        name: A key of ``_LOSS_FN_MAP``.

    Returns:
        The loss function stored under that key.

    Raises:
        KeyError: If ``name`` is not a registered loss name.
    """
    try:
        return _LOSS_FN_MAP[name]
    except KeyError:
        # `from None` drops the bare dict-lookup KeyError from the traceback so
        # callers see only the descriptive message.
        raise KeyError(f"Loss function {name} is not defined") from None
