# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import torch

from semilearn.algorithms.utils import concat_all_gather
from semilearn.algorithms.hooks import MaskingHook
from semilearn.core.hooks import Hook


class WeightingHook(MaskingHook):
    """
    Learnable truncated Gaussian weighting

    Tracks exponential moving averages (EMA) of the mean and variance of two
    confidence statistics of the predictions on unlabeled data:

    * the maximum class probability, and
    * the gap between the largest and second largest class probability.

    ``masking`` converts both statistics into a weight
    ``exp(-clamp(p - mu, max=0)^2 / (2 * var / n_sigma^2))``: samples at or
    above the running mean get weight 1, samples below it are down-weighted
    following a Gaussian. The returned weight is the product of both terms.

    Args:
        num_classes: number of classes; sizes the per-class statistics and
            sets the initial mean to ``1 / num_classes``.
        n_sigma: scales the Gaussian width (the variance is divided by
            ``n_sigma ** 2``).
        momentum: EMA momentum applied to the running statistics.
        per_class: if True, keep one mean/variance per predicted class
            instead of a single global pair.
        *args, **kwargs: forwarded to ``MaskingHook.__init__``.
    """

    # EMA state tensors that have to live on the same device as the inputs.
    _STAT_NAMES = ('prob_max_mu_t', 'prob_max_var_t', 'prob_gap_mu_t', 'prob_gap_var_t')

    def __init__(self, num_classes, n_sigma=2, momentum=0.999, per_class=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_classes = num_classes
        self.n_sigma = n_sigma
        self.per_class = per_class
        self.m = momentum

        # initialize Gaussian mean and variance
        if not self.per_class:
            self.prob_max_mu_t = torch.tensor(1.0 / self.num_classes)
            self.prob_max_var_t = torch.tensor(1.0)
            self.prob_gap_mu_t = torch.tensor(1.0 / self.num_classes)
            self.prob_gap_var_t = torch.tensor(1.0)
        else:
            self.prob_max_mu_t = torch.ones((self.num_classes)) / self.num_classes
            self.prob_max_var_t = torch.ones((self.num_classes))
            self.prob_gap_mu_t = torch.ones((self.num_classes)) / self.num_classes
            self.prob_gap_var_t = torch.ones((self.num_classes))

    def _ema(self, running, batch_value):
        """Blend ``batch_value`` into ``running`` with momentum ``self.m``."""
        return self.m * running + (1 - self.m) * batch_value

    def _per_class_stats(self, values, labels, mu_like, var_like):
        """Return per-class batch mean and unbiased variance of ``values``.

        Classes with fewer than two samples in the batch keep the defaults
        (mean 0, variance 1), because an unbiased variance is undefined there.
        """
        batch_mu = torch.zeros_like(mu_like)
        batch_var = torch.ones_like(var_like)
        for cls in range(self.num_classes):
            selected = values[labels == cls]
            if len(selected) > 1:
                batch_mu[cls] = torch.mean(selected)
                batch_var[cls] = torch.var(selected, unbiased=True)
        return batch_mu, batch_var

    def _truncated_gaussian(self, values, mu, var):
        """Weight that is 1 for ``values >= mu`` and decays like a Gaussian below it."""
        below_mean = torch.clamp(values - mu, max=0.0)
        return torch.exp(-((below_mean ** 2) / (2 * var / (self.n_sigma ** 2))))

    @torch.no_grad()
    def update(self, algorithm, probs_x_ulb):
        """Update the running statistics from a batch of class probabilities.

        Args:
            algorithm: object exposing ``distributed`` and ``world_size``.
            probs_x_ulb: class probabilities with classes on the last dim.

        Returns:
            Tuple ``(max_probs, max_idx)`` computed on the (possibly gathered)
            probabilities.
        """
        if algorithm.distributed and algorithm.world_size > 1:
            # Use the module-level helper (as DistAlignEMAHook does); nothing
            # in this class defines a ``concat_all_gather`` method.
            probs_x_ulb = concat_all_gather(probs_x_ulb)
        max_probs, max_idx = probs_x_ulb.max(dim=-1)
        top2_probs, _ = torch.topk(probs_x_ulb, k=2, dim=-1)
        gap_probs = top2_probs[..., 0] - top2_probs[..., 1]

        if not self.per_class:
            num_samples = max_probs.numel()
            if num_samples > 0:
                self.prob_max_mu_t = self._ema(self.prob_max_mu_t, torch.mean(max_probs).item())
                self.prob_gap_mu_t = self._ema(self.prob_gap_mu_t, torch.mean(gap_probs).item())
            if num_samples > 1:
                # The unbiased variance of a single sample is NaN and would
                # permanently poison the EMA, so skip the update in that case.
                self.prob_max_var_t = self._ema(
                    self.prob_max_var_t, torch.var(max_probs, unbiased=True).item())
                self.prob_gap_var_t = self._ema(
                    self.prob_gap_var_t, torch.var(gap_probs, unbiased=True).item())
        else:
            max_mu, max_var = self._per_class_stats(
                max_probs, max_idx, self.prob_max_mu_t, self.prob_max_var_t)
            self.prob_max_mu_t = self._ema(self.prob_max_mu_t, max_mu)
            self.prob_max_var_t = self._ema(self.prob_max_var_t, max_var)

            gap_mu, gap_var = self._per_class_stats(
                gap_probs, max_idx, self.prob_gap_mu_t, self.prob_gap_var_t)
            self.prob_gap_mu_t = self._ema(self.prob_gap_mu_t, gap_mu)
            self.prob_gap_var_t = self._ema(self.prob_gap_var_t, gap_var)
        return max_probs, max_idx

    @torch.no_grad()
    def masking(self, algorithm, logits_x_ulb, softmax_x_ulb=True, *args, **kwargs):
        """Compute per-sample weights for the unlabeled batch.

        Updates the running statistics with this batch first, then evaluates
        the truncated Gaussian weights with the updated statistics.

        Args:
            algorithm: passed through to ``update``.
            logits_x_ulb: logits (or probabilities, see ``softmax_x_ulb``)
                with classes on the last dim.
            softmax_x_ulb: if True, apply softmax to ``logits_x_ulb``;
                otherwise treat it as probabilities already.

        Returns:
            Tensor of weights, product of the max-probability weight and the
            top-2 gap weight.
        """
        # Keep the running statistics on the same device as the inputs
        # (any device type, not only CUDA).
        device = logits_x_ulb.device
        for name in self._STAT_NAMES:
            stat = getattr(self, name)
            if stat.device != device:
                setattr(self, name, stat.to(device))

        if softmax_x_ulb:
            probs_x_ulb = torch.softmax(logits_x_ulb.detach(), dim=-1)
        else:
            # logits is already probs
            probs_x_ulb = logits_x_ulb.detach()

        self.update(algorithm, probs_x_ulb)

        # Recomputed on the local batch: the values returned by ``update`` may
        # cover the batch gathered from all processes.
        max_probs, max_idx = probs_x_ulb.max(dim=-1)
        top2_probs, _ = torch.topk(probs_x_ulb, k=2, dim=-1)
        gap_probs = top2_probs[..., 0] - top2_probs[..., 1]

        # compute weight
        if not self.per_class:
            max_mu = self.prob_max_mu_t
            max_var = self.prob_max_var_t
            gap_mu = self.prob_gap_mu_t
            gap_var = self.prob_gap_var_t
        else:
            # pick the statistics of each sample's predicted class
            max_mu = self.prob_max_mu_t[max_idx]
            max_var = self.prob_max_var_t[max_idx]
            gap_mu = self.prob_gap_mu_t[max_idx]
            gap_var = self.prob_gap_var_t[max_idx]
        max_mask = self._truncated_gaussian(max_probs, max_mu, max_var)
        gap_mask = self._truncated_gaussian(gap_probs, gap_mu, gap_var)
        return max_mask * gap_mask

class DistAlignEMAHook(Hook):
    """
    Distribution Alignment Hook for conducting distribution alignment

    Keeps an exponential moving average (EMA) of the mean predicted class
    distribution on unlabeled data (``p_model``) and rescales predictions by
    ``p_target / p_model`` before renormalizing them.

    Args:
        num_classes: number of classes; sizes the default target distribution.
        momentum: EMA momentum for ``p_model`` (and ``p_target`` when it is
            updated online).
        p_target_type: one of ``'uniform'`` (fixed uniform target),
            ``'model'`` (starts uniform, EMA-updated from the labeled-batch
            predictions passed to ``dist_align``) or ``'gt'`` (fixed,
            user-supplied ``p_target``).
        p_target: target distribution, required when ``p_target_type`` is
            ``'gt'``; ignored otherwise.
    """
    def __init__(self, num_classes, momentum=0.999, p_target_type='uniform', p_target=None):
        super().__init__()
        self.num_classes = num_classes
        self.m = momentum

        # p_target
        self.update_p_target, self.p_target = self.set_p_target(p_target_type, p_target)
        print('distribution alignment p_target:', self.p_target)
        # p_model is created lazily from the first batch seen by update_p
        self.p_model = None

    @torch.no_grad()
    def dist_align(self, algorithm, probs_x_ulb, probs_x_lb=None):
        """Align unlabeled predictions to the target distribution.

        Args:
            algorithm: object exposing ``distributed`` and ``world_size``.
            probs_x_ulb: class probabilities of the unlabeled batch, classes
                on the last dim.
            probs_x_lb: class probabilities of the labeled batch; required
                when the target distribution is updated online.

        Returns:
            ``probs_x_ulb`` rescaled by ``p_target / p_model`` and
            renormalized to sum to one over the last dim.
        """
        # update running distributions
        self.update_p(algorithm, probs_x_ulb, probs_x_lb)

        # dist align; the 1e-6 terms guard against division by zero
        probs_x_ulb_aligned = probs_x_ulb * (self.p_target + 1e-6) / (self.p_model + 1e-6)
        probs_x_ulb_aligned = probs_x_ulb_aligned / probs_x_ulb_aligned.sum(dim=-1, keepdim=True)
        return probs_x_ulb_aligned

    @torch.no_grad()
    def update_p(self, algorithm, probs_x_ulb, probs_x_lb):
        """Update ``p_model`` (and ``p_target`` if enabled) from the current batch."""
        # Keep p_target on the same device as the inputs (any device type,
        # not only CUDA).
        if self.p_target.device != probs_x_ulb.device:
            self.p_target = self.p_target.to(probs_x_ulb.device)

        if algorithm.distributed and algorithm.world_size > 1:
            if probs_x_lb is not None and self.update_p_target:
                probs_x_lb = concat_all_gather(probs_x_lb)
            probs_x_ulb = concat_all_gather(probs_x_ulb)

        probs_x_ulb = probs_x_ulb.detach()
        batch_p_model = torch.mean(probs_x_ulb, dim=0)
        # Identity check: ``== None`` on a tensor is an element-wise style
        # comparison and not a reliable "unset" test.
        if self.p_model is None:
            self.p_model = batch_p_model
        else:
            self.p_model = self.p_model * self.m + batch_p_model * (1 - self.m)

        if self.update_p_target:
            assert probs_x_lb is not None
            self.p_target = self.p_target * self.m + torch.mean(probs_x_lb, dim=0) * (1 - self.m)

    def set_p_target(self, p_target_type='uniform', p_target=None):
        """Build the initial target distribution.

        Args:
            p_target_type: ``'uniform'``, ``'gt'`` or ``'model'``.
            p_target: user-supplied distribution, used only for ``'gt'``.
                May be a tensor, a NumPy array or a sequence of numbers.

        Returns:
            Tuple ``(update_p_target, p_target)``: whether the target is
            updated online, and the target distribution as a tensor.
        """
        assert p_target_type in ['uniform', 'gt', 'model']

        # p_target
        update_p_target = False
        if p_target_type == 'uniform':
            p_target = torch.ones((self.num_classes, )) / self.num_classes
        elif p_target_type == 'model':
            p_target = torch.ones((self.num_classes, )) / self.num_classes
            update_p_target = True
        else:
            assert p_target is not None
            # torch.as_tensor converts NumPy arrays (sharing memory, like
            # torch.from_numpy) and sequences, and returns tensors unchanged,
            # without requiring numpy to be imported in this module.
            p_target = torch.as_tensor(p_target)

        return update_p_target, p_target