import math

import torch

from datasets import get_dataset
from utils.buffer import Buffer
from utils.args import *
from models.utils.continual_model import ContinualModel
from copy import deepcopy
from torch import nn
from torch.nn import functional as F

def get_parser() -> ArgumentParser:
    """Build the command-line parser for this model.

    The option groups contributed by ``add_management_args``,
    ``add_experiment_args`` and ``add_rehearsal_args`` are registered first,
    followed by the model-specific options listed below.

    Returns:
        The fully populated ``ArgumentParser``.
    """
    parser = ArgumentParser(description='Complementary Learning Systems Based Experience Replay')
    for register_group in (add_management_args, add_experiment_args, add_rehearsal_args):
        register_group(parser)

    # One row per option, in registration order: (flag, type, default, help).
    # ``help=None`` is argparse's own default, i.e. "no help text".
    model_options = (
        # Consistency Regularization Weight
        ('--reg_weight', float, 0.1, None),
        # Stable Model parameters
        ('--stable_model_update_freq', float, 0.70, None),
        ('--stable_model_alpha', float, 0.999, None),
        # Plastic Model Parameters
        ('--plastic_model_update_freq', float, 0.90, None),
        ('--plastic_model_alpha', float, 0.999, None),
        # Ablation switches
        ('--lamb', float, 0,
         'Influence on new task learning.'),
        ('--biam', int, 0,
         'Reduces forgetting of past tasks by distilling only the current task’s logits during training.'),
        ('--bufs', int, 0,
         'Refines sample selection based on the agreement between the student and assistant teacher.'),
        ('--newl', int, 0,
         'Distills soft labels from the assistant teacher to guide the student.'),
        ('--divk', int, 0,
         'Combines the assistant and main teacher’s logits for more diverse knowledge transfer'),
    )
    for flag, value_type, default_value, help_text in model_options:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)
    return parser


# =============================================================================
# CLS-ER ablation model
# =============================================================================
class CLSERABLATION(ContinualModel):
    """Rehearsal model with two EMA copies of the network and teacher ablations.

    Besides the trained network ``self.net`` the model keeps a "plastic" and a
    "stable" copy that are updated as exponential moving averages (EMA) of
    ``self.net``.  On replayed buffer samples the logits of whichever copy is
    more confident on the true label serve as a consistency target.

    The ``biam``, ``bufs``, ``newl``, ``divk`` and ``lamb`` arguments switch on
    additional behaviour, most of which relies on an external ``teacher``
    passed to :meth:`observe`.
    """

    NAME = 'clser_ablation'
    COMPATIBILITY = ['class-il', 'domain-il', 'task-il', 'general-continual']

    def __init__(self, backbone, loss, args, transform):
        """Create the buffer and the two EMA copies and cache hyper-parameters.

        Args:
            backbone: Network forwarded to the ``ContinualModel`` base class.
            loss: Supervised loss forwarded to the base class.
            args: Parsed command-line namespace (see ``get_parser``).
            transform: Augmentation applied to samples drawn from the buffer.
        """
        super().__init__(backbone, loss, args, transform)
        self.buffer = Buffer(int(self.args.buffer_size), self.device, gpu_id=self.args.gpu_id)
        # Both EMA models start as exact copies of the working network.
        self.plastic_model = deepcopy(self.net).to(self.device)
        self.stable_model = deepcopy(self.net).to(self.device)
        # Weight of the consistency regularisation term.
        self.reg_weight = args.reg_weight
        # Update probability and maximum decay of the plastic model.
        self.plastic_model_update_freq = args.plastic_model_update_freq
        self.plastic_model_alpha = args.plastic_model_alpha
        # Update probability and maximum decay of the stable model.
        self.stable_model_update_freq = args.stable_model_update_freq
        self.stable_model_alpha = args.stable_model_alpha

        self.consistency_loss = nn.MSELoss(reduction='none')
        self.current_task = 0
        self.global_step = 0
        # Number of classes per task; used to slice logits by task.
        self.cpt = get_dataset(args).N_CLASSES_PER_TASK

    def end_task(self, dataset):
        """Advance the internal task counter once a task has finished."""
        self.current_task += 1

    def observe(self, inputs, labels, not_aug_inputs, task_id_nominal=None, teacher=None, epoch=None, noise=None):
        """Run one optimisation step on a batch and update buffer and EMA models.

        Args:
            inputs: Augmented input batch fed to the networks.
            labels: Targets for ``inputs``.
            not_aug_inputs: Non-augmented inputs; these are what gets stored
                in the buffer.
            task_id_nominal: Index of the current task.  Used to slice logits
                (``biam``/``newl``/``divk``) and to tag stored samples.
            teacher: Optional callable returning logits for ``inputs``.  A
                falsy value disables every teacher-dependent branch.
            epoch: Unused.
            noise: Optional per-sample tensor stored next to the samples and
                summarised in the returned stats.

        Returns:
            Tuple ``(loss_value, {}, stats, 0, 0)``.  ``stats`` is non-empty
            only when ``noise`` is given and buffer candidates were filtered
            by teacher/student agreement (``bufs > 0`` with a teacher).

        Raises:
            ValueError: If ``divk > 0`` has to blend teacher logits for past
                tasks but they were not fetched from the buffer
                (``lamb <= 0``).
        """
        temperature = 2
        real_batch_size = inputs.shape[0]
        mask = None
        # Same truthiness test the branches below have always relied on.
        use_teacher = bool(teacher)

        self.opt.zero_grad()

        outputs = self.net(inputs)

        if self.args.biam > 0:
            # Supervise only the logits that belong to the current task.
            task_classes = self._task_class_mask(outputs.size(1), task_id_nominal, outputs.device)
            loss = self.loss(outputs[:, task_classes], labels % self.cpt)
        else:
            loss = self.loss(outputs, labels)

        teacher_output = None
        if use_teacher:
            # The teacher only provides targets, so no autograd graph is needed.
            with torch.no_grad():
                teacher_output = teacher(inputs)

        # Guarded on the teacher: without one there is nothing to distil.
        if use_teacher and self.args.newl > 0:
            task_classes = self._task_class_mask(
                teacher_output.size(1), task_id_nominal, teacher_output.device)
            student_log_prob = F.log_softmax(outputs[:, task_classes] / temperature, dim=1)
            teacher_prob = F.softmax(teacher_output[:, task_classes] / temperature, dim=1)
            loss = loss + self.args.lamb * F.kl_div(
                student_log_prob, teacher_prob, reduction='batchmean') * (temperature ** 2)

        if not self.buffer.is_empty():
            buf_teacher_logits = None
            # Stored teacher logits are only requested when lamb > 0.
            if self.args.lamb > 0:
                buf_inputs, buf_labels, task_labels, buf_teacher_logits = self.buffer.get_data_old(
                    len(labels), transform=self.transform)
            else:
                buf_inputs, buf_labels, task_labels = self.buffer.get_data_old(
                    len(labels), transform=self.transform)

            ema_logits = self._select_ema_logits(buf_inputs, buf_labels)

            if use_teacher and self.args.divk > 0:
                self._blend_teacher_logits(ema_logits, buf_teacher_logits, task_labels, task_id_nominal)

            l_cons = torch.mean(self.consistency_loss(self.net(buf_inputs), ema_logits.detach()))
            loss = loss + self.args.reg_weight * l_cons

            # NOTE(review): this is a second forward pass over the same buffer
            # batch; it is kept separate because merging it with the pass above
            # could change results for backbones with train-time state
            # (e.g. normalisation statistics) -- confirm before deduplicating.
            buf_outputs = self.net(buf_inputs)
            loss = loss + self.loss(buf_outputs, buf_labels)

        loss.backward()
        self.opt.step()

        if use_teacher and self.args.bufs > 0:
            # Store only the samples on which teacher and student agree.
            teacher_pred = teacher_output.max(1)[1]
            student_pred = outputs.detach().max(1)[1]
            mask = teacher_pred == student_pred
            n_selected = int(mask.sum())
            if n_selected > 0:
                self.buffer.add_data(
                    examples=not_aug_inputs[mask],
                    labels=labels[:real_batch_size][mask],
                    teacher_logits=teacher_output[mask],
                    is_noise=noise[mask] if noise is not None else None,
                    task_labels=torch.full((n_selected,), task_id_nominal)
                )
        elif use_teacher:
            self.buffer.add_data(
                examples=not_aug_inputs,
                labels=labels[:real_batch_size],
                teacher_logits=teacher_output,
                is_noise=noise,
                task_labels=torch.full((len(not_aug_inputs),), task_id_nominal)
            )
        else:
            self.buffer.add_data(
                examples=not_aug_inputs,
                labels=labels[:real_batch_size],
                is_noise=noise,
                task_labels=torch.full((len(not_aug_inputs),), task_id_nominal)
            )

        # Stochastic EMA updates: each copy is refreshed with its own probability.
        self.global_step += 1
        if torch.rand(1) < self.plastic_model_update_freq:
            self.update_plastic_model_variables()

        if torch.rand(1) < self.stable_model_update_freq:
            self.update_stable_model_variables()

        stats = {"mask": mask.sum().item(), "noise": noise.sum().item(),
                 "i_noise": noise[mask].sum().item()} if noise is not None and mask is not None else {}
        return loss.item(), {}, stats, 0, 0

    def _task_class_mask(self, n_classes, task_id, device):
        """Return a boolean mask over ``n_classes`` logits selecting ``task_id``'s classes.

        Class ``c`` is selected when ``c // self.cpt == task_id``, i.e. when it
        lies in ``[task_id * cpt, (task_id + 1) * cpt)``.
        """
        first_class = task_id * self.cpt
        class_ids = torch.arange(n_classes, device=device)
        return (class_ids >= first_class) & (class_ids < first_class + self.cpt)

    def _select_ema_logits(self, buf_inputs, buf_labels):
        """Pick, per sample, the EMA model that is more confident on the true label.

        Returns the stable model's logits where its softmax probability for
        the ground-truth class exceeds the plastic model's, and the plastic
        model's logits otherwise.  The result carries no autograd graph.
        """
        with torch.no_grad():
            stable_logits = self.stable_model(buf_inputs)
            plastic_logits = self.plastic_model(buf_inputs)

            stable_prob = F.softmax(stable_logits, 1)
            plastic_prob = F.softmax(plastic_logits, 1)

            label_mask = F.one_hot(buf_labels, num_classes=stable_logits.shape[-1]) > 0
            prefer_stable = stable_prob[label_mask] > plastic_prob[label_mask]

            return torch.where(prefer_stable.unsqueeze(1), stable_logits, plastic_logits)

    def _blend_teacher_logits(self, ema_logits, buf_teacher_logits, task_labels, n_past_tasks):
        """Average, in place, EMA and stored teacher logits on each past task's classes.

        For every task ``t < n_past_tasks`` the buffer samples tagged with
        ``t`` get the logits of classes ``[t * cpt, (t + 1) * cpt)`` replaced
        by the mean of the EMA logits and the stored teacher logits.
        """
        past_tasks = range(n_past_tasks)
        if past_tasks and buf_teacher_logits is None:
            raise ValueError(
                "divk > 0 needs teacher logits from the buffer, "
                "which are only retrieved when lamb > 0")

        for task in past_tasks:
            task_mask = task_labels == task
            first_class = task * self.cpt
            last_class = (task + 1) * self.cpt

            ema_task_rows = ema_logits[task_mask]
            teacher_task_rows = buf_teacher_logits[task_mask]

            if len(teacher_task_rows) == 0:
                continue

            blended = (ema_task_rows[:, first_class:last_class]
                       + teacher_task_rows[:, first_class:last_class]) / 2
            ema_logits[task_mask, first_class:last_class] = blended

    def _ema_update(self, ema_model, max_alpha):
        """Move ``ema_model``'s parameters towards ``self.net`` with an EMA step.

        The decay is ``min(1 - 1 / (global_step + 1), max_alpha)``, so early
        steps follow the working network closely and later ones are capped
        at ``max_alpha``.
        """
        alpha = min(1 - 1 / (self.global_step + 1), max_alpha)
        with torch.no_grad():
            for ema_param, param in zip(ema_model.parameters(), self.net.parameters()):
                # Keyword ``alpha`` replaces the deprecated add_(Number, Tensor) overload.
                ema_param.mul_(alpha).add_(param, alpha=1 - alpha)

    def update_plastic_model_variables(self):
        """Apply one EMA step to the plastic model."""
        self._ema_update(self.plastic_model, self.plastic_model_alpha)

    def update_stable_model_variables(self):
        """Apply one EMA step to the stable model."""
        self._ema_update(self.stable_model, self.stable_model_alpha)