import copy
import math

from datasets import get_dataset
from utils.buffer import Buffer
from torch.nn import functional as F
from models.utils.continual_model import ContinualModel
from utils.args import *
import torch
import numpy as np


def get_parser() -> ArgumentParser:
    """Build the command-line parser for the DER++ ablation model.

    The shared argument groups are registered first by the ``add_*_args``
    helpers (presumably provided by ``utils.args`` -- confirm), followed by
    the model-specific options declared here: the float weights ``--alpha``,
    ``--beta`` and ``--lamb``, and the integer switches ``--biam``,
    ``--newl``, ``--divk`` and ``--bufs``.

    Returns:
        The fully populated parser.
    """
    parser = ArgumentParser(
        description='Continual learning via Dark Experience Replay++.')

    # Shared argument groups, in the order they are registered.
    shared_groups = (
        add_management_args,
        add_experiment_args,
        add_rehearsal_args,
        add_aux_dataset_args,
    )
    for register in shared_groups:
        register(parser)

    # (flag, type, default, help) -- one row per model-specific option.
    model_options = (
        ('--alpha', float, 0.1, 'Penalty weight.'),
        ('--beta', float, 0.5, 'Penalty weight.'),
        ('--lamb', float, 0, 'Influence on new task learning.'),
        ('--biam', int, 0,
         'Reduces forgetting of past tasks by distilling only the current task’s logits during training.'),
        ('--newl', int, 0,
         'Distills soft labels from the assistant teacher to guide the student.'),
        ('--divk', int, 0,
         'Combines the assistant and main teacher’s logits for more diverse knowledge transfer'),
        ('--bufs', int, 0,
         'Refines sample selection based on the agreement between the student and assistant teacher.'),
    )
    for flag, value_type, default, help_text in model_options:
        parser.add_argument(flag, type=value_type, default=default,
                            help=help_text)

    return parser


class DerppABLATION(ContinualModel):
    """Dark Experience Replay++ with individually switchable ablation terms.

    Each integer flag is considered enabled when it is > 0:

    * ``--biam``: the classification loss only sees the current task's logits.
    * ``--newl``: adds a temperature-scaled KL distillation term from the
      ``teacher`` passed to :meth:`observe`.
    * ``--divk``: for replayed samples of previous tasks, the stored student
      logits are averaged with the stored teacher logits.
    * ``--bufs``: only samples on which student and teacher predict the same
      class are written to the buffer.

    ``--newl``, ``--divk`` and ``--bufs`` only take effect when
    ``--lamb`` > 0, which is also what makes the teacher mandatory.
    """
    NAME = 'derpp_ablation'
    COMPATIBILITY = ['class-il', 'domain-il', 'task-il', 'general-continual']

    def __init__(self, backbone, loss, args, transform):
        super(DerppABLATION, self).__init__(backbone, loss, args, transform)
        self.buffer = Buffer(int(self.args.buffer_size), self.device, gpu_id=self.args.gpu_id)
        # Number of tasks already finished; incremented in end_task().
        self.current_task = 0
        # Classes per task, used to locate a task's logit columns.
        self.cpt = get_dataset(args).N_CLASSES_PER_TASK

    def begin_task(self, dataset):
        """Load the initial checkpoint and reset the classifier before the first task."""
        if self.current_task == 0:
            self.load_initial_checkpoint()
            self.reset_classifier()

    def end_task(self, dataset):
        """Advance the internal task counter."""
        self.current_task += 1

    def get_cl_mask(self):
        """Return the indices of every class that is NOT part of the current task."""
        t = self.current_task
        dataset = get_dataset(self.args)
        cur_classes = np.arange(t * dataset.N_CLASSES_PER_TASK, (t + 1) * dataset.N_CLASSES_PER_TASK)
        cl_mask = np.setdiff1d(np.arange(dataset.N_CLASSES_PER_TASK * dataset.N_TASKS), cur_classes)
        return cl_mask

    def mask_output(self, outputs):
        """Return *outputs* with the logits of all non-current-task classes set to -inf."""
        cl_mask = self.get_cl_mask()
        mask_add_on = torch.zeros_like(outputs)
        mask_add_on[:, cl_mask] = float('-inf')
        masked_outputs = mask_add_on + outputs
        return masked_outputs

    def _task_columns(self, logits, task_id):
        """Return the columns of *logits* that belong to task *task_id*.

        Column ``idx`` is attributed to task ``idx // self.cpt``, i.e. each
        task owns one contiguous run of ``self.cpt`` columns.
        """
        start = int(task_id) * self.cpt
        return logits[:, start:start + self.cpt]

    def _blend_teacher_logits(self, buf_logits, buf_teacher_logits, task_labels, n_seen_tasks):
        """Average stored student and teacher logits on each past task's own columns.

        Only tasks ``0 .. n_seen_tasks - 1`` are processed, and for a sample of
        task ``t`` only the columns of task ``t`` are touched.
        NOTE(review): ``buf_logits`` is updated in place; this is harmless only
        if the buffer hands out copies rather than views -- confirm.
        """
        for t in range(n_seen_tasks):
            task_mask = task_labels == t
            if not task_mask.any():
                continue
            start, end = t * self.cpt, (t + 1) * self.cpt
            stored_student = buf_logits[task_mask][:, start:end]
            stored_teacher = buf_teacher_logits[task_mask][:, start:end]
            buf_logits[task_mask, start:end] = (stored_student + stored_teacher) / 2
        return buf_logits

    def _selection_mask(self, outputs, teacher_output, task_id, noise):
        """Return a boolean per-sample mask of the samples to store in the buffer.

        An all-False mask means "no filtering"; the caller then stores the
        whole batch.
        """
        if self.args.bufs > 0:
            # Keep the samples on which student and teacher agree about the
            # predicted class within the current task's logits.
            _, student_pred = self._task_columns(outputs, task_id).max(1)
            _, teacher_pred = self._task_columns(teacher_output, task_id).max(1)
            return student_pred == teacher_pred

        # ``quantile`` is not declared by this file's parser; presumably one of
        # the shared args helpers registers it -- confirm.
        if noise is not None or self.args.quantile > 0:
            # Keep the samples whose top softmax probability lies strictly
            # above the requested quantile of the batch.
            confidence, _ = torch.softmax(outputs, dim=1).max(1)
            threshold = torch.quantile(confidence, self.args.quantile)
            return confidence > threshold

        return torch.zeros(outputs.size(0), dtype=torch.bool, device=outputs.device)

    def observe(self, inputs: torch.Tensor, labels: torch.Tensor, not_aug_inputs: torch.Tensor, epoch=None, task_id_nominal=None, teacher=None, noise=None):
        """Run one optimisation step on a batch and store samples in the buffer.

        Args:
            inputs: batch fed to ``self.net`` (and to ``teacher``).
            labels: targets for ``inputs``.
            not_aug_inputs: the tensors written to the buffer as examples.
            epoch: unused.
            task_id_nominal: index of the task the batch belongs to; selects
                the task's logit columns and is stamped on stored samples.
            teacher: callable mapping ``inputs`` to logits; required whenever
                ``args.lamb`` > 0.
            noise: optional per-sample tensor stored as ``is_noise`` and summed
                for the returned statistics (presumably a 0/1 noisy-label
                flag -- confirm against the caller).

        Returns:
            ``(loss_value, 0, stats, 0, 0)`` where ``stats`` holds the keys
            ``"mask"``, ``"noise"`` and ``"i_noise"`` when both ``noise`` and a
            selection mask exist, and is empty otherwise.
        """
        self.opt.zero_grad()
        temperature = 2
        use_teacher = self.args.lamb > 0
        mask = None
        teacher_output = None

        outputs = self.net(inputs)

        # Classification loss on the incoming batch.
        if self.args.biam > 0:
            loss = self.loss(self._task_columns(outputs, task_id_nominal), labels % self.cpt)
        else:
            loss = self.loss(outputs, labels)

        if use_teacher:
            # The teacher only provides targets: keep it out of the autograd
            # graph so backward() neither stores its activations nor
            # accumulates gradients in its parameters.
            with torch.no_grad():
                teacher_output = teacher(inputs)

            if self.args.newl > 0:
                student_log_probs = F.log_softmax(
                    self._task_columns(outputs, task_id_nominal) / temperature, dim=1)
                teacher_probs = F.softmax(
                    self._task_columns(teacher_output, task_id_nominal) / temperature, dim=1)
                distillation = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
                # The T**2 factor compensates for the 1/T**2 gradient scaling
                # introduced by the softened distributions.
                loss = loss + self.args.lamb * distillation * (temperature ** 2)

        # Replay terms; the replay batch has the same size as the incoming one.
        if not self.buffer.is_empty():
            replay = self.buffer.get_data_old(len(labels), transform=self.transform)
            if use_teacher:
                buf_inputs, buf_labels, buf_logits, task_labels, buf_teacher_logits = replay
            else:
                buf_inputs, buf_labels, buf_logits, task_labels = replay

            buf_outputs = self.net(buf_inputs).float()
            if use_teacher and self.args.divk > 0:
                buf_logits = self._blend_teacher_logits(
                    buf_logits, buf_teacher_logits, task_labels, task_id_nominal)

            loss = loss + self.args.alpha * F.mse_loss(buf_outputs, buf_logits)
            loss = loss + self.args.beta * self.loss(buf_outputs, buf_labels)

        loss.backward()
        self.opt.step()

        # Decide which samples of the batch go into the buffer.
        student_logits = outputs.detach()
        if use_teacher:
            mask = self._selection_mask(student_logits, teacher_output, task_id_nominal, noise)

        n_selected = int(mask.sum()) if mask is not None else 0
        # When nothing is selected (or no filtering applies) store everything.
        keep = mask if n_selected > 0 else None

        def take(tensor):
            return tensor if keep is None else tensor[keep]

        n_stored = n_selected if keep is not None else len(not_aug_inputs)
        payload = {
            'examples': take(not_aug_inputs),
            'labels': take(labels),
            'logits': take(student_logits),
            'is_noise': take(noise) if noise is not None else None,
            'task_labels': torch.full((n_stored,), task_id_nominal),
        }
        if use_teacher:
            payload['teacher_logits'] = take(teacher_output)
        self.buffer.add_data(**payload)

        stats = {}
        if noise is not None and mask is not None:
            # mask is always boolean here, so noise[mask] counts the noisy
            # samples among the selected ones (0 when nothing was selected).
            stats = {"mask": mask.sum().item(), "noise": noise.sum().item(), "i_noise": noise[mask].sum().item()}
        return loss.item(), 0, stats, 0, 0

