import copy
import math

from datasets import get_dataset
from utils.buffer import Buffer
from torch.nn import functional as F
from models.utils.continual_model import ContinualModel
from utils.args import *
import torch
import numpy as np
from copy import deepcopy

from utils.distillery import combine_teachers


def get_parser() -> ArgumentParser:
    """Build the command-line parser for the DER++ / ANCL ablation model.

    The shared management, experiment, rehearsal and auxiliary-dataset
    argument groups are registered first, followed by three mandatory
    float hyper-parameters specific to this model.

    Returns:
        The fully configured ``ArgumentParser``.
    """
    parser = ArgumentParser(
        description='Continual learning via Dark Experience Replay++ with ANCL.')

    # Shared argument groups, registered in a fixed order.
    for register in (add_management_args, add_experiment_args,
                     add_rehearsal_args, add_aux_dataset_args):
        register(parser)

    # Model-specific hyper-parameters: all are required floats.
    # In this file `alpha` scales the logit-replay MSE term and `beta` scales
    # the replayed-label loss term; `lamb` is not read by the code visible
    # here (presumably consumed by the training driver -- verify).
    required_floats = (
        ('--alpha', 'Penalty weight.'),
        ('--beta', 'Penalty weight.'),
        ('--lamb', 'Train teacher and impact on student.'),
    )
    for flag, help_text in required_floats:
        parser.add_argument(flag, type=float, required=True, help=help_text)

    return parser


class DerppABLATIONANCL(ContinualModel):
    """DER++-style rehearsal model with an optional auxiliary teacher (ANCL ablation).

    Each training step combines:
      * the task loss on the current batch, restricted to the logits that
        belong to the nominal task;
      * when the buffer is non-empty, an ``alpha``-weighted MSE between the
        current outputs on replayed samples and the logits stored with them,
        plus a ``beta``-weighted task loss on the replayed labels;
      * when a ``teacher`` is supplied, an ``args.agreement``-weighted MSE
        between the student's and the teacher's outputs on replayed samples.
    """
    NAME = 'derpp_ablation_ancl'
    COMPATIBILITY = ['class-il', 'domain-il', 'task-il', 'general-continual']

    def __init__(self, backbone, loss, args, transform):
        """Create the replay buffer and cache the number of classes per task."""
        super().__init__(backbone, loss, args, transform)
        self.buffer = Buffer(int(self.args.buffer_size), self.device, gpu_id=self.args.gpu_id)
        # Index of the task currently being learned; advanced in `end_task`.
        self.current_task = 0
        # Classes per task, used to locate a task's logits in the output layer.
        self.cpt = get_dataset(args).N_CLASSES_PER_TASK
        self.buffer_backup = None

    def begin_task(self, dataset):
        """Prepare for a new task and snapshot the network before training.

        On the very first task the initial checkpoint is loaded and the
        classifier is reset. For every task, a deep copy of the current
        network is stored so that `reset_ancl` can restore it later.
        """
        if self.current_task == 0:
            self.load_initial_checkpoint()
            self.reset_classifier()
        self.initial_checkpoint = copy.deepcopy(self.net)

    def end_task(self, dataset):
        """Advance the internal task counter."""
        self.current_task += 1

    def reset_ancl(self):
        """Keep the current network as `ancl_model` and restore the task-start snapshot.

        Must be called after `begin_task`, which creates `initial_checkpoint`.
        """
        self.ancl_model = copy.deepcopy(self.net)
        # NOTE(review): `self.net` now aliases `self.initial_checkpoint`, so
        # further training mutates the snapshot until the next `begin_task`.
        # Harmless if this is called once per task -- confirm with the caller.
        self.net = self.initial_checkpoint

    def get_cl_mask(self):
        """Return the indices of every class that is NOT part of the current task."""
        t = self.current_task
        dataset = get_dataset(self.args)
        cur_classes = np.arange(t * dataset.N_CLASSES_PER_TASK, (t + 1) * dataset.N_CLASSES_PER_TASK)
        cl_mask = np.setdiff1d(np.arange(dataset.N_CLASSES_PER_TASK * dataset.N_TASKS), cur_classes)
        return cl_mask

    def mask_output(self, outputs):
        """Return `outputs` with the logits of all non-current classes set to ``-inf``."""
        cl_mask = self.get_cl_mask()
        mask_add_on = torch.zeros_like(outputs)
        mask_add_on[:, cl_mask] = float('-inf')
        masked_outputs = mask_add_on + outputs
        return masked_outputs

    def observe(self, inputs: torch.Tensor, labels: torch.Tensor, not_aug_inputs: torch.Tensor, epoch=None, task_id_nominal=None, teacher=None, noise=None):
        """Run one optimisation step on a batch and (optionally) store it in the buffer.

        Args:
            inputs: Batch fed to the network.
            labels: Labels for `inputs`; reduced modulo the number of classes
                per task before the task loss is computed.
            not_aug_inputs: Version of the batch that is written to the buffer.
            epoch: Unused.
            task_id_nominal: Index of the task whose logits are trained on the
                current batch. Defaults to `self.current_task` when omitted.
            teacher: Optional callable returning logits for replayed inputs.
                When given, an agreement term is added and the batch is
                stored in the buffer.
            noise: Optional per-sample value forwarded to the buffer as
                ``is_noise``.

        Returns:
            A 5-tuple ``(loss_value, 0, {}, 0, 0)``; only the first entry
            carries information.
        """
        # The default of None used to select no logits at all and then fail on
        # `None + 1`; fall back to the task tracked by this model instead.
        if task_id_nominal is None:
            task_id_nominal = self.current_task

        self.opt.zero_grad()

        outputs = self.net(inputs)

        # Logits of task t occupy the contiguous columns [t * cpt, (t + 1) * cpt).
        first_col = int(task_id_nominal) * self.cpt
        relevant_student = outputs[:, first_col:first_col + self.cpt]
        loss = self.loss(relevant_student, labels % self.cpt)

        if not self.buffer.is_empty():
            # Replay batch shrinks as the task index grows, but never below 8.
            batch_size_buf = max(int(math.ceil(len(labels) / (task_id_nominal + 1))), 8)
            buf_inputs, buf_labels, buf_logits = self.buffer.get_data(
                len(labels), transform=self.transform, batch_size_buf=batch_size_buf)

            buf_outputs = self.net(buf_inputs).float()

            # Logit replay: stay close to the logits stored with the samples.
            kd_loss = self.args.alpha * F.mse_loss(buf_outputs, buf_logits)

            if teacher is not None:
                # The teacher only provides targets here; `self.opt` is the
                # only optimizer stepped, so skip building its autograd graph.
                with torch.no_grad():
                    teacher_outputs = teacher(buf_inputs)
                # NOTE(review): `--agreement` is not registered by the parser
                # in this file; presumably one of the shared argument groups
                # defines it -- confirm.
                kd_loss = kd_loss + self.args.agreement * F.mse_loss(buf_outputs, teacher_outputs)

            # Label replay on the same buffer batch, then the distillation terms.
            loss = loss + self.args.beta * self.loss(buf_outputs, buf_labels)
            loss = loss + kd_loss

        loss.backward()
        self.opt.step()

        # NOTE(review): samples are stored only when a teacher is supplied;
        # without one the buffer is never filled by this method -- confirm
        # that this is the intended ablation behaviour.
        if teacher is not None:
            self.buffer.add_data(
                examples=not_aug_inputs,
                labels=labels,
                logits=outputs.data,
                is_noise=noise,
                task_labels=torch.full((len(not_aug_inputs),), task_id_nominal))

        # No per-batch statistics are collected, so the stats dict is empty.
        return loss.item(), 0, {}, 0, 0
