import math

from torch import nn

from datasets import get_dataset
from utils.buffer import Buffer
from torch.nn import functional as F
from models.utils.continual_model import ContinualModel
from utils.args import *
import torch
import numpy as np
from copy import deepcopy
import copy


def get_parser() -> ArgumentParser:
    """Build the command-line parser for the SSIL + ANCL model.

    The parser is first populated by the shared argument-group helpers
    (management, experiment, rehearsal and auxiliary-dataset arguments) and
    then extended with three float options that all default to 0:
    ``--alpha``, ``--beta`` and ``--lamb``.

    Returns:
        The fully configured ``ArgumentParser``.
    """
    parser = ArgumentParser(description='Continual learning via SSIL with ANCL')

    # Shared argument groups; the helpers are not defined in this file
    # (presumably they come from the `utils.args` star import).
    shared_groups = (
        add_management_args,
        add_experiment_args,
        add_rehearsal_args,
        add_aux_dataset_args,
    )
    for register_group in shared_groups:
        register_group(parser)

    # Model-specific loss weights: (flag, help text).
    weight_options = (
        ('--alpha', 'Penalty weight.'),
        ('--beta', 'Influence of the ANCL model.'),
        ('--lamb', 'Penalty weight.'),
    )
    for flag, description in weight_options:
        parser.add_argument(flag, type=float, default=0, help=description)

    return parser


class SSILABLATIONANCL(ContinualModel):
    """Rehearsal-based continual learner with task-wise distillation and an
    optional auxiliary (ANCL) teacher.

    Per training step (see ``observe``) the loss is made of:

    * a cross-entropy term on the current batch, restricted to the logits of
      the current task;
    * from the second task on, a cross-entropy term on replayed buffer
      samples over all logits seen so far;
    * from the second task on, a task-wise KL distillation term against
      ``model_fixed`` (snapshot taken in ``end_task``), weighted by
      ``args.alpha``;
    * when ``teacher`` is truthy, an additional task-wise KL term against
      ``ancl_model`` (snapshot taken in ``reset_ancl``), weighted by
      ``args.beta``.
    """
    NAME = 'ssil_ablation_ancl'
    COMPATIBILITY = ['class-il', 'domain-il', 'task-il', 'general-continual']

    def __init__(self, backbone, loss, args, transform):
        """Set up the replay buffer, task bookkeeping and the loss.

        Args:
            backbone: network forwarded to the ``ContinualModel`` constructor.
            loss: forwarded to the base class; ``self.loss`` is subsequently
                replaced by a summed cross-entropy (see below).
            args: parsed arguments; ``buffer_size``, ``gpu_id``, ``alpha`` and
                ``beta`` are read by this class.
            transform: augmentation applied to samples drawn from the buffer.
        """
        super(SSILABLATIONANCL, self).__init__(backbone, loss, args, transform)

        self.args = args
        self.buffer = Buffer(int(args.buffer_size), self.device, gpu_id=args.gpu_id)
        self.current_task = 0
        # Number of classes per task; used to slice logits into task heads.
        self.cpt = get_dataset(args).N_CLASSES_PER_TASK
        self.buffer_backup = None
        # Summed cross-entropy: `observe` normalises by the total number of
        # samples (current + replay) itself.
        self.loss = torch.nn.CrossEntropyLoss(reduction='sum')

    def begin_task(self, dataset):
        """Prepare for a new task and snapshot the network.

        On the very first task the initial checkpoint is loaded and the
        classifier is reset (both helpers are defined outside this class).
        A deep copy of the network is always stored in
        ``self.initial_checkpoint`` so ``reset_ancl`` can restore it.
        """
        if self.current_task == 0:
            self.load_initial_checkpoint()
            self.reset_classifier()

        self.initial_checkpoint = copy.deepcopy(self.net)

    def end_task(self, dataset):
        """Advance the task counter and freeze a copy of the network.

        The copy (``self.model_fixed``) is the distillation teacher for the
        logits of all previous tasks in ``observe``.
        """
        self.current_task += 1
        self.model_fixed = copy.deepcopy(self.net)

    def reset_ancl(self):
        """Store the current network as the ANCL teacher and roll back.

        ``self.ancl_model`` receives a deep copy of the current network and
        ``self.net`` is replaced by the snapshot taken in ``begin_task``.
        """
        self.ancl_model = copy.deepcopy(self.net)
        # NOTE(review): `self.net` now aliases `self.initial_checkpoint`, so
        # further training mutates that snapshot, and `self.opt` is not
        # rebuilt here - presumably the caller re-creates the optimizer;
        # verify against the training loop.
        self.net = self.initial_checkpoint

    def get_cl_mask(self):
        """Return the indices of every class NOT belonging to the current task.

        Returns:
            1-D ``numpy`` array of class indices outside
            ``[current_task * N_CLASSES_PER_TASK,
            (current_task + 1) * N_CLASSES_PER_TASK)``.
        """
        dataset = get_dataset(self.args)
        per_task = dataset.N_CLASSES_PER_TASK
        first = self.current_task * per_task
        cur_classes = np.arange(first, first + per_task)
        all_classes = np.arange(per_task * dataset.N_TASKS)
        return np.setdiff1d(all_classes, cur_classes)

    def mask_output(self, outputs):
        """Set the logits of all non-current-task classes to ``-inf``.

        Args:
            outputs: logits indexed as ``[sample, class]``.

        Returns:
            A new tensor; ``outputs`` itself is not modified.
        """
        cl_mask = self.get_cl_mask()
        mask_add_on = torch.zeros_like(outputs)
        mask_add_on[:, cl_mask] = float('-inf')
        return mask_add_on + outputs

    def _task_wise_kd(self, teacher_net, data, output, n_tasks, T):
        """Sum of per-task KL distillation losses between teacher and student.

        For each of the first ``n_tasks`` heads, the teacher's and student's
        logits are softened with temperature ``T``, compared with a
        batch-mean KL divergence and rescaled by ``T ** 2``.

        Args:
            teacher_net: frozen network providing the soft targets.
            data: inputs fed to the teacher (the same batch that produced
                ``output``).
            output: student logits for ``data``.
            n_tasks: number of task heads to distil (must be >= 1).
            T: softmax temperature.

        Returns:
            Scalar tensor with the summed distillation loss.
        """
        # The teacher only provides targets, so no autograd graph is needed.
        with torch.no_grad():
            score = teacher_net(data)[:, :n_tasks * self.cpt]

        kd_terms = []
        for t in range(n_tasks):
            start_KD = t * self.cpt
            end_KD = start_KD + self.cpt
            soft_target = F.softmax(score[:, start_KD:end_KD] / T, dim=1)
            output_log = F.log_softmax(output[:, start_KD:end_KD] / T, dim=1)
            kd_terms.append(
                F.kl_div(output_log, soft_target, reduction='batchmean') * (T ** 2))
        return torch.stack(kd_terms).sum()

    def observe(self, inputs: torch.Tensor, labels: torch.Tensor, not_aug_inputs: torch.Tensor, epoch=None,
                task_id_nominal=None, teacher=None, noise=None):
        """Run one optimisation step and store the batch in the buffer.

        Args:
            inputs: augmented current-task inputs.
            labels: labels of ``inputs`` (cast to ``long``).
            not_aug_inputs: non-augmented inputs, stored in the buffer.
            epoch: unused.
            task_id_nominal: index of the task being trained; falls back to
                ``self.current_task`` when ``None``.
            teacher: when truthy (and ``task_id_nominal > 0``), also distil
                from ``self.ancl_model`` with weight ``args.beta``.
            noise: forwarded to the buffer as ``is_noise``.

        Returns:
            ``(loss_value, 0, {}, 0, 0)``; only the first entry is computed
            here, the rest are constant placeholders.
        """
        labels = labels.long()
        if task_id_nominal is None:
            # The signature's default would otherwise fail on `None + 1`.
            task_id_nominal = self.current_task

        mid = task_id_nominal * self.cpt
        end = mid + self.cpt
        T = 2
        batch_size = inputs.shape[0]

        # Index of each label inside the current-task logit slice. Computed
        # unconditionally so it matches `output[:, mid:end]` whether or not
        # replay data is available.
        target = labels % self.cpt

        data = inputs
        target_r = None
        replay_size = 0
        if not self.buffer.is_empty():
            buf_inputs, target_r = self.buffer.get_data_old(
                len(labels), transform=self.transform, task_id_nominal=task_id_nominal)
            replay_size = buf_inputs.shape[0]
            data = torch.cat((inputs, buf_inputs))

        output = self.net(data)

        # Current samples compete only among the current task's classes.
        loss_CE_curr = self.loss(output[:batch_size, mid:end], target)

        loss_KD = 0
        if task_id_nominal > 0:
            if replay_size > 0:
                # Replayed samples compete among all classes seen so far.
                prev = output[batch_size:batch_size + replay_size, :end]
                loss_CE_prev = self.loss(prev, target_r)
                loss_CE = (loss_CE_curr + loss_CE_prev) / (batch_size + replay_size)
            else:
                # No replay data available: fall back to the current-only loss
                # instead of referencing undefined replay tensors.
                loss_CE = loss_CE_curr / batch_size

            # Distil the heads of all previous tasks from the frozen model.
            loss_KD = self.args.alpha * self._task_wise_kd(
                self.model_fixed, data, output, task_id_nominal, T)

            if teacher:
                # The ANCL teacher also covers the current task's head.
                loss_KD = loss_KD + self.args.beta * self._task_wise_kd(
                    self.ancl_model, data, output, task_id_nominal + 1, T)
        else:
            loss_CE = loss_CE_curr / batch_size

        self.opt.zero_grad()
        loss = loss_KD + loss_CE
        loss.backward()
        self.opt.step()

        self.buffer.add_data(examples=not_aug_inputs,
                             labels=labels,
                             is_noise=noise)

        return loss.item(), 0, {}, 0, 0
