import math

from torch import nn

from datasets import get_dataset
from utils.buffer import Buffer
from torch.nn import functional as F
from models.utils.continual_model import ContinualModel
from utils.args import *
import torch
import numpy as np
from copy import deepcopy
import copy


def get_parser() -> ArgumentParser:
    """Create the command-line parser for the SSIL ablation model.

    The shared management, experiment, rehearsal and auxiliary-dataset
    argument groups are registered first, followed by the model-specific
    options. Every model-specific option defaults to ``0``.

    Returns:
        The fully configured ``ArgumentParser``.
    """
    parser = ArgumentParser(description='Continual learning via SSIL')

    # Argument groups shared across models (defined outside this file).
    for register_group in (add_management_args, add_experiment_args,
                           add_rehearsal_args, add_aux_dataset_args):
        register_group(parser)

    # (flag, value type, help text) for each model-specific option.
    model_options = (
        ('--alpha', float, 'Penalty weight.'),
        ('--beta', float, 'Penalty weight.'),
        ('--lamb', float, 'Penalty weight.'),
        ('--biam', int,
         'Reduces forgetting of past tasks by distilling only the current task’s logits during training.'),
        ('--newl', int,
         'Distills soft labels from the assistant teacher to guide the student.'),
        ('--divk', int,
         'Combines the assistant and main teacher’s logits for more diverse knowledge transfer'),
        ('--bufs', int,
         'Refines sample selection based on the agreement between the student and assistant teacher.'),
    )
    for flag, value_type, help_text in model_options:
        parser.add_argument(flag, type=value_type, default=0, help=help_text)

    return parser


class SSILABLATION(ContinualModel):
    """Rehearsal-based SSIL learner with switchable teacher-driven ablations.

    Each call to :meth:`observe` trains on the incoming batch (restricted to
    the logits of the current task) together with a batch replayed from the
    buffer, and distils the logits of every past task from ``model_fixed``,
    the snapshot of the network taken in :meth:`end_task`.

    The ``--newl``, ``--divk`` and ``--bufs`` flags enable the optional
    teacher-based components; they only have an effect when a ``teacher`` is
    passed to :meth:`observe`.
    """
    NAME = 'ssil_ablation'
    COMPATIBILITY = ['class-il', 'domain-il', 'task-il', 'general-continual']

    def __init__(self, backbone, loss, args, transform):
        """Set up the replay buffer and the per-task bookkeeping.

        Args:
            backbone: network forwarded to the base class.
            loss: forwarded to the base class; ``self.loss`` is replaced
                below by a summed cross-entropy.
            args: parsed command-line namespace (see ``get_parser``).
            transform: augmentation applied to samples drawn from the buffer.
        """
        super(SSILABLATION, self).__init__(backbone, loss, args, transform)

        self.buffer = Buffer(int(self.args.buffer_size), self.device, gpu_id=self.args.gpu_id)
        self.current_task = 0
        # Number of classes introduced by each task.
        self.cpt = get_dataset(args).N_CLASSES_PER_TASK
        self.buffer_backup = None
        self.args = args
        # Summed (not averaged) cross-entropy: observe() divides by the
        # number of samples itself.
        self.loss = torch.nn.CrossEntropyLoss(reduction='sum')

    def begin_task(self, dataset):
        """Prepare the model for a new task.

        On the very first task the initial checkpoint is loaded and the
        classifier is reset. When ``args.update_buffer_at_task_end`` is set,
        a deep copy of the buffer is kept in ``self.buffer_backup``.
        """
        if self.current_task == 0:
            self.load_initial_checkpoint()
            self.reset_classifier()

        # copy buffer
        if self.args.update_buffer_at_task_end:
            self.buffer_backup = deepcopy(self.buffer)
            print(
                f"At task {self.current_task} start after deep copy: buffer is {len(self.buffer)}, buffer_backup is {len(self.buffer_backup)}")

    def end_task(self, dataset):
        """Advance the task counter and snapshot the network for distillation."""
        self.current_task += 1
        # Copy of the network as it is at the end of this task; observe()
        # uses it as the distillation teacher for the past tasks.
        # NOTE(review): the copy is neither put in eval mode nor has its
        # parameters frozen here - confirm this is intended.
        self.model_fixed = copy.deepcopy(self.net)

    def get_cl_mask(self):
        """Return the indices of all classes that are NOT in the current task."""
        t = self.current_task
        dataset = get_dataset(self.args)
        cur_classes = np.arange(t * dataset.N_CLASSES_PER_TASK, (t + 1) * dataset.N_CLASSES_PER_TASK)
        cl_mask = np.setdiff1d(np.arange(dataset.N_CLASSES_PER_TASK * dataset.N_TASKS), cur_classes)
        return cl_mask

    def mask_output(self, outputs):
        """Return ``outputs`` with every non-current-task logit set to ``-inf``."""
        cl_mask = self.get_cl_mask()
        mask_add_on = torch.zeros_like(outputs)
        mask_add_on[:, cl_mask] = float('-inf')
        masked_outputs = mask_add_on + outputs
        return masked_outputs

    @staticmethod
    def _soft_kl(student_logits, teacher_logits, temperature):
        """Temperature-softened KL(teacher || student), scaled by ``temperature ** 2``."""
        log_p_student = F.log_softmax(student_logits / temperature, dim=1)
        p_teacher = F.softmax(teacher_logits / temperature, dim=1)
        return F.kl_div(log_p_student, p_teacher, reduction='batchmean') * (temperature ** 2)

    def observe(self, inputs: torch.Tensor, labels: torch.Tensor, not_aug_inputs: torch.Tensor, epoch=None,
                task_id_nominal=None, teacher=None, noise=None):
        """Run one optimisation step and store the batch in the buffer.

        Args:
            inputs: augmented batch of the current task.
            labels: labels of ``inputs``; they are folded into
                ``[0, self.cpt)`` before the current-task cross-entropy.
            not_aug_inputs: non-augmented version of ``inputs``; this is what
                gets stored in the buffer.
            epoch: unused.
            task_id_nominal: index of the task the batch belongs to. Falls
                back to ``self.current_task`` when omitted.
            teacher: optional callable producing logits for ``inputs``. It is
                required for the ``--newl``, ``--divk`` and ``--bufs``
                components to take effect.
            noise: optional per-sample tensor stored in the buffer as
                ``is_noise`` and summed into the returned stats.

        Returns:
            ``(loss_value, 0, stats, 0, 0)``. ``stats`` is empty unless the
            agreement-based buffer selection (``--bufs`` with a teacher) ran.

        Raises:
            ValueError: if ``--divk`` is active but no teacher logits were
                sampled from the buffer (they are only read when
                ``--lamb > 0``).
        """
        if task_id_nominal is None:
            task_id_nominal = self.current_task

        labels = labels.long()
        mid = task_id_nominal * self.cpt
        end = mid + self.cpt
        T = 2  # distillation temperature
        has_teacher = bool(teacher)
        batch_size = inputs.shape[0]
        # The current-task head only sees logits mid:end, so the labels are
        # expressed relative to the start of the task.
        target = labels % self.cpt

        # Defaults cover the case where nothing can be replayed yet.
        replay_size = 0
        buf_inputs = buf_labels = buf_teacher_logits = task_labels = None
        data = inputs
        if not self.buffer.is_empty():
            # NOTE(review): the arity returned by get_data_old is assumed to
            # follow --lamb, exactly as in the original code - confirm
            # against the Buffer implementation.
            if self.args.lamb > 0:
                buf_inputs, buf_labels, buf_teacher_logits, task_labels = self.buffer.get_data_old(
                    len(labels), transform=self.transform)
            else:
                buf_inputs, buf_labels, task_labels = self.buffer.get_data_old(
                    len(labels), transform=self.transform)
            replay_size = buf_inputs.shape[0]
            data = torch.cat((inputs, buf_inputs))

        output = self.net(data)

        # Cross-entropy of the new samples over the current task's logits only.
        loss_CE_curr = self.loss(output[:batch_size, mid:end], target)

        teacher_output = None
        if has_teacher:
            # The teacher only provides targets; no graph is needed for it.
            with torch.no_grad():
                teacher_output = teacher(inputs)
            if self.args.newl > 0:
                loss_CE_curr = loss_CE_curr + self.args.lamb * self._soft_kl(
                    output[:batch_size, mid:end], teacher_output[:, mid:end], T)

        loss_KD = 0
        if task_id_nominal > 0:
            # Cross-entropy of the replayed samples over all classes seen so far.
            if replay_size > 0:
                loss_CE_prev = self.loss(output[batch_size:batch_size + replay_size, :end], buf_labels)
            else:
                loss_CE_prev = 0
            loss_CE = (loss_CE_curr + loss_CE_prev) / (batch_size + replay_size)

            per_task_kd = torch.zeros(task_id_nominal, device=output.device)
            if has_teacher and self.args.divk > 0:
                if replay_size > 0:
                    if buf_teacher_logits is None:
                        raise ValueError(
                            '--divk needs the teacher logits stored in the buffer, '
                            'which are only sampled when --lamb > 0')
                    for t in range(task_id_nominal):
                        task_mask = task_labels == t
                        task_inputs = buf_inputs[task_mask]
                        stored_logits = buf_teacher_logits[task_mask]
                        if len(stored_logits) == 0:
                            continue

                        lo, hi = t * self.cpt, (t + 1) * self.cpt
                        with torch.no_grad():
                            fixed_logits = self.model_fixed(task_inputs)
                        student_logits = self.net(task_inputs)

                        # Average the snapshot's logits with the teacher
                        # logits that were stored alongside the samples.
                        blended = (fixed_logits[:, lo:hi] + stored_logits[:, lo:hi]) / 2
                        per_task_kd[t] = self._soft_kl(student_logits[:, lo:hi], blended, T)
            else:
                with torch.no_grad():
                    fixed_logits = self.model_fixed(data)[:, :mid]
                for t in range(task_id_nominal):
                    lo, hi = t * self.cpt, (t + 1) * self.cpt
                    per_task_kd[t] = self._soft_kl(output[:, lo:hi], fixed_logits[:, lo:hi], T)
            loss_KD = per_task_kd.sum()
        else:
            loss_CE = loss_CE_curr / batch_size

        self.opt.zero_grad()
        loss = (loss_KD + loss_CE)
        loss.backward()
        self.opt.step()

        mask = None
        if has_teacher:
            if self.args.bufs > 0:
                # Keep only the samples on which student and teacher predict
                # the same class within the current task.
                _, teacher_max_indices = teacher_output[:, mid:end].max(1)
                _, outputs_max_indices = output[:batch_size, mid:end].max(1)
                mask = teacher_max_indices == outputs_max_indices

                n_kept = int(mask.sum())
                if n_kept > 0:
                    # The task-label tensor is created directly at the
                    # selected size: indexing a freshly created CPU tensor
                    # with a mask living on another device would fail.
                    self.buffer.add_data(examples=not_aug_inputs[mask],
                                         labels=labels[mask],
                                         logits=teacher_output[mask],
                                         is_noise=noise[mask] if noise is not None else None,
                                         task_labels=torch.full((n_kept,), task_id_nominal))
            else:
                self.buffer.add_data(examples=not_aug_inputs,
                                     labels=labels,
                                     logits=teacher_output,
                                     is_noise=noise,
                                     task_labels=torch.full((len(not_aug_inputs),), task_id_nominal))
        else:
            self.buffer.add_data(examples=not_aug_inputs,
                                 labels=labels,
                                 is_noise=noise,
                                 task_labels=torch.full((len(not_aug_inputs),), task_id_nominal))

        if mask is not None:
            stats = {
                "mask": mask.sum().item(),
                "noise": 0,
                "i_noise": 0
            }
            if noise is not None:
                stats['noise'] = noise.sum().item()
                stats['i_noise'] = noise[mask].sum().item()
        else:
            stats = {}
        return loss.item(), 0, stats, 0, 0
