import copy
from sklearn.metrics import confusion_matrix
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
from typing import TypeVar
import logging

import logging
import time
import datetime
import gc

from tqdm import tqdm

from methods._trainer import _Trainer
from utils.train_utils import select_optimizer, select_scheduler
from utils.memory import MemoryBatchSampler
from torch.utils.data import DataLoader

logger = logging.getLogger()
T = TypeVar('T', bound='nn.Module')


class CLIP_MVP(_Trainer):

    def __init__(self, **kwargs):
        super(CLIP_MVP, self).__init__(**kwargs)

        self.use_mask = kwargs.get("use_mask")
        self.use_contrastiv = kwargs.get("use_contrastiv")
        self.use_last_layer = kwargs.get("use_last_layer")
        self.use_afs = kwargs.get("use_afs")
        self.use_gsf = kwargs.get("use_gsf")

        self.alpha = kwargs.get("alpha")
        self.gamma = kwargs.get("gamma")
        self.margin = kwargs.get("margin")

        self.batch_exposed_classes = []
        self.batch_exposed_classes_names = []
        self.visible_classes = self.args.get('visible_classes', 'batch')

    def setup_dataset(self):
        super().setup_dataset()
        self.mask = torch.zeros(self.n_classes, device=self.device) - torch.inf

    def online_step(self, images, labels, idx):
        self.add_new_class(labels)
        self.model.set_exposed_classes(self.exposed_classes_names)

        self.memory_sampler = MemoryBatchSampler(
            self.memory, self.memory_batchsize,
            self.temp_batchsize * self.online_iter * self.world_size)
        self.memory_dataloader = DataLoader(self.train_dataset,
                                            batch_size=self.memory_batchsize,
                                            sampler=self.memory_sampler,
                                            num_workers=4)
        self.memory_provider = iter(self.memory_dataloader)

        _loss, _acc, _iter = 0.0, 0.0, 0
        for _ in range(int(self.online_iter)):
            loss, acc = self.online_train([images.clone(), labels.clone()])
            _loss += loss
            _acc += acc
            _iter += 1
        self.update_memory(idx, labels)
        del (images, labels)
        gc.collect()
        return _loss / _iter, _acc / _iter

    def online_train(self, data):
        self.model.train()
        total_loss, total_correct, total_num_data = 0.0, 0.0, 0.0

        if self.visible_classes == 'batch':
            # batch
            train_class_list = self.batch_exposed_classes
            train_class_name_list = self.batch_exposed_classes_names
        else:
            # all
            train_class_list = self.exposed_classes
            train_class_name_list = self.exposed_classes_names

        x, y = data

        if len(self.memory) > 0 and self.memory_batchsize > 0:
            memory_images, memory_labels = next(self.memory_provider)
            for i in range(len(memory_labels)):
                memory_labels[i] = train_class_list.index(
                    memory_labels[i].item())
            x = torch.cat([x, memory_images], dim=0)
            y = torch.cat([y, memory_labels], dim=0)

        for j in range(len(y)):
            y[j] = train_class_list.index(y[j].item())

        x = x.to(self.device)
        y = y.to(self.device)

        self.optimizer.zero_grad()

        text_tokens = self.model.labels_tokenize(train_class_name_list)
        with torch.cuda.amp.autocast(enabled=self.use_amp):
            image_feature, text_features, mask = self.model.forward_features(
                x, text_tokens)
            mask = mask[:, y.unique(
            )] if self.visible_classes == 'batch' else mask
            logit = self.model.forward_head(image_feature, text_features)
            if self.use_mask:
                logit = logit * mask
            if self.visible_classes == 'batch':
                logit = logit + self.mask[y.unique()]
            else:
                logit = logit + self.mask[:len(self.exposed_classes)]
            loss = self.loss_fn(image_feature, text_features, mask, y)
        _, preds = logit.topk(self.topk, 1, True, True)

        self.optimizer.zero_grad()
        self.scaler.scale(loss).backward()
        self.scaler.step(self.optimizer)
        self.scaler.update()
        self.update_schedule()

        total_loss += loss.item()
        total_correct += torch.sum(preds == y.unsqueeze(1)).item()
        total_num_data += y.size(0)

        return total_loss, total_correct / total_num_data

    def online_evaluate(self, test_loader):
        total_correct, total_num_data, total_loss = 0.0, 0.0, 0.0
        correct_l = torch.zeros(self.n_classes)
        num_data_l = torch.zeros(self.n_classes)
        label = []
        pred_list = []
        self.model.eval()
        with torch.no_grad():
            for i, data in enumerate(test_loader):
                x, y = data
                for j in range(len(y)):
                    y[j] = self.exposed_classes.index(y[j].item())

                x = x.to(self.device)
                y = y.to(self.device)

                logit = self.model(x)
                logit = logit + self.mask[:len(self.exposed_classes)]
                loss = F.cross_entropy(logit, y)
                pred = torch.argmax(logit, dim=-1)
                _, preds = logit.topk(self.topk, 1, True, True)
                total_correct += torch.sum(preds == y.unsqueeze(1)).item()
                total_num_data += y.size(0)

                xlabel_cnt, correct_xlabel_cnt = self._interpret_pred(y, pred)
                correct_l += correct_xlabel_cnt.detach().cpu()
                num_data_l += xlabel_cnt.detach().cpu()

                total_loss += loss.mean().item()
                label += y.tolist()
                pred_list += pred.tolist()

        avg_acc = total_correct / total_num_data
        avg_loss = total_loss / len(test_loader)
        cls_acc = (correct_l / (num_data_l + 1e-5)).numpy().tolist()
        cm = confusion_matrix(label, pred_list)

        eval_dict = {
            "avg_loss": avg_loss,
            "avg_acc": avg_acc,
            "cls_acc": cls_acc,
            "confusion_matrix": cm.tolist()
        }
        return eval_dict

    def update_schedule(self, reset=False):
        if reset:
            self.scheduler = select_scheduler(self.sched_name, self.optimizer,
                                              self.lr_gamma)
            for param_group in self.optimizer.param_groups:
                param_group["lr"] = self.lr
        else:
            self.scheduler.step()

    def online_before_task(self, task_id):
        logger.info("Total parameters:\t{}".format(
            sum(p.numel() for p in self.model.parameters())))
        logger.info("Trainable parameters:\t{}".format(
            sum(p.numel() for p in self.model.parameters()
                if p.requires_grad)))

    def online_after_task(self, cur_iter):
        pass

    def reset_opt(self):
        self.optimizer = select_optimizer(self.opt_name, self.lr, self.model,
                                          True)
        self.scheduler = select_scheduler(self.sched_name, self.optimizer,
                                          self.lr_gamma)

    def _compute_grads(self, i_feat, t_feat, y, mask):
        t_feat = t_feat / t_feat.norm(dim=-1, keepdim=True)
        i_feat = i_feat / i_feat.norm(dim=-1, keepdim=True)
        t_feat.requires_grad = True
        i_feat.requires_grad = True
        logit_scale = self.model.backbone.logit_scale.clone().detach().exp()
        logit = logit_scale * i_feat @ t_feat.t()

        if self.use_mask:
            logit = logit * mask.clone().detach()

        if self.visible_classes == 'batch':
            logit = logit + self.mask[y.unique()]
        else:
            logit = logit + self.mask[:len(self.exposed_classes)]

        sample_loss = F.cross_entropy(logit, y, reduction='none')
        sample_grad = []
        for idx in range(len(y)):
            sample_loss[idx].backward(retain_graph=True)
            _g = t_feat.grad[y[idx]].clone()
            sample_grad.append(_g)
            t_feat.grad = torch.zeros_like(t_feat.grad)
            i_feat.grad = torch.zeros_like(i_feat.grad)
        sample_grad = torch.stack(sample_grad)  #B,dim

        t_feat.grad = torch.zeros_like(t_feat.grad)
        i_feat.grad = torch.zeros_like(i_feat.grad)
        batch_loss = F.cross_entropy(logit, y, reduction='mean')
        batch_loss.backward(retain_graph=True)
        total_batch_grad = t_feat.grad.clone()  # C,dim
        idx = torch.arange(len(y))
        batch_grad = total_batch_grad[y[idx]]  #B,dim

        return sample_grad, batch_grad

    def _get_ignore(self, sample_grad, batch_grad):
        ign_score = (
            1. - torch.cosine_similarity(sample_grad, batch_grad, dim=1))  #B
        return ign_score

    def _get_compensation(self, y, i_feat, t_feat):
        cps_score = (1. - torch.cosine_similarity(t_feat[y], i_feat, dim=1) +
                     self.margin)  #B
        return cps_score

    def _get_score(self, i_feat, t_feat, y, mask):
        sample_grad, batch_grad = self._compute_grads(i_feat, t_feat, y, mask)
        ign_score = self._get_ignore(sample_grad, batch_grad)
        cps_score = self._get_compensation(y, i_feat, t_feat)
        return ign_score, cps_score

    def loss_fn(self, image_feature, text_features, mask, y):
        ign_score, cps_score = self._get_score(
            image_feature.clone().detach(),
            text_features.clone().detach(),
            y,
            mask,
        )

        if self.use_afs:
            logit = self.model.forward_head(
                image_feature / (cps_score.unsqueeze(1)), text_features)
        else:
            logit = self.model.forward_head(image_feature, text_features)
        if self.use_mask:
            logit = logit * mask
        if self.visible_classes == 'batch':
            logit = logit + self.mask[y.unique()]
        else:
            logit = logit + self.mask[:len(self.exposed_classes)]
        log_p = F.log_softmax(logit, dim=1)
        loss = F.nll_loss(log_p, y)
        if self.use_gsf:
            loss = (1 - self.alpha) * loss + self.alpha * (ign_score**
                                                           self.gamma) * loss
        return loss.mean() + self.model.get_similarity_loss()

    def report_training(self, sample_num, train_loss, train_acc):
        print(
            f"Train | Sample # {sample_num} | train_loss {train_loss:.4f} | train_acc {train_acc:.4f} | "
            f"lr {self.optimizer.param_groups[0]['lr']:.6f} | "
            f"Num_Classes {len(self.exposed_classes)} | "
            f"Num_Batch_Classes {len(self.batch_exposed_classes)} | "
            f"running_time {datetime.timedelta(seconds=int(time.time() - self.start_time))} | "
            f"ETA {datetime.timedelta(seconds=int((time.time() - self.start_time) * (self.total_samples-sample_num) / sample_num))} | "
            f"N_Prompts {self.model.e_prompts.size(0)} | "
            f"N_Exposed {len(self.exposed_classes)} | "
            f"Counts {self.model.count.to(torch.int64).tolist()}")

    def setup_distributed_model(self):
        super().setup_distributed_model()
        self.model.use_mask = self.use_mask
        self.model.use_contrastiv = self.use_contrastiv
        self.model.use_last_layer = self.use_last_layer

    def add_new_batch_class(self, class_name):
        batch_exposed_classes = []
        for label in class_name:
            if label.item() not in self.batch_exposed_classes:
                self.batch_exposed_classes.append(label.item())
        if self.distributed:
            batch_exposed_classes = torch.cat(
                self.all_gather(
                    torch.tensor(self.batch_exposed_classes,
                                 device=self.device))).cpu().tolist()
            self.batch_exposed_classes = []
            for cls in batch_exposed_classes:
                if cls not in self.batch_exposed_classes:
                    self.batch_exposed_classes.append(cls)
        self.batch_exposed_classes_names = [
            self.train_dataset.classes_names[i]
            for i in self.batch_exposed_classes
        ]

    def add_new_class(self, class_name):
        super().add_new_class(class_name)
        self.mask[:len(self.exposed_classes)] = 0
        self.batch_exposed_classes = []
        self.batch_exposed_classes_names = []
        if self.memory_size > 0:
            self.batch_exposed_classes = self.exposed_classes
            self.batch_exposed_classes_names = self.exposed_classes_names
        else:
            self.add_new_batch_class(class_name)

    def update_memory(self, sample, label):
        # Update memory
        if self.distributed:
            sample = torch.cat(self.all_gather(sample.to(self.device)))
            label = torch.cat(self.all_gather(label.to(self.device)))
            sample = sample.cpu()
            label = label.cpu()
        idx = []
        if self.is_main_process():
            for lbl in label:
                self.seen += 1
                if len(self.memory) < self.memory_size:
                    idx.append(-1)
                else:
                    j = torch.randint(0, self.seen, (1, )).item()
                    if j < self.memory_size:
                        idx.append(j)
                    else:
                        idx.append(self.memory_size)
        # Distribute idx to all processes
        if self.distributed:
            idx = torch.tensor(idx).to(self.device)
            size = torch.tensor([idx.size(0)]).to(self.device)
            dist.broadcast(size, 0)
            if dist.get_rank() != 0:
                idx = torch.zeros(size.item(),
                                  dtype=torch.long).to(self.device)
            dist.barrier()  # wait for all processes to reach this point
            dist.broadcast(idx, 0)
            idx = idx.cpu().tolist()
        # idx = torch.cat(self.all_gather(torch.tensor(idx).to(self.device))).cpu().tolist()
        for i, index in enumerate(idx):
            if len(self.memory) >= self.memory_size:
                if index < self.memory_size:
                    self.memory.replace_data(
                        [sample[i], self.exposed_classes[label[i].item()]],
                        index)
            else:
                self.memory.replace_data(
                    [sample[i], self.exposed_classes[label[i].item()]])

    @torch.no_grad()
    @torch.no_grad()
    def offline_evaluate(self, zs_test_loader, zs_classes, zs_meta, zsaf=True):
        result = []
        text_features = []
        total_correct, total_num_data = 0.0, 0.0
        self.model.eval()
        for idx in range(len(zs_meta)):
            zs_correct, zsaf_correct, zs_num_data = 0.0, 0.0, 0.0
            _test_loader = zs_test_loader[idx]
            _classes = zs_classes[idx]
            _meta = zs_meta[idx]

            # text_features
            _text_tokens = self.model.labels_tokenize(_classes).to(self.device)
            text_features.append(self.model.forward_text(_text_tokens))

            # image_feature
            for data in tqdm(_test_loader):
                x, y = data
                x = x.to(self.device)
                y = y.to(self.device)
                _image_features = self.model.forward_image(x)
                # zs
                _text_features = text_features[-1]
                logit = self.model.forward_head(_image_features,
                                                _text_features)
                pred = torch.argmax(logit, dim=-1)
                zs_correct += torch.sum(pred.squeeze() == y.squeeze()).item()
                zs_num_data += y.size(0)
                # zsaf
                if zsaf and idx > 0:
                    _text_features = torch.cat(
                        (text_features[0], text_features[-1]), 0)
                    logit = self.model.forward_head(_image_features,
                                                    _text_features)
                    pred = torch.argmax(logit, dim=-1)
                    offset = text_features[0].shape[0]
                    zsaf_correct += torch.sum(
                        pred.squeeze() == (y.squeeze() + offset)).item()
                # total
                _text_features = torch.cat(text_features, 0)
                logit = self.model.forward_head(_image_features,
                                                _text_features)
                pred = torch.argmax(logit, dim=-1)
                offset = _text_features.shape[0] - text_features[-1].shape[0]
                total_correct += torch.sum(pred.squeeze() == (y.squeeze() +
                                                              offset)).item()
                total_num_data += y.size(0)

            zs_acc = zs_correct / zs_num_data
            zsaf_acc = zsaf_correct / zs_num_data
            total_acc = total_correct / total_num_data
            result.append({
                'name': _meta[0],
                'zs_acc': zs_acc,
                'zsaf_acc': zsaf_acc,
                'total_acc': total_acc,
            })
            print(
                '{} | zs_acc: {:.2f}% | zsaf_acc: {:.2f}% | total_acc: {:.2f}% |'
                .format(_meta[0], zs_acc * 100, zsaf_acc * 100,
                        total_acc * 100))

        return result
