import os


os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import torch.nn.functional as F


import numpy as np


import time
import sys
sys.path.append('../')

from task_vectors import TaskVector
from eval import eval_single_dataset
from args import parse_arguments

from modeling import ImageClassifier
from heads import get_classification_head
from datasets.ImageNet import SequentialImagenet

from datasets.common import get_dataloader, maybe_dictionarize
from datasets.registry import get_dataset

import utils


def create_log_dir(path, filename='log.txt'):
    """Return a DEBUG-level logger writing to ``path/filename`` and to the console.

    The directory ``path`` is created when it does not exist. The logger is
    registered under the name ``path``, so calling this function again with the
    same ``path`` returns the same logger object. Handlers that are already
    attached (a file handler for the same file, or a console handler) are
    reused rather than added a second time, which would otherwise duplicate
    every emitted record.

    Args:
        path: Directory holding the log file; also used as the logger name.
        filename: Name of the log file inside ``path``.

    Returns:
        The configured ``logging.Logger``.
    """
    import logging

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)

    logger = logging.getLogger(path)
    logger.setLevel(logging.DEBUG)

    log_file = path + '/' + filename
    # FileHandler stores the absolute path in ``baseFilename``; compare against that.
    target = os.path.abspath(log_file)
    has_file_handler = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not has_file_handler:
        fh = logging.FileHandler(log_file)
        fh.setLevel(logging.DEBUG)
        logger.addHandler(fh)

    # FileHandler subclasses StreamHandler, so exclude it when looking for the
    # console handler.
    has_console_handler = any(
        isinstance(handler, logging.StreamHandler)
        and not isinstance(handler, logging.FileHandler)
        for handler in logger.handlers
    )
    if not has_console_handler:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        logger.addHandler(ch)

    return logger

def get_params(model):
    """Return all parameters of ``model`` concatenated into a single 1-D tensor.

    Parameters are flattened in the order produced by ``model.parameters()``.
    ``reshape(-1)`` is used instead of ``view(-1)`` so that non-contiguous
    parameters are flattened (in logical row-major order) rather than raising;
    for contiguous parameters the result is the same as with ``view(-1)``.

    Args:
        model: Object exposing ``parameters()`` that yields tensors.

    Returns:
        A 1-D tensor holding every parameter element.
    """
    flat_chunks = [param.reshape(-1) for param in model.parameters()]
    return torch.cat(flat_chunks)

def get_grads(model):
    """Return the gradients of all parameters of ``model`` as one 1-D tensor.

    Gradients are flattened in the order produced by ``model.parameters()``.
    A parameter whose ``.grad`` is ``None`` contributes a run of zeros with
    that parameter's element count, dtype and device, so the result always has
    one entry per parameter element.

    ``reshape(-1)`` is used instead of ``view(-1)`` so that non-contiguous
    gradients are flattened (in logical row-major order) rather than raising.

    Args:
        model: Object exposing ``parameters()`` that yields tensors.

    Returns:
        A 1-D tensor of gradients.
    """
    flat_chunks = []
    for param in model.parameters():
        if param.grad is None:
            # No gradient recorded for this parameter: pad with zeros so the
            # flat layout still lines up element-for-element with the parameters.
            flat_chunks.append(
                torch.zeros(param.numel(), dtype=param.dtype, device=param.device)
            )
        else:
            flat_chunks.append(param.grad.reshape(-1))

    return torch.cat(flat_chunks)


def _flatten_task_vector(task_vector, param_names):
    """Concatenate ``task_vector.vector[name]`` for ``param_names`` into one 1-D tensor."""
    return torch.cat([task_vector.vector[name].reshape(-1) for name in param_names])


def _mean_abs_sample_grad(args, pretrained_model, classification_head, dataloader, device):
    """Average the absolute per-sample gradient over at most ``args.exp_size`` samples.

    For each sample the cross-entropy loss of ``classification_head(pretrained_model(x))``
    is back-propagated on its own, and the absolute gradient with respect to the
    parameters of ``pretrained_model`` is accumulated. The mean is returned on CPU.
    """
    if args.exp_size <= 0:
        raise ValueError('args.exp_size must be positive, got {}'.format(args.exp_size))

    grad = torch.zeros_like(get_params(pretrained_model))
    seen = 0
    for data in dataloader:
        inputs, labels = data[0].to(device), data[1].to(device)

        # Trim against the remaining budget (not the total) so that exactly
        # args.exp_size samples are used even when it is not a multiple of the
        # batch size.
        remaining = args.exp_size - seen
        if inputs.shape[0] > remaining:
            inputs, labels = inputs[:remaining, ...], labels[:remaining]

        feat = pretrained_model(inputs)
        output = classification_head(feat)
        last_index = output.size(0) - 1
        for i in range(output.size(0)):
            pretrained_model.zero_grad()
            loss = F.cross_entropy(output[i, ...].unsqueeze(0), labels[i].unsqueeze(0))
            # The forward graph is shared by all samples of the batch, so it
            # must be retained until the last per-sample backward pass.
            loss.backward(retain_graph=i < last_index)
            grad += torch.abs(get_grads(pretrained_model).detach())

        seen += inputs.shape[0]
        if seen >= args.exp_size:
            break

    if seen == 0:
        # Dividing by zero would silently fill the scores with NaN.
        raise ValueError('dataloader yielded no samples; cannot estimate gradients')

    print('exp_size', seen)
    return (grad / seen).to('cpu')


def TATR_merging(args, task_vectors, pretrained_model, exam_datasets):
    """Mask task vectors with a gradient-based score and return their sum.

    Steps:
      1. For every dataset in ``exam_datasets`` (plus ``'ImageNet'`` when
         ``args.is_imageNet`` is true) estimate the mean absolute per-sample
         cross-entropy gradient with respect to the parameters of
         ``pretrained_model`` from at most ``args.exp_size`` samples.
      2. Accumulate the score ``Omega = sum_{i != j} grad_i * |task_vector_j|``
         where ``i`` indexes the datasets of step 1 and ``j`` indexes
         ``task_vectors`` (``task_vectors[k]`` is taken to belong to
         ``exam_datasets[k]``; the optional ImageNet gradient has no task
         vector of its own and is paired with every task vector).
      3. With ``num_top = int(args.ratio * Omega.numel())``, keep only entries
         whose score is strictly below the ``num_top``-th smallest score and
         zero all other entries in every task vector.
      4. Return ``sum(task_vectors)``.

    Side effects: ``requires_grad`` is set to ``True`` on every parameter of
    ``pretrained_model`` and the ``vector`` tensors of ``task_vectors`` are
    masked in place.

    Args:
        args: Namespace providing ``is_imageNet``, ``exp_size``, ``ratio``,
            ``data_location`` and ``batch_size`` (plus whatever the dataset and
            head helpers read). An optional ``imagenet_location`` attribute
            overrides the default ImageNet directory.
        task_vectors: Non-empty list of objects with a ``vector`` mapping from
            parameter name to tensor; must support ``sum()``.
        pretrained_model: Image encoder whose parameters define the layout of
            the flattened vectors.
        exam_datasets: List of dataset names.

    Returns:
        The sum of the masked task vectors.

    Raises:
        ValueError: If ``task_vectors`` is empty, ``args.exp_size`` is not
            positive, or a dataloader yields no samples.
    """
    if not task_vectors:
        raise ValueError('task_vectors must not be empty')

    if args.is_imageNet:
        ntk_dataset = exam_datasets + ['ImageNet']
    else:
        ntk_dataset = exam_datasets

    for pp in pretrained_model.parameters():
        pp.requires_grad = True

    named_params = list(pretrained_model.named_parameters())
    param_names = [name for name, _ in named_params]
    # Run on whichever device the encoder already lives on instead of assuming 'cuda'.
    device = next(pretrained_model.parameters()).device

    grad_arr = []
    for exam_dataset in ntk_dataset:
        classification_head = get_classification_head(args, exam_dataset).to(device)
        model = ImageClassifier(pretrained_model, classification_head)

        if exam_dataset == 'ImageNet':
            imagenet_root = getattr(args, 'imagenet_location', '/home/sunwenju/dataset/')
            dataset = SequentialImagenet(imagenet_root, 32)
            dataloader, _ = dataset.get_data_loaders()
        else:
            dataset = get_dataset(
                exam_dataset,
                model.val_preprocess,
                location=args.data_location,
                batch_size=args.batch_size
            )
            dataloader = get_dataloader(
                dataset, is_train=True, args=args, image_encoder=None)

        grad_arr.append(
            _mean_abs_sample_grad(args, pretrained_model, classification_head, dataloader, device)
        )

    # Iterate task vectors in the outer loop so each one is flattened only once.
    # Bounding j by len(task_vectors) also keeps the extra ImageNet gradient
    # from indexing past the end of task_vectors.
    Omega = None
    for j, task_vector in enumerate(task_vectors):
        abs_task_vector = torch.abs(_flatten_task_vector(task_vector, param_names))
        if Omega is None:
            Omega = torch.zeros_like(abs_task_vector)
        for i, grad in enumerate(grad_arr):
            if i != j:
                Omega += grad * abs_task_vector

    num_elements = Omega.numel()
    num_top = min(int(args.ratio * num_elements), num_elements)
    if num_top < 1:
        # Nothing is requested to be kept (ratio <= 0 or a tiny model).
        mask = torch.zeros_like(Omega, dtype=torch.int)
    else:
        # The num_top-th smallest score equals element ``num_elements - num_top``
        # of the descending sort, without sorting or materialising indices.
        threshold = torch.kthvalue(Omega, num_top).values
        # NOTE(review): the strict '<' drops the threshold element and its ties,
        # so at most num_top - 1 entries survive; kept as in the original.
        mask = (Omega < threshold).int()

    # Slice the flat mask into per-parameter views once, then reuse them for
    # every task vector.
    param_masks = {}
    offset = 0
    for name, pp in named_params:
        count = pp.numel()
        param_masks[name] = mask[offset:offset + count].view(pp.size())
        offset += count

    for task_vector in task_vectors:
        for name, param_mask in param_masks.items():
            task_vector.vector[name] *= param_mask

    task_vector_sum = sum(task_vectors)
    return task_vector_sum



# Tasks whose fine-tuned checkpoints are merged and then evaluated.
exam_datasets = ['SUN397', 'Cars', 'RESISC45', 'EuroSAT', 'SVHN', 'GTSRB', 'MNIST', 'DTD']  # SUN397 | Cars | RESISC45 | EuroSAT | SVHN | GTSRB | MNIST | DTD
test_datasets = list(exam_datasets)  # same eight tasks

# Backbone selection (alternative: 'ViT-B-32').
model = 'ViT-L-14'
checkpoint_dir = '../checkpoints/checkpoints/' + model
pretrained_checkpoint = checkpoint_dir + '/zeroshot.pt'

args = parse_arguments()
args.data_location = '../data'
args.model = model
args.device = 'cuda'
args.save = checkpoint_dir
args.logs_path = '../logs/' + model

args.ratio = 0.99  # opposite to the paper
args.exp_size = 128  # overwritten per run in the loop below
args.is_imageNet = False

# Settings for 'ViT-L-14' (the scale factor noted for 'Vit-B-32' was 0.3).
args.scale_factor = 0.35
args.batch_size = 8

str_time_ = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log = create_log_dir(args.logs_path, 'log_{}_task_arithmetic.txt'.format(str_time_))

# Five repetitions for each number of samples used to estimate the gradients.
for conf in [size for size in (1, 2, 4, 16) for _ in range(5)]:
    # Reload everything on each run: TATR_merging masks the task vectors in
    # place and flips requires_grad on the encoder.
    task_vectors = []
    for task_name in exam_datasets:
        finetuned_checkpoint = checkpoint_dir + '/' + task_name + '/finetuned.pt'
        task_vectors.append(TaskVector(pretrained_checkpoint, finetuned_checkpoint))
    pretrained_model = torch.load(pretrained_checkpoint).to('cuda')

    print('################################################################')
    print('######################### Merging :', conf, ' ##############################')
    print('################################################################')

    args.exp_size = conf
    task_vector_sum = TATR_merging(args, task_vectors, pretrained_model, exam_datasets)

    # ---------------------------- Testing ----------------------------
    scaling_coef_ = args.scale_factor
    image_encoder = task_vector_sum.apply_to(pretrained_checkpoint, scaling_coef=scaling_coef_)
    banner = '*' * 20
    log.info(banner + 'scaling_coef:' + str(scaling_coef_) + banner)

    accs = []
    for dataset in exam_datasets:
        metrics = eval_single_dataset(image_encoder, dataset, args)
        top1_percent = metrics.get('top1') * 100
        log.info(str(dataset) + ':' + str(top1_percent) + '%')
        accs.append(top1_percent)
    log.info('Avg ACC:' + str(np.mean(accs)) + '%')





