import torch.nn as nn
import torch
import numpy as np
import torch.nn.functional as F
import utils.utils
import wandb  


def compute_loss(cls_criterion, preds, targets, score_list=None, lambda_sqrt_inv_list=None, kl_list=None, eta_ksvd=1., eta_kl=1.):
    """Compute the classification loss, optionally with KSVD and KL regularizers.

    Parameters:
        cls_criterion (callable): Called as ``cls_criterion(preds, targets)``.
        preds: Predictions forwarded to ``cls_criterion``.
        targets: Targets forwarded to ``cls_criterion``.
        score_list (sequence, optional): One entry per layer; each entry is
            indexed ``[0]``..``[3]`` to obtain four score tensors. When None,
            only the classification loss is computed.
        lambda_sqrt_inv_list (sequence, optional): Per-layer tensors that are
            contracted with the first two scores of the matching layer.
        kl_list (sequence, optional): Per-layer KL terms, summed as-is.
        eta_ksvd (float): Weight of the accumulated KSVD term.
        eta_kl (float): Weight of the accumulated KL term.

    Returns:
        The classification loss alone when ``score_list`` is None, otherwise
        the tuple ``(loss_total, loss_ce, loss_ksvd, loss_kl)``.
    """
    ce_term = cls_criterion(preds, targets)

    # Plain classification: there are no extra terms to accumulate.
    if score_list is None:
        return ce_term

    def _half_mean_sq_norm(scores, scaling):
        # Contract the scores with the scaling tensor, then take half of the
        # mean squared L2 norm along the last axis.
        projected = torch.einsum('...nd,...ds->...ns', scores, scaling)
        return torch.mean(projected.norm(dim=-1, p=2)**2)/2

    ksvd_term = 0
    kl_term = 0
    for layer, layer_scores in enumerate(score_list):
        scaling = lambda_sqrt_inv_list[layer].unsqueeze(0)
        e_part = _half_mean_sq_norm(layer_scores[0], scaling)
        r_part = _half_mean_sq_norm(layer_scores[1], scaling)

        # Trace of the leading-axis mean of the product of the last two scores.
        left = layer_scores[2]
        right = layer_scores[3].type_as(left)
        trace_part = torch.einsum('...ps,...pd->...sd', left, right).mean(dim=0).trace()

        # The per-layer KSVD objective enters the loss squared.
        ksvd_term = ksvd_term + (e_part + r_part - trace_part) ** 2
        kl_term = kl_term + kl_list[layer]

    combined = ce_term + eta_ksvd * ksvd_term + eta_kl * kl_term
    return combined, ce_term, ksvd_term, kl_term


def train(train_loader, net, optimizer, epoch, logger, args):
    """Run one training epoch of ``net`` and log the resulting metrics.

    How the loss is obtained depends on ``args.attn_type``:

    * ``"softmax"``: cross-entropy on ``net(inputs)``.
    * ``"kep_svgp"``: ``compute_loss`` on the four outputs of ``net(inputs)``,
      weighted by ``args.eta_ksvd`` and ``args.eta_kl``.
    * ``"sgpa"``, ``"cgpt"``, ``"scgpt"``: ``net.loss(inputs, targets,
      anneal_kl=...)`` with annealing factor
      ``min(1.0, epoch * 2 / args.nb_epochs)``.

    Every 100 batches the running averages of the last window are written to
    ``logger`` and the window is reset. At the end of the epoch the averages
    over the *whole* epoch are sent to wandb under ``Train/<metric>``.

    Parameters:
        train_loader (iterable): Yields ``(inputs, targets)`` batches.
        net (nn.Module): Model being trained.
        optimizer (Optimizer): Optimizer stepping ``net``'s parameters.
        epoch (int): Current epoch number (also used as the wandb step).
        logger (Logger): Receives the textual progress messages.
        args (Namespace): Must provide ``attn_type`` plus the fields listed
            above for the selected attention type.

    Raises:
        ValueError: If ``args.attn_type`` is not one of the supported values.
    """
    net.train()

    # define criterion
    cls_criterion = nn.CrossEntropyLoss()

    attn_type = args.attn_type
    loss_only_types = ("sgpa", "cgpt", "scgpt")
    if attn_type == "softmax":
        meter_names = ('Top1 Acc.', 'Tot. Loss', 'LR')
    elif attn_type == "kep_svgp":
        meter_names = ('Top1 Acc.', 'CE Loss', 'KSVD Loss', 'KL Loss', 'Tot. Loss', 'LR')
    elif attn_type in loss_only_types:
        meter_names = ('Tot. Loss', 'LR')
    else:
        # Fail early with a clear message instead of an unbound-local error
        # in the middle of the first batch.
        raise ValueError("Unsupported attn_type: {!r}".format(attn_type))

    # window_log is reset every 100 batches for the console output;
    # epoch_log is never reset so the wandb summary covers the whole epoch.
    window_log = {name: utils.utils.AverageMeter() for name in meter_names}
    epoch_log = {name: utils.utils.AverageMeter() for name in meter_names}

    msg = '####### --- Training Epoch {:d} --- #######'.format(epoch)
    logger.info(msg)

    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()
        batch_size = inputs.size(0)

        optimizer.zero_grad()
        logits = None
        extra_stats = {}
        if attn_type in loss_only_types:
            loss = net.loss(inputs, targets, anneal_kl=min(1.0, epoch * 2 / args.nb_epochs))
        elif attn_type == "softmax":
            logits = net(inputs)
            loss = compute_loss(cls_criterion, logits, targets)
        else:  # kep_svgp
            outs = net(inputs)
            logits = outs[0]
            loss, loss_ce, loss_ksvd, loss_kl = compute_loss(
                cls_criterion, logits, targets, outs[1], outs[2], outs[3], args.eta_ksvd, args.eta_kl
            )
            extra_stats = {
                'CE Loss': loss_ce.item(),
                'KSVD Loss': loss_ksvd.item(),
                'KL Loss': loss_kl.item(),
            }

        loss.backward()
        optimizer.step()

        batch_stats = {'Tot. Loss': loss.item(), 'LR': optimizer.param_groups[0]["lr"]}
        if logits is not None:
            # Accuracy is only tracked for the types that expose logits.
            prec, _ = utils.utils.accuracy(logits, targets)
            batch_stats['Top1 Acc.'] = prec.item()
        batch_stats.update(extra_stats)

        for name, value in batch_stats.items():
            window_log[name].update(value, batch_size)
            epoch_log[name].update(value, batch_size)

        if i % 100 == 99:
            log = ['LR : {:.5f}'.format(window_log['LR'].avg)] + [
                key + ': {:.2f}'.format(window_log[key].avg) for key in window_log if key != 'LR'
            ]
            msg = 'Epoch {:d} \t Batch {:d}\t'.format(epoch, i) + '\t'.join(log)
            logger.info(msg)
            for key in window_log:
                window_log[key] = utils.utils.AverageMeter()

    # Report whole-epoch averages; the windowed meters may have just been reset.
    wandb.log({f"Train/{key}": epoch_log[key].avg for key in epoch_log}, step=epoch)

def compute_loss_diffusion(args, mse_criterion, means_from_diffusion, means_x_minus, stds_from_diffusion, covariances_x_minus):
    """Average the layer-wise criterion losses for the mean and the spread outputs.

    Each list of predictions is paired element-wise with its reference list;
    ``mse_criterion`` is evaluated per pair and the results are averaged over
    the number of predictions.

    Parameters:
        args: Unused; kept so existing callers do not need to change.
        mse_criterion (callable): Called as ``mse_criterion(pred, ref)``.
        means_from_diffusion (list): Per-layer predicted means.
        means_x_minus (list): Per-layer reference means.
        stds_from_diffusion (list): Per-layer predicted spreads.
        covariances_x_minus (list): Per-layer reference spreads.

    Returns:
        tuple: ``(means_mse, stds_mse)``. Each entry is the averaged loss for
        its pair of lists, or the plain integer ``0`` when the prediction list
        is empty (so callers must not assume a tensor in that case).
    """
    def _averaged_loss(predictions, references):
        # Divide each term (rather than the final sum) by the layer count to
        # keep the accumulation order of the original implementation.
        layer_count = len(predictions)
        total = 0
        for predicted, reference in zip(predictions, references):
            total = total + mse_criterion(predicted, reference) / layer_count
        return total

    means_mse = _averaged_loss(means_from_diffusion, means_x_minus)
    stds_mse = _averaged_loss(stds_from_diffusion, covariances_x_minus)

    return means_mse, stds_mse

def train_diffusion(train_loader, diffusion_model, optimizer, epoch, logger, args, vit_model):
    """Train the diffusion model for one epoch against a frozen ViT.

    For every batch the loss is
    ``args.lambda_mean * means_loss + args.lambda_var * stds_loss
    + args.lambda_ce * ce_loss`` where ``ce_loss`` is the cross-entropy of
    ``diffusion_model(inputs)`` and the other two terms come from
    ``compute_loss_diffusion`` applied to the outputs of
    ``diffusion_model(x_t, train=True)`` and the ViT's per-layer outputs.

    Every 100 batches the running averages of the last window are written to
    ``logger`` and the window is reset. At the end of the epoch the averages
    over the *whole* epoch are sent to wandb under ``Train/<metric>``.

    Parameters:
        train_loader (DataLoader): Training data loader.
        diffusion_model (nn.Module): Diffusion model to be trained.
        optimizer (Optimizer): Optimizer for the Diffusion model.
        epoch (int): Current epoch number (also used as the wandb step).
        logger (Logger): Logger for logging information.
        args (Namespace): Must provide ``lambda_mean``, ``lambda_var`` and
            ``lambda_ce``.
        vit_model (nn.Module): ViT whose parameters are frozen here; its
            forward pass must return four values, of which the last three
            are used.
    """
    diffusion_model.train()
    vit_model.eval()  # Ensure ViT is in evaluation mode

    # Freeze the ViT; only the diffusion model is optimized in this stage.
    for param in vit_model.parameters():
        param.requires_grad = False

    # Define loss functions
    mse_criterion = nn.MSELoss()
    ce_criterion = nn.CrossEntropyLoss()

    def _as_scalar(value):
        # compute_loss_diffusion returns a plain number when it receives no
        # layers, so only call .item() on real tensors.
        return value.item() if torch.is_tensor(value) else value

    # window_log is reset every 100 batches for the console output;
    # epoch_log is never reset so the wandb summary covers the whole epoch.
    meter_names = ('CE Loss', 'Mean Loss', 'Var Loss', 'Tot. Loss', 'LR')
    window_log = {name: utils.utils.AverageMeter() for name in meter_names}
    epoch_log = {name: utils.utils.AverageMeter() for name in meter_names}

    msg = '####### --- Training Epoch {:d} --- #######'.format(epoch)
    logger.info(msg)

    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()
        batch_size = inputs.size(0)
        optimizer.zero_grad()
        output = diffusion_model(inputs)

        ce_loss = ce_criterion(output, targets)
        # The ViT only provides targets, so no graph is needed for it.
        with torch.no_grad():
            _, x_t_from_ViT, means_x_minus, covariances_x_minus = vit_model(inputs)
        means_from_diffusion, stds_from_diffusion = diffusion_model(x_t_from_ViT, train=True)
        means_loss, stds_loss = compute_loss_diffusion(
            args, mse_criterion, means_from_diffusion, means_x_minus, stds_from_diffusion, covariances_x_minus
        )
        loss = args.lambda_mean*means_loss + args.lambda_var*stds_loss + args.lambda_ce*ce_loss
        loss.backward()
        optimizer.step()

        batch_stats = {
            'CE Loss': ce_loss.item(),
            'Mean Loss': _as_scalar(means_loss),
            'Var Loss': _as_scalar(stds_loss),
            'Tot. Loss': loss.item(),
            'LR': optimizer.param_groups[0]["lr"],
        }
        for name, value in batch_stats.items():
            window_log[name].update(value, batch_size)
            epoch_log[name].update(value, batch_size)

        if i % 100 == 99:
            log = ['LR : {:.5f}'.format(window_log['LR'].avg)] + [
                key + ': {:.2f}'.format(window_log[key].avg) for key in window_log if key != 'LR'
            ]
            msg = 'Epoch {:d} \t Batch {:d}\t'.format(epoch, i) + '\t'.join(log)
            logger.info(msg)
            for key in window_log:
                window_log[key] = utils.utils.AverageMeter()

    # Report whole-epoch averages; the windowed meters may have just been reset.
    wandb.log({f"Train/{key}": epoch_log[key].avg for key in epoch_log}, step=epoch)
    
def train_distillation(train_loader, diffusion_model, optimizer, epoch, logger, args, vit_model):
    """Train the diffusion model for one epoch by distilling from a frozen ViT.

    ``diffusion_model`` acts as the student and ``vit_model`` as the teacher.
    For every batch the loss is
    ``args.lambda_mean * soft_targets_loss + args.lambda_ce * ce_loss`` where
    ``ce_loss`` is the cross-entropy of the student logits and
    ``soft_targets_loss`` is the batch-mean KL divergence between the
    temperature-softened teacher and student distributions, scaled by
    ``args.temperature ** 2``.

    Every 100 batches the running averages of the last window are written to
    ``logger`` and the window is reset. At the end of the epoch the averages
    over the *whole* epoch are sent to wandb under ``Train/<metric>``.

    Parameters:
        train_loader (DataLoader): Training data loader.
        diffusion_model (nn.Module): Student model to be trained.
        optimizer (Optimizer): Optimizer for the student model.
        epoch (int): Current epoch number (also used as the wandb step).
        logger (Logger): Logger for logging information.
        args (Namespace): Must provide ``temperature``, ``lambda_mean`` (weight
            of the soft-target term) and ``lambda_ce``.
        vit_model (nn.Module): Teacher model; its parameters are frozen here
            and its forward output is used directly as logits.
    """
    diffusion_model.train()
    vit_model.eval()  # Ensure the teacher is in evaluation mode

    # Freeze the teacher; only the student is optimized.
    for param in vit_model.parameters():
        param.requires_grad = False

    # Define loss function
    ce_criterion = nn.CrossEntropyLoss()

    # The temperature does not change during the epoch.
    T = args.temperature

    # window_log is reset every 100 batches for the console output;
    # epoch_log is never reset so the wandb summary covers the whole epoch.
    meter_names = ('CE Loss', 'Soft Targets Loss', 'Tot. Loss', 'LR')
    window_log = {name: utils.utils.AverageMeter() for name in meter_names}
    epoch_log = {name: utils.utils.AverageMeter() for name in meter_names}

    msg = '####### --- Training Epoch {:d} --- #######'.format(epoch)
    logger.info(msg)

    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()
        batch_size = inputs.size(0)
        optimizer.zero_grad()

        student_logits = diffusion_model(inputs)

        ce_loss = ce_criterion(student_logits, targets)
        # The teacher only provides targets, so no graph is needed for it.
        with torch.no_grad():
            teacher_logits = vit_model(inputs)

        soft_targets = F.softmax(teacher_logits / T, dim=-1)
        log_soft_prob = F.log_softmax(student_logits / T, dim=-1)
        # Multiply by T**2 to undo the gradient shrinkage caused by dividing
        # the logits by T.
        soft_targets_loss = F.kl_div(log_soft_prob, soft_targets, reduction='batchmean') * (T**2)
        loss = args.lambda_mean * soft_targets_loss + args.lambda_ce * ce_loss
        loss.backward()
        optimizer.step()

        batch_stats = {
            'CE Loss': ce_loss.item(),
            'Soft Targets Loss': soft_targets_loss.item(),
            'Tot. Loss': loss.item(),
            'LR': optimizer.param_groups[0]["lr"],
        }
        for name, value in batch_stats.items():
            window_log[name].update(value, batch_size)
            epoch_log[name].update(value, batch_size)

        if i % 100 == 99:
            log = ['LR : {:.5f}'.format(window_log['LR'].avg)] + [
                key + ': {:.2f}'.format(window_log[key].avg) for key in window_log if key != 'LR'
            ]
            msg = 'Epoch {:d} \t Batch {:d}\t'.format(epoch, i) + '\t'.join(log)
            logger.info(msg)
            for key in window_log:
                window_log[key] = utils.utils.AverageMeter()

    # Report whole-epoch averages; the windowed meters may have just been reset.
    wandb.log({f"Train/{key}": epoch_log[key].avg for key in epoch_log}, step=epoch)

def train_diffusion_stage2(train_loader, diffusion_model, optimizer, epoch, logger, args, vit_model):
    """Train only ``vit_model.fc`` on top of a frozen diffusion model.

    Every parameter of ``diffusion_model`` and ``vit_model`` has
    ``requires_grad`` disabled, after which the parameters of ``vit_model.fc``
    are re-enabled. Each batch is passed through ``vit_model._to_words``,
    ``vit_model.emb``, the addition of ``vit_model.pos_emb``,
    ``diffusion_model`` and finally ``vit_model.fc`` on the mean over axis 1;
    the result is trained with cross-entropy.

    Parameters:
        train_loader (iterable): Yields ``(inputs, targets)`` batches.
        diffusion_model (nn.Module): Put in eval mode and fully frozen.
        optimizer (Optimizer): Optimizer that is stepped once per batch.
        epoch (int): Current epoch number, used in the log messages.
        logger (Logger): Receives the textual progress messages.
        args (Namespace): Not used in the visible part of this function.
        vit_model (nn.Module): Put in train mode; must expose ``_to_words``,
            ``emb``, ``pos_emb`` and ``fc``.
    """
    diffusion_model.eval()
    vit_model.train()
    # Freeze everything in both models ...
    for param in diffusion_model.parameters():
        param.requires_grad = False

    for param in vit_model.parameters():
        param.requires_grad = False

    # ... then re-enable gradients for the ViT's `fc` module only.
    for param in vit_model.fc.parameters():
        param.requires_grad = True
    
    # Define loss function
    ce_criterion = nn.CrossEntropyLoss()

    # Initialize training logs
    train_log = {
        'Top1 Acc.': utils.utils.AverageMeter(),
        'Tot. Loss': utils.utils.AverageMeter(),
        'LR': utils.utils.AverageMeter(),
    }

    msg = '####### --- Training Epoch {:d} --- #######'.format(epoch)

    logger.info(msg)

    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()

        optimizer.zero_grad()
        # Re-create the ViT input pipeline by hand so the diffusion model can
        # be inserted between the embedding and `fc`.
        # NOTE(review): presumably `_to_words` splits the input into patches — confirm.
        output = vit_model._to_words(inputs)
        output = vit_model.emb(output)
        output = output + vit_model.pos_emb
        output = diffusion_model(output)
        # Average over axis 1 before applying `fc`.
        output = vit_model.fc(output.mean(1))

        loss = ce_criterion(output, targets)
        loss.backward()
        optimizer.step()

        prec, _ = utils.utils.accuracy(output, targets)

        # Read the learning rate of the first parameter group only.
        for param_group in optimizer.param_groups:
            lr = param_group["lr"]
            break

        train_log['Tot. Loss'].update(loss.item(), inputs.size(0)
        )
        train_log['Top1 Acc.'].update(prec.item(), inputs.size(0))
        train_log['LR'].update(lr, inputs.size(0))

        # Every 100 batches: emit the running averages, then start a new window.
        if i % 100 == 99:
            log = ['LR : {:.5f}'.format(train_log['LR'].avg)] + [
                key + ': {:.2f}'.format(train_log[key].avg) for key in train_log if key != 'LR'
            ]
            msg = 'Epoch {:d} \t Batch {:d}\t'.format(epoch, i) + '\t'.join(log)
            logger.info(msg)
            for key in train_log:
                train_log[key] = utils.utils.AverageMeter()

    