"""
the general training framework
"""

from __future__ import print_function

import sys
from abc import ABC


def eprint(*args, **kwargs):
    """Print to standard error.

    Accepts the same positional and keyword arguments as the built-in
    ``print``, except ``file``, which is always ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)

import os
import argparse
import socket
import time
import datetime

import tensorboard_logger as tb_logger
import torch
import torch.optim as optim
import torch.nn as nn
import torch.backends.cudnn as cudnn

# Enable cuDNN benchmark mode; this runs at import time, before any model is built.
cudnn.benchmark = True
from models.cifar import model_dict_teacher
from models.cifar import model_dict_student
from models.cifar import model_channels
from models.cifar.util import ConvReg, LinearEmbed
from models.cifar.util import Connector, Translator, Paraphraser

from models.imagenet import model_dict_imagenet_teacher, model_dict_imagenet_student
from models.imagenet import model_channels_imagenet

from dataset.cifar100 import get_cifar100_dataloaders, get_cifar100_dataloaders_sample
from dataset.imagenet import get_imagenet_dataloaders, get_imagenet_dataloaders_sample

from helper.util import adjust_learning_rate

from distiller_zoo import DistillKL, HintLoss, Attention, Similarity, Correlation, VIDLoss, RKDLoss
from distiller_zoo import PKT, ABLoss, FactorTransfer, KDSVD, FSP, NSTLoss
from crd.criterion import CRDLoss

from helper.loops import train_distill as train, validate
from helper.pretrain import init


def parse_option():
    """Parse command-line options and derive the run-specific settings.

    Besides the raw argparse values, the returned namespace carries:

    * ``lr_decay_epochs`` converted from a comma-separated string to a list
      of ints,
    * ``model_path`` / ``tb_path`` chosen from the host name,
    * ``model_t`` parsed from ``--path_t`` by ``get_teacher_name``,
    * ``model_name``, ``tb_folder`` and ``save_folder`` built from the options.

    Side effects: creates ``tb_folder`` and ``save_folder`` if they are missing.

    Returns:
        argparse.Namespace: the parsed and augmented options.
    """
    hostname = socket.gethostname()

    parser = argparse.ArgumentParser('argument for training')

    parser.add_argument('--print_freq', type=int, default=500, help='print frequency')
    parser.add_argument('--tb_freq', type=int, default=500, help='tb frequency')
    parser.add_argument('--save_freq', type=int, default=40, help='save frequency')
    parser.add_argument('--batch_size', type=int, default=64, help='batch_size')
    parser.add_argument('--num_workers', type=int, default=8, help='num of workers to use')
    parser.add_argument('--epochs', type=int, default=240, help='number of training epochs')
    parser.add_argument('--init_epochs', type=int, default=0, help='init training for two-stage methods')
    # The help text used to be a copy-paste of the --save_freq description.
    parser.add_argument('--gids', type=str, default='0,1,2,3,4,5,6,7',
                        help='comma-separated GPU ids to use')

    # optimization
    parser.add_argument('--learning_rate', type=float, default=0.05, help='learning rate')
    parser.add_argument('--lr_decay', type=str, default='step', choices=['step', 'cos'], help='learning decay')
    parser.add_argument('--lr_decay_epochs', type=str, default='150,180,210', help='where to decay lr, can be a list')
    parser.add_argument('--lr_decay_rate', type=float, default=0.1, help='decay rate for learning rate')
    parser.add_argument('--weight_decay', type=float, default=5e-4, help='weight decay')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum')

    # dataset
    parser.add_argument('--dataset', type=str, default='cifar100', choices=['cifar100', 'imagenet'], help='dataset')
    # main() overwrites num_cls according to the selected dataset.
    parser.add_argument('--num_cls', type=int, default=1000, help='num_classes')

    # model
    parser.add_argument('--model_s', type=str, default='resnet20',
                        choices=['resnet8', 'resnet14', 'resnet20', 'resnet32', 'resnet44', 'resnet56', 'resnet110',
                                 'resnet8x4', 'resnet32x4', 'wrn_16_1', 'wrn_16_2', 'wrn_40_1', 'wrn_40_2',
                                 'vgg8', 'vgg11', 'vgg13', 'vgg16', 'vgg19', 'ResNet50',
                                 'MobileNetV2', 'ShuffleV1', 'ShuffleV2',
                                 'resnet18S', 'MobileNet', 'resnet50_4S'])
    parser.add_argument('--use_layer3', action='store_true', help='use the features to learn!')

    parser.add_argument('--path_t', type=str,
                        default="save/models/resnet110_vanilla/resnet110_best.pth",
                        help='teacher model snapshot')
    parser.add_argument('--resume_path', type=str, default=None, help='path_to resume weights!')

    # distillation
    parser.add_argument('--distill', type=str, default='vid',
                        choices=['kd', 'hint', 'attention', 'similarity',
                                 'correlation', 'vid', 'crd', 'kdsvd', 'fsp',
                                 'rkd', 'pkt', 'abound', 'factor', 'nst',
                                 'normalized_cosine', 'cos'])
    parser.add_argument('--trial', type=str, default='1', help='trial id, str type!')

    parser.add_argument('-r', '--gamma', type=float, default=1, help='weight for classification')
    parser.add_argument('-a', '--alpha', type=float, default=None, help='weight balance for KD')
    parser.add_argument('-b', '--beta', type=float, default=None, help='weight balance for other losses')
    parser.add_argument('-b1', '--beta1', type=float, default=1, help='weight balance for other losses')
    parser.add_argument('-c', '--ceta', type=float, default=2.5, help='weight balance for other losses')
    parser.add_argument('-s', '--co-sponge', type=int, default=4, help='weight balance for other losses')

    # KL distillation
    parser.add_argument('--kd_T', type=float, default=4, help='temperature for KD distillation')

    # NCE distillation
    parser.add_argument('--feat_dim', default=128, type=int, help='feature dimension')
    parser.add_argument('--mode', default='exact', type=str, choices=['exact', 'relax'])
    parser.add_argument('--nce_k', default=16384, type=int, help='number of negative samples for NCE')
    parser.add_argument('--nce_t', default=0.07, type=float, help='temperature parameter for softmax')
    parser.add_argument('--nce_m', default=0.5, type=float, help='momentum for non-parametric updates')

    # hint layer
    parser.add_argument('--hint_layer', default=2, type=int, choices=[0, 1, 2, 3, 4])
    parser.add_argument('--kd-warm-up', type=float, default=20.0,
                        help='feature knowledge distillation loss weight warm up epochs')
    opt = parser.parse_args()

    # these student models train with a smaller learning rate
    if opt.model_s in ['MobileNetV2', 'ShuffleV1', 'ShuffleV2']:
        opt.learning_rate = 0.01

    # set the path according to the environment
    if hostname.startswith('visiongpu'):
        opt.model_path = '/path/to/my/student_model'
        opt.tb_path = '/path/to/my/student_tensorboards'
    else:
        opt.model_path = './save/student_model'
        opt.tb_path = './save/student_tensorboards'

    # '150,180,210' -> [150, 180, 210]
    opt.lr_decay_epochs = [int(it) for it in opt.lr_decay_epochs.split(',')]

    opt.model_t = get_teacher_name(opt.path_t)

    opt.model_name = f'S:{opt.model_s}_T:{opt.model_t}_{opt.dataset}_{opt.distill}_' \
                     f'r:{opt.gamma}_a:{opt.alpha}_b:{opt.beta}_c:{opt.ceta}_s:{opt.co_sponge}_{opt.trial}_kdt_{opt.kd_T}'

    # exist_ok avoids the check-then-create race when several runs start together
    opt.tb_folder = os.path.join(opt.tb_path, opt.model_name)
    os.makedirs(opt.tb_folder, exist_ok=True)

    opt.save_folder = os.path.join(opt.model_path, opt.model_name)
    os.makedirs(opt.save_folder, exist_ok=True)

    return opt


def get_teacher_name(model_path):
    """Derive the teacher architecture name from a checkpoint path.

    The name is read from the directory that directly contains the
    checkpoint file (``'/'``-separated): its first ``'_'``-delimited token,
    or the first three tokens joined by ``'_'`` when the first one is
    ``'wrn'`` (e.g. ``wrn_40_2``).
    """
    parent_dir = model_path.split('/')[-2]
    tokens = parent_dir.split('_')
    if tokens[0] == 'wrn':
        # wide-resnet names span three tokens: wrn_<a>_<b>
        return '_'.join((tokens[0], tokens[1], tokens[2]))
    return tokens[0]


def load_teacher(model_path, n_cls):
    """Build the teacher network and load its weights from a checkpoint.

    Args:
        model_path: path to the checkpoint file. The architecture name is
            parsed from its parent directory by ``get_teacher_name`` and
            looked up in ``model_dict_teacher``.
        n_cls: number of classes, passed to the model constructor as
            ``num_classes``.

    Returns:
        The constructed model with the checkpoint's ``'model'`` state dict
        loaded.
    """
    print('==> loading teacher model')
    model_t = get_teacher_name(model_path)
    model = model_dict_teacher[model_t](num_classes=n_cls)
    # load_state_dict copies the values into the model's own parameters, so
    # the checkpoint tensors do not have to be restored onto the device they
    # were saved from; mapping them to the CPU lets a GPU-saved checkpoint
    # load on hosts with fewer (or no) GPUs.
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    print('==> done')
    return model

def load_teacher_imagenet(model_path, n_cls):
    """Build the ImageNet teacher network named by the checkpoint path.

    ``model_path`` is used only to derive the architecture name (via
    ``get_teacher_name``); the model is constructed from
    ``model_dict_imagenet_teacher`` with ``num_classes=n_cls``.

    NOTE(review): no weights are read from ``model_path`` here (the
    ``load_state_dict`` call was disabled). Presumably the constructor
    supplies pretrained weights -- confirm.
    """
    print('==> loading teacher model')
    teacher_name = get_teacher_name(model_path)
    build_teacher = model_dict_imagenet_teacher[teacher_name]
    teacher = build_teacher(num_classes=n_cls)
    print('==> done')
    return teacher


def main():
    """Train a student network with knowledge distillation from a teacher.

    Workflow: parse the options, build the data loaders and both models for
    the selected dataset, assemble the criteria (classification, KL
    divergence, and the method chosen by ``--distill``), then train and
    validate once per epoch while logging to TensorBoard and writing
    checkpoints, a per-epoch loss log and a final results line.

    Raises:
        NotImplementedError: if ``opt.dataset`` or ``opt.distill`` is not
            handled below.
    """
    best_acc = 0
    opt = parse_option()
    # tensorboard logger
    logger = tb_logger.Logger(logdir=opt.tb_folder, flush_secs=2)

    # dataloader
    if opt.dataset == 'cifar100':
        if opt.distill == 'crd':
            train_loader, val_loader, n_data = get_cifar100_dataloaders_sample(batch_size=opt.batch_size,
                                                                               num_workers=opt.num_workers,
                                                                               k=opt.nce_k,
                                                                               mode=opt.mode)
        else:
            train_loader, val_loader, n_data = get_cifar100_dataloaders(batch_size=opt.batch_size,
                                                                        num_workers=opt.num_workers,
                                                                        is_instance=True)
        opt.num_cls = 100
        # model
        model_t = load_teacher(opt.path_t, opt.num_cls)
        model_s = model_dict_student[opt.model_s](num_classes=opt.num_cls,
                                                  co_sponge=opt.co_sponge,
                                                  channel_t=model_channels[opt.model_t])

    elif opt.dataset == 'imagenet':
        # The ImageNet recipe is fixed here and overrides the CLI values.
        opt.batch_size = 256
        opt.num_workers = 16
        opt.epochs = 100

        opt.learning_rate = 0.1
        opt.momentum = 0.9
        opt.weight_decay = 1e-4
        opt.lr_decay_epochs = (30, 60, 90)
        opt.lr_decay_rate = 0.1
        opt.hint_layer = -1

        if opt.distill == 'crd':
            train_loader, val_loader, n_data = get_imagenet_dataloaders_sample(batch_size=opt.batch_size,
                                                                               num_workers=opt.num_workers,
                                                                               k=opt.nce_k,
                                                                               mode=opt.mode)
        else:
            train_loader, val_loader, n_data = get_imagenet_dataloaders(batch_size=opt.batch_size,
                                                                        num_workers=opt.num_workers,
                                                                        is_instance=True)
        opt.num_cls = 1000
        print(opt)
        print("teacher name:", opt.model_t, "teacher channels:", model_channels_imagenet[opt.model_t])
        print("student name:", opt.model_s)

        # model and use multi-GPUs
        gpu_ids = [int(gid) for gid in opt.gids.split(',')]
        model_t = load_teacher_imagenet(opt.path_t, opt.num_cls)
        model_t = torch.nn.DataParallel(model_t, device_ids=gpu_ids).cuda().eval()

        model_s = model_dict_imagenet_student[opt.model_s](num_classes=opt.num_cls,
                                                           co_sponge=opt.co_sponge,
                                                           channel_t=model_channels_imagenet[opt.model_t],
                                                           use_layer3=opt.use_layer3)
        model_s = torch.nn.DataParallel(model_s, device_ids=gpu_ids).cuda().train()
        if opt.resume_path is not None:
            # NOTE: torch.load unpickles the file -- only resume from trusted checkpoints.
            weights = torch.load(opt.resume_path)
            model_s.load_state_dict(weights['model'])
            opt.init_epochs = weights['epoch'] + 1
            # Carry the stored accuracy over so the first resumed epoch does
            # not overwrite a better '*_best.pth'. Periodic checkpoints store
            # 'accuracy' instead of 'best_acc', so fall back to that key.
            best_acc = weights.get('best_acc', weights.get('accuracy', best_acc))
            print(datetime.datetime.now())
            print(f"start from save epoch: {weights['epoch']}, best acc: {best_acc}")
    else:
        raise NotImplementedError(opt.dataset)

    # append student network into module_list
    module_list = nn.ModuleList([])
    module_list.append(model_s)
    trainable_list = nn.ModuleList([])
    trainable_list.append(model_s)

    criterion_cls = nn.CrossEntropyLoss()
    criterion_div = DistillKL(opt.kd_T)

    # FIXME(review): ``feat_s`` / ``feat_t`` are never assigned in this
    # function, so every branch below that reads them ('normalized_cosine',
    # 'cos', 'crd', 'correlation', 'vid' -- the default --, 'abound',
    # 'factor', 'fsp') raises NameError. They have to be produced from the
    # student and teacher models before this point; the models' forward
    # signature is not visible here, so that step is not reconstructed.
    if opt.distill == 'kd':
        criterion_kd = DistillKL(opt.kd_T)
    elif opt.distill == 'hint':
        criterion_kd = HintLoss()  # MSE loss
    elif opt.distill == 'normalized_cosine':
        criterion_kd = HintLoss()  # MSE loss
        regress_s = ConvReg(feat_s[opt.hint_layer].shape, feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'cos':
        criterion_kd = HintLoss()  # MSE loss
        regress_s = ConvReg(feat_s[opt.hint_layer].shape, feat_t[opt.hint_layer].shape)
        module_list.append(regress_s)
        trainable_list.append(regress_s)
    elif opt.distill == 'crd':
        opt.s_dim = feat_s[-1].shape[1]
        opt.t_dim = feat_t[-1].shape[1]
        opt.n_data = n_data
        criterion_kd = CRDLoss(opt)
        module_list.append(criterion_kd.embed_s)
        module_list.append(criterion_kd.embed_t)
        trainable_list.append(criterion_kd.embed_s)
        trainable_list.append(criterion_kd.embed_t)
    elif opt.distill == 'attention':
        criterion_kd = Attention()
    elif opt.distill == 'nst':
        criterion_kd = NSTLoss()
    elif opt.distill == 'similarity':
        criterion_kd = Similarity()
    elif opt.distill == 'rkd':
        criterion_kd = RKDLoss()
    elif opt.distill == 'pkt':
        criterion_kd = PKT()
    elif opt.distill == 'kdsvd':
        criterion_kd = KDSVD()
    elif opt.distill == 'correlation':
        criterion_kd = Correlation()
        embed_s = LinearEmbed(feat_s[-1].shape[1], opt.feat_dim)
        embed_t = LinearEmbed(feat_t[-1].shape[1], opt.feat_dim)
        module_list.append(embed_s)
        module_list.append(embed_t)
        trainable_list.append(embed_s)
        trainable_list.append(embed_t)
    elif opt.distill == 'vid':
        s_n = [f.shape[1] for f in feat_s[1:-1]]
        t_n = [f.shape[1] for f in feat_t[1:-1]]
        criterion_kd = nn.ModuleList(
            [VIDLoss(s, t, t) for s, t in zip(s_n, t_n)]
        )
        # add this as some parameters in VIDLoss need to be updated
        trainable_list.append(criterion_kd)
    elif opt.distill == 'abound':
        s_shapes = [f.shape for f in feat_s[1:-1]]
        t_shapes = [f.shape for f in feat_t[1:-1]]
        connector = Connector(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(connector)
        init_trainable_list.append(model_s.get_feat_modules())
        criterion_kd = ABLoss(len(feat_s[1:-1]))
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader, logger, opt)
        # classification
        module_list.append(connector)
    elif opt.distill == 'factor':
        s_shape = feat_s[-2].shape
        t_shape = feat_t[-2].shape
        paraphraser = Paraphraser(t_shape)
        translator = Translator(s_shape, t_shape)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(paraphraser)
        criterion_init = nn.MSELoss()
        init(model_s, model_t, init_trainable_list, criterion_init, train_loader, logger, opt)
        # classification
        criterion_kd = FactorTransfer()
        module_list.append(translator)
        module_list.append(paraphraser)
        trainable_list.append(translator)
    elif opt.distill == 'fsp':
        s_shapes = [s.shape for s in feat_s[:-1]]
        t_shapes = [t.shape for t in feat_t[:-1]]
        criterion_kd = FSP(s_shapes, t_shapes)
        # init stage training
        init_trainable_list = nn.ModuleList([])
        init_trainable_list.append(model_s.get_feat_modules())
        init(model_s, model_t, init_trainable_list, criterion_kd, train_loader, logger, opt)
    else:
        raise NotImplementedError(opt.distill)

    criterion_list = nn.ModuleList([])
    criterion_list.append(criterion_cls)    # classification loss
    criterion_list.append(criterion_div)    # KL divergence loss, original knowledge distillation
    criterion_list.append(criterion_kd)     # other knowledge distillation loss

    # optimizer
    optimizer = optim.SGD(trainable_list.parameters(),
                          lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # append teacher after optimizer to avoid weight_decay
    module_list.append(model_t)
    # NOTE(review): only the criteria are moved to the GPU here. The imagenet
    # branch moves both models itself, but the cifar100 branch never calls
    # .cuda() on them -- confirm the train/validate helpers handle placement.
    criterion_list.cuda()

    # validate teacher accuracy
    teacher_acc, _, _ = validate(val_loader, model_t, criterion_cls, opt)
    print('teacher accuracy: ', teacher_acc)

    # The per-epoch loss log is opened in append mode, which does not create
    # missing parent directories, so make sure the directory exists up front.
    record_ce_msd = True
    if record_ce_msd:
        os.makedirs('./save/logs', exist_ok=True)

    time_start = time.time()
    # routine
    init_epochs = opt.init_epochs
    for epoch in range(init_epochs, opt.epochs + 1):

        adjust_learning_rate(epoch, opt, optimizer, len(train_loader))
        print("==> training...")

        time1 = time.time()
        train_acc, train_loss, train_loss_ce, train_loss_msd = train(
            epoch, train_loader, module_list, criterion_list, optimizer, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, (time2 - time1)/60.0))

        logger.log_value('train_acc', train_acc, epoch)
        logger.log_value('train_loss', train_loss, epoch)

        test_acc, test_acc_top5, test_loss = validate(val_loader, model_s, criterion_cls, opt)

        logger.log_value('test_acc', test_acc, epoch)
        logger.log_value('test_loss', test_loss, epoch)
        logger.log_value('test_acc_top5', test_acc_top5, epoch)

        # save the best model
        if test_acc > best_acc:
            best_acc = test_acc
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'best_acc': best_acc,
            }
            save_file = os.path.join(opt.save_folder, '{}_best.pth'.format(opt.model_s))
            print('saving the best model!')
            torch.save(state, save_file)
            print('best accuracy:', best_acc)
        # regular saving
        if epoch % opt.save_freq == 0:
            print('==> Saving...')
            state = {
                'epoch': epoch,
                'model': model_s.state_dict(),
                'accuracy': test_acc,
            }
            save_file = os.path.join(opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            torch.save(state, save_file)

        if record_ce_msd:
            # A plain-float zero KD loss would raise ZeroDivisionError and
            # abort training after the epoch; record an infinite ratio instead.
            try:
                ce_msd_ratio = train_loss_ce / train_loss_msd
            except ZeroDivisionError:
                ce_msd_ratio = float('inf')
            with open('./save/logs/loss_{}.txt'.format(opt.model_name), 'a') as f:
                f.write("{} {} {}\n".format(train_loss_ce, train_loss_msd, ce_msd_ratio))

    # This best accuracy is only for printing purpose.
    # The results reported in the paper/README is from the last epoch.
    time_end = time.time()
    print("total_cost:", time_end - time_start)
    print('best accuracy:', best_acc)
    print('time:{}'.format(time.asctime(time.localtime(time.time()))) + '  ')

    # 'save' may not exist yet (e.g. when the model/tb paths live elsewhere).
    os.makedirs('save', exist_ok=True)
    with open('save/results.txt', 'a') as f:
        # timestamp as YYYYMMDDHHMM, immediately followed by the run name
        f.write(time.strftime('%Y%m%d%H%M', time.localtime(time.time())))
        f.write(opt.model_name+'  ')
        f.write('best_accuracy:{} '.format(best_acc))
        f.write("total_cost:{}\n".format(time_end - time_start))

    # save model
    state = {
        'opt': opt,
        'model': model_s.state_dict(),
    }
    save_file = os.path.join(opt.save_folder, '{}_last.pth'.format(opt.model_s))
    torch.save(state, save_file)


# Start training only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
