# implementation follows https://github.com/facebookresearch/moco and https://github.com/sthalles/SimCLR
from datetime import datetime
from functools import partial
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms,datasets
from torch.autograd import Variable
from torchvision.datasets import STL10
from torchvision.models import resnet
from tqdm import tqdm
import argparse
import json
import math
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
# Start the Weights & Biases run as soon as the module executes (this happens
# before the command-line arguments are parsed).
wandb.init(project="KL-STL-10")

# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='Train f-MICL on STL-10')

parser.add_argument('-a', '--arch', default='resnet50')

# lr: 0.06 for batch 512 (or 0.03 for batch 256)
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--epochs', default=200, type=int, metavar='N', help='number of total epochs to run')
parser.add_argument('--schedule', default=[120, 160], nargs='*', type=int, help='learning rate schedule (when to drop lr by 10x); does not take effect if --cos is on')
parser.add_argument('--cos', action='store_true', help='use cosine lr schedule')

parser.add_argument('--batch-size', default=64, type=int, metavar='N', help='mini-batch size')
parser.add_argument('--wd', default=5e-4, type=float, metavar='W', help='weight decay')

# moco specific configs:
parser.add_argument('--dim', default=512, type=int, help='feature dimension')

parser.add_argument('--symmetric', action='store_true', help='use a symmetric loss function that backprops to both crops')
# NOTE(review): the default 'NCE' is not one of the estimator names handled by
# ModelSimCLR.contrastive_loss — confirm which default is intended.
parser.add_argument('--estimator', default='NCE', help='choose a disired estimator of lower bound')

# knn monitor
parser.add_argument('--knn-k', default=200, type=int, help='k in kNN monitor')
parser.add_argument('--knn-t', default=0.1, type=float, help='softmax temperature in kNN monitor; could be different with moco-t')

# utils
parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
parser.add_argument('--results_dir', default='', type=str, metavar='PATH', help='path to cache (default: none)')
parser.add_argument('--seed', default=1, type=int, help='random seed')
# Both weights take part in tensor arithmetic inside the loss, so they must be
# parsed as numbers; without ``type`` a value given on the command line would
# stay a string.
parser.add_argument('--alpha', default=0.5, type=float, help='weighting parameter for negative samples')
parser.add_argument('--tsallis', default=3, type=float, help='weighting parameter for tsallis')


args = parser.parse_args()  # running in command line

# Hard-coded experiment settings. They are applied after parsing, so they
# replace whatever was given on the command line for these options; ``sigma``
# has no command-line flag and only exists through this assignment.
vars(args).update(
    epochs=800,
    lr=0.1,
    cos=True,
    sigma=0.1,
    schedule=[],  # cos in use
    symmetric=False,
    alpha=40.0,
)

# Fall back to a local cache directory when none was requested.
if not args.results_dir:
    args.results_dir = './cache-KL'

# The resume path always points at the last checkpoint inside the results
# directory, regardless of the --resume flag.
args.resume = f'{args.results_dir}/model_last.pth'

class ContrastiveLearningViewGenerator:
    """Build several views of one input by re-applying a transform.

    ``base_transform`` is called ``n_views`` times on the same input and the
    results are returned as a list, in call order. With a random transform
    each call can yield a different view.
    """

    def __init__(self, base_transform, n_views=2):
        self.n_views = n_views
        self.base_transform = base_transform

    def __call__(self, x):
        make_view = self.base_transform
        return [make_view(x) for _ in range(self.n_views)]
    
# Per-channel mean / std handed to Normalize in both pipelines.
# NOTE(review): these look like the widely used CIFAR-10 statistics rather
# than STL-10 ones — confirm they are intended for this dataset.
_NORM_MEAN = [0.4914, 0.4822, 0.4465]
_NORM_STD = [0.2023, 0.1994, 0.2010]

# Augmentation pipeline that produces one training view.
# NOTE(review): training views are random 32-pixel crops while the evaluation
# pipeline below applies no resizing — confirm the resolution mismatch is wanted.
_train_steps = [
    transforms.RandomResizedCrop(32),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
train_transform = transforms.Compose(_train_steps)

# Deterministic pipeline for the kNN memory bank and the test split.
_eval_steps = [
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
test_transform = transforms.Compose(_eval_steps)

# data prepare
# Options shared by all three loaders.
_LOADER_OPTS = dict(batch_size=args.batch_size, num_workers=16, pin_memory=True)

# Unlabeled split: every sample becomes a list of augmented views. Incomplete
# final batches are dropped.
train_data = STL10(
    root='data',
    split='unlabeled',
    transform=ContrastiveLearningViewGenerator(train_transform),
    download=True,
)
train_loader = DataLoader(train_data, shuffle=True, drop_last=True, **_LOADER_OPTS)

# Labeled train split: source of the kNN feature bank.
memory_data = STL10(root='data', split='train', transform=test_transform, download=True)
memory_loader = DataLoader(memory_data, shuffle=False, **_LOADER_OPTS)

# Test split: queried against the feature bank.
test_data = STL10(root='data', split='test', transform=test_transform, download=True)
test_loader = DataLoader(test_data, shuffle=False, **_LOADER_OPTS)


class ModelBase(nn.Module):
    """ResNet from ``torchvision.models.resnet`` repacked as one ``nn.Sequential``.

    The children of the stock network are kept in their original order with
    three changes: the child named ``conv1`` is replaced by a 3x3, stride-1,
    padding-1 convolution without bias, max-pooling children are dropped, and
    a ``Flatten`` is placed in front of the linear head. The head outputs
    ``feature_dim`` values per sample.

    Args:
        feature_dim: passed to the ResNet constructor as ``num_classes``.
        arch: name of a constructor in ``torchvision.models.resnet``.
    """

    def __init__(self, feature_dim=128, arch=None):
        super().__init__()

        # plain BatchNorm2d is used as the normalisation layer
        build_resnet = getattr(resnet, arch)
        backbone = build_resnet(num_classes=feature_dim, norm_layer=nn.BatchNorm2d)

        layers = []
        for child_name, child in backbone.named_children():
            if child_name == 'conv1':
                # swap the stem convolution for a small-kernel, stride-1 one
                layers.append(nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False))
            elif isinstance(child, nn.MaxPool2d):
                continue
            elif isinstance(child, nn.Linear):
                # the pooled feature map has to be flattened before the head
                layers.extend([nn.Flatten(1), child])
            else:
                layers.append(child)

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # note: the output is not normalized here
        return self.net(x)
    

class ModelSimCLR(nn.Module):
    """Single-encoder contrastive model with a selectable loss estimator.

    Both augmented views pass through the same ``ModelBase`` encoder, the
    outputs are L2-normalised, and the loss named by ``estimator`` is computed
    on them.

    Args:
        alpha: weight of the negative-pair term in the ``Gaussian-*``
            estimators (``Gaussian-Uniform`` does not use it).
        num: accepted but ignored.
        batch_size, K, m, sigma: stored on the instance; no method of this
            class reads them.
        tsallis: exponent parameter of the ``Gaussian-Tsallis`` estimator.
        dim: output dimension of the encoder.
        T: accepted but ignored (the logit estimators divide by a fixed 0.7).
        arch: ResNet constructor name forwarded to ``ModelBase``.
        symmetric: if true, ``forward`` adds the loss of both view orderings.
        estimator: one of ``SUPPORTED_ESTIMATORS``. The name is only checked
            when the loss is computed, so constructing the model with another
            value still succeeds.
    """

    # Estimator names understood by ``contrastive_loss``.
    SUPPORTED_ESTIMATORS = (
        'SimCLR',
        'RPC',
        'Gaussian-Uniform',
        'Gaussian-KL',
        'Gaussian-Pearson',
        'Gaussian-JS',
        'Gaussian-SH',
        'Gaussian-Tsallis',
        'Gaussian-VLC',
    )

    def __init__(self, alpha=40.0, num = 5, batch_size=512, tsallis=3, dim=128, K=4096, m=0.99, T=0.1, sigma=1.0, arch='resnet18',symmetric=True, estimator='NWJ'):
        super(ModelSimCLR, self).__init__()

        self.K = K
        self.tsallis = tsallis
        self.batch_size = batch_size
        self.m = m
        self.alpha = alpha
        self.sigma = sigma
        self.symmetric = symmetric
        self.estimator = estimator

        # create the encoder (shared by both views)
        self.encoder_q = ModelBase(feature_dim=dim, arch=arch)

    def random_select(self, queue, dim):
        """Return ``dim`` entries of ``queue`` picked at random without replacement."""
        idx = torch.randperm(len(queue))[0:dim]
        select_queue = queue[idx]
        return select_queue

    @torch.no_grad()
    def _batch_shuffle_single_gpu(self, x):
        """
        Batch shuffle, for making use of BatchNorm.

        Returns the shuffled batch and the index that restores the original
        order. The permutation is moved to the GPU, so ``x`` is expected to
        live there as well.
        """
        # random shuffle index
        idx_shuffle = torch.randperm(x.shape[0]).cuda()

        # index for restoring
        idx_unshuffle = torch.argsort(idx_shuffle)

        return x[idx_shuffle], idx_unshuffle

    @torch.no_grad()
    def _batch_unshuffle_single_gpu(self, x, idx_unshuffle):
        """
        Undo batch shuffle.
        """
        return x[idx_unshuffle]

    def contrastive_loss(self, im_q, im_k):
        """Compute the selected loss for one ordering of the two views.

        Args:
            im_q: batch of query images.
            im_k: batch of key images; row ``i`` is the positive of query ``i``.

        Returns:
            ``(loss, q, k)``: the scalar loss and the L2-normalised
            embeddings of both batches.

        Raises:
            ValueError: if ``self.estimator`` is not in
                ``SUPPORTED_ESTIMATORS``.
        """
        estimator = self.estimator
        # Fail before running the encoder: previously an unknown name fell
        # through every branch and ended in an UnboundLocalError at the return.
        if estimator not in self.SUPPORTED_ESTIMATORS:
            raise ValueError(
                'Unknown estimator {!r}; expected one of: {}'.format(
                    estimator, ', '.join(self.SUPPORTED_ESTIMATORS)))

        # compute query / key features with the shared encoder
        q = nn.functional.normalize(self.encoder_q(im_q), dim=1)  # queries: NxC
        k = nn.functional.normalize(self.encoder_q(im_k), dim=1)  # keys: NxC

        if estimator in ('SimCLR', 'RPC'):
            # Similarity logits: positives are the row-wise q.k products,
            # the candidate set is every query and key embedding in the batch.
            l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1)
            neg = torch.cat([q, k], dim=0).t()
            l_neg = torch.einsum('nc,ck->nk', [q, neg])

            # fixed temperature
            l_neg = l_neg / 0.7
            l_pos = l_pos / 0.7

            if estimator == 'SimCLR':
                loss = -(l_pos.squeeze() - torch.logsumexp(l_neg, 1))
                loss = torch.mean(loss, 0)
            else:  # 'RPC'
                margin = 0.5 * l_neg ** 2 + l_neg
                margin = torch.mean(margin, 1)
                l_pos = l_pos - 0.5 * 0.005 * l_pos ** 2
                loss = -(l_pos.squeeze() - margin.squeeze())
                loss = torch.mean(loss, 0)
            return loss, q, k

        # Gaussian-kernel estimators work on squared Euclidean distances:
        # one per positive pair, and one per pair of distinct queries.
        pos_sq = (q - k).norm(p=2, dim=1).pow(2)
        neg_sq = torch.pdist(q, p=2).pow(2)

        if estimator == 'Gaussian-Uniform':
            l_pos = pos_sq.mean()
            l_neg = neg_sq.mul(-2).exp().mean().log()
            loss = l_pos + l_neg
        elif estimator == 'Gaussian-KL':
            l_pos = pos_sq.mean()
            l_neg = neg_sq.mul(-2).exp().mean()
            loss = l_pos + self.alpha * l_neg
        elif estimator == 'Gaussian-Pearson':
            l_pos = -pos_sq.mul(-1).exp().mean()
            l_neg = neg_sq.mul(-2).exp().mean()
            loss = l_pos + self.alpha * l_neg
        elif estimator == 'Gaussian-JS':
            l_pos = pos_sq.mul(2).exp().add(1).log().mean()
            l_neg = neg_sq.mul(-2).exp().add(1).log().mean()
            loss = l_pos + self.alpha * l_neg
        elif estimator == 'Gaussian-SH':
            l_pos = pos_sq.mul(1).exp().mean()
            l_neg = neg_sq.mul(-1).exp().mean()
            loss = l_pos + self.alpha * l_neg
        elif estimator == 'Gaussian-Tsallis':
            l_pos = -pos_sq.mul(-(self.tsallis - 1)).exp().mean()
            l_neg = neg_sq.mul(-self.tsallis).exp().mean()
            loss = l_pos + self.alpha * l_neg
        else:  # 'Gaussian-VLC'
            l_pos = pos_sq.mul(-2).exp().add(1).pow(-2).mean()
            l_neg = -neg_sq.mul(-2).exp().add(1).pow(-1).mean()
            loss = l_pos + self.alpha * l_neg

        return loss, q, k

    def forward(self, im1, im2):
        """
        Input:
            im1: a batch of images (first augmented view)
            im2: a batch of images (second augmented view)
        Output:
            loss
        """

        # compute loss
        if self.symmetric:  # symmetric loss: each view serves once as the query
            loss_12, _, _ = self.contrastive_loss(im1, im2)
            loss_21, _, _ = self.contrastive_loss(im2, im1)
            loss = loss_12 + loss_21
        else:  # asymmetric loss
            loss, _, _ = self.contrastive_loss(im1, im2)
        return loss

# create model
# Every constructor argument comes from the (already overridden) args.
model = ModelSimCLR(
    arch=args.arch,
    dim=args.dim,
    estimator=args.estimator,
    symmetric=args.symmetric,
    alpha=args.alpha,
    tsallis=args.tsallis,
    sigma=args.sigma,
    batch_size=args.batch_size,
)
# move the parameters to the GPU
model = model.cuda()

# train for one epoch
def train(net, data_loader, train_optimizer, epoch, args):
    """Run one training epoch and return the mean loss per sample.

    Args:
        net: module whose ``forward(im_1, im_2)`` returns a scalar loss.
        data_loader: yields ``(views, _)`` where ``views[0]`` and ``views[1]``
            are the two augmented batches; the second element is ignored.
        train_optimizer: optimizer that is stepped and whose learning rate is
            scheduled.
        epoch: current epoch number, used for the schedule and for logging.
        args: namespace read by ``adjust_learning_rate``; ``args.epochs`` is
            also shown in the progress bar.

    Returns:
        The loss averaged over all samples seen in this epoch.
    """
    net.train()
    # Schedule the optimizer that was passed in rather than relying on a
    # module-level ``optimizer`` happening to be the same object.
    adjust_learning_rate(train_optimizer, epoch, args)

    total_loss, total_num = 0.0, 0
    train_bar = tqdm(data_loader)
    for im, _ in train_bar:
        im_1 = im[0].cuda(non_blocking=True)
        im_2 = im[1].cuda(non_blocking=True)

        loss = net(im_1, im_2)

        train_optimizer.zero_grad()
        loss.backward()
        train_optimizer.step()

        # Weight by the real batch size so the running mean stays correct
        # even if the loader yields a shorter final batch.
        batch_size = im_1.size(0)
        total_num += batch_size
        total_loss += loss.item() * batch_size

        wandb.log({'epoch': epoch, 'loss': total_loss/total_num})
        train_bar.set_description('Train Epoch: [{}/{}], lr: {:.6f}, Loss: {:.4f}'.format(epoch, args.epochs, train_optimizer.param_groups[0]['lr'], total_loss / total_num))

    return total_loss / total_num

# lr scheduler for training
def adjust_learning_rate(optimizer, epoch, args):
    """Write the learning rate for ``epoch`` into every parameter group.

    With ``args.cos`` the base rate ``args.lr`` is scaled by a half-cosine
    over ``args.epochs``. Otherwise it is multiplied by 0.1 once for every
    entry of ``args.schedule`` that ``epoch`` has reached.
    """
    if args.cos:  # cosine lr schedule
        new_lr = args.lr * (0.5 * (1. + math.cos(math.pi * epoch / args.epochs)))
    else:  # stepwise lr schedule
        new_lr = args.lr
        for milestone in args.schedule:
            if epoch >= milestone:
                new_lr *= 0.1

    for group in optimizer.param_groups:
        group['lr'] = new_lr

# test using a knn monitor
def test(net, memory_data_loader, test_data_loader, epoch, args):
    """Evaluate ``net`` with a weighted kNN classifier; return top-1 accuracy in percent.

    Normalised features of ``memory_data_loader`` form the bank, its
    dataset's ``labels`` attribute provides the bank labels, and every batch
    of ``test_data_loader`` is classified against that bank with
    ``knn_predict`` using ``args.knn_k`` and ``args.knn_t``.
    """
    net.eval()
    num_classes = 10  # fixed number of classes
    correct_top1, seen = 0.0, 0
    with torch.no_grad():
        # generate feature bank
        bank_chunks = []
        for data, _ in tqdm(memory_data_loader, desc='Feature extracting'):
            embedding = net(data.cuda(non_blocking=True))
            bank_chunks.append(F.normalize(embedding, dim=1))
        # [D, N]
        feature_bank = torch.cat(bank_chunks, dim=0).t().contiguous()
        # [N]
        bank_labels = np.int64(memory_data_loader.dataset.labels)
        feature_labels = torch.tensor(bank_labels, device=feature_bank.device)

        # loop test data to predict the label by weighted knn search
        test_bar = tqdm(test_data_loader)
        for data, target in test_bar:
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            feature = F.normalize(net(data), dim=1)

            pred_labels = knn_predict(feature, feature_bank, feature_labels, num_classes, args.knn_k, args.knn_t)

            seen += data.size(0)
            # column 0 holds the highest-scoring class of each sample
            correct_top1 += (pred_labels[:, 0] == target).float().sum().item()
            test_bar.set_description('Test Epoch: [{}/{}] Acc@1:{:.2f}%'.format(epoch, args.epochs, correct_top1 / seen * 100))

    return correct_top1 / seen * 100

# knn monitor as in InstDisc https://arxiv.org/abs/1805.01978
# implementation follows http://github.com/zhirongw/lemniscate.pytorch and https://github.com/leftthomas/SimCLR
def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t):
    """Rank the classes for each query by similarity-weighted kNN voting.

    Args:
        feature: query features, [B, D].
        feature_bank: bank features stored column-wise, [D, N].
        feature_labels: integer class label of every bank entry, [N].
        classes: number of classes.
        knn_k: number of neighbours that vote.
        knn_t: temperature applied to the similarities before exponentiation.

    Returns:
        [B, classes] tensor of class indices, best-scoring class first.
    """
    batch = feature.size(0)

    # dot-product similarity between each query and every bank entry
    # (cosine similarity when both inputs are L2-normalised) ---> [B, N]
    similarities = torch.mm(feature, feature_bank)
    # the k most similar bank entries per query ---> [B, K]
    top_sim, top_idx = similarities.topk(k=knn_k, dim=-1)
    # labels of those neighbours ---> [B, K]
    neighbour_labels = torch.gather(feature_labels.expand(batch, -1), dim=-1, index=top_idx)
    vote_weight = (top_sim / knn_t).exp()

    # one-hot encode every neighbour label ---> [B*K, C]
    votes = torch.zeros(batch * knn_k, classes, device=neighbour_labels.device)
    votes = votes.scatter(dim=-1, index=neighbour_labels.view(-1, 1), value=1.0)
    # sum the weighted votes per class ---> [B, C]
    weighted_votes = votes.view(batch, -1, classes) * vote_weight.unsqueeze(dim=-1)
    class_scores = torch.sum(weighted_votes, dim=1)

    return class_scores.argsort(dim=-1, descending=True)

# define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd, momentum=0.9)

# load model if resume
# Training starts at epoch 1 unless a checkpoint supplies a later epoch.
epoch_start = 1
# Test the path by truthiness: ``is not ''`` compares object identity, which
# is not a reliable emptiness check and is reported as a SyntaxWarning by
# recent Python versions.
if args.resume and os.path.exists(args.resume):
    checkpoint = torch.load(args.resume)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # continue with the epoch after the one stored in the checkpoint
    epoch_start = checkpoint['epoch'] + 1
    print('Loaded from: {}'.format(args.resume))

# logging
# Per-epoch history; one entry is appended to each list after every epoch.
results = {'train_loss': [], 'test_acc@1': []}
# Create the results directory, including missing parents; an already
# existing directory is fine (avoids the check-then-create race of os.mkdir).
os.makedirs(args.results_dir, exist_ok=True)
# dump args
with open(args.results_dir + '/args.json', 'w') as fid:
    json.dump(args.__dict__, fid, indent=2)

# Seed torch's global random generator for the code that runs from here on.
torch.random.manual_seed(args.seed)
    
# training loop
for epoch in range(epoch_start, args.epochs + 1):
    # one optimisation pass over the training data
    train_loss = train(model, train_loader, optimizer, epoch, args)
    results['train_loss'].append(train_loss)

    # kNN monitor on the encoder only
    test_acc_1 = test(model.encoder_q, memory_loader, test_loader, epoch, args)
    results['test_acc@1'].append(test_acc_1)

    # save statistics: the whole history is rewritten every epoch, indexed by
    # the epochs run since this process started
    epochs_run = range(epoch_start, epoch + 1)
    data_frame = pd.DataFrame(data=results, index=epochs_run)
    data_frame.to_csv(f'{args.results_dir}/log.csv', index_label='epoch')

    # save model: overwrite the "last" checkpoint with the current state
    snapshot = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(snapshot, f'{args.results_dir}/model_last.pth')
