import os

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from advertorch.utils import NormalizeByChannelMeanStd
from torchlars import LARS
from tqdm import tqdm
from warmup_scheduler import GradualWarmupScheduler

from models import ResNet18
from utils import load_data

# Experiment tag. It is interpolated into the checkpoint filename
# ('./saved_model/debug<SUFFIX>.pth'), so runs with different tags do not
# overwrite each other's saved weights.
#
# The commented-out values below are a log of earlier runs. The trailing note
# on each line appears to describe that run's setup (presumably the number is
# the weight decay that was used) -- confirm before relying on it.
#SUFFIX = '1' # all mixup
#SUFFIX = '2' # all mixup
#SUFFIX = '3' # only lars
#SUFFIX = '4' # original
#SUFFIX = '5' # only lars 5e-4
#SUFFIX = '6' # original 1e-6
#SUFFIX = '7' # original 5e-5
#SUFFIX = '8' # original 1e-5
#SUFFIX = '9' # original 1e-5
#SUFFIX = '10' # original 5e-5
#SUFFIX = '11' # original 1e-6
#SUFFIX = '12' # original 1e-4
SUFFIX = '13' # original 1e-4

# Everything data-related comes from utils.load_data(): both datasets, their
# loaders, and the normalizer object that is handed to the network below.
trainset, testset, trainloader, testloader, normalizer = load_data()
print(len(trainset), len(testset))

# Inline data pipeline kept for reference only (disabled); it defines the same
# names that load_data() returns above, so it was presumably superseded by it.
#mean = torch.tensor([0.4914, 0.4822, 0.4465], dtype=torch.float32).cuda()
#std = torch.tensor([0.2023, 0.1994, 0.2010], dtype=torch.float32).cuda()
#normalizer = NormalizeByChannelMeanStd(mean=mean, std=std)
#testset = torchvision.datasets.CIFAR10(root='./raw_data', train=False, download=True, transform=transforms.ToTensor())
#testloader = torch.utils.data.DataLoader(testset, batch_size=512, shuffle=False, num_workers=4)
#transform_train = transforms.Compose([
#    transforms.RandomCrop(32, padding=4),
#    transforms.RandomHorizontalFlip(),
#    transforms.ToTensor(),
#])
#trainset = torchvision.datasets.CIFAR10(root='./raw_data', train=True, download=True, transform=transform_train)
#trainloader = torch.utils.data.DataLoader(trainset, batch_size=256, num_workers=0, pin_memory=False, shuffle=True, drop_last=True)

# The normalizer is passed to the network constructor; the resulting model is
# then moved to the GPU and its structure is printed.
model = ResNet18(normalizer).to('cuda')
print(model)

# Loss applied to the model outputs in both train() and test().
criterion = nn.CrossEntropyLoss()

# Active configuration: SGD with momentum and weight_decay=1e-4.
# Other weight_decay values that were tried with otherwise identical settings
# (lr=0.1, momentum=0.9): 5e-4, 1e-6, 5e-5, 1e-5.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.1,
    momentum=0.9,
    weight_decay=1e-4,
)
# Step schedule: the learning rate is scaled by gamma at each listed milestone.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[100, 150],
    gamma=0.1,
)

# Disabled alternative: LARS-wrapped SGD with gradual warmup, then cosine annealing.
#base_optimizer = torch.optim.SGD(model.parameters(), 0.1, momentum=0.9, weight_decay=1e-6)
#optimizer = LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001)
#scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 200)
#scheduler = GradualWarmupScheduler(optimizer, multiplier=15.0, total_epoch=10, after_scheduler=scheduler_cosine) #TODO: back

# Disabled alternative: LARS-wrapped SGD with the same MultiStepLR schedule as above.
##base_optimizer = torch.optim.SGD(model.parameters(), 0.1, momentum=0.9, weight_decay=1e-6)
#base_optimizer = torch.optim.SGD(model.parameters(), 0.1, momentum=0.9, weight_decay=5e-4)
#optimizer = LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001)
#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100,150], gamma=0.1)

def train(epoch):
    """Run one optimisation pass over ``trainloader``.

    Relies on the module-level ``model``, ``criterion`` and ``optimizer``.

    Args:
        epoch: Epoch index; only used for the header line that is printed.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the sum of the per-batch losses
        divided by the number of batches, and the percentage of training
        samples whose predicted class matched the label.
    """
    print('\nEpoch: %d' % epoch)
    model.train()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    with tqdm(trainloader) as progress:
        # `step` counts batches from 1 so it can serve directly as the
        # denominator of the running mean shown in the progress bar.
        for step, (inputs, targets) in enumerate(progress, start=1):
            inputs = inputs.to('cuda')
            targets = targets.to('cuda')

            optimizer.zero_grad()
            outputs = model(inputs)
            batch_loss = criterion(outputs, targets)
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()
            # The index of the largest output along dim 1 is the predicted class.
            predicted = outputs.max(1)[1]
            n_seen += targets.size(0)
            n_correct += predicted.eq(targets).sum().item()
            progress.set_description(
                'Loss: %.3f | Acc:%.3f%%' % (loss_sum / step, 100. * n_correct / n_seen)
            )

    accuracy = 100. * n_correct / n_seen
    return loss_sum / len(trainloader), accuracy

def test(epoch):
    """Evaluate the module-level ``model`` on ``testloader`` without gradients.

    Args:
        epoch: Not used by this function.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the sum of the per-batch losses
        divided by the number of batches, and the percentage of samples whose
        predicted class matched the label.
    """
    model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        with tqdm(testloader) as progress:
            # `step` counts batches from 1 so it can serve directly as the
            # denominator of the running mean shown in the progress bar.
            for step, (inputs, targets) in enumerate(progress, start=1):
                inputs = inputs.to('cuda')
                targets = targets.to('cuda')

                outputs = model(inputs)
                batch_loss = criterion(outputs, targets)

                loss_sum += batch_loss.item()
                # The index of the largest output along dim 1 is the predicted class.
                predicted = outputs.max(1)[1]
                n_seen += targets.size(0)
                n_correct += predicted.eq(targets).sum().item()
                progress.set_description(
                    'Loss: %.3f | Acc:%.3f%%' % (loss_sum / step, 100. * n_correct / n_seen)
                )

    accuracy = 100. * n_correct / n_seen
    return loss_sum / len(testloader), accuracy


# Training driver: run 200 epochs, evaluate after each one, and keep the
# weights from the epoch with the highest accuracy returned by test() so far.
checkpoint_path = './saved_model/debug%s.pth'%SUFFIX
# torch.save does not create missing parent directories. Create the target
# directory up front so the first checkpoint write cannot fail after a full
# epoch of training has already been spent.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

best_acc = 0.0
for epoch in range(200):
    train(epoch)
    _, cur_acc = test(epoch)
    # Advance the LR schedule once per epoch, after that epoch's optimizer steps.
    scheduler.step()
    if cur_acc > best_acc:
        # NOTE(review): the checkpoint is selected using the accuracy measured
        # on testloader, so best_acc is an optimistic estimate of held-out
        # performance.
        best_acc = cur_acc
        torch.save(model.state_dict(), checkpoint_path)
