## Copyright (C) 2019, Huan Zhang <huan@huan-zhang.com>
##                     Hongge Chen <chenhg@mit.edu>
##                     Chaowei Xiao <xiaocw@umich.edu>
## 
## This program is licenced under the BSD 2-Clause License,
## contained in the LICENCE file in this directory.
##
from numpy.lib.npyio import load
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.transforms.functional as F
import numpy as np



def mean(l):
    """Return the arithmetic mean of the items in ``l``.

    ``l`` must support both iteration (for ``sum``) and ``len``.  An empty
    input raises ``ZeroDivisionError`` because the total is divided by a
    length of zero.
    """
    total = sum(l)
    count = len(l)
    return total / count

def output(config, models, X, alpha):
    """Return the weighted sum of every model's output on ``X``.

    Args:
        config: accepted for signature compatibility; not read here.
        models: iterable of callables, each applied to ``X``.
        X: input passed unchanged to every model.
        alpha: weights, looked up by the model's position (``alpha[i]``).

    Returns:
        ``sum_i alpha[i] * models[i](X)``; the integer ``0`` when ``models``
        is empty.
    """
    combined = 0
    for position, net in enumerate(models):
        weight = alpha[position]
        prediction = net(X)
        combined += weight * prediction
    return combined

def _pgd_attack(config, model, X, y, epsilon, alpha, niters, step_size, loader,
                restarts):
    """Run a multi-restart L-infinity PGD attack on one batch.

    Shared implementation behind ``_pgd`` and ``_pgd_save``.

    Args:
        config: forwarded unchanged to ``output``.
        model: the ensemble passed to ``output`` (an iterable of models).
        X: batch of inputs in normalised space.  Expected to be laid out as
            (N, C, H, W) so the (1, C, 1, 1) statistics below broadcast per
            channel.
        y: class labels for ``X``.
        epsilon: perturbation radius in un-normalised pixel units; it is
            divided by ``std`` before being applied.
        alpha: ensemble weights, forwarded to ``output``.
        niters: number of gradient steps per restart.
        step_size: size of each signed-gradient step in pixel units (also
            divided by ``std``).
        loader: object exposing ``std`` and ``mean`` attributes.
            NOTE(review): assumed to be per-channel sequences of length C --
            confirm against the loader class.
        restarts: number of random restarts.

    Returns:
        ``(err, err_pgd, X_adv)``: the count of misclassified clean samples,
        the largest count of misclassified adversarial samples over all
        restarts, and the adversarial batch produced by the *last* restart
        (not necessarily the restart that achieved ``err_pgd``).
    """
    device = X.device
    # Normalisation statistics reshaped so they broadcast against the batch.
    std = torch.tensor(loader.std).unsqueeze(0).unsqueeze(-1).unsqueeze(-1).to(device)
    mu = torch.tensor(loader.mean).unsqueeze(0).unsqueeze(-1).unsqueeze(-1).to(device)
    # Images of the pixel values 1 and 0 in normalised space.
    data_max = torch.reshape((1. - mu) / std, (1, -1, 1, 1))
    data_min = torch.reshape((0. - mu) / std, (1, -1, 1, 1))
    radius = epsilon / std
    criterion = nn.CrossEntropyLoss()
    clean = X.data

    # Clean error needs no gradient graph.
    with torch.no_grad():
        clean_pred = output(config, model, clean, alpha).max(1)[1]
    err = (clean_pred != y.data).float().sum()

    err_pgd = 0
    X_adv = clean
    for _ in range(restarts):
        # Random start: every coordinate is shifted by +epsilon or -epsilon.
        # The signs are drawn on the CPU and then moved to the batch device.
        # NOTE(review): the start uses epsilon directly in normalised space,
        # whereas the projection below uses epsilon / std; kept as-is.
        corner = epsilon * (torch.randint(0, 2, X.size()) * 2 - 1)
        X_adv = (clean + corner.to(device)).requires_grad_()

        for _ in range(niters):
            loss = criterion(output(config, model, X_adv, alpha), y)
            loss.backward()
            # Signed-gradient ascent step, rescaled into normalised space.
            ascent = step_size * X_adv.grad.data.sign()
            moved = X_adv.data + ascent / std
            # Project back onto the epsilon-ball around the clean input.
            delta = torch.max(torch.min(moved - clean, radius), -radius)
            # Clip to the normalised image of the valid pixel range [0, 1].
            projected = torch.max(torch.min(clean + delta, data_max), data_min)
            # Fresh leaf tensor so the next backward starts with an empty grad.
            X_adv = projected.detach().requires_grad_()

        with torch.no_grad():
            adv_pred = output(config, model, X_adv, alpha).max(1)[1]
        err_pgd = max((adv_pred != y.data).float().sum(), err_pgd)
    return err, err_pgd, X_adv


def _pgd(config, model, X, y, epsilon, alpha, niters=10, step_size=0.01, loader=None,
         restarts=10):
    """PGD-attack one batch and return ``(err, err_pgd)``.

    ``err`` is the number of misclassified clean samples and ``err_pgd`` the
    largest number of misclassified adversarial samples over ``restarts``
    random restarts.  See ``_pgd_attack`` for the meaning of the arguments.
    ``loader`` must expose ``std`` and ``mean``; the ``None`` default is only
    there so it can be passed by keyword.
    """
    err, err_pgd, _ = _pgd_attack(config, model, X, y, epsilon, alpha, niters,
                                  step_size, loader, restarts)
    return err, err_pgd


def _pgd_save(config, model, X, y, epsilon, alpha, niters=10, step_size=0.01, loader=None,
              restarts=10):
    """PGD-attack one batch and return ``(err, err_pgd, X_pgd)``.

    Same as ``_pgd`` but additionally returns the adversarial batch produced
    by the last restart so the caller can store it.
    """
    return _pgd_attack(config, model, X, y, epsilon, alpha, niters, step_size,
                       loader, restarts)

def pgd(config, loader, model, epsilon, alpha, niters=10, step_size=0.01, verbose=False,
        robust=False):
    """Evaluate clean and PGD error rates of ``model`` over ``loader``.

    Args:
        config, loader, model, epsilon, alpha: forwarded to ``attack``.
        niters: gradient steps per restart used by the per-batch attack.
        step_size: step size used by the per-batch attack.
        verbose: forwarded to ``attack``.
        robust: forwarded to ``attack``.

    Returns:
        Whatever ``attack`` returns: ``(err_rate, pgd_err_rate)``.

    ``niters`` and ``step_size`` used to be accepted but were never passed
    on, so every run used the per-batch attack's own defaults (10 steps of
    0.01).  They are now forwarded; the default for ``niters`` is 10 so that
    calls which do not specify it behave exactly as before.
    """
    from functools import partial

    batch_attack = partial(_pgd, niters=niters, step_size=step_size)
    return attack(config, loader, model, epsilon, alpha, verbose=verbose,
                  atk=batch_attack, robust=robust)
def pgd_save(config, loader, model, epsilon, alpha, niters=10, step_size=0.01, verbose=False,
        robust=False):
    """Evaluate PGD error rates over ``loader`` and save the adversarial data.

    Args:
        config, loader, model, epsilon, alpha: forwarded to ``attack_save``.
        niters: gradient steps per restart used by the per-batch attack.
        step_size: step size used by the per-batch attack.
        verbose: forwarded to ``attack_save``.
        robust: forwarded to ``attack_save``.

    Returns:
        Whatever ``attack_save`` returns: ``(err_rate, pgd_err_rate)``.

    ``niters`` and ``step_size`` used to be accepted but were never passed
    on, so every run used the per-batch attack's own defaults (10 steps of
    0.01).  They are now forwarded; the default for ``niters`` is 10 so that
    calls which do not specify it behave exactly as before.
    """
    from functools import partial

    batch_attack = partial(_pgd_save, niters=niters, step_size=step_size)
    return attack_save(config, loader, model, epsilon, alpha, verbose=verbose,
                       atk=batch_attack, robust=robust)

def attack(config, loader, model, epsilon, alpha, verbose=False, atk=None,
           robust=False):
    """Run ``atk`` on every batch of ``loader`` and report error rates.

    Args:
        config, model, epsilon, alpha: forwarded unchanged to ``atk``.
        loader: iterable of ``(X, y)`` batches; it is also passed to ``atk``
            as the ``loader`` keyword argument.
        verbose: when true, print the per-batch error counts.
        atk: callable invoked as
            ``atk(config, model, X, y, epsilon, alpha, loader=loader)`` and
            returning ``(err, err_pgd)`` counts for the batch.
        robust: accepted for signature compatibility; not used here.

    Returns:
        ``(err_rate, pgd_err_rate)``: the summed counts divided by the total
        number of samples seen.

    Batches are moved to the GPU when CUDA is available and stay on the CPU
    otherwise.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    total_count = 0
    err_count = 0
    pgd_err_count = 0
    if verbose:
        print("Requiring no gradients for parameters.")

    for X, y in loader:
        # Detached copy that tracks gradients with respect to the input.
        X = X.to(device).detach().requires_grad_(True)
        y = y.to(device).long()

        # Labels shaped (N, 1) are flattened to (N,).
        if y.dim() == 2:
            y = y.squeeze(1)

        err, err_pgd = atk(config, model, X, y, epsilon, alpha, loader=loader)
        total_count += X.size(0)
        err_count += err
        pgd_err_count += err_pgd

        # Per-batch counts (this used to reference an undefined name and
        # crashed whenever verbose was enabled).
        if verbose:
            print('clean err: {} | PGD err: {}'.format(err, err_pgd))

    err_rate = err_count / total_count
    pgd_err_rate = pgd_err_count / total_count
    print('[TOTAL] clean err: {:6.4f} | PGD err: {:6.4f}'.format(err_rate, pgd_err_rate))
    return err_rate, pgd_err_rate

def attack_save(config, loader, model, epsilon, alpha, verbose=False, atk=None,
                robust=False, save_path='data/attacked_mnist.pt'):
    """Run ``atk`` on every batch, save the adversarial data, report errors.

    Args:
        config, model, epsilon, alpha: forwarded unchanged to ``atk``.
        loader: iterable of ``(X, y)`` batches; it is also passed to ``atk``
            as the ``loader`` keyword argument.
        verbose: when true, print the per-batch error counts.
        atk: callable invoked as
            ``atk(config, model, X, y, epsilon, alpha, loader=loader)`` and
            returning ``(err, err_pgd, X_pgd)`` for the batch.
        robust: accepted for signature compatibility; not used here.
        save_path: file the ``{"data", "labels"}`` dictionary is written to
            with ``torch.save``.  Missing parent directories are created.

    Returns:
        ``(err_rate, pgd_err_rate)``: the summed counts divided by the total
        number of samples seen.

    Batches are moved to the GPU when CUDA is available and stay on the CPU
    otherwise; the saved tensors live on that same device.
    """
    import os

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    total_count = 0
    err_count = 0
    pgd_err_count = 0

    data = []
    label = []
    if verbose:
        print("Requiring no gradients for parameters.")

    for X, y in loader:
        # Detached copy that tracks gradients with respect to the input.
        X = X.to(device).detach().requires_grad_(True)
        y = y.to(device).long()

        # Labels shaped (N, 1) are flattened to (N,).
        if y.dim() == 2:
            y = y.squeeze(1)

        err, err_pgd, X_pgd = atk(config, model, X, y, epsilon, alpha, loader=loader)
        total_count += X.size(0)
        err_count += err
        pgd_err_count += err_pgd
        # Store the adversarial batch without its autograd history.
        data.append(X_pgd.data)
        label.append(y)

        # Per-batch counts (this used to reference an undefined name and
        # crashed whenever verbose was enabled).
        if verbose:
            print('clean err: {} | PGD err: {}'.format(err, err_pgd))

    datas = torch.cat(data, dim=0)
    labels = torch.cat(label, dim=0)
    dataset = {"data": datas, "labels": labels}
    # Make sure the target directory exists so the results of a long attack
    # run are not lost at the very last step.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    torch.save(dataset, save_path)

    err_rate = err_count / total_count
    pgd_err_rate = pgd_err_count / total_count
    print('[TOTAL] clean err: {:6.4f} | PGD err: {:6.4f}'.format(err_rate, pgd_err_rate))
    return err_rate, pgd_err_rate