import torch
import torch.nn as nn

import numpy as np

from .utils import *

def exp(val):
    """Return two raised to the power ``val`` (base-2, not base-e)."""
    return pow(2, val)
def ln(val):
    """Return the natural logarithm of ``val`` as computed by ``np.log``."""
    logarithm = np.log(val)
    return logarithm
def power(val, factor=2):
    """Return ``val`` raised to ``factor`` (squares ``val`` by default)."""
    return pow(val, factor)

def linear(val, factor=1):
    """Return ``val`` scaled by ``factor`` (unchanged by default)."""
    scaled = val * factor
    return scaled

def normalize(fn, len):
    """
    Build a per-step weight schedule whose entries sum to 16/255.

    Arguments:
        fn (callable): scalar weighting function, evaluated at len, len-1, ..., 1.
        len (int): number of steps (the name shadows the builtin inside this function).

    Returns:
        np.ndarray of length ``len``: ``fn`` values divided by their sum, cast to
        float32 and scaled by 16/255. Entry ``i`` corresponds to ``fn(len - i)``.
    """
    raw = np.array([fn(remaining) for remaining in range(len, 0, -1)])
    fractions = (raw / np.sum(raw)).astype(np.float32)
    return 16 / 255 * fractions

def pvalue(val,epsion=8/255, order=0.6):
    """
    Return ``epsion / (val + 1) ** order``, a polynomially decaying value.

    Arguments:
        val: the position at which to evaluate the decay.
        epsion (float): the numerator, i.e. the value returned at ``val == 0``.
        order (float): the exponent applied to ``val + 1``.
    """
    denominator = (val + 1) ** order
    return epsion / denominator

def identity(val):
    """Return the constant ``1.0`` regardless of ``val`` (uniform weighting)."""
    # ``val`` is accepted only so the signature matches the other weighting functions.
    return 1.0


class Attack(object):
    """
    Base class for all attacks.

    ``forward`` runs an iterative loop: transform the perturbed input, compute the
    logits and the loss (plus an optional penalty on the perturbation), take the
    gradient w.r.t. the perturbation, fold it into a momentum term and update the
    perturbation. Each stage is a separate method so subclasses can override it.

    NOTE: ``forward`` reads ``self.epoch``, ``self.alpha`` and ``self.decay``, none of
    which is assigned in this class; presumably subclasses set them before running.
    ``clamp``, ``img_min``, ``img_max``, ``wrap_model``, ``model_list``, ``ens_pool``
    and ``EnsembleModel`` are not defined here either and presumably come from the
    ``.utils`` star import.
    """
    def __init__(self, attack, model, epsilon, targeted, random_start, norm, loss,device=None,penalty='none', penalty_factor=0., **kwargs):
        """
        Initialize the hyperparameters
        Arguments:
            attack (str): the name of attack.
            model (str): the name of the surrogate model; it is passed to ``load_model``.
            epsilon (float): the perturbation budget.
            targeted (bool): targeted/untargeted attack.
            random_start (bool): whether using random initialization for delta.
            norm (str): the norm of perturbation, l2/linfty.
            loss (str): the loss function, ce/bce.
            device (torch.device): the device for data. If it is None, CUDA is used.
            penalty (str): the penalty on delta added to the loss, none/l2/l1/sl1.
            penalty_factor (float): the weight of the penalty term.
        Raises:
            Exception: if norm, loss or penalty is not one of the supported values.
        """
        if norm not in ['l2', 'linfty']:
            raise Exception("Unsupported norm {}".format(norm))
        self.attack = attack
        self.model = self.load_model(model)
        self.epsilon = epsilon
        self.targeted = targeted
        self.device = torch.device('cuda') if device is None else device
        self.random_start = random_start
        self.loss_type = loss
        self.loss = self.loss_function(self.loss_type)
        self.norm = norm
        self.penalty_type = penalty
        # penalty_factor must be set before penalty_function, which prints it.
        self.penalty_factor = penalty_factor
        self.penalty = self.penalty_function(self.penalty_type)

    def load_model(self, model_name):
        """
        Build the surrogate model.

        If ``model_name`` contains 'ens', every constructor in ``ens_pool`` is
        instantiated and the results are combined in an ``EnsembleModel``; otherwise
        ``model_list[model_name]`` is instantiated. Each model is created with
        ``weights='DEFAULT'``, put in eval mode, moved to CUDA (independently of the
        ``device`` argument of ``__init__``) and passed through ``wrap_model``.
        """
        if 'ens' in model_name:
            model = EnsembleModel([wrap_model(ens_pool[name](weights='DEFAULT').eval().cuda()) for name in ens_pool.keys()])
        else:
            model = wrap_model(model_list[model_name](weights='DEFAULT').eval().cuda())
        return model

    def forward(self, data, label, **kwargs):
        """
        The general attack procedure
        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels
        Returns:
            the detached perturbation delta, with the same shape as data.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # Initialize adversarial perturbation
        delta = self.init_delta(data, label = label)

        momentum = 0.
        for _ in range(self.epoch):
            # Obtain the output
            logits = self.get_logits(self.transform(data+delta, momentum=momentum))

            # Calculate the loss, including the weighted penalty on delta
            loss = self.get_loss(logits, label)
            loss = loss + self.penalty_factor * self.penalty(delta)

            # Calculate the gradients
            grad = self.get_grad(loss, delta)

            # Calculate the momentum and take one step along it
            momentum = self.get_momentum(grad, momentum, decay=self.decay)
            delta = self.update_delta(delta, data, momentum, self.alpha)
        return delta.detach()

    def get_logits(self, x, **kwargs):
        """
        The inference stage, which should be overridden when the attack need to change the models (e.g., ensemble-model attack, ghost, etc.) or the input (e.g. DIM, SIM, etc.)
        """
        return self.model(x)

    def get_loss(self, logits, label):
        """
        The loss calculation, which should be overrideen when the attack change the loss calculation (e.g., ATA, etc.)

        The loss is negated for targeted attacks, so the same update direction
        lowers the loss on the target label instead of raising it.
        """
        loss = self.loss(logits, label)
        return -loss if self.targeted else loss

    def get_grad(self, loss, delta, **kwargs):
        """
        The gradient calculation, which should be overridden when the attack need to tune the gradient (e.g., TIM, variance tuning, enhanced momentum, etc.)
        """
        return torch.autograd.grad(loss, delta, retain_graph=False, create_graph=False)[0]

    def get_momentum(self, grad, momentum, decay=None, **kwargs):
        """
        The momentum calculation

        Returns ``momentum * decay`` plus ``grad`` divided by its mean absolute value
        over dims (1, 2, 3). If ``decay`` is None, ``self.decay`` is used.

        NOTE: a sample whose gradient is entirely zero divides 0 by 0 here and
        therefore produces NaN.
        """
        if decay is None:
            decay = self.decay
        return momentum * decay + grad / (grad.abs().mean(dim=(1,2,3), keepdim=True))

    def _sample_l2_noise(self, delta):
        """
        Fill ``delta`` in place with random noise whose per-sample L2 norm is at most epsilon.

        Gaussian noise is divided by its per-sample L2 norm, then every element is
        scaled by an independent uniform factor in [0, 1) and by epsilon.
        """
        delta.normal_(0, self.epsilon)
        flat = delta.view(delta.size(0), -1)
        # The norm is taken over the flattened feature axis (dim=1), one value per sample.
        per_sample_norm = flat.norm(p=2, dim=1).view(delta.size(0), 1, 1, 1)
        radius = torch.zeros_like(delta).uniform_(0, 1)
        delta *= radius / per_sample_norm * self.epsilon
        return delta

    def init_delta(self, data, **kwargs):
        """
        Create the initial perturbation.

        Returns a zero tensor shaped like ``data`` unless ``self.random_start`` is
        set, in which case the tensor is filled with uniform noise in
        [-epsilon, epsilon] (linfty) or with L2-bounded noise (l2) and clamped so
        that ``data + delta`` stays within [img_min, img_max]. The result has
        ``requires_grad`` enabled.
        """
        delta = torch.zeros_like(data).to(self.device)
        if self.random_start:
            if self.norm == 'linfty':
                delta.uniform_(-self.epsilon, self.epsilon)
            else:
                delta = self._sample_l2_noise(delta)
            delta = clamp(delta, img_min-data, img_max-data)
        delta.requires_grad = True
        return delta

    def update_delta(self, delta, data, grad, alpha, **kwargs):
        """
        Take one step of size ``alpha`` along ``grad`` and project back onto the budget.

        linfty: step along the sign of ``grad`` and clip to [-epsilon, epsilon].
        l2: step along ``grad`` normalised per sample, then renormalise each sample
        so that its L2 norm does not exceed epsilon.
        In both cases the result is clamped so ``data + delta`` stays within
        [img_min, img_max].
        """
        if self.norm == 'linfty':
            delta = torch.clamp(delta + alpha * grad.sign(), -self.epsilon, self.epsilon)
        else:
            grad_norm = torch.norm(grad.view(grad.size(0), -1), dim=1).view(-1, 1, 1, 1)
            # 1e-20 guards against division by zero for an all-zero gradient.
            scaled_grad = grad / (grad_norm + 1e-20)
            delta = (delta + scaled_grad * alpha).view(delta.size(0), -1).renorm(p=2, dim=0, maxnorm=self.epsilon).view_as(delta)
        delta = clamp(delta, img_min-data, img_max-data)
        return delta

    def loss_function(self, loss):
        """
        Get the loss function

        Supports 'ce' (``nn.CrossEntropyLoss``) and 'bce' (``nn.BCEWithLogitsLoss``);
        any other value raises an Exception.
        """
        if loss == 'ce':
            return nn.CrossEntropyLoss()
        elif loss == 'bce':
            return nn.BCEWithLogitsLoss()
        else:
            raise Exception("Unsupported loss {}".format(loss))

    def transform(self, data, **kwargs):
        """
        The input transformation applied before inference; the identity here.
        """
        return data

    def penalty_function(self, penalty_type):
        """
        Get the penalty function

        Returns a callable mapping delta to a scalar penalty. The chosen penalty and
        the penalty factor are printed as a side effect. Any unsupported type raises
        an Exception.
        """
        print('penalty_factor: ', self.penalty_factor)
        if penalty_type == 'none':
            return lambda x: 0
        elif penalty_type == 'l2':
            print('l2 penalty')
            return lambda x: torch.sum(x**2)
        elif penalty_type == 'l1':
            print('l1 penalty')
            return lambda x: torch.sum(x.abs())
        elif penalty_type == 'sl1':
            print('smooth l1 penalty')
            # sqrt(1e-12 + x**2) - 1e-6 is zero at x == 0 and close to |x| elsewhere.
            return lambda x: torch.sum(torch.sqrt(1e-12 + x**2) - 1e-6)
        else:
            raise Exception("Unsupported penalty {}".format(penalty_type))

    def __call__(self, *input, **kwargs):
        """
        Put the model in eval mode and run ``forward``.
        """
        self.model.eval()
        return self.forward(*input, **kwargs)



class ScheduledAttack(object):
    """
    Attack base class that uses a per-iteration step-size schedule.

    ``forward`` draws a fresh random perturbation at every iteration, takes the
    gradient of the loss at that perturbed point, folds it into a momentum term and
    applies the momentum to a separate perturbation (``delta_dual``) with the step
    size ``self.ai[i]``. The schedule ``self.ai`` is built by ``normalize`` from the
    weighting function selected by ``step_type``. ``delta_dual`` is what is returned.

    NOTE: ``forward`` reads ``self.epoch`` and ``self.decay``, neither of which is
    assigned in this class; presumably subclasses set them before running.
    ``clamp``, ``img_min``, ``img_max``, ``wrap_model`` and ``model_list`` are not
    defined here either and presumably come from the ``.utils`` star import.
    """
    def __init__(self, attack, model, epsilon, targeted, random_start, norm, loss,device=None, step_type='ln', **kwargs):
        """
        Initialize the hyperparameters
        Arguments:
            attack (str): the name of attack.
            model (str): the name of the surrogate model; it is passed to ``load_model``.
            epsilon (float): the perturbation budget.
            targeted (bool): targeted/untargeted attack.
            random_start (bool): whether using random initialization for delta.
            norm (str): the norm of perturbation, l2/linfty.
            loss (str): the loss function, only ce is supported.
            device (torch.device): accepted for interface compatibility but ignored; 'cuda' is always used.
            step_type (str): the step-size schedule, ln/exp/power/linear/pvalue/identity.
        Raises:
            Exception: if norm, loss or step_type is not one of the supported values.
        """
        if norm not in ['l2', 'linfty']:
            raise Exception("Unsupported norm {}".format(norm))
        self.attack = attack
        self.model = self.load_model(model)
        self.epsilon = epsilon
        self.targeted = targeted
        self.random_start = random_start
        self.norm = norm
        # The model is always moved to CUDA by load_model, so the data device is fixed too.
        self.device = 'cuda'
        self.loss = self.loss_function(loss)
        self.step_type = step_type
        step_ops = {
            'ln': self.ln,
            'exp': self.exp,
            'power': self.power,
            'linear': self.linear,
            'pvalue': self.pvalue,
            'identity': self.identity,
        }
        self.op = step_ops.get(step_type) if isinstance(step_type, str) else None
        # Earlier versions only selected ``power`` when the function object itself
        # was passed as step_type; that form is still accepted.
        if self.op is None and step_type == self.power:
            self.op = self.power
        if self.op is None:
            raise Exception("Unsupported step type {}".format(step_type))

    def load_model(self, model_name):
        """
        Build the surrogate model: ``model_list[model_name]`` is instantiated with
        ``weights='DEFAULT'``, put in eval mode, moved to CUDA and passed through
        ``wrap_model``.
        """
        model = wrap_model(model_list[model_name](weights='DEFAULT').eval().cuda())
        return model

    def forward(self, data, label, **kwargs):
        """
        The general attack procedure
        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels
        Returns:
            the detached perturbation ``delta_dual``, with the same shape as data.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # Initialize adversarial perturbation
        delta = self.init_delta(data)
        delta_dual = delta.clone().detach().to(self.device)
        # Initialize momentum
        momentum = 0.
        grad = 0.
        self.ai = self.normalize(self.op, self.epoch)

        # The loop toggles self.random_start to force random sampling; remember the
        # configured value so the instance is left unchanged for the next call.
        configured_random_start = self.random_start
        try:
            for step in range(self.epoch):
                # Sample a fresh random perturbation and obtain the output
                self.random_start = True
                delta = self.init_delta(data)
                logits = self.get_logits(self.transform(data+delta, momentum=grad, alpha=self.ai[step]))
                self.random_start = False
                # Calculate the loss
                loss = self.get_loss(logits, label)
                # Calculate the gradients
                grad = self.get_grad(loss, delta)
                # Calculate the momentum
                momentum = self.get_momentum(grad, momentum, decay=self.decay)
                # NOTE(review): this delta is replaced by a fresh sample on the next
                # iteration and is not returned; the call is kept because
                # update_delta is an overridable hook.
                delta = self.update_delta(delta, data, grad, self.ai[step])
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
        finally:
            self.random_start = configured_random_start
        return delta_dual.detach()

    def get_logits(self, x, **kwargs):
        """
        The inference stage, which should be overridden when the attack need to change the models (e.g., ensemble-model attack, ghost, etc.) or the input (e.g. DIM, SIM, etc.)
        """
        return self.model(x)

    def get_loss(self, logits, label):
        """
        The loss calculation, which should be overrideen when the attack change the loss calculation (e.g., ATA, etc.)

        The loss is negated for targeted attacks.
        """
        loss = self.loss(logits, label)
        return -loss if self.targeted else loss

    def get_grad(self, loss, delta, **kwargs):
        """
        The gradient calculation, which should be overridden when the attack need to tune the gradient (e.g., TIM, variance tuning, enhanced momentum, etc.)
        """
        return torch.autograd.grad(loss, delta, retain_graph=False, create_graph=False)[0]

    def get_momentum(self, grad, momentum, decay=None, **kwargs):
        """
        The momentum calculation

        Returns ``momentum * decay`` plus ``grad`` divided by its mean absolute value
        over dims (1, 2, 3). If ``decay`` is None, ``self.decay`` is used.

        NOTE: a sample whose gradient is entirely zero divides 0 by 0 here and
        therefore produces NaN.
        """
        if decay is None:
            decay = self.decay
        return momentum * decay + grad / (grad.abs().mean(dim=(1,2,3), keepdim=True))

    def _sample_l2_noise(self, delta):
        """
        Fill ``delta`` in place with random noise whose per-sample L2 norm is at most epsilon.

        Zero-mean Gaussian noise is divided by its per-sample L2 norm, then every
        element is scaled by an independent uniform factor in [0, 1) and by epsilon.
        """
        delta.normal_(0, self.epsilon)
        flat = delta.view(delta.size(0), -1)
        # The norm is taken over the flattened feature axis (dim=1), one value per sample.
        per_sample_norm = flat.norm(p=2, dim=1).view(delta.size(0), 1, 1, 1)
        radius = torch.zeros_like(delta).uniform_(0, 1)
        delta *= radius / per_sample_norm * self.epsilon
        return delta

    def init_delta(self, data, **kwargs):
        """
        Create a perturbation shaped like ``data``.

        Returns zeros unless ``self.random_start`` is set, in which case the tensor
        is filled with uniform noise in [-epsilon, epsilon] (linfty) or with
        L2-bounded noise (l2) and clamped so that ``data + delta`` stays within
        [img_min, img_max]. The result has ``requires_grad`` enabled.
        """
        delta = torch.zeros_like(data).to(self.device)
        if self.random_start:
            if self.norm == 'linfty':
                delta.uniform_(-self.epsilon, self.epsilon)
            else:
                delta = self._sample_l2_noise(delta)
            delta = clamp(delta, img_min-data, img_max-data)
        delta.requires_grad = True
        return delta

    def update_delta(self, delta, data, grad, alpha, **kwargs):
        """
        Take one step of size ``alpha`` along ``grad`` and project back onto the budget.

        linfty: step along the sign of ``grad`` and clip to [-epsilon, epsilon].
        l2: step along ``grad`` normalised per sample, then renormalise each sample
        so that its L2 norm does not exceed epsilon.
        In both cases the result is clamped so ``data + delta`` stays within
        [img_min, img_max].
        """
        if self.norm == 'linfty':
            delta = torch.clamp(delta + alpha * grad.sign(), -self.epsilon, self.epsilon)
        else:
            grad_norm = torch.norm(grad.view(grad.size(0), -1), dim=1).view(-1, 1, 1, 1)
            # 1e-20 guards against division by zero for an all-zero gradient.
            scaled_grad = grad / (grad_norm + 1e-20)
            delta = (delta + scaled_grad * alpha).view(delta.size(0), -1).renorm(p=2, dim=0, maxnorm=self.epsilon).view_as(delta)
        delta = clamp(delta, img_min-data, img_max-data)
        return delta

    def loss_function(self, loss):
        """
        Get the loss function; only 'ce' (``nn.CrossEntropyLoss``) is supported.
        """
        if loss == 'ce':
            return nn.CrossEntropyLoss()
        else:
            raise Exception("Unsupported loss {}".format(loss))

    def transform(self, data, **kwargs):
        """
        The input transformation applied before inference; the identity here.
        """
        return data

    def __call__(self, *input, **kwargs):
        """
        Put the model in eval mode and run ``forward``.
        """
        self.model.eval()
        return self.forward(*input, **kwargs)

    @staticmethod
    def exp(val):
        """Return two raised to the power ``val``."""
        return 2**val

    @staticmethod
    def ln(val):
        """Return the natural logarithm of ``val``."""
        return np.log(val)

    @staticmethod
    def power(val, factor=2):
        """Return ``val`` raised to ``factor``."""
        return val ** factor

    @staticmethod
    def linear(val, factor=1):
        """Return ``val`` multiplied by ``factor``."""
        return val * factor

    @staticmethod
    def normalize(fn, len):
        """
        Build the step-size schedule: ``fn(1), ..., fn(len)`` divided by their sum,
        cast to float32 and scaled by 16/255, so the steps sum to 16/255.
        """
        val = np.array([fn(len-i) for i in range(len)])[::-1]
        summation = np.sum(val)
        return 16/255*(val / summation).astype(np.float32)

    @staticmethod
    def pvalue(val,epsion=8/255, order=0.6):
        """Return ``epsion / (val + 1) ** order``."""
        return epsion/(val+1)**order

    @staticmethod
    def identity(val):
        """Return the constant 1.0 regardless of ``val``."""
        return 1.
    
    


class StepAttack(object):
    """
    Attack base class that applies the raw gradient with a per-iteration step-size schedule.

    ``forward`` keeps a single perturbation and, at iteration ``i``, updates it along
    the gradient of the loss with the step size ``self.ai[i]``. The schedule
    ``self.ai`` is built by ``normalize`` from the weighting function selected by
    ``step_type``. A momentum term is computed each iteration but the update uses
    the gradient, not the momentum.

    NOTE: ``forward`` reads ``self.epoch`` and ``self.decay``, neither of which is
    assigned in this class; presumably subclasses set them before running.
    ``clamp``, ``img_min``, ``img_max``, ``wrap_model`` and ``model_list`` are not
    defined here either and presumably come from the ``.utils`` star import.
    """
    def __init__(self, attack, model, epsilon, targeted, random_start, norm, loss,device=None, step_type='ln', **kwargs):
        """
        Initialize the hyperparameters
        Arguments:
            attack (str): the name of attack.
            model (str): the name of the surrogate model; it is passed to ``load_model``.
            epsilon (float): the perturbation budget.
            targeted (bool): targeted/untargeted attack.
            random_start (bool): whether using random initialization for delta.
            norm (str): the norm of perturbation, l2/linfty.
            loss (str): the loss function, only ce is supported.
            device (torch.device): accepted for interface compatibility but ignored; 'cuda' is always used.
            step_type (str): the step-size schedule, ln/exp/power/linear/pvalue/identity.
        Raises:
            Exception: if norm, loss or step_type is not one of the supported values.
        """
        if norm not in ['l2', 'linfty']:
            raise Exception("Unsupported norm {}".format(norm))
        self.attack = attack
        self.model = self.load_model(model)
        self.epsilon = epsilon
        self.targeted = targeted
        self.random_start = random_start
        self.norm = norm
        # The model is always moved to CUDA by load_model, so the data device is fixed too.
        self.device = 'cuda'
        self.loss = self.loss_function(loss)
        self.step_type = step_type
        step_ops = {
            'ln': self.ln,
            'exp': self.exp,
            'power': self.power,
            'linear': self.linear,
            'pvalue': self.pvalue,
            'identity': self.identity,
        }
        self.op = step_ops.get(step_type) if isinstance(step_type, str) else None
        # Earlier versions only selected ``power`` when the function object itself
        # was passed as step_type; that form is still accepted.
        if self.op is None and step_type == self.power:
            self.op = self.power
        if self.op is None:
            raise Exception("Unsupported step type {}".format(step_type))

    def load_model(self, model_name):
        """
        Build the surrogate model: ``model_list[model_name]`` is instantiated with
        ``weights='DEFAULT'``, put in eval mode, moved to CUDA and passed through
        ``wrap_model``.
        """
        model = wrap_model(model_list[model_name](weights='DEFAULT').eval().cuda())
        return model

    def forward(self, data, label, **kwargs):
        """
        The general attack procedure
        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels
        Returns:
            the detached perturbation delta, with the same shape as data.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # Initialize adversarial perturbation
        delta = self.init_delta(data)
        # Initialize momentum
        momentum = 0.
        grad = 0.
        self.ai = self.normalize(self.op, self.epoch)

        # The loop toggles self.random_start; remember the configured value so the
        # instance is left unchanged for the next call.
        configured_random_start = self.random_start
        try:
            for step in range(self.epoch):
                # Obtain the output
                self.random_start = True
                logits = self.get_logits(self.transform(data+delta, momentum=grad, alpha=self.ai[step]))
                self.random_start = False
                # Calculate the loss
                loss = self.get_loss(logits, label)
                # Calculate the gradients
                grad = self.get_grad(loss, delta)
                # Calculate the momentum (not used by the update below)
                momentum = self.get_momentum(grad, momentum, decay=self.decay)
                delta = self.update_delta(delta, data, grad, self.ai[step])
        finally:
            self.random_start = configured_random_start
        return delta.detach()

    def get_logits(self, x, **kwargs):
        """
        The inference stage, which should be overridden when the attack need to change the models (e.g., ensemble-model attack, ghost, etc.) or the input (e.g. DIM, SIM, etc.)
        """
        return self.model(x)

    def get_loss(self, logits, label):
        """
        The loss calculation, which should be overrideen when the attack change the loss calculation (e.g., ATA, etc.)

        The loss is negated for targeted attacks.
        """
        loss = self.loss(logits, label)
        return -loss if self.targeted else loss

    def get_grad(self, loss, delta, **kwargs):
        """
        The gradient calculation, which should be overridden when the attack need to tune the gradient (e.g., TIM, variance tuning, enhanced momentum, etc.)
        """
        return torch.autograd.grad(loss, delta, retain_graph=False, create_graph=False)[0]

    def get_momentum(self, grad, momentum, decay=None, **kwargs):
        """
        The momentum calculation

        Returns ``momentum * decay`` plus ``grad`` divided by its mean absolute value
        over dims (1, 2, 3). If ``decay`` is None, ``self.decay`` is used.

        NOTE: a sample whose gradient is entirely zero divides 0 by 0 here and
        therefore produces NaN.
        """
        if decay is None:
            decay = self.decay
        return momentum * decay + grad / (grad.abs().mean(dim=(1,2,3), keepdim=True))

    def _sample_l2_noise(self, delta):
        """
        Fill ``delta`` in place with random noise whose per-sample L2 norm is at most epsilon.

        Zero-mean Gaussian noise is divided by its per-sample L2 norm, then every
        element is scaled by an independent uniform factor in [0, 1) and by epsilon.
        """
        delta.normal_(0, self.epsilon)
        flat = delta.view(delta.size(0), -1)
        # The norm is taken over the flattened feature axis (dim=1), one value per sample.
        per_sample_norm = flat.norm(p=2, dim=1).view(delta.size(0), 1, 1, 1)
        radius = torch.zeros_like(delta).uniform_(0, 1)
        delta *= radius / per_sample_norm * self.epsilon
        return delta

    def init_delta(self, data, **kwargs):
        """
        Create a perturbation shaped like ``data``.

        Returns zeros unless ``self.random_start`` is set, in which case the tensor
        is filled with uniform noise in [-epsilon, epsilon] (linfty) or with
        L2-bounded noise (l2) and clamped so that ``data + delta`` stays within
        [img_min, img_max]. The result has ``requires_grad`` enabled.
        """
        delta = torch.zeros_like(data).to(self.device)
        if self.random_start:
            if self.norm == 'linfty':
                delta.uniform_(-self.epsilon, self.epsilon)
            else:
                delta = self._sample_l2_noise(delta)
            delta = clamp(delta, img_min-data, img_max-data)
        delta.requires_grad = True
        return delta

    def update_delta(self, delta, data, grad, alpha, **kwargs):
        """
        Take one step of size ``alpha`` along ``grad`` and project back onto the budget.

        linfty: step along the sign of ``grad`` and clip to [-epsilon, epsilon].
        l2: step along ``grad`` normalised per sample, then renormalise each sample
        so that its L2 norm does not exceed epsilon.
        In both cases the result is clamped so ``data + delta`` stays within
        [img_min, img_max].
        """
        if self.norm == 'linfty':
            delta = torch.clamp(delta + alpha * grad.sign(), -self.epsilon, self.epsilon)
        else:
            grad_norm = torch.norm(grad.view(grad.size(0), -1), dim=1).view(-1, 1, 1, 1)
            # 1e-20 guards against division by zero for an all-zero gradient.
            scaled_grad = grad / (grad_norm + 1e-20)
            delta = (delta + scaled_grad * alpha).view(delta.size(0), -1).renorm(p=2, dim=0, maxnorm=self.epsilon).view_as(delta)
        delta = clamp(delta, img_min-data, img_max-data)
        return delta

    def loss_function(self, loss):
        """
        Get the loss function; only 'ce' (``nn.CrossEntropyLoss``) is supported.
        """
        if loss == 'ce':
            return nn.CrossEntropyLoss()
        else:
            raise Exception("Unsupported loss {}".format(loss))

    def transform(self, data, **kwargs):
        """
        The input transformation applied before inference; the identity here.
        """
        return data

    def __call__(self, *input, **kwargs):
        """
        Put the model in eval mode and run ``forward``.
        """
        self.model.eval()
        return self.forward(*input, **kwargs)

    @staticmethod
    def exp(val):
        """Return two raised to the power ``val``."""
        return 2**val

    @staticmethod
    def ln(val):
        """Return the natural logarithm of ``val``."""
        return np.log(val)

    @staticmethod
    def power(val, factor=2):
        """Return ``val`` raised to ``factor``."""
        return val ** factor

    @staticmethod
    def linear(val, factor=1):
        """Return ``val`` multiplied by ``factor``."""
        return val * factor

    @staticmethod
    def normalize(fn, len):
        """
        Build the step-size schedule: ``fn(1), ..., fn(len)`` divided by their sum,
        cast to float32 and scaled by 16/255, so the steps sum to 16/255.
        """
        val = np.array([fn(len-i) for i in range(len)])[::-1]
        summation = np.sum(val)
        return 16/255*(val / summation).astype(np.float32)

    @staticmethod
    def pvalue(val,epsion=8/255, order=0.6):
        """Return ``epsion / (val + 1) ** order``."""
        return epsion/(val+1)**order

    @staticmethod
    def identity(val):
        """Return the constant 1.0 regardless of ``val``."""
        return 1.

