import torch

from ..utils import *
from ..attack import ScheduledAttack
import torch.nn.functional as F

from .sifgsm import SIFGSM,SMIFGSM,SNIFGSM,SPIFGSM,SEMIFGSM,SVMIFGSM,SGIMIFGSM



class ENS_FGSM_SIFGSM(ScheduledAttack):
    """
    Scheduled iterative attack whose update direction is the average gradient of an
    ensemble of randomly re-started perturbations (no momentum accumulation).

    Builds on the iterative FGSM family, see
    'Boosting Adversarial Attacks with Momentum (CVPR 2018)'(https://arxiv.org/abs/1710.06081)

    At every iteration ``ens_num`` perturbations are freshly drawn through
    ``init_delta`` with ``random_start`` forced on, the loss gradient is taken at each
    of them, and the averaged gradient is passed through ``get_momentum`` with the
    decay pinned to 0 before it updates the perturbation that is finally returned.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): stored on the instance; ``forward`` takes its per-iteration step from ``self.ai`` instead.
        epoch (int): the number of iterations.
        decay (float): accepted for signature compatibility only; this class always uses a decay of 0.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for the returned delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): the attack name forwarded to ScheduledAttack.
        adapted, rho: accepted but not used by this class.

    Defaults:
        epsilon=16/255, alpha=1.6/255, epoch=10; the ensemble size is fixed to 10.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False, 
                norm='linfty', loss='crossentropy', device=None, attack='MI-FGSM', adapted=False,rho=1.0,**kwargs):
        super().__init__(attack, model=model, epsilon=epsilon, targeted=targeted, random_start=random_start, norm=norm, loss=loss, device=device, **kwargs)
        self.alpha = alpha
        self.epoch = epoch
        # Deliberately ignores the `decay` argument: no momentum is accumulated here.
        self.decay = 0
        # Number of random restarts averaged per iteration.
        self.ens_num = 10

    def forward(self, data, label, **kwargs):
        """
        Run the ensemble attack.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation obtained after ``epoch`` updates.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # The ensemble sampling below toggles self.random_start; remember the
        # caller's setting so it is not silently overwritten for later calls.
        user_random_start = self.random_start
        try:
            # Initialize adversarial perturbation (honours the user's random_start)
            delta = self.init_delta(data)
            delta_dual = delta.clone().detach().to(self.device)
            momentum = 0.
            # Gradient of the most recent ensemble member; handed to transform().
            grad = 0.
            # Per-iteration step values, indexed by the iteration counter.
            self.ai = self.normalize(self.op, self.epoch)

            for step in range(self.epoch):
                ens_grad = 0.
                for _ in range(self.ens_num):
                    # Draw a fresh random perturbation for this ensemble member.
                    self.random_start = True
                    delta = self.init_delta(data)
                    logits = self.get_logits(self.transform(data+delta, momentum=grad, alpha=self.ai[step]))
                    self.random_start = False
                    loss = self.get_loss(logits, label)
                    grad = self.get_grad(loss, delta)
                    ens_grad += grad
                ens_grad /= self.ens_num
                momentum = self.get_momentum(ens_grad, momentum, decay=self.decay)
                # NOTE(review): this result is replaced by the next re-sampling and
                # never returned; kept in case update_delta has side effects.
                delta = self.update_delta(delta, data, grad, self.ai[step])
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
            return delta_dual.detach()
        finally:
            self.random_start = user_random_start


class ENS_FGSM_SMIFGSM(ScheduledAttack):
    """
    Scheduled momentum attack whose gradient estimate is the average over an
    ensemble of randomly re-started perturbations.

    Builds on MI-FGSM:
    'Boosting Adversarial Attacks with Momentum (CVPR 2018)'(https://arxiv.org/abs/1710.06081)

    At every iteration ``ens_num`` perturbations are freshly drawn through
    ``init_delta`` with ``random_start`` forced on, the loss gradient is taken at each
    of them, and the averaged gradient is accumulated by ``get_momentum`` (with
    ``decay``) before it updates the perturbation that is finally returned.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): stored on the instance; ``forward`` takes its per-iteration step from ``self.ai`` instead.
        epoch (int): the number of iterations.
        decay (float): the decay factor for momentum calculation.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for the returned delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): the attack name forwarded to ScheduledAttack.
        adapted, rho: accepted but not used by this class.

    Defaults:
        epsilon=16/255, alpha=1.6/255, epoch=10, decay=1.; the ensemble size is fixed to 10.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False, 
                norm='linfty', loss='crossentropy', device=None, attack='MI-FGSM', adapted=False,rho=1.0,**kwargs):
        super().__init__(attack, model=model, epsilon=epsilon, targeted=targeted, random_start=random_start, norm=norm, loss=loss, device=device, **kwargs)
        self.alpha = alpha
        self.epoch = epoch
        self.decay = decay
        # Number of random restarts averaged per iteration.
        self.ens_num = 10

    def forward(self, data, label, **kwargs):
        """
        Run the ensemble momentum attack.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation obtained after ``epoch`` updates.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # The ensemble sampling below toggles self.random_start; remember the
        # caller's setting so it is not silently overwritten for later calls.
        user_random_start = self.random_start
        try:
            # Initialize adversarial perturbation (honours the user's random_start)
            delta = self.init_delta(data)
            delta_dual = delta.clone().detach().to(self.device)
            momentum = 0.
            # Gradient of the most recent ensemble member; handed to transform().
            grad = 0.
            # Per-iteration step values, indexed by the iteration counter.
            self.ai = self.normalize(self.op, self.epoch)

            for step in range(self.epoch):
                ens_grad = 0.
                for _ in range(self.ens_num):
                    # Draw a fresh random perturbation for this ensemble member.
                    self.random_start = True
                    delta = self.init_delta(data)
                    logits = self.get_logits(self.transform(data+delta, momentum=grad, alpha=self.ai[step]))
                    self.random_start = False
                    loss = self.get_loss(logits, label)
                    grad = self.get_grad(loss, delta)
                    ens_grad += grad
                ens_grad /= self.ens_num
                momentum = self.get_momentum(ens_grad, momentum, decay=self.decay)
                # NOTE(review): this result is replaced by the next re-sampling and
                # never returned; kept in case update_delta has side effects.
                delta = self.update_delta(delta, data, grad, self.ai[step])
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
            return delta_dual.detach()
        finally:
            self.random_start = user_random_start
    
class ENS_IFGSM_SMIFGSM(ScheduledAttack):
    """
    Scheduled momentum attack driven by an ensemble of persistent, iteratively
    updated perturbations.

    Builds on MI-FGSM:
    'Boosting Adversarial Attacks with Momentum (CVPR 2018)'(https://arxiv.org/abs/1710.06081)

    ``ens_num`` perturbations are drawn once (``random_start`` forced on). At every
    iteration the loss gradient is taken at each member, the average is accumulated by
    ``get_momentum`` and applied to the returned perturbation, and each member ``e`` is
    advanced by ``update_delta`` with its own gradient and the member-specific step
    ``self.ai[step] * (2*e + 1) / ens_num``.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): stored on the instance; ``forward`` takes its per-iteration step from ``self.ai`` instead.
        epoch (int): the number of iterations.
        decay (float): the decay factor for momentum calculation.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for the returned delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): the attack name forwarded to ScheduledAttack.
        adapted, rho: accepted but not used by this class.

    Defaults:
        epsilon=16/255, alpha=1.6/255, epoch=10, decay=1.; the ensemble size is fixed to 10.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False, 
                norm='linfty', loss='crossentropy', device=None, attack='MI-FGSM', adapted=False,rho=1.0,**kwargs):
        super().__init__(attack, model=model, epsilon=epsilon, targeted=targeted, random_start=random_start, norm=norm, loss=loss, device=device, **kwargs)
        self.alpha = alpha
        self.epoch = epoch
        self.decay = decay
        # Number of ensemble members tracked in parallel.
        self.ens_num = 10

    def forward(self, data, label, **kwargs):
        """
        Run the ensemble momentum attack.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation obtained after ``epoch`` updates.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # The ensemble sampling below toggles self.random_start; remember the
        # caller's setting so it is not silently overwritten for later calls.
        user_random_start = self.random_start
        try:
            # Initialize adversarial perturbation (honours the user's random_start)
            delta = self.init_delta(data)
            delta_dual = delta.clone().detach().to(self.device)
            # Ensemble members always start from a random perturbation.
            self.random_start = True
            delta_list = [self.init_delta(data) for _ in range(self.ens_num)]
            self.random_start = False
            momentum = 0.
            # Per-iteration step values, indexed by the iteration counter.
            self.ai = self.normalize(self.op, self.epoch)
            grad_list = [0. for _ in range(self.ens_num)]

            for step in range(self.epoch):
                ens_grad = 0.
                for e in range(self.ens_num):
                    logits = self.get_logits(self.transform(data+delta_list[e]))
                    loss = self.get_loss(logits, label)
                    grad_list[e] = self.get_grad(loss, delta_list[e])
                    ens_grad += grad_list[e]
                ens_grad /= self.ens_num
                momentum = self.get_momentum(ens_grad, momentum, decay=self.decay)
                # Advance every member along its own gradient; the step grows with the member index.
                for e in range(self.ens_num):
                    delta_list[e] = self.update_delta(delta_list[e], data, grad_list[e], self.ai[step]*(2*e+1)/(self.ens_num))
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
            return delta_dual.detach()
        finally:
            self.random_start = user_random_start
    
    
class ENS_MIX_SMIFGSM(ScheduledAttack):
    """
    Scheduled momentum attack driven by a mixed ensemble: even-indexed members are
    re-sampled at random every iteration, odd-indexed members persist and are
    updated iteratively.

    Builds on MI-FGSM:
    'Boosting Adversarial Attacks with Momentum (CVPR 2018)'(https://arxiv.org/abs/1710.06081)

    At every iteration the loss gradient is taken at each of the ``ens_num`` members,
    the average is accumulated by ``get_momentum`` and applied to the returned
    perturbation. Each member ``e`` is then passed through ``update_delta`` with its
    own gradient and the step ``self.ai[step] * (2*e + 1) / ens_num``.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): stored on the instance; ``forward`` takes its per-iteration step from ``self.ai`` instead.
        epoch (int): the number of iterations.
        decay (float): the decay factor for momentum calculation.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for the returned delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): the attack name forwarded to ScheduledAttack.
        adapted, rho: accepted but not used by this class.

    Defaults:
        epsilon=16/255, alpha=1.6/255, epoch=10, decay=1.; the ensemble size is fixed to 10.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False, 
                norm='linfty', loss='crossentropy', device=None, attack='MI-FGSM', adapted=False,rho=1.0,**kwargs):
        super().__init__(attack, model=model, epsilon=epsilon, targeted=targeted, random_start=random_start, norm=norm, loss=loss, device=device, **kwargs)
        self.alpha = alpha
        self.epoch = epoch
        self.decay = decay
        # Number of ensemble members tracked in parallel.
        self.ens_num = 10

    def forward(self, data, label, **kwargs):
        """
        Run the mixed-ensemble momentum attack.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation obtained after ``epoch`` updates.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # The ensemble sampling below toggles self.random_start; remember the
        # caller's setting so it is not silently overwritten for later calls.
        user_random_start = self.random_start
        try:
            # Initialize adversarial perturbation (honours the user's random_start)
            delta = self.init_delta(data)
            delta_dual = delta.clone().detach().to(self.device)
            # Ensemble members always start from a random perturbation.
            self.random_start = True
            delta_list = [self.init_delta(data) for _ in range(self.ens_num)]
            self.random_start = False
            momentum = 0.
            # Per-iteration step values, indexed by the iteration counter.
            self.ai = self.normalize(self.op, self.epoch)
            grad_list = [0. for _ in range(self.ens_num)]

            for step in range(self.epoch):
                ens_grad = 0.
                for e in range(self.ens_num):
                    # Even members are re-drawn each iteration; odd members keep their state.
                    resample = e % 2 == 0
                    if resample:
                        self.random_start = True
                        delta_list[e] = self.init_delta(data)
                    logits = self.get_logits(self.transform(data+delta_list[e]))
                    loss = self.get_loss(logits, label)
                    grad_list[e] = self.get_grad(loss, delta_list[e])
                    ens_grad += grad_list[e]
                    if resample:
                        self.random_start = False
                ens_grad /= self.ens_num
                momentum = self.get_momentum(ens_grad, momentum, decay=self.decay)
                # The step grows with the member index. For even members the result
                # is replaced by the re-sampling at the next iteration.
                for e in range(self.ens_num):
                    delta_list[e] = self.update_delta(delta_list[e], data, grad_list[e], self.ai[step]*(2*e+1)/(self.ens_num))
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
            return delta_dual.detach()
        finally:
            self.random_start = user_random_start
    
class ENS_FGSM_SVMIFGSM(ScheduledAttack):
    """
    Scheduled variance-tuned momentum attack whose base gradient is the average over
    an ensemble of randomly re-started perturbations.

    Builds on VMI-FGSM:
    'Enhancing the transferability of adversarial attacks through variance tuning (CVPR 2021)'(https://arxiv.org/abs/2103.15571)

    At every iteration ``ens_num`` perturbations are freshly drawn through
    ``init_delta`` with ``random_start`` forced on and their gradients are averaged.
    The returned perturbation is updated with a momentum built from that average
    plus the gradient variance estimated at the previous iteration.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): stored on the instance; ``forward`` takes its per-iteration step from ``self.ai`` instead.
        beta (float): the relative value for the neighborhood (radius = beta * epsilon).
        num_neighbor (int): the number of samples for estimating the gradient variance.
        epoch (int): the number of iterations.
        decay (float): the decay factor, stored as ``self.decay``; ``forward`` does not pass it to ``get_momentum`` explicitly.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for the returned delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): the attack name forwarded to ScheduledAttack.

    Defaults:
        epsilon=16/255, alpha=1.6/255, beta=1.5, num_neighbor=20, epoch=10, decay=1.; the ensemble size is fixed to 10.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, beta=1.5, num_neighbor=20, epoch=10, decay=1., targeted=False, 
                random_start=False, norm='linfty', loss='crossentropy', device=None, attack='VMI-FGSM', **kwargs):
        super().__init__(attack, model, epsilon, targeted, random_start, norm, loss, device, **kwargs)
        self.alpha = alpha
        # Half-width of the uniform neighbourhood sampled in get_variance.
        self.radius = beta * epsilon
        self.epoch = epoch
        self.decay = decay
        self.num_neighbor = num_neighbor
        # Number of random restarts averaged per iteration.
        self.ens_num = 10

    def get_variance(self, data, delta, label, cur_grad, momentum, **kwargs):
        """
        Calculate the gradient variance.

        Averages the gradient w.r.t. ``delta`` over ``num_neighbor`` inputs perturbed
        by uniform noise in [-radius, radius] and subtracts ``cur_grad``.
        """
        grad = 0
        for _ in range(self.num_neighbor):
            # NOTE: unlike forward(), transform is called here without an alpha argument.
            logits = self.get_logits(self.transform(data+delta+torch.zeros_like(delta).uniform_(-self.radius, self.radius).to(self.device), momentum=momentum))

            # Calculate the loss
            loss = self.get_loss(logits, label)

            # Calculate the gradients
            grad += self.get_grad(loss, delta)

        return grad / self.num_neighbor - cur_grad

    def forward(self, data, label, **kwargs):
        """
        Run the ensemble variance-tuned attack.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation obtained after ``epoch`` updates.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # The ensemble sampling below toggles self.random_start; remember the
        # caller's setting so it is not silently overwritten for later calls.
        user_random_start = self.random_start
        try:
            # Initialize adversarial perturbation (honours the user's random_start)
            delta = self.init_delta(data)
            delta_dual = delta.clone().detach().to(self.device)
            # get_variance differentiates w.r.t. delta_dual, so it must track gradients.
            delta_dual.requires_grad = True
            # Per-iteration step values, indexed by the iteration counter.
            self.ai = self.normalize(self.op, self.epoch)
            momentum, variance = 0, 0
            momentum_dual = 0

            for step in range(self.epoch):
                ens_grad = 0.
                for _ in range(self.ens_num):
                    # Draw a fresh random perturbation for this ensemble member.
                    self.random_start = True
                    delta = self.init_delta(data)
                    logits = self.get_logits(self.transform(data+delta, momentum=momentum, alpha=self.ai[step]))
                    self.random_start = False
                    loss = self.get_loss(logits, label)
                    ens_grad += self.get_grad(loss, delta)
                ens_grad /= self.ens_num

                # Plain momentum (fed to transform/get_variance) and the
                # variance-tuned momentum that drives the returned perturbation.
                momentum = self.get_momentum(ens_grad, momentum)
                momentum_dual = self.get_momentum(ens_grad+variance, momentum_dual)
                # Variance used by the NEXT iteration's momentum_dual.
                variance = self.get_variance(data, delta_dual, label, ens_grad, momentum)
                # NOTE(review): this result is replaced by the next re-sampling and
                # never returned; kept in case update_delta has side effects.
                delta = self.update_delta(delta, data, momentum, self.ai[step])
                delta_dual = self.update_delta(delta_dual, data, momentum_dual, self.ai[step])

            return delta_dual.detach()
        finally:
            self.random_start = user_random_start

class ENS_IFGSM_SVMIFGSM(ScheduledAttack):
    """
    Scheduled variance-tuned momentum attack driven by an ensemble of perturbations
    that are updated iteratively and re-sampled at random every third iteration.

    Builds on VMI-FGSM:
    'Enhancing the transferability of adversarial attacks through variance tuning (CVPR 2021)'(https://arxiv.org/abs/2103.15571)

    At every iteration the loss gradient is taken at each of the ``ens_num`` members
    and averaged. The returned perturbation is updated with a momentum built from that
    average plus the gradient variance estimated at the previous iteration. Each member
    ``e`` is passed through ``update_delta`` with its own gradient and the step
    ``self.ai[step] * (2*e + 1) / ens_num``.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): stored on the instance; ``forward`` takes its per-iteration step from ``self.ai`` instead.
        beta (float): the relative value for the neighborhood (radius = beta * epsilon).
        num_neighbor (int): the number of samples for estimating the gradient variance.
        epoch (int): the number of iterations.
        decay (float): the decay factor, stored as ``self.decay``; ``forward`` does not pass it to ``get_momentum`` explicitly.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for the returned delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): the attack name forwarded to ScheduledAttack.

    Defaults:
        epsilon=16/255, alpha=1.6/255, beta=1.5, num_neighbor=20, epoch=10, decay=1.; the ensemble size is fixed to 10.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, beta=1.5, num_neighbor=20, epoch=10, decay=1., targeted=False, 
                random_start=False, norm='linfty', loss='crossentropy', device=None, attack='VMI-FGSM', **kwargs):
        super().__init__(attack, model, epsilon, targeted, random_start, norm, loss, device, **kwargs)
        self.alpha = alpha
        # Half-width of the uniform neighbourhood sampled in get_variance.
        self.radius = beta * epsilon
        self.epoch = epoch
        self.decay = decay
        self.num_neighbor = num_neighbor
        # Number of ensemble members tracked in parallel.
        self.ens_num = 10

    def get_variance(self, data, delta, label, cur_grad, momentum, **kwargs):
        """
        Calculate the gradient variance.

        Averages the gradient w.r.t. ``delta`` over ``num_neighbor`` inputs perturbed
        by uniform noise in [-radius, radius] and subtracts ``cur_grad``.
        """
        grad = 0
        for _ in range(self.num_neighbor):
            # NOTE: unlike forward(), transform is called here without an alpha argument.
            logits = self.get_logits(self.transform(data+delta+torch.zeros_like(delta).uniform_(-self.radius, self.radius).to(self.device), momentum=momentum))

            # Calculate the loss
            loss = self.get_loss(logits, label)

            # Calculate the gradients
            grad += self.get_grad(loss, delta)

        return grad / self.num_neighbor - cur_grad

    def forward(self, data, label, **kwargs):
        """
        Run the ensemble variance-tuned attack.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation obtained after ``epoch`` updates.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # The ensemble sampling below toggles self.random_start; remember the
        # caller's setting so it is not silently overwritten for later calls.
        user_random_start = self.random_start
        try:
            # Initialize adversarial perturbation (honours the user's random_start)
            delta = self.init_delta(data)
            delta_dual = delta.clone().detach().to(self.device)
            # get_variance differentiates w.r.t. delta_dual, so it must track gradients.
            delta_dual.requires_grad = True
            # Per-iteration step values, indexed by the iteration counter.
            self.ai = self.normalize(self.op, self.epoch)
            # `momentum` is never updated in this class, so transform() always receives 0.
            momentum, variance = 0, 0
            # Ensemble members always start from a random perturbation.
            self.random_start = True
            delta_list = [self.init_delta(data) for _ in range(self.ens_num)]
            grad_list = [0. for _ in range(self.ens_num)]
            momentum_list = [0. for _ in range(self.ens_num)]
            self.random_start = False
            momentum_dual = 0

            for step in range(self.epoch):
                ens_grad = 0.
                for e in range(self.ens_num):
                    # Every third iteration the member is re-drawn at random.
                    if step % 3 == 0:
                        self.random_start = True
                        delta_list[e] = self.init_delta(data)
                        self.random_start = False
                    logits = self.get_logits(self.transform(data+delta_list[e], momentum=momentum, alpha=self.ai[step]))
                    loss = self.get_loss(logits, label)
                    grad = self.get_grad(loss, delta_list[e])
                    grad_list[e] = grad
                    # NOTE(review): per-member momentum is tracked but not read anywhere in this method.
                    momentum_list[e] = self.get_momentum(grad, momentum_list[e])
                    ens_grad += grad
                ens_grad /= self.ens_num

                # Variance-tuned momentum that drives the returned perturbation.
                momentum_dual = self.get_momentum(ens_grad+variance, momentum_dual)
                # Variance used by the NEXT iteration's momentum_dual.
                variance = self.get_variance(data, delta_dual, label, ens_grad, momentum_dual)
                # Advance every member along its own gradient; the step grows with the member index.
                for e in range(self.ens_num):
                    delta_list[e] = self.update_delta(delta_list[e], data, grad_list[e], self.ai[step]*(2*e+1)/(self.ens_num))
                delta_dual = self.update_delta(delta_dual, data, momentum_dual, self.ai[step])

            return delta_dual.detach()
        finally:
            self.random_start = user_random_start

class ENS_MIX_SVMIFGSM(ScheduledAttack):
    """
    Scheduled variance-tuned momentum attack driven by an ensemble of perturbations
    where, at every iteration, each member is independently re-sampled at random with
    probability 0.5 and otherwise keeps its iteratively updated state.

    Builds on VMI-FGSM:
    'Enhancing the transferability of adversarial attacks through variance tuning (CVPR 2021)'(https://arxiv.org/abs/2103.15571)

    At every iteration the loss gradient is taken at each of the ``ens_num`` members
    and averaged. The returned perturbation is updated with a momentum built from that
    average plus the gradient variance estimated at the previous iteration. Each member
    ``e`` is passed through ``update_delta`` with its own gradient and the step
    ``self.ai[step] * (2*e + 1) / ens_num``.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): stored on the instance; ``forward`` takes its per-iteration step from ``self.ai`` instead.
        beta (float): the relative value for the neighborhood (radius = beta * epsilon).
        num_neighbor (int): the number of samples for estimating the gradient variance.
        epoch (int): the number of iterations.
        decay (float): the decay factor, stored as ``self.decay``; ``forward`` does not pass it to ``get_momentum`` explicitly.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for the returned delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): the attack name forwarded to ScheduledAttack.

    Defaults:
        epsilon=16/255, alpha=1.6/255, beta=1.5, num_neighbor=20, epoch=10, decay=1.; the ensemble size is fixed to 10.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, beta=1.5, num_neighbor=20, epoch=10, decay=1., targeted=False, 
                random_start=False, norm='linfty', loss='crossentropy', device=None, attack='VMI-FGSM', **kwargs):
        super().__init__(attack, model, epsilon, targeted, random_start, norm, loss, device, **kwargs)
        self.alpha = alpha
        # Half-width of the uniform neighbourhood sampled in get_variance.
        self.radius = beta * epsilon
        self.epoch = epoch
        self.decay = decay
        self.num_neighbor = num_neighbor
        # Number of ensemble members tracked in parallel.
        self.ens_num = 10

    def get_variance(self, data, delta, label, cur_grad, momentum, **kwargs):
        """
        Calculate the gradient variance.

        Averages the gradient w.r.t. ``delta`` over ``num_neighbor`` inputs perturbed
        by uniform noise in [-radius, radius] and subtracts ``cur_grad``.
        """
        grad = 0
        for _ in range(self.num_neighbor):
            # NOTE: unlike forward(), transform is called here without an alpha argument.
            logits = self.get_logits(self.transform(data+delta+torch.zeros_like(delta).uniform_(-self.radius, self.radius).to(self.device), momentum=momentum))

            # Calculate the loss
            loss = self.get_loss(logits, label)

            # Calculate the gradients
            grad += self.get_grad(loss, delta)

        return grad / self.num_neighbor - cur_grad

    def forward(self, data, label, **kwargs):
        """
        Run the mixed-ensemble variance-tuned attack.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation obtained after ``epoch`` updates.
        """
        # Imported here so the method does not depend on `np` leaking in through
        # the package's star import.
        import numpy as np

        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)
        # The ensemble sampling below toggles self.random_start; remember the
        # caller's setting so it is not silently overwritten for later calls.
        user_random_start = self.random_start
        try:
            # Initialize adversarial perturbation (honours the user's random_start)
            delta = self.init_delta(data)
            delta_dual = delta.clone().detach().to(self.device)
            # get_variance differentiates w.r.t. delta_dual, so it must track gradients.
            delta_dual.requires_grad = True
            # Per-iteration step values, indexed by the iteration counter.
            self.ai = self.normalize(self.op, self.epoch)
            # `momentum` is never updated in this class, so transform() always receives 0.
            momentum, variance = 0, 0
            # Ensemble members always start from a random perturbation.
            self.random_start = True
            delta_list = [self.init_delta(data) for _ in range(self.ens_num)]
            grad_list = [0. for _ in range(self.ens_num)]
            momentum_list = [0. for _ in range(self.ens_num)]
            self.random_start = False
            momentum_dual = 0

            for step in range(self.epoch):
                ens_grad = 0.
                for e in range(self.ens_num):
                    # Coin flip per member: re-draw it at random or keep its current state.
                    if np.random.rand()<0.5:
                        self.random_start = True
                        delta_list[e] = self.init_delta(data)
                        self.random_start = False
                    logits = self.get_logits(self.transform(data+delta_list[e], momentum=momentum, alpha=self.ai[step]))
                    loss = self.get_loss(logits, label)
                    grad = self.get_grad(loss, delta_list[e])
                    grad_list[e] = grad
                    # NOTE(review): per-member momentum is tracked but not read anywhere in this method.
                    momentum_list[e] = self.get_momentum(grad, momentum_list[e])
                    ens_grad += grad
                ens_grad /= self.ens_num

                # Variance-tuned momentum that drives the returned perturbation.
                momentum_dual = self.get_momentum(ens_grad+variance, momentum_dual)
                # Variance used by the NEXT iteration's momentum_dual.
                variance = self.get_variance(data, delta_dual, label, ens_grad, momentum_dual)
                # Advance every member along its own gradient; the step grows with the member index.
                for e in range(self.ens_num):
                    delta_list[e] = self.update_delta(delta_list[e], data, grad_list[e], self.ai[step]*(2*e+1)/(self.ens_num))
                delta_dual = self.update_delta(delta_dual, data, momentum_dual, self.ai[step])

            return delta_dual.detach()
        finally:
            self.random_start = user_random_start
    
class ENS_FGSM_SGIMIFGSM(ScheduledAttack):
    """
    Scheduled momentum attack whose per-step gradient is averaged over an
    ensemble of freshly re-initialised perturbations.

    NOTE(review): the docstring previously attached to this class was copied
    verbatim from MI-FGSM ('Boosting Adversarial Attacks with Momentum
    (CVPR 2018)', https://arxiv.org/abs/1710.06081). The class name suggests a
    scheduled GI-MI-FGSM variant with an FGSM-style ensemble -- confirm the
    intended reference.

    What ``forward`` does:
        1. Warm-up: ``pre_epoch`` steps that move a perturbation along the raw
           gradient with step size ``ai_pre[i] * s`` while accumulating
           momentum. Only the momentum survives the warm-up.
        2. Main loop: ``epoch`` steps. In each step ``ens_num`` perturbations
           are obtained from ``init_delta`` with ``random_start`` forced on,
           their gradients are averaged, the average is folded into the
           momentum, and the returned perturbation takes a step of size
           ``ai[i]`` along that momentum.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): the step size. Stored on the instance; ``forward`` takes
            its step sizes from the schedules ``self.ai`` / ``self.ai_pre``.
        epoch (int): the number of iterations of the main loop.
        decay (float): the decay factor for momentum calculation in the main loop.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): attack name forwarded to the base class.
        adapted, rho: accepted for interface compatibility; not used by this class.

    Defaults:
        epsilon=16/255, alpha=1.6/255, epoch=10, decay=1.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False,
                norm='linfty', loss='crossentropy', device=None, attack='MI-FGSM', adapted=False, rho=1.0, **kwargs):
        super().__init__(attack, model=model, epsilon=epsilon, targeted=targeted, random_start=random_start,
                         norm=norm, loss=loss, device=device, **kwargs)
        self.alpha = alpha
        self.epoch = epoch
        self.decay = decay
        self.ens_num = 10    # perturbations averaged per main-loop step
        self.pre_epoch = 5   # number of warm-up steps
        self.s = 10          # multiplier applied to the warm-up step size

    def forward(self, data, label, **kwargs):
        """
        Run the attack and return the adversarial perturbation.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation tensor accumulated along the momentum
            direction (``delta_dual``).
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)

        # NOTE(review): this first result is replaced below before it is read.
        # The call is kept because init_delta's side effects (e.g. random
        # draws) are not visible from here -- TODO confirm and drop.
        delta = self.init_delta(data)

        self.ai = self.normalize(self.op, self.epoch)
        self.ai_pre = self.normalize(self.identity, self.pre_epoch)
        delta = self.init_delta(data).to(self.device)
        delta_dual = delta.clone().detach().to(self.device)

        # Warm-up: only `momentum` is carried into the main loop.
        momentum = 0.
        for warm_step in range(self.pre_epoch):
            logits = self.get_logits(self.transform(data+delta, momentum=momentum))
            loss = self.get_loss(logits, label)
            grad = self.get_grad(loss, delta)
            momentum = self.get_momentum(grad, momentum)
            delta = self.update_delta(delta, data, grad, self.ai_pre[warm_step]*self.s)

        # NOTE(review): overwritten by the first ensemble draw below; kept for
        # the same reason as the first init_delta call above.
        delta = self.init_delta(data).to(self.device)
        grad = 0.
        self.ai = self.normalize(self.op, self.epoch)

        # The ensemble draws toggle self.random_start. Remember the caller's
        # setting and put it back afterwards (also on error) so that one
        # forward() call does not change how the next one initialises delta.
        user_random_start = self.random_start
        try:
            for step in range(self.epoch):
                ens_grad = 0.
                for _member in range(self.ens_num):
                    self.random_start = True
                    delta = self.init_delta(data)
                    # `grad` here is the gradient of the previous ensemble
                    # member (0. before the very first one).
                    logits = self.get_logits(self.transform(data+delta, momentum=grad, alpha=self.ai[step]))
                    self.random_start = False
                    loss = self.get_loss(logits, label)
                    grad = self.get_grad(loss, delta)
                    ens_grad += grad
                ens_grad /= self.ens_num

                momentum = self.get_momentum(ens_grad, momentum, decay=self.decay)
                # NOTE(review): `delta` is re-drawn before its next use, so this
                # update has no visible effect on the result; kept to preserve
                # the original call sequence.
                delta = self.update_delta(delta, data, grad, self.ai[step])
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
        finally:
            self.random_start = user_random_start
        return delta_dual.detach()
    
    
class ENS_IFGSM_SGIMIFGSM(ScheduledAttack):
    """
    Scheduled momentum attack whose per-step gradient is averaged over an
    ensemble of perturbations that are each iterated at their own step size.

    NOTE(review): the docstring previously attached to this class was copied
    verbatim from MI-FGSM ('Boosting Adversarial Attacks with Momentum
    (CVPR 2018)', https://arxiv.org/abs/1710.06081). The class name suggests a
    scheduled GI-MI-FGSM variant with an I-FGSM-style ensemble -- confirm the
    intended reference.

    What ``forward`` does:
        1. Warm-up: ``pre_epoch`` steps that move a perturbation along the raw
           gradient with step size ``ai_pre[i] * s`` while accumulating
           momentum. Only the momentum survives the warm-up.
        2. ``ens_num`` ensemble perturbations are created once via
           ``init_delta`` with ``random_start`` forced on.
        3. Main loop: ``epoch`` steps. The gradients at all ensemble members
           are averaged and folded into the momentum. Member ``e`` then steps
           along its own gradient with step size
           ``ai[i] * (2*e + 1) / ens_num``, and the returned perturbation steps
           along the momentum with step size ``ai[i]``.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): the step size. Stored on the instance; ``forward`` takes
            its step sizes from the schedules ``self.ai`` / ``self.ai_pre``.
        epoch (int): the number of iterations of the main loop.
        decay (float): the decay factor for momentum calculation in the main loop.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): attack name forwarded to the base class.
        adapted, rho: accepted for interface compatibility; not used by this class.

    Defaults:
        epsilon=16/255, alpha=1.6/255, epoch=10, decay=1.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False,
                norm='linfty', loss='crossentropy', device=None, attack='MI-FGSM', adapted=False, rho=1.0, **kwargs):
        super().__init__(attack, model=model, epsilon=epsilon, targeted=targeted, random_start=random_start,
                         norm=norm, loss=loss, device=device, **kwargs)
        self.alpha = alpha
        self.epoch = epoch
        self.decay = decay
        self.ens_num = 10    # number of ensemble perturbations
        self.pre_epoch = 5   # number of warm-up steps
        self.s = 10          # multiplier applied to the warm-up step size

    def forward(self, data, label, **kwargs):
        """
        Run the attack and return the adversarial perturbation.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation tensor accumulated along the momentum
            direction (``delta_dual``).
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)

        # NOTE(review): this first result is replaced below before it is read.
        # The call is kept because init_delta's side effects (e.g. random
        # draws) are not visible from here -- TODO confirm and drop.
        delta = self.init_delta(data)

        self.ai = self.normalize(self.op, self.epoch)
        self.ai_pre = self.normalize(self.identity, self.pre_epoch)
        delta = self.init_delta(data).to(self.device)
        delta_dual = delta.clone().detach().to(self.device)

        # Warm-up: only `momentum` is carried into the main loop.
        momentum = 0.
        for warm_step in range(self.pre_epoch):
            logits = self.get_logits(self.transform(data+delta, momentum=momentum))
            loss = self.get_loss(logits, label)
            grad = self.get_grad(loss, delta)
            momentum = self.get_momentum(grad, momentum)
            delta = self.update_delta(delta, data, grad, self.ai_pre[warm_step]*self.s)

        # NOTE(review): never read afterwards; kept for the same reason as the
        # first init_delta call above.
        delta = self.init_delta(data).to(self.device)

        # Building the ensemble toggles self.random_start. Remember the
        # caller's setting and put it back afterwards (also on error) so that
        # one forward() call does not change how the next one initialises delta.
        user_random_start = self.random_start
        try:
            self.random_start = True
            delta_list = [self.init_delta(data) for _ in range(self.ens_num)]
            self.random_start = False

            self.ai = self.normalize(self.op, self.epoch)
            grad_list = [0.] * self.ens_num
            for step in range(self.epoch):
                # Average the gradients taken at every ensemble member.
                ens_grad = 0.
                for member in range(self.ens_num):
                    logits = self.get_logits(self.transform(data+delta_list[member]))
                    loss = self.get_loss(logits, label)
                    grad_list[member] = self.get_grad(loss, delta_list[member])
                    ens_grad += grad_list[member]
                ens_grad /= self.ens_num

                momentum = self.get_momentum(ens_grad, momentum, decay=self.decay)

                # Each member advances along its own gradient, with a step
                # size that grows linearly with the member index.
                for member in range(self.ens_num):
                    member_step = self.ai[step]*(2*member+1)/(self.ens_num)
                    delta_list[member] = self.update_delta(delta_list[member], data, grad_list[member], member_step)
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
        finally:
            self.random_start = user_random_start
        return delta_dual.detach()
    
    
    
class ENS_MIX_SGIMIFGSM(ScheduledAttack):
    """
    Scheduled momentum attack whose per-step gradient is averaged over a mixed
    ensemble: even-indexed members are re-initialised at every step, while
    odd-indexed members keep iterating from their previous position.

    NOTE(review): the docstring previously attached to this class was copied
    verbatim from MI-FGSM ('Boosting Adversarial Attacks with Momentum
    (CVPR 2018)', https://arxiv.org/abs/1710.06081). Confirm the intended
    reference for this variant.

    What ``forward`` does:
        1. Warm-up: ``pre_epoch`` steps that move a perturbation along the raw
           gradient with step size ``ai_pre[i] * s`` while accumulating
           momentum. Only the momentum survives the warm-up.
        2. ``ens_num`` ensemble perturbations are created via ``init_delta``
           with ``random_start`` forced on.
        3. Main loop: ``epoch`` steps. Even-indexed members are re-drawn with
           ``random_start`` forced on before their gradient is taken;
           odd-indexed members are used as they are. The gradients are
           averaged and folded into the momentum. Every member ``e`` then steps
           along its own gradient with step size
           ``ai[i] * (2*e + 1) / ens_num``, and the returned perturbation steps
           along the momentum with step size ``ai[i]``.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): the step size. Stored on the instance; ``forward`` takes
            its step sizes from the schedules ``self.ai`` / ``self.ai_pre``.
        epoch (int): the number of iterations of the main loop.
        decay (float): the decay factor for momentum calculation in the main loop.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device would be same as model
        attack (str): attack name forwarded to the base class.
        adapted, rho: accepted for interface compatibility; not used by this class.

    Defaults:
        epsilon=16/255, alpha=1.6/255, epoch=10, decay=1.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False,
                norm='linfty', loss='crossentropy', device=None, attack='MI-FGSM', adapted=False, rho=1.0, **kwargs):
        super().__init__(attack, model=model, epsilon=epsilon, targeted=targeted, random_start=random_start,
                         norm=norm, loss=loss, device=device, **kwargs)
        self.alpha = alpha
        self.epoch = epoch
        self.decay = decay
        self.ens_num = 10    # number of ensemble perturbations
        self.pre_epoch = 5   # number of warm-up steps
        self.s = 10          # multiplier applied to the warm-up step size

    def forward(self, data, label, **kwargs):
        """
        Run the attack and return the adversarial perturbation.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation tensor accumulated along the momentum
            direction (``delta_dual``).
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)

        # NOTE(review): this first result is replaced below before it is read.
        # The call is kept because init_delta's side effects (e.g. random
        # draws) are not visible from here -- TODO confirm and drop.
        delta = self.init_delta(data)

        self.ai = self.normalize(self.op, self.epoch)
        self.ai_pre = self.normalize(self.identity, self.pre_epoch)
        delta = self.init_delta(data).to(self.device)
        delta_dual = delta.clone().detach().to(self.device)

        # Warm-up: only `momentum` is carried into the main loop.
        momentum = 0.
        for warm_step in range(self.pre_epoch):
            logits = self.get_logits(self.transform(data+delta, momentum=momentum))
            loss = self.get_loss(logits, label)
            grad = self.get_grad(loss, delta)
            momentum = self.get_momentum(grad, momentum)
            delta = self.update_delta(delta, data, grad, self.ai_pre[warm_step]*self.s)

        # NOTE(review): never read afterwards; kept for the same reason as the
        # first init_delta call above.
        delta = self.init_delta(data).to(self.device)

        # The ensemble handling toggles self.random_start. Remember the
        # caller's setting and put it back afterwards (also on error) so that
        # one forward() call does not change how the next one initialises delta.
        user_random_start = self.random_start
        try:
            self.random_start = True
            delta_list = [self.init_delta(data) for _ in range(self.ens_num)]
            self.random_start = False

            self.ai = self.normalize(self.op, self.epoch)
            grad_list = [0.] * self.ens_num
            for step in range(self.epoch):
                ens_grad = 0.
                for member in range(self.ens_num):
                    # Even members restart from a fresh init_delta draw every
                    # step; odd members continue from their updated position.
                    redraw = member % 2 == 0
                    if redraw:
                        self.random_start = True
                        delta_list[member] = self.init_delta(data)
                    logits = self.get_logits(self.transform(data+delta_list[member]))
                    loss = self.get_loss(logits, label)
                    grad_list[member] = self.get_grad(loss, delta_list[member])
                    ens_grad += grad_list[member]
                    if redraw:
                        self.random_start = False
                ens_grad /= self.ens_num

                momentum = self.get_momentum(ens_grad, momentum, decay=self.decay)

                # Every member is stepped, including the even ones that will be
                # re-drawn at the start of the next step.
                for member in range(self.ens_num):
                    member_step = self.ai[step]*(2*member+1)/(self.ens_num)
                    delta_list[member] = self.update_delta(delta_list[member], data, grad_list[member], member_step)
                delta_dual = self.update_delta(delta_dual, data, momentum, self.ai[step])
        finally:
            self.random_start = user_random_start
        return delta_dual.detach()
    
    
class ENS_FGSM_SPIFGSM(SNIFGSM):
    """
    Scheduled patch-wise attack (PI-FGSM style projection) whose per-step
    gradient is averaged over an ensemble of freshly re-initialised
    perturbations.

    PI-FGSM reference:
    'Patch-wise Attack for Fooling Deep Neural Network (ECCV 2020)'(https://arxiv.org/abs/2007.06765)

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): the step size.
        epoch (int): the number of iterations.
        decay (float): the decay factor for momentum calculation.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device
            resolved by the parent constructor is used.
        attack (str): attack name forwarded to the parent constructor.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False,
                norm='linfty', loss='crossentropy', device=None, attack='PI-FGSM', **kwargs):
        super().__init__(model, epsilon, alpha, epoch, decay, targeted, random_start, norm, loss, device, attack, **kwargs)
        self.kern_size = 3
        self.alpha_beta = self.alpha * epoch   # amplified step added to `amplification` each iteration
        self.gamma = self.alpha_beta
        P_kern = self.project_kern(self.kern_size)
        # Honour the caller's device instead of forcing "cuda:0". When no
        # device is given, keep whatever the parent constructor resolved
        # (presumably the model's device -- TODO confirm); "cuda:0" remains
        # only as the legacy last-resort fallback.
        if device is not None:
            self.device = device
        elif getattr(self, 'device', None) is None:
            self.device = "cuda:0"
        self.w = torch.from_numpy(P_kern).to(self.device)  # project kernel
        self.ens_num = 10   # perturbations averaged per step

    def project_kern(self, kern_size):
        """
        Build the projection kernel as a float32 numpy array of shape
        (3, 1, kern_size, kern_size): uniform weights 1/(kern_size**2 - 1)
        with a zero at the centre, repeated for three channels.
        """
        kern = np.ones((kern_size, kern_size), dtype=np.float32) / (kern_size ** 2 - 1)
        kern[kern_size // 2, kern_size // 2] = 0.0
        kern = kern.astype(np.float32)
        stack_kern = np.stack([kern, kern, kern])
        return np.expand_dims(stack_kern, 1)

    def project_noise(self, x, stack_kern, padding_size):
        """Convolve ``x`` channel-wise (groups=3) with ``stack_kern`` using symmetric zero padding."""
        return F.conv2d(x, stack_kern, padding=(padding_size, padding_size), groups=3)

    def update_delta(self, delta, data, grad, **kwargs):
        """
        Add an already-scaled update ``grad`` to ``delta`` and project back.

        Unlike a step-size based update, ``grad`` is added as is for 'linfty'
        (callers pre-multiply by the step size); for other norms it is
        normalised per sample and the result is renormed to ``epsilon``.
        The perturbation is finally clamped so ``data + delta`` stays in
        [img_min, img_max].
        """
        if self.norm == 'linfty':
            delta = torch.clamp(delta + grad, -self.epsilon, self.epsilon)
        else:
            grad_norm = torch.norm(grad.view(grad.size(0), -1), dim=1).view(-1, 1, 1, 1)
            scaled_grad = grad / (grad_norm + 1e-20)
            delta = (delta + scaled_grad).view(delta.size(0), -1).renorm(p=2, dim=0, maxnorm=self.epsilon).view_as(delta)
        return clamp(delta, img_min-data, img_max-data)

    def forward(self, data, label, **kwargs):
        """
        Run the attack and return the adversarial perturbation.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation tensor ``delta_dual``.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)

        # Initialize adversarial perturbation
        delta = self.init_delta(data)
        delta.requires_grad = True
        self.ai = self.normalize(self.op, self.epoch)
        # Per-step sizes: the schedule weights scaled by the number of steps.
        self.ai_beta = [self.ai[i] * self.epoch for i in range(self.epoch)]
        self.ai_gamma = [self.ai[i] * self.epoch for i in range(self.epoch)]

        amplification = 0.0

        # The ensemble draws toggle self.random_start. Remember the caller's
        # setting and put it back afterwards (also on error) so that one
        # forward() call does not change how the next one initialises delta.
        user_random_start = self.random_start
        try:
            self.random_start = True
            # NOTE(review): every entry is re-drawn inside the loop before it
            # is read; the initial draws are kept to preserve the original
            # init_delta call sequence.
            delta_list = [self.init_delta(data) for _ in range(self.ens_num)]
            self.random_start = False
            self.ai = self.normalize(self.op, self.epoch)

            delta_dual = delta.clone().detach().to(self.device)
            delta_dual.requires_grad = True
            momentum = 0.
            grad = 0.

            for step in range(self.epoch):
                # Average the gradients taken at freshly drawn perturbations.
                ens_grad = 0.
                for idx in range(self.ens_num):
                    self.random_start = True
                    delta_list[idx] = self.init_delta(data)
                    # `grad` is the gradient of the previous ensemble member
                    # (0. before the very first one).
                    logits = self.get_logits(self.transform(data+delta_list[idx], momentum=grad, alpha=self.ai_beta[step]))
                    self.random_start = False
                    loss = self.get_loss(logits, label)
                    grad = self.get_grad(loss, delta_list[idx])
                    ens_grad += grad
                ens_grad /= self.ens_num
                momentum = self.get_momentum(ens_grad, momentum)

                # Calculate the cut noise
                amplification += self.alpha_beta * momentum.sign()
                # NOTE(review): plain assignment, so `amplification_dual` is the
                # SAME tensor as `amplification`. The two in-place additions
                # below therefore both land on that one tensor, i.e. the
                # projection is accumulated twice per step. Behaviour is kept
                # as is; confirm whether a copy was intended.
                amplification_dual = amplification
                cut_noise = torch.clamp(abs(amplification) - self.epsilon, 0., 10000.0) * torch.sign(amplification)
                cut_noise_dual = torch.clamp(abs(amplification_dual) - self.epsilon, 0., 10000.0) * torch.sign(amplification_dual)
                projection = self.ai_gamma[step] * torch.sign(self.project_noise(cut_noise, self.w, self.kern_size//2))
                projection_dual = self.ai_gamma[step] * torch.sign(self.project_noise(cut_noise_dual, self.w, self.kern_size//2))
                amplification += projection
                amplification_dual += projection_dual

                # NOTE(review): `delta` is not read again inside this loop, so
                # this update does not influence the returned perturbation.
                delta = self.update_delta(delta, data, self.ai_beta[step] * grad.sign())
                delta_dual = self.update_delta(delta_dual, data, self.ai_beta[step] * momentum.sign() + projection_dual)
        finally:
            self.random_start = user_random_start

        return delta_dual.detach()
    
    
    
    
class ENS_IFGSM_SPIFGSM(SNIFGSM):
    """
    Scheduled patch-wise attack (PI-FGSM style projection).

    PI-FGSM reference:
    'Patch-wise Attack for Fooling Deep Neural Network (ECCV 2020)'(https://arxiv.org/abs/2007.06765)

    NOTE(review): despite the ``ENS_`` prefix, ``forward`` in this class follows
    a single perturbation trajectory; no ensemble members are created or
    averaged here. Confirm whether the ensemble logic is still to be added.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): the step size.
        epoch (int): the number of iterations.
        decay (float): the decay factor for momentum calculation.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device
            resolved by the parent constructor is used.
        attack (str): attack name forwarded to the parent constructor.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False,
                norm='linfty', loss='crossentropy', device=None, attack='PI-FGSM', **kwargs):
        super().__init__(model, epsilon, alpha, epoch, decay, targeted, random_start, norm, loss, device, attack, **kwargs)
        self.kern_size = 3
        self.alpha_beta = self.alpha * epoch   # amplified step added to `amplification` each iteration
        self.gamma = self.alpha_beta
        P_kern = self.project_kern(self.kern_size)
        # Honour the caller's device instead of forcing "cuda:0". When no
        # device is given, keep whatever the parent constructor resolved
        # (presumably the model's device -- TODO confirm); "cuda:0" remains
        # only as the legacy last-resort fallback.
        if device is not None:
            self.device = device
        elif getattr(self, 'device', None) is None:
            self.device = "cuda:0"
        self.w = torch.from_numpy(P_kern).to(self.device)  # project kernel

    def project_kern(self, kern_size):
        """
        Build the projection kernel as a float32 numpy array of shape
        (3, 1, kern_size, kern_size): uniform weights 1/(kern_size**2 - 1)
        with a zero at the centre, repeated for three channels.
        """
        kern = np.ones((kern_size, kern_size), dtype=np.float32) / (kern_size ** 2 - 1)
        kern[kern_size // 2, kern_size // 2] = 0.0
        kern = kern.astype(np.float32)
        stack_kern = np.stack([kern, kern, kern])
        return np.expand_dims(stack_kern, 1)

    def project_noise(self, x, stack_kern, padding_size):
        """Convolve ``x`` channel-wise (groups=3) with ``stack_kern`` using symmetric zero padding."""
        return F.conv2d(x, stack_kern, padding=(padding_size, padding_size), groups=3)

    def update_delta(self, delta, data, grad, **kwargs):
        """
        Add an already-scaled update ``grad`` to ``delta`` and project back.

        For 'linfty' the update is added as is (callers pre-multiply by the
        step size) and clipped to [-epsilon, epsilon]; for other norms it is
        normalised per sample and the result is renormed to ``epsilon``.
        The perturbation is finally clamped so ``data + delta`` stays in
        [img_min, img_max].
        """
        if self.norm == 'linfty':
            delta = torch.clamp(delta + grad, -self.epsilon, self.epsilon)
        else:
            grad_norm = torch.norm(grad.view(grad.size(0), -1), dim=1).view(-1, 1, 1, 1)
            scaled_grad = grad / (grad_norm + 1e-20)
            delta = (delta + scaled_grad).view(delta.size(0), -1).renorm(p=2, dim=0, maxnorm=self.epsilon).view_as(delta)
        return clamp(delta, img_min-data, img_max-data)

    def forward(self, data, label, **kwargs):
        """
        Run the attack and return the adversarial perturbation.

        Two perturbations are tracked: ``delta`` follows plain sign-gradient
        steps and is the point at which gradients are evaluated, while
        ``delta_dual`` follows the momentum sign plus the patch-wise
        projection and is what gets returned.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation tensor ``delta_dual``.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)

        # Initialize adversarial perturbation
        delta = self.init_delta(data)
        delta.requires_grad = True
        self.ai = self.normalize(self.op, self.epoch)
        # Per-step sizes: the schedule weights scaled by the number of steps.
        self.ai_beta = [self.ai[i] * self.epoch for i in range(self.epoch)]
        self.ai_gamma = [self.ai[i] * self.epoch for i in range(self.epoch)]

        amplification = 0.0

        delta_dual = delta.clone().detach().to(self.device)
        delta_dual.requires_grad = True
        momentum = 0.
        grad = 0.
        for step in range(self.epoch):
            # Gradient at the current `delta`; the previous gradient is handed
            # to transform() through its `momentum` argument.
            logits = self.get_logits(self.transform(data+delta, momentum=grad, alpha=self.ai_beta[step]))
            loss = self.get_loss(logits, label)
            grad = self.get_grad(loss, delta)
            momentum = self.get_momentum(grad, momentum)

            # Calculate the cut noise
            amplification += self.alpha_beta * momentum.sign()
            # NOTE(review): plain assignment, so `amplification_dual` is the
            # SAME tensor as `amplification`. The two in-place additions below
            # therefore both land on that one tensor, i.e. the projection is
            # accumulated twice per step. Behaviour is kept as is; confirm
            # whether a copy was intended.
            amplification_dual = amplification
            cut_noise = torch.clamp(abs(amplification) - self.epsilon, 0., 10000.0) * torch.sign(amplification)
            cut_noise_dual = torch.clamp(abs(amplification_dual) - self.epsilon, 0., 10000.0) * torch.sign(amplification_dual)
            projection = self.ai_gamma[step] * torch.sign(self.project_noise(cut_noise, self.w, self.kern_size//2))
            projection_dual = self.ai_gamma[step] * torch.sign(self.project_noise(cut_noise_dual, self.w, self.kern_size//2))
            amplification += projection
            amplification_dual += projection_dual

            delta = self.update_delta(delta, data, self.ai_beta[step] * grad.sign())
            delta_dual = self.update_delta(delta_dual, data, self.ai_beta[step] * momentum.sign() + projection_dual)

        return delta_dual.detach()
    
    
    
    
class ENS_MIX_SPIFGSM(SNIFGSM):
    """
    Scheduled patch-wise attack (PI-FGSM style projection).

    PI-FGSM reference:
    'Patch-wise Attack for Fooling Deep Neural Network (ECCV 2020)'(https://arxiv.org/abs/2007.06765)

    NOTE(review): the ``ENS_MIX`` name suggests a mixed ensemble, but
    ``forward`` in this class follows a single perturbation trajectory; no
    ensemble members are created or averaged here. Confirm whether the
    ensemble logic is still to be added.

    Arguments:
        model (torch.nn.Module): the surrogate model for attack.
        epsilon (float): the perturbation budget.
        alpha (float): the step size.
        epoch (int): the number of iterations.
        decay (float): the decay factor for momentum calculation.
        targeted (bool): targeted/untargeted attack.
        random_start (bool): whether using random initialization for delta.
        norm (str): the norm of perturbation, l2/linfty.
        loss (str): the loss function.
        device (torch.device): the device for data. If it is None, the device
            resolved by the parent constructor is used.
        attack (str): attack name forwarded to the parent constructor.
    """

    def __init__(self, model, epsilon=16/255, alpha=1.6/255, epoch=10, decay=1., targeted=False, random_start=False,
                norm='linfty', loss='crossentropy', device=None, attack='PI-FGSM', **kwargs):
        super().__init__(model, epsilon, alpha, epoch, decay, targeted, random_start, norm, loss, device, attack, **kwargs)
        self.kern_size = 3
        self.alpha_beta = self.alpha * epoch   # amplified step added to `amplification` each iteration
        self.gamma = self.alpha_beta
        P_kern = self.project_kern(self.kern_size)
        # Honour the caller's device instead of forcing "cuda:0". When no
        # device is given, keep whatever the parent constructor resolved
        # (presumably the model's device -- TODO confirm); "cuda:0" remains
        # only as the legacy last-resort fallback.
        if device is not None:
            self.device = device
        elif getattr(self, 'device', None) is None:
            self.device = "cuda:0"
        self.w = torch.from_numpy(P_kern).to(self.device)  # project kernel

    def project_kern(self, kern_size):
        """
        Build the projection kernel as a float32 numpy array of shape
        (3, 1, kern_size, kern_size): uniform weights 1/(kern_size**2 - 1)
        with a zero at the centre, repeated for three channels.
        """
        kern = np.ones((kern_size, kern_size), dtype=np.float32) / (kern_size ** 2 - 1)
        kern[kern_size // 2, kern_size // 2] = 0.0
        kern = kern.astype(np.float32)
        stack_kern = np.stack([kern, kern, kern])
        return np.expand_dims(stack_kern, 1)

    def project_noise(self, x, stack_kern, padding_size):
        """Convolve ``x`` channel-wise (groups=3) with ``stack_kern`` using symmetric zero padding."""
        return F.conv2d(x, stack_kern, padding=(padding_size, padding_size), groups=3)

    def update_delta(self, delta, data, grad, **kwargs):
        """
        Add an already-scaled update ``grad`` to ``delta`` and project back.

        For 'linfty' the update is added as is (callers pre-multiply by the
        step size) and clipped to [-epsilon, epsilon]; for other norms it is
        normalised per sample and the result is renormed to ``epsilon``.
        The perturbation is finally clamped so ``data + delta`` stays in
        [img_min, img_max].
        """
        if self.norm == 'linfty':
            delta = torch.clamp(delta + grad, -self.epsilon, self.epsilon)
        else:
            grad_norm = torch.norm(grad.view(grad.size(0), -1), dim=1).view(-1, 1, 1, 1)
            scaled_grad = grad / (grad_norm + 1e-20)
            delta = (delta + scaled_grad).view(delta.size(0), -1).renorm(p=2, dim=0, maxnorm=self.epsilon).view_as(delta)
        return clamp(delta, img_min-data, img_max-data)

    def forward(self, data, label, **kwargs):
        """
        Run the attack and return the adversarial perturbation.

        Two perturbations are tracked: ``delta`` follows plain sign-gradient
        steps and is the point at which gradients are evaluated, while
        ``delta_dual`` follows the momentum sign plus the patch-wise
        projection and is what gets returned.

        Arguments:
            data: (N, C, H, W) tensor for input images
            labels: (N,) tensor for ground-truth labels if untargetd, otherwise targeted labels

        Returns:
            The detached perturbation tensor ``delta_dual``.
        """
        data = data.clone().detach().to(self.device)
        label = label.clone().detach().to(self.device)

        # Initialize adversarial perturbation
        delta = self.init_delta(data)
        delta.requires_grad = True
        self.ai = self.normalize(self.op, self.epoch)
        # Per-step sizes: the schedule weights scaled by the number of steps.
        self.ai_beta = [self.ai[i] * self.epoch for i in range(self.epoch)]
        self.ai_gamma = [self.ai[i] * self.epoch for i in range(self.epoch)]

        amplification = 0.0

        delta_dual = delta.clone().detach().to(self.device)
        delta_dual.requires_grad = True
        momentum = 0.
        grad = 0.
        for step in range(self.epoch):
            # Gradient at the current `delta`; the previous gradient is handed
            # to transform() through its `momentum` argument.
            logits = self.get_logits(self.transform(data+delta, momentum=grad, alpha=self.ai_beta[step]))
            loss = self.get_loss(logits, label)
            grad = self.get_grad(loss, delta)
            momentum = self.get_momentum(grad, momentum)

            # Calculate the cut noise
            amplification += self.alpha_beta * momentum.sign()
            # NOTE(review): plain assignment, so `amplification_dual` is the
            # SAME tensor as `amplification`. The two in-place additions below
            # therefore both land on that one tensor, i.e. the projection is
            # accumulated twice per step. Behaviour is kept as is; confirm
            # whether a copy was intended.
            amplification_dual = amplification
            cut_noise = torch.clamp(abs(amplification) - self.epsilon, 0., 10000.0) * torch.sign(amplification)
            cut_noise_dual = torch.clamp(abs(amplification_dual) - self.epsilon, 0., 10000.0) * torch.sign(amplification_dual)
            projection = self.ai_gamma[step] * torch.sign(self.project_noise(cut_noise, self.w, self.kern_size//2))
            projection_dual = self.ai_gamma[step] * torch.sign(self.project_noise(cut_noise_dual, self.w, self.kern_size//2))
            amplification += projection
            amplification_dual += projection_dual

            delta = self.update_delta(delta, data, self.ai_beta[step] * grad.sign())
            delta_dual = self.update_delta(delta_dual, data, self.ai_beta[step] * momentum.sign() + projection_dual)

        return delta_dual.detach()