import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import densenet121, efficientnet_v2_s, efficientnet_b1, resnet18, ResNet18_Weights, resnet50, ResNet50_Weights
import numpy as np
import math
from collections import OrderedDict
from training.vit import ViTForClassfication
from vit_pytorch import ViT


class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        # Keep the leading (batch) dimension, merge the rest.
        batch = x.size(0)
        return x.view(batch, -1)


class Normalize(nn.Module):
    """Standardise the input as ``(x - mu) / std``.

    ``mu`` and ``std`` are stored as plain attributes (they are not
    registered as buffers or parameters by this class).
    """

    def __init__(self, mu, std):
        super().__init__()
        self.mu = mu
        self.std = std

    def forward(self, x):
        centered = x - self.mu
        return centered / self.std


class CustomReLU(nn.Module):
    """ReLU that optionally records the mean absolute pre-activation.

    While ``collect_preact`` is true, every forward call appends
    ``preact.abs().mean()`` (as a Python float) to ``avg_preacts``.
    """

    def __init__(self):
        super().__init__()
        self.avg_preacts = []
        self.collect_preact = True

    def forward(self, preact):
        if self.collect_preact:
            mean_abs = preact.abs().mean().item()
            self.avg_preacts.append(mean_abs)
        return F.relu(preact)


class ModuleWithStats(nn.Module):
    """Base class that tracks pre-activation statistics of ``self._model``.

    Subclasses are expected to assign an iterable module (e.g. an
    ``nn.Sequential``) to ``self._model``. After each forward pass,
    ``self.avg_preact`` holds the mean of the values recorded by the
    ``CustomReLU`` layers found directly inside ``self._model``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Only exact CustomReLU instances take part in the statistics.
        relu_layers = [layer for layer in self._model if type(layer) is CustomReLU]

        # Start from a clean slate so the statistic reflects this pass only.
        for relu in relu_layers:
            relu.avg_preacts = []

        out = self._model(x)

        recorded = [relu.avg_preacts for relu in relu_layers]
        self.avg_preact = np.mean(recorded)
        return out


class Linear(ModuleWithStats):
    """Bias-free linear classifier on the flattened input.

    For binary problems (``n_cls == 2``) a single logit is learned and a
    constant zero logit is prepended so the output still has two columns.

    Args:
        n_cls: number of classes.
        shape_in: input shape; every entry after the first is multiplied
            together to obtain the flattened feature dimension.
    """

    def __init__(self, n_cls, shape_in):
        n_cls = 1 if n_cls == 2 else n_cls
        super(Linear, self).__init__()
        d = int(np.prod(shape_in[1:]))
        self._model = nn.Sequential(
            Flatten(),
            nn.Linear(d, n_cls, bias=False)
        )

    def forward(self, x):
        logits = self._model(x)
        # Only the single-logit (binary) case needs the extra zero logit;
        # this mirrors PreActResNet.forward. zeros_like keeps the padding on
        # the same device and dtype as the logits.
        if logits.shape[1] == 1:
            logits = torch.cat([torch.zeros_like(logits), logits], dim=1)
        return logits


class LinearTwoOutputs(ModuleWithStats):
    """Bias-free linear classifier that always emits ``n_cls`` logits.

    The forward pass is inherited from ``ModuleWithStats``.
    """

    def __init__(self, n_cls, shape_in):
        super().__init__()
        # Every dimension after the first is folded into the feature axis.
        n_features = int(np.prod(shape_in[1:]))
        flatten = Flatten()
        classifier = nn.Linear(n_features, n_cls, bias=False)
        self._model = nn.Sequential(flatten, classifier)


class IdentityLayer(nn.Module):
    """No-op layer: hands back exactly the object it receives."""

    def forward(self, inputs):
        outputs = inputs
        return outputs


class PreActBlock(nn.Module):
    """Pre-activation residual block: norm -> act -> conv, twice.

    Args:
        in_planes: number of input channels.
        planes: number of output channels (before ``expansion``).
        bn: use BatchNorm2d if true, otherwise GroupNorm with ``gn_groups``.
        learnable_bn: passed as ``affine`` to BatchNorm2d; convolutions get
            a bias only when this is false.
        stride: stride of the first 3x3 convolution and of the shortcut.
        activation: ``'relu'`` or ``'softplus<beta>'`` (e.g. ``'softplus2'``).
        droprate: dropout probability applied before the second convolution.
        gn_groups: number of groups for GroupNorm.
    """
    expansion = 1

    def __init__(self, in_planes, planes, bn, learnable_bn, stride=1, activation='relu', droprate=0.0, gn_groups=32):
        super().__init__()
        self.collect_preact = True
        self.activation = activation
        self.droprate = droprate
        self.avg_preacts = []

        conv_bias = not learnable_bn
        out_planes = self.expansion * planes

        if bn:
            self.bn1 = nn.BatchNorm2d(in_planes, affine=learnable_bn)
        else:
            self.bn1 = nn.GroupNorm(gn_groups, in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=conv_bias)

        if bn:
            self.bn2 = nn.BatchNorm2d(planes, affine=learnable_bn)
        else:
            self.bn2 = nn.GroupNorm(gn_groups, planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=conv_bias)

        # A 1x1 projection is only created when the residual cannot be added
        # as-is; otherwise the attribute is left undefined on purpose.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            projection = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=conv_bias)
            self.shortcut = nn.Sequential(projection)

    def act_function(self, preact):
        """Apply the configured non-linearity to ``preact``."""
        if self.activation == 'relu':
            return F.relu(preact)
        assert self.activation[:8] == 'softplus'
        # The softplus sharpness is encoded as the suffix of the name.
        beta = int(self.activation.split('softplus')[1])
        return F.softplus(preact, beta=beta)

    def forward(self, x):
        pre = self.act_function(self.bn1(x))
        if hasattr(self, 'shortcut'):
            # Important: the projection consumes the activated tensor, not x.
            residual = self.shortcut(pre)
        else:
            residual = x
        out = self.conv1(pre)
        out = self.act_function(self.bn2(out))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, training=self.training)
        out = self.conv2(out)
        out += residual
        return out


class BasicBlock(nn.Module):
    """Pre-activation basic block used by ``WideResNet``.

    When the channel counts differ, a 1x1 convolution projects the activated
    input for the residual branch; otherwise the raw input is added back.
    """

    def __init__(self, in_planes, out_planes, stride, droprate=0.0):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.droprate = droprate
        self.equalInOut = in_planes == out_planes
        if self.equalInOut:
            self.convShortcut = None
        else:
            self.convShortcut = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
                                          padding=0, bias=False)

    def forward(self, x):
        activated = self.relu1(self.bn1(x))
        if not self.equalInOut:
            # The projection shortcut works on the activated tensor.
            x = activated
        out = self.relu2(self.bn2(self.conv1(activated)))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, training=self.training)
        out = self.conv2(out)
        if self.equalInOut:
            skip = x
        else:
            skip = self.convShortcut(x)
        return torch.add(skip, out)


class BasicBlockResNet34(nn.Module):
    """Post-activation basic residual block (conv-bn-relu, conv-bn, add, relu)."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # An empty Sequential acts as the identity; a 1x1 conv + BN projection
        # is used whenever the residual shape would not match.
        projection = []
        if stride != 1 or in_planes != out_planes:
            projection = [
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        hidden = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class NetworkBlock(nn.Module):
    """A stage made of ``nb_layers`` consecutive ``block`` instances.

    Args:
        nb_layers: number of blocks; converted with ``int()`` so a float such
            as ``4.0`` is accepted.
        in_planes: input channels of the first block.
        out_planes: output channels of every block (and input channels of all
            blocks after the first).
        block: callable ``block(in_planes, out_planes, stride, droprate)``.
        stride: stride of the first block; later blocks use stride 1.
        droprate: dropout probability forwarded to every block.
    """

    def __init__(self, nb_layers, in_planes, out_planes, block, stride, droprate=0.0):
        super(NetworkBlock, self).__init__()
        self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, droprate)

    def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, droprate):
        """Build the ``nn.Sequential`` holding the stage's blocks."""
        layers = []
        for i in range(int(nb_layers)):
            is_first = i == 0
            # Conditional expressions instead of ``cond and a or b``: the
            # latter silently picks ``b`` whenever ``a`` is falsy.
            layers.append(block(in_planes if is_first else out_planes,
                                out_planes,
                                stride if is_first else 1,
                                droprate))
        return nn.Sequential(*layers)

    def forward(self, x):
        return self.layer(x)


class Bottleneck(nn.Module):
    """Bottleneck residual block (1x1 -> 3x3 -> 1x1) with 4x channel expansion.

    With ``use_batchnorm=False`` all three normalisation slots share one empty
    ``nn.Sequential`` (an identity), and the shortcut projection also skips
    its BatchNorm.
    """
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, use_batchnorm=True):
        super().__init__()
        out_planes = self.expansion * planes

        if use_batchnorm:
            norms = [nn.BatchNorm2d(planes), nn.BatchNorm2d(planes), nn.BatchNorm2d(out_planes)]
        else:
            identity = nn.Sequential()
            norms = [identity, identity, identity]

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = norms[0]
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = norms[1]
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = norms[2]

        # Identity shortcut unless the spatial size or channel count changes.
        if stride != 1 or in_planes != out_planes:
            shortcut_norm = nn.BatchNorm2d(out_planes) if use_batchnorm else nn.Sequential()
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                shortcut_norm,
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        out = self.bn3(self.conv3(hidden))
        out += self.shortcut(x)
        return F.relu(out)


class ResNet(nn.Module):
    """Four-stage ResNet with a 3x3 stem and a linear classifier.

    Args:
        block: residual block class; must expose ``expansion`` and accept
            ``(in_planes, planes, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: size of the classifier output.
        model_width: channel count of the stem and first stage; later stages
            use 2x, 4x and 8x this width.
        droprate: accepted for interface compatibility; not used here.
        final_layer_factor: the classifier input size is
            ``final_layer_factor * model_width * block.expansion``.
    """

    def __init__(self, block, num_blocks, num_classes=10, model_width=64, droprate=0.0, final_layer_factor=8):
        super().__init__()
        self.in_planes = model_width
        self.half_prec = False
        self.block = block
        # Input normalisation is currently disabled: no ``normalize`` module
        # (nor ``mu`` / ``std``) is created on this model.
        self.conv1 = nn.Conv2d(3, model_width, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(model_width)
        self.layer1 = self._make_layer(block, model_width, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 2 * model_width, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 4 * model_width, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 8 * model_width, num_blocks[3], stride=2)
        classifier_in = final_layer_factor * model_width * block.expansion
        self.linear = nn.Linear(classifier_in, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, block_stride))
            # The next block consumes this block's (expanded) output.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x, return_features=False, return_block=5):
        """Run the network.

        With ``return_features=True`` the activation after stage
        ``return_block`` (1-4), or after average pooling (5), is returned
        instead of the logits.
        """
        assert return_block in [1, 2, 3, 4, 5], 'wrong return_block'
        out = F.relu(self.bn1(self.conv1(x)))
        stages = (self.layer1, self.layer2, self.layer3, self.layer4)
        for stage_idx, stage in enumerate(stages, start=1):
            out = stage(out)
            # TODO: Duplicate call
            # Keep it for resnet18 and resnet34 for consistency
            # Remove in the final version
            if stage_idx == 1 and self.block == BasicBlockResNet34:
                out = self.layer1(out)
            if return_features and return_block == stage_idx:
                return out
        out = F.avg_pool2d(out, 4)
        flat = out.view(out.size(0), -1)
        if return_features and return_block == 5:
            # NOTE(review): this returns the pooled tensor before flattening,
            # unlike PreActResNet which returns the flattened features.
            return out
        return self.linear(flat)


class PreActResNet(nn.Module):
    """Four-stage pre-activation ResNet.

    Args:
        block: block class; must expose ``expansion`` and accept
            ``(in_planes, planes, bn_flag, learnable_bn, stride, activation,
            droprate, gn_groups)``.
        num_blocks: number of blocks in each of the four stages.
        n_cls: number of classes; for ``n_cls == 2`` a single logit is learned
            and a zero logit is prepended in ``forward``.
        model_width: channel count of the stem; stages use 1x, 2x, 4x, 8x.
        cuda: accepted for interface compatibility; not used here.
        half_prec: stored on the instance; not otherwise used here.
        activation: activation name forwarded to the blocks.
        droprate: dropout probability forwarded to the blocks.
        bn_flag: BatchNorm if true, GroupNorm (``model_width // 2`` groups)
            otherwise.
        final_layer_factor: the classifier input size is
            ``final_layer_factor * model_width * block.expansion``.
    """

    def __init__(self, block, num_blocks, n_cls, model_width=64, cuda=True, half_prec=False, activation='relu',
                 droprate=0.0, bn_flag=True, final_layer_factor=8):
        super().__init__()
        self.half_prec = half_prec
        self.bn_flag = bn_flag
        # In particular, 32 groups for model_width=64 as in the original GroupNorm paper.
        self.gn_groups = model_width // 2
        self.learnable_bn = True  # doesn't matter if self.bn=False
        self.in_planes = model_width
        self.avg_preact = None
        self.activation = activation
        self.n_cls = n_cls

        # Input normalisation is currently disabled: no ``normalize`` module
        # (nor ``mu`` / ``std``) is created on this model.
        self.conv1 = nn.Conv2d(3, model_width, kernel_size=3, stride=1, padding=1, bias=not self.learnable_bn)
        self.layer1 = self._make_layer(block, model_width, num_blocks[0], 1, droprate)
        self.layer2 = self._make_layer(block, 2 * model_width, num_blocks[1], 2, droprate)
        self.layer3 = self._make_layer(block, 4 * model_width, num_blocks[2], 2, droprate)
        self.layer4 = self._make_layer(block, 8 * model_width, num_blocks[3], 2, droprate)

        last_channels = 8 * model_width * block.expansion
        if self.bn_flag:
            self.bn = nn.BatchNorm2d(last_channels)
        else:
            self.bn = nn.GroupNorm(self.gn_groups, last_channels)

        n_outputs = 1 if n_cls == 2 else n_cls
        self.linear = nn.Linear(final_layer_factor * model_width * block.expansion, n_outputs)

    def _make_layer(self, block, planes, num_blocks, stride, droprate):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_planes, planes, self.bn_flag, self.learnable_bn, block_stride,
                               self.activation, droprate, self.gn_groups))
            # The next block consumes this block's (expanded) output.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x, return_features=False, return_block=5):
        """Run the network.

        With ``return_features=True`` the activation after stage
        ``return_block`` (1-4), or the flattened pooled features (5), is
        returned instead of the logits.
        """
        assert return_block in [1, 2, 3, 4, 5], 'wrong return_block'
        stages = (self.layer1, self.layer2, self.layer3, self.layer4)

        # Reset the per-block statistics before every pass.
        for stage in stages:
            for blk in stage:
                blk.avg_preacts = []

        out = self.conv1(x)
        for stage_idx, stage in enumerate(stages, start=1):
            out = stage(out)
            if return_features and return_block == stage_idx:
                return out

        out = F.relu(self.bn(out))
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        if return_features and return_block == 5:
            return out

        out = self.linear(out)
        if out.shape[1] == 1:
            # Binary case: prepend a constant zero logit.
            out = torch.cat([torch.zeros_like(out), out], dim=1)
        return out


class WideResNet(nn.Module):
    """ Based on code from https://github.com/yaodongyu/TRADES

    Three stages of ``BasicBlock``s with widths 16k, 32k and 64k
    (``k = widen_factor``), followed by BN-ReLU, 8x8 average pooling and a
    linear classifier. ``depth`` must satisfy ``(depth - 4) % 6 == 0``.
    """
    def __init__(self, depth=28, num_classes=10, widen_factor=10, droprate=0.0, bias_last=True):
        super().__init__()
        self.half_prec = False
        widths = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor]
        assert ((depth - 4) % 6 == 0)
        # NetworkBlock converts this (float) count with int().
        blocks_per_stage = (depth - 4) / 6
        block = BasicBlock

        # Stem convolution, then the three residual stages.
        self.conv1 = nn.Conv2d(3, widths[0], kernel_size=3, stride=1, padding=1, bias=False)
        self.block1 = NetworkBlock(blocks_per_stage, widths[0], widths[1], block, 1, droprate)
        self.block2 = NetworkBlock(blocks_per_stage, widths[1], widths[2], block, 2, droprate)
        self.block3 = NetworkBlock(blocks_per_stage, widths[2], widths[3], block, 2, droprate)

        # Final normalisation, pooling and classifier.
        self.bn1 = nn.BatchNorm2d(widths[3])
        self.relu = nn.ReLU(inplace=True)
        self.fc = nn.Linear(widths[3], num_classes, bias=bias_last)
        self.nChannels = widths[3]

        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                # Normal init with std sqrt(2 / (kh * kw * out_channels)).
                fan = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

    def forward(self, x):
        out = self.conv1(x)
        for stage in (self.block1, self.block2, self.block3):
            out = stage(out)
        out = self.relu(self.bn1(out))
        out = F.avg_pool2d(out, 8)
        flat = out.view(-1, self.nChannels)
        return self.fc(flat)


# Layer layouts for the VGG variants, consumed by VGG._make_layers:
# an integer adds a 3x3 convolution with that many output channels (followed
# by an optional BatchNorm and a ReLU); 'M' adds a 2x2 max-pool with stride 2.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    '''
    VGG model. Source: https://github.com/chengyangfu/pytorch-vgg-cifar10/blob/master/vgg.py
    (in turn modified from https://github.com/pytorch/vision.git)

    Args:
        vgg_name: key into the module-level ``cfg`` dict ('VGG11', ...).
        n_cls: number of output classes.
        half_prec: stored on the instance; not otherwise used here.
        use_bn: insert a BatchNorm2d after every convolution.
        use_pt_init: keep PyTorch's default initialisation instead of calling
            ``_initialize_weights``.
        init_multip: factor applied to all Conv2d / BatchNorm2d / Linear
            weights and biases after initialisation (skipped when equal to 1).
        **kwargs: accepted and ignored.
    '''
    def __init__(self, vgg_name, n_cls, half_prec, use_bn=True, use_pt_init=False, init_multip=1, **kwargs):
        super(VGG, self).__init__()
        self.half_prec = half_prec
        # The normalisation constants used to be moved to CUDA unconditionally,
        # which made the model impossible to build on CPU-only machines.
        # Keep the CUDA placement when a GPU exists, otherwise stay on CPU.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Presumably the ImageNet per-channel statistics -- TODO confirm.
        self.mu = torch.tensor((0.485, 0.456, 0.406), device=device).view(1, 3, 1, 1)
        self.std = torch.tensor((0.229, 0.224, 0.225), device=device).view(1, 3, 1, 1)
        self.normalize = Normalize(self.mu, self.std)
        self.use_bn = use_bn
        # Parameter names collected for get_plotting_names(); the 'module.'
        # prefix is part of the recorded names.
        self.conv_names = []
        self.bn_names = []
        self._make_layers(cfg[vgg_name])
        self.classifier = torch.nn.Linear(512, n_cls)
        self.conv_names.append('module.classifier.weight')
        if not use_pt_init:
            self._initialize_weights()

        if init_multip != 1:
            for m in self.modules():
                if isinstance(m, torch.nn.Conv2d):
                    m.weight.data *= init_multip
                    if m.bias is not None:
                        m.bias.data *= init_multip
                elif isinstance(m, torch.nn.BatchNorm2d):
                    m.weight.data *= init_multip
                    m.bias.data *= init_multip
                elif isinstance(m, torch.nn.Linear):
                    m.weight.data *= init_multip
                    m.bias.data *= init_multip

    def _make_layers(self, cfg):
        """Populate ``self.features`` from a layer layout list.

        Integers add conv(+bn)+relu groups, ``'M'`` adds a max-pool.
        """
        in_channels = 3
        pool_num, block_num = 0, 0
        self.features = torch.nn.Sequential(OrderedDict([]))
        for x in cfg:
            if x == 'M':
                self.features.add_module(f'pool{pool_num}', torch.nn.MaxPool2d(kernel_size=2, stride=2))
                pool_num += 1
            else:
                self.features.add_module(f'conv{block_num}', torch.nn.Conv2d(in_channels, x, kernel_size=3, padding=1))
                if self.use_bn:
                    self.features.add_module(f'bn{block_num}', torch.nn.BatchNorm2d(x))
                self.features.add_module(f'relu{block_num}', torch.nn.ReLU(inplace=True))
                in_channels = x
                self.conv_names.append(f'module.features.conv{block_num}.weight')
                # Recorded even without BN; get_plotting_names() only exposes
                # these names when use_bn is set.
                self.bn_names.append(f'module.features.bn{block_num}.weight')
                block_num += 1

        # Registered for parity with the reference implementation; forward()
        # does not call it.
        self.add_module('global_pool', torch.nn.AvgPool2d(kernel_size=1, stride=1))

    def forward(self, x):
        x = self.normalize(x)
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self) -> None:
        """Kaiming-normal convs, unit BatchNorm, N(0, 0.01) linear weights."""
        for m in self.modules():
            if isinstance(m, torch.nn.Conv2d):
                torch.nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, torch.nn.BatchNorm2d):
                torch.nn.init.constant_(m.weight, 1)
                torch.nn.init.constant_(m.bias, 0)
            elif isinstance(m, torch.nn.Linear):
                torch.nn.init.normal_(m.weight, 0, 0.01)
                torch.nn.init.constant_(m.bias, 0)

    def get_plotting_names(self):
        """Return the recorded parameter names grouped as 'Linear' (and 'BN')."""
        if self.use_bn:
            return {'Linear': self.conv_names,
                    'BN': self.bn_names,}
        else:
            return {'Linear': self.conv_names,}

class SmallCNN(nn.Module):
    """Two conv-BN-ReLU layers, 8x8 average pooling and a linear classifier.

    The classifier expects 512 flattened features after pooling.
    """

    def __init__(self, num_classes=10, bias_last=True):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(32)
        self.relu2 = nn.ReLU(inplace=True)

        self.fc = nn.Linear(512, num_classes, bias=bias_last)

        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                # Normal init with std sqrt(2 / (kh * kw * out_channels)).
                kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))
            elif isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

    def forward(self, x):
        hidden = self.relu1(self.bn1(self.conv1(x)))
        hidden = self.relu2(self.bn2(self.conv2(hidden)))
        pooled = F.avg_pool2d(hidden, 8)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

def vgg11(**kwargs):
    """Build a ``VGG`` with the 'VGG11' layout; ``kwargs`` go to ``VGG``."""
    variant = 'VGG11'
    return VGG(variant, **kwargs)

def vgg13(**kwargs):
    """Build a ``VGG`` with the 'VGG13' layout; ``kwargs`` go to ``VGG``."""
    variant = 'VGG13'
    return VGG(variant, **kwargs)

def vgg16(**kwargs):
    """Build a ``VGG`` with the 'VGG16' layout; ``kwargs`` go to ``VGG``."""
    variant = 'VGG16'
    return VGG(variant, **kwargs)

def vgg19(**kwargs):
    """Build a ``VGG`` with the 'VGG19' layout; ``kwargs`` go to ``VGG``."""
    variant = 'VGG19'
    return VGG(variant, **kwargs)

def TinyResNet(n_cls, model_width=64, cuda=True, half_prec=False, activation='relu', droprate=0.0):
    """PreActResNet with one PreActBlock per stage and BatchNorm."""
    blocks_per_stage = [1, 1, 1, 1]
    return PreActResNet(
        PreActBlock, blocks_per_stage, n_cls=n_cls, model_width=model_width, cuda=cuda,
        half_prec=half_prec, activation=activation, droprate=droprate, bn_flag=True,
    )

def TinyResNetGroupNorm(n_cls, model_width=64, cuda=True, half_prec=False, activation='relu', droprate=0.0):
    """PreActResNet with one PreActBlock per stage and GroupNorm."""
    blocks_per_stage = [1, 1, 1, 1]
    return PreActResNet(
        PreActBlock, blocks_per_stage, n_cls=n_cls, model_width=model_width, cuda=cuda,
        half_prec=half_prec, activation=activation, droprate=droprate, bn_flag=False,
    )


def PreActResNet18(n_cls, model_width=64, cuda=True, half_prec=False, activation='relu', droprate=0.0, final_layer_factor=8):
    """PreActResNet with [2, 2, 2, 2] PreActBlocks per stage and BatchNorm."""
    blocks_per_stage = [2, 2, 2, 2]
    return PreActResNet(
        PreActBlock, blocks_per_stage, n_cls=n_cls, model_width=model_width, cuda=cuda,
        half_prec=half_prec, activation=activation, droprate=droprate, bn_flag=True,
        final_layer_factor=final_layer_factor,
    )


def PreActResNet34(n_cls, model_width=64, cuda=True, half_prec=False, activation='relu', droprate=0.0):
    """PreActResNet with [3, 4, 6, 3] PreActBlocks per stage and BatchNorm."""
    blocks_per_stage = [3, 4, 6, 3]
    return PreActResNet(
        PreActBlock, blocks_per_stage, n_cls=n_cls, model_width=model_width, cuda=cuda,
        half_prec=half_prec, activation=activation, droprate=droprate, bn_flag=True,
    )


def PreActResNet18GroupNorm(n_cls, model_width=64, cuda=True, half_prec=False, activation='relu', droprate=0.0):
    """PreActResNet with [2, 2, 2, 2] PreActBlocks per stage and GroupNorm.

    ``bn_flag=False`` selects GroupNorm; PreActResNet uses
    ``model_width // 2`` groups, i.e. 32 for the default width of 64.
    """
    blocks_per_stage = [2, 2, 2, 2]
    return PreActResNet(
        PreActBlock, blocks_per_stage, n_cls=n_cls, model_width=model_width, cuda=cuda,
        half_prec=half_prec, activation=activation, droprate=droprate, bn_flag=False,
    )


def PreActResNet34GroupNorm(n_cls, model_width=64, cuda=True, half_prec=False, activation='relu', droprate=0.0):
    """PreActResNet with [3, 4, 6, 3] PreActBlocks per stage and GroupNorm.

    ``bn_flag=False`` selects GroupNorm; PreActResNet uses
    ``model_width // 2`` groups, i.e. 32 for the default width of 64.
    """
    blocks_per_stage = [3, 4, 6, 3]
    return PreActResNet(
        PreActBlock, blocks_per_stage, n_cls=n_cls, model_width=model_width, cuda=cuda,
        half_prec=half_prec, activation=activation, droprate=droprate, bn_flag=False,
    )


def ResNet34(n_cls, model_width=64, final_layer_factor=8):
    """ResNet built from BasicBlockResNet34 with [3, 4, 6, 3] blocks per stage."""
    blocks_per_stage = [3, 4, 6, 3]
    return ResNet(
        BasicBlockResNet34,
        blocks_per_stage,
        num_classes=n_cls,
        model_width=model_width,
        final_layer_factor=final_layer_factor,
    )


def ResNet50(n_cls, model_width=64, final_layer_factor=8):
    """ResNet built from Bottleneck blocks with [3, 4, 6, 3] blocks per stage."""
    blocks_per_stage = [3, 4, 6, 3]
    return ResNet(
        Bottleneck,
        blocks_per_stage,
        num_classes=n_cls,
        model_width=model_width,
        final_layer_factor=final_layer_factor,
    )


def WideResNet28(n_cls, model_width=10):
    """WideResNet with its default depth; ``model_width`` is the widen factor."""
    net = WideResNet(num_classes=n_cls, widen_factor=model_width)
    return net

def CNN(n_cls):
    """Build a ``SmallCNN`` with ``n_cls`` outputs."""
    net = SmallCNN(n_cls)
    return net

class MLP(nn.Module):
    """Three-layer perceptron: two ReLU hidden layers and a linear output.

    Args:
        input_dim: number of features after flattening one sample.
        hidden_dim: width of both hidden layers.
        n_cls: number of output logits.
        dropout: dropout probability applied after each hidden layer
            (disabled when 0).
    """

    def __init__(self, input_dim=784, hidden_dim=512, n_cls=10, dropout=0.2):
        super(MLP, self).__init__()
        self.input_dim = input_dim
        self.droprate = dropout
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, n_cls)

    def forward(self, x):
        # Flatten image input
        x = x.view(-1, self.input_dim)
        # Hidden layers with ReLU activation and optional dropout
        x = F.relu(self.fc1(x))
        if self.droprate > 0:
            x = F.dropout(x, p=self.droprate, training=self.training)
        x = F.relu(self.fc2(x))
        if self.droprate > 0:
            x = F.dropout(x, p=self.droprate, training=self.training)
        # The output layer returns raw logits, as DeepMLP does. Applying ReLU
        # here clamped logits to be non-negative and blocked the gradient for
        # every negative logit.
        x = self.fc3(x)
        return x

class DeepMLP(nn.Module):
    """Six-layer perceptron with fixed widths 512-512-384-256-128.

    ``hidden_dim`` is accepted for interface parity but not used; the hidden
    widths are hard-coded. The last layer returns raw logits.
    """

    def __init__(self, input_dim=784, hidden_dim=512, n_cls=10, dropout=0.2):
        super().__init__()
        self.input_dim = input_dim
        self.droprate = dropout
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 384)
        self.fc4 = nn.Linear(384, 256)
        self.fc5 = nn.Linear(256, 128)
        self.fc6 = nn.Linear(128, n_cls)

    def forward(self, x):
        # Flatten image input
        hidden = x.view(-1, self.input_dim)
        # Every hidden layer: linear -> ReLU -> optional dropout
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            hidden = F.relu(layer(hidden))
            if self.droprate > 0:
                hidden = F.dropout(hidden, p=self.droprate, training=self.training)
        return self.fc6(hidden)


def get_model(model_name, n_cls, half_prec, shapes_dict, model_width, activation='relu', droprate=0.0, hidden_dim=512, **kwargs):
    """Instantiate a model by name.

    Args:
        model_name: one of the names handled in the branches below.
        n_cls: number of output classes.
        half_prec: forwarded to the models that accept it.
        shapes_dict: input shape, indexed positionally; ``[1]`` is used as the
            channel count and ``[-1]`` as the image size (presumably
            ``(N, C, H, W)`` -- TODO confirm against callers).
        model_width: width parameter of the ResNet / WideResNet families.
        activation: activation name for the pre-activation ResNets.
        droprate: dropout probability where supported.
        hidden_dim: hidden width for the MLP models.
        **kwargs: ``pretrained`` (optional, defaults to False) for
            'resnet18' / 'resnet50'; ``patch_size``, ``hidden_size``,
            ``num_hidden_layers``, ``num_attention_heads`` and ``mlp_ratio``
            (required) for 'vit' / 'vit2'.

    Returns:
        The constructed ``nn.Module``.

    Raises:
        ValueError: if ``model_name`` is not recognised.
        KeyError: if a required ViT option is missing from ``kwargs``.
    """
    # Callers that never deal with pretrained weights may omit the flag.
    pretrained = kwargs.get('pretrained', False)

    if model_name == 'resnet18':
        if pretrained:
            model = resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
            # Replace the ImageNet head with one sized for this task.
            num_features = model.fc.in_features
            model.fc = nn.Linear(num_features, n_cls)
        else:
            final_layer_factor = (shapes_dict[-1] // 32)**2 * 8  # 32 -> 8, 64 -> 32
            model = PreActResNet18(n_cls, model_width=model_width, half_prec=half_prec, activation=activation, droprate=droprate, final_layer_factor=final_layer_factor)
    elif model_name == 'resnet18_gn':
        model = PreActResNet18GroupNorm(n_cls, model_width=model_width, half_prec=half_prec, activation=activation, droprate=droprate)
    elif model_name == 'vgg16':
        assert droprate == 0.0, 'dropout is not implemented for vgg16'
        model = vgg16(n_cls=n_cls, half_prec=half_prec)
    elif model_name == 'vgg19':
        assert droprate == 0.0, 'dropout is not implemented for vgg19'
        model = vgg19(n_cls=n_cls, half_prec=half_prec)
    elif model_name == 'resnet34':
        final_layer_factor = (shapes_dict[-1] // 32)**2 * 8  # 32 -> 8, 64 -> 32
        model = ResNet34(n_cls, model_width, final_layer_factor=final_layer_factor)
    elif model_name == 'resnet34_gn':
        model = PreActResNet34GroupNorm(n_cls, model_width=model_width, half_prec=half_prec, activation=activation, droprate=droprate)
    elif model_name == 'resnet34preact':
        model = PreActResNet34(n_cls, model_width=model_width, half_prec=half_prec, activation=activation, droprate=droprate)
    elif model_name == 'resnet50':
        if pretrained:
            model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
            # Replace the ImageNet head with one sized for this task.
            num_features = model.fc.in_features
            model.fc = nn.Linear(num_features, n_cls)
        else:
            final_layer_factor = (shapes_dict[-1] // 32)**2 * 8  # 32 -> 8, 64 -> 32
            model = ResNet50(n_cls, model_width=model_width, final_layer_factor=final_layer_factor)
    elif model_name == 'wrn28':
        model = WideResNet28(n_cls, model_width)
    elif model_name == 'resnet_tiny':
        model = TinyResNet(n_cls, model_width=model_width, half_prec=half_prec, activation=activation, droprate=droprate)
    elif model_name == 'resnet_tiny_gn':
        model = TinyResNetGroupNorm(n_cls, model_width=model_width, half_prec=half_prec, activation=activation, droprate=droprate)
    elif model_name == 'cnn':
        model = CNN(n_cls)
    elif model_name == 'densenet121':
        model = densenet121(num_classes=n_cls, drop_rate=droprate)
    elif model_name == 'efficientb1':
        model = efficientnet_b1(num_classes=n_cls)
    elif model_name == 'efficientv2s':
        model = efficientnet_v2_s(num_classes=n_cls)
    elif model_name == 'mlp':
        input_dim = shapes_dict[1] * shapes_dict[2] * shapes_dict[3]
        model = MLP(input_dim=input_dim, hidden_dim=hidden_dim, n_cls=n_cls, dropout=droprate)
    elif model_name == 'deep_mlp':
        input_dim = shapes_dict[1] * shapes_dict[2] * shapes_dict[3]
        model = DeepMLP(input_dim=input_dim, hidden_dim=hidden_dim, n_cls=n_cls, dropout=droprate)
    elif model_name == 'vit':
        # Adapted from https://github.com/tintn/vision-transformer-from-scratch/tree/main
        config = {
            "patch_size": kwargs['patch_size'],  # Input image size: 32x32 -> 8x8 patches, default: 4
            "hidden_size": kwargs['hidden_size'], # default: 48
            "num_hidden_layers": kwargs['num_hidden_layers'], # default: 4
            "num_attention_heads": kwargs['num_attention_heads'], # default: 4
            "intermediate_size": kwargs['mlp_ratio'] * kwargs['hidden_size'], # 4 * hidden_size
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
            "initializer_range": 0.02,
            "image_size": shapes_dict[-1],
            "num_classes": n_cls, # num_classes of CIFAR10
            "num_channels": shapes_dict[1],
            "qkv_bias": True,
            "use_faster_attention": True,
        }
        # These are not hard constraints, but are used to prevent misconfigurations
        assert config["hidden_size"] % config["num_attention_heads"] == 0
        assert config['intermediate_size'] == kwargs['mlp_ratio'] * config['hidden_size']
        assert config['image_size'] % config['patch_size'] == 0

        model = ViTForClassfication(config)
    elif model_name == 'vit2':
        model = ViT(
            image_size = shapes_dict[-1],
            patch_size = kwargs['patch_size'],
            num_classes = n_cls,
            dim = kwargs['hidden_size'],
            depth = kwargs['num_hidden_layers'],
            heads = kwargs['num_attention_heads'],
            mlp_dim = kwargs['mlp_ratio'] * kwargs['hidden_size'],
            dropout = droprate,
            emb_dropout = droprate
        )
    else:
        raise ValueError('wrong model')
    return model


def init_weights(model, scale_init=0.0):
    """Select a per-module weight initializer based on the model name.

    Args:
        model: Model name. ``'linear'`` selects the scaled-Gaussian
            initializer; every other value selects the He-style initializer.
        scale_init: Target L2 norm of each weight tensor. Only used by the
            ``'linear'`` initializer; the default of 0.0 yields all-zero
            weights.

    Returns:
        A function that takes a single module and initializes it in place
        (e.g. usable with ``nn.Module.apply``). Modules of types the chosen
        initializer does not handle are left untouched.
    """

    def init_weights_linear(m):
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            # Draw a random direction, then rescale the whole tensor so that
            # its L2 (Frobenius) norm equals `scale_init`.
            m.weight.data.normal_()
            m.weight.data *= scale_init / (m.weight.data ** 2).sum() ** 0.5
            if m.bias is not None:
                m.bias.data.zero_()

    def init_weights_he(m):
        # From Rice et al.
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
            # Norm layers built with affine=False have no weight/bias.
            if m.weight is not None:
                m.weight.data.fill_(1)
            if m.bias is not None:
                m.bias.data.zero_()
        elif isinstance(m, nn.Linear):
            # Only the bias is reset here; the weight keeps its existing
            # values. Layers built with bias=False have nothing to reset.
            if m.bias is not None:
                m.bias.data.zero_()

    if model == 'linear':
        return init_weights_linear
    return init_weights_he


def forward_pass_rlat(model, x, deltas, layers):
    """Run ``model(x)`` while adding a perturbation to the output of each given layer.

    A forward hook is attached to every module in ``layers``. The k-th hook
    invocation during the forward pass uses ``deltas[k]`` and is compared
    against ``layers[k]``, so ``layers`` and ``deltas`` must be listed in the
    order in which the hooked modules finish their forward call, and each
    hooked module is expected to run once per forward pass.

    Args:
        model: Module to run. ``model.normalize`` is read on every hook call;
            ``model.mu`` and ``model.std`` are read when the normalize layer
            is among ``layers``.
        x: Input passed to ``model``.
        deltas: Sequence of perturbations, one per hooked layer call.
        layers: Sequence of sub-modules of ``model`` to perturb.

    Returns:
        The output of ``model(x)`` computed with the perturbed activations.
    """
    call_idx = 0

    def out_hook(m, inp, out_layer):
        nonlocal call_idx
        if layers[call_idx] is model.normalize:
            # For the normalization layer the delta is applied to the raw
            # input, which is clamped to [0, 1] and then re-normalized.
            new_out = (torch.clamp(inp[0] + deltas[call_idx], 0, 1) - model.mu) / model.std
        else:
            new_out = out_layer + deltas[call_idx]
        call_idx += 1
        return new_out

    handles = []
    try:
        for layer in layers:
            handles.append(layer.register_forward_hook(out_hook))
        return model(x)
    finally:
        # Always detach the hooks, even if the forward pass raised; otherwise
        # they would keep perturbing every later call to the model.
        for handle in handles:
            handle.remove()


def get_rlat_layers(model, layers):
    """Translate an RLAT layer-selection keyword into a list of sub-modules.

    The model is expected to expose ``normalize``, ``conv1``, ``bn`` and four
    stages ``layer1`` .. ``layer4``; blocks 0 and 1 of each stage are read,
    each providing ``bn1``, ``conv1``, ``bn2`` and ``conv2``. Only the
    attributes needed for the requested keyword are accessed.

    Args:
        model: Network whose sub-modules are collected.
        layers: One of ``'all'``, ``'lpips'``, ``'bnonly'``, ``'convonly'`` or
            ``'block0'`` .. ``'block4'``.

    Returns:
        List of sub-modules, in the order they appear in the network.

    Raises:
        ValueError: If ``layers`` is not one of the supported keywords.
    """
    stage_names = ('layer1', 'layer2', 'layer3', 'layer4')
    block_ids = (0, 1)

    def collect(attr_names, include_stage=False):
        # Walk stage by stage, block by block, picking the named attributes;
        # optionally append the stage module itself after its blocks.
        picked = []
        for stage_name in stage_names:
            for block_id in block_ids:
                block = getattr(model, stage_name)[block_id]
                for attr_name in attr_names:
                    picked.append(getattr(block, attr_name))
            if include_stage:
                picked.append(getattr(model, stage_name))
        return picked

    if layers == 'all':
        prefix = [model.normalize, model.conv1]
        body = collect(('bn1', 'conv1', 'bn2', 'conv2'), include_stage=True)
        return prefix + body + [model.bn]

    if layers == 'lpips':
        return [model.conv1] + [getattr(model, name) for name in stage_names]

    if layers == 'bnonly':
        prefix = [model.normalize]
        body = collect(('bn1', 'bn2'))
        return prefix + body + [model.bn]

    if layers == 'convonly':
        prefix = [model.normalize, model.conv1]
        return prefix + collect(('conv1', 'conv2'))

    # 'blockK' selects a single top-level module: the stem conv for K == 0,
    # otherwise the K-th stage.
    single_module = {
        'block0': 'conv1',
        'block1': 'layer1',
        'block2': 'layer2',
        'block3': 'layer3',
        'block4': 'layer4',
    }
    if isinstance(layers, str) and layers in single_module:
        return [getattr(model, single_module[layers])]

    raise ValueError('wrong RLAT layers')

