import os
from typing import Any, Optional, Type

import torch
import torch.nn as nn
import torch.nn.functional as F

from layers.spectral_norm_conv import SpectralNormConv

# this implementation is borrowed from https://github.com/meliketoy/wide-resnet.pytorch/blob/master/networks/wide_resnet.py
# the DDU specific parts have been modified to follow the implementation from here:
#  - https://github.com/omegafragger/DDU/blob/93030446bef492250094bfd0c131b78873757574/net/wide_resnet.py

# Shorthand alias used for tensor type annotations in this module.
T = torch.Tensor


def conv3x3(in_planes: int, out_planes: int, dim: int, stride: int = 1, ctype: str = "none", c: float = 3.0) -> nn.Module:
    """Build a 3x3, padding-1, biased convolution wrapped in ``SpectralNormConv``.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        dim: spatial size used to build the ``(1, in_planes, dim, dim)`` shape
            handed to ``SpectralNormConv`` (presumably the expected input shape
            of the convolution -- confirm against ``SpectralNormConv``).
        stride: stride of the convolution.
        ctype: forwarded to ``SpectralNormConv`` as ``ctype``.
        c: forwarded to ``SpectralNormConv`` as ``c``.

    Returns:
        The ``SpectralNormConv`` module wrapping the convolution.
    """
    conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=True)
    input_shape = (1, in_planes, dim, dim)
    # Forward the caller's ctype; it used to be hard-coded to "none", which
    # silently discarded whatever the caller had configured.
    return SpectralNormConv(conv, input_shape, ctype=ctype, c=c)


class Shortcut(nn.Module):
    """Parameter-free residual shortcut: average-pool spatially, then zero-pad channels.

    The input is downsampled with ``nn.AvgPool2d(stride, stride)`` and then
    ``out_planes - in_planes`` all-zero channels are appended along dim 1.

    Args:
        stride: kernel size and stride of the average pooling.
        p: unused; kept so the constructor signature stays unchanged.
        in_planes: number of channels of the incoming tensor.
        out_planes: number of channels of the returned tensor.

    Raises:
        ValueError: if ``out_planes`` is smaller than ``in_planes`` (channels
            can only be padded, never removed).
    """

    def __init__(self, stride: int, p: float, in_planes: int, out_planes: int):
        super().__init__()

        if out_planes < in_planes:
            raise ValueError(
                f"Shortcut can only add channels: out_planes={out_planes} < in_planes={in_planes}"
            )

        self.in_planes = in_planes
        self.out_planes = out_planes
        self.stride = stride
        self.avg_pool = nn.AvgPool2d(stride, stride)

    def forward(self, x: T) -> T:
        """Return ``x`` pooled by ``stride`` with zero channels appended up to ``out_planes``."""
        x = self.avg_pool(x)
        b, _, h, w = x.size()
        # new_zeros inherits dtype *and* device from x, so the concatenation does
        # not promote half/bfloat16/double activations to a different dtype.
        padding = x.new_zeros(b, self.out_planes - self.in_planes, h, w)
        return torch.cat((x, padding), dim=1)


class WideBasic(nn.Module):
    """Residual block: two (BatchNorm -> LeakyReLU -> 3x3 conv) stages plus a shortcut.

    Dropout is applied between the second activation and the second
    convolution. Only the second convolution is strided.

    Args:
        in_planes: number of input channels.
        planes: number of output channels.
        p: dropout probability, passed to ``dropout_layer`` as ``p``.
        dim: spatial size passed to ``conv3x3`` for both convolutions.
        stride: stride of the second convolution and of the shortcut pooling.
        dropout_layer: callable building the dropout module from ``p``.
        ctype: forwarded to ``conv3x3``.
        c: forwarded to ``conv3x3``.
    """

    def __init__(
        self,
        in_planes: int,
        planes: int,
        p: float,
        dim: int,
        stride: int = 1,
        dropout_layer: Any = nn.Dropout,
        ctype: str = "none",
        c: float = 3.0
    ):
        super().__init__()

        # first stage: keeps the spatial resolution
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = conv3x3(in_planes, planes, dim, ctype=ctype, c=c)
        self.dropout = dropout_layer(p=p)

        # second stage: carries the (optional) downsampling stride
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, dim, stride=stride, ctype=ctype, c=c)
        self.leaky_relu = nn.LeakyReLU(0.01)

        # the original paper says that they use a strided average pooling for downsampling rather than a 1x1 convolution
        # https://arxiv.org/pdf/2102.11582.pdf (appendix B)
        shape_changes = stride != 1 or in_planes != planes
        self.shortcut = Shortcut(stride, p, in_planes, planes) if shape_changes else nn.Identity()

    def forward(self, x: T) -> T:
        """Run both conv stages on ``x`` and add the shortcut branch in place."""
        hidden = self.bn1(x)
        hidden = self.leaky_relu(hidden)
        hidden = self.conv1(hidden)

        hidden = self.bn2(hidden)
        hidden = self.leaky_relu(hidden)
        hidden = self.dropout(hidden)
        hidden = self.conv2(hidden)

        hidden += self.shortcut(x)
        return hidden  # type: ignore


class WideResNetSN(nn.Module):
    """Wide ResNet feature extractor built from spectrally-normalised 3x3 convolutions.

    The network is a stem convolution followed by three stages of ``WideBasic``
    blocks (the last two downsample by 2), a BatchNorm + LeakyReLU, an average
    pooling and a flatten. No classification layer is created here, so
    ``forward`` returns features.

    Args:
        depth: total depth; must be of the form ``6n + 4``.
        widen_factor: channel multiplier ``k``; stage widths are
            ``16k, 32k, 64k``.
        p: dropout probability used in the stem and in every block.
        num_classes: accepted for interface compatibility; not used by this class.
        dim: spatial size of the (square) input. It is divided by each stage's
            stride with integer division, so it should be divisible by 4.
        name: base name of the model; required despite the ``None`` default.
        filterwise_dropout: use ``nn.Dropout2d`` when true, ``nn.Dropout`` otherwise.
        ctype: forwarded to every ``conv3x3``.
        c: forwarded to every ``conv3x3``.

    Raises:
        ValueError: if ``name`` is ``None`` or ``depth`` is not ``6n + 4``.

    Note:
        ``self.dim`` is updated while the stages are built, so after
        construction it holds the spatial size after the last stage rather
        than the input size.
    """

    def __init__(
        self,
        depth: int,
        widen_factor: int = 10,
        p: float = 0.1,
        num_classes: int = 10,
        dim: int = 32,
        name: Optional[str] = None,
        filterwise_dropout: bool = True,
        ctype: str = "none",
        c: float = 3.0
    ):
        super().__init__()
        if name is None:
            raise ValueError("name cannot be None for WideResNet")
        # Raise explicitly instead of asserting: asserts are stripped under `python -O`.
        if (depth - 4) % 6 != 0:
            raise ValueError('Wide-resnet depth should be 6n+4')

        get_drop_layer = {False: (nn.Dropout, "dropout"), True: (nn.Dropout2d, "filterwise-dropout")}
        dropout_layer, drop_layername = get_drop_layer[filterwise_dropout]

        self.in_planes = 16
        self.name = os.path.join(name, drop_layername, f"p-{p}")
        self.ctype = ctype
        self.c = c
        self.dim = dim

        n = (depth - 4) // 6
        k = widen_factor

        self.out_dim = 64 * k
        stage_widths = [16, 16 * k, 32 * k, 64 * k]

        self.conv1 = nn.Sequential(conv3x3(3, stage_widths[0], dim, ctype=ctype, c=c), dropout_layer(p=p))
        self.layer1 = self._wide_layer(WideBasic, stage_widths[1], n, p, stride=1, dropout_layer=dropout_layer)
        self.layer2 = self._wide_layer(WideBasic, stage_widths[2], n, p, stride=2, dropout_layer=dropout_layer)
        self.layer3 = self._wide_layer(WideBasic, stage_widths[3], n, p, stride=2, dropout_layer=dropout_layer)
        self.bn1 = nn.BatchNorm2d(stage_widths[3], momentum=0.9)
        self.leaky_relu = nn.LeakyReLU(0.01)

        self.h_dim = stage_widths[3]

        # _wide_layer has reduced self.dim to the spatial size after the last
        # stage; pooling over exactly that window yields one value per channel.
        # This equals the previously hard-coded 8 for the default dim=32.
        self.pool_size = self.dim

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _wide_layer(self, block: "Type[WideBasic]", planes: int, num_blocks: int, dropout_rate: float, stride: int, dropout_layer: Any) -> nn.Module:
        """Build one stage of ``num_blocks`` blocks; only the first block uses ``stride``.

        Updates ``self.in_planes`` and ``self.dim`` so the next block (and the
        next stage) is built with the channel count and spatial size produced
        by the previous one.
        """
        strides = [stride] + [1] * (int(num_blocks) - 1)
        layers = []

        for block_stride in strides:
            layers.append(
                block(
                    self.in_planes,
                    planes,
                    dropout_rate,
                    self.dim,
                    block_stride,
                    dropout_layer=dropout_layer,
                    ctype=self.ctype,
                    c=self.c
                )
            )
            self.in_planes = planes
            self.dim = self.dim // block_stride

        return nn.Sequential(*layers)

    def forward(self, x: T) -> T:
        """Return the pooled features of ``x``, flattened from dim 1 onwards.

        For inputs whose spatial size matches the ``dim`` given at construction
        the pooling window covers the whole feature map.
        """
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.leaky_relu(self.bn1(out))
        out = F.avg_pool2d(out, self.pool_size)
        out = torch.flatten(out, start_dim=1)
        return out  # type: ignore


def wide_sn_resnet28_cifar(**kwargs: Any) -> WideResNetSN:
    """Build a depth-28 ``WideResNetSN`` named ``WideResNetSN28-<widen_factor>``.

    Args:
        **kwargs: forwarded unchanged to ``WideResNetSN``. ``depth`` and
            ``name`` are set here and must not be passed.

    Returns:
        The constructed ``WideResNetSN``.
    """
    # widen_factor is optional for WideResNetSN (default 10), so it must not be
    # required here; mirror that default when only building the name.
    widen_factor = kwargs.get("widen_factor", 10)
    return WideResNetSN(28, name=f"WideResNetSN28-{widen_factor}", **kwargs)
