import math
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from .nn import (
    conv_nd,
    avg_pool_nd,
    zero_module,
    normalization,
)
from einops import rearrange, repeat
from einops.layers.torch import Rearrange

from ..vit import pair


class AttentionPool2d(nn.Module):
    """
    Attention pooling over flattened feature maps, with separate query and
    key/value inputs.

    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py

    :param spacial_dim: side length used to size the positional embedding
                        (``spacial_dim ** 2 + 1`` positions).
    :param embed_dim: number of channels of both inputs.
    :param num_heads_channels: channels per attention head; the number of
                               heads is ``embed_dim // num_heads_channels``.
    :param output_dim: number of output channels (``embed_dim`` if falsy).
    """

    def __init__(
            self,
            spacial_dim: int,
            embed_dim: int,
            num_heads_channels: int,
            output_dim: int = None,
    ):
        super().__init__()
        # One position per flattened location plus one for the mean token.
        n_positions = spacial_dim ** 2 + 1
        self.positional_embedding = nn.Parameter(
            th.randn(embed_dim, n_positions) / embed_dim ** 0.5
        )
        self.q_proj = conv_nd(1, embed_dim, embed_dim, 1)
        self.k_proj = conv_nd(1, embed_dim, embed_dim, 1)
        self.v_proj = conv_nd(1, embed_dim, embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    @staticmethod
    def _with_mean_token(feat):
        """Flatten trailing dims to one axis and prepend the mean over it."""
        batch, chans, *_ = feat.shape
        flat = feat.reshape(batch, chans, -1)  # NC(HW)
        mean_token = flat.mean(dim=-1, keepdim=True)
        return th.cat([mean_token, flat], dim=-1)  # NC(HW+1)

    def forward(self, q, kv):
        """
        Pool ``kv`` with queries built from ``q``.

        :param q: an [N x C x ...] tensor the queries are computed from.
        :param kv: an [N x C x ...] tensor the keys and values are computed
                   from.
        :return: an [N x C_out] tensor: the attention output at the
                 mean-token position.
        """
        q_tokens = self._with_mean_token(q)
        kv_tokens = self._with_mean_token(kv)
        pos = self.positional_embedding[None, :, :]

        # Queries and keys receive the positional embedding; values do not.
        queries = self.q_proj(q_tokens + pos.to(q_tokens.dtype))
        keys = self.k_proj(kv_tokens + pos.to(kv_tokens.dtype))
        values = self.v_proj(kv_tokens)

        pooled = self.c_proj(self.attention(queries, keys, values))
        return pooled[:, :, 0]


class Upsample(nn.Module):
    """
    Nearest-neighbour 2x upsampling, optionally followed by a convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    :param out_channels: channels produced by the convolution; defaults to
                         ``channels`` when falsy.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        # The convolution only exists when requested.
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)

    def forward(self, x):
        """
        :param x: an [N x C x ...] tensor with ``C == channels``.
        :return: the upsampled (and optionally convolved) tensor.
        """
        assert x.shape[1] == self.channels
        if self.dims == 3:
            # Keep the first spatial axis, double the last two.
            target_size = (x.shape[2], x.shape[3] * 2, x.shape[4] * 2)
            upsampled = F.interpolate(x, target_size, mode="nearest")
        else:
            upsampled = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(upsampled) if self.use_conv else upsampled


class Downsample(nn.Module):
    """
    Stride-2 downsampling done either by a convolution or by average pooling.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    :param out_channels: channels produced by the convolution; defaults to
                         ``channels`` when falsy. Must equal ``channels``
                         when pooling is used.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        # In 3D the first spatial axis is left at full resolution.
        stride = (1, 2, 2) if dims == 3 else 2
        if not use_conv:
            # Pooling cannot change the channel count.
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
        else:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=1
            )

    def forward(self, x):
        """
        :param x: an [N x C x ...] tensor with ``C == channels``.
        :return: the downsampled tensor.
        """
        n_in = x.shape[1]
        assert n_in == self.channels
        return self.op(x)


class ResBlock(nn.Module):
    """
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param use_scale_shift_norm: stored on the module; not read by this
        block's forward pass.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: stored on the module; this block's forward pass
        does not perform checkpointing itself.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    :param use_norm: if True, each layer stack starts with a normalization
        layer; if False, a channel-preserving 3-wide convolution takes its
        place.
    """

    def __init__(
            self,
            channels,
            dropout,
            out_channels=None,
            use_conv=False,
            use_scale_shift_norm=False,
            dims=2,
            use_checkpoint=False,
            up=False,
            down=False,
            use_norm=True
    ):
        super().__init__()
        self.channels = channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm

        # The first argument of conv_nd is the signal dimensionality, so the
        # norm-free stand-in must receive `dims`, not the channel count.
        self.in_layers = nn.Sequential(
            normalization(channels) if use_norm
            else conv_nd(dims, channels, channels, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        # The same resampling is applied to the residual branch (h) and to
        # the skip branch (x) so they can be summed.
        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        # NOTE(review): out_layers is constructed (and therefore part of the
        # state dict) but forward() never calls it. Confirm whether that is
        # intentional before wiring it in, since doing so changes the output.
        self.out_layers = nn.Sequential(
            normalization(self.out_channels) if use_norm
            else conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
            ),
        )

        # Only project the skip path when the channel count changes.
        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, channels, self.out_channels, 3, padding=1
            )
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    def forward(self, x):
        """
        Apply the block to a Tensor.
        :param x: an [N x C x ...] Tensor of features.
        :return: the sum of the skip path and the output of ``in_layers``,
                 resampled when ``up`` or ``down`` was set.
        """
        if self.updown:
            # Resample between the norm/activation and the final convolution
            # of in_layers, and resample the skip input the same way.
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)
        return self.skip_connection(x) + h


class AttentionBlock(nn.Module):
    """
    A cross-attention block: patches of a 2-D feature map attend to a
    sequence of key/value tokens, and the result is folded back into a
    feature map of the input's shape.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.

    :param c_q: number of channels of the query feature map.
    :param c_kv: number of channels of the key/value tokens.
    :param channels: width of the attention (q, k and v are projected to it).
    :param patch_size: patch size handed to ``pair`` to obtain
                       (patch_height, patch_width).
    :param num_heads: number of heads, used when ``num_head_channels == -1``.
    :param num_head_channels: if not -1, channels per head; the number of
                              heads becomes ``channels // num_head_channels``.
    :param kv_len: number of rows of the learned key positional embedding,
                   i.e. the key/value sequence length it is added to.
    """

    def __init__(
            self,
            c_q,
            c_kv,
            channels,
            patch_size=4,
            num_heads=1,
            num_head_channels=-1,
            kv_len=3456,
    ):
        super().__init__()
        self.channels = channels
        if num_head_channels == -1:
            self.num_heads = num_heads
        else:
            assert (
                    channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels

        patch_height, patch_width = pair(patch_size)
        # Kept so forward() can invert the patch split.
        self.patch_height = patch_height
        self.patch_width = patch_width

        patch_dim = c_q * patch_height * patch_width
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, channels),
            nn.LayerNorm(channels),
        )

        self.q_proj = conv_nd(1, channels, channels, 1)
        self.k_proj = conv_nd(1, c_kv, channels, 1)
        self.v_proj = conv_nd(1, c_kv, channels, 1)
        self.attention = QKVAttention(self.num_heads)
        self.k_positional_embedding = nn.Parameter(
            th.randn(kv_len, c_kv) / c_kv ** 0.5
        )
        self.proj_out = zero_module(conv_nd(1, channels, patch_dim, 1))

    def forward(self, q, kv, pos_x=None, pos_y=None):
        """
        :param q: a [B x c_q x H x W] feature map; H and W must be divisible
                  by the patch height and width.
        :param kv: a [B x kv_len x c_kv] tensor of key/value tokens.
        :param pos_x: accepted for interface compatibility; not used.
        :param pos_y: accepted for interface compatibility; not used.
        :return: a [B x c_q x H x W] tensor. No residual is added here.
        """
        B, c, H, W = q.shape

        tokens = self.to_patch_embedding(q)  # [B, T, channels]
        # Keys receive the positional embedding; values do not.
        k = kv + self.k_positional_embedding[None, :, :].to(kv.dtype)
        v = kv

        # The 1-D convolutions and the attention are channels-first.
        q = self.q_proj(tokens.transpose(1, 2))
        k = self.k_proj(k.transpose(1, 2))
        v = self.v_proj(v.transpose(1, 2))
        h = self.attention(q, k, v)
        h = self.proj_out(h)  # [B, patch_dim, T]

        # Exact inverse of the patch split in to_patch_embedding, so every
        # token's output returns to the patch it was computed from. A plain
        # reshape to (B, c, H, W) would not restore the spatial layout.
        return rearrange(
            h,
            'b (p1 p2 c) (h w) -> b c (h p1) (w p2)',
            p1=self.patch_height,
            p2=self.patch_width,
            h=H // self.patch_height,
        )


class QKVAttention(nn.Module):
    """
    A module which performs multi-head QKV attention on separate query, key
    and value tensors.

    :param n_heads: number of attention heads the channel axis is split into.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, q, k, v):
        """
        Apply QKV attention.
        :param q: an [N x (H * C) x T_q] tensor of queries.
        :param k: an [N x (H * C) x T_kv] tensor of keys.
        :param v: an [N x (H * C) x T_kv] tensor of values.
        :return: an [N x (H * C) x T_q] tensor after attention.
        """
        bs, width, l_q = q.shape
        l_kv = k.shape[2]
        ch = width // self.n_heads
        # Scaling q and k each by ch ** -0.25 scales their product by
        # ch ** -0.5.
        scale = 1 / math.sqrt(math.sqrt(ch))
        # reshape (not view) so non-contiguous inputs, e.g. transposed
        # tensors, are accepted, matching how v is handled below.
        weight = th.einsum(
            "bct,bcs->bts",
            (q * scale).reshape(bs * self.n_heads, ch, l_q),
            (k * scale).reshape(bs * self.n_heads, ch, l_kv),
        )  # More stable with f16 than dividing afterwards
        # Softmax in float32, then cast back to the working dtype.
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
        a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, l_kv))
        return a.reshape(bs, -1, l_q)
