"""
2023.03.05 init
"""
from functools import partial
import math
import torch
import torch.nn as nn
from typing import Optional, List, Tuple

from myopenfold.model.primitives import (
    Linear, 
    LayerNorm,
    Attention, 
    GlobalAttention, 
    _attention_chunked_trainable,
)
from myopenfold.utils.checkpointing import get_checkpoint_fn

from myopenfold.utils.tensor_utils import (
    permute_final_dims,
    flatten_final_dims,
)


class MSAAttention(nn.Module):
    """
    Self-attention over an MSA embedding.

    The input is layer-normalised and passed to ``Attention`` as both the
    query and the key/value source. A mask bias is always supplied; when
    ``pair_bias`` is enabled and a pair embedding is given, a per-head bias
    projected from the pair embedding is supplied as well.
    """

    def __init__(
        self,
        c_in,
        c_hidden,
        no_heads,
        pair_bias=False,
        c_z=None,
        inf=1e9,
        depth=0,
        ind=0,
        log=False,
    ):
        """
        Args:
            c_in:
                Input channel dimension
            c_hidden:
                Per-head hidden channel dimension
            no_heads:
                Number of attention heads
            pair_bias:
                Whether to use pair embedding bias
            c_z:
                Pair embedding channel dimension. Ignored unless pair_bias
                is true
            inf:
                A large number to be used in computing the attention mask
            .depth:
                Depth of this module in the whole model (used as the number
                of leading tabs in debug output)
            .ind:
                Index of this block in the stack (debug output is only
                printed when this is 0)
            .log
                Whether to print some log information
        """
        super(MSAAttention, self).__init__()

        self.depth = depth
        self.ind = ind
        self.log = log

        self.c_in = c_in
        self.c_hidden = c_hidden
        self.no_heads = no_heads
        self.pair_bias = pair_bias
        self.c_z = c_z
        self.inf = inf

        self.layer_norm_m = LayerNorm(self.c_in)

        #. Pair bias Embedding layer (only built when pair_bias is set)
        self.layer_norm_z = None
        self.linear_z = None
        if self.pair_bias:
            self.layer_norm_z = LayerNorm(self.c_z)
            self.linear_z = Linear(
                self.c_z, self.no_heads, bias=False, init="normal"
            )

        # NOTE(review): depth + 1 and ind are passed positionally, so this
        # relies on the parameter order of the project's Attention class.
        self.mha = Attention(
            self.c_in, 
            self.c_in, 
            self.c_in, 
            self.c_hidden, 
            self.no_heads,
            self.depth + 1,
            self.ind,
        )

    def _log_msg(self, text: str) -> None:
        """Print a tab-indented debug line; active only if log is set and ind == 0."""
        if self.ind == 0 and self.log:
            print('\t' * self.depth + text)

    def _log_shape(self, name: str, tensor: Optional[torch.Tensor]) -> None:
        """Print the shape of `tensor` (or 'None' when it is absent) as a debug line."""
        if self.ind == 0 and self.log:
            # Optional tensors (mask, z) must not be dereferenced blindly.
            shape = tensor.shape if tensor is not None else 'None'
            print('\t' * self.depth + name, shape)

    def _prep_inputs(self,
        m: torch.Tensor,
        z: Optional[torch.Tensor],
        mask: Optional[torch.Tensor],
        inplace_safe: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: 
        """
        Build the attention biases.

        Args:
            m:
                [*, N_seq, N_res, C_m] MSA embedding (returned unchanged)
            z:
                [*, N_res, N_res, C_z] pair embedding, or None
            mask:
                [*, N_seq, N_res] MSA mask, or None for an all-ones mask
            inplace_safe:
                Accepted for interface compatibility; not used here
        Returns:
            (m, mask_bias, z) where mask_bias is [*, N_seq, 1, 1, N_res] and
            z is [*, 1, no_heads, N_res, N_res] if the pair bias was
            computed. Otherwise z is passed through as given (None, or the
            raw pair embedding when pair_bias is disabled).
        """
        n_seq, n_res = m.shape[-3:-1]
        if mask is None:
            # [*, N_seq, N_res]
            mask = m.new_ones(
                m.shape[:-3] + (n_seq, n_res),
            )

        # [*, N_seq, 1, 1, N_res]
        #. 1 -> 0, 0 -> -inf
        mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]

        # NOTE the explicit `is not None` checks are the part that exists
        # NOTE  for TorchScript (they narrow the Optional attributes).
        # NOTE  The pair embedding is projected in slices of 256 along dim
        # NOTE  -3 and re-concatenated; presumably this bounds peak memory
        # NOTE  and is otherwise just layernorm + linear -- TODO confirm.
        if (self.pair_bias and 
            z is not None and                       # For the 
            self.layer_norm_z is not None and       # benefit of
            self.linear_z is not None               # TorchScript
        ):
            chunks = []

            for i in range(0, z.shape[-3], 256):
                z_chunk = z[..., i: i + 256, :, :]

                # [*, N_res, N_res, C_z]
                z_chunk = self.layer_norm_z(z_chunk)
            
                # [*, N_res, N_res, no_heads]
                z_chunk = self.linear_z(z_chunk)

                chunks.append(z_chunk)
            
            z = torch.cat(chunks, dim=-3)
            
            # [*, 1, no_heads, N_res, N_res]
            z = permute_final_dims(z, (2, 0, 1)).unsqueeze(-4)

        return m, mask_bias, z

    def forward(self, 
        m: torch.Tensor, 
        z: Optional[torch.Tensor] = None, 
        mask: Optional[torch.Tensor] = None, 
        use_memory_efficient_kernel: bool = False,
        inplace_safe: bool = False,
    ) -> torch.Tensor:
        """
        Args:
            m:
                [*, N_seq, N_res, C_m] MSA embedding
            z:
                [*, N_res, N_res, C_z] pair embedding. Required only if
                pair_bias is True
            mask:
                [*, N_seq, N_res] MSA mask
            use_memory_efficient_kernel:
                Forwarded unchanged to the attention module
            inplace_safe:
                Forwarded to _prep_inputs, which does not use it
        Returns:
            [*, N_seq, N_res, C_m] MSA embedding update (shape as annotated
            by the original author)
        """
        self._log_msg('In MSAAttention: init')  # DEBUG
        self._log_shape('m', m)  # DEBUG
        self._log_shape('z', z)  # DEBUG
        self._log_shape('mask', mask)  # DEBUG

        m, mask_bias, z = self._prep_inputs(
            m, z, mask, inplace_safe=inplace_safe
        )
        #. m [*, N_seq, N_res, C_m]
        #. mask_bias [*, N_seq, 1, 1, N_res]
        #. z [*, 1, no_heads, N_res, N_res]
        self._log_msg('In MSAAttention: after _prep_inputs')  # DEBUG
        self._log_shape('m', m)  # DEBUG
        self._log_shape('mask_bias', mask_bias)  # DEBUG
        self._log_shape('z', z)  # DEBUG  (z stays None without a pair bias)

        biases = [mask_bias]
        if z is not None:
            biases.append(z)

        m = self.layer_norm_m(m)
        m = self.mha(
            q_x=m, 
            kv_x=m, 
            biases=biases,
            use_memory_efficient_kernel=use_memory_efficient_kernel
        )
        #. q_x [*, N_seq, N_res, C_m]
        #. kv_x [*, N_seq, N_res, C_m]
        #. biases [*, N_seq, 1, 1, N_res] [*, 1, no_heads, N_res, N_res]
        #. q [*, N_seq, no_heads, N_res, C_m // no_heads]
        #. k [*, N_seq, no_heads, N_res, C_m // no_heads]
        #. v [*, N_seq, no_heads, N_res, C_m // no_heads]
        #. a [*, N_seq, no_heads, N_res, N_res]
        #. o [*, N_seq, no_heads, N_res, C_m // no_heads] -> [*, N_seq, N_res, no_heads, C_m // no_heads]
        #. g [*, N_seq, N_res, C_m] -> [*, N_seq, N_res, no_heads, C_m // no_heads]
        #. m [*, N_seq, N_res, C_m]
        self._log_msg('In MSAAttention: after mha')  # DEBUG
        self._log_shape('m', m)  # DEBUG

        return m


class MSARowAttentionWithPairBias(MSAAttention):
    """
    Implements Algorithm 7.

    This is MSAAttention with the pair bias switched on; all computation
    is inherited from the base class.
    """

    def __init__(self, c_m, c_z, c_hidden, no_heads, inf=1e9, depth=0, ind=0, log=False):
        """
        Args:
            c_m:
                Input (MSA) channel dimension
            c_z:
                Pair embedding channel dimension
            c_hidden:
                Per-head hidden channel dimension
            no_heads:
                Number of attention heads
            inf:
                Large number used to construct attention masks
            .depth:
                Depth of this module in the whole model
            .ind:
                Index of this block in the stack
            .log
                Whether to print some log information
        """
        # Everything is forwarded by keyword; pair_bias is fixed to True.
        base_config = dict(
            c_in=c_m,
            c_hidden=c_hidden,
            no_heads=no_heads,
            pair_bias=True,
            c_z=c_z,
            inf=inf,
            depth=depth,
            ind=ind,
            log=log,
        )
        super().__init__(**base_config)


class MSAColumnAttention(nn.Module):
    """
    Implements Algorithm 8.

    By rights, this should also be a subclass of MSAAttention. Alas,
    most inheritance isn't supported by TorchScript.

    The module swaps the sequence and residue axes, runs an inner
    MSAAttention (without pair bias), and swaps the axes back.
    """

    def __init__(self, c_m, c_hidden, no_heads, inf=1e9, depth=0, ind=0, log=False):
        """
        Args:
            c_m:
                MSA channel dimension
            c_hidden:
                Per-head hidden channel dimension
            no_heads:
                Number of attention heads
            inf:
                Large number used to construct attention masks
            .depth:
                Depth of this module in the whole model
            .ind:
                Index of this block in the stack
            .log
                Whether to print some log information. Stored on this
                module only; it is not forwarded to the inner attention.
        """
        super(MSAColumnAttention, self).__init__()
        
        self.depth = depth
        self.ind = ind
        self.log = log

        self.c_m = c_m
        self.c_hidden = c_hidden
        self.no_heads = no_heads
        self.inf = inf

        # NOTE `log` is deliberately left at the inner module's default
        # here, so the inner MSAAttention does not print debug output.
        self._msa_att = MSAAttention(
            c_in=c_m,
            c_hidden=c_hidden,
            no_heads=no_heads,
            pair_bias=False,
            c_z=None,
            inf=inf,
            depth=depth,
            ind=ind
        )

    def forward(self, 
        m: torch.Tensor, 
        mask: Optional[torch.Tensor] = None, 
    ) -> torch.Tensor:
        """
        Args:
            m:
                [*, N_seq, N_res, C_m] MSA embedding
            mask:
                [*, N_seq, N_res] MSA mask
        Returns:
            Output of the inner attention with the sequence and residue
            axes swapped back, i.e. in the same axis order as the input m
        """ 
        # [*, N_res, N_seq, C_in]
        m = m.transpose(-2, -3)
        if mask is not None:
            # transpose returns a new view, so the caller's mask is untouched
            mask = mask.transpose(-1, -2)

        m = self._msa_att(
            m, 
            mask=mask, 
        )

        # [*, N_seq, N_res, C_in]
        m = m.transpose(-2, -3)

        return m


class MSAColumnGlobalAttention(nn.Module):
    """
    Column-wise global attention over an MSA embedding.

    The sequence and residue axes are swapped, the result is
    layer-normalised and passed to ``GlobalAttention``, and the axes are
    swapped back.
    """

    def __init__(
        self, c_in, c_hidden, no_heads, inf=1e9, eps=1e-10,
    ):
        """
        Args:
            c_in:
                Input channel dimension
            c_hidden:
                Per-head hidden channel dimension
            no_heads:
                Number of attention heads
            inf:
                Large number, forwarded to GlobalAttention
            eps:
                Small number, forwarded to GlobalAttention
        """
        super(MSAColumnGlobalAttention, self).__init__()

        self.c_in = c_in
        self.c_hidden = c_hidden
        self.no_heads = no_heads
        self.inf = inf
        self.eps = eps

        self.layer_norm_m = nn.LayerNorm(c_in)

        self.global_attention = GlobalAttention(
            c_in=c_in,
            c_hidden=c_hidden,
            no_heads=no_heads,
            inf=inf,
            eps=eps,
        )

    def forward(
        self, 
        m: torch.Tensor, 
        mask: Optional[torch.Tensor] = None, 
    ) -> torch.Tensor:
        """
        Args:
            m:
                [*, N_seq, N_res, C_in] MSA embedding
            mask:
                [*, N_seq, N_res] MSA mask. Defaults to all ones, with the
                dtype and device of m
        Returns:
            Output of the global attention with the sequence and residue
            axes swapped back, i.e. in the same axis order as the input m
        """
        if mask is None:
            # [*, N_seq, N_res]; a fresh tensor carries no autograd history
            mask = m.new_ones(m.shape[:-1])

        # [*, N_res, N_seq, C_in]
        m = m.transpose(-2, -3)
        mask = mask.transpose(-1, -2)

        # [*, N_res, N_seq, C_in]
        m = self.layer_norm_m(m)
        m = self.global_attention(m=m, mask=mask)

        # [*, N_seq, N_res, C_in]
        m = m.transpose(-2, -3)

        return m
