# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
from functools import partial
import numpy as np
import pdb
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.init import trunc_normal_
from IPython.core.debugger import set_trace


class IdentityMap(nn.Module):
    """Pass-through resampler: ``forward`` hands its first argument back untouched."""

    def __init__(self, hiiden, **kwargs):
        # ``hiiden`` and any keyword arguments are accepted but never read here.
        super().__init__()

    def forward(self, x, *args, **kwargs):
        """Return ``x`` as-is; extra positional/keyword arguments are ignored."""
        return x

    @property
    def config(self):
        """Configuration dict that tags this module as the identity resampler."""
        resampler_config = {"mm_resampler_type": 'identity'}
        return resampler_config

def get_abs_pos(abs_pos, tgt_size):
    """Resample a square grid of position embeddings to ``tgt_size``.

    ``abs_pos`` is treated as ``(S*S, C)`` with ``S = int(sqrt(abs_pos.size(0)))``.
    It is reshaped to a ``(1, C, S, S)`` image, bicubically interpolated in
    float32 to ``(tgt_size[0], tgt_size[1])``, and flattened back to
    ``(tgt_size[0] * tgt_size[1], C)`` in the input's original dtype.
    """
    orig_dtype = abs_pos.dtype
    side = int(math.sqrt(abs_pos.size(0)))

    # (S*S, C) -> (1, C, S, S); interpolation is done in float32.
    grid = abs_pos.float().reshape(1, side, side, -1).permute(0, 3, 1, 2)
    resized = F.interpolate(
        grid,
        size=(tgt_size[0], tgt_size[1]),
        mode="bicubic",
        align_corners=False,
    )

    # (1, C, H, W) -> (H*W, C), then back to the caller's dtype.
    tokens = resized.permute(0, 2, 3, 1).flatten(0, 2)
    return tokens.to(dtype=orig_dtype)


def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    Build a 2D sin-cos positional embedding for a square grid.

    grid_size: int of the grid height and width
    cls_token: when True, one all-zero row is appended AFTER the grid rows
    return:
    pos_embed: [grid_size*grid_size, embed_dim], or
               [grid_size*grid_size+1, embed_dim] when cls_token is True
    """
    axis_coords = np.arange(grid_size, dtype=np.float32)
    # meshgrid receives the width coordinates first, so grid[0] varies along w.
    mesh = np.meshgrid(axis_coords, axis_coords)
    grid = np.stack(mesh, axis=0).reshape([2, 1, grid_size, grid_size])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if not cls_token:
        return pos_embed

    extra_row = np.zeros([1, embed_dim])
    return np.concatenate([pos_embed, extra_row], axis=0)


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode a two-channel coordinate grid as sin-cos embeddings.

    embed_dim: total output dimension (must be even)
    grid: indexable with 0 and 1; each entry holds the coordinates for one axis
    out: (H*W, D) -- first D/2 columns encode grid[0], last D/2 encode grid[1]
    """
    assert embed_dim % 2 == 0

    half_dim = embed_dim // 2
    # One 1D embedding per grid channel, each of shape (H*W, D/2).
    per_axis = [
        get_1d_sincos_pos_embed_from_grid(half_dim, grid[axis])
        for axis in (0, 1)
    ]
    return np.concatenate(per_axis, axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sin-cos embedding of scalar positions.

    embed_dim: output dimension for each position (must be even)
    pos: array of positions to be encoded; flattened to size (M,)
    out: (M, D) -- columns [0, D/2) are sines, columns [D/2, D) are cosines
    """
    assert embed_dim % 2 == 0

    half_dim = embed_dim // 2
    exponents = np.arange(half_dim, dtype=np.float32)
    exponents /= embed_dim / 2.
    inv_freq = 1. / 10000 ** exponents  # (D/2,)

    flat_pos = pos.reshape(-1)  # (M,)
    # Outer product via broadcasting: angle[m, d] = pos[m] * inv_freq[d].
    angles = flat_pos[:, None] * inv_freq[None, :]  # (M, D/2)

    return np.concatenate((np.sin(angles), np.cos(angles)), axis=1)  # (M, D)


class sa_perceiver(nn.Module):
    """
    A 2D perceiver-resampler network with two stacked cross-attention layers.

    The incoming query tokens (a 24x24 grid) plus one extra learnable query
    token attend over the concatenation of the queries and the key/value
    features. A frozen 2D sin-cos positional embedding is added to the
    queries before the first attention layer.

    Outputs:
        A tensor with the shape of (batch, int(24*scale)**2 + 1, embed_dim)
    """

    def __init__(
            self,
            grid_size,
            embed_dim,
            num_heads,
            kv_dim=None,
            norm_layer=partial(nn.LayerNorm, eps=1e-6),
            max_num_patches=30
    ):
        """
        Args:
            grid_size: side length of the stored sin-cos positional grid.
            embed_dim: model width; overridden by ``kv_dim`` whenever
                ``kv_dim`` is given (see below).
            num_heads: number of heads for both attention layers.
            kv_dim: width of the incoming features. Falls back to
                ``embed_dim`` when None.
            norm_layer: factory for the normalization layers.
            max_num_patches: accepted but not used by this class.
        """
        super().__init__()
        # The original code forces the model width to equal kv_dim. With the
        # default kv_dim=None that left embed_dim as None and crashed, so fall
        # back to the caller-supplied embed_dim in that case.
        if kv_dim is None:
            kv_dim = embed_dim
        embed_dim = kv_dim

        self.num_queries = 1
        self.grid_size = grid_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Frozen positional table: grid_size**2 rows for the grid plus one
        # trailing all-zero row used for the extra learnable query token.
        self.pos_embed = nn.Parameter(
            torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=True)).to(torch.bfloat16)
        ).requires_grad_(False)

        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim), requires_grad=True)
        trunc_normal_(self.query, std=.02)

        # First cross-attention layer.
        self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
        self.q_proj = nn.Linear(kv_dim, embed_dim, bias=False)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.ln_q = norm_layer(embed_dim)
        self.ln_kv = norm_layer(embed_dim)

        # Second cross-attention layer.
        self.q_proj2 = nn.Sequential(nn.LayerNorm(kv_dim), nn.Linear(kv_dim, embed_dim))
        self.kv_proj2 = nn.Sequential(nn.LayerNorm(kv_dim), nn.Linear(kv_dim, embed_dim))
        self.attn2 = nn.MultiheadAttention(embed_dim, num_heads)
        self.ln_q2  = norm_layer(embed_dim)
        self.ln_kv2 = norm_layer(embed_dim)
        # NOTE: ln_mid and mlp are not used by forward(); they are kept so the
        # module's parameter names (state_dict keys) stay unchanged.
        self.ln_mid = norm_layer(embed_dim)
        self.mlp = nn.Sequential(nn.Linear(embed_dim, 2048), nn.SiLU(), nn.Linear(2048, embed_dim))

        self.ln_post = norm_layer(embed_dim)

    def _resize_query_grid(self, tokens, batch, scale):
        """Bilinearly resize the leading 24x24 grid tokens; keep the last token as-is."""
        # NOTE: the 24x24 layout is hard-coded, so ``tokens`` must hold exactly
        # 576 grid tokens followed by one extra token.
        base_side = 24
        new_side = int(base_side * scale)

        grid = tokens[:, :-1, :].permute(0, 2, 1).view(batch, self.embed_dim, base_side, base_side)
        grid = F.interpolate(grid, size=(new_side, new_side), mode='bilinear', align_corners=False)
        grid = grid.view(batch, self.embed_dim, -1).permute(0, 2, 1)
        return torch.cat([grid, tokens[:, -1:, :]], dim=1)

    def forward(self, q, x, attn_mask, tgt_size=(24,24), ratio=None, scale=1):
        """
        Args:
            q: query tokens, 3-D ``(batch, 576, dim)`` (576 = 24*24 is required
                by the grid reshape below).
            x: key/value features, concatenated after ``q`` along dim 1.
            attn_mask: per-sample mask over the tokens of ``x``, concatenated
                along dim 1 after an all-False block covering ``q`` --
                presumably boolean with True meaning "do not attend", following
                nn.MultiheadAttention's convention; confirm against the caller.
            tgt_size: (h, w) the stored positional grid is interpolated to;
                h*w must match the number of query tokens.
            ratio: accepted but not used.
            scale: factor applied to the 24x24 query grid side before attention.

        Returns:
            Tensor of shape ``(batch, int(24*scale)**2 + 1, dim)``.
        """
        self.pos_embed.requires_grad_(False)

        assert len(x.shape) > 2

        # Repair the positional table BEFORE anything is derived from it;
        # otherwise the NaNs would still reach the queries through q_pos_embed.
        if torch.isnan(self.pos_embed).any():
            # some init error
            self.pos_embed = nn.Parameter(
                torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, self.grid_size, cls_token=True)).to(torch.bfloat16).to(x.device)
            ).requires_grad_(False)

        pos_embed = get_abs_pos(self.pos_embed[:-1].detach(), tgt_size).detach()
        q_pos_embed = torch.cat([pos_embed, self.pos_embed[-1:].detach()], dim=0).detach()

        # Keep a handle on the untouched input queries for the final shape check.
        q_in = q

        # Keys/values are the queries followed by the features; the query part
        # of the mask is all False.
        x = torch.cat([q, x], dim=1)
        attn_mask = torch.cat([torch.zeros(q.shape[0], q.shape[1]).bool().to(x.device), attn_mask], dim=1)

        x = self.kv_proj(x)
        x = self.ln_kv(x).permute(1, 0, 2)  # (tokens, batch, dim)

        N = x.shape[1]  # batch size (x is sequence-first at this point)
        q = torch.cat([q, self.query.unsqueeze(0).repeat(N,1,1)], dim=1)
        residual = q
        q = self.q_proj(q)

        q = self.ln_q(q)
        q += q_pos_embed.unsqueeze(0)

        # Interpolate the grid part of q and residual to int(24*scale)**2
        # tokens (e.g. 36*36 for scale=1.5); the extra token is left untouched.
        q = self._resize_query_grid(q, N, scale)
        residual = self._resize_query_grid(residual, N, scale)

        dtype = self.kv_proj.weight.dtype

        # (batch, keys) -> (batch * heads, queries, keys)
        attn_mask = attn_mask.unsqueeze(1).unsqueeze(1).repeat(1,self.num_heads, q.shape[1], 1)
        attn_mask = attn_mask.flatten(0,1)

        out = self.attn(
            q.transpose(0,1).to(dtype),
            x.to(dtype),
            x.to(dtype),
            attn_mask=attn_mask)[0]
        out = out.permute(1, 0, 2)
        out = out + residual

        # Second layer: the refreshed queries replace the first q.shape[1]
        # key/value tokens, so the key count (and the mask) stays the same.
        q = residual = out
        x = torch.cat([q, x[q.shape[1]:,:,:].permute(1,0,2)], dim=1)
        q = self.q_proj2(q)
        x = self.kv_proj2(x)
        q = self.ln_q2(q)
        x = self.ln_kv2(x).permute(1, 0, 2)
        out = self.attn2(
            q.transpose(0,1).to(dtype),
            x.to(dtype),
            x.to(dtype),
            attn_mask=attn_mask)[0]
        out = out.permute(1, 0, 2)
        out = out + residual

        # Sanity check: resized grid tokens plus the one extra query token.
        # For scale == 1 this equals the original ``v_N + 1`` check.
        v_B, v_N, v_D = q_in.shape
        out_side = int(24 * scale)
        assert out.shape == (v_B, out_side * out_side + 1, v_D)

        x = self.ln_post(out)
        return x

    def _repeat(self, query, N: int):
        """Insert a new dim 1 into ``query`` and repeat it ``N`` times along it."""
        return query.unsqueeze(1).repeat(1, N, 1)
    
