
import torch
import torch.nn as nn
import math
import os
import sys
import torch.nn.functional as F

# Put the directory containing this file at the front of sys.path so that the
# `mra2_kernel` import below can be resolved from there (presumably the
# package lives next to this file -- confirm against the repository layout).
curr_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, curr_path)

from mra2_kernel.attention import mra2_attention

class MrsAttention(nn.Module):
    """Multi-head attention in which every head works at its own resolution.

    Head ``h`` average-pools the masked input in consecutive groups of
    ``group_by_list[h]`` tokens, projects the pooled sequence with its own
    QKV linear layer, runs ``mra2_attention`` on it and repeats every pooled
    output ``group_by_list[h]`` times to get back to the full sequence length.
    The per-head outputs are concatenated and passed through ``ff``.

    Config keys read by the constructor:
        dim, head_dim, num_head, group_by_list, max_seq_len, approx_mode
            Required.
        num_block or block_per_row
            One of the two is required; ``num_block`` wins when both exist.
        initial_prior_first_n_blocks, initial_prior_diagonal_n_blocks
            Optional, default ``0``.
        input_shape
            Optional ``(H, W)``; default ``None``.

    Note:
        ``forward`` produces heads of width ``dim // num_head`` while
        ``combine_heads`` reshapes to ``num_head * head_dim``, so the layer
        only runs when ``head_dim == dim // num_head``.
    """

    def __init__(self, config):
        super().__init__()

        self.dim = config["dim"]
        self.head_dim = config["head_dim"]
        self.num_head = config["num_head"]
        self.group_by_list = config["group_by_list"]

        # One independent QKV projection per head.
        self.qkv = nn.ModuleList(
            [nn.Linear(self.dim, 3 * self.dim // self.num_head) for _ in range(self.num_head)]
        )

        self.ff = nn.Linear(self.num_head * self.head_dim, self.dim)

        if "num_block" in config:
            num_block = config["num_block"]
        elif "block_per_row" in config:
            assert config["max_seq_len"] % 32 == 0, "max_seq_len must be a multiple of 32"
            num_block = (config["max_seq_len"] // 32) * config["block_per_row"]
        else:
            # Without this branch the failure would surface later as an
            # opaque AttributeError on ``self.num_block``.
            raise ValueError("config must contain either 'num_block' or 'block_per_row'")

        # Never request more blocks than the (max_seq_len / 32)^2 grid holds.
        self.num_block = min(num_block, int((config["max_seq_len"] // 32) ** 2))
        self.approx_mode = config["approx_mode"]
        self.initial_prior_first_n_blocks = config["initial_prior_first_n_blocks"] if "initial_prior_first_n_blocks" in config else 0
        self.initial_prior_diagonal_n_blocks = config["initial_prior_diagonal_n_blocks"] if "initial_prior_diagonal_n_blocks" in config else 0
        self.input_shape = config["input_shape"] if "input_shape" in config else None

    def extra_repr(self):
        """Return the attention settings shown in the module's ``repr``."""
        rep = [
            f'num_block = {self.num_block}',
            f'approx_mode = {self.approx_mode}',
            f'initial_prior: first_n_blocks = {self.initial_prior_first_n_blocks}',
            f'initial_prior: diagonal_n_blocks = {self.initial_prior_diagonal_n_blocks}',
            f'input_shape = {self.input_shape}',
        ]
        return "\n".join(rep)

    def forward(self, X, mask):
        """Apply multi-resolution attention.

        Args:
            X: tensor of shape ``(batch, seq_len, dim)``.
            mask: tensor with ``batch * seq_len`` entries indexed as
                ``(batch, seq_len)``; it multiplies ``X`` and is summed to
                count tokens, so it is expected to be 1 for real tokens and
                0 for padding.

        Returns:
            Tensor of shape ``(batch, seq_len, dim)``.
        """
        batch_size, seq_len, dim = X.shape

        if self.input_shape is not None:
            assert len(self.input_shape) == 2, "input_shape must be (H, W)"
            H, W = self.input_shape
            assert H * W == seq_len, "H * W must equal the sequence length"
            assert H % 4 == 0, "H must be a multiple of 4"
            assert W % 8 == 0, "W must be a multiple of 8"

            # Reorder the row-major H x W grid so that the 32 tokens of each
            # 4 x 8 patch become contiguous in the sequence.
            X = X.reshape(batch_size, H // 4, 4, W // 8, 8, dim)
            X = X.permute(0, 1, 3, 2, 4, 5)
            X = X.reshape(batch_size, seq_len, dim)
            mask = mask.reshape(batch_size, H // 4, 4, W // 8, 8)
            mask = mask.permute(0, 1, 3, 2, 4)
            mask = mask.reshape(batch_size, seq_len)
        # Masking input is used to ensure the averaging procedure is done correctly
        X = X * mask[:, :, None]

        head_width = dim // self.num_head
        attn_out = torch.empty(batch_size, self.num_head, seq_len, head_width, device=X.device)
        for h in range(self.num_head):
            group = self.group_by_list[h]
            num_groups = seq_len // group

            # Down sampling mask and input: a group is valid when it holds at
            # least one unmasked token; the input is averaged over the
            # unmasked tokens (the epsilon guards fully masked groups).
            token_count = mask.reshape(batch_size, num_groups, group).sum(dim=-1)
            mask_ = torch.clip(token_count, min=0, max=1)
            X_ = X.reshape(batch_size, num_groups, group, dim).sum(dim=-2) / (token_count[:, :, None] + 1e-6)

            # Calculate downsampled qkv
            qkv_ = self.qkv[h](X_).reshape(batch_size, num_groups, 3, head_width).permute(2, 0, 1, 3).unsqueeze(2)
            # bsz, num_head (= 1 here), seq_len, head_dim
            q_, k_, v_ = qkv_[0], qkv_[1], qkv_[2]
            with torch.cuda.amp.autocast(enabled=False):
                attn_out_ = mra2_attention(
                    q_.float(), k_.float(), v_.float(), mask_.float(), self.num_block,
                    approx_mode=self.approx_mode,
                    initial_prior_first_n_blocks=self.initial_prior_first_n_blocks,
                    initial_prior_diagonal_n_blocks=self.initial_prior_diagonal_n_blocks,
                )
                # Drop only the singleton head axis. A bare squeeze() would
                # also remove the batch axis when batch_size == 1 (or any
                # other size-1 axis) and make repeat_interleave hit the wrong
                # dimension.
                attn_out[:, h, :, :] = attn_out_.squeeze(1).repeat_interleave(group, dim=1)

        attn_out = self.combine_heads(attn_out)

        out = self.ff(attn_out)

        if self.input_shape is not None:
            # Undo the 4 x 8 patch reordering applied on the way in.
            out = out.reshape(batch_size, H // 4, W // 8, 4, 8, dim)
            out = out.permute(0, 1, 3, 2, 4, 5)
            out = out.reshape(batch_size, seq_len, dim)

        return out

    def combine_heads(self, X):
        """Merge ``(batch, num_head, seq, head_dim)`` into ``(batch, seq, num_head * head_dim)``."""
        X = X.transpose(1, 2)
        X = X.reshape(X.size(0), X.size(1), self.num_head * self.head_dim)
        return X

    def split_heads(self, X):
        """Split ``(batch, seq, num_head * head_dim)`` into ``(batch, num_head, seq, head_dim)``."""
        X = X.reshape(X.size(0), X.size(1), self.num_head, self.head_dim)
        X = X.transpose(1, 2)
        return X

class Block(nn.Module):
    """Pre-norm transformer layer: attention sub-layer followed by an MLP sub-layer.

    Both sub-layers are wrapped in a residual connection. Config keys read
    here are ``dim``, ``hidden_dim`` and ``dropout_prob``; the full config is
    also handed to ``MrsAttention``.
    """

    def __init__(self, config):
        super().__init__()

        width = config["dim"]

        # Attention sub-layer.
        self.norm1 = nn.LayerNorm(width)
        self.mha = MrsAttention(config)
        drop_p = config["dropout_prob"]
        self.dropout1 = nn.Dropout(p=drop_p)

        # Feed-forward sub-layer.
        self.norm2 = nn.LayerNorm(width)
        hidden = config["hidden_dim"]
        expand = nn.Linear(width, hidden)
        activation = nn.GELU()
        inner_drop = nn.Dropout(p=drop_p)
        contract = nn.Linear(hidden, width)
        outer_drop = nn.Dropout(p=drop_p)
        self.mlpblock = nn.Sequential(expand, activation, inner_drop, contract, outer_drop)

    def forward(self, X, mask):
        """Run attention then the MLP, adding the residual after each."""
        attended = self.mha(self.norm1(X), mask)
        X = self.dropout1(attended) + X

        transformed = self.mlpblock(self.norm2(X))
        return transformed + X

class Backbone(nn.Module):
    """Stack of ``num_layers`` encoder blocks followed by a final LayerNorm.

    When ``config["shared_weight"]`` is truthy a single ``Block`` is created
    and applied ``num_layers`` times; otherwise ``num_layers`` separate
    blocks are created and applied in order.
    """

    def __init__(self, config):
        super().__init__()

        self.num_layers = config["num_layers"]
        self.shared_weight = config["shared_weight"]

        if not self.shared_weight:
            # Independent parameters for every layer.
            self.encoders = nn.ModuleList([Block(config) for _ in range(self.num_layers)])
        else:
            # One set of parameters reused at every depth.
            self.encoder = Block(config)

        self.norm = nn.LayerNorm(config["dim"])

    def forward(self, X, mask):
        """Apply every layer, normalise, and zero the masked positions."""
        # Build the sequence of layers to apply; with shared weights it is
        # the same block repeated num_layers times.
        layers = [self.encoder] * self.num_layers if self.shared_weight else self.encoders

        hidden = X
        for layer in layers:
            hidden = layer(hidden, mask)

        return self.norm(hidden) * mask[:, :, None]
