# This file is the implementation of ProbSparse attention introduced in 
# "Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting".
# Supported attention pattern: non-causal self-attention.
# The code comes from https://github.com/zhouhaoyi/Informer2020.
from math import sqrt
from typing import Optional, Dict
import warnings

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

from efficient_attention import AbstractAttention, register_cls, add_nested_argument


class ProbAttention(nn.Module):
    """ProbSparse attention core (Informer).

    Only the ``u`` queries with the largest sparsity measurement receive a
    real softmax attention; every other query position keeps a default
    context (mean of the values when non-causal, cumulative sum of the values
    when causal).

    Args:
        causal: If true, a ``ProbMask`` is applied to the selected queries and
            the default context is the cumulative sum of the values.
        factor: Multiplier ``c`` in ``c * ceil(ln(L))`` that sets both the
            number of sampled keys per query and the number of selected
            queries.
        scale: Multiplier applied to the raw scores. ``None`` (or any falsy
            value) selects ``1 / sqrt(D)``.
        attention_dropout: Probability used to build ``self.dropout``.
            NOTE: the dropout module is stored but is not applied anywhere in
            this class; applying it would change numerical results.
        output_attention: If true, ``forward`` also returns a dense attention
            map; otherwise it returns ``None`` in its place.
    """

    def __init__(self, causal=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(ProbAttention, self).__init__()
        self.factor = factor
        self.scale = scale
        self.causal = causal
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def _prob_QK(self, Q, K, sample_k, n_top):
        """Pick the ``n_top`` most "active" queries and score them against all keys.

        Args:
            Q: Queries, ``[B, H, L_Q, E]``.
            K: Keys, ``[B, H, L_K, E]``.
            sample_k: Number of keys sampled (with replacement) per query.
            n_top: Number of queries to keep.

        Returns:
            ``(Q_K, M_top)`` where ``Q_K`` is ``[B, H, n_top, L_K]`` (unscaled
            scores of the kept queries) and ``M_top`` is ``[B, H, n_top]``
            (positions of the kept queries along ``L_Q``).
        """
        B, H, L_K, E = K.shape
        _, _, L_Q, _ = Q.shape
        device = K.device

        # Score every query against its own random subset of keys.
        # `expand` only creates a view; the gather below materialises
        # [B, H, L_Q, sample_k, E].
        K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
        # Sampled with the default (CPU) generator so seeded runs keep the
        # same draws, then moved next to the data.
        index_sample = torch.randint(L_K, (L_Q, sample_k)).to(device)
        query_pos = torch.arange(L_Q, device=device).unsqueeze(1)
        K_sample = K_expand[:, :, query_pos, index_sample, :]
        Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze(-2)

        # Sparsity measurement: max sampled score minus (sum of sampled scores / L_K).
        M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K)
        M_top = M.topk(n_top, sorted=False)[1]

        # Full scores only for the kept queries.
        batch_idx = torch.arange(B, device=device)[:, None, None]
        head_idx = torch.arange(H, device=device)[None, :, None]
        Q_reduce = Q[batch_idx, head_idx, M_top, :]
        Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1))

        return Q_K, M_top

    def _get_initial_context(self, V, L_Q):
        """Build the default context used for queries that are not selected.

        Args:
            V: Values, ``[B, H, L_V, D]``.
            L_Q: Number of query positions.

        Returns:
            ``[B, H, L_Q, D]`` tensor: the mean of ``V`` repeated for every
            query (non-causal) or the cumulative sum of ``V`` (causal).

        Raises:
            AssertionError: In causal mode when ``L_Q != L_V``.
        """
        B, H, L_V, D = V.shape
        if not self.causal:
            V_mean = V.mean(dim=-2)
            # clone() so the result owns its memory and can be written in place.
            context = V_mean.unsqueeze(-2).expand(B, H, L_Q, V_mean.shape[-1]).clone()
        else:
            # The cumulative sum has one row per value, so it is only a valid
            # context when queries and values have the same length.
            assert (L_Q == L_V)
            context = V.cumsum(dim=-2)
        return context

    def _update_context(self, context_in, V, scores, index, L_Q):
        """Overwrite the context rows of the selected queries with real attention.

        Args:
            context_in: Default context, ``[B, H, L_Q, D]``; modified in place.
            V: Values, ``[B, H, L_V, D]``.
            scores: Scaled scores of the selected queries, ``[B, H, u, L_V]``;
                masked in place in causal mode.
            index: Positions of the selected queries, ``[B, H, u]``.
            L_Q: Number of query positions.

        Returns:
            ``(context_in, attns)``. ``attns`` is ``None`` unless
            ``output_attention`` is set, in which case it is a
            ``[B, H, L_Q, L_V]`` map holding the softmax rows of the selected
            queries and the uniform value ``1 / L_V`` everywhere else.
        """
        B, H, L_V, D = V.shape
        device = V.device

        if self.causal:
            attn_mask = ProbMask(B, H, L_Q, index, scores, device=device)
            scores.masked_fill_(attn_mask.mask, -np.inf)

        attn = torch.softmax(scores, dim=-1)

        batch_idx = torch.arange(B, device=device)[:, None, None]
        head_idx = torch.arange(H, device=device)[None, :, None]
        context_in[batch_idx, head_idx, index, :] = torch.matmul(attn, V).type_as(context_in)

        if not self.output_attention:
            return (context_in, None)

        # One row per *query* position (L_Q), one column per key/value (L_V).
        attns = torch.full((B, H, L_Q, L_V), 1.0 / L_V, dtype=attn.dtype, device=attn.device)
        attns[batch_idx, head_idx, index, :] = attn
        return (context_in, attns)

    def forward(self, query, key, value):
        """Apply ProbSparse attention.

        Args:
            query: ``[B, L_Q, H, D]``.
            key: ``[B, L_K, H, D]``.
            value: ``[B, L_K, H, D_v]``.

        Returns:
            ``(context, attn)``: ``context`` is ``[B, L_Q, H, D_v]``; ``attn``
            is ``[B, H, L_Q, L_K]`` when ``output_attention`` is set, else
            ``None``.
        """
        B, L_Q, H, D = query.shape
        _, L_K, _, _ = key.shape

        # Work in [B, H, L, D] layout.
        query = query.transpose(2, 1)
        key = key.transpose(2, 1)
        value = value.transpose(2, 1)

        U_part = self.factor * np.ceil(np.log(L_K)).astype('int').item()  # c*ln(L_k)
        u = self.factor * np.ceil(np.log(L_Q)).astype('int').item()  # c*ln(L_q)

        # Clamp to [1, L]: ln(1) == 0 would otherwise sample zero keys (and
        # fail on the max over an empty dimension) or select zero queries.
        U_part = max(1, min(U_part, L_K))
        u = max(1, min(u, L_Q))

        scores_top, index = self._prob_QK(query, key, sample_k=U_part, n_top=u)

        # Scale the scores; fall back to 1/sqrt(D) when no scale was given.
        scale = self.scale or 1. / sqrt(D)
        scores_top = scores_top * scale

        # Default context for every query, then real attention for the selected ones.
        context = self._get_initial_context(value, L_Q)
        context, attn = self._update_context(context, value, scores_top, index, L_Q)

        return context.transpose(2, 1).contiguous(), attn


@register_cls
class ProbSparse(AbstractAttention):
    r"""Multi-head ProbSparse attention layer (non-causal self-attention only).

    Projects the inputs to per-head queries/keys/values, runs
    ``ProbAttention`` on them and projects the result back to ``embed_dim``.

    Args:
        embed_dim: Total dimension of the model.
        num_heads: Number of parallel attention heads.
        factor: The sampling factor, forwarded to ``ProbAttention``. Default: ``5``.
        scale: Score multiplier forwarded to ``ProbAttention``. Default: ``None``.
        dropout: Forwarded to ``ProbAttention`` as ``attention_dropout``. Default: ``0.1``.
        output_attention: Whether the inner attention also returns an attention
            map. Default: ``False``.
        d_keys: Per-head feature size for queries and keys. A falsy value
            selects ``embed_dim // num_heads``. Default: ``None``.
        d_values: Per-head feature size for values. A falsy value selects
            ``embed_dim // num_heads``. Default: ``None``.
        mix: If true, the head and sequence axes of the attention output are
            swapped before the heads are flattened. Default: ``False``.
        **kwargs: Forwarded to ``AbstractAttention.__init__``. This class reads
            ``self.causal``, ``self.cross`` and ``self.name`` afterwards, which
            are presumably set there (verify against ``AbstractAttention``).

    Usage:

    attn = ProbSparse(embed_dim=embed_dim, num_heads=num_heads, causal=False, factor=factor, dropout=dropout, output_attention=False, mix=mix)

    result, _ = attn(query, batch_first=batch_first)

    """

    def __init__(self, embed_dim, num_heads,
                 factor=5, scale=None, dropout=0.1, output_attention=False,
                 d_keys=None, d_values=None, mix=False, **kwargs):
        super(ProbSparse, self).__init__(**kwargs)
        # Only the non-causal self-attention pattern is supported by this layer.
        assert self.causal == False, f"{self.name.split('.')[0]} cannot do causal attention now"
        assert self.cross == False, f"{self.name.split('.')[0]} cannot do cross attention now"

        d_keys = d_keys or (embed_dim // num_heads)
        d_values = d_values or (embed_dim // num_heads)

        self.inner_attention = ProbAttention(causal=self.causal,
                                             factor=factor,
                                             scale=scale,
                                             attention_dropout=dropout,
                                             output_attention=output_attention)
        self.query_projection = nn.Linear(embed_dim, d_keys * num_heads)
        self.key_projection = nn.Linear(embed_dim, d_keys * num_heads)
        self.value_projection = nn.Linear(embed_dim, d_values * num_heads)
        self.out_projection = nn.Linear(d_values * num_heads, embed_dim)
        self.n_heads = num_heads
        self.mix = mix

    def forward(self, query,
                key=None,
                value=None,
                attn_mask=None,
                key_padding_mask: Optional[Tensor] = None,
                query_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True,
                need_head_weights: bool = False,
                batch_first=False,
                static_kv: bool = False,
                incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
                **kwargs):
        """Run ProbSparse attention on ``query`` (and optional ``key``/``value``).

        Args:
            query: ``[L, B, embed_dim]``, or ``[B, L, embed_dim]`` when
                ``batch_first`` is true.
            key: Same layout as ``query``; defaults to ``query``.
            value: Same layout as ``query``; defaults to ``key``.
            attn_mask: Not supported. A warning is emitted when it is given and
                the mask is otherwise ignored.
            batch_first: Whether the inputs (and the output) put the batch axis first.
            key_padding_mask, query_padding_mask, need_weights,
            need_head_weights, static_kv, incremental_state, **kwargs:
                Accepted but not used by this method.

        Returns:
            ``(output, attn)``: ``output`` has the layout of ``query`` with
            last dimension ``embed_dim``; ``attn`` is whatever the inner
            ``ProbAttention`` returns as its second value.
        """
        if attn_mask is not None:
            warnings.warn("ProbSparse does not support attn_mask")

        if key is None:
            key = query
        if value is None:
            value = key

        # The inner attention works on batch-first tensors.
        if not batch_first:
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)

        B, L, _ = query.shape
        _, S, _ = key.shape
        H = self.n_heads

        # Project and split the feature axis into heads: [B, L, H, d].
        query = self.query_projection(query).view(B, L, H, -1)
        key = self.key_projection(key).view(B, S, H, -1)
        value = self.value_projection(value).view(B, S, H, -1)

        out, attn = self.inner_attention(
            query,
            key,
            value
        )
        if self.mix:
            # Swap to [B, H, L, d] so the flatten below mixes heads and positions.
            out = out.transpose(2, 1).contiguous()
        out = out.view(B, L, -1)

        if not batch_first:
            out = out.transpose(0, 1).contiguous()
        return self.out_projection(out), attn

    @staticmethod
    def add_attn_specific_args(parent_parser):
        """Register this layer's command-line options on ``parent_parser``.

        Options from the parent class are added first when the parent defines
        ``add_attn_specific_args``. Returns ``parent_parser``.
        """
        if hasattr(super(ProbSparse, ProbSparse), "add_attn_specific_args"):
            parent_parser = super(ProbSparse, ProbSparse).add_attn_specific_args(parent_parser)
        parser = parent_parser.add_argument_group("Attention")
        add_nested_argument(parser, '--factor', default=None, type=int)
        add_nested_argument(parser, '--scale', default=None, type=float)
        add_nested_argument(parser, '--mask-flag', action='store_true', default=False)
        add_nested_argument(parser, '--output-attention', action='store_true', default=False)
        add_nested_argument(parser, '--mix', action='store_true', default=False)
        add_nested_argument(parser, '--d-keys', default=False, type=int)
        add_nested_argument(parser, '--d-values', default=False, type=int)
        return parent_parser


class ProbMask:
    """Causal mask restricted to a chosen subset of query positions.

    A strictly-upper-triangular boolean matrix of shape
    ``[L, scores.shape[-1]]`` is built (``True`` where the column index is
    greater than the row index), and the rows named by ``index`` are gathered
    for every batch entry and head. The result is reshaped to ``scores.shape``
    and exposed through the ``mask`` property.

    Args:
        B: Batch size.
        H: Number of heads.
        L: Number of query positions (rows of the full mask).
        index: ``[B, H, u]`` integer tensor of selected row positions.
        scores: Tensor whose shape the mask must match; its last dimension
            gives the number of mask columns.
        device: Device the mask is placed on.
    """

    def __init__(self, B, H, L, index, scores, device="cpu"):
        n_cols = scores.shape[-1]

        # True strictly above the diagonal.
        upper = torch.ones(L, n_cols, dtype=torch.bool).to(device).triu(1)
        upper_bh = upper[None, None, :].expand(B, H, L, n_cols)

        # Gather the selected rows for each (batch, head) pair.
        batch_sel = torch.arange(B)[:, None, None]
        head_sel = torch.arange(H)[None, :, None]
        selected_rows = upper_bh[batch_sel, head_sel, index, :].to(device)

        self._mask = selected_rows.view(scores.shape).to(device)

    @property
    def mask(self):
        """Boolean mask with the same shape as the ``scores`` given at construction."""
        return self._mask

