
import torch
import torch.nn as nn
import torch.nn.functional as F
import ot

import torch
from torch import nn
import math
from utils_EBW import HybridEBSW, EBSW, kernel_SW, bias_kernel_SW

# from tools.random_prj import sliced_Wasserstein
from typing import Tuple

def rotate_half(x):
    """Swap the two halves of the last dimension and negate the moved-up half.

    The last dimension is split into two chunks ``(first, second)`` and the
    result is ``(-second, first)`` concatenated along that same dimension.
    """
    first, second = x.chunk(2, dim=-1)
    rotated = torch.cat((-second, first), dim=-1)
    return rotated



@torch.jit.script
def apply_rotary_pos_emb(x, cos, sin):
    """Append a rotary-position-encoded copy of ``x`` to ``x`` itself.

    Args:
        x: input tensor; positions are read from dim ``-2`` and features
            from dim ``-1``.
        cos: cosine table, indexed as ``[:, :, position, feature]``.
        sin: sine table with the same layout as ``cos``.

    Returns:
        ``x`` concatenated along the last dimension with
        ``x * cos + rotate_half(x) * sin``, so the last dimension doubles.
    """
    # NOTE: This could probably be moved to Triton

    # The tables may cover more positions than the input has; keep only the
    # first ``x.shape[-2]`` of them.
    seq_len = x.shape[-2]
    cos = cos[:, :, :seq_len, :]
    sin = sin[:, :, :seq_len, :]

    # ``x`` is only read here, so no defensive copy is required.
    r_position = (x * cos) + (rotate_half(x) * sin)

    # Broadcasting against the 4-D tables can prepend singleton dims to the
    # result. Restore x's own shape explicitly instead of squeezing dim 0
    # twice, which would also strip genuine size-1 leading dims of ``x``
    # (e.g. a batch of one) and make the concatenation fail.
    return torch.cat([x, r_position.reshape(x.shape)], dim=-1)


class RotaryEmbedding(torch.nn.Module):
    """
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox

    In this file, ``forward`` hands the input and the cached cos/sin tables to
    ``apply_rotary_pos_emb`` and returns whatever that function returns (a
    single tensor).

    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
        (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
    """

    def __init__(self, dim_model: int, *_, **__):
        """Build the inverse-frequency buffer for a ``dim_model``-wide table.

        Extra positional and keyword arguments are accepted and ignored.
        """
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim_model, 2).float() / dim_model))
        self.register_buffer("inv_freq", inv_freq)

        # cos/sin tables are built lazily on first use and rebuilt whenever
        # the sequence length, device or dtype of the input changes.
        self._seq_len_cached = None
        self._cos_cached = None
        self._sin_cached = None

    def _update_cos_sin_tables(self, x, seq_dimension=1):
        """Return ``(cos, sin)`` tables matching ``x``, rebuilding if stale.

        Args:
            x: tensor whose ``seq_dimension`` gives the number of positions
                and whose device/dtype the tables must match.
            seq_dimension: index of the sequence dimension in ``x``.

        Returns:
            Two tensors of shape ``(1, 1, seq_len, 2 * len(inv_freq))`` in
            ``x``'s dtype.
        """
        seq_len = x.shape[seq_dimension]

        # Rebuild when nothing is cached yet, when the sequence length has
        # changed, or when we are on a new device/dtype (possibly due to
        # tracing for instance). The explicit None checks keep a previously
        # interrupted rebuild from crashing on ``None.device``.
        stale = (
            self._cos_cached is None
            or self._sin_cached is None
            or seq_len != self._seq_len_cached
            or self._cos_cached.device != x.device
            or self._cos_cached.dtype != x.dtype
        )
        if stale:
            positions = torch.arange(seq_len, device=x.device, dtype=torch.float32)
            inv_freq = self.inv_freq.to(device=x.device, dtype=x.dtype)
            freqs = torch.einsum("i,j->ij", positions, inv_freq)
            # Duplicate the angles so the table spans both halves of the
            # feature dimension.
            emb = torch.cat((freqs, freqs), dim=-1)

            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
            # Record the length last, so the cache key is only updated once
            # both tables actually exist.
            self._seq_len_cached = seq_len

        return self._cos_cached, self._sin_cached

    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Apply ``apply_rotary_pos_emb`` to ``t`` using tables sized for dim ``-2``."""
        cos, sin = self._update_cos_sin_tables(t, seq_dimension=-2)
        return apply_rotary_pos_emb(t, cos, sin)

class OTLossKernel(nn.Module):
    """Batch-averaged ``kernel_SW`` loss between paired audio and text embeddings.

    Each audio/text pair is passed through a shared ``RotaryEmbedding`` and
    compared with ``kernel_SW`` under uniform weights; the per-pair losses
    are averaged over the batch.
    """

    def __init__(self, epsilon=0.1, use_pos=False, pos_dim=1024):
        """Store the settings and build the positional encoder.

        Args:
            epsilon: stored as ``self.epsilon``; not read by ``forward``.
            use_pos: stored as ``self.use_position``; not read by ``forward``.
            pos_dim: ``dim_model`` handed to ``RotaryEmbedding``.
        """
        super().__init__()
        self.epsilon = epsilon
        self.use_position = use_pos
        self.pos_enc = RotaryEmbedding(pos_dim)

    @staticmethod
    def _drop_leading_singletons(feats):
        """Strip up to two leading size-1 dims without going below rank 2."""
        # An unconditional ``squeeze(0)`` would flatten a length-1 sequence
        # ``(1, F)`` into a vector, and the weights below would then be
        # sized by the feature dimension instead of the number of points.
        for _ in range(2):
            if feats.dim() > 2 and feats.shape[0] == 1:
                feats = feats.squeeze(0)
        return feats

    @staticmethod
    def _uniform_weights(feats):
        """Return equal weights summing to one, one per row of ``feats``."""
        n_points = feats.shape[0]
        # Allocate directly on the target device to avoid a per-sample
        # host allocation followed by a device copy.
        return torch.ones(n_points, device=feats.device) / n_points

    def forward(self, audio_embs, audio_masks, text_embs, text_masks):
        """Compute the mean per-pair ``kernel_SW`` loss.

        Args:
            audio_embs: batch of audio embeddings, indexed along dim 0.
                Presumably ``(batch, frames, dim)`` - TODO confirm.
            audio_masks: accepted for interface compatibility; currently unused.
            text_embs: batch of text embeddings, indexed along dim 0.
            text_masks: accepted for interface compatibility; currently unused.

        Returns:
            Sum of the per-pair losses divided by ``audio_embs.size(0)``.

        Raises:
            ZeroDivisionError: if the batch is empty.
        """
        batch_size = audio_embs.size(0)
        total_loss = 0

        for i in range(batch_size):
            # NOTE(review): the masks are not applied, so every position of
            # each sample (including any padding) takes part in the loss.
            aud = self._drop_leading_singletons(self.pos_enc(audio_embs[i]))
            text = self._drop_leading_singletons(self.pos_enc(text_embs[i]))

            a = self._uniform_weights(aud)
            b = self._uniform_weights(text)

            # EBSW, HybridEBSW and bias_kernel_SW are imported at file level
            # as alternative losses; kernel_SW is the one in use.
            loss = kernel_SW(X=aud, Y=text, a=a, b=b, L=10, p=2, gamma=1.5)
            total_loss += loss

        return total_loss / batch_size
 