from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import numpy as np
import torch.nn.functional as F
from torch import nn
import torchvision

from diffusers.configuration_utils import ConfigMixin, register_to_config
# from diffusers.modeling_utils import ModelMixin
from diffusers.models.modeling_utils import ModelMixin
from diffusers.utils import BaseOutput
from diffusers.utils.import_utils import is_xformers_available
# from diffusers.models.attention import CrossAttention, FeedForward
from diffusers.models.attention import Attention as CrossAttention, FeedForward

from einops import rearrange, repeat
import math

from .additional import MotionPredictorV0, MotionPredictorV1, MotionPredictorV2, MotionPredictorV3, MotionPredictorV4, cross_frame_corr, random_swap_motion_vectors, random_swap_motion_vectors_v1, random_swap_motion_vectors_v2
from . import global_utils


def zero_module(module):
    """Set every parameter of ``module`` to zero in place and return the module.

    The update is done outside of autograd tracking, so the parameters keep
    their ``requires_grad`` setting and no graph is recorded.
    """
    with torch.no_grad():
        for param in module.parameters():
            param.zero_()
    return module


@dataclass
class TemporalTransformer3DModelOutput(BaseOutput):
    """Output container with a single ``sample`` tensor field.

    NOTE(review): nothing in the visible part of this file constructs this
    class; ``TemporalTransformer3DModel.forward`` returns a plain tensor.
    """
    # Output tensor; presumably shaped like the module input
    # (batch, channels, frames, height, width) -- TODO confirm against callers.
    sample: torch.FloatTensor


# xformers is an optional dependency: the module-level name ``xformers`` is
# bound to the package when it is available and to ``None`` otherwise.
xformers = None
if is_xformers_available():
    import xformers
    import xformers.ops


def get_motion_module(
    in_channels,
    motion_module_type: str,
    motion_module_kwargs: dict
):
    """Build the motion module selected by ``motion_module_type``.

    Args:
        in_channels: Channel count of the feature map the module operates on.
        motion_module_type: Name of the module variant; only ``"Vanilla"`` is
            supported.
        motion_module_kwargs: Extra keyword arguments forwarded to the module
            constructor.

    Returns:
        A ``VanillaTemporalModule`` instance.

    Raises:
        ValueError: If ``motion_module_type`` is not a supported variant.
    """
    if motion_module_type == "Vanilla":
        return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs)
    # Name the offending value so a config typo is diagnosable from the traceback.
    raise ValueError(
        f"Unsupported motion_module_type {motion_module_type!r}; expected 'Vanilla'."
    )


class VanillaTemporalModule(nn.Module):
    """Thin wrapper that owns a single ``TemporalTransformer3DModel``.

    The per-head width is derived as
    ``in_channels // num_attention_heads // temporal_attention_dim_div``.
    When ``zero_initialize`` is set, the transformer's output projection is
    zeroed so that only the transformer's internal residual path contributes
    at initialization.
    """

    def __init__(
        self,
        in_channels,
        num_attention_heads=8,
        num_transformer_block=2,
        attention_block_types=("Temporal_Self", "Temporal_Self"),
        cross_frame_attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=24,
        temporal_attention_dim_div=1,
        zero_initialize=True,
        add_motion_vector=False
    ):
        super().__init__()

        head_dim = in_channels // num_attention_heads // temporal_attention_dim_div
        transformer = TemporalTransformer3DModel(
            in_channels=in_channels,
            num_attention_heads=num_attention_heads,
            attention_head_dim=head_dim,
            num_layers=num_transformer_block,
            attention_block_types=attention_block_types,
            cross_frame_attention_mode=cross_frame_attention_mode,
            temporal_position_encoding=temporal_position_encoding,
            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
            add_motion_vector=add_motion_vector,
        )
        self.temporal_transformer = transformer

        if zero_initialize:
            transformer.proj_out = zero_module(transformer.proj_out)

    def forward(self, input_tensor, temb, encoder_hidden_states, attention_mask=None, anchor_frame_idx=None):
        """Run the temporal transformer on ``input_tensor``.

        ``temb`` and ``anchor_frame_idx`` are accepted but not used here.
        """
        return self.temporal_transformer(
            input_tensor,
            encoder_hidden_states,
            attention_mask=attention_mask,
        )


class TemporalTransformer3DModel(nn.Module):
    """Stack of ``TemporalTransformerBlock`` layers applied to a 5-D feature map.

    The input is group-normalized, flattened to per-frame token sequences,
    projected to ``num_attention_heads * attention_head_dim`` channels, run
    through the blocks, projected back to ``in_channels`` and added to the
    un-normalized input.
    """

    def __init__(
        self,
        in_channels,
        num_attention_heads,
        attention_head_dim,

        num_layers,
        attention_block_types=("Temporal_Self", "Temporal_Self",),
        dropout=0.0,
        norm_num_groups=32,
        cross_attention_dim=768,
        activation_fn="geglu",
        attention_bias=False,
        upcast_attention=False,

        cross_frame_attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=24,
        add_motion_vector=False,
    ):
        super().__init__()

        inner_dim = num_attention_heads * attention_head_dim

        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
        self.proj_in = nn.Linear(in_channels, inner_dim)

        # Every layer is built from the same configuration.
        block_config = dict(
            dim=inner_dim,
            num_attention_heads=num_attention_heads,
            attention_head_dim=attention_head_dim,
            attention_block_types=attention_block_types,
            dropout=dropout,
            norm_num_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim,
            activation_fn=activation_fn,
            attention_bias=attention_bias,
            upcast_attention=upcast_attention,
            cross_frame_attention_mode=cross_frame_attention_mode,
            temporal_position_encoding=temporal_position_encoding,
            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
            add_motion_vector=add_motion_vector,
        )
        layers = []
        for _ in range(num_layers):
            layers.append(TemporalTransformerBlock(**block_config))
        self.transformer_blocks = nn.ModuleList(layers)

        self.proj_out = nn.Linear(inner_dim, in_channels)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
        """Apply the temporal blocks to a ``(b, c, f, h, w)`` tensor.

        ``attention_mask`` is accepted but not forwarded to the blocks.
        Returns a tensor with the same layout as the input.
        """
        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
        num_frames = hidden_states.shape[2]

        # Fold frames into the batch axis so normalization sees single images.
        frames = rearrange(hidden_states, "b c f h w -> (b f) c h w")
        frame_batch, _, height, width = frames.shape
        skip = frames

        tokens = self.norm(frames)
        channels = tokens.shape[1]
        # (b f, c, h, w) -> (b f, h*w, c): one token per spatial location.
        tokens = tokens.permute(0, 2, 3, 1).reshape(frame_batch, height * width, channels)
        tokens = self.proj_in(tokens)

        for layer in self.transformer_blocks:
            tokens = layer(tokens, encoder_hidden_states=encoder_hidden_states, video_length=num_frames)

        tokens = self.proj_out(tokens)
        tokens = tokens.reshape(frame_batch, height, width, channels).permute(0, 3, 1, 2).contiguous()

        # Residual connection, then restore the separate frame axis.
        return rearrange(tokens + skip, "(b f) c h w -> b c f h w", f=num_frames)


class TemporalTransformerBlock(nn.Module):
    """Sequence of pre-norm ``VersatileAttention`` layers followed by a feed-forward.

    Each entry of ``attention_block_types`` is a name whose substrings and
    suffix select the features of one attention layer (for example a name
    ending in ``"_Cross"`` makes the layer a cross-attention layer).
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        attention_block_types=("Temporal_Self", "Temporal_Self",),
        dropout=0.0,
        norm_num_groups=32,
        cross_attention_dim=768,
        activation_fn="geglu",
        attention_bias=False,
        upcast_attention=False,
        cross_frame_attention_mode=None,
        temporal_position_encoding=False,
        temporal_position_encoding_max_len=24,
        add_motion_vector=False,
    ):
        super().__init__()

        built_attention = []
        built_norms = []

        for block_name in attention_block_types:
            # Decode the feature switches encoded in the block name.
            has_corr = 'CORR' in block_name
            has_trans = "_Trans" in block_name
            is_cross = block_name.endswith("_Cross")
            if add_motion_vector and 'AddM' in block_name:
                motion_variant = block_name.split('AddM')[1].split('_')[0]
            else:
                motion_variant = None
            if is_cross or add_motion_vector:
                layer_cross_dim = cross_attention_dim
            else:
                layer_cross_dim = None

            name_flags = dict(
                attention_mode=block_name.split("_")[0],
                conduct_ST_attention='STAttn' in block_name,
                use_ST_position_encoding='STPos' in block_name,
                add_motion_vector=motion_variant,
                add_corr_constraint=has_corr,
                corr_apply_mean_filtering=has_corr and 'Mean' in block_name,
                corr_apply_local_calculation=has_corr and 'Local' in block_name,
                corr_apply_normalization=has_corr and 'Norm' in block_name,
                corr_apply_intra_frame_ref=has_corr and ('REFIntra' in block_name or 'REFOnlyIntra' in block_name),
                corr_apply_adjacent_frame_ref=has_corr and 'REFFirst' not in block_name and 'REFOnlyIntra' not in block_name,
                corr_apply_first_frame_ref=has_corr and ('REFFirst' in block_name or 'REFBoth' in block_name),
                corr_apply_masking=has_corr and 'MASK' in block_name,
                is_cross_attention=is_cross,
                use_propagation=block_name.endswith("_Prop"),
                use_propagation_with_pad=block_name.endswith("_PropPad"),
                use_propagation_with_adj=block_name.endswith("_PropAdj"),
                align="Align" in block_name,
                use_transformation=has_trans,
                transformation_ref=block_name.split("_Trans")[-1].split("REF")[-1].split("_")[0],
                transformation_deeper=has_trans and "Deep" in block_name,
            )
            built_attention.append(
                VersatileAttention(
                    **name_flags,
                    cross_attention_dim=layer_cross_dim,
                    query_dim=dim,
                    heads=num_attention_heads,
                    dim_head=attention_head_dim,
                    dropout=dropout,
                    bias=attention_bias,
                    upcast_attention=upcast_attention,
                    cross_frame_attention_mode=cross_frame_attention_mode,
                    temporal_position_encoding=temporal_position_encoding,
                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                )
            )
            built_norms.append(nn.LayerNorm(dim))

        # Cross-attention layers are registered under a distinct submodule name;
        # these names become part of the state-dict keys.
        self.attention_blocks = nn.ModuleList()
        assert len(built_attention) == len(attention_block_types)
        for position, (block_name, attention) in enumerate(zip(attention_block_types, built_attention)):
            if block_name.endswith("_Cross"):
                module_name = f"{position}_motion_cross_attn"
            else:
                module_name = f"{position}"
            self.attention_blocks.add_module(module_name, attention)
        self.norms = nn.ModuleList(built_norms)

        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.ff_norm = nn.LayerNorm(dim)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
        """Apply each attention layer and the feed-forward, all with residuals.

        ``encoder_hidden_states`` is only handed to layers that are
        cross-attention layers or have a motion-vector variant configured.
        ``attention_mask`` is accepted but not used.
        """
        for attention, norm in zip(self.attention_blocks, self.norms):
            wants_context = attention.is_cross_attention or attention.add_motion_vector
            context = encoder_hidden_states if wants_context else None
            attended = attention(
                norm(hidden_states),
                encoder_hidden_states=context,
                video_length=video_length,
            )
            hidden_states = attended + hidden_states

        return self.ff(self.ff_norm(hidden_states)) + hidden_states


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence tensor.

    A table ``pe`` of shape ``(1, max_len, d_model)`` is precomputed and
    registered as a buffer: even channels hold sines, odd channels hold
    cosines of ``position * div_term``.

    Args:
        d_model: Number of channels of the encoded tensor. Odd values are
            supported (the last channel is then a sine channel).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions in the precomputed table.
        interpolate: When true, ``forward`` expects a 4-D ``(b, f, d, c)``
            input and linearly stretches the table along the position axis
            by a factor of ``d`` before adding it to the flattened
            ``(b, f*d, c)`` sequence.
    """

    def __init__(
        self,
        d_model,
        dropout = 0.,
        max_len = 24,
        interpolate = False,
    ):
        super().__init__()
        self.interpolate = interpolate
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine channel fewer than sine channels,
        # so drop the surplus frequency; for even d_model the slice is a no-op.
        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional table to ``x`` and apply dropout.

        Without interpolation ``x`` is indexed as ``(batch, length, d_model)``
        and the first ``length`` table rows are added. With interpolation
        ``x`` is ``(b, f, d, c)`` and the result is returned flattened as
        ``(b, f*d, c)``.
        """
        if not self.interpolate:
            x = x + self.pe[:, :x.size(1)]
        else:
            interpolate_scale = x.shape[2]
            x = rearrange(x, "b f d c -> b (f d) c")
            # F.interpolate works on the last axis, so move positions there.
            temp_pos = rearrange(self.pe, "b l d -> b d l")
            temp_pos = torch.nn.functional.interpolate(temp_pos, scale_factor=interpolate_scale, mode='linear')
            temp_pos = rearrange(temp_pos, "b d l -> b l d")
            x = x + temp_pos[:, :x.size(1)]
        return self.dropout(x)


class VersatileAttention(CrossAttention):
    """Temporal attention layer with a set of optional experimental variants.

    On top of the base attention class this layer understands extra keyword
    flags (popped from ``kwargs`` in ``__init__``) that enable, among others:
    spatio-temporal attention, learned motion predictors or random motion
    swapping (``add_motion_vector``), key/value propagation from reference
    frames, reference-frame warping (``use_transformation``) and a
    cross-frame correlation loss (``add_corr_constraint``).

    ``add_motion_vector`` is either ``None`` or a string: ``"V0"``..``"V5*"``
    replace attention with a motion predictor (``to_q``/``to_k``/``to_v`` are
    deleted), while ``"V6*"``/``"V7*"``/``"V8*"`` keep regular attention and
    randomly swap motion vectors during training.
    """

    # ``add_motion_vector`` prefixes that keep the regular attention path.
    _SWAP_MOTION_PREFIXES = ("V6", "V7", "V8")

    def __init__(
            self,
            attention_mode                     = None,
            cross_frame_attention_mode         = None,
            temporal_position_encoding         = False,
            temporal_position_encoding_max_len = 24,
            *args, **kwargs
        ):
        """Consume the layer-specific flags and initialize the base attention.

        All flags listed below are required keyword arguments; they are
        removed from ``kwargs`` before the remainder is forwarded to the base
        class. ``query_dim`` and ``cross_attention_dim`` must also be passed
        by keyword.

        Raises:
            KeyError: If a required flag is missing from ``kwargs``.
            NotImplementedError: If ``use_transformation`` is combined with
                ``transformation_deeper`` (no deeper network is defined).
        """
        # Layer-specific flags; the base class must not see them.
        self.conduct_ST_attention = kwargs.pop("conduct_ST_attention")
        self.use_ST_position_encoding = kwargs.pop("use_ST_position_encoding")
        self.add_motion_vector = kwargs.pop("add_motion_vector")
        self.add_corr_constraint = kwargs.pop("add_corr_constraint")
        self.corr_apply_mean_filtering = kwargs.pop("corr_apply_mean_filtering")
        self.corr_apply_local_calculation = kwargs.pop("corr_apply_local_calculation")
        self.corr_apply_normalization = kwargs.pop("corr_apply_normalization")
        self.corr_apply_intra_frame_ref = kwargs.pop("corr_apply_intra_frame_ref")
        self.corr_apply_adjacent_frame_ref = kwargs.pop("corr_apply_adjacent_frame_ref")
        self.corr_apply_first_frame_ref = kwargs.pop("corr_apply_first_frame_ref")
        self.corr_apply_masking = kwargs.pop("corr_apply_masking")
        self.is_cross_attention = kwargs.pop("is_cross_attention")
        self.use_propagation = kwargs.pop("use_propagation")
        self.use_propagation_with_pad = kwargs.pop("use_propagation_with_pad")
        self.use_propagation_with_adj = kwargs.pop("use_propagation_with_adj")
        self.align = kwargs.pop("align")
        self.use_transformation = kwargs.pop("use_transformation")
        self.transformation_ref = kwargs.pop("transformation_ref")
        self.transformation_deeper = kwargs.pop("transformation_deeper")

        # The text dimension is still needed by some motion predictors even
        # when the attention itself is self-attention.
        cross_attention_dim = kwargs["cross_attention_dim"]
        if not self.is_cross_attention:
            kwargs["cross_attention_dim"] = None
        super().__init__(*args, **kwargs)
        assert attention_mode == "Temporal"
        self.attention_mode = attention_mode

        if temporal_position_encoding:
            self.pos_encoder = PositionalEncoding(
                kwargs["query_dim"],
                dropout=0.,
                max_len=temporal_position_encoding_max_len,
                interpolate=bool(self.use_ST_position_encoding),
            )
        else:
            self.pos_encoder = None

        motion_mode = self.add_motion_vector
        if motion_mode is not None and not motion_mode.startswith(self._SWAP_MOTION_PREFIXES):
            if motion_mode == 'V0':
                self.motion_predictor = MotionPredictorV0(length=16, dim=kwargs["query_dim"])
            elif motion_mode == 'V1':
                self.motion_predictor = MotionPredictorV1(textual_embed_dim=cross_attention_dim, visual_embed_dim=kwargs["query_dim"])
            elif motion_mode.startswith('V2'):
                self.motion_predictor = MotionPredictorV2(visual_embed_dim=kwargs["query_dim"], num_motion_vector=64, motion_vector_length=16, constraint=motion_mode.split('V2')[-1])
            elif motion_mode == 'V3':
                self.motion_predictor = MotionPredictorV3(textual_embed_dim=cross_attention_dim, visual_embed_dim=kwargs["query_dim"])
            elif motion_mode.startswith('V4'):
                self.motion_predictor = MotionPredictorV4(visual_embed_dim=kwargs["query_dim"], num_motion_vector=64, motion_vector_length=16, constraint=motion_mode.split('V4')[-1])
            elif motion_mode.startswith('V5'):
                # V5 reuses the V4 predictor with fewer motion vectors.
                self.motion_predictor = MotionPredictorV4(visual_embed_dim=kwargs["query_dim"], num_motion_vector=8, motion_vector_length=16, constraint=motion_mode.split('V5')[-1])
            else:
                assert False, "Not Implemented."
            # The predictor replaces attention, so the projections are unused.
            del self.to_q
            del self.to_k
            del self.to_v

        if self.use_propagation_with_pad:
            # Learned extra key/value tokens appended to the first-frame tokens.
            self.extra_k = nn.Parameter(torch.randn(1,16,kwargs["query_dim"]))
            self.extra_v = nn.Parameter(torch.randn(1,16,kwargs["query_dim"]))
            self.norm_extra_k = nn.LayerNorm(kwargs["query_dim"])
            self.norm_extra_v = nn.LayerNorm(kwargs["query_dim"])

        if self.is_cross_attention:
            # Start as an identity: forward() adds a residual for cross attention.
            zero_module(self.to_out)

        if self.use_transformation:
            if self.transformation_deeper:
                # No deeper network is defined; fail with a clear message
                # instead of an AttributeError on ``local_motion_net`` below.
                raise NotImplementedError(
                    "transformation_deeper is not implemented: no deeper local_motion_net is defined."
                )
            # Predicts a 2-channel per-pixel offset from a (reference, target) feature pair.
            self.local_motion_net = nn.Sequential(
                nn.Conv2d(2*kwargs["query_dim"], 64, kernel_size=3, padding=1),
                nn.ReLU(True),
                nn.Conv2d(64, 16, kernel_size=3, padding=1),
                nn.ReLU(True),
                nn.Conv2d(16, 4, kernel_size=3, padding=1),
                nn.ReLU(True),
                nn.Conv2d(4, 2, kernel_size=3, padding=1),
            )
            self.identity_affine = torch.tensor([[1, 0, 0], [0, 1, 0]], dtype=self.local_motion_net[-1].bias.dtype)
            # Zero-init the last conv so the predicted offsets start at zero.
            self.local_motion_net[-1].weight.data.zero_()
            self.local_motion_net[-1].bias.data.zero_()

    def extra_repr(self):
        return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"

    def _xformers_attention(self, query, key, value, attention_mask):
        """Run xformers memory-efficient attention on head-batched tensors.

        Batches of 65536 rows or more are processed in slices of 32768 rows
        (presumably to stay below a batch-size limit of the xformers kernel).
        Returns the result with heads merged back via ``batch_to_head_dim``.
        """
        if xformers is None:
            raise ImportError(
                "VersatileAttention requires xformers for memory-efficient attention, but it is not available."
            )

        if query.shape[0] < 65536:
            attended = xformers.ops.memory_efficient_attention(
                query, key, value, attn_bias=attention_mask
            )
            # Some versions of xformers return fp32; cast back to the input dtype.
            attended = attended.to(query.dtype)
        else:
            self.slice_size = 32768
            batch_size_attention, query_tokens, _ = query.shape
            assert batch_size_attention % self.slice_size == 0
            # The per-head width is taken from ``value`` so this path does not
            # rely on a variable bound only in some branches of forward().
            attended = torch.zeros(
                (batch_size_attention, query_tokens, value.shape[-1]), device=query.device, dtype=query.dtype
            )
            for start_idx in range(0, batch_size_attention, self.slice_size):
                end_idx = start_idx + self.slice_size
                attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
                attended_slice = xformers.ops.memory_efficient_attention(
                    query[start_idx:end_idx], key[start_idx:end_idx], value[start_idx:end_idx], attn_bias=attn_mask_slice
                )
                attended[start_idx:end_idx] = attended_slice.to(query.dtype)
        return self.batch_to_head_dim(attended)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
        """Apply the configured temporal attention variant.

        Args:
            hidden_states: Tensor indexed as ``((b f), d, c)`` -- frames folded
                into the batch axis, ``d`` spatial tokens, ``c`` channels.
            encoder_hidden_states: Optional ``(b, n, c)`` conditioning; used
                as keys/values for cross attention and handed to the motion
                predictor when one is configured.
            attention_mask: Optional mask passed to xformers as ``attn_bias``.
            video_length: Number of frames ``f``.

        Returns:
            Tensor indexed as ``((b f), d, c)``.
        """
        batch_size, sequence_length, _ = hidden_states.shape

        motion_mode = self.add_motion_vector
        swaps_motion = motion_mode is not None and motion_mode.startswith(self._SWAP_MOTION_PREFIXES)
        # Regular q/k/v attention runs unless a motion predictor replaces it.
        runs_attention = motion_mode is None or swaps_motion
        spatio_temporal = motion_mode in ('V1', 'V3') or self.conduct_ST_attention

        # Keep the untouched conditioning for the motion predictor. Binding it
        # unconditionally avoids a NameError when a predictor is combined with
        # cross attention.
        # NOTE(review): assumes predictors want the raw (un-repeated) embeddings
        # in that combination too -- confirm against MotionPredictorV*.
        text_hidden_states = encoder_hidden_states
        if self.is_cross_attention:
            residual = hidden_states # no pretrain for cross attention weights, use residual connection
        elif motion_mode is not None:
            # Text is only consumed by the predictor, not by the attention itself.
            encoder_hidden_states = None

        # Attend over frames: every spatial location becomes its own sequence.
        hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)

        if self.use_ST_position_encoding:
            hidden_states = rearrange(hidden_states, "(b d) f c -> b f d c", d=sequence_length)
            hidden_states = self.pos_encoder(hidden_states)
            hidden_states = rearrange(hidden_states, "b (f d) c -> (b d) f c", f=video_length)
        elif self.pos_encoder is not None:
            hidden_states = self.pos_encoder(hidden_states)

        if spatio_temporal: # Spatial-Temporal attention: one sequence over all tokens of all frames
            hidden_states = rearrange(hidden_states, "(b d) f c -> b (d f) c", d=sequence_length)

        if self.is_cross_attention and motion_mode not in ('V1', 'V3'): # cross attention
            encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=sequence_length) if encoder_hidden_states is not None else None

        if self.group_norm is not None:
            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        if not runs_attention:
            motion_adjust = self.motion_predictor(hidden_states, text_hidden_states, video_length=video_length, spatial_seq_length=sequence_length)
            if motion_mode == "V3":
                hidden_states = motion_adjust
            else:
                # Re-scale the predicted motion by the per-sequence statistics.
                hidden_states = torch.mean(hidden_states, dim=1, keepdim=True) + motion_adjust * torch.std(hidden_states, dim=1, keepdim=True)

        if self.use_transformation: # warping reference frames
            assert math.isqrt(sequence_length) ** 2 == sequence_length
            _hidden_states = rearrange(hidden_states, "(b d) f c -> b d f c", d=sequence_length)
            _hidden_states = rearrange(_hidden_states, "b (h w) f c -> b f c h w", h=math.isqrt(sequence_length))
            if self.transformation_ref == "First":
                hidden_states_ref = rearrange(repeat(_hidden_states[:,0,:,:,:], "b c h w -> b f c h w", f=video_length), "b f c h w -> (b f) c h w")
            elif self.transformation_ref == "Prev":
                # Frame 0 has no predecessor and is paired with itself.
                hidden_states_ref = rearrange(torch.cat([_hidden_states[:,0,:,:,:].unsqueeze(1), _hidden_states[:,:-1,:,:,:]], dim=1), "b f c h w -> (b f) c h w")
            else:
                assert False, "Not Implemented."
            target_frames = rearrange(_hidden_states, "b f c h w -> (b f) c h w")
            hidden_states_with_ref = torch.cat([hidden_states_ref, target_frames], dim=1)
            reverse_hidden_states_with_ref = torch.cat([target_frames, hidden_states_ref], dim=1)
            # Both directions go through the network in a single batched call.
            all_hidden_states_with_ref = torch.cat([hidden_states_with_ref, reverse_hidden_states_with_ref], dim=0)

        if self.use_propagation or self.use_propagation_with_pad:
            hidden_states = rearrange(hidden_states, "(b d) f c -> b d f c", d=sequence_length)
            query = self.to_q(rearrange(hidden_states, "b d f c -> b (d f) c", d=sequence_length))
        elif self.use_propagation_with_adj:
            hidden_states = rearrange(hidden_states, "(b d) f c -> b d f c", d=sequence_length)
            query = self.to_q(rearrange(hidden_states, "b d f c -> (b f) d c", d=sequence_length))
        elif runs_attention:
            query = self.to_q(hidden_states)

        if self.added_kv_proj_dim is not None:
            raise NotImplementedError

        if self.use_transformation: # warping reference frames
            all_local_motion = self.local_motion_net(all_hidden_states_with_ref)
            local_motion, reverse_local_motion = torch.chunk(all_local_motion, 2, dim=0) # local motion: target -> ref; reverse_local_motion: ref -> target
            identity = self.identity_affine.to(local_motion.device).view(2,3)
            theta = torch.stack([identity]*hidden_states_ref.shape[0], dim=0)
            # Identity sampling grid displaced by the predicted offsets.
            grid = F.affine_grid(theta, hidden_states_ref.size(), align_corners=False) + rearrange(local_motion, "b c h w -> b h w c")
            transformed_hidden_states = F.grid_sample(hidden_states_ref, grid, align_corners=False)
            if self.training:
                transformed_reverse_local_motion = F.grid_sample(reverse_local_motion, grid, align_corners=False) # local_motion + transformed_reverse_local_motion = 0
                global_utils.update_transformation_consistency_loss(local_motion, transformed_reverse_local_motion)
                # Motion smoothness: differences between neighbouring rows / columns.
                local_motion_spatial_grad_h = local_motion[:,:,:-1,:] - local_motion[:,:,1:,:]
                global_utils.update_transformation_smoothness_loss(local_motion_spatial_grad_h)
                local_motion_spatial_grad_w = local_motion[:,:,:,:-1] - local_motion[:,:,:,1:]
                global_utils.update_transformation_smoothness_loss(local_motion_spatial_grad_w)

        # Self attention: keys/values come from the hidden states themselves.
        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
        if self.use_propagation:
            # Keys/values are taken from frame 0 only.
            key = self.to_k(encoder_hidden_states[:,:,0,:])
            value = self.to_v(encoder_hidden_states[:,:,0,:])
        elif self.use_propagation_with_pad:
            # Frame-0 tokens plus the learned extra tokens.
            tmp_k = torch.cat([encoder_hidden_states[:,:,0,:], self.norm_extra_k(self.extra_k).expand(encoder_hidden_states.shape[0], *self.extra_k.shape[1:])], dim=1)
            key = self.to_k(tmp_k)
            tmp_v = torch.cat([encoder_hidden_states[:,:,0,:], self.norm_extra_v(self.extra_v).expand(encoder_hidden_states.shape[0], *self.extra_v.shape[1:])], dim=1)
            value = self.to_v(tmp_v)
        elif self.use_propagation_with_adj:
            # Each frame attends to frame 0 and to its previous frame
            # (frame 0 uses itself as "previous").
            former_frame_index = torch.arange(encoder_hidden_states.shape[2]) - 1
            former_frame_index[0] = 0
            tmp_kv = torch.cat([encoder_hidden_states[:,:,[0] * encoder_hidden_states.shape[2],:], encoder_hidden_states[:,:,former_frame_index,:]], dim=1)
            tmp_kv = rearrange(tmp_kv, "b d f c -> (b f) d c")
            key = self.to_k(tmp_kv)
            value = self.to_v(tmp_kv)
        elif runs_attention:
            key = self.to_k(encoder_hidden_states)
            value = self.to_v(encoder_hidden_states)

        if self.align and self.training: # replace query and key
            # The second half of the batch reuses queries/keys of the first half.
            tmp_query = torch.chunk(query, 2, dim=0)
            tmp_key = torch.chunk(key, 2, dim=0)
            query = torch.cat([tmp_query[0]]*2, dim=0)
            key = torch.cat([tmp_key[0]]*2, dim=0)

        if runs_attention:
            query = self.head_to_batch_dim(query).contiguous()
            key = self.head_to_batch_dim(key).contiguous()
            value = self.head_to_batch_dim(value).contiguous()

            if attention_mask is not None and attention_mask.shape[-1] != query.shape[1]:
                target_length = query.shape[1]
                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
                attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)

            hidden_states = self._xformers_attention(query, key, value, attention_mask)

        if self.use_transformation: # warping reference frames
            _hidden_states = rearrange(rearrange(transformed_hidden_states, "(b f) c h w -> b (h w) f c", f=video_length), "b d f c -> (b d) f c")
            hidden_states = hidden_states + _hidden_states

        if self.use_propagation or self.use_propagation_with_pad:
            hidden_states = rearrange(hidden_states, "b (d f) c -> (b d) f c", d=sequence_length)
        elif self.use_propagation_with_adj:
            # NOTE: ``batch_size`` is the folded (b f) size, so together with the
            # final rearrange below this leaves the ((b f), d, c) layout intact.
            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", b=batch_size)

        # linear proj
        hidden_states = self.to_out[0](hidden_states)

        # dropout
        hidden_states = self.to_out[1](hidden_states)

        if spatio_temporal:
            hidden_states = rearrange(hidden_states, "b (d f) c -> (b f) d c", f=video_length)
        else:
            if swaps_motion and self.training:
                # The text after the version prefix encodes the swap probability
                # (and for V7 optionally "Perc<percentile>"); empty means "off".
                if motion_mode.startswith('V6'):
                    prob = motion_mode.split('V6')[-1]
                    if prob != '':
                        hidden_states = random_swap_motion_vectors(hidden_states, prob=float(prob))
                elif motion_mode.startswith('V7'):
                    suffix = motion_mode.split('V7')[-1]
                    prob = suffix.split("Perc")[0]
                    if "Perc" in suffix:
                        perc = float(suffix.split("Perc")[-1])
                    else:
                        perc = 0.9
                    if prob != '' and perc < 1.0:
                        hidden_states = random_swap_motion_vectors_v1(hidden_states, prob=float(prob), percentile=perc)
                elif motion_mode.startswith('V8'):
                    prob = motion_mode.split('V8')[-1]
                    if prob != '':
                        hidden_states = random_swap_motion_vectors_v2(hidden_states, prob=float(prob))

            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=sequence_length)

        if self.is_cross_attention:
            hidden_states = hidden_states + residual

        if self.training and self.add_corr_constraint:
            # The correlation is computed on a square (h, w) token grid.
            assert math.isqrt(sequence_length) ** 2 == sequence_length
            corr_calculator = cross_frame_corr(
                apply_mean_filtering=self.corr_apply_mean_filtering,
                apply_local_calculation=self.corr_apply_local_calculation,
                apply_intra_frame_ref=self.corr_apply_intra_frame_ref,
                apply_adjacent_frame_ref=self.corr_apply_adjacent_frame_ref,
                apply_first_frame_ref=self.corr_apply_first_frame_ref,
                apply_masking=self.corr_apply_masking,
            )
            latent_corr = corr_calculator.calculate_corr(
                rearrange(hidden_states, "(b f) (h w) c -> b c f h w", f=video_length, h=math.isqrt(sequence_length))
            )
            global_utils.update_video_corr_loss(latent_corr, self.corr_apply_normalization)

        return hidden_states

