# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Any, Dict, List, Optional, Tuple, Union
from einops import rearrange, repeat

import torch
import torch.nn as nn

from diffusers.configuration_utils import ConfigMixin, register_to_config
from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
from diffusers.utils import (
    USE_PEFT_BACKEND,
    logging,
    scale_lora_layers,
    unscale_lora_layers,
)
from diffusers.models.attention import AttentionMixin
from diffusers.models.cache_utils import CacheMixin
from diffusers.models.modeling_outputs import Transformer2DModelOutput
from diffusers.models.modeling_utils import ModelMixin
from diffusers.models.normalization import FP32LayerNorm
from diffusers.models.controlnet import zero_module
from diffusers.models.transformers.transformer_wan import (
    WanRotaryPosEmbed,
    WanTimeTextImageEmbedding,
    WanTransformerBlock,
)
from diffusers.models.transformers.transformer_wan_vace import WanVACETransformerBlock

from mmhug.models.custom_transformers.wan22_audio.audio_pack import AudioPack
from mmhug.registry import HF_MODELS

# Module-level logger obtained from the diffusers logging utilities; used by
# `forward` to warn about an ineffective LoRA `scale` argument.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@HF_MODELS.register_module(force=True)
class AudiopackWanVACETransformer3DModelV2(
    ModelMixin,
    ConfigMixin,
    PeftAdapterMixin,
    FromOriginalModelMixin,
    CacheMixin,
    AttentionMixin,
):
    r"""
    A Wan VACE video Transformer with an additional audio conditioning branch.

    Audio features are packed by `AudioPack`, projected to the transformer width and added to the VACE control
    tokens before they enter the VACE blocks.

    Args:
        patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
        num_attention_heads (`int`, defaults to `40`):
            The number of attention heads. The transformer width is `num_attention_heads * attention_head_dim`.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each head.
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output. Falls back to `in_channels` when falsy.
        text_dim (`int`, defaults to `4096`):
            Input dimension for text embeddings.
        freq_dim (`int`, defaults to `256`):
            Dimension for sinusoidal time embeddings.
        ffn_dim (`int`, defaults to `13824`):
            Intermediate dimension in feed-forward network.
        num_layers (`int`, defaults to `40`):
            The number of layers of transformer blocks to use.
        cross_attn_norm (`bool`, defaults to `True`):
            Enable cross-attention normalization.
        qk_norm (`str`, *optional*, defaults to `"rms_norm_across_heads"`):
            Query/key normalization mode forwarded to every transformer block.
        eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        image_dim (`int`, *optional*, defaults to `None`):
            Image embedding dimension forwarded to the condition embedder.
        added_kv_proj_dim (`int`, *optional*, defaults to `None`):
            The number of channels to use for the added key and value projections. If `None`, no projection is used.
        rope_max_seq_len (`int`, defaults to `1024`):
            Maximum sequence length forwarded to the rotary position embedding.
        pos_embed_seq_len (`int`, *optional*, defaults to `None`):
            Sequence length forwarded to the condition embedder.
        vace_layers (`List[int]`, defaults to every second layer from `0` to `28`):
            Indices of the main transformer blocks after which a VACE hint is added. Must contain `0` and every
            index must be smaller than `num_layers`.
        vace_in_channels (`int`, defaults to `96`):
            The number of channels of the VACE control input.
        audio_in_channels (`int`, defaults to `1024`):
            The number of channels of the raw audio features.
        audio_hidden_size (`int`, defaults to `1536`):
            Output width of `AudioPack` before it is projected to the transformer width.
        audio_inject (`str`, defaults to `"input"`):
            When set to `"layer"` the audio tokens are added to the control tokens before every VACE block;
            for any other value they are added only before the first VACE block.
    """

    # NOTE(review): the class-level loading / checkpointing hints below are
    # disabled in this model; confirm whether that is intentional before
    # re-enabling them.
    # _supports_gradient_checkpointing = True
    # _skip_layerwise_casting_patterns = [
    #     "patch_embedding",
    #     "vace_patch_embedding",
    #     "condition_embedder",
    #     "norm",
    # ]
    # _no_split_modules = ["WanTransformerBlock", "WanVACETransformerBlock"]
    # # _keep_in_fp32_modules = [
    # #     "time_embedder",
    # #     "scale_shift_table",
    # #     "norm1",
    # #     "norm2",
    # #     "norm3",
    # # ]
    # _keys_to_ignore_on_load_unexpected = ["norm_added_q"]

    # Patch size handed to `AudioPack`. The first entry (temporal patch) is
    # also used by `encode_audio` to derive a default lead-in length.
    _audio_patch_size: Tuple[int, int, int] = (4, 1, 1)

    @register_to_config
    def __init__(
        self,
        patch_size: Tuple[int] = (1, 2, 2),
        num_attention_heads: int = 40,
        attention_head_dim: int = 128,
        in_channels: int = 16,
        out_channels: int = 16,
        text_dim: int = 4096,
        freq_dim: int = 256,
        ffn_dim: int = 13824,
        num_layers: int = 40,
        cross_attn_norm: bool = True,
        qk_norm: Optional[str] = "rms_norm_across_heads",
        eps: float = 1e-6,
        image_dim: Optional[int] = None,
        added_kv_proj_dim: Optional[int] = None,
        rope_max_seq_len: int = 1024,
        pos_embed_seq_len: Optional[int] = None,
        vace_layers: List[int] = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
        vace_in_channels: int = 96,
        # Audio input
        audio_in_channels: int = 1024,
        audio_hidden_size: int = 1536,
        audio_inject="input",
    ) -> None:
        """Build the video, VACE and audio sub-modules.

        Raises:
            ValueError: If an entry of `vace_layers` is not smaller than
                `num_layers`, or if `vace_layers` does not contain `0`.
        """
        super().__init__()

        inner_dim = num_attention_heads * attention_head_dim
        out_channels = out_channels or in_channels

        if max(vace_layers) >= num_layers:
            raise ValueError(
                f"VACE layers {vace_layers} exceed the number of transformer layers {num_layers}."
            )
        if 0 not in vace_layers:
            raise ValueError("VACE layers must include layer 0.")

        # 1. Patch & position embedding
        self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
        self.patch_embedding = nn.Conv3d(
            in_channels, inner_dim, kernel_size=patch_size, stride=patch_size
        )
        self.vace_patch_embedding = nn.Conv3d(
            vace_in_channels, inner_dim, kernel_size=patch_size, stride=patch_size
        )

        # nn.init.normal_(self.vace_patch_embedding.weight, std=0.02)
        # nn.init.constant_(self.vace_patch_embedding.bias, 0)

        # 2. Condition embeddings
        # image_embedding_dim=1280 for I2V model
        self.condition_embedder = WanTimeTextImageEmbedding(
            dim=inner_dim,
            time_freq_dim=freq_dim,
            time_proj_dim=inner_dim * 6,
            text_embed_dim=text_dim,
            image_embed_dim=image_dim,
            pos_embed_seq_len=pos_embed_seq_len,
        )

        # 3. Transformer blocks
        self.blocks = nn.ModuleList(
            [
                WanTransformerBlock(
                    inner_dim,
                    ffn_dim,
                    num_attention_heads,
                    qk_norm,
                    cross_attn_norm,
                    eps,
                    added_kv_proj_dim,
                )
                for _ in range(num_layers)
            ]
        )

        # One VACE block per entry of `vace_layers`. Only the first block gets
        # an input projection (layer 0 is guaranteed to be in `vace_layers`).
        self.vace_blocks = nn.ModuleList(
            [
                WanVACETransformerBlock(
                    inner_dim,
                    ffn_dim,
                    num_attention_heads,
                    qk_norm,
                    cross_attn_norm,
                    eps,
                    added_kv_proj_dim,
                    apply_input_projection=i == 0,
                    apply_output_projection=True,
                )
                for i in range(len(vace_layers))
            ]
        )

        # 4. Output norm & projection
        self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
        self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
        self.scale_shift_table = nn.Parameter(
            torch.randn(1, 2, inner_dim) / inner_dim**0.5
        )

        self.gradient_checkpointing = False

        # 5. Audio condition: pack raw audio features, then project them to
        # the transformer width so they can be added to the control tokens.
        self.audio_pack = AudioPack(
            in_channels=audio_in_channels,
            patch_size=self._audio_patch_size,
            dim=audio_hidden_size,
            layernorm=True,
        )
        self.audio_proj_in = nn.Linear(audio_hidden_size, inner_dim)
        self.audio_inject = audio_inject

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: torch.LongTensor,
        encoder_hidden_states: torch.Tensor,
        audio_hidden_states: torch.Tensor,
        control_hidden_states: Optional[torch.Tensor] = None,
        control_hidden_states_scale: Optional[torch.Tensor] = None,
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        num_ref_frame: Optional[int] = None,
    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
        """Run the audio-conditioned VACE transformer.

        Args:
            hidden_states: 5-D latent input `(batch, channels, frames, height, width)`.
            timestep: Diffusion timestep(s), forwarded to the condition embedder.
            encoder_hidden_states: Prompt embeddings, forwarded to the condition embedder.
            audio_hidden_states: Audio features laid out as `(batch, time, channels)`.
            control_hidden_states: VACE control input. Required; after patch embedding it must
                have the same shape as the patch-embedded `hidden_states`.
            control_hidden_states_scale: 1-D tensor with one scale per entry of
                `config.vace_layers`. Defaults to all ones.
            return_dict: When `False`, return a 1-tuple instead of a `Transformer2DModelOutput`.
            attention_kwargs: Optional kwargs; only the `"scale"` entry (LoRA scale) is read here.
            num_ref_frame: Number of leading audio frames to prepend in `encode_audio`.
                `None` lets `encode_audio` pick its default.

        Returns:
            `Transformer2DModelOutput` whose `sample` has the unpatchified output, or a
            1-tuple containing that tensor when `return_dict` is `False`.

        Raises:
            ValueError: If `control_hidden_states` is missing, if the number of control scales
                does not match `config.vace_layers`, if the patch-embedded control and hidden
                states differ in shape, or if there are more audio tokens than video tokens.
        """
        # Record whether a scale was supplied *before* popping it, otherwise
        # the non-PEFT warning below could never trigger.
        scale_requested = False
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            scale_requested = attention_kwargs.get("scale", None) is not None
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        elif scale_requested:
            logger.warning(
                "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
            )

        if control_hidden_states is None:
            raise ValueError(
                "`control_hidden_states` must be provided; it is required by the VACE branch."
            )

        batch_size, num_channels, num_frames, height, width = hidden_states.shape
        p_t, p_h, p_w = self.config.patch_size
        post_patch_num_frames = num_frames // p_t
        post_patch_height = height // p_h
        post_patch_width = width // p_w

        num_vace_layers = len(self.config.vace_layers)
        if control_hidden_states_scale is None:
            control_hidden_states_scale = control_hidden_states.new_ones(num_vace_layers)
        control_hidden_states_scale = torch.unbind(control_hidden_states_scale)
        if len(control_hidden_states_scale) != num_vace_layers:
            raise ValueError(
                f"Length of `control_hidden_states_scale` {len(control_hidden_states_scale)} should be "
                f"equal to {len(self.config.vace_layers)}."
            )

        # 0. Audio embedding
        # b c t 1 1 -> broadcast over the spatial token grid
        audio_hidden_states = self.encode_audio(audio_hidden_states, num_ref_frame)
        audio_hidden_states = audio_hidden_states.expand(
            -1, -1, -1, post_patch_height, post_patch_width
        )

        # 1. Rotary position embedding (computed from the un-patched input)
        rotary_emb = self.rope(hidden_states)

        # 2. Patch embedding
        # b c t h w
        hidden_states = self.patch_embedding(hidden_states)
        control_hidden_states = self.vace_patch_embedding(control_hidden_states)

        if control_hidden_states.shape != hidden_states.shape:
            raise ValueError(
                f"control_hidden_states shape {control_hidden_states.shape} should be equal to hidden_states shape {hidden_states.shape}"
            )

        # 3. Time embedding
        temb, timestep_proj, encoder_hidden_states, _ = self.condition_embedder(
            timestep, encoder_hidden_states, None
        )
        timestep_proj = timestep_proj.unflatten(1, (6, -1))

        # 4. Transformer blocks
        # b c t h w -> b (t h w) c
        hidden_states = hidden_states.flatten(2).transpose(1, 2)
        control_hidden_states = control_hidden_states.flatten(2).transpose(1, 2)
        audio_hidden_states = audio_hidden_states.flatten(2).transpose(1, 2)

        # Left-pad the audio tokens with zeros when they cover fewer tokens
        # than the video, so that they align with the trailing video tokens.
        num_missing_tokens = control_hidden_states.shape[1] - audio_hidden_states.shape[1]
        if num_missing_tokens < 0:
            raise ValueError(
                f"Audio yields {audio_hidden_states.shape[1]} tokens but the video only has "
                f"{control_hidden_states.shape[1]}; shorten the audio or reduce `num_ref_frame`."
            )
        if num_missing_tokens > 0:
            audio_hidden_states = torch.cat(
                [
                    torch.zeros_like(control_hidden_states[:, :num_missing_tokens]),
                    audio_hidden_states,
                ],
                dim=1,
            )

        # Prepare VACE hints
        control_hidden_states_list = []
        for i, block in enumerate(self.vace_blocks):
            # Audio is injected once at the input, or before every VACE block
            # when `audio_inject == "layer"`.
            if i == 0 or self.audio_inject == "layer":
                control_hidden_states = control_hidden_states + audio_hidden_states
            conditioning_states, control_hidden_states = block(
                hidden_states,
                encoder_hidden_states,
                control_hidden_states,
                timestep_proj,
                rotary_emb,
            )
            control_hidden_states_list.append(
                (conditioning_states, control_hidden_states_scale[i])
            )
        # Reverse so that `pop()` hands the hints out in creation order.
        control_hidden_states_list = control_hidden_states_list[::-1]

        vace_layer_ids = set(self.config.vace_layers)
        for i, block in enumerate(self.blocks):
            hidden_states = block(
                hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
            )

            if i in vace_layer_ids:
                control_hint, scale = control_hidden_states_list.pop()
                hidden_states = hidden_states + control_hint * scale

        # 5. Output norm, projection & unpatchify
        shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)

        # Move the shift and scale tensors to the same device as hidden_states.
        # When using multi-GPU inference via accelerate these will be on the
        # first device rather than the last device, which hidden_states ends up
        # on.
        shift = shift.to(hidden_states.device)
        scale = scale.to(hidden_states.device)

        hidden_states = (
            self.norm_out(hidden_states.float()) * (1 + scale) + shift
        ).type_as(hidden_states)
        hidden_states = self.proj_out(hidden_states)
        hidden_states = hidden_states.reshape(
            batch_size,
            post_patch_num_frames,
            post_patch_height,
            post_patch_width,
            p_t,
            p_h,
            p_w,
            -1,
        )
        # (b, f, h, w, p_t, p_h, p_w, c) -> (b, c, f, p_t, h, p_h, w, p_w)
        hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
        output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)

    def encode_audio(
        self, audio_hidden_states: torch.Tensor, num_ref_frame: Optional[int] = None
    ) -> torch.Tensor:
        """Pack raw audio features and project them to the transformer width.

        Args:
            audio_hidden_states: Audio features laid out as `(B, T, C)`.
            num_ref_frame: How many copies of the first audio frame to prepend
                along the time axis before packing. When `None`, the smallest
                count that makes the time length a multiple of the audio
                temporal patch size is used.
                NOTE(review): this default assumes `AudioPack` groups frames in
                chunks of its temporal patch size -- confirm against `AudioPack`.

        Returns:
            A 5-D tensor rearranged to `(B, C', T', 1, 1)`, where `C'` is the
            output width of `audio_proj_in`. `T'` is determined by `AudioPack`
            (presumably `(T + num_ref_frame) // 4` -- TODO confirm).

        Raises:
            ValueError: If `num_ref_frame` is negative.
        """
        if num_ref_frame is None:
            temporal_patch = self._audio_patch_size[0]
            num_ref_frame = (-audio_hidden_states.shape[1]) % temporal_patch
        if num_ref_frame < 0:
            raise ValueError(
                f"`num_ref_frame` must be non-negative, got {num_ref_frame}."
            )

        audio_hidden_states = rearrange(audio_hidden_states, "b t c -> b c t 1 1")
        # map to video tokens number: repeat the first frame as a lead-in
        if num_ref_frame > 0:
            lead_in = audio_hidden_states[:, :, :1].repeat(1, 1, num_ref_frame, 1, 1)
            audio_hidden_states = torch.cat([lead_in, audio_hidden_states], 2)

        # b t 1 1 c
        audio_hidden_states = self.audio_pack(audio_hidden_states)
        audio_hidden_states = self.audio_proj_in(audio_hidden_states)
        audio_hidden_states = rearrange(audio_hidden_states, "b t h w c -> b c t h w")
        return audio_hidden_states


if __name__ == "__main__":
    # Smoke test: load the pretrained VACE transformer, push one batch of
    # random inputs through it and print the result.
    from mmengine.device import get_device

    dtype = torch.bfloat16
    device = get_device()
    placement = {"device": device, "dtype": dtype}

    model = AudiopackWanVACETransformer3DModelV2.from_pretrained(
        "checkpoints/Wan2.1-VACE-1.3B-diffusers",
        subfolder="transformer",
        audio_in_channels=1024,
        audio_hidden_size=1536,
        low_cpu_mem_usage=False,
    )
    model = model.to(**placement)
    model.eval()

    # Inputs are created on the default device first and then moved, in the
    # same order as before, so the random stream is consumed identically.
    timestep = torch.tensor([0]).to(**placement)
    audio_hidden_states = torch.randn(1, 97, 1024).to(**placement)
    noise_latent = torch.randn(1, 16, 26, 64, 64).to(**placement)
    encoder_hidden_states = torch.randn(1, 15, 4096).to(**placement)
    control_hidden_states = torch.randn(1, 96, 26, 64, 64).to(**placement)

    with torch.no_grad():
        output = model(
            audio_hidden_states=audio_hidden_states,
            hidden_states=noise_latent,
            encoder_hidden_states=encoder_hidden_states,
            control_hidden_states=control_hidden_states,
            timestep=timestep,
        )

    print(output)
