import torch
from diffusers.utils import export_to_video
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from module.attention_processor import MyWanAttnProcessor2_0
from module.pipe import myWanPipeline
from utils import bboxs_to_arg, plan_path, arg_to_bboxs, save_videos_with_bbox

import random 
import numpy as np

import argparse

# This script only runs inference (no backward pass anywhere below), so
# autograd is switched off globally.
torch.set_grad_enabled(False)

# fg_prompt = "A close-up shot of a red double-decker bus with a vintage British design, featuring a retro yellow stripe and a clear window display. The bus moves smoothly, wheels spinning rhythmically, with reflections shimmering on its windows. The camera tracks the vehicle from a low-angle perspective, capturing the texture of the bus's paint and the dynamic motion of the wheels. The frame is filled with the bus's iconic shape, emphasizing its retro charm and motion."
# bg_prompt = "A sunny afternoon in London, with a narrow cobblestone street lined with historic red brick buildings and green street lamps. Soft shadows stretch across the stones, while the bustling urban atmosphere hums with activity. Vintage-style storefronts and ornate facades frame the scene, and a gentle breeze stirs the air. The sky is a bright blue, with wispy clouds drifting overhead, enhancing the nostalgic, lively ambiance of the historic streets."
# base_prompt = "A red double-decker bus moving through London streets, with a vintage British design, featuring a retro yellow stripe and a clear window display. The bus is driving smoothly along a narrow cobblestone street lined with historic red brick buildings and green street lamps. The camera follows the bus from a low-angle perspective, capturing the movement of the wheels and the reflections on the windows. The scene is set during a sunny afternoon with soft shadows and a bustling urban atmosphere. The shot includes a wide-angle view showing the bus in motion with a dynamic camera tracking the vehicle."

# Default prompts for the run; each one can be overridden from the command line
# (--fg_prompt, --bg_prompt, --base_prompt).
#   fg_prompt   - forwarded to the attention processors via attention_kwargs
#                 (presumably describes the subject inside the bounding box — confirm
#                 against MyWanAttnProcessor2_0).
#   bg_prompt   - forwarded the same way (presumably describes the scene outside
#                 the box — confirm).
#   base_prompt - passed to the pipeline as the regular `prompt`.
fg_prompt = "Photorealistic 4K close-up of a muscular horse in motion, sleek coat glistening with dynamic light. The horse's powerful strides are captured in mid-action, with hooves lifting and legs extending, showcasing fluid movement. Volumetric lighting creates soft, glowing halos around its body, enhancing depth and atmosphere. The camera tracks smoothly alongside the horse, emphasizing its speed and energy. Realistic textures on fur and skin, with cinematic lighting that highlights every detail of the horse's form."
bg_prompt = "Vast, sunlit meadow stretching infinitely under a clear blue sky, with golden sunlight scattering through volumetric light. Distant hills roll gently in the background, their contours softened by atmospheric haze. The grass is uniformly green, with subtle variations in texture and light reflection. A few scattered wildflowers dot the landscape, while a winding path disappears into the horizon. The sky transitions from bright blue near the horizon to softer gradients above, with wispy clouds drifting slowly."
base_prompt = "A photorealistic 4K scene of a horse running across an open field, captured with volumetric lighting in Unreal Engine. The horse has a muscular build, sleek coat, and powerful strides, showcasing dynamic motion and natural movement. The lighting creates soft volumetric effects around the horse's body, enhancing depth and atmosphere. The background features a vast, sunlit meadow with distant hills and a clear blue sky. The camera follows the horse in a dynamic tracking shot, capturing its speed and energy. Realistic textures and high detail throughout, with a cinematic feel.\n"

# fg_prompt="Close-up shot of a cute, round black-and-white adult panda with expressive eyes and a gentle demeanor. The panda walks through a dense bamboo forest, holding a bamboo stick in its mouth, chewing slowly with a satisfied expression. Its round face shows soft features, with black eyes gazing forward, and fluffy fur contrasting against the green surroundings. The camera captures the panda’s movements from a low-angle perspective, emphasizing its calm, content demeanor and the rhythmic motion of chewing. The panda fills the entire frame, with details of its fur texture and bamboo stick visible in sharp focus."
# bg_prompt="A lush, dense bamboo forest bathed in soft, golden sunlight filtering through tall bamboo stalks. Green foliage forms a vibrant canopy, with clusters of bamboo leaves swaying gently in the breeze. The forest floor is covered in moss and fallen bamboo shoots, with faint shadows cast by the towering stalks. A faint mist lingers in the air, adding depth to the scene. The atmosphere is peaceful and natural, with the soft rustling of bamboo and the distant call of birds creating a serene ambiance. The lighting enhances the forest’s organic textures and colors, creating a vivid, immersive backdrop."
# base_prompt="A panda walking and munching bamboo in a bamboo forest. The panda is a cute, round, black-and-white adult panda with expressive eyes and a gentle demeanor. It is walking through a dense bamboo forest, surrounded by tall bamboo stalks and green foliage. The panda is holding a bamboo stick in its mouth, chewing slowly with a satisfied expression. The scene is vibrant and natural, with soft lighting and a peaceful atmosphere. The camera follows the panda from a low-angle perspective, capturing its movements and interactions with the environment.\n"


# fg_prompt = "Photorealistic 4K close-up of a muscular horse in motion, sleek coat glistening with dynamic light. The horse's powerful strides are captured in mid-action, with hooves lifting and legs extending, showcasing fluid movement. Volumetric lighting creates soft, glowing halos around its body, enhancing depth and atmosphere. The camera tracks smoothly alongside the horse, emphasizing its speed and energy. Realistic textures on fur and skin, with cinematic lighting that highlights every detail of the horse's form."
# bg_prompt = "Vast, sunlit meadow stretching infinitely under a clear blue sky, with golden sunlight scattering through volumetric light. Distant hills roll gently in the background, their contours softened by atmospheric haze. The grass is uniformly green, with subtle variations in texture and light reflection. A few scattered wildflowers dot the landscape, while a winding path disappears into the horizon. The sky transitions from bright blue near the horizon to softer gradients above, with wispy clouds drifting slowly."
# base_prompt = "A photorealistic 4K scene of a horse running across an open field, captured with volumetric lighting in Unreal Engine. The horse has a muscular build, sleek coat, and powerful strides, showcasing dynamic motion and natural movement. The lighting creates soft volumetric effects around the horse's body, enhancing depth and atmosphere. The background features a vast, sunlit meadow with distant hills and a clear blue sky. The camera follows the horse in a dynamic tracking shot, capturing its speed and energy. Realistic textures and high detail throughout, with a cinematic feel."

# Default negative prompt; overridable with --negative_prompt and passed to the
# pipeline call as `negative_prompt`.
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

# Default bounding-box trajectory, given as keyframes of five numbers each:
# a frame index followed by four fractional coordinates.
# NOTE(review): the four fractions are presumably (h_start, h_end, w_start, w_end)
# relative to the frame size, matching how the planned path is consumed when the
# mask is built — confirm against utils.plan_path.
bboxs = [
    [0, 0.3, 0.7, 0.1, 0.4],
    [80, 0.3, 0.7, 0.7, 1.0],
]

# Flatten the keyframes into one comma-separated string; it serves as the
# default value of the --bboxs_arg command-line option.
bboxs_flat = []
for keyframe in bboxs:
    bboxs_flat.extend(map(str, keyframe))
bboxs_arg = ",".join(bboxs_flat)


# Command-line interface. Each row is (flag, default, type); the prompt and
# bounding-box defaults come from the module-level values defined above.
parser = argparse.ArgumentParser()
for flag, default_value, value_type in (
    ("--seed", 42, int),
    ("--mask_step", 30, int),
    ("--bg_prompt", bg_prompt, str),
    ("--fg_prompt", fg_prompt, str),
    ("--base_prompt", base_prompt, str),
    ("--negative_prompt", negative_prompt, str),
    ("--output_path", 'output', str),
    ("--output_path_withbox", 'output_box', str),
    ("--bboxs_arg", bboxs_arg, str),
    ("--fixRope_step", 5, int),
    ("--num_frame", 81, int),
    ("--height", 480, int),
    ("--width", 832, int),
):
    parser.add_argument(flag, default=default_value, type=value_type)

args = parser.parse_args()

# Unpack the parsed options into the module-level names used by the rest of
# the script (the prompt names are deliberately rebound to the CLI values).
seed, mask_step, fixRope_step = args.seed, args.mask_step, args.fixRope_step
bg_prompt, fg_prompt = args.bg_prompt, args.fg_prompt
base_prompt, negative_prompt = args.base_prompt, args.negative_prompt
num_frame, height, width = args.num_frame, args.height, args.width

# The trajectory arrives as a flat comma-separated string and is turned back
# into keyframes by the project helper.
bboxs = arg_to_bboxs(args.bboxs_arg)

print(f"Using seed: {seed}, mask_step: {mask_step}, fixRope_step: {fixRope_step}, bboxs: {bboxs}")

# Destinations for the plain video and for the copy with the box drawn on it.
output_path, output_path_withbox = args.output_path, args.output_path_withbox

# Available models: Wan-AI/Wan2.1-T2V-14B-Diffusers, Wan-AI/Wan2.1-T2V-1.3B-Diffusers
model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"

# Scheduler configuration; flow_shift: 5.0 for 720P, 3.0 for 480P.
flow_shift = 3.0
scheduler = UniPCMultistepScheduler(
    prediction_type='flow_prediction',
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=flow_shift,
)

# The VAE is loaded in float32 and handed to the pipeline, whose remaining
# components are loaded in bfloat16.
vae = AutoencoderKLWan.from_pretrained(
    model_id,
    subfolder="vae",
    torch_dtype=torch.float32,
)
pipe = myWanPipeline.from_pretrained(
    model_id,
    vae=vae,
    torch_dtype=torch.bfloat16,
)


# Seed every global random number generator (Python, NumPy, PyTorch CPU and all
# CUDA devices) with the same value.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# Dedicated CPU generator that is handed to the pipeline call.
generator = torch.Generator(device='cpu')
generator.manual_seed(seed)

# Give every attention layer of the transformer its own instance of the custom
# processor, keyed by the layer names the transformer reports.
attn_procs = {
    layer_name: MyWanAttnProcessor2_0()
    for layer_name in pipe.transformer.attn_processors
}
cnt = len(attn_procs)
pipe.transformer.set_attn_processor(attn_procs)
print(f"******{cnt} attn_procs changed.*******")

# Install the scheduler configured earlier and move the whole pipeline to the GPU.
device = 'cuda'
pipe.scheduler = scheduler
pipe.to(device)


# Build the binary bounding-box mask that is forwarded to the attention
# processors: one (1, bbox_h, bbox_w) plane per latent frame, 1 inside the box
# and 0 elsewhere.
# NOTE(review): the divisors (4 along time, 16 along height/width) presumably
# mirror the model's latent/patch compression — confirm against the pipeline.
latent_num_frame = num_frame // 4 + 1
bbox_h = height // 16
bbox_w = width // 16

# Allocate directly on the target device instead of on the CPU followed by a copy.
bbox_mask = torch.zeros([latent_num_frame, 1, bbox_h, bbox_w], device=device)

# dynamic box
# Plan a box for the whole clip, then keep every 4th entry so that there is one
# box per latent frame.
PATHS = plan_path(bboxs, video_length=num_frame)[::4]

# This depends on user input (--num_frame, --bboxs_arg), so it must be a real
# check: an `assert` would be stripped under `python -O` and the loop below
# would then fail with an opaque IndexError or build a mask of the wrong length.
if latent_num_frame != len(PATHS):
    raise ValueError(
        f"latent_num_frame != len(PATHS): expected {latent_num_frame} boxes "
        f"for num_frame={num_frame}, but the planned path yields {len(PATHS)}"
    )

for i, box in enumerate(PATHS):
    # Scale the fractional box coordinates to mask cells (truncating).
    h_start = int(box[0] * bbox_h)
    h_end = int(box[1] * bbox_h)
    w_start = int(box[2] * bbox_w)
    w_end = int(box[3] * bbox_w)
    bbox_mask[i, :, h_start:h_end, w_start:w_end] = 1


# No precomputed latents are supplied to the pipeline call.
latents = None

# 1024-entry mask: the first 512 positions are 0.0 and the last 512 are 1.0.
# torch.Tensor(...) converts the booleans to the default floating dtype, so
# this is a float mask rather than a bool one.
# NOTE(review): 512 is presumably the per-prompt text sequence length and the
# two halves presumably correspond to two concatenated prompt embeddings —
# confirm against MyWanAttnProcessor2_0.
encoder_attention_mask = torch.Tensor([False] * 512 + [True] * 512).to(device)


# Extra inputs forwarded to the custom attention processors.
attention_kwargs = {
    "bbox_mask": bbox_mask,
    "encoder_attention_mask": encoder_attention_mask,
    "bg_prompt": bg_prompt,
    "fg_prompt": fg_prompt,
    "fixRope_step": fixRope_step,
    "mask_step": mask_step,
}

# Run the pipeline and keep the frames of the first (only) generated video.
result = pipe(
    prompt=base_prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=num_frame,
    guidance_scale=5,
    generator=generator,
    attention_kwargs=attention_kwargs,
    latents=latents,
)
output = result.frames[0]

# The playback rate is derived from the clip length (81 frames -> 20 fps).
# NOTE(review): this ties the frame rate to the frame count — confirm that is intended.
fps = num_frame // 4
export_to_video(output, output_path, fps=fps)

if output_path_withbox is not None:
    # Add two leading singleton axes and move the last axis to position 2
    # before handing the frames to the project helper.
    # NOTE(review): assumes `output` is laid out as (frames, height, width, channels) — confirm.
    video = torch.Tensor(output).unsqueeze(0).unsqueeze(0).permute(0, 1, 5, 2, 3, 4)
    save_videos_with_bbox(video, output_path_withbox, fps=fps, input_traj=bboxs)