# Model identity and weight locations.
# NOTE(review): the relative paths below are presumably resolved against
# model_path by the loader — confirm against the consuming code.
model_name: wan
generator_name: causal_wan
model_path: Wan2.1-Fun-1.3B-InP
# Generator and critic currently point at the same transformer weights file.
generator_transformer_path: transformer/diffusion_pytorch_model.safetensors
critic_transformer_path: transformer/diffusion_pytorch_model.safetensors
# Four timesteps listed in descending order, from 1000 down to 0.
# NOTE(review): presumably on the same scale as
# scheduler_kwargs.num_train_timesteps (1000) — confirm.
denoising_step_list:
- 1000
- 757
- 522
- 0
# NOTE(review): presumably the number of frames generated per causal block —
# confirm against the generator implementation.
num_frame_per_block: 1
# Keyword arguments for the noise scheduler.
scheduler_kwargs:
  # Explicit null: no scheduler sub-directory is configured.
  scheduler_subpath: null
  num_train_timesteps: 1000
  shift: 5.0
  # Dynamic shifting is disabled here.
  # NOTE(review): the four *_shift / *_image_seq_len keys below presumably
  # only take effect when use_dynamic_shifting is true — confirm.
  use_dynamic_shifting: false
  base_shift: 0.5
  max_shift: 1.15
  base_image_seq_len: 256
  max_image_seq_len: 4096
# Extra keyword arguments for constructing the transformer.
transformer_additional_kwargs:
  transformer_subpath: ./
  # Pairs of parameter names (in_dim <-> in_channels, dim <-> hidden_size).
  # NOTE(review): the direction of the renaming is decided by the loader —
  # confirm which side is the checkpoint/config name.
  dict_mapping:
    in_dim: in_channels
    dim: hidden_size
# Keyword arguments for the text encoder and its tokenizer.
# NOTE(review): the *_subpath values are presumably resolved relative to
# model_path — confirm against the loader.
text_encoder_kwargs:
  text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
  tokenizer_subpath: google/umt5-xxl
  text_length: 512
  # Encoder architecture hyperparameters.
  vocab: 256384
  dim: 4096
  dim_attn: 4096
  dim_ffn: 10240
  num_heads: 64
  num_layers: 24
  num_buckets: 32
  # Canonical lowercase boolean; parses to the same value as `False`.
  shared_pos: false
  dropout: 0.0
# Task variants: the generator is configured as causal, while both the
# "real" and "fake" task types are bidirectional.
# NOTE(review): "real"/"fake" presumably refer to the two score networks used
# by the distillation loss — confirm against the trainer.
generator_task: causal_video
real_task_type: bidirectional_video
fake_task_type: bidirectional_video

# Training hyperparameters.
# NOTE(review): presumably the number of fake-score/critic updates per
# generator update — confirm against the trainer.
dfake_gen_update_ratio: 5
real_guidance_scale: 2.5
backward_simulation: false
# Chinese-language negative prompt. This is a runtime string: keep it
# byte-for-byte and do not translate it.
negative_prompt: '色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走'
guidance_scale: 1.0
mixed_precision: bf16
seed: 666
max_train_steps: 400
# Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
# resolve a bare `1e-5` as the *string* '1e-5', not a float.
lr: 1.0e-5
beta1: 0.9
beta2: 0.999
batch_size: 8
# Loss weights.
dmd_loss_weight: 1.0
adv_g_loss_weight: 0.1
adv_d_loss_weight: 0.5
output_features_stride: 5
gradient_accumulation_steps: 1
gradient_checkpointing: true
# Validation / checkpoint cadence.
# NOTE(review): units are presumably training steps — confirm.
validation_iters: 50
checkpoint_iters: 100
checkpoints_total_limit: 4
# Logging and output locations.
logging_dir: logs
report_to: tensorboard
# NOTE(review): machine- and user-specific absolute path; override it per
# environment before running elsewhere.
output_dir: /home4/jiaxin/exp/wan_causal_dmd_0408
tracker_project_name: default

# Dataset location and the metadata column names.
train_data_dir: /data/video/
train_data_meta: dec_fps_24.jsonl
caption_column: prompt
video_column: video
# Each bucket is a YAML *string* (parentheses are not sequence syntax), so
# the consumer must parse the tuple text itself. Quoted to make that explicit.
# NOTE(review): element order is presumably (frames, height, width) — confirm.
video_resolution_buckets:
- '(25, 480, 368)'
- '(25, 400, 400)'
- '(25, 368, 480)'
- '(25, 640, 368)'
- '(25, 368, 640)'
# Also a string, not a YAML sequence; parsed by the consumer.
video_sample_stride: '(3,3,1,1,3,1,3,1)'
dataloader_num_workers: 16
validation_data_meta: config/wan2.1/val_i2v.json