# Model selection and checkpoint locations.
# NOTE(review): how model_path and generator_transformer_path are resolved
# (relative to the working directory, or to each other) is decided by the
# loading code, which is not part of this file - confirm before moving files.
model_name: wan
generator_name: causal_wan
model_path: Wan2.1-Fun-1.3B-Control
generator_transformer_path: diffusion_pytorch_model.safetensors

# Denoising schedule.
# Five timestep values in descending order, ending at 0; the largest (1000)
# equals scheduler_kwargs.num_train_timesteps.
denoising_step_list:
- 1000
- 757
- 522
- 20
- 0
# NOTE(review): the accepted method names are defined by the consuming code;
# verify that `mono_inc` is one of them.
timestep_sampling_method: mono_inc
# 0.0 here - presumably disables context perturbation; TODO confirm.
context_perturbation: 0.0
# presumably the number of frames generated per causal block - TODO confirm.
num_frame_per_block: 1
# Keyword arguments for the noise scheduler.
# NOTE(review): use_dynamic_shifting is false, so base_shift / max_shift /
# base_image_seq_len / max_image_seq_len are presumably unused and only the
# fixed `shift` applies - confirm against the scheduler implementation.
scheduler_kwargs:
  # Explicit null: no scheduler sub-directory is configured.
  scheduler_subpath: null
  # Matches the first entry of denoising_step_list (1000).
  num_train_timesteps: 1000
  shift: 5.0
  use_dynamic_shifting: false
  base_shift: 0.5
  max_shift: 1.15
  base_image_seq_len: 256
  max_image_seq_len: 4096
# Extra arguments used when constructing the transformer.
transformer_additional_kwargs:
  # "./" - presumably the transformer files sit directly under model_path;
  # TODO confirm against the loader.
  transformer_subpath: ./
  # Pairs of config-key names (in_dim / in_channels, dim / hidden_size).
  # NOTE(review): which side is the source and which the target name is
  # decided by the loader - confirm before editing.
  dict_mapping:
    in_dim: in_channels
    dim: hidden_size
# Text encoder checkpoint, tokenizer and architecture hyper-parameters.
# NOTE(review): the architecture values below must match the checkpoint named
# in text_encoder_subpath; they are not validated in this file.
text_encoder_kwargs:
  text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
  tokenizer_subpath: google/umt5-xxl
  # presumably the maximum prompt length in tokens - TODO confirm.
  text_length: 512
  vocab: 256384
  dim: 4096
  dim_attn: 4096
  dim_ffn: 10240
  num_heads: 64
  num_layers: 24
  num_buckets: 32
  # Canonical lowercase boolean, consistent with the other booleans in this file.
  shared_pos: false
  dropout: 0.0
# Task types: the generator is configured as causal, while the real and fake
# task types are both bidirectional.
# NOTE(review): presumably "real"/"fake" name the two score models used for
# distillation - confirm with the training code.
generator_task: causal_video
real_task_type: bidirectional_video
fake_task_type: bidirectional_video

# Distillation, guidance, precision and seeding.
# presumably the number of fake-model updates per generator update - TODO confirm.
dfake_gen_update_ratio: 6
real_guidance_scale: 1.5
backward_simulation: true
num_last_frames_with_grad: 7
# Chinese-language negative prompt listing undesired qualities (over-saturated
# colours, overexposure, static frames, blurry details, subtitles, low quality,
# JPEG artifacts, malformed hands/faces/limbs, cluttered background, walking
# backwards, ...). The string is consumed at runtime - do not translate it.
negative_prompt: '色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走'
# Differs from real_guidance_scale (1.5) above.
# NOTE(review): confirm which model each of the two scales applies to.
guidance_scale: 1.0
mixed_precision: bf16
seed: 666
# Optimisation, loss weighting, checkpointing and logging.
max_train_steps: 4000
# Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML read
# a bare `1e-5` as the string '1e-5' rather than a float.
lr: 1.0e-5
beta1: 0.9
beta2: 0.999
batch_size: 2
# Loss weights.
# NOTE(review): presumably adv_g / adv_d are the adversarial generator and
# discriminator terms - confirm.
dmd_loss_weight: 1.0
adv_g_loss_weight: 0.1
adv_d_loss_weight: 0.05
output_features_stride: 10
discriminator_num_heads: 4
gradient_accumulation_steps: 1
gradient_checkpointing: true
# Iteration intervals for validation and checkpointing.
# With max_train_steps 4000 and checkpoint_iters 1000, more checkpoints are
# scheduled than checkpoints_total_limit (2).
# NOTE(review): presumably older checkpoints are pruned - confirm.
validation_iters: 50
checkpoint_iters: 1000
checkpoints_total_limit: 2
logging_dir: logs
report_to: tensorboard
output_dir: wan2.1_fun_causal_control_RL
tracker_project_name: default
use_ema: true
train_mode: control_ref

# Training / validation dataset settings.
train_data_dir: data/video/
# NOTE(review): presumably resolved relative to train_data_dir - confirm.
train_data_meta: dec_fps_24_all.jsonl
# Field names looked up in the metadata records.
caption_column: prompt
video_column: video
# Each entry is a plain YAML string such as "(73, 480, 368)", not a sequence;
# the consumer has to parse it.
# NOTE(review): the first component (73) is constant across buckets and is
# presumably the frame count; the order of the two spatial dimensions
# (height/width) must be confirmed against the data loader.
video_resolution_buckets:
- (73, 480, 368)
- (73, 400, 400)
- (73, 368, 480)
- (73, 640, 368)
- (73, 368, 640)
video_sample_stride: 1
dataloader_num_workers: 8
validation_data_meta: config/wan2.1/val_control_long_seq.json

# Reward-based (RL) fine-tuning settings.
# NOTE(review): presumably num_video_per_prompt samples are generated per
# prompt and processed in groups of mini_num_image_per_prompt - confirm.
num_video_per_prompt: 8
num_batches_per_epoch: 1
num_inner_epochs: 1
train_num_steps: 1
adv_clip_max: 5
# Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML read
# a bare `1e-4` as the string '1e-4' rather than a float.
clip_range: 1.0e-4
beta: 0.1
max_grad_norm: 1.0
mini_num_image_per_prompt: 2
# Maps reward names to numeric values (0.5 each, summing to 1.0).
# NOTE(review): presumably these are mixing weights - confirm.
reward_fn:
  aesthetic: 0.5
  cotracker: 0.5