# Top-level run switches.
# NOTE(review): image_finetune presumably toggles an image-only fine-tuning
# mode (disabled here) — confirm in the training script.
image_finetune: false
# Dataset tag; every data path in this file points under an HAA500 directory.
dataset: 'HAA'

# Absolute output location. The "/xxx/xxx/xxx" prefix looks like a redacted
# placeholder and must be replaced with a real path before running.
output_dir: "/xxx/xxx/xxx/work/video_synthesis/few_shot_action_animation/AnimateDiff-1/outputs"
# Relative path to the base Stable Diffusion v1.5 weights.
# NOTE(review): presumably resolved against the working directory — confirm.
pretrained_model_path: "models/StableDiffusion/stable-diffusion-v1-5"

# UNet options: motion-module placement and attention variants.
# NOTE(review): presumably forwarded as keyword arguments to the UNet
# constructor; the meaning of each flag lives in the model code — confirm there.
unet_additional_kwargs:
  use_motion_module: true
  motion_module_resolutions: [1, 2, 4, 8]
  unet_use_cross_frame_attention: "first+curr+align+random"
  unet_use_temporal_attention: false

  motion_module_type: Vanilla
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    # Two identical block-type entries; the string encodes its own options.
    attention_block_types:
      - "Temporal_AddMV71.5Perc0.9_Self"
      - "Temporal_AddMV71.5Perc0.9_Self"
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
    zero_initialize: true
    add_motion_vector: true

# Noise scheduler settings.
# NOTE(review): the key names match constructor arguments of diffusers
# schedulers such as DDIMScheduler — confirm which class consumes them.
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "scaled_linear"
  steps_offset: 1
  clip_sample: false

# Training data: a single action class (pushup) from HAA500.
train_data:
  # NOTE(review): the key is named csv_path but the file has a .txt extension —
  # confirm the format the data loader expects.
  csv_path: "/xxx/xxx/xxx/datasets/HAA500/selected/pushup.txt"
  video_folder: "/xxx/xxx/xxx/datasets/HAA500/video/pushup"
  # One caption string for this block. Its wording differs slightly from
  # validation_data.prompts ("is performing" vs. "performing").
  caption: "A person is performing a pushup"
  # NOTE(review): presumably the spatial sample size in pixels — confirm.
  sample_size:     512
  # NOTE(review): presumably 16 frames are taken per clip, one every 8 source
  # frames — confirm in the dataset class.
  sample_stride:   8
  sample_n_frames: 16
  random_crop: true

# Validation inputs. The three lists below each have two entries that line up
# by index: entry 0 refers to pushup_001, entry 1 to pushup_016.
validation_data:
  # Both prompts are identical; only the paired image/video differs.
  prompts:
    - "A person performing a pushup"
    - "A person performing a pushup"
  # First frame (00001.jpg) of each validation clip.
  # NOTE(review): presumably used as the conditioning image — confirm.
  controlnet_images_path:
    - "/xxx/xxx/xxx/datasets/HAA500/frames/pushup/pushup_001/00001.jpg"
    - "/xxx/xxx/xxx/datasets/HAA500/frames/pushup/pushup_016/00001.jpg"
  # Source videos for the same two clips.
  groundtruth_video_path:
    - "/xxx/xxx/xxx/datasets/HAA500/video/pushup/pushup_001.mp4"
    - "/xxx/xxx/xxx/datasets/HAA500/video/pushup/pushup_016.mp4"
  # Sampling settings used when generating validation videos.
  num_inference_steps: 25
  guidance_scale: 7.5

# Patterns selecting which parameters are trained.
# NOTE(review): presumably matched as substrings of parameter names, so
# "motion_modules." (note the trailing dot) targets motion-module weights and
# "to_k"/"to_v" target attention key/value projections — confirm in the
# training script.
trainable_modules:
  - "motion_modules."
  - "to_k"
  - "to_v"

# Checkpoints to initialise from.
# Empty string: no UNet checkpoint path is given.
# NOTE(review): presumably an empty value skips loading — confirm.
unet_checkpoint_path: ""
# The image-adapter keys are commented out, so they are absent from the loaded
# config.
# pretrained_image_adapter_path: "models/Motion_Module/v3_sd15_adapter.ckpt"
# pretrained_image_adapter_alpha: 1.0
# Relative path to the pretrained motion-module checkpoint.
pretrained_motion_module_path: "models/Motion_Module/v3_sd15_mm.ckpt"

# Optimisation settings.
# Keep the decimal point in "5.e-5": YAML 1.1 loaders such as PyYAML only
# resolve exponent notation as a float when a "." is present; a bare "5e-5"
# would be loaded as a string.
learning_rate: 5.e-5
train_batch_size: 1

# Training length, checkpoint cadence and validation cadence.
# NOTE(review): the -1 values presumably disable the epoch-based variants so
# that the step-based values apply — confirm in the training script.
max_train_epoch: -1
max_train_steps: 20000
checkpointing_epochs: -1
checkpointing_steps: 10000

validation_steps: 5000

# Reproducibility and runtime switches.
global_seed: 42
mixed_precision_training: true
# NOTE(review): presumably requires the xformers package to be installed —
# confirm.
enable_xformers_memory_efficient_attention: true

# Method-specific training switches (both enabled).
# NOTE(review): semantics are defined by the training code and are not
# derivable from this file — confirm there before changing.
cfg_unnoise_first_frame: true
motion_importance_sampling: true

# "cfg_app_change_video" option group.
# NOTE(review): semantics are defined by the training code — confirm there.
cfg_app_change_video: 1.0
# Quoted on purpose: this loads as the string "1", not the integer 1.
cfg_app_change_video_version: "1"
cfg_app_change_video_type: "concat"
