# Dataset, sampling, and per-sample loss settings. The consumer of these keys
# is not visible in this file, so notes below marked "presumably" need confirming.
data:
  train_bs: 3 # TODO: check this one
  train_width: 512
  train_height: 512
  # Metadata JSON files (HDTF plus three audio_to_face parts).
  meta_paths:
    - "/root/datasets/HDTF/hdtf_3kps.json"
    - "/root/multimodal_dataset/animate_image/audio_to_face/part_01/part_01_wo_low_res_wo_low_syncnet_conf.json"
    - "/root/multimodal_dataset/animate_image/audio_to_face/part_02/p2.json"
    - "/root/multimodal_dataset/animate_image/audio_to_face/part_03/p3.json"
  # Caption JSON files for the same four datasets.
  # NOTE(review): the order differs from meta_paths (HDTF is first there and
  # last here) -- confirm the loader does not pair the two lists by index.
  prompt_paths:
    - "/root/multimodal_dataset/animate_image/audio_to_face/part_01/processed_captions.json"
    - "/root/multimodal_dataset/animate_image/audio_to_face/part_02/processed_captions.json"
    - "/root/multimodal_dataset/animate_image/audio_to_face/part_03/processed_captions.json"
    - "/root/datasets/HDTF/processed_captions.json"
  flip_rate: 0.5
  sample_rate: 1
  num_frames: 16 # TODO: check this one
  reference_margin: 20
  num_padding_audio_frames: 2 # 2 # related to num_queries of AudioProjector # TODO: check this one
  num_prefix_frames: 4 # 4  # TODO: check this one
  lip_loss_weight: 10.0 # TODO: check this one
  bg_loss_weight: 1.0 # TODO: check this one
  audio_embeddings_type: "global" # {global, xlsr_global}
  # All three drop rates are 0, i.e. presumably no conditioning dropout -- confirm.
  reference_drop_rate: 0
  kps_drop_rate: 0
  faceid_drop_rate: 0
  negative_prompt: "Monochrome, lowres, blurry, bad anatomy, distortions, poor lighting, dull colors, off-theme elements, inappropriate content, unwanted artifacts, unsettling moods"
  # NOTE(review): same values as train_height / train_width above -- confirm
  # whether both pairs are read and must stay in sync.
  image_height: 512
  image_width: 512
  load_face_mask: false
  data_format: "video_only" # {video_only, image_only, video_image} # TODO: check this one

# Per-module training switches.
# NOTE(review): presumably true = module is optimized, false = module stays
# frozen -- confirm against the training script.
module_training:
  denoising_unet: false
  v_kps_guider: false
  t2i_adapter: false
  audio_projection: true # TODO: check this one
  motion_module: true # true or false # TODO: check this one
  ip_adapter: false

  # Presumably name filters selecting which motion-module parameters are
  # trained when motion_module is true -- confirm.
  motion_trainable_modules:
    - "motion_modules.0"
    - "motion_modules.2"

# Optimization settings: precision, step budget, learning-rate schedule, and
# Adam hyper-parameters.
solver:
  gradient_accumulation_steps: 1 # TODO: check this one
  mixed_precision: "fp16"
  denoising_unet_gradient_checkpointing: true
  max_train_steps: 300000
  max_grad_norm: 1.0
  # lr
  # Written with an explicit mantissa dot (1.0e-6, not 1e-6): YAML 1.1 loaders
  # such as PyYAML resolve a dot-less "1e-6" to a string instead of a float.
  # This also matches the notation of adam_weight_decay / adam_epsilon below.
  learning_rate: 1.0e-6 # TODO: check this one
  scale_lr: false
  # NOTE(review): if lr_scheduler is resolved by diffusers' get_scheduler, the
  # plain "constant" schedule ignores warmup steps ("constant_with_warmup" is
  # the variant that uses them) -- confirm the intent.
  lr_warmup_steps: 1
  lr_scheduler: "constant"

  # optimizer
  use_8bit_adam: true
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-8

# Validation settings.
# NOTE(review): presumably validation runs every `validation_steps` training
# steps; 20 is very frequent next to max_train_steps 300000 and
# checkpointing_steps 5000 -- confirm this is not a debugging leftover.
val:
  validation_steps: 20

# Keyword arguments for the noise scheduler (going by the key name; the
# scheduler class that receives them is not visible in this file).
noise_scheduler_kwargs:
  num_train_timesteps: 1000
  beta_start: 0.00085
  beta_end: 0.012
  # NOTE(review): the option list below is the original author's; verify which
  # values the scheduler actually accepts.
  beta_schedule: "linear" # linear, scaled_linear, cosine, logistic
  steps_offset: 1
  clip_sample: false

# Extra keyword arguments for the UNet (going by the key name): motion-module
# placement and attention settings.
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true # true or false

  # Motion-module layout.
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: "Vanilla"
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - "Temporal_Self"
      - "Temporal_Self"
    temporal_position_encoding: true # TODO: check this one
    # 32 is at least data.num_frames (16).
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  mm_zero_proj_out: false # REVIEW: check this one

  # Attention dimensions and weights.
  text_attention_dim: 768 # 768 or null
  cross_attention_dim: 768
  text_attention_weight: 1.0
  audio_attention_weight: 1.0

# Module Loading Configs
# Checkpoint and model locations. Most are absolute paths under /dockerdata or
# /root; t2i_adapter_model_path is relative, presumably to the launch
# directory -- confirm.
base_model_path: "/dockerdata/models/stable-diffusion-v1-5/"
denoising_unet_path: "/dockerdata/models/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin"
vae_model_path: "/dockerdata/models/sd-vae-ft-mse/"
audio_encoder_path: "/dockerdata/models/wav2vec2-base-960h/"
ip_ckpt: "/root/models/IP-Adapter/FaceID/ip-adapter-faceid-portrait_sd15.bin"
image_encoder_path: "/root/models/IP-Adapter/models/image_encoder/"
insightface_model_path: "/root/models/insightface_models/"

denoising_unet_state_dict_type: "new_attn" # {moore_pretrained, old_attn, new_attn}
# Empty strings: presumably "no checkpoint to load" for these modules -- confirm.
audio_projection_path: ""
v_kps_guider_path: ""
motion_module_path: "/root/models/animatediff/mm_sd_v15_v2.ckpt" # official AnimateDiff MotionModule Checkpoints
# NOTE(review): apply_t2i_adapter is false and module_training.t2i_adapter is
# False elsewhere in this file -- confirm whether this checkpoint is still loaded.
t2i_adapter_model_path: "exp_output/t2i_only-snr_gamma-noise_offset-sd15-wo_zeroinit/stage_1-t2i_only/t2i_adapter_openpose-300000.pth"  # TODO: check this one
t2i_adapter_control_type: "openpose"

# Module Configs
# Top-level model and training-behaviour knobs.
weight_dtype: "fp16" # one of: fp16, fp32
uncond_ratio: 0.1 # TODO: check this one
noise_offset: 0.05 # TODO: check this one
snr_gamma: 0.0 # TODO: check this one
enable_zero_snr: false # alternative: true
guidance_scale: 1.0
repeat_start: false # TODO: check this one
prefix_ratio: 0.95 # TODO: check this one

# Audio Projection
aud_proj_depth: 4 # TODO: check this one

# IP-Adapter
num_tokens: 16
n_cond: 1
ipa_scale: 1.0
ip_mode: "portrait"

# Residual Control
t2i_adapter_conditioning_scale: 1.0
v_kps_guider_zero_out: false # zero-out for finetuning the V-KPS guider # TODO: check this one
t2i_adapter_zero_out: false # zero-out for finetuning the T2I Adapter # TODO: check this one
disable_kps: true # TODO: check this one
apply_t2i_adapter: false # alternative: true # TODO: check this one


# Training Configs
seed: 12580
# Empty string: presumably "start a fresh run" -- confirm.
resume_from_checkpoint: ""
checkpointing_steps: 5000

# NOTE(review): the run name appears to encode settings from this file
# (motion_modules.0/.2, lip_loss_weight 10, num_prefix_frames 4,
# prefix_ratio 0.95, noise_offset 0.05, no T2I adapter) -- keep it in sync
# when those values change.
output_dir: "exp_output/motion_audio-modules0_2-lipsw_10-p_4_0.95-noff_0.05-wo_t2i"
