# Pretrained base components (paths are relative to the working directory).
pretrained_base_model_path: './pretrained_weights/stable-diffusion-v1-5/'
pretrained_vae_path: './pretrained_weights/sd-vae-ft-mse'
image_encoder_path: './pretrained_weights/sd-image-variations-diffusers/image_encoder'

# Checkpoints from the "stage1_rr_3" run (files tagged 120000).
denoising_unet_path: './exp_output/stage1_rr_3/denoising_unet-120000.pth'
reference_unet_path: './exp_output/stage1_rr_3/reference_unet-120000.pth'
pose_guider_path: './exp_output/stage1_rr_3/pose_guider-120000.pth'

# Checkpoints from the "stage2_rr_3" run (files tagged 30000).
adapter_path: './exp_output/stage2_rr_3/adapter-30000.pth'
motion_module_path: './exp_output/stage2_rr_3/motion_module-30000.pth'
# Alternative checkpoint set ("stage1_100000" / "stage2" runs), currently disabled.
# To switch, uncomment these lines and comment out the active entries of the same name.
# denoising_unet_path: "./exp_output/stage1_100000/denoising_unet-100000.pth"
# reference_unet_path: "./exp_output/stage1_100000/reference_unet-100000.pth"
# pose_guider_path: "./exp_output/stage1_100000/pose_guider-100000.pth"
# adapter_path: "./exp_output/stage2/adapter-10000.pth"
# motion_module_path: "./exp_output/stage2/motion_module-10000.pth"

# Precision label for the model weights. Presumably 'fp16' selects half
# precision in the consuming script -- TODO confirm the accepted values there.
weight_dtype: 'fp16'

# Extra keyword arguments for the UNet.
# NOTE(review): presumably forwarded to the denoising UNet constructor by the
# consuming script -- confirm against the loader before renaming any key.
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  # Motion-module settings; use_motion_module is the on/off switch.
  use_motion_module: true
  # Presumably the resolution levels at which motion modules are inserted --
  # TODO confirm the exact meaning of these factors in the UNet code.
  motion_module_resolutions:
  - 1
  - 2
  - 4
  - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  # Options for the motion module of the type selected above.
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    # Two entries, both temporal self-attention.
    attention_block_types:
    - Temporal_Self
    - Temporal_Self
    temporal_position_encoding: true
    # Presumably the maximum number of frames the positional encoding covers --
    # verify against the motion-module implementation.
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1

# Keyword arguments for the noise scheduler.
# NOTE(review): the key names match the Hugging Face diffusers scheduler config
# (e.g. DDIMScheduler) -- confirm which scheduler class the script instantiates.
noise_scheduler_kwargs:
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: 'linear'
  clip_sample: false
  steps_offset: 1
  # Zero-SNR params: these three settings are grouped together here and are
  # presumably meant to be changed as a set -- verify before editing one alone.
  prediction_type: 'v_prediction'
  rescale_betas_zero_snr: true
  timestep_spacing: 'trailing'