# Keyword arguments grouped for the UNet. Presumably forwarded to the model
# constructor by the loading code -- TODO confirm; the consumer is not
# visible from this file.
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false

  # Motion-module switches and placement.
  use_motion_module: true
  motion_module_resolutions: [1, 2, 4, 8]
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla  # options: Vanilla, AnimateDiff
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types: [Temporal_Self, Temporal_Self]
    temporal_position_encoding: true
    # REVIEW: short: 32; long: 64; longest: 128
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
    motion_scale: 1.0

  # Attention dimensions and per-modality weights.
  text_attention_dim: 768  # options: 768, null
  cross_attention_dim: 768
  text_attention_weight: 1.0
  audio_attention_weight: 1.0

# Noise-scheduler settings. The `default:` notes are the author's own
# annotations of the reference values, not something enforced by this file.
noise_scheduler_kwargs:
  beta_start: 0.00085  # default: 0.00085
  beta_end: 0.012  # default: 0.012
  # REVIEW: options are linear, scaled_linear, squaredcos_cap_v2
  beta_schedule: 'linear'
  steps_offset: 1  # default: 1

  # DDIM only
  clip_sample: false  # default: false
  set_alpha_to_one: true  # default: true

  # Zero-SNR params (REVIEW) -- currently disabled; uncomment to enable.
  # prediction_type: "v_prediction"
  # rescale_betas_zero_snr: true  # default: false
  # timestep_spacing: "trailing"  # default: "leading"

# Sampler selection; the author lists DDIM, PNDM and KDPM as the options.
sampler: DDIM  # DDIM, PNDM, KDPM
# Canonical lowercase boolean, matching the other booleans in this file;
# capitalised `False` is not recognised as a boolean by every YAML schema.
init_ip_attn: false
