# Model settings: checkpoint locations plus architecture/conditioning options.
model:
  # NOTE(review): presumably the number of image-prefix tokens — confirm.
  image_prefix_length: 64
  # NOTE(review): presumably the maximum text sequence length in tokens — confirm.
  max_text_seq_length: 226
  # Relative checkpoint paths; presumably resolved from the launch directory.
  backbone_weight: 'ckpt/NVILA-8B-Video/'
  cogvideo_weight: 'ckpt/CogVideoX-2b'
  add_reference_image: true
  last_clip_frame_num: 1
  # NOTE(review): presumably the probability of dropping the condition
  # during training — confirm against the training code.
  uncond_prob: 0.05
  pretrained_weight: 'ckpt/AnimeShooterGen/pretrained.bin'
  qformer_num_hidden_layers: 12

# Dataset settings: which video to use and how clips are shaped.
data:
  # Kept quoted so the identifier always loads as a string.
  # NOTE(review): looks like a video-hosting ID — confirm the expected format.
  video_id: '1dCd6hCRoaQ'
  max_num_frames: 49
  # NOTE(review): height/width are presumably in pixels — confirm.
  height: 480
  width: 720
  clip_num: 3
  # Relative path to the metadata JSON file.
  json_path: 'datasets/meta_info.json'

# Training-loop settings.
# NOTE(review): most key names mirror Hugging Face `TrainingArguments`
# fields and are presumably forwarded to it — confirm against the loader.
training:
  output_dir: "outputs"
  # Floats in exponent notation are written with a mantissa dot (1.0e-4):
  # YAML 1.1 loaders such as PyYAML resolve a bare `1e-4` as the *string*
  # "1e-4", not a float, which can break or silently mis-type the optimizer
  # configuration.
  learning_rate: 1.0e-4
  # NOTE(review): presumably a separate learning rate for the LLM
  # parameters — confirm how it is consumed.
  llm_learning_rate: 1.0e-4
  num_train_epochs: 100
  logging_steps: 5
  save_steps: 1000
  save_total_limit: 3
  warmup_steps: 0
  lr_scheduler_type: "constant"
  per_device_train_batch_size: 1
  dataloader_num_workers: 8
  max_grad_norm: 1.0
  optim: "adamw_torch"
  adam_beta1: 0.9
  adam_beta2: 0.98
  # Same mantissa-dot form as the learning rates so it loads as a float.
  adam_epsilon: 1.0e-6
  weight_decay: 0.05
  remove_unused_columns: false
  # Relative path to the DeepSpeed JSON configuration.
  deepspeed: "src/config/ds_config.json"
  bf16: true
  report_to: "tensorboard"
  save_safetensors: false

# Adapter settings, one sub-section per component.
# NOTE(review): key names match the fields of peft's `LoraConfig`;
# presumably each sub-mapping is forwarded to it — confirm.
peft:
  # Adapter for the language-model component.
  llm:
    enabled: true
    r: 8
    lora_alpha: 16
    lora_dropout: 0.1
    task_type: 'CAUSAL_LM'
  # Adapter for the CogVideo component.
  cogvideo:
    enabled: true
    # NOTE(review): lora_alpha / r is 0.5 here versus 2.0 for `llm` above —
    # confirm the difference is intentional.
    r: 16
    lora_alpha: 8
    init_lora_weights: true
    # Names of the modules the adapter is attached to.
    target_modules:
      - 'to_q'
      - 'to_k'
      - 'to_v'
      - 'to_out.0'
