# Run-wide settings. These keys are read by code outside this file; notes
# marked "presumably" are a reviewer's reading and should be confirmed.
general:
  # Looks like a placeholder value -- replace with a real path before running.
  save_path: '<save_path>'
  # Presumably enables the LoRA settings in the `peft` section -- TODO confirm.
  use_peft: true
  save_last: false
  target_VOC: 0.8
  # Interpolated into grpo.run_name as the `lvl-...` component.
  level: 2

# Evaluation settings (read by code outside this file).
eval:
  n_frames: 15
  num_shuffles: 10
  # NOTE(review): -1 looks like a sentinel value; confirm its meaning with the
  # code that reads it.
  n_frames_ref: -1
  # Both lists are empty here; presumably filled in per experiment -- confirm.
  eval_videos: []
  eval_offsets: []

# Training-video settings (read by code outside this file).
video:
  # Empty here; presumably filled in per experiment -- confirm.
  video_paths: []
  # Same value as eval.n_frames (15).
  # NOTE(review): confirm whether the two are required to match.
  num_frames_query: 15
  offsets: []
  # Interpolated into grpo.run_name (second component, after the level).
  reward_type: 'VOC'
  rollout_size: 128

# Model location. The value looks like a placeholder -- replace it with a real
# path or identifier before running.
model:
  model_path: '<model_path>'

# Adapter settings. The key names match the fields of PEFT's LoraConfig;
# presumably they are forwarded to it when general.use_peft is true -- TODO
# confirm against the loading code.
peft:
  # r and lora_alpha are interpolated into grpo.run_name
  # (`lora-${peft.r}-${peft.lora_alpha}`).
  r: 32
  lora_alpha: 64
  lora_dropout: 0.1
  target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - down_proj
    - up_proj
  bias: 'none'

# GRPO trainer settings. Most key names match fields of TRL's GRPOConfig /
# transformers' TrainingArguments and are presumably forwarded to it -- TODO
# confirm. kl_ctl, kl_beta_lb, target_kl and horizon do not look like stock
# trainer fields; presumably they are consumed by project code -- confirm.
grpo:
  # NOTE(review): a literal directory name, unlike the '<...>' placeholders
  # used for general.save_path and model.model_path -- confirm it is intended.
  output_dir: "output_dir"

  # --- optimisation ---
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 16
  gradient_checkpointing: true
  # Written with an explicit decimal point: a bare `1e-5` is loaded as the
  # string '1e-5' by YAML 1.1 resolvers (e.g. stock PyYAML), whose float
  # pattern requires a '.'. `1.0e-5` is a float for every loader.
  learning_rate: 1.0e-5
  warmup_steps: 10
  lr_scheduler_type: "constant_with_warmup"
  # Appears in run_name as the `mu-...` component.
  num_iterations: 1
  max_steps: 400
  report_to: "wandb"
  bf16: true

  # --- generation ---
  max_prompt_length: null
  max_completion_length: 2048
  num_generations: 4
  max_grad_norm: 1

  # --- KL settings ---
  # kl_ctl appears in run_name as `akl-...`, kl_beta_lb as `lb-...`.
  kl_ctl: true
  kl_beta_lb: 0.01
  target_kl: 0.1
  horizon: 8

  seed: 54
  temperature: 1.0
  beta: 0.05
  top_p: 0.9

  # --- checkpointing ---
  save_strategy: "steps"
  save_steps: 50
  # sync_ref_model is false here; ref_model_sync_steps presumably only matters
  # when it is true -- confirm.
  sync_ref_model: false
  ref_model_sync_steps: 100
  save_only_model: true

  # --- evaluation and logging ---
  eval_strategy: "steps"
  eval_steps: 50
  logging_strategy: "steps"
  logging_steps: 1
  log_completions: false

  # Run name assembled from other keys via ${section.key} references (looks
  # like OmegaConf-style interpolation; resolved by the loader, not by YAML).
  run_name: 'lvl-${general.level}-${video.reward_type}-mu-${grpo.num_iterations}-num_gen-${grpo.num_generations}-akl-${grpo.kl_ctl}-lb-${grpo.kl_beta_lb}-temp-${grpo.temperature}-beta-${grpo.beta}-lr-${grpo.learning_rate}-lora-${peft.r}-${peft.lora_alpha}-seed-${grpo.seed}'
