# Run-wide settings: input assets, seed, debug flag, and output locations.
general:
  # Placeholder: replace with the path to the .gb cartridge file.
  cartridge_path: '<path to .gb file>'
  # Placeholder: replace with the path to the level state.
  level_path: '<path to level state>'
  seed: 0
  debug: false
  # Placeholders: replace with real output locations before running.
  save_path: '<save_path>'
  log_dir: '<log_dir>'
  # NOTE(review): the element order of the crop boxes is not defined in this
  # file (presumably pixel coordinates) -- confirm against the consumer.
  video_crop_box: [0, 0, 640, 330]
  gb_crop_box: [0, 0, 160, 120]

# Settings for the VOC reward signal.
voc:
  # Number of frames to keep for the VOC calculation.
  buffer_size: 15
  # How often the reward is given.
  reward_steps: 15
  # VOC is averaged over n_repeats to decrease variance.
  n_repeats: 2
  # Cut-off threshold: a VOC below it is replaced by penalty_value.
  threshold: -1
  # Value assigned when VOC is below threshold.
  penalty_value: -1
  # Whether the reward model is independent from the agent, or whether they
  # are the same model at training time.
  use_different_reward_model: true
  # Selects between the default qwen-2.5-7b-vl and a pretrained version.
  # NOTE(review): presumably false means the default model -- confirm.
  use_pretrained: false
  use_voc: true
  # Whether to give a reward every step (see the formulas in item 8 of the
  # Notion plan).
  use_dense_rewards: false
  eta: 0.1
  gamma: 0.997
  # Number of frames to use from the context video; a value below 0 means no
  # context is used.
  context_len: -1
  # Whether to shuffle the context frames.
  shuffle_context: false
  # Whether to use percentage in the context.
  use_percentage: false
  # Path to the context video.
  context_path: null
  # Offset to start from in the context video.
  context_offset: 30
  # v1 uses the percentage extractor; v2 uses the rank extractor.
  prompt_version: 'v1'

# Environment selection and the step/episode budgets read by the consumer.
# NOTE(review): the exact meaning of each budget is defined by the consuming
# code, not by this file.
environment:
  env_name: 'catrap'
  max_episode_steps: 64
  max_image_obs_len: 5
  num_processes: 1
  num_env_steps: 60000
  num_steps: 256
  eval_num_per_episode: 5

# Optimizer and PPO hyperparameters.
training:
  # Scientific-notation values carry an explicit mantissa dot (1.0e-5, not
  # 1e-5): YAML 1.1 loaders such as PyYAML resolve the dotless form as a
  # string instead of a float. The dotted form is a float under every loader.
  init_lr: 1.0e-5
  end_lr: 5.0e-7
  weight_decay: 0
  eps: 1.0e-7
  gamma: 0.99
  use_gae: true
  gae_lambda: 0.95
  entropy_coef: 0.01
  value_loss_coef: 0.15
  max_grad_norm: 1
  ppo_epoch: 2
  grad_accum_steps: 32
  mini_batch_size: 1
  clip_param: 0.1
  lr_max_steps: 50
  save_interval: 5
  use_proper_time_limits: false
  kl_beta: 0.05
  # KL control parameters.
  # NOTE(review): presumably kl_ctl toggles the controller and the three keys
  # below configure it -- confirm against the consumer.
  kl_ctl: false
  target_kl: 50
  kl_horizon: 8
  # NOTE(review): kl_beta_lb (10) is far larger than kl_beta (0.05); if "lb"
  # means "lower bound", confirm this value is intended.
  kl_beta_lb: 10
  # These two are the quoted strings "yes", not YAML booleans. Keep the quotes:
  # presumably the consumer compares against the literal string -- verify
  # before converting them to true/false.
  value_warmup: "yes"
  use_kl: "yes"

# LoRA adapter settings.
peft:
  r: 32
  lora_alpha: 64
  lora_dropout: 0.1
  # Names of the modules the adapter is applied to.
  target_modules:
    - 'q_proj'
    - 'k_proj'
    - 'v_proj'
    - 'o_proj'
    - 'gate_proj'
    - 'down_proj'
    - 'up_proj'
  # The string 'none', not a YAML null.
  bias: 'none'

# Model checkpoint locations and generation settings.
model:
  # Placeholder: replace with the model path.
  model_path: '<model path>'
  # Placeholder: replace with the pretrained checkpoint path.
  lora_path: '<pretrained chkpt path>'
  cache_dir: null
  train_vision: 'all'
  max_new_tokens: 1536
  temperature: 0.2
  thought_prob_coef: 0.2
  # The quoted string 'yes', not a YAML boolean; keep the quotes.
  stop_grad: 'yes'

# Weights & Biases logging.
wandb:
  use_wandb: true
  # NOTE(review): looks like a placeholder -- replace with the real project.
  wandb_project: 'WANDB PROJECT'
  # Run name built from ${section.key} references to other values in this
  # file. NOTE(review): these presumably rely on the loader's interpolation
  # support (OmegaConf-style); a plain YAML loader keeps them as literal text.
  wandb_run: 'qwen2.5-7b-vl-rew_len-${voc.reward_steps}-n_rep_voc-${voc.n_repeats}-seed-${general.seed}-temp-${model.temperature}-ppo_epoch-${training.ppo_epoch}-g-${training.gamma}-l-${training.gae_lambda}'

# Miscellaneous boolean flags. Their semantics are defined by the consuming
# code and cannot be determined from this file alone.
misc:
  action_only_prompt: false
  freeze_policy_model_on_first_update: true