hydra:
  searchpath:
    - file://verl/trainer/config

defaults:
  - ppo_trainer.yaml
  - _self_

data:
  gen_batch_size: ${data.train_batch_size}

reward_model:
  reward_manager: dapo
  overlong_buffer: 
    enable: False # We try to avoid forgetting to set enable
    len: 0
    penalty_factor: 0.0
    log: False

algorithm:
  filter_groups:
    _target_: verl.trainer.config.FilterGroupsConfig
    enable: False # We try to avoid forgetting to set enable
    metric: null # acc / score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 0 # Non-positive values mean no upper limit

trainer:
  project_name: verl-dapo

actor_rollout_ref:
  actor:
    loss_agg_mode: token-mean
