# Add the verl.trainer.config Python package to Hydra's config search path
# (presumably where the `ppo_trainer` base config below is resolved from).
hydra:
  searchpath:
    - pkg://verl.trainer.config

# Compose on top of `ppo_trainer`. `_self_` is listed last, so values set in
# this file take precedence over those inherited from `ppo_trainer`.
defaults:
  - ppo_trainer
  - _self_


# Reward scoring hook: the function `compute_score` in the given Python file.
# NOTE(review): the path is relative — presumably resolved against the launch
# working directory; confirm before running from another directory.
custom_reward_function:
  path: multi-turn-rl/bfcl_reward.py
  name: compute_score


# Dataset-side length limits and chat handling.
data:
  # Combined budget: 22767 + 10000 = 32767, one below
  # actor_rollout_ref.rollout.max_model_len (32768).
  # NOTE(review): presumably both limits are token counts — confirm.
  max_prompt_length: 22767
  max_response_length: 10000
  return_raw_chat: true

# Actor / rollout / reference settings layered over the `ppo_trainer` base.
actor_rollout_ref:
  hybrid_engine: true
  rollout:
    name: sglang
    multi_turn:
      enable: true
      max_turns: 100
      interaction_config_path: "multi-turn-rl/config/multi_turn_fc_interaction_config.yaml"
      format: qwen
    # Leaves room for data.max_prompt_length + data.max_response_length
    # (22767 + 10000 = 32767).
    max_model_len: 32768
  actor:
    optim:
      # Written with an explicit mantissa dot: plain YAML 1.1 loaders such as
      # PyYAML read a bare `1e-6` as a string rather than a float.
      lr: 1.0e-6
      lr_warmup_steps: 10
      weight_decay: 0.1
    use_kl_loss: false
    entropy_coeff: 0.0
    # Asymmetric clipping: the upper bound (0.28) is wider than the lower (0.2).
    clip_ratio_low: 0.2
    clip_ratio_high: 0.28
    clip_ratio_c: 10

# Advantage estimation and batch-filtering settings.
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo
  use_kl_in_reward: false
  # NOTE(review): confirm the trainer actually reads `loss_agg_mode` from the
  # `algorithm` section; if it is only consumed under
  # `actor_rollout_ref.actor`, this entry has no effect.
  loss_agg_mode: "token-mean"
  filter_groups:
    enable: true
    metric: seq_reward
    # Non-positive values mean no upper limit.
    max_num_gen_batches: 10

# Reward manager selection and the overlong-response buffer.
reward_model:
  reward_manager: bfcl
  # Overlong buffer is switched off; `len` and `penalty_factor` are zeroed
  # alongside it (presumably unused while `enable` is false — confirm).
  overlong_buffer:
    enable: false
    len: 0
    penalty_factor: 0.0
    log: false