# Hydra settings: add the verl.trainer.config Python package to the config
# search path (presumably where the `ppo_trainer` config below is found).
hydra:
  searchpath:
    - pkg://verl.trainer.config

# Compose this file on top of ppo_trainer. `_self_` is listed last, so the
# values set in this file take precedence over those from ppo_trainer.
defaults:
  - ppo_trainer
  - _self_


# Custom reward hook: presumably `name` is the function looked up in the
# Python file at `path`.
# NOTE(review): `path` is relative — confirm which working directory it is
# resolved against at launch.
custom_reward_function:
  path: multi-turn-rl/bfcl_reward.py
  name: compute_score


# Dataset-side length budgets (presumably in tokens — TODO confirm).
data:
  # 8192 + 24576 = 32768, the same value as
  # actor_rollout_ref.rollout.max_model_len; keep the three in sync.
  max_prompt_length: 8192
  max_response_length: 24576
  return_raw_chat: true

# Overrides for the actor / rollout / reference section of ppo_trainer.
actor_rollout_ref:
  hybrid_engine: true
  rollout:
    # Rollout backend, selected by name.
    name: sglang
    multi_turn:
      enable: true
      # Cap on turns per rollout (exact counting rule is defined by the
      # consumer — TODO confirm whether it counts both roles).
      max_turns: 100
      # NOTE(review): relative path — confirm the launch working directory.
      interaction_config_path: "multi-turn-rl/config/multi_turn_fc_interaction_config.yaml"
      format: qwen
    # Equals data.max_prompt_length + data.max_response_length
    # (8192 + 24576); keep the three in sync.
    max_model_len: 32768
  actor:
    optim:
      # Written with an explicit mantissa dot so that plain YAML 1.1 loaders
      # (e.g. PyYAML) also resolve it as a float instead of the string "1e-6".
      lr: 1.0e-6
    # KL loss is disabled and its coefficient is zero; kl_loss_type is
    # presumably only consulted if use_kl_loss is turned back on.
    use_kl_loss: false
    kl_loss_coef: 0.0
    kl_loss_type: low_var_kl
    # Asymmetric clip range: the upper bound (0.28) is wider than the
    # lower bound (0.2).
    clip_ratio_low: 0.2
    clip_ratio_high: 0.28
    clip_ratio_c: 10.0
    grad_clip: 1.0
    entropy_coeff: 0.001

# Critic overrides: gradient clipping and value clip range.
# NOTE(review): algorithm.adv_estimator is `grpo` in this file — confirm
# whether a critic is actually trained with that estimator or whether these
# values are unused.
critic:
  grad_clip: 1.0
  cliprange_value: 0.5

# Advantage-estimation and KL-control settings.
algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo
  # KL-in-reward is disabled and kl_ctrl.kl_coef below is zero.
  use_kl_in_reward: false
  # NOTE(review): upstream verl appears to configure the policy-loss
  # aggregation mode under actor_rollout_ref.actor.loss_agg_mode — confirm
  # that this key under `algorithm` is actually read by the trainer in use.
  loss_agg_mode: "seq-mean-token-mean"
  kl_penalty: kl  # how to estimate KL divergence
  kl_ctrl:
    type: fixed
    kl_coef: 0.0

# Selects the reward manager by name.
# NOTE(review): `bfcl` presumably pairs with custom_reward_function
# (multi-turn-rl/bfcl_reward.py) — confirm a manager with this name is
# registered in the code base being launched.
reward_model:
  reward_manager: bfcl


# Trainer overrides.
trainer:
  critic_warmup: 0