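# Minimal override-style config: only the keys set here are layered on top of the base config.
# If your repo already has a gsm8k_multiturn_grpo config, you can include it through the `defaults` list instead.
# Otherwise, supply the remaining settings as command-line overrides in your launch shell script.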

# Hydra's own settings. `searchpath` adds extra locations where Hydra looks
# for configs, so that the entries named in `defaults` can be found in verl's
# trainer config directory.
# NOTE(review): the file:// path is relative — presumably resolved against the
# directory the job is launched from; confirm it is run from the repo root.
hydra:
  searchpath:
    - file://verl/trainer/config

# Base configs to compose, in order. Because `_self_` is listed last, the
# values set in this file override those coming from `ppo_trainer`.
defaults:
  # Base trainer config; it must be resolvable through hydra.searchpath.
  # Adjust the name if your verl checkout ships it under a different one.
  - ppo_trainer
  - _self_

# Custom reward function: `name` is the function to use and `path` is the
# Python file that defines it.
custom_reward_function:
  name: compute_score
  path: examples/reward_fns/boxed_answer_reward.py

# Advantage estimator: GRPO.
# NOTE(review): GRPO compares several sampled responses per prompt, and this
# file does not set the number of samples per prompt
# (actor_rollout_ref.rollout.n) — confirm it is set to more than 1 elsewhere,
# e.g. in the launch overrides.
algorithm:
  adv_estimator: grpo

data:
  # Dataset field that holds the prompt (a typical RLVR key).
  prompt_key: prompt
  # Length caps that keep each episode bounded.
  max_prompt_length: 512
  max_response_length: 2048
  # Required: the official docs example for interaction-driven multi-turn
  # rollout needs this set to true.
  return_raw_chat: true

actor_rollout_ref:
  rollout:
    # Rollout engine, selected by name.
    name: sglang
    multi_turn:
      enable: true
      # Turn budget per episode: K attempts = K assistant turns, and the
      # verifier/user may give feedback after each attempt.
      max_assistant_turns: 4
      max_user_turns: 4
      # Interaction config file used by the multi-turn rollout.
      interaction_config_path: examples/sglang_multiturn/config/interaction_config/ver_k_retry_interaction_config.yaml

# Trainer settings: a single epoch, with `console` as the only logger.
trainer:
  logger:
    - console
  total_epochs: 1
