# Maze-specific override config for VerK retry with GRPO + interaction.
# Keeps math config unchanged while using maze reward + maze interaction.

# Hydra search-path extension: adds the `verl/trainer/config` directory (via
# the file:// scheme) to the locations Hydra searches when resolving configs.
# NOTE(review): the path is relative — presumably resolved against the
# directory the job is launched from (repo root); confirm.
hydra:
  searchpath:
    - file://verl/trainer/config

# Defaults list: compose the `ppo_trainer` config first, then this file.
# Because `_self_` comes last, every value set in this file overrides the
# corresponding value from `ppo_trainer`.
# NOTE(review): `ppo_trainer` is presumably found through the hydra.searchpath
# entry in this file — confirm it lives in verl/trainer/config.
defaults:
  - ppo_trainer
  - _self_

# Reward function override for the maze task.
#   path: Python file that holds the reward function.
#   name: name of the function to use from that file.
# NOTE(review): the expected signature of `compute_score` is defined by the
# trainer, not by this file — verify against the reward loader.
custom_reward_function:
  path: examples/reward_fns/maze_path_reward.py
  name: compute_score

# Advantage estimation: select GRPO as the estimator.
algorithm:
  adv_estimator: grpo

# Dataset / sequence-length settings.
data:
  # NOTE(review): presumably required so the multi-turn rollout receives the
  # unprocessed chat messages — confirm against the dataset loader.
  return_raw_chat: true
  # Name of the dataset field that holds the prompt.
  prompt_key: prompt
  # Length limits for prompt and response.
  # NOTE(review): units are presumably tokens, and for multi-turn rollouts the
  # response limit presumably spans all turns — confirm.
  max_prompt_length: 512
  max_response_length: 2048

# Rollout settings: sglang backend with multi-turn interaction enabled.
actor_rollout_ref:
  rollout:
    # Rollout backend to use.
    name: sglang
    multi_turn:
      enable: true
      # Maze-specific interaction definition, kept in its own YAML file.
      interaction_config_path: examples/sglang_multiturn/config/interaction_config/ver_k_retry_interaction_config_maze.yaml
      # Per-rollout caps on assistant and user turns.
      # NOTE(review): presumably the user turns are the feedback messages
      # produced by the interaction between retries — confirm.
      max_assistant_turns: 4
      max_user_turns: 4

# Trainer settings: one epoch, console-only logging.
trainer:
  total_epochs: 1
  # Only the `console` logger is listed.
  logger: [console]
