# Hydra runtime settings.
hydra:
  # Additional config search path entry; presumably where the base config
  # named in `defaults` is found — confirm.
  searchpath:
    - 'file://verl/trainer/config'
  # Output directory for a single run, stamped with the launch date and time.
  run:
    dir: 'outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}'
  # Output directory for sweeps; uses the same pattern as run.dir.
  sweep:
    dir: 'outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}'

# Hydra defaults list; order is significant. `_self_` is listed after
# `ppo_trainer`, so values set in this file take precedence over the base.
defaults:
  - ppo_trainer
  - _self_

# Dataset and batching settings.
data:
  max_prompt_length: 2048
  # 30 * 1024; sized for multi-turn tool interactions.
  # NOTE(review): the unit is presumably tokens rather than bytes — confirm.
  max_response_length: 30720
  train_batch_size: 128
  val_batch_size: 16
  return_raw_chat: true
  # Custom dataset class, referenced by a relative file path and class name.
  custom_cls:
    path: 'recipe/fileagent/rl_dataset.py'
    name: CustomRLHFDataset

# Actor / rollout / reference settings.
actor_rollout_ref:
  hybrid_engine: true
  rollout:
    name: vllm
    mode: async
    # Multi-turn (tool-calling) conversation settings.
    multi_turn:
      enable: true
      # Use Hermes tool call format (default).
      format: hermes
      max_assistant_turns: 15
      # 20 * 1024 (noted as "20KB" by the original author).
      max_tool_response_length: 20480
      tool_config_path: 'recipe/fileagent/config/tool/extracted_bench_tool.yaml'
    # Agent loop settings.
    agent:
      agent_loop_config_path: 'recipe/fileagent/config/agent_loop.yaml'
      num_workers: 8

# Custom reward function using LLM Judge.
custom_reward_function:
  # Relative path, matching the other recipe/fileagent paths in this file.
  # Override on the command line if an absolute location is required, e.g.
  #   custom_reward_function.path=/abs/path/to/reward_score.py
  path: "recipe/fileagent/reward_score.py"
  name: compute_score
  # Settings for the LLM judge. All default to null, so they presumably
  # must be supplied per run (e.g. via Hydra overrides) — confirm how
  # compute_score handles null values.
  reward_kwargs:
    llm_judge_ip: null
    llm_judge_port: null
    llm_judge_model_name: null

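# Optional (disabled): use the batch reward manager for efficiency.
# reward_model:
#   reward_manager: batch

# Optional (disabled): minimal custom reward function without reward_kwargs.
# Do not uncomment this alongside the active custom_reward_function block
# above; duplicate top-level keys are invalid YAML.
# custom_reward_function:
#   path: "recipe/fileagent/reward_score.py"
#   name: compute_score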
