# Configuration for Math single-agent validation with ONE policy
# The only agent (tool_agent) is backed by model_0 (Qwen3-4B-Instruct-2507)
# Top-level run switches.
# NOTE(review): the accepted values are defined by the code that consumes this
# file, which is not visible here — confirm before changing any of them.
mode: validate
sample_mode: tree
enable_thinking: false
# Hydra defaults list: other config files composed into this one.
defaults:
  # Mount the `eval` option of the `ppo_trainer` config group at
  # models.model_0.ppo_trainer_config, giving the model its own trainer subtree
  # that can be overridden independently.
  - ppo_trainer@models.model_0.ppo_trainer_config: eval
  # `_self_` is listed last, so values set in this file take precedence over
  # the composed defaults above.
  - _self_
# Benchmark identifier (top-level key).
benchmark: MATH500
# Dataset and sampling settings shared by all agents.
data:
  # Sample filtering.
  # NOTE(review): semantics are defined by the consumer — confirm before tuning.
  filter_method: mean
  filter_ratio: 0.2

  # Generation settings. gen_n_samples and sample_temperature are forwarded to
  # model_0's rollout settings through ${data.*} interpolations.
  gen_batch_size: 64
  gen_n_samples: 5
  sample_temperature: 1

  # Scheduling frequencies and epoch size.
  val_freq: 10
  resample_freq: 3
  epoch_size: 20

  # Batch sizes
  train_batch_size: 64
  val_batch_size: 32

  # Sequence lengths; also forwarded to model_0's ppo_trainer_config through
  # ${data.*} interpolations.
  max_prompt_length: 16384
  max_response_length: 8192

# Hardware layout. n_gpus_per_node is reused elsewhere in this file through
# ${resource.n_gpus_per_node} (trainer section and model_0's rollout/trainer
# settings), so changing it here changes all of those at once.
resource:
  nnodes: 1
  n_gpus_per_node: 8
  trust_remote_code: true


# Environment configuration (shared by all agents)
env:
  name: math_env
  # Follow the top-level `benchmark` key instead of carrying a second literal,
  # so the environment and the run-level benchmark cannot drift apart.
  benchmark: ${benchmark}
  max_turns: 3
  resolve: false
  multi_modal: false
  batched_init: true
# Top-level flag (not part of `env`).
# NOTE(review): presumably toggles DAPO-style behavior — confirm against the consumer.
if_dapo: true

# Multi-agent interaction configuration
multi_agent_interaction:
  # Turn order for agents (list of agent names)
  turn_order: ["tool_agent"]

  # Number of agents that interact per episode. Kept equal to the length of
  # turn_order; only one agent is configured in this file.
  num_interacting_agents: 1

  # Whether agents can see other agents' actions
  shared_observation: true

# Model definitions. Only model_0 is declared in this file.
models:
  model_0:
    # Local checkpoint directory (machine-specific absolute path).
    path: '/home/lah003/models/Qwen3-4B-Instruct-2507'
    #path: "/home/lah003/models/Qwen3-4B"
    # agent_policy_configs.agent_configs.agent_0.policy_name uses this same string.
    name: reasoning_agent_model
    # Values below are layered on top of the `ppo_trainer` defaults that the
    # defaults list mounts at this node.
    ppo_trainer_config:
      data:
        max_prompt_length: ${data.max_prompt_length}
        max_response_length: ${data.max_response_length}
      actor_rollout_ref:
        model:
          path: ${models.model_0.path}
        rollout:
          n: ${data.gen_n_samples}
          temperature: ${data.sample_temperature}
          prompt_length: ${data.max_prompt_length}
          response_length: ${data.max_response_length}
          tensor_model_parallel_size: ${resource.n_gpus_per_node}
        # NOTE(review): `trainer` is nested under `actor_rollout_ref` here, not
        # beside it — confirm this is the location the consumer reads.
        trainer:
          n_gpus_per_node: ${resource.n_gpus_per_node}
          n_training_gpus_per_node: ${resource.n_gpus_per_node}
          # Resolves against the top-level `trainer` section of this file.
          default_local_dir: ${trainer.default_local_dir}
 
# Run identification and logging backends (top level).
# NOTE(review): trainer.experiment_name is set separately to a different value
# (math_single_policy) — confirm which key the consumer reads.
project_name: pettingllms
experiment_name: math_single_agent
logger:
  - console
  - wandb

# Agent-to-policy wiring. A single agent is configured.
agent_policy_configs:
  # Number of agents
  num_agents: 1
  # NOTE(review): this entry matches neither agent_0.name ("tool_agent") nor
  # agent_0.policy_name ("reasoning_agent_model") — confirm the expected value.
  policy_list:
    - reasoning_agent
  agent_configs:
    agent_0:
      # Same string as the entry in multi_agent_interaction.turn_order.
      name: tool_agent
      # Same string as models.model_0.name.
      policy_name: reasoning_agent_model
      sample_num: 4


# Top-level trainer settings. models.model_0 reads default_local_dir from here
# through ${trainer.default_local_dir}.
trainer:
  device: cuda
  # Cluster shape follows the `resource` section so the two cannot diverge.
  n_gpus_per_node: ${resource.n_gpus_per_node}
  nnodes: ${resource.nnodes}
  balance_batch: true
  total_epochs: 1
  total_training_steps: 400
  project_name: pettingllms
  # NOTE(review): the top-level experiment_name key holds a different value
  # (math_single_agent) — confirm which one the consumer reads.
  experiment_name: math_single_policy
  logger: ['console', 'wandb']
  log_val_generations: 0
  rollout_data_dir: null
  validation_data_dir: null
  # Checkpointing and resume behavior
  save_freq: 40
  resume_mode: auto
  resume_from_path: null
  val_before_train: true
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/pettingllms/math_single_policy
  max_actor_ckpt_to_keep: 3
  max_critic_ckpt_to_keep: null
  ray_wait_register_center_timeout: 300
  npu_profile:
    options: {}
  rejection_sample: false
  rejection_sample_multiplier: 2
  n_training_gpus_per_node: ${resource.n_gpus_per_node}
