defaults:
  # Hydra defaults list. Select the `eval` option of the `ppo_trainer` config
  # group and mount it at models.model_0.ppo_trainer_config, so each model gets
  # its own trainer subtree that can be overridden independently.
  - ppo_trainer@models.model_0.ppo_trainer_config: eval
  # `_self_` is listed last, so values written in this file take precedence
  # over the injected defaults.
  - _self_
# Sampling mode selector. NOTE(review): the accepted values are defined by the
# consuming code, which is not visible in this file.
sample_mode: "tree"
# Dataset / sampling settings (shared by all agents)
data:
  # Sequence lengths; reused via interpolation by
  # models.model_0.ppo_trainer_config (data.* and rollout prompt/response
  # lengths).
  max_prompt_length: 2048
  max_response_length: 2048

  # Batch sizes
  train_batch_size: 64
  val_batch_size: 32
  gen_batch_size: 32

  # Generation settings; forwarded to rollout.n and rollout.temperature of
  # models.model_0.
  gen_n_samples: 4
  sample_temperature: 1

  # Sample filtering
  filter_method: mean
  filter_ratio: 0.5

  # Frequencies and epoch size
  val_freq: 10
  resample_freq: 3
  epoch_size: 20

# Hardware resources. n_gpus_per_node is reused via interpolation for
# models.model_0's rollout tensor_model_parallel_size and its trainer GPU
# counts (n_gpus_per_node, n_training_gpus_per_node).
resource:
  nnodes: 1
  n_gpus_per_node: 4
  trust_remote_code: true

# Environment configuration (shared by all agents)
env:
  name: alfworld_env
  benchmark: "ALFWorld"
  # Upper bound on turns — presumably per episode; confirm against the
  # environment implementation.
  max_turns: 20
  resolve: false
  multi_modal: false
  batched_init: true

# Multi-agent interaction configuration
multi_agent_interaction:
  # Agent names, listed in the order they take turns
  turn_order:
    - "alfworld_agent"

  # Number of agents that interact per episode
  num_interacting_agents: 1

  # Whether agents can see other agents' actions
  shared_observation: true

# Shared model configurations
models:
  model_0:
    # Checkpoint location. Read from the PETTINGLLMS_MODEL_PATH environment
    # variable when it is set; otherwise falls back to the previous hard-coded
    # local path. It can also be overridden on the command line
    # (models.model_0.path=...).
    path: "${oc.env:PETTINGLLMS_MODEL_PATH,'/home/lah003/models/Qwen3-4B'}"
    name: "alfworld_agent_model"
    # The base ppo_trainer_config is injected through the defaults list; the
    # keys below override the injected values.
    ppo_trainer_config:
      data:
        max_prompt_length: ${data.max_prompt_length}
        max_response_length: ${data.max_response_length}
      actor_rollout_ref:
        model:
          path: ${models.model_0.path}
        rollout:
          n: ${data.gen_n_samples}
          temperature: ${data.sample_temperature}
          prompt_length: ${data.max_prompt_length}
          response_length: ${data.max_response_length}
          tensor_model_parallel_size: ${resource.n_gpus_per_node}
        # NOTE(review): this `trainer` mapping is nested under
        # actor_rollout_ref, while the root-level `trainer` key points at
        # ppo_trainer_config.trainer — confirm the nesting is intended.
        trainer:
          n_gpus_per_node: ${resource.n_gpus_per_node}
          n_training_gpus_per_node: ${resource.n_gpus_per_node}
          # Resolved through the root-level `trainer` key.
          default_local_dir: ${trainer.default_local_dir}

# Agent-to-policy mapping (a single agent and a single policy are configured)
agent_policy_configs:
  # Number of agents to train
  # NOTE(review): set to 2, but policy_list and agent_configs each contain one
  # entry and multi_agent_interaction.num_interacting_agents is 1 — confirm
  # whether this should be 1.
  num_agents: 2
  policy_list: ["alfworld_agent"]
  agent_configs:
    agent_0:
      name: "alfworld_agent"
      # Same value as models.model_0.name
      policy_name: "alfworld_agent_model"
      sample_num: 8

# Experiment identification and logger list
project_name: pettingllms
experiment_name: alfworld_agent
logger:
  - 'console'
  - 'wandb'

# Root-level alias (OmegaConf interpolation) of model_0's trainer subtree.
# Kept on a single line so the value is not an accidental multi-line scalar.
trainer: ${models.model_0.ppo_trainer_config.trainer}
 