# Defaults list (looks like Hydra-style config composition): pulls in the
# `base` config. How it merges with the keys in this file depends on the
# loader's composition order — TODO confirm.
defaults:
  # Per the original note, `base` is a symbolic link to the
  # verl/verl/trainer/config/ppo_trainer.yaml file.
  # NOTE(review): verify the link target; it cannot be checked from this file.
  - base

# Which model this run uses and how it is driven.
model_config:
  # Presumably an alias looked up under `model_info` (a `gpt-4o` entry exists
  # there) — TODO confirm against the consuming code.
  model_name: gpt-4o
  # NOTE(review): the name suggests an upper bound on parallel requests;
  # confirm in the consumer.
  max_concurrency: 8

# Model registry keyed by a short alias. Each entry gives:
#   provider_name      - which API provider serves the model
#   model_name         - the provider-side model identifier
#   generation_kwargs  - sampling/length settings (presumably forwarded to the
#                        provider's completion call — TODO confirm)
model_info:
  Qwen2.5-7B-Instruct:
    provider_name: together
    model_name: Qwen/Qwen2.5-7B-Instruct-Turbo
    generation_kwargs:
      temperature: 0
      max_tokens: 512
  Qwen2.5-72B-Instruct:
    provider_name: together
    model_name: Qwen/Qwen2.5-72B-Instruct-Turbo
    generation_kwargs:
      temperature: 0
      max_tokens: 512
  claude-3.7:
    provider_name: anthropic
    model_name: claude-3-7-sonnet-20250219
    generation_kwargs:
      temperature: 0
      max_tokens: 512 # output-length cap; the max_completion_tokens key is used only by the o1-mini/o3-mini entries
  gpt-4o:
    provider_name: openai
    model_name: gpt-4o
    generation_kwargs:
      temperature: 0
      max_tokens: 512 # the o1-mini/o3-mini entries use max_completion_tokens instead of max_tokens
  o1-mini:
    provider_name: openai
    model_name: o1-mini
    generation_kwargs:
      temperature: 1 # o1-mini does not support temperature = 0
      max_completion_tokens: 4096 # budget includes thinking tokens
  # Same generation settings as the o1-mini entry.
  o3-mini:
    provider_name: openai
    model_name: o3-mini
    generation_kwargs:
      temperature: 1
      max_completion_tokens: 4096

# Environment-state manager settings for the validation split.
es_manager:
  val:
    env_groups: 8
    # Keep at 1: validation temperature is set to 0, so the same prompt
    # leads to the same output.
    group_size: 1
    env_configs:
      tags:
        - SimpleSokoban
      # Number of groups per tag. If not set, the groups are divided equally
      # among all env names. Within one group, the env config and env seed
      # (prompt) are equal in each generation.
      n_groups:
        - 8


