# @package _global_

# Reward settings: a sandbox_fusion endpoint plus the reward manager name.
reward_model:
  sandbox_fusion:
    # Endpoint is on this machine (localhost, port 8080); the service has to
    # be reachable there, this file does not start it.
    url: "http://localhost:8080/run_code"
    # NOTE(review): presumably the cap on simultaneous sandbox requests —
    # confirm against the consumer.
    max_concurrent: 128
  # Same value as trainer.self_play_solver_reward further down in this file.
  reward_manager: prime

# Algorithm overrides: `gae` advantage estimator, KL-in-reward switched off.
algorithm:
  # NOTE(review): gae presumably relies on value estimates; a `critic`
  # section is configured in this file — confirm the two are used together.
  adv_estimator: gae
  # Off here; actor_rollout_ref.actor.use_kl_loss is also false in this file.
  use_kl_in_reward: false

# Dataset config: training reads the self-play coding prompts, validation
# reads the Codeforces test file. Both are parquet files given as relative
# paths.
data:
  train_files: selfplay_data/selfplay_prompts_coding_v1/train.parquet
  val_files: selfplay_data/codeforces/test.parquet
  # 1024 is 4x actor_rollout_ref.actor.ppo_mini_batch_size (256).
  train_batch_size: 1024
  # Prompt and response limits are both 512 (presumably tokens — confirm).
  max_prompt_length: 512
  max_response_length: 512
  filter_overlong_prompts: true
  # NOTE(review): `left` presumably selects which side of an over-long prompt
  # is cut — confirm against the consumer.
  truncation: left

# Actor and rollout settings. No `ref` sub-section is set in this file.
actor_rollout_ref:
  model:
    # Same model path as critic.model.path in this file.
    path: Qwen/Qwen2.5-Coder-3B-Instruct
    use_remove_padding: true
    enable_gradient_checkpointing: true
  actor:
    optim:
      # 10x smaller than critic.optim.lr (1.0e-5).
      lr: 1.0e-6
    # data.train_batch_size (1024) is 4x this value.
    ppo_mini_batch_size: 256
    # Half of critic.ppo_micro_batch_size_per_gpu (32).
    ppo_micro_batch_size_per_gpu: 16
    use_kl_loss: false
    fsdp_config:
      # Both offload switches are off, matching critic.model.fsdp_config.
      param_offload: false
      optimizer_offload: false
  rollout:
    log_prob_micro_batch_size_per_gpu: 32
    # Equals trainer.n_gpus_per_node (4); trainer.nnodes is 1.
    tensor_model_parallel_size: 4
    name: vllm
    # NOTE(review): presumably the fraction of GPU memory given to the
    # rollout engine — confirm against the consumer.
    gpu_memory_utilization: 0.4
    # NOTE(review): presumably the number of responses sampled per prompt —
    # confirm against the consumer.
    n: 4

# Critic settings: optimizer, model and per-GPU micro batch size.
critic:
  optim:
    # 10x larger than actor_rollout_ref.actor.optim.lr (1.0e-6).
    lr: 1.0e-5
  model:
    # Same model path as actor_rollout_ref.model.path in this file.
    path: Qwen/Qwen2.5-Coder-3B-Instruct
    use_remove_padding: true
    enable_gradient_checkpointing: true
    # Nested under `model` here, whereas the actor's fsdp_config sits
    # directly under `actor`.
    fsdp_config:
      param_offload: false
      optimizer_offload: false
  # Twice actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu (16).
  ppo_micro_batch_size_per_gpu: 32

# Trainer settings: hardware layout, logging, schedule and self-play options.
trainer:
  critic_warmup: 0
  # Two logging backends are listed: console and wandb.
  logger:
    - console
    - wandb
  # 4 GPUs on 1 node; actor_rollout_ref.rollout.tensor_model_parallel_size
  # is also 4.
  n_gpus_per_node: 4
  nnodes: 1
  # NOTE(review): -1 presumably disables periodic checkpoint saving —
  # confirm against the consumer.
  save_freq: -1
  test_freq: 5
  total_epochs: 1000
  balance_batch: false
  # Self-play options; the three keys below are the only self-play settings
  # in this file.
  self_play: true
  proposer_parser_version: v3
  # Same value as reward_model.reward_manager at the top of this file.
  self_play_solver_reward: prime