defaults:
  - ppo_trainer # this is a symbolic link to the verl/verl/trainer/config/ppo_trainer.yaml file
  - envs

system:
  CUDA_VISIBLE_DEVICES: "0"

micro_batch_size_per_gpu: 1
ppo_mini_batch_size: 8
model_path: Qwen/Qwen2.5-3B-Instruct
enable_response_mask: True
grpo_advantage_length_weight: False # if you do not enable this and critic/advantage_estimator is GRPO, and the critic/advantages/mean is too low, then you can try enabling this to encourage reasoning and forbid collapse

lora:
  enabled: False
  rank: 64
  alpha: 64
  target_modules: all-linear
  local_temp_dir: lora_temp

actor_rollout_ref:
  model:
    path: ${model_path}
  actor:
    ppo_mini_batch_size: ${ppo_mini_batch_size}  # by default, ppo_mini_batch_size = train_batch_size / 4
    micro_batch_size_per_gpu: ${micro_batch_size_per_gpu} # following micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: ${micro_batch_size_per_gpu} # following micro_batch_size_per_gpu
    use_ref: True
    entropy_coeff: 0.001
    use_kl_loss: False
    kl_loss_coef: 0.001
    kl_loss_type: kl
    clip_ratio_low: 0.2
    clip_ratio_high: 0.28
    grpo_advantage_length_weight: ${grpo_advantage_length_weight}
    optim:
      betas: [0.9, 0.999]
    lora:
      enabled: ${lora.enabled}
      rank: ${lora.rank}
      alpha: ${lora.alpha}
      target_modules: ${lora.target_modules}
      local_temp_dir: ${lora.local_temp_dir}
  ref:
    log_prob_micro_batch_size_per_gpu: ${micro_batch_size_per_gpu} # following micro_batch_size_per_gpu
  rollout:
    log_prob_micro_batch_size_per_gpu: ${micro_batch_size_per_gpu} # following micro_batch_size_per_gpu
    tensor_model_parallel_size: 1
    max_model_len: 7200
    prompt_length: 1 # useless. Just put it here
    response_length: 400 # single-turn response length
    gpu_memory_utilization: 0.9
    max_num_batched_tokens: 8192 # set only when enable_chunked_prefill is true
    temperature: 1
    rollout_filter_ratio: 0.25
    rollout_filter_type: std # max_mean or std
    enforce_eager: True #  for small models, set both enforce_eager and free_cache_engine to False to make rollout faster
    free_cache_engine: False
    val_kwargs:
      do_sample: True
      temperature: 0.5
    lora:
      enabled: ${lora.enabled}
      rank: ${lora.rank}
      alpha: ${lora.alpha}
      target_modules: ${lora.target_modules}
      local_temp_dir: ${lora.local_temp_dir}

critic:
  ppo_mini_batch_size: ${ppo_mini_batch_size} # by default, ppo_mini_batch_size = train_batch_size / 4
  ppo_micro_batch_size_per_gpu: ${micro_batch_size_per_gpu} # following micro_batch_size_per_gpu
  model:
    path: ${model_path}
  optim:
    betas: [0.9, 0.999]
  lora:
    enabled: ${lora.enabled}
    rank: ${lora.rank}
    alpha: ${lora.alpha}
    target_modules: ${lora.target_modules}

data:
  max_prompt_length: null
  max_response_length: null
  train_batch_size: null

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  kl_penalty: kl  # how to estimate kl divergence
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

trainer:
  project_name: ragen_latest
  experiment_name: test
  total_training_steps: 200
  validation_steps: 1 # validation instances = validation_steps * val_env_groups * group_size
  val_before_train: True
  n_gpus_per_node: 1
  test_freq: 10
  generations_to_log_to_wandb: 
    train: 128 # TODO: will be implemented
    val: 20
  logger: [ 'console', 'wandb' ]

agent_proxy:
  max_turn: 5
  action_sep: "||"
  max_actions_per_turn: 5 # how many actions can be output at most in a single turn
  use_turn_scores: False # important to GAE when applying token-level rewards to token-level advantages. If False, will take the sum of scores as the reward for the last turn.
  enable_think: True # False -> no think RL
  reward_normalization:
    grouping: "state" # state / batch / inductive
    method: "identity" # asym_clip / identity / mean_std

val_agent_proxy:
  max_turn: 5

es_manager:
  format_penalty: -0.1
  train:
    env_groups: 8
    # under the same group, the env config and env seed are ensured to be equal
    group_size: 16
    env_configs:
      tags: ["MetamathQA"]
      n_groups: [8] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
  val:
    env_groups: 128
    group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output
    # env_groups: 768
    # group_size: 1
    env_configs:
      tags: ["MetamathQA"]
      n_groups: [128] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
      # tags: ["MetamathQA_1Turn", "MetamathQA_2Turn", "MetamathQA_4Turn", "MetamathQA_6Turn", "MetamathQA_8Turn", "MetamathQA_10Turn"]
      # n_groups: [128, 128, 128, 128, 128, 128] 

ctx_manager:
  generation: # go to vllm
    gen_config:
      response_length: ${actor_rollout_ref.rollout.response_length}
      temperature: ${actor_rollout_ref.rollout.temperature}
      top_p: ${actor_rollout_ref.rollout.top_p}
      top_k: ${actor_rollout_ref.rollout.top_k}
      kwargs: null
