# Hydra defaults list: the listed configs are composed in order, with later
# entries overriding earlier ones.
defaults:
  - ppo_trainer
  - user
  # `_self_` is last, so the values in this file take precedence over
  # whatever `ppo_trainer` and `user` define.
  - _self_

# Total sequence budget, derived as prompt length + response length:
#   MAX_PROMPT_LENGTH=2048
#   MAX_RESPONSE_LENGTH=8192
#   MAX_MODEL_LEN=$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH))  # = 10240
# NOTE(review): nothing visible in this file computes the sum automatically;
# update the value by hand if either length changes.
max_model_len: 10240

# Overrides for the `actor_rollout_ref` group; everything not listed here
# comes from the configs named in `defaults`.
actor_rollout_ref:
  actor:
    # Mini-batch size used for actor updates.
    # NOTE(review): presumably counted in prompts rather than individual
    # responses — confirm against the trainer documentation.
    ppo_mini_batch_size: 8
  rollout:
    # NOTE(review): presumably the number of responses sampled per prompt —
    # confirm.
    n: 8
    # NOTE(review): presumably required so rollout-time log-probs are
    # available to `algorithm.rollout_correction` — confirm.
    calculate_log_probs: true

# Overrides for the `algorithm` group.
algorithm:
  # Advantage estimator selection.
  adv_estimator: grpo
  # NOTE(review): going by the key name, this disables dividing the
  # advantages by their standard deviation in GRPO — confirm against the
  # trainer documentation.
  norm_adv_by_std_in_grpo: false
  rollout_correction:
    # NOTE(review): presumably selects token-level importance-sampling
    # correction, with 2.0 as the cap on the importance ratio — confirm.
    rollout_is: token
    rollout_is_threshold: 2.0

# Overrides for the `data` group.
data:
  # NOTE(review): presumably the number of prompts drawn per training step;
  # check how it interacts with `actor_rollout_ref.actor.ppo_mini_batch_size`
  # and `actor_rollout_ref.rollout.n` — confirm.
  train_batch_size: 32
