# This OSFT config overrides the defaults defined in ppo_trainer.yaml.

# Adds verl/trainer/config to Hydra's config search path.
hydra:
  searchpath:
    - 'file://verl/trainer/config'

# Hydra defaults list. Order matters: entries are composed top to bottom,
# so later entries override earlier ones.
defaults:
  # Base configuration this file builds on.
  - ppo_trainer
  # This file's own values; listed last so they take precedence over ppo_trainer.
  - _self_

# Trainer-level switches set by this config.
# NOTE(review): the exact semantics of each switch live in the trainer code,
# which is not visible from this file.
trainer:
  # Toggle for rejection sampling (disabled here).
  rejection_sampling: false
  # Number of samples to keep when rejection sampling is used.
  rs_n: 1
  # Toggle for temperature training (enabled here).
  enable_train_temperature: true

# Overrides for the actor and rollout sections.
actor_rollout_ref:
  actor:
    # NOTE(review): a coefficient of 0 presumably disables the entropy term;
    # confirm against the actor implementation.
    entropy_coeff: 0
    entropy_regularized_cross_entropy: false
    optim:
      lr_warmup_steps: 10
  rollout:
    tensor_model_parallel_size: 1
    val_kwargs:
      # n = 2 triggers validation; n = 1 bypasses it.
      n: 2

# Algorithm overrides.
algorithm:
  # Explicitly null: no advantage estimator is selected by this config.
  # NOTE(review): presumably OSFT does not use one; confirm the trainer
  # accepts a null value here.
  adv_estimator: null

# Ray initialization settings.
ray_init:
  # NOTE(review): presumably forwarded to Ray as the CPU count; verify that
  # the target host actually provides 64 cores.
  num_cpus: 64