# Hydra runtime settings: add `verl/trainer/config` (a filesystem location,
# via the `file://` scheme) to the config search path so that configs stored
# there can be referenced by name from a defaults list.
# NOTE(review): the path is relative -- presumably resolved against the
# directory the job is launched from; confirm.
hydra:
  searchpath:
    - file://verl/trainer/config

# Hydra defaults list: compose `ppo_trainer` first, then this file (`_self_`).
# Because `_self_` comes last, keys defined in this file take precedence over
# the same keys in `ppo_trainer`. Do not reorder.
defaults:
  - ppo_trainer
  - _self_

# Extra settings layered onto `actor_rollout_ref.rollout`.
# NOTE(review): the exact semantics of the fork/temperature knobs are defined
# by the consuming code, which is not visible from this file.
actor_rollout_ref:
  rollout:
    stop_fork: False
    stop_fork_entropy: 1
    customize_temperature_step: 0.05  # step size for all correct questions
    # Extras attached directly to `rollout`; both fork percents are 0.0 here.
    extra_args:
      fork_top_temperature: 0.95
      fork_top_percent: 0.0
      fork_bottom_temperature: 1.05
      fork_bottom_percent: 0.0
      customize_temperature: null

    # Extras under `val_kwargs` (presumably the validation rollout -- confirm).
    # Both fork temperatures are interpolated from
    # `actor_rollout_ref.rollout.val_kwargs.temperature`, which is not set in
    # this file (presumably supplied by the base config -- confirm).
    # NOTE(review): the percents here are the int `0`, while the sibling
    # `rollout.extra_args` uses the float `0.0`; confirm the consumer does not
    # care about the type difference.
    val_kwargs:
      extra_args:
        fork_top_temperature: ${actor_rollout_ref.rollout.val_kwargs.temperature}
        fork_top_percent: 0
        fork_bottom_temperature: ${actor_rollout_ref.rollout.val_kwargs.temperature}
        fork_bottom_percent: 0



# Data-related settings defined by this file.
data:
  # Interpolation: resolves to whatever `data.train_batch_size` is; that key is
  # not set in this file (presumably supplied by the base config -- confirm).
  gen_batch_size: ${data.train_batch_size}
  enable_res_filter: False # filter out questions based on the correct rate of responses

  enable_correct_gen_temp: False # use a higher temperature for all-correct questions
  # NOTE(review): undocumented; presumably the threshold used when judging a
  # question as correct for the two switches above -- confirm with the consumer.
  correct_threshold: 0

# Reward settings: selects the `dapo` reward manager and defines the
# `overlong_buffer` sub-config with every field at an off/zero value.
reward_model:
  reward_manager: dapo
  overlong_buffer:
    enable: False  # set explicitly so that `enable` is never forgotten
    len: 0
    penalty_factor: 0.0
    log: False

# Algorithm settings: group filtering plus the project-specific `ours` options.
# Every boolean switch below is off in this file.
algorithm:
  filter_groups:
    enable: False # set explicitly so that `enable` is never forgotten
    metric: null # acc / score / seq_reward / seq_final_reward / ...
    max_num_gen_batches: 0 # non-positive values mean no upper limit
  ours:
    reactivate_all_correct: False # add a negative reward to compute the advantage of all-correct data
    filter_wrong_in_resp: False # filter out wrong responses based on the number of correct responses
    enable_replace_correct_from_history: False # store correct responses and add them to the batch if all responses are incorrect
    replace_corr_ratio: 0.25  # replace from history to try to reach this ratio of correct responses for each prompt
    # NOTE(review): undocumented; the name suggests a maximum wrong-to-correct
    # response ratio -- confirm with the consuming code.
    wr_corr_max_ratio: 2
    advantage_offset: null # offset for advantage computation
    max_temperature: 1.2 # max temperature for enable_correct_gen_temp

# Trainer settings defined by this file.
trainer:
  project_name: verl-dapo
  # Explicit null: no HDFS save directory is configured here (presumably this
  # leaves HDFS saving disabled -- confirm with the trainer code).
  save_hdfs_dir: null
