# The PRIME config overrides values from the default ppo_trainer.yaml.

# Extend Hydra's config search path with verl's trainer config directory so
# that the base config named in the defaults list can be located.
# NOTE(review): the file:// path has no leading slash, so it is presumably
# resolved relative to the launch directory — confirm.
hydra:
  searchpath:
    - file://verl/trainer/config

# Hydra defaults list: ppo_trainer is composed first and this file (_self_)
# last, so the values set in this file take precedence over ppo_trainer's.
defaults:
  - ppo_trainer
  - _self_

# Data options: accuracy-based filtering, over-sampling and truncation.
data:
  filter_accuracy: True
  # Lower/upper bounds that accompany filter_accuracy.
  # NOTE(review): presumably applied to per-prompt accuracy — confirm in the trainer.
  accuracy_lower_bound: 0.2
  accuracy_upper_bound: 0.8
  oversample_factor: 4.0 # Sample more responses than the batch size; prompts satisfying the filter are prioritized.
  filter_truncate: True
  truncation: right
  resample: False
  penalty: [] # Penalty-style rewards to apply. Options: instruction_error, repetition, multi_language

# Overrides for the model, rollout, actor and ref sub-sections.
actor_rollout_ref:
  hybrid_engine: True
  model:
    use_remove_padding: True
  rollout:
    # Number of responses sampled per prompt (i.e. number of sampling passes).
    n: 4
    force_think: False
  actor:
    entropy_coeff: 0.000
    # Note: `none` is the plain string "none", not a YAML null.
    entropy_type: none # Set to Adaptive for Skywork-style entropy control
    use_token_level_loss: False
    optim:
      weight_decay: 0.
    # NOTE(review): written as a float (4.) — confirm the consumer does not
    # require an integer epoch count.
    ppo_epochs_max: 4.
  ref:
    fsdp_config:
      param_offload: True

# Reward-model section. Several values are interpolated from
# actor_rollout_ref so that both sections share the same setting.
reward_model:
  enable: True
  strategy: fsdp
  model:
    # NOTE(review): reward_model.model.path is not defined in this file; it is
    # presumably supplied by ppo_trainer or on the command line — confirm.
    ref_path: ${reward_model.model.path}
    use_remove_padding: ${actor_rollout_ref.model.use_remove_padding}
    tokenizer_path: ${actor_rollout_ref.model.path}
    enable_gradient_checkpointing: ${actor_rollout_ref.model.enable_gradient_checkpointing}
    ref_type: freeze
    # Note: `none` is the plain string "none", not a YAML null.
    ref_clip: none
    fsdp_config:
      min_num_params: 0
      param_offload: ${actor_rollout_ref.actor.fsdp_config.param_offload}
      # grad_offload: ${actor_rollout_ref.actor.fsdp_config.grad_offload}
      optimizer_offload: ${actor_rollout_ref.actor.fsdp_config.optimizer_offload}
    update: before # ``before`` for double-forward, ``after`` for single-forward
    optim:
      # Written with an explicit mantissa dot so that YAML 1.1 loaders (e.g.
      # plain PyYAML) also read it as a float instead of the string "1e-6".
      lr: 1.0e-6
      lr_warmup_steps_ratio: 0.
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: -1  # must be overridden by program
      weight_decay: 0.
      grad_clip: 10.0
    beta_train: 0.05
    beta_double: null
    beta_test: 0.05 # used when computing the value; beta_train is used when computing the loss
    loss_type: ce # currently only supports ce loss
    truncate: False # whether to drop over-length samples during training
  prime_granularity: token
  prime_norm: batch_norm # batch_norm or none. if set to none, the normalizer is beta_train
  mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  reward_manager: prime
  lambda: 0.
  prime_use_gt: False

# Advantage estimation, KL control and reward-mixing coefficients.
algorithm:
  # Currently supports rloo, which treats the different reward sources separately.
  adv_estimator: rloo
  kl_ctrl:
    type: fixed
    kl_coef: 0.000
  # NOTE(review): presumably the weights of the ground-truth reward and the
  # DPO-style reward respectively — confirm against the advantage code.
  reward_gt_coef: 5
  reward_dpo_coef: 5
  q0_estimator: soft
  warmup: False

# Trainer settings: experiment naming plus validation and batching switches.
trainer:
  project_name: prime
  experiment_name: examples
  val_before_train: False
  balance_batch: False
  # Alternatively, all samples can be used for the reward model
  # (presumably by setting this to False — confirm).
  filter_batch_for_rm: True
  validate_sample: False