# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# specify the default per-component configs
defaults:

  # <folder_name>@<field_name>.<field_name>: <yaml_file_name>
  # actor_rollout_ref.actor: trainer/config/actor/dp_actor.yaml
  - actor@actor_rollout_ref.actor: dp_actor

  # data: trainer/config/data/legacy_data.yaml
  - data@data: legacy_data

  # Reference model config.
  # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
  - ref@actor_rollout_ref.ref: dp_ref

  # Rollout model config.
  - rollout@actor_rollout_ref.rollout: rollout

  # Model config.
  - model@actor_rollout_ref.model: hf_model

  # Critic model config.
  - critic@critic: dp_critic

  # Reward model config.
  - reward_model@reward_model: dp_reward_model

  # load the reference default config, then apply the fields in the current yaml
  # self config override anything above
  - _self_

# config for actor, rollout and reference model
actor_rollout_ref:

  # Whether it's a hybrid engine, currently only supports hybrid engine
  hybrid_engine: true

  # Timeout for operations executed against the process group
  nccl_timeout: 600

  # Rollout model config.
  rollout:

    # may get higher throughput when set to True. When activated, Please increase max_num_batched_tokens or decrease max_model_len.
    enable_chunked_prefill: True

    # Which loader to use for rollout model weights: dummy_dtensor, hf, megatron, etc.
    # safetensors (for huge model, and set use_shm=True); dummy_dtensor: randomly init model weight
    load_format: dummy_dtensor

    # for huge model, layered summon can save memory (prevent OOM) but make it slower
    layered_summon: False

# custom reward function definition
custom_reward_function:

  # The path to the file containing your customized reward function.
  # If not specified, pre-implemented reward functions will be used.
  path: null

  # The name of the reward function within the specified file. Default is 'compute_score'.
  name: compute_score

# config for the algorithm
algorithm:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.AlgoConfig

  # Discount factor for future rewards
  gamma: 1.0

  # Trade-off between bias and variance in the GAE estimator
  lam: 1.0

  # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
  adv_estimator: gae

  # Whether to normalize advantages by std (specific to GRPO)
  norm_adv_by_std_in_grpo: True

  enable_difficulty_classification: False

  # Thresholds for difficulty classification (based on reward/accuracy)
  hard_acc_upper: 0.3        # Samples with acc <= 0.3 are considered hard
  med_acc_lower: 0.3         # Medium samples: 0.3 < acc <= 0.7  
  med_acc_upper: 0.7         # Samples with acc > 0.7 are considered easy

  # Whether to enable in-reward KL penalty
  use_kl_in_reward: False

  # Difficulty-conditioned multipliers for KL-in-reward
  # hard samples (rewritten_hard) will use this multiplier
  kl_in_reward_coef_hard: 1.0

  # non-hard samples (rewritten_medium, etc.) will use this multiplier
  kl_in_reward_coef_nonhard: 1.0

  # How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full"
  kl_penalty: kl

  # KL control configuration
  kl_ctrl:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.trainer.config.KLControlConfig

    # KL control type: "fixed" or "adaptive"
    type: fixed

    # Initial coefficient for KL penalty
    kl_coef: 0.001

    # Horizon value for adaptive controller (if enabled)
    horizon: 10000

    # Target KL divergence (used for adaptive controller)
    target_kl: 0.1

  # Whether to enable preference feedback PPO
  use_pf_ppo: False

  # Preference feedback PPO settings
  pf_ppo:

    # Method for reweighting samples: "pow", "max_min", or "max_random"
    reweight_method: pow

    # Power used for weight scaling in "pow" method
    weight_pow: 2.0

# config for the trainer
trainer:

  # Whether to balance batch sizes across distributed workers
  balance_batch: True

  # Number of epochs in training
  total_epochs: 30

  # Total training steps (can be set explicitly or derived from epochs)
  total_training_steps: null

  # Project name for experiment tracking (e.g., wandb)
  project_name: verl_examples

  # Experiment name for run identification in tracking tools
  experiment_name: gsm8k

  # Logging backends to use: "console", "wandb", etc.
  logger: [ 'console', 'wandb' ]

  # Number of generations to log during validation
  log_val_generations: 0

  # Directory for logging rollout data; no dump if null
  rollout_data_dir: null

  # Directory for logging validation data; no dump if null
  validation_data_dir: null

  # Number of nodes used in the training
  nnodes: 1

  # Number of GPUs per node
  n_gpus_per_node: 8

  # Save frequency (by iteration) for model checkpoints
  save_freq: -1

  # ESI refers to the elastic server instance used during training, similar to the training plan. For example,
  # if you purchase 10 hours of computing power, the ESI will automatically shut down after 10 hours of training.
  # To ensure a checkpoint is saved before ESI shuts down, the system will start saving a checkpoint in advance.
  # The advance time is calculated as: Advance Time = Longest historical step duration + Checkpoint save duration + esi_redundant_time.
  # Here, esi_redundant_time is a user-defined value that further extends the advance time for added safety.
  esi_redundant_time: 0

  # Resume mode: "auto", "disable", or "resume_path"
  # "auto": resume from last checkpoint if available
  # "disable": start from scratch
  # "resume_path": resume from a user-defined path
  resume_mode: auto

  # Path to resume training from (only used when resume_mode is "resume_path")
  resume_from_path: null

  # Whether to run validation before training begins
  val_before_train: True

  # Whether to run validation only
  val_only: False

  # Validation frequency (in training iterations)
  test_freq: -1

  # Save best model when validation metrics improve
  save_best_model: False
  
  # Metrics to track for best model saving (defaults to core pass@k metrics if empty)
  best_metric_names: []

  # Number of iterations to warm up the critic before updating policy
  critic_warmup: 0

  # Default path to distributed filesystem for saving checkpoints
  default_hdfs_dir: null

  # Whether to delete local checkpoints after loading
  del_local_ckpt_after_load: False

  # Default local directory for saving checkpoints
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

  # Maximum number of actor checkpoints to keep
  max_actor_ckpt_to_keep: null

  # Maximum number of critic checkpoints to keep
  max_critic_ckpt_to_keep: null

  # Timeout (in seconds) for Ray worker to wait for registration
  ray_wait_register_center_timeout: 300

  # Device to run training on (e.g., "cuda", "cpu")
  device: cuda

  # whether to use legacy worker implementation
  #  mode: "auto", "enable", or "disable"
  use_legacy_worker_impl: auto

  task: rl


# profiler configs
global_profiler:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.utils.profiler.ProfilerConfig

  # Profiling tool: choose between nsys, npu, torch, torch_memory
  tool: null

  # profile steps
  steps: null

  # Whether to combine continuous steps into one database.
  ## If True, worker.profiler.discrete must be False, [1,2] in one, [5] in another.
  ## If False, [1] in one, [2] in another, [5] in another.
  profile_continuous_steps: False

  # Path to save profiling contents
  save_path: "outputs/profile"

  # Specific tool configs, can use +profiler.tool_config.[tool].xxx to config
  global_tool_config:
  
    # nsys config
    nsys:

      # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
      _target_: verl.utils.profiler.config.NsightToolConfig

      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False

      # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
      ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
      controller_nsight_options:

        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

      # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
      worker_nsight_options:

        # Select the API(s) to be traced.
        trace: "cuda,nvtx,cublas,ucx"

        # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
        cuda-memory-usage: "true"

        # CUDA graphs will be traced as a whole
        cuda-graph-trace: "graph"

        # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
        capture-range: "cudaProfilerApi"

        # Specify the desired behavior when a capture range ends.
        # In verl we need the torch.cuda.profiler.start/stop pair to repeats n times.
        # valid values are "repeat-shutdown:n" or null.
        # For normal whole step profiling, n = len(profile_steps);
        # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
        # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
        capture-range-end: null

        # Send signal to the target application's process group. We let the program to exit by itself.
        kill: none

    # enable memory visualization for debugging memory usage
    torch_memory:

      #  Maximum number of allocation entries to record
      trace_alloc_max_entries: 100_000

      # The depth of the call stack to capture for each allocation
      stack_depth: 32

      # 'alloc': records only allocation events || 'state': records memory state changes || 'all': records both.
      context: "all"

      # 'python': records Python stacks || 'cpp': records C++ stacks (available in some versions) || 'all': records both.
      stacks: "all"

      # devices, record_context etc.
      kw_args: {}
  
# configs related to ray
ray_kwargs:

  # configs related to ray initialization
  ray_init:
  
    # Number of CPUs for Ray. Use a fixed number instead of null when using SLURM.
    num_cpus: null

  # Path to save Ray timeline JSON for performance profiling
  timeline_json_file: null