# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# specify the default per-component configs
defaults:

  # <folder_name>@<field_name>.<field_name>: <yaml_file_name>
  # actor_rollout_ref.actor: trainer/config/actor/dp_actor.yaml
  - actor@actor_rollout_ref.actor: dp_actor

  # trainer.npu_profile: trainer/config/npu_profile/npu_profile.yaml
  - npu_profile@trainer.npu_profile: npu_profile

  # data: trainer/config/data/legacy_data.yaml
  - data@data: legacy_data

  # load the reference default config, then apply the fields in the current yaml
  # Reference model config.
  # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
  - ref@actor_rollout_ref.ref: dp_ref

  # Rollout model config.
  - rollout@actor_rollout_ref.rollout: rollout

  # self config override anything above
  - _self_

# config for actor, rollout and reference model
actor_rollout_ref:

  # Whether it's a hybrid engine, currently only supports hybrid engine
  hybrid_engine: true

  # common configs for the model
  model:

    # Huggingface model path. This can be either local path or HDFS path.
    path: ~/models/deepseek-llm-7b-chat

    # Custom chat template for the model.
    custom_chat_template: null

    # Whether to use shared memory (SHM) for accelerating the loading of model weights
    use_shm: false

    # Additional Python packages to register huggingface models/tokenizers.
    external_lib: null

    # Used to override model's original configurations, mainly dropout
    override_config: {}

    # Enable gradient checkpointing for actor
    enable_gradient_checkpointing: true

    # Enable activation offloading for actor
    enable_activation_offload: false

    # Whether to remove padding tokens in inputs during training
    use_remove_padding: false

    # Set to positive value to enable LoRA (e.g., 32)
    lora_rank: 0

    # LoRA scaling factor
    lora_alpha: 16

    # Use rank stabilization
    use_rslora: true

    # Target modules to apply LoRA. Options: "all-linear" (not recommended for VLMs) or
    # [q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj]
    target_modules: all-linear

    # Exclude modules from applying Lora. Similar usage to target_modules and Peft.
    # Example: '.*visual.*' for excluding the ViT in Qwen2.5-VL, as currently vllm does not support ViT Lora.
    exclude_modules: null

    # Whether to use Liger for linear layer fusion
    use_liger: false

    # Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
    use_fused_kernels: false

    # Options for fused kernels. If use_fused_kernels is true, this will be used.
    fused_kernel_options:

      # Implementation backend for fused kernels. Options: "triton" or "torch".
      impl_backend: torch

    # Whether to enable loading a remote code model
    trust_remote_code: false

  # Rollout model config.
  rollout:

    # may get higher throughput when set to True. When activated, Please increase max_num_batched_tokens or decrease max_model_len.
    enable_chunked_prefill: True

    # Which loader to use for rollout model weights: dummy_dtensor, hf, megatron, etc.
    # safetensors (for huge model, and set use_shm=True); dummy_dtensor: randomly init model weight
    load_format: dummy_dtensor

    # for huge model, layered summon can save memory (prevent OOM) but make it slower
    layered_summon: False

# configs for the critic
critic:

  # Number of rollouts per update (mirrors actor rollout_n)
  rollout_n: ${actor_rollout_ref.rollout.n}

  # fsdp or fsdp2 strategy used for critic model training
  strategy: ${actor_rollout_ref.actor.strategy}

  # optimizer configs
  optim:

    # Learning rate
    lr: 1e-5

    # Warmup steps ratio; total steps will be injected at runtime
    lr_warmup_steps_ratio: 0.

    # Minimum LR ratio for cosine schedule
    min_lr_ratio: null

    # LR warmup style: "constant" or "cosine"
    warmup_style: constant

    # Total training steps (must be overridden at runtime)
    total_training_steps: -1

    # Weight decay
    weight_decay: 0.01

  # model config for the critic
  model:

    # Path to pretrained model weights
    path: ~/models/deepseek-llm-7b-chat

    # Whether to use shared memory for loading the model
    use_shm: False

    # Tokenizer path (defaults to actor's model path)
    tokenizer_path: ${actor_rollout_ref.model.path}

    # Hugging Face config override
    override_config: { }

    # External model implementation (optional)
    external_lib: ${actor_rollout_ref.model.external_lib}

    # Enable gradient checkpointing to save memory
    enable_gradient_checkpointing: True

    # Offload activations to CPU to reduce GPU memory usage
    enable_activation_offload: False

    # Use remove padding optimization (saves compute)
    use_remove_padding: False

    # Whether to trust remote code from Hugging Face models
    trust_remote_code: ${actor_rollout_ref.model.trust_remote_code}

    # FSDP-specific config
    fsdp_config:

      # Whether to offload model parameters to CPU
      param_offload: False

      # Whether to offload optimizer state to CPU
      optimizer_offload: False

      # Only for FSDP2: offload param/grad/optimizer during train
      offload_policy: False

      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
      reshard_after_forward: True

      # Policy for wrapping layers with FSDP
      wrap_policy:

        # Minimum number of parameters to trigger wrapping
        min_num_params: 0

      # Number of GPUs in each FSDP shard group; -1 means auto
      fsdp_size: -1

      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False

    # Set to positive value to enable LoRA (e.g., 32)
    lora_rank: 0

    # LoRA scaling factor
    lora_alpha: 16

    # LoRA target modules: "all-linear" or list of linear projection layers
    target_modules: all-linear

  # PPO mini-batch size per update
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}

  # [Deprecated] Global micro batch size
  ppo_micro_batch_size: null

  # Local per-GPU micro batch size
  ppo_micro_batch_size_per_gpu: null

  # Forward-only batch size (global)
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}

  # Forward-only batch size (per GPU)
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}

  # Whether to automatically adjust batch size at runtime
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}

  # Max tokens per GPU in one PPO batch (doubled for critic)
  ppo_max_token_len_per_gpu: 32768

  # Max token length per GPU in forward pass
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}

  # Sequence parallelism size for Ulysses-style model parallelism
  ulysses_sequence_parallel_size: 1

  # Number of PPO epochs per batch
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}

  # Shuffle training data across PPO epochs
  shuffle: ${actor_rollout_ref.actor.shuffle}

  # Gradient clipping for critic updates
  grad_clip: 1.0

  # PPO value function clipping range
  cliprange_value: 0.5

  # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
  loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}

  # checkpoint configs
  checkpoint:

    # What to include in saved checkpoints
    # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
    save_contents: ['model', 'optimizer', 'extra']

    # What to include when loading checkpoints
    load_contents: ${critic.checkpoint.save_contents}

  # profiler configs
  # the corresponding dataclass is verl.utils.profiler.ProfilerConfig.
  profiler:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs in the entrypoint
    _target_: verl.utils.profiler.ProfilerConfig

    # True for each task has its own database, False for all tasks in one training step share one database.
    discrete: False

    # Whether to profile all ranks.
    all_ranks: False

    # The ranks that will be profiled. [] or [0,1,...]
    ranks: []

# configs for the reward model
reward_model:

  # Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions.
  # In GSM8K and Math examples, we disable reward model.
  # For RLHF alignment example using full_hh_rlhf, we utilize reward model to assess the responses.
  # If False, the following parameters are not effective
  enable: False

  # FSDP strategy: "fsdp" or "fsdp2"
  strategy: ${actor_rollout_ref.actor.strategy}

  # model config for reward scoring
  model:

    # Input tokenizer. If the reward model’s chat template is inconsistent with the policy,
    # we need to first decode to plaintext, then apply the rm’s chat_template.
    # Then score with RM. If chat_templates are consistent, it can be set to null.
    input_tokenizer: ${actor_rollout_ref.model.path}

    # RM’s HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification.
    # Other model types need to define their own RewardModelWorker and pass it from the code.
    path: ~/models/FsfairX-LLaMA3-RM-v0.1

    # Whether to use shared memory for loading the model
    use_shm: False

    # External model implementation (optional)
    external_lib: ${actor_rollout_ref.model.external_lib}

    # Use remove padding optimization (saves compute)
    use_remove_padding: False

    # Whether to use fused reward kernels for speedup
    use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}

    # Whether to enable loading a remote code model, default to False
    trust_remote_code: False

    # FSDP-specific config
    fsdp_config:

      # Policy for wrapping layers with FSDP
      wrap_policy:

        # Minimum number of parameters to trigger wrapping
        min_num_params: 0

      # Whether to offload model parameters to CPU
      param_offload: False

      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
      reshard_after_forward: True

      # Number of GPUs in each FSDP shard group; -1 means auto
      fsdp_size: -1

      # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False

  # [Deprecated] Global micro batch size
  micro_batch_size: null

  # Local per-GPU micro batch size
  micro_batch_size_per_gpu: null

  # Maximum sequence length to process for scoring
  max_length: null

  # Sequence parallelism size for Ulysses-style model parallelism
  ulysses_sequence_parallel_size: 1

  # Whether to dynamically adjust batch size at runtime
  use_dynamic_bsz: ${critic.use_dynamic_bsz}

  # Maximum number of tokens per GPU in one forward pass
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

  # Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources.
  # Default is naive. If all verification functions are multiprocessing-safe,
  # the reward manager can be set to prime for parallel verification.
  reward_manager: naive

  # Whether to launch custom reward function asynchronously during log_prob
  launch_reward_fn_async: False

  # Cloud/local sandbox fusion configuration for custom reward logic
  sandbox_fusion:

    # Cloud/local function URL for sandbox execution
    url: null

    # Max concurrent requests allowed to sandbox
    max_concurrent: 64

    # Max memory limit for each sandbox process in MB
    memory_limit_mb: 1024

  # profiler configs
  profiler:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs in the entrypoint
    _target_: verl.utils.profiler.ProfilerConfig

    # True for each task has its own database, False for all tasks in one training step share one database.
    discrete: False

    # Whether to profile all ranks.
    all_ranks: False

    # The ranks that will be profiled. [] or [0,1,...]
    ranks: []

# custom reward function definition
custom_reward_function:

  # The path to the file containing your customized reward function.
  # If not specified, pre-implemented reward functions will be used.
  path: null

  # The name of the reward function within the specified file. Default is 'compute_score'.
  name: compute_score

# config for the algorithm
algorithm:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs in the entrypoint
  _target_: verl.trainer.config.AlgoConfig

  # Discount factor for future rewards
  gamma: 1.0

  # Trade-off between bias and variance in the GAE estimator
  lam: 1.0

  # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
  adv_estimator: gae

  # Whether to normalize advantages by std (specific to GRPO)
  norm_adv_by_std_in_grpo: True

  # Whether to enable in-reward KL penalty
  use_kl_in_reward: False

  # How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full"
  kl_penalty: kl

  # KL control configuration
  kl_ctrl:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs in the entrypoint
    _target_: verl.trainer.config.KLControlConfig

    # KL control type: "fixed" or "adaptive"
    type: fixed

    # Initial coefficient for KL penalty
    kl_coef: 0.001

    # Horizon value for adaptive controller (if enabled)
    horizon: 10000

    # Target KL divergence (used for adaptive controller)
    target_kl: 0.1

  # Whether to enable preference feedback PPO
  use_pf_ppo: False

  # Preference feedback PPO settings
  pf_ppo:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs in the entrypoint
    _target_: verl.trainer.config.PFPPOConfig

    # Method for reweighting samples: "pow", "max_min", or "max_random"
    reweight_method: pow

    # Power used for weight scaling in "pow" method
    weight_pow: 2.0

  # f-CDPG settings
  fcdpg:

    # If true, the target distribution is \propto ref_model * exp(reward/kl_coef).
    # If false, the target disribution is \propto ref_model * reward
    exponential_ebm: false

    # Divergence to minimize
    loss_divergence: "js"

    # Parameter for Rényi alpha divergence or Skewed JSD
    alpha: 0.5

    # Whether use a baseline or not
    use_baseline: true

    # window size for baseline estimation
    baseline_window_size: 1024

    # window size for divergence estimation
    divergence_window_size: 8192

    # maximum ratio  p_norm(x) / π_θ(x)
    ir_max_clip: null

    # maximum value for Z
    z_max_clip: null

    # whether or not to keep a moving average of z
    reset_z: false

    # epsilon value to use in case z is 0
    z_default: 0

    # substract f'(1) from f'(t)
    center_f: false

# config for the trainer
trainer:

  # Whether to balance batch sizes across distributed workers
  balance_batch: True

  # Number of epochs in training
  total_epochs: 30

  # Total training steps (can be set explicitly or derived from epochs)
  total_training_steps: null

  # The steps that will be profiled. null means no profiling. null or [1,2,5,...]
  profile_steps: null

  # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None.
  ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
  ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
  controller_nsight_options:

    # Select the API(s) to be traced.
    trace: "cuda,nvtx,cublas,ucx"

    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
    cuda-memory-usage: "true"

    # CUDA graphs will be traced as a whole
    cuda-graph-trace: "graph"

  # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None.
  worker_nsight_options:

    # Select the API(s) to be traced.
    trace: "cuda,nvtx,cublas,ucx"

    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
    cuda-memory-usage: "true"

    # CUDA graphs will be traced as a whole
    cuda-graph-trace: "graph"

    # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
    capture-range: "cudaProfilerApi"

    # Specify the desired behavior when a capture range ends.
    # In verl we need the orch.cuda.profiler.start/stop pair to repeats n times.
    # valid values are "repeat-shutdown:n" or null.
    # For normal whole step profiling, n = len(profile_steps);
    # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
    # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
    capture-range-end: null

    # Send signal to the target application's process group. We let the program to exit by itself.
    kill: none

  # Project name for experiment tracking (e.g., wandb)
  project_name: verl_examples

  # Experiment name for run identification in tracking tools
  experiment_name: gsm8k

  # Logging backends to use: "console", "wandb", etc. trainer.logger='[\"console\",\"mlflow\"]'
  logger: [ 'console', 'mlflow' ]

  # Number of generations to log during validation
  log_val_generations: 0

  # Directory for logging rollout data; no dump if null
  rollout_data_dir: null

  # Directory for logging validation data; no dump if null
  validation_data_dir: null

  # Number of nodes used in the training
  nnodes: 1

  # Number of GPUs per node
  n_gpus_per_node: 8

  # Save frequency (by iteration) for model checkpoints
  save_freq: -1

  # ESI refers to the elastic server instance used during training, similar to the training plan. For example,
  # if you purchase 10 hours of computing power, the ESI will automatically shut down after 10 hours of training.
  # To ensure a checkpoint is saved before ESI shuts down, the system will start saving a checkpoint in advance.
  # The advance time is calculated as: Advance Time = Longest historical step duration + Checkpoint save duration + esi_redundant_time.
  # Here, esi_redundant_time is a user-defined value that further extends the advance time for added safety.
  esi_redundant_time: 0

  # Resume mode: "auto", "disable", or "resume_path"
  # "auto": resume from last checkpoint if available
  # "disable": start from scratch
  # "resume_path": resume from a user-defined path
  resume_mode: auto

  # Path to resume training from (only used when resume_mode is "resume_path")
  resume_from_path: null

  # Whether to run validation before training begins
  val_before_train: True

  # Whether to run validation only
  val_only: False

  # Validation frequency (in training iterations)
  test_freq: -1

  # Number of iterations to warm up the critic before updating policy
  critic_warmup: 0

  # Default path to distributed filesystem for saving checkpoints
  default_hdfs_dir: null

  # Whether to delete local checkpoints after loading
  del_local_ckpt_after_load: False

  # Default local directory for saving checkpoints
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

  # Maximum number of actor checkpoints to keep
  max_actor_ckpt_to_keep: null

  # Maximum number of critic checkpoints to keep
  max_critic_ckpt_to_keep: null

  # Timeout (in seconds) for Ray worker to wait for registration
  ray_wait_register_center_timeout: 300

  # Device to run training on (e.g., "cuda", "cpu")
  device: cuda

# configs related to ray initialization
ray_init:

  # Number of CPUs for Ray. Use a fixed number instead of null when using SLURM.
  num_cpus: null

  # Path to save Ray timeline JSON for performance profiling
  timeline_json_file: null
