# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# Target class for this configuration
_target_: verl.workers.config.ActorConfig

# the abstract actor configs
# fsdp, fsdp2 or megatron. must be set.
strategy: ???

# Split each sample into sub-batches of this size for PPO
ppo_mini_batch_size: 256

# [Deprecated] Global micro batch size
ppo_micro_batch_size: null

# Local per-GPU micro batch size
ppo_micro_batch_size_per_gpu: null

# Whether to automatically adjust batch size at runtime
# oc.select: the default val for ref.log_prob_use_dynamic_bsz
use_dynamic_bsz: false

# Max tokens per GPU in one PPO batch; affects gradient accumulation
# Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length}
# oc.select: the default val for ref.log_prob_max_token_len_per_gpu
ppo_max_token_len_per_gpu: 16384

# PPO clip ratio
clip_ratio: 0.2

# Lower bound for asymmetric clipping (used in dual-clip PPO)
clip_ratio_low: 0.2

# Upper bound for asymmetric clipping (used in dual-clip PPO)
clip_ratio_high: 0.2

# policy loss config
policy_loss:

  # # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.PolicyLossConfig

  # Loss function mode: vanilla / clip-cov / kl-cov / gpg / apo
  # - vanilla: Standard PPO clipped objective
  # - clip-cov: Clip-Cov loss from https://arxiv.org/abs/... 
  # - kl-cov: KL-Cov loss
  # - gpg: GPG loss from https://arxiv.org/abs/2505.22617
  # - apo: Anchored Policy Optimization - uses push-pull mechanism for negative samples
  loss_mode: "vanilla"

  # Ratio of tokens to be clipped for clip-cov loss
  clip_cov_ratio: 0.0002

  # Lower bound for clip-cov loss
  clip_cov_lb: 1.0

  # Upper bound for clip-cov loss
  clip_cov_ub: 5.0

  # Ratio of tokens to be applied kl penalty for kl-cov loss
  kl_cov_ratio: 0.0002

  # KL divergence penalty coefficient
  ppo_kl_coef: 0.1

  # Push amplification coefficient (λ) - amplifies the penalty for error tokens
  # Must be >= 1.0 (1.0 = no amplification, standard PPO behavior)
  # Used by: apo_ratio (Method II - Ratio Rectification)
  apo_push_coeff: 1.0

  # Pull coefficient (β) - controls the strength of pull towards Top-K tokens
  # Higher values = stronger guidance to reference model's Top-K
  # Used by: apo_ratio (Method II - Ratio Rectification)
  apo_pull_coeff: 0.1

  # Number of Top-K tokens from reference model to use as anchor points
  # These tokens define the "safe" region for probability mass to flow into
  # Used by: both apo and apo_ratio
  apo_topk: 5

  # Whether to exclude the sampled (error) token from the anchor set
  # Recommended: true, to avoid conflicting gradients
  # Used by: both apo and apo_ratio
  apo_exclude_sampled: true

# Constant C in Dual-clip PPO; clips when advantage < 0 and ratio > C
clip_ratio_c: 3.0

# Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
loss_agg_mode: token-mean

# Entropy regularization coefficient in PPO loss
entropy_coeff: 0

# Truncated Importance Sampling (TIS): https://fengyao.notion.site/off-policy-rl
# the truncation value C of truncated Importance Sampling (-1 for disable TIS)
tis_imp_ratio_cap: -1

# Whether to use KL loss instead of KL reward penalty. True for GRPO
use_kl_loss: false

# Whether to apply KL loss only on negative advantage samples (wrong samples)
# This is an ablation for APO comparison - both APO and this only regularize wrong samples
use_kl_loss_on_wrong: false

# Whether to use torch.compile()
# oc.select: the default val for ref.use_torch_compile
use_torch_compile: true

# KL loss coefficient when use_kl_loss is enabled. For GRPO
kl_loss_coef: 0.001

# Type of KL divergence loss. Options: "kl"(k1), "abs", "mse"(k2), "low_var_kl"(k3), "full"
kl_loss_type: low_var_kl

sft_loss_coeff: 0.2
# Number of PPO epochs per batch
ppo_epochs: 1

# Shuffle training data across PPO epochs
shuffle: false

# checkpoint configs
checkpoint:

  # Target dataclass for this configuration
  _target_: verl.trainer.config.CheckpointConfig

  # What to include in saved checkpoints
  # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
  save_contents: ['model', 'optimizer', 'extra']

  # For more flexibility, you can specify the contents to load from the checkpoint.
  # .xxx refers to the local variable xxx from the same level of hierarchy similar to python pkg
  load_contents: ${.save_contents}

  # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
  async_save: False

# optimizer configs
optim:

  # Learning rate
  lr: 1e-6

  # Warmup steps ratio (used if lr_warmup_steps is negative)
  lr_warmup_steps_ratio: 0.0

  # Total training steps (must be overridden at runtime)
  total_training_steps: -1

  # Weight decay
  weight_decay: 0.01

  # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio.
  lr_warmup_steps: -1


# Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}

# profile the actor model in `update_policy` 
profiler:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.utils.profiler.ProfilerConfig

  # profiler tool, default same as profiler.tool in global config
  # choices: nsys, npu, torch
  tool: ${oc.select:global_profiler.tool,null}

  # whether enable profile on Actor
  enable: False
  
  # Whether to profile all ranks.
  all_ranks: False

  # The ranks that will be profiled. [] or [0,1,...]
  ranks: []

  # profile results saving path
  save_path: ${oc.select:global_profiler.save_path,null}

  # specific tool config which only related to the role
  tool_config:

    # nsys tool config
    nsys:

      # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
      _target_: verl.utils.profiler.config.NsightToolConfig
    
      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete}
    
    # npu config
    npu:

      # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
      _target_: verl.utils.profiler.config.NPUToolConfig

      # Contents to profile, can be empty
      # options: npu, cpu, memory, shapes, module, stack
      contents: []

      # Collection level, optional values: level_none, level0, level1, level2.
      level: "level1"

      # Whether to automatically parse the data.
      analysis: True

      # True for each task has its own database, False for all tasks in one training step share one database.
      discrete: False
    
    # torch profiler config
    torch:

      # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
      _target_: verl.utils.profiler.config.TorchProfilerToolConfig

      # start profile mini-batch in training
      # NOTICE: different with global steps config which refers to iteration
      # This field only related with mini-batch
      step_start: 0

      # stop profile mini-batch in training
      step_end: null

    # torch memory profiler config
    torch_memory:

      # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
      _target_: verl.utils.profiler.config.TorchMemoryToolConfig

      # Maximum number of memory allocation entries to track
      trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}

      # Stack trace depth for memory allocations
      stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
