# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# Target class for this configuration
_target_: verl.workers.config.ActorConfig

# the abstract actor configs
# fsdp, fsdp2 or megatron. must be set.
strategy: ???

# Split each sample into sub-batches of this size for PPO
ppo_mini_batch_size: 256

# [Deprecated] Global micro batch size
ppo_micro_batch_size: null

# Local per-GPU micro batch size
ppo_micro_batch_size_per_gpu: null

# Whether to automatically adjust batch size at runtime
# oc.select: the default val for ref.log_prob_use_dynamic_bsz
use_dynamic_bsz: false

# Max tokens per GPU in one PPO batch; affects gradient accumulation
# Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length}
# oc.select: the default val for ref.log_prob_max_token_len_per_gpu
ppo_max_token_len_per_gpu: 16384

# PPO clip ratio
clip_ratio: 0.2

# Lower bound for asymmetric clipping (used in dual-clip PPO)
clip_ratio_low: 0.2

# Upper bound for asymmetric clipping (used in dual-clip PPO)
clip_ratio_high: 0.2

# policy loss config
policy_loss:

  # # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.PolicyLossConfig

  # Loss function mode: vanilla / clip-cov / kl-cov /gpg from XXXX
  loss_mode: "vanilla"

  # Ratio of tokens to be clipped for clip-cov loss
  clip_cov_ratio: 0.0002

  # Lower bound for clip-cov loss
  clip_cov_lb: 1.0

  # Upper bound for clip-cov loss
  clip_cov_ub: 5.0

  # Ratio of tokens to be applied kl penalty for kl-cov loss
  kl_cov_ratio: 0.0002

  # KL divergence penalty coefficient
  ppo_kl_coef: 0.1

# Constant C in Dual-clip PPO; clips when advantage < 0 and ratio > C
clip_ratio_c: 3.0

# Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
loss_agg_mode: token-mean

# Entropy regularization coefficient in PPO loss
entropy_coeff: 0

# Whether to use KL loss instead of KL reward penalty. True for GRPO
use_kl_loss: false

# Whether to use torch.compile()
# oc.select: the default val for ref.use_torch_compile
use_torch_compile: true

# KL loss coefficient when use_kl_loss is enabled. For GRPO
kl_loss_coef: 0.001

# Type of KL divergence loss. Options: "kl"(k1), "abs", "mse"(k2), "low_var_kl"(k3), "full"
kl_loss_type: low_var_kl

# Number of PPO epochs per batch
ppo_epochs: 1

# Shuffle training data across PPO epochs
shuffle: false

# checkpoint configs
checkpoint:

  # Target dataclass for this configuration
  _target_: verl.trainer.config.CheckpointConfig

  # What to include in saved checkpoints
  # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
  save_contents: ['model', 'optimizer', 'extra']

  # For more flexibility, you can specify the contents to load from the checkpoint.
  # .xxx refers to the local variable xxx from the same level of hierarchy similar to python pkg
  load_contents: ${.save_contents}

  # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
  async_save: False

# optimizer configs
optim:

  # Learning rate
  lr: 1e-6

  # Warmup steps ratio (used if lr_warmup_steps is negative)
  lr_warmup_steps_ratio: 0.0

  # Total training steps (must be overridden at runtime)
  total_training_steps: -1

  # Weight decay
  weight_decay: 0.01

  # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio.
  lr_warmup_steps: -1


# Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
