# Number of rollouts per update. Not set here directly: the value is
# interpolated from actor_rollout_ref.rollout.n, so it always mirrors the
# rollout setting.
rollout_n: ${actor_rollout_ref.rollout.n}

# Strategy used for critic model training: fsdp or fsdp2.
# `???` is OmegaConf's marker for a mandatory value: no default is provided
# here, so the value must be set before this key is read.
strategy: ???

# optimizer configs
optim:

  # Warmup steps expressed as a ratio of the total training steps.
  # The total step count itself is injected at runtime
  # (see total_training_steps below).
  lr_warmup_steps_ratio: 0.0

  # Total training steps. Must be overridden at runtime; the -1 here is
  # presumably only a placeholder (confirm against the code that injects it).
  total_training_steps: -1

  # Weight decay
  weight_decay: 0.01

# model config for the critic
model:

  # Location of the pretrained model weights
  path: '~/models/deepseek-llm-7b-chat'

  # Tokenizer location; follows the actor's model path unless overridden
  tokenizer_path: ${actor_rollout_ref.model.path}

  # Overrides applied on top of the Hugging Face config (none by default)
  override_config: {}

  # Optional external model implementation; follows the actor's setting
  external_lib: ${actor_rollout_ref.model.external_lib}

  # Gradient checkpointing is enabled by default to save memory
  enable_gradient_checkpointing: true

  # Whether remote code from Hugging Face models is trusted;
  # follows the actor's setting
  trust_remote_code: ${actor_rollout_ref.model.trust_remote_code}

# PPO mini-batch size per update (inherited from the actor's setting)
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}

# [Deprecated] Global micro batch size; unset by default.
ppo_micro_batch_size: null

# Local per-GPU micro batch size; unset by default.
ppo_micro_batch_size_per_gpu: null

# Whether to automatically adjust batch size at runtime
# (inherited from the actor's setting)
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}

# Max tokens per GPU in one PPO batch (doubled for critic).
# NOTE(review): presumably "doubled" is relative to the actor's value, which
# is not visible in this file - confirm.
ppo_max_token_len_per_gpu: 32768

# Max token length per GPU in forward pass.
# The leading dot makes this a relative interpolation: it resolves to the
# sibling key ppo_max_token_len_per_gpu above unless overridden.
forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu}

# Number of PPO epochs per batch (inherited from the actor's setting)
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}

# Shuffle training data across PPO epochs (inherited from the actor's setting)
shuffle: ${actor_rollout_ref.actor.shuffle}

# PPO value function clipping range
cliprange_value: 0.5

# Loss aggregation mode: "token-mean", "seq-mean-token-sum", or
# "seq-mean-token-mean" (inherited from the actor's setting)
loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}

# checkpoint configs
checkpoint:

  # Components written to each saved checkpoint.
  # Adding 'hf_model' saves the whole model in Hugging Face format; the
  # default list keeps only the sharded model checkpoint to save space.
  save_contents:
    - model
    - optimizer
    - extra

  # Components restored when loading a checkpoint; resolves to the sibling
  # save_contents list unless overridden.
  load_contents: ${.save_contents}

# profiler configs
# Backed by the dataclass verl.utils.profiler.ProfilerConfig.
profiler:

  # Target class; required when the entrypoint instantiates dataclass
  # configs through verl.utils.omega_conf_to_dataclass.
  _target_: verl.utils.profiler.ProfilerConfig

  # true: each task has its own database.
  # false: all tasks in one training step share one database.
  discrete: false

  # Whether to profile all ranks.
  all_ranks: false

  # Ranks to profile: [] or an explicit list such as [0, 1, ...].
  ranks: []