# configs for the reward model

# Whether to enable the reward model. If False, the reward is computed only with the
# user-defined reward functions.
# The GSM8K and Math examples disable the reward model.
# The RLHF alignment example using full_hh_rlhf uses the reward model to assess the responses.
# If False, the following parameters have no effect.
enable: False

# FSDP strategy: "fsdp" or "fsdp2".
# `???` is the OmegaConf marker for a mandatory value with no default: it must be
# supplied by the composing config or by an override.
strategy: ???

# Settings for the "elliptical" reward component.
# NOTE(review): this file does not describe these keys; the per-key notes below are
# inferred from the key names and must be confirmed against the consuming code.
elliptical:
  # Switch for this component; off by default.
  enable: False
  # NOTE(review): presumably a regularization coefficient ("lambda") - confirm.
  lamb: 0.01
  # Normalization mode applied by this component.
  normalization: z_score  # one of: none, rnd, z_score
  # Variant of the reward computation.
  reward_type: leave_one_out  # alternative: leverage
  # NOTE(review): presumably the dimension of a sparse projection - confirm.
  sparse_dim: 512
  # NOTE(review): presumably whether the sparse matrix is re-randomized - confirm when/how often.
  randomize_sparse_matrix: False
  # NOTE(review): presumably disables this component once pass@k stops improving;
  # likely used together with pass_at_k_patience - confirm.
  turn_off_at_highest_pass_at_k: False
  # NOTE(review): presumably a global step at which this component is turned off,
  # with -1 meaning "never" - confirm.
  turn_off_at_global_steps: -1
  # NOTE(review): presumably the patience used by turn_off_at_highest_pass_at_k - confirm units.
  pass_at_k_patience: 3
  # NOTE(review): presumably whether the covariance state is kept across steps - confirm.
  persist_covariance: False

# Extra keyword arguments for reward computation, grouped by reward component.
# NOTE(review): this file does not describe these keys; confirm the semantics against
# the code that consumes reward_kwargs.
reward_kwargs:
  # Arguments for the "elliptical" component.
  elliptical:
    # NOTE(review): alpha and beta are presumably weighting coefficients - confirm what each scales.
    alpha: 1.0
    beta: 1.0
    # Flags that, judging by their names, turn the elliptical term off depending on the
    # correctness of the rollouts (none / some / all correct, or the rollout itself being
    # incorrect). All are off by default. NOTE(review): confirm the exact conditions.
    turn_off_elliptical_if_none_correct: False
    turn_off_elliptical_if_some_correct: False
    turn_off_elliptical_if_all_correct: False
    turn_off_elliptical_if_rollout_incorrect: False
  # Arguments for the "unlikely" component.
  unlikely:
    # Separate key from elliptical.beta above; the two are set independently.
    beta: 0.25
    # NOTE(review): presumably turns the "unlikely" term off when all rollouts are correct - confirm.
    turn_off_unlikely_if_all_correct: False

# Model config for reward scoring
model:

  # Input tokenizer. If the reward model's chat template is inconsistent with the policy's,
  # the input is first decoded to plaintext, then the RM's chat_template is applied,
  # and the result is scored with the RM.
  # Set this to null if the chat templates are identical.
  # Defaults to the actor/rollout/ref model path via interpolation.
  input_tokenizer: ${actor_rollout_ref.model.path}

  # RM's HDFS path or local path. Note that the RM only supports AutoModelForSequenceClassification.
  # Other model types need to define their own RewardModelWorker and pass it in from the code.
  path: ~/models/FsfairX-LLaMA3-RM-v0.1

  # External model implementation (optional).
  # Defaults to the value of actor_rollout_ref.model.external_lib via interpolation.
  external_lib: ${actor_rollout_ref.model.external_lib}

  # Whether to enable loading a remote code model. Defaults to False.
  trust_remote_code: False

# [Deprecated] Global micro batch size.
# Use micro_batch_size_per_gpu instead.
micro_batch_size: null

# Local per-GPU micro batch size
micro_batch_size_per_gpu: null

# Maximum sequence length to process for scoring
max_length: null

# Whether to dynamically adjust batch size at runtime.
# Defaults to the critic's setting (critic.use_dynamic_bsz) via interpolation.
use_dynamic_bsz: ${critic.use_dynamic_bsz}

# Maximum number of tokens per GPU in one forward pass.
# Defaults to the critic's setting (critic.forward_max_token_len_per_gpu) via interpolation.
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

# Reward manager. This defines the mechanism for computing the rule-based reward and
# handling different reward sources.
# The default is naive. If all verification functions are multiprocessing-safe,
# the reward manager can be set to prime for parallel verification.
reward_manager: naive
# NOTE(review): not documented in this file; presumably the reward manager used for
# dev/validation data, taking the same values as reward_manager - confirm.
dev_reward_manager: naive

# Whether to launch the custom reward function asynchronously.
# When enabled, the custom reward function is executed asynchronously on CPU during log_prob.
launch_reward_fn_async: False

# Cloud/local sandbox fusion configuration for custom reward logic
sandbox_fusion:

  # Cloud/local function URL for sandbox execution; null by default.
  url: null

  # Maximum number of concurrent requests allowed to the sandbox
  max_concurrent: 64

  # Maximum memory limit for each sandbox process, in MB
  memory_limit_mb: 1024

# Profiler configs
profiler:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs in the entrypoint
  _target_: verl.utils.profiler.ProfilerConfig

  # If True, each task has its own database; if False, all tasks in one training step
  # share one database.
  discrete: False

  # Whether to profile all ranks.
  all_ranks: False

  # The ranks that will be profiled: [] or an explicit list such as [0,1,...]
  ranks: []