# configs for the reward model

# Toggle for the reward model. When false, the reward is computed only with
# the user-defined reward functions.
# The GSM8K and Math examples leave the reward model disabled; the RLHF
# alignment example on full_hh_rlhf uses it to assess the responses.
# When false, the parameters below are not effective.
enable: false

# NOTE(review): the consumer of these keyword arguments is not visible in this
# file; confirm which reward function reads them.
reward_kwargs:
  sentence_weight: 1.0
  process_weight: 0.1
  max_wrong_steps: 1

# Whether the model is deployed to a separate resource pool.
# When true, n_gpus_per_node and nnodes are used to determine its resources.
enable_resource_pool: false
n_gpus_per_node: 0
nnodes: 0

# FSDP strategy: "fsdp" or "fsdp2".
# `???` is OmegaConf's marker for a mandatory value: it must be supplied by
# another config or an override before it is read.
strategy: ???

# Model used for reward scoring
model:

  # Input tokenizer. When the reward model's chat template is inconsistent
  # with the policy's, inputs are first decoded to plain text, then the RM's
  # chat template is applied before scoring with the RM.
  # Set to null when the two chat templates are identical.
  input_tokenizer: ${actor_rollout_ref.model.path}

  # HDFS path or local path of the RM. Only AutoModelForSequenceClassification
  # is supported; other model types need to define their own RewardModelWorker
  # and pass it in from code.
  path: ~/models/FsfairX-LLaMA3-RM-v0.1

  # External model implementation (optional)
  external_lib: ${actor_rollout_ref.model.external_lib}

  # Whether to enable loading a model that ships remote code (default: false)
  trust_remote_code: false

# [Deprecated] Global micro batch size.
# Use micro_batch_size_per_gpu instead.
micro_batch_size: null

# Local per-GPU micro batch size
micro_batch_size_per_gpu: null

# Maximum sequence length to process for scoring
max_length: null

# Whether to dynamically adjust batch size at runtime.
# Inherits the critic's setting through interpolation unless overridden.
use_dynamic_bsz: ${critic.use_dynamic_bsz}

# Maximum number of tokens per GPU in one forward pass.
# Inherits the critic's setting through interpolation unless overridden.
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

# Reward manager: the mechanism that computes rule-based rewards and handles
# different reward sources. The default is naive. When all verification
# functions are multiprocessing-safe, it can be set to prime for parallel
# verification.
reward_manager: naive

# Whether to launch the custom reward function asynchronously (on CPU) during log_prob
launch_reward_fn_async: false

# Cloud/local sandbox fusion configuration for custom reward logic
sandbox_fusion:

  # Cloud/local function URL for sandbox execution
  url: null

  # Max number of concurrent requests allowed to the sandbox
  max_concurrent: 64

  # Max memory limit for each sandbox process, in MB
  memory_limit_mb: 1024

# Profiler settings for the reward model (profiled in `compute_reward`)
profiler:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.utils.profiler.ProfilerConfig

  # Profiler tool; defaults to global_profiler.tool (null when that is not set).
  # Choices: nsys, npu, torch
  tool: ${oc.select:global_profiler.tool,null}

  # Whether to enable profiling for the reward model
  enable: false

  # Whether to profile all ranks
  all_ranks: false

  # Ranks to profile: [] or [0,1,...]
  ranks: []

  # Path where profiling results are saved; defaults to global_profiler.save_path
  save_path: ${oc.select:global_profiler.save_path,null}

  # Tool-specific config; taken from the actor's profiler tool_config (null when absent)
  tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}