# Target class for this configuration
_target_: verl.workers.config.RolloutConfig

# Rollout engine name (actor_rollout_ref.rollout.name): hf/vllm/sglang.
# Mandatory: `???` is OmegaConf's missing-value marker, so there is no default here
# and the value must be supplied explicitly.
name: ???

# Engine mode. sync: LLM, async: AsyncLLM
mode: async

# Sampling temperature for rollout.
temperature: 1.0

# Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
top_k: -1

# Top-p sampling parameter. Default 1.0.
top_p: 1

# Prompt length for rollout.
# Resolves to data.max_prompt_length if it exists, otherwise 512.
prompt_length: ${oc.select:data.max_prompt_length,512}

# Response length for rollout.
# Resolves to data.max_response_length if it exists, otherwise 512.
response_length: ${oc.select:data.max_response_length,512}

# for vllm rollout
# Rollout model parameters type. Align with the actor model's FSDP/Megatron type.
dtype: bfloat16

# Fraction of GPU memory used by vLLM/SGLang for KV cache.
gpu_memory_utilization: 0.5

# Whether to ignore EOS and continue generating after EOS is hit.
ignore_eos: false

# Whether to disable CUDA graph. Defaults to false (CUDA graph enabled) for best performance.
enforce_eager: false

# Batch sizes of cudagraph to capture. Requires enforce_eager: false to take effect.
# Since cudagraph in the inference engine cannot be offloaded during policy update,
# you can use smaller batch sizes to save memory used by cuda graph, e.g. [1, 2, 4, 8, 16, 32]
# supported engines: vllm
cudagraph_capture_sizes: null

# Whether to free engine KVCache after generation.
free_cache_engine: true

# Tensor-parallel (TP) size for rollout. Not effective for hf.
tensor_model_parallel_size: 2

# Data-parallel (DP) size for rollout.
data_parallel_size: 1

# Expert-parallel (EP) size for rollout.
expert_parallel_size: 1

# Pipeline-parallel (PP) size for rollout.
pipeline_model_parallel_size: 1

# Max number of tokens in a batch.
max_num_batched_tokens: 8192

# Max length for rollout.
max_model_len: null

# Max number of sequences processed in a single iteration (a count, not a length).
max_num_seqs: 1024

# May give higher throughput when set to true. When activated, please increase
# max_num_batched_tokens or decrease max_model_len.
enable_chunked_prefill: true

# Prefix caching of kv-cache blocks is a popular optimization in LLM inference
# to avoid redundant prompt computations.
enable_prefix_caching: true

# Logprobs mode for rollout logprobs.
logprobs_mode: processed_logprobs

# Scheduling policy for vllm rollout.
scheduling_policy: fcfs

# Which loader to use for rollout model weights: dummy, hf, megatron, etc.
# safetensors: for huge models (also set use_shm=True); dummy: randomly initialized model weights.
load_format: dummy

# [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
# The batch size for one forward pass in the computation of log_prob. Global batch size.
log_prob_micro_batch_size: null

# The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
log_prob_micro_batch_size_per_gpu: null

# Enable dynamic batch size (sequence packing) for log_prob computation.
# Same as actor_rollout_ref.actor.use_dynamic_bsz if it exists, otherwise false.
log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}

# Max token length for log_prob computation.
# Same as actor_rollout_ref.actor.ppo_max_token_len_per_gpu if it exists, otherwise 16384.
log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}

# Disable logging statistics.
disable_log_stats: true

# for hf rollout
# Whether to sample during training rollout. false uses greedy sampling.
do_sample: true

# Number of responses per prompt (i.e. number of sample times). > 1 for grpo.
n: 1

# The over_sample_rate parameter controls the early termination threshold for training rollouts:
# the system aborts the remaining requests once (1 - over_sample_rate) * total_requests
# completions are reached.
over_sample_rate: 0

# Whether to wake up the inference engine in multiple stages for SGLang,
# to reduce peak memory during the training-rollout transition.
# This is only effective for SGLang rollout.
multi_stage_wake_up: false

# Extra inference engine arguments (vllm, sglang); please refer to the official vllm/sglang docs for details.
engine_kwargs:

  # vllm engine config (empty mapping: no extra arguments)
  vllm: {}

  # sglang engine config (empty mapping: no extra arguments)
  sglang: {}

# Sampling parameters used during validation.
val_kwargs:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.SamplingConfig

  # sampling parameters for validation
  # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
  top_k: -1

  # Top-p sampling parameter. Default 1.0.
  top_p: 1.0

  # Sampling temperature for validation rollout.
  temperature: 0

  # Number of times each prompt is repeated for validation.
  n: 1

  # Whether to sample during validation rollout. false uses greedy sampling.
  do_sample: false

# Multi-turn interaction config for tools or chat.
multi_turn:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.MultiTurnConfig

  # Set to true for multi-turn tool interaction tasks; rollout.name should be set to sglang as well.
  enable: false

  # null for no limit (default max_length // 3)
  max_assistant_turns: null

  # Path to the tool config; null for no tool.
  tool_config_path: null

  # null for no limit (default max_length // 3)
  max_user_turns: null

  # Max number of parallel tool calls in a single turn.
  max_parallel_calls: 1

  # Max length of a tool response.
  max_tool_response_length: 256

  # Which side of the tool response to truncate: left, middle, right.
  tool_response_truncate_side: middle

  # Path to the interaction config; null for no interaction.
  interaction_config_path: null

  # - When set to true, the model's default chat template is used for multi-turn rollout,
  #   which typically matches production behavior.
  # - When set to false, the token ids recorded for training are used instead; unlike the
  #   default chat template, these always include the model's full output, which may contain
  #   additional content such as reasoning content. This maintains the consistency between
  #   training and rollout, but it will lead to longer prompts.
  use_inference_chat_template: false

  # Tokenization is performed turn by turn and the resulting token ids are concatenated to
  # form the full conversation. To ensure this matches the result of tokenizing the entire
  # conversation at once, a sanity check is run at the end of each multi-turn rollout to
  # compare the two sets of token ids.
  # Some models are known to produce different tokenization results when tokenizing turn by
  # turn vs. all at once. This behavior has already been validated for them.
  # To reduce excessive warnings, you can turn off the sanity check for these models if you
  # are using their default chat template:
  # Qwen/QwQ-32B, Qwen/Qwen3-xxB
  # - disable: disable tokenization sanity check
  # - strict: enable strict tokenization sanity check (default)
  # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
  tokenization_sanity_check_mode: strict

  # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
  format: hermes

  # Number of repeat rollouts for each interaction.
  num_repeat_rollouts: null

# Support logging rollout log probs for debugging purposes.
# "Truncated importance sampling" requires rollout log probs, so set this to true when enabling it.
calculate_log_probs: false

# [Experimental] agent loop based rollout configs
agent:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.AgentLoopConfig

  # Number of agent loop workers
  num_workers: 8

  # Default agent loop to use if `agent_name` is not set in the RL dataset
  default_agent_loop: single_turn_agent

  # Custom agent loop config path; the file should contain a list of configs used to
  # initialize AgentLoop instances.
  # https://hydra.cc/docs/advanced/instantiate_objects/overview/
  #
  # Example file content:
  # - name: react_agent
  #   _target_: recipe.langgraph_agent.react_agent_loop.ReactAgentLoop
  #   tools: ["get_current_temperature"]
  # - name: math_expression
  #   _target_: recipe.langgraph_agent.example.math_expression.MathExpressionReactAgentLoop
  #   min_terms: 2
  #   max_terms: 6
  agent_loop_config_path: null

  # Custom async server configs
  custom_async_server:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.workers.config.CustomAsyncServerConfig

    # Path to the custom async server implementation
    path: null

    # Class name of the custom async server class (e.g. AsyncvLLMServer)
    name: null

# Specifies the tensor bucket size (in megabytes) for batch weight updates during rollout operations.
# This parameter controls the maximum payload size for a single weight update request.
# Reference: See related PR for details.
# Currently only supported in SGLang rollout implementations.
# Larger values may improve throughput but increase memory overhead.
# Detailed performance comparison: see related performance comparison for details.
# The default value (512MB) is optimized for typical GPU memory configurations.
# For the best performance of `rebuild_cuda_tensor` when using Tensor Parallelism (TP) >= 8,
# it is recommended to:
# 1. Enable `RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES`
# 2. Manually set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`
update_weights_bucket_megabytes: 512

# Trace rollout data.
trace:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.TraceConfig

  # Trace backend; supports mlflow, weave. null leaves no backend selected.
  backend: null

  # Whether to translate token ids to text in the output.
  token2text: false

  # Maximum number of unique samples to trace per agent worker per training step.
  # If null, all samples are traced. If set to N, each agent loop worker will randomly
  # select N unique samples to trace (including all their rollouts for GRPO).
  # Total traces per step = max_samples_per_step_per_worker * num_workers * n_rollouts_per_sample
  max_samples_per_step_per_worker: null

# When enabled (true), the trainer will attempt to load previously generated rollout data
# from the specified directory instead of computing new rollouts.
# If no cached data is found or loading fails, new rollouts will be generated and automatically saved.
# This feature is useful for debugging or when you want to reuse computation results across multiple runs.
skip_rollout: false

# Specifies the filesystem path where rollout data should be cached when skip_rollout is enabled.
# Note: a path under /tmp/ray/session* is not recommended, as these are temporary Ray cluster directories.
skip_dump_dir: /tmp/rollout_dump

# Whether to skip tokenizer initialization for the rollout engine.
# When enabled (true), the rollout assumes token-in/token-out generation.
skip_tokenizer_init: true

# Whether to enable rollout routing replay for MoE models.
# When enabled (true), the rollout will record the routing decisions.
enable_rollout_routing_replay: false


# Profile the rollout model in `generate_sequence`.
profiler:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.utils.profiler.ProfilerConfig

  # Profiler tool; same as global_profiler.tool if it exists, otherwise null.
  # choices: nsys, npu, torch
  tool: ${oc.select:global_profiler.tool,null}

  # Whether to enable profiling for rollout.
  # Same as actor_rollout_ref.actor.profiler.enable if it exists, otherwise false.
  enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false}

  # Whether to profile all ranks.
  # Same as actor_rollout_ref.actor.profiler.all_ranks if it exists, otherwise false.
  all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false}

  # The ranks that will be profiled. [] or [0,1,...]
  # Same as actor_rollout_ref.actor.profiler.ranks if it exists, otherwise [].
  ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]}

  # Path where profile results are saved; same as global_profiler.save_path if it exists, otherwise null.
  save_path: ${oc.select:global_profiler.save_path,null}

  # Tool-specific config; same as actor_rollout_ref.actor.profiler.tool_config if it exists, otherwise null.
  tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}

# Prometheus configuration for vllm/sglang server mode
prometheus:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.workers.config.PrometheusConfig

  # Whether to enable prometheus for server mode rollout
  enable: false

  # Port number that Prometheus listens on, default is 9090
  port: 9090

  # Path to Prometheus configuration file
  file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml

  # Specify served_model_name to avoid displaying overly long model paths in Grafana.
  # Resolves to actor_rollout_ref.model.path if it exists, otherwise null.
  served_model_name: ${oc.select:actor_rollout_ref.model.path,null}

# Type of quantization in vllm; currently supports fp8 and torchao.
quantization: null

# Extra quantization information serialized in a config file, e.g. torchao_config.json
quantization_config_file: null
