# Rollout backend (actor_rollout_ref.rollout.name). Options: hf, vllm, sglang.
name: vllm

# Engine mode. sync: LLM, async: AsyncLLM.
mode: sync

# Sampling temperature for rollout.
temperature: 1.0

# Top-k sampling parameter. Use -1 for vLLM rollout, 0 for HF rollout.
top_k: -1

# Top-p sampling parameter. Default 1.0.
# Written as a float literal (1.0, not 1) so it has the same type as val_kwargs.top_p.
top_p: 1.0

# Prompt length for rollout; typically the same as the data max prompt length.
# Resolves to data.max_prompt_length if it exists, otherwise falls back to 512.
prompt_length: ${oc.select:data.max_prompt_length,512}

# Response length for rollout; typically the same as the data max response length.
# Resolves to data.max_response_length if it exists, otherwise falls back to 512.
response_length: ${oc.select:data.max_response_length,512}

# for vllm rollout
# Data type of the rollout model parameters. Keep it aligned with the actor model's FSDP/Megatron type.
dtype: bfloat16

# Fraction of GPU memory that vLLM/SGLang may use for the KV cache.
gpu_memory_utilization: 0.5

# If true, ignore EOS and keep generating after EOS is hit.
ignore_eos: false

# If true, CUDA graph is disabled. Defaults to true so that the cache can be freed.
enforce_eager: true

# If true, free the engine KV cache after generation. Requires enforce_eager to be true.
free_cache_engine: true

# Tensor-parallel size for rollout. Has no effect for hf.
tensor_model_parallel_size: 2

# Max number of tokens in a batch.
max_num_batched_tokens: 8192

# Max length for rollout.
max_model_len: null

# Max number of sequences (a count, not a length).
max_num_seqs: 1024

# [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
# Batch size for one forward pass when computing log_prob. Global batch size.
log_prob_micro_batch_size: null

# Batch size for one forward pass when computing log_prob. Local batch size per GPU.
log_prob_micro_batch_size_per_gpu: null

# Enable dynamic batch size (sequence packing) for the log_prob computation.
# Resolves to actor_rollout_ref.actor.use_dynamic_bsz if it exists, otherwise false.
log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}

# Max token length for the log_prob computation.
# Resolves to actor_rollout_ref.actor.ppo_max_token_len_per_gpu if it exists, otherwise 16384.
log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384}

# If true, logging of statistics is disabled.
disable_log_stats: true

# for hf rollout
# If true, sample during training rollout; if false, use greedy sampling.
do_sample: true

# Number of responses per prompt (i.e. number of sample times). Use > 1 for grpo.
n: 1

# If true, wake up the inference engine in multiple stages
# (model weights are woken up first, then the kv cache is resumed).
multi_stage_wake_up: false

# Extra arguments passed to the inference engine (vllm, sglang).
engine_kwargs:

  # vllm-specific arguments
  vllm:

    # Swap space (in GB) used by the inference engine. null uses the default (e.g., 4 GB).
    swap_space: null

    # If true, the preprocessor cache for multimodal models is disabled.
    disable_mm_preprocessor_cache: false

  # sglang-specific arguments
  sglang:

    # Attention backend for the sglang engine.
    # Options: flashinfer, triton, flashmla; null for the default.
    attention_backend: null

# Sampling parameters used during validation.
val_kwargs:

  # Top-k sampling parameter for validation. Use -1 for vLLM rollout, 0 for HF rollout.
  top_k: -1

  # Top-p sampling parameter for validation. Default 1.0.
  top_p: 1.0

  # Sampling temperature for validation.
  temperature: 0

  # Number of times to repeat (n) for validation.
  n: 1

  # If true, sample during validation; if false, use greedy sampling.
  do_sample: false

# Multi-turn interaction config for tools or chat.
multi_turn:

  # Set to true for multi-turn tool interaction tasks; rollout.name should be set to sglang as well.
  enable: false

  # Max number of assistant turns. null for no limit (default max_length // 3).
  max_assistant_turns: null

  # Path to the tool config. null for no tool.
  tool_config_path: null

  # Max number of user turns. null for no limit (default max_length // 3).
  max_user_turns: null

  # Max number of parallel tool calls in a single turn.
  max_parallel_calls: 1

  # Max length of a tool response.
  max_tool_response_length: 256

  # Truncate side of the tool response: left, middle, right.
  tool_response_truncate_side: middle

  # Path to the interaction config. null for no interaction.
  interaction_config_path: null

  # Completion callback. null for the default callback.
  completion_callback: null

  # - true: the model's default chat template is used for multi-turn rollout,
  #   which typically matches production behavior.
  # - false: the token ids recorded for training are used instead. Unlike the default
  #   chat template, these always include the model's full output, which may contain
  #   additional content such as reasoning content. This keeps training and rollout
  #   consistent, but leads to longer prompts.
  use_inference_chat_template: false

  # Tokenization is performed turn by turn, and the resulting token ids are concatenated
  # to form the full conversation. To ensure this matches the result of tokenizing the
  # entire conversation at once, a sanity check runs at the end of each multi-turn rollout
  # and compares the two sets of token ids.
  # Some models are known to produce different tokenization results when tokenizing turn
  # by turn vs. all at once. This behavior has already been validated for them.
  # To reduce excessive warnings, you can turn off the sanity check for these models if
  # you are using their default chat template:
  #   Qwen/QwQ-32B, Qwen/Qwen3-xxB
  # Modes:
  # - disable: disable the tokenization sanity check
  # - strict: enable the strict tokenization sanity check (default)
  # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
  tokenization_sanity_check_mode: strict

  # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
  format: hermes

# If true, rollout probabilities are made available for logging (for debugging purposes).
calculate_log_probs: false

# [Experimental] Configuration for agent-loop-based rollout.
agent:

  # Number of agent loop workers.
  num_workers: 8

  # Settings for a custom async server.
  custom_async_server:

    # Path to the custom async server implementation (null here, i.e. not set).
    path: null

    # Class name of the custom async server class (e.g. AsyncvLLMServer); null here, i.e. not set.
    name: null
