# Hardware layout for the job.
# NOTE(review): presumably nnodes * n_gpus_per_node (1 * 8 = 8) devices are used in total — confirm against the launcher.
trainer:
  nnodes: 1
  n_gpus_per_node: 8
  device: cuda

# Input prompts and the location the results are written to.
data:
  # YAML does not expand the leading `~`; this is loaded as the literal string "~/data/rlhf/math/test.parquet".
  # Presumably the consuming code expands it to the home directory — verify.
  path: ~/data/rlhf/math/test.parquet
  # NOTE(review): presumably the name of the column in the parquet file that holds the prompt — confirm against the dataset.
  prompt_key: prompt
  # NOTE(review): presumably the number of responses generated per prompt — confirm how this relates to `rollout.n`.
  n_samples: 5
  # Machine-specific absolute path. The file name embeds the dataset ("math") and the model name ("Qwen2-7B-Instruct"),
  # so update it together with `data.path` / `model.path` when overriding either of them.
  output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
  batch_size: 128

# Model used for generation.
model:
  # Loaded as the literal string "~/models/Qwen2-7B-Instruct"; YAML itself does not expand `~`.
  path: ~/models/Qwen2-7B-Instruct
  # null = no external library configured.
  # NOTE(review): presumably a module to import before loading the model when set — TODO confirm.
  external_lib: null
# Generation (rollout) backend selection, sampling parameters and engine settings.
rollout:
  name: vllm
  mode: sync # sync: LLM, async: AsyncLLM
  temperature: 1.0
  # NOTE(review): 0 (hf rollout) and -1 (vllm rollout) are presumably the backend-specific "top-k disabled" sentinels.
  # The value configured here (50) is neither of them — confirm that active top-k filtering is intended.
  top_k: 50 # 0 for hf rollout, -1 for vllm rollout
  top_p: 0.7
  prompt_length: 1536
  response_length: 512
  # for vllm rollout
  dtype: bfloat16 # should align with FSDP
  gpu_memory_utilization: 0.5
  ignore_eos: False
  enforce_eager: True
  free_cache_engine: True
  load_format: dummy_dtensor
  tensor_model_parallel_size: 1
  max_num_batched_tokens: 8192
  max_model_len: null
  max_num_seqs: 1024
  log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
  log_prob_micro_batch_size_per_gpu: 8
  # for fire vllm rollout
  use_fire_sampling: False # enable FIRE https://arxiv.org/abs/2410.21236
  # for hf rollout
  do_sample: True
  # NOTE(review): the three keys below sit under the "for hf rollout" heading but do not look HF-specific
  # (they resemble engine-level options) — confirm which backend actually reads them.
  disable_log_stats: True
  enable_chunked_prefill: True
  n: 1
  # Settings for multi-turn rollout (only relevant when `enable` is True).
  multi_turn:

    # Set to True for multi-turn tool interaction tasks; rollout.name should be set to sglang as well.
    enable: False

    # null for no limit (default max_length // 3)
    max_turns: null

    # null for no tool
    tool_config_path: null

    # null for default callback
    completion_callback: null

    # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior.
    # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
    #   which may contain additional content such as reasoning content. This maintains the consistency between training and rollout, but it will lead to longer prompts.
    use_inference_chat_template: False

    # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
    # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
    # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them.
    # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
    # Qwen/QwQ-32B, Qwen/Qwen3-xxB
    enable_tokenization_sanity_check: True
# Actor-side settings kept in this config (see the backward-compatibility note on `strategy`).
actor:
  strategy: fsdp  # This is for backward-compatibility
  ulysses_sequence_parallel_size: 1 # sp size
  entropy_from_logits_with_chunking: False  # calculate entropy with chunking to reduce memory peak
  # NOTE(review): presumably trades extra compute for lower memory by recomputing entropy instead of storing it — confirm.
  entropy_checkpointing: False  # recompute entropy
  fsdp_config:
    # NOTE(review): -1 is presumably a sentinel for "derive the size automatically" — TODO confirm against the FSDP setup code.
    fsdp_size: -1
    forward_prefetch: False  # FSDP1 forward_prefetch configuration

# Options for Ray initialization.
ray_init:
  # null (`None`) means all CPUs are used, which might cause a hang when the CPU allocation is limited, e.g. on systems like SLURM.
  # In that case, set this to the number of CPUs you are allowed to use.
  num_cpus: null
  # NOTE(review): null presumably means no timeline file is written — confirm.
  timeline_json_file: null
