# Hardware layout for the run. With the values below this is 1 node x 8 GPUs
# per node = 8 GPUs (presumably the world size; confirm against the launcher).
trainer:
  nnodes: 1
  n_gpus_per_node: 8

# Dataset input/output settings.
# NOTE(review): YAML does not expand `~`; `path` presumably relies on the
# consumer expanding it to the user's home directory. Confirm.
data:
  path: ~/data/rlhf/math/test.parquet
  # NOTE(review): the *_key entries are presumably column names in the parquet
  # file at `path`; verify against the dataset loader.
  prompt_key: prompt
  response_key: responses
  data_source_key: data_source
  reward_model_key: reward_model
  n_samples: 1
  output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
  # Path to a checkpoint file to resume from; if null, a default checkpoint
  # will be created.
  load_path: null
  batch_size: 2048

# Model selection.
# NOTE(review): `path` starts with `~`, which YAML keeps as a literal
# character; presumably the consumer expands it. Confirm.
# NOTE(review): the accepted values of `type` are defined by the consumer and
# are not visible in this file.
model:
  path: ~/models/Qwen2-7B-Instruct
  type: R1
  external_lib: null

# Generation (rollout) settings. Booleans use the canonical lowercase
# `true`/`false` spelling so every YAML loader resolves them the same way.
rollout:
  name: vllm
  temperature: 1.0
  top_k: -1  # 0 for hf rollout, -1 for vllm rollout
  top_p: 1
  prompt_length: 2048
  response_length: 32768
  dtype: bfloat16
  gpu_memory_utilization: 0.9
  ignore_eos: false
  micro_batch_size: 256
  enforce_eager: true
  free_cache_engine: true
  load_format: dummy_dtensor
  tensor_model_parallel_size: 1
  # NOTE(review): prompt_length + response_length (2048 + 32768 = 34816)
  # exceeds max_num_batched_tokens; presumably this relies on
  # enable_chunked_prefill being true. Confirm before disabling it.
  max_num_batched_tokens: 8192
  max_model_len: null
  max_num_seqs: 1024
  # NOTE(review): both the global and the per-GPU log-prob micro batch size
  # are set. They agree for the 8-GPU layout under `trainer` (8 / 8 = 1);
  # confirm which one the consumer honours if the GPU count changes.
  log_prob_micro_batch_size: 8
  log_prob_micro_batch_size_per_gpu: 1
  do_sample: true
  n: 1
  n_val: 1
  enable_chunked_prefill: true
  disable_log_stats: true

# Actor settings: parallelism strategy, FSDP options and optimizer schedule.
actor:
  strategy: fsdp  # This is for backward-compatibility
  ulysses_sequence_parallel_size: 1  # sp size
  fsdp_config:
    wrap_policy:
      min_num_params: 0
    param_offload: false
    grad_offload: false
    optimizer_offload: false
    fsdp_size: -1
  optim:
    # Written with a decimal point and a signed exponent: YAML 1.1 loaders
    # (e.g. PyYAML) resolve a bare `1e-6` as the string "1e-6", not a float.
    lr: 1.0e-6
    lr_warmup_steps_ratio: 0.0  # the total steps will be injected during runtime
    min_lr_ratio: null  # only useful for warmup with cosine
    warmup_style: constant  # select from constant/cosine
    total_training_steps: -1  # must be overridden by the program

# Evaluation / test-case settings. Booleans use the canonical lowercase
# `true`/`false` spelling.
tcs:
  debug:
    use_debug: false
  evaluation: true
  complete_evaluation: false
  reward_model:
    max_test_cases: 10  # only takes effect in the hand_inputs() function
  # NOTE(review): this looks like a placeholder path; replace it with the real
  # LiveCodeBench data directory before running.
  livecodebench_dir: /path/to/folder/data/livecodebench_2408_2502