trainer:
  nnodes: 1
  n_gpus_per_node: 8

data:
  path: ~/data/rlhf/math/test.parquet
  prompt_key: prompt
  n_samples: 1
  output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
  batch_size: 128

model:
  path: ~/models/Qwen2-7B-Instruct
  external_lib: null

actor:
    strategy: fsdp  # This is for backward-compatibility
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
    ppo_micro_batch_size_per_gpu: null
    use_dynamic_bsz: False
    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
    grad_clip: 1.0
    clip_ratio: 0.2
    entropy_coeff: 0.001
    use_kl_loss: False # True for GRPO
    kl_loss_coef: 0.001 # for grpo
    kl_loss_type: low_var_kl # for grpo
    ppo_epochs: 1
    shuffle: False
    ulysses_sequence_parallel_size: 1 # sp size
    optim:
      lr: 1e-6
      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: null   # only useful for warmup with cosine
      warmup_style: constant  # select from constant/cosine
      total_training_steps: -1  # must be override by program
    fsdp_config:
      wrap_policy:
        # transformer_layer_cls_to_wrap: None
        min_num_params: 0
      param_offload: False
      optimizer_offload: False
      fsdp_size: -1
rollout:
  name: vllm
  temperature: 1.0
  top_k: 50 # 0 for hf rollout, -1 for vllm rollout
  top_p: 0.7
  prompt_length: 1536
  response_length: 512
  validate_roll_out_max_length: 4096
  # for vllm rollout
  dtype: bfloat16 # should align with FSDP
  gpu_memory_utilization: 0.5
  ignore_eos: False
  enforce_eager: True
  free_cache_engine: True
  load_format: dummy_dtensor
  tensor_model_parallel_size: 1
  max_num_batched_tokens: 8192
  max_num_seqs: 1024
  log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
  log_prob_micro_batch_size_per_gpu: 8
  log_prob_use_dynamic_bsz: ${actor.use_dynamic_bsz}
  log_prob_max_token_len_per_gpu: ${actor.ppo_max_token_len_per_gpu}
  disable_log_stats: True
  enable_chunked_prefill: True
  n: 1
  # for hf rollout
  do_sample: True