# Hardware layout for the run.
# NOTE(review): presumably 1 node with 8 GPUs per node — confirm against the launcher.
trainer:
  nnodes: 1
  n_gpus_per_node: 8

# Input dataset and output location.
# NOTE(review): YAML does not expand `~`; the consumer must expand it itself.
data:
  path: ~/data/rlhf/math/test.parquet
  # presumably the column of the parquet file that holds the prompt — confirm
  prompt_key: prompt
  n_samples: 5
  # NOTE(review): absolute, machine-specific path — override per environment.
  output_path: /opt/tiger/math_Qwen2-7B-Instruct.parquet
  batch_size: 128

# Model checkpoint location.
# NOTE(review): YAML does not expand `~`; the consumer must expand it itself.
model:
  path: ~/models/Qwen2-7B-Instruct
  external_lib: null  # no external library configured
# Rollout (generation) settings, including the multi-agent extensions.
rollout:
  name: vllm

  # Sampling parameters; several are reused below via ${rollout.*} interpolation.
  temperature: 1.0
  top_k: -1  # use 0 for hf rollout, -1 for vllm rollout
  top_p: 1
  min_p: 0.0
  repetition_penalty: 1.0
  presence_penalty: 0.0
  prompt_length: 1536
  response_length: 512

  # for vllm rollout
  dtype: bfloat16  # should align with FSDP
  gpu_memory_utilization: 0.5
  ignore_eos: false
  enforce_eager: true
  free_cache_engine: true
  load_format: dummy_dtensor
  tensor_model_parallel_size: 1
  max_num_batched_tokens: 8192
  max_num_seqs: 1024
  # will be deprecated, use log_prob_micro_batch_size_per_gpu
  log_prob_micro_batch_size: null
  log_prob_micro_batch_size_per_gpu: 8

  # for hf rollout
  # NOTE(review): the keys after do_sample do not look hf-specific — confirm
  # which backend reads them.
  do_sample: true
  disable_log_stats: true
  enable_chunked_prefill: true
  n: 1

  # Disabled validation-time sampling overrides, kept for reference.
  # validate: # validate hparams
  #   temperature: 0.6
  #   top_k: -1 # 0 for hf rollout, -1 for vllm rollout
  #   top_p: 0.95
  #   min_p: 0.0
  #   repetition_penalty: 1.0
  #   presence_penalty: 0.0

  use_rl_agent: true
  multiagent: true  # multi-agent rl
  multiturn_agent: true  # multi-turn agent rollout

  # multiple agent rollout
  multi_agent_pattern:
    max_depth: 20
    tool_external_concurrency: 64  # see qwen_agent/utils/parallel_executor.py
    beta: 1

    system1_sampling_params:
      system1_mode: training  # one of: online, training, empty
      # alternative endpoint: https://dashscope.aliyuncs.com/compatible-mode/v1
      system1_url: http://172.27.22.155:10080/v1
      system1_key: empty
      model_name: qwen2.5-72b-instruct
      system1_concurrency: 512
      temperature: ${rollout.temperature}
      top_p: ${rollout.top_p}
      max_tokens: 8192
      n: 1
      # alternative: add_prefix: null
      add_prefix: '<tool_response>'
      max_prompt_length: 20480
      max_response_length: ${rollout.multi_agent_pattern.system1_sampling_params.max_tokens}
      # alternative: include_stop_str_in_output: False
      include_stop_str_in_output: true
      stop:
        - '</tool_response>'

    system2_sampling_params:
      stop:
        - '</tool_call>'
        - '</answer>'
      include_stop_str_in_output: true
      add_prefix: '<research>'
      # max_context_length: ${eval:${data.max_prompt_length} + ${data.max_response_length}}
      max_prompt_length: ${rollout.prompt_length}
      max_response_length: ${rollout.response_length}
    
# Actor settings. Marked as dummy params by the original author.
actor:  # dummy params
  strategy: fsdp  # kept for backward compatibility
  ppo_mini_batch_size: 256
  # will be deprecated, use ppo_micro_batch_size_per_gpu
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  use_dynamic_bsz: false
  # Original sizing note: n * ${data.max_prompt_length} + ${data.max_response_length}
  # NOTE(review): the `data` section of this file defines neither key — confirm
  # where those lengths come from.
  ppo_max_token_len_per_gpu: 16384
  grad_clip: 1.0
  clip_ratio: 0.2
  entropy_coeff: 0.001
  use_kl_loss: false  # true for GRPO
  kl_loss_coef: 0.001  # for grpo
  kl_loss_type: low_var_kl  # for grpo
  ppo_epochs: 1
  shuffle: false
  ulysses_sequence_parallel_size: 1  # sp size
  optim:
    # Written with an explicit mantissa point: YAML 1.1 loaders (e.g. PyYAML)
    # read a bare `1e-6` as the string '1e-6' rather than a float.
    lr: 1.0e-6
    lr_warmup_steps_ratio: 0.0  # the total steps will be injected during runtime
    min_lr_ratio: null  # only useful for warmup with cosine
    warmup_style: constant  # select from constant/cosine
    total_training_steps: -1  # must be overridden by the program
  fsdp_config:
    wrap_policy:
      min_num_params: 0
    param_offload: false
    optimizer_offload: false
    fsdp_size: -1
# Top-level run flags.
debug: true
timestamp: 'test'