# Hardware layout for the run: how many nodes, and how many GPUs on each.
# NOTE(review): total GPU count is presumably nnodes * n_gpus_per_node
# (1 * 8 = 8 with these values) — confirm against the launcher.
trainer:
  nnodes: 1
  n_gpus_per_node: 8

# Dataset location and sequence-length limits.
# max_prompt_length and max_response_length are read by the rollout section
# through ${data.max_prompt_length} / ${data.max_response_length}, so editing
# them here also changes rollout.prompt_length and rollout.response_length.
data:
  path: null  # no default in this file; presumably supplied at launch — confirm
  max_prompt_length: 1024
  # 1024 + 15360 = 16384, which equals actor.ppo_max_token_len_per_gpu.
  max_response_length: 15360
  prompt_key: item_id
  n_samples: 1
  output_path: null  # no default in this file; presumably supplied at launch — confirm
  batch_size: 16

# Settings for the AgentGym task environment, addressed over HTTP via env_addr.
# NOTE(review): max_retries / max_rounds presumably bound request retries and
# interaction rounds per episode — confirm against the consumer.
agentgym:
  task_name: textcraft
  env_addr: 'http://localhost:5000'
  max_retries: 10
  max_rounds: 10
  timeout: 300  # NOTE(review): unit is not stated here; presumably seconds — confirm

# Model selection. Both fields are null in this file.
model:
  path: null  # presumably a checkpoint path supplied at launch — confirm
  external_lib: null
# Generation (rollout) settings. The backend named here is vllm; keys below are
# grouped into sampling options, vllm-specific options, and hf-specific options.
rollout:
  name: vllm

  # Sampling.
  temperature: 1.0
  top_k: -1  # 0 for hf rollout, -1 for vllm rollout
  top_p: 1

  # Sequence lengths are taken from the data section via interpolation.
  prompt_length: ${data.max_prompt_length}  # not used for opensource
  response_length: ${data.max_response_length}

  # For vllm rollout.
  dtype: bfloat16  # should align with FSDP
  gpu_memory_utilization: 0.5
  ignore_eos: false
  enforce_eager: true
  free_cache_engine: true
  load_format: dummy_dtensor
  tensor_model_parallel_size: 2
  max_num_batched_tokens: 8192
  max_model_len: 32768
  max_num_seqs: 1024
  # Will be deprecated; use log_prob_micro_batch_size_per_gpu instead.
  log_prob_micro_batch_size: null
  log_prob_micro_batch_size_per_gpu: 1
  # The next two follow the actor section's values via interpolation.
  log_prob_use_dynamic_bsz: ${actor.use_dynamic_bsz}
  log_prob_max_token_len_per_gpu: ${actor.ppo_max_token_len_per_gpu}
  max_tokens: 1024
  disable_log_stats: true
  # May give higher throughput when set to true. When activated, please
  # increase max_num_batched_tokens or decrease max_model_len.
  enable_chunked_prefill: true

  # For hf rollout.
  do_sample: true
  # Number of responses (i.e. number of sampling passes); > 1 for GRPO.
  n: 1
  send_interval: 1
  rollout_log_dir: null

# Actor (policy) update settings: PPO batch sizes and loss coefficients, the
# optimizer sub-section, and the FSDP wrapping/offload sub-section.
# rollout.log_prob_use_dynamic_bsz and rollout.log_prob_max_token_len_per_gpu
# interpolate use_dynamic_bsz and ppo_max_token_len_per_gpu from this section.
actor:
  strategy: fsdp  # kept for backward compatibility
  ppo_mini_batch_size: 16
  # Will be deprecated; use ppo_micro_batch_size_per_gpu instead.
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: 1
  use_dynamic_bsz: false
  # n * ${data.max_prompt_length} + ${data.max_response_length}
  # With this file's defaults: 1 * 1024 + 15360 = 16384 (assuming n is
  # rollout.n — confirm).
  ppo_max_token_len_per_gpu: 16384
  grad_clip: 1.0
  clip_ratio: 0.2
  entropy_coeff: 0.001
  # NOTE(review): null rather than an explicit boolean; presumably treated as
  # disabled by the consumer — confirm. Set to true for GRPO.
  use_kl_loss: null
  kl_loss_coef: 0.001  # for GRPO
  kl_loss_type: low_var_kl  # for GRPO
  ppo_epochs: 1
  shuffle: false
  ulysses_sequence_parallel_size: 1  # sp size
  optim:
    # Written with an explicit mantissa dot: a bare `1e-6` is read as the
    # string '1e-6' by YAML 1.1 loaders (e.g. plain PyYAML), whereas `1.0e-6`
    # is a float under every loader.
    lr: 1.0e-6
    lr_warmup_steps_ratio: 0.0  # the total steps will be injected during runtime
    min_lr_ratio: null  # only useful for warmup with cosine
    warmup_style: constant  # select from constant/cosine
    total_training_steps: -1  # placeholder; must be overridden by the program
  fsdp_config:
    wrap_policy:
      # transformer_layer_cls_to_wrap: None
      min_num_params: 0
    param_offload: false
    grad_offload: false
    optimizer_offload: false
    fsdp_size: -1