# PPO method settings. NOTE(review): `name` presumably selects the config
# class that consumes the remaining keys -- confirm against the loader.
method:
  name: PPOConfig
  # Rollout collection: 128 rollouts in total. chunk_size is presumably the
  # number of rollouts generated per batch (128 / 8 = 16 chunks) -- TODO confirm.
  num_rollouts: 128
  chunk_size: 8
  ppo_epochs: 4
  # NOTE(review): init_kl_coef / target / horizon look like the parameters of
  # an adaptive KL-penalty controller (initial coefficient, target KL,
  # adaptation horizon) -- confirm how the trainer interprets them.
  init_kl_coef: 0.05
  target: 6
  horizon: 10000
  # Presumably the discount factor and the GAE lambda -- confirm. gamma is
  # written as the integer 1, so it loads as an int rather than a float.
  gamma: 1
  lam: 0.95
  # Presumably the clipping ranges for the policy ratio and the value
  # function respectively -- confirm.
  cliprange: 0.2
  cliprange_value: 0.2
  # Presumably the weight of the value-function loss -- confirm. Written as
  # the integer 1, so it also loads as an int.
  vf_coef: 1
  # Reward scaling mode, given as the plain string `running`. ref_mean and
  # ref_std are explicit nulls. NOTE(review): presumably the reference
  # statistics are only needed for a different scale_reward mode -- confirm.
  scale_reward: running
  ref_mean: null
  ref_std: null
  cliprange_reward: 10
  # Keyword arguments for text generation (presumably forwarded to the model's
  # generate call -- confirm). Sampling is enabled. If the backend is Hugging
  # Face generate(), top_k: 0 and top_p: 1.0 are the values that turn off
  # top-k and nucleus filtering, i.e. sampling from the full distribution.
  gen_kwargs:
    max_new_tokens: 128
    top_k: 0
    top_p: 1.0
    do_sample: true
# Model selection.
model:
  # NOTE(review): the value is a single space, not an empty string or null.
  # It looks like a placeholder that must be overridden with a real path or
  # model id before this config is usable -- confirm how callers supply it.
  model_path: " "
  # Architecture family, given as the plain string `causal`.
  model_arch_type: causal
# Tokenizer selection and padding/truncation behaviour.
tokenizer:
  # NOTE(review): single-space placeholder string (not empty, not null);
  # presumably must be overridden with a real tokenizer path -- confirm.
  tokenizer_path: " "
  # Both padding and truncation are set to the left side of the sequence.
  padding_side: left
  truncation_side: left
# Optimizer selection. `kwargs` is presumably passed through to the optimizer
# constructor chosen by `name` -- confirm against the trainer.
optimizer:
  name: adamw
  kwargs:
    # Floats here are written with a decimal point and a signed exponent
    # (4.0e-6, not 4e-6). Keep that form: YAML 1.1 loaders such as PyYAML
    # resolve a bare `4e-6` as a string instead of a float.
    lr: 4.0e-6
    # Two-element list; flow style is kept because it is a short leaf pair.
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6
# Learning-rate scheduler. `kwargs` is presumably passed through to the
# scheduler constructor chosen by `name` -- confirm against the trainer.
scheduler:
  name: linear
  kwargs:
    # Factor goes from 0.2 to 1.0 over 100 iterations. NOTE(review): the key
    # names match torch.optim.lr_scheduler.LinearLR, in which case this is a
    # warm-up from 20% to 100% of the optimizer's base lr over the first 100
    # scheduler steps, constant afterwards -- confirm the backing class.
    start_factor: 0.2
    end_factor: 1.0
    total_iters: 100
# Training-loop settings.
train:
  seq_length: 512
  # epochs and total_steps are both 10000. NOTE(review): presumably training
  # stops at whichever limit is reached first -- confirm in the trainer.
  epochs: 10000
  total_steps: 10000
  # batch_size and minibatch_size are equal (4). NOTE(review): presumably
  # this means each batch is processed as a single minibatch -- confirm.
  batch_size: 4
  minibatch_size: 4
  # Intervals, presumably counted in training steps -- confirm. With
  # total_steps: 10000 that would give 2 checkpoints and 10 evaluations.
  checkpoint_interval: 5000
  eval_interval: 1000
  # Names of the pipeline and trainer implementations, given as plain
  # strings; presumably resolved by name at load time -- confirm.
  pipeline: PromptPipeline
  trainer: AcceleratePPOTrainer
  # NOTE(review): single-space placeholder string (not empty, not null);
  # presumably must be overridden with a real output directory -- confirm.
  checkpoint_dir: " "
