# Training-loop settings.
# NOTE(review): key meanings are inferred from their names — confirm against
# the config schema of the framework that loads this file.
train:
  seq_length: 768
  epochs: 500
  # Same value as scheduler.kwargs.T_max (100000).
  total_steps: 100000
  batch_size: 2
  eval_batch_size: 2

  # Checkpointing and evaluation share the same interval (1000).
  checkpoint_interval: 1000
  eval_interval: 1000

  pipeline: "PromptPipeline"
  trainer: "AcceleratePPOTrainer"

  # "XXX" looks like a placeholder — presumably replaced with a real
  # directory before running.
  checkpoint_dir: "XXX"
  # Canonical lowercase boolean (was `False`).
  save_best: false

# Model selection.
model:
  # "XXX" looks like a placeholder — presumably replaced with a real model
  # path or identifier before use.
  model_path: "XXX"
  # NOTE(review): presumably the number of layers left trainable while the
  # rest stay frozen — confirm against the trainer.
  num_layers_unfrozen: 2

# Tokenizer selection.
tokenizer:
  # "XXX" looks like a placeholder — presumably replaced with a real
  # tokenizer path or identifier before use.
  tokenizer_path: "XXX"
  # NOTE(review): presumably controls which end of an over-long input is
  # cut off — confirm against the tokenizer in use.
  truncation_side: "right"

# Optimizer selection. `kwargs` are presumably forwarded to the optimizer
# constructor — TODO confirm against the loader.
optimizer:
  name: "adamw"
  kwargs:
    # Same value as scheduler.kwargs.eta_min (3.0e-6).
    lr: 3.0e-6
    betas: [0.9, 0.999]
    eps: 1.0e-8
    weight_decay: 0.01

# Learning-rate scheduler selection and its keyword arguments.
scheduler:
  name: "cosine_annealing"
  kwargs:
    # Same value as train.total_steps (100000).
    T_max: 100000
    # NOTE(review): equal to optimizer.kwargs.lr (3.0e-6). If this maps to
    # PyTorch's CosineAnnealingLR, the rate would stay constant for the
    # whole run — confirm this is intended.
    eta_min: 3.0e-6

# PPO method settings.
# NOTE(review): parameter semantics are inferred from their names — confirm
# against the PPO config class of the training framework.
method:
  name: "ppoconfig"
  num_rollouts: 1024
  chunk_size: 16
  ppo_epochs: 4
  # KL-related settings — presumably an adaptive KL controller (initial
  # coefficient, target, horizon); TODO confirm.
  init_kl_coef: 0.01
  target: 4
  horizon: 10000
  gamma: 1
  lam: 0.95
  # cliprange and cliprange_value are set to the same value (0.2).
  cliprange: 0.2
  cliprange_value: 0.2
  vf_coef: 0.2
  scale_reward: "running"
  # Both reference statistics are explicitly null.
  ref_mean: null
  ref_std: null
  cliprange_reward: 10
  # Presumably passed through to the model's generation call — TODO confirm.
  gen_kwargs:
    # 384 is half of train.seq_length (768).
    max_new_tokens: 384
    min_new_tokens: 16
    # Canonical lowercase boolean (was `True`).
    do_sample: true
    top_p: 0.95
    temperature: 0.6
