# Run-level training settings. The code that reads these keys is not part of
# this file, so the notes below only record relationships visible here.
train:
  seq_length: 10
  batch_size: 100
  epochs: 2000
  total_steps: 10000
  seed: 2023

  # Same value as total_steps above; presumably this yields a single
  # checkpoint at the end of the run -- confirm against the trainer.
  checkpoint_interval: 10000
  eval_interval: 20

  # Components are referenced by name; presumably the consuming framework
  # looks these strings up in a registry -- verify the names exist there.
  pipeline: 'PromptPipeline'
  orchestrator: 'DPPOOrchestrator'
  trainer: 'AccelerateDPPOTrainer'

# Model selection. model_path is the same identifier used for
# tokenizer.tokenizer_path.
model:
  model_path: 'CarperAI/randomwalks'
  # NOTE(review): -1 is presumably a sentinel (e.g. "all layers") rather than
  # a literal count -- confirm with the consuming code.
  num_layers_unfrozen: -1

# Tokenizer selection; uses the same identifier as model.model_path.
tokenizer:
  tokenizer_path: 'CarperAI/randomwalks'

# Optimizer, selected by name. The kwargs mapping is presumably forwarded to
# the optimizer constructor -- confirm against the consumer.
optimizer:
  name: 'adamw'
  kwargs:
    # Same value as scheduler.kwargs.eta_min.
    lr: 3.0e-4
    betas: [0.9, 0.95]
    eps: 1.0e-8
    weight_decay: 1.0e-6

# Learning-rate schedule, selected by name. The kwargs mapping is presumably
# forwarded to the scheduler constructor -- confirm against the consumer.
scheduler:
  name: "cosine_annealing"
  kwargs:
    # Must track train.total_steps (10000). The earlier value of 1000 no
    # longer matched it, contradicting the inline note on this line.
    T_max: 10000  # train.total_steps
    # NOTE(review): identical to optimizer.kwargs.lr (3.0e-4). If this is a
    # standard cosine anneal from lr down to eta_min, the learning rate never
    # changes -- confirm that a constant rate is intended.
    eta_min: 3.0e-4

# Method (algorithm) hyperparameters, selected by name. The class that
# consumes these keys is not visible from this file; notes below are limited
# to what the values themselves show.
method:
  name: 'dppoconfig'
  # num_rollouts and chunk_size carry the same value (128).
  num_rollouts: 128
  chunk_size: 128
  ppo_epochs: 4
  init_kl_coef: 0.05
  target: 6
  horizon: 10000
  gamma: 1
  lam: 0.95
  cliprange: 0.2
  cliprange_value: 0.2
  vf_coef: 1.2
  scale_reward: false
  # Both reference statistics are explicitly unset.
  ref_mean: null
  ref_std: null
  cliprange_reward: 1
  reg_coef: 0.1
  abl_type: 'linear_norm'
  # NOTE(review): key is spelled `threhold`, not `threshold`. It presumably
  # has to match the field name in the consuming config class, so it is left
  # as-is -- confirm before renaming.
  threhold: 0.5
  ub: 3.5
  lb: 0.5
  # Sampling settings; the key names match Hugging Face generate() arguments,
  # so this mapping is presumably forwarded there -- confirm.
  gen_kwargs:
    # One less than train.seq_length (10).
    max_new_tokens: 9
    # Written as the integer 0 rather than 0.0: top_k is an integer parameter
    # in Hugging Face generation.
    top_k: 0
    top_p: 0.9
    do_sample: true
