# Top-level run settings: a test-set size and three model/output locations.
# NOTE(review): the code that loads this file is not visible here; the
# per-key notes below are inferred from the key names — confirm before
# relying on them.
# Presumably the number of examples held out for testing.
test_size: 2000
# Presumably the policy model to train (a model name or a directory path;
# how the value is resolved depends on the loader — TODO confirm).
model_directory: "gpt2"
# Presumably where run outputs are written; a relative path.
output_directory: "ppo/gpt2"
# Presumably the "gold" reward model used for scoring — TODO confirm.
gold_rm_directory: "gpt-j-6B"
# PPO hyperparameters.
# NOTE(review): the descriptions below are inferred from the key names and
# common PPO terminology, not from the consuming code — verify each one
# against the trainer before changing values.
ppo_config:
  # Presumably optimisation passes per batch of rollouts.
  ppo_epochs: 4
  # Explicit null (no value). Presumably a KL target that is disabled when
  # null — TODO confirm.
  target: null
  # Presumably the initial coefficient of the KL penalty.
  init_kl_coef: 0.005
  # Presumably rollouts collected per training round.
  num_rollouts: 512
  # Presumably the number of samples generated per rollout chunk.
  chunk_size: 6
  # Presumably the KL-controller horizon; likely unused while target is
  # null — TODO confirm.
  horizon: 10000
  # Written as the integer 1, so a YAML loader yields an int, not a float.
  # Presumably the discount factor.
  gamma: 1
  # Presumably the GAE lambda.
  lam: 0.95
  # Presumably the clip range for the policy objective.
  cliprange: 0.2
  # Presumably the clip range for the value objective.
  cliprange_value: 0.2
  # Presumably the weight of the value loss in the total loss.
  vf_coef: 0.2
  # Presumably the bound rewards are clipped to — TODO confirm.
  cliprange_reward: 10
# Training-loop limits and intervals.
# NOTE(review): units (epochs vs. optimisation steps) and which limit stops
# the run first are decided by the consuming trainer, which is not visible
# here — confirm.
training:
  epochs: 400
  batch_size: 64
  total_steps: 100000
  # Same value as total_steps above; if checkpoints are written every
  # checkpoint_interval steps, at most one falls inside the run — confirm
  # that no intermediate checkpoints are wanted.
  checkpoint_interval: 100000
  eval_interval: 400
# Presumably how many layers of the policy model stay trainable while the
# rest are frozen; which layers are counted is up to the trainer —
# TODO confirm.
num_layers_unfrozen: 8
# Optimizer settings. The key names (lr, betas, eps, weight_decay) match the
# arguments of Adam-style optimizers; the actual optimizer class is chosen by
# the consuming code — TODO confirm.
optimizer:
  # Keep the "." before the exponent: YAML 1.1 loaders such as PyYAML only
  # resolve exponent-form numbers as floats when they contain a dot, so
  # "5e-6" would load as a string.
  lr: 5.e-6
  # Two-element sequence.
  betas:
    - 0.9
    - 0.95
  # Same dotted-exponent form as lr, for the same reason.
  eps: 1.e-8
  weight_decay: 0.1
# Learning-rate scheduler settings. T_max / eta_min match the parameter names
# of a cosine-annealing schedule — presumably that is what consumes them;
# TODO confirm.
scheduler:
  # Same value as training.total_steps (100000).
  T_max: 100000
  # Same value as optimizer.lr (5.e-6). If the schedule anneals from lr down
  # to eta_min, the learning rate is effectively constant — confirm that is
  # intended. Keep the "." before the exponent so YAML 1.1 loaders read a
  # float.
  eta_min: 5.e-6