# Training hyperparameters.
# NOTE(review): the code that consumes this file is not visible from here, so
# the per-key descriptions below are inferred from the key names only and are
# marked as assumptions — confirm against the consumer before relying on them.

# Presumably the optimizer learning rate — confirm.
# Written with a decimal point and a signed exponent so that both YAML 1.1 and
# YAML 1.2 loaders resolve it to a float rather than a string.
lr: 2.0e-4
# Presumably the per-step batch size — TODO confirm whether this is per device.
batch_size: 1
# Presumably the number of steps whose gradients are accumulated before an
# optimizer update; if so, 8 steps at batch_size 1 — verify against the trainer.
gradient_accumulation_steps: 8
# Presumably the number of passes over the training data — confirm.
num_epochs: 20

# Presumably weights for individual loss terms (all currently 1.0) — confirm
# which terms are active and how they are combined in the consumer.
npo_coeff: 1.0
grad_diff_coeff: 1.0
# NOTE(review): this key is capitalised, unlike every other key in this file.
# Keep the spelling as-is unless the consumer's lookup is changed too.
KL_coeff: 1.0
# Presumably selects which model serves as the reference policy — TODO confirm
# the set of values the consumer accepts.
ref_policy: fine_tuned
# NOTE(review): meaning not determinable from this file; presumably a
# scaling/temperature parameter of the loss — confirm.
beta: 0.1
# Presumably the optimizer weight-decay coefficient — confirm.
weight_decay: 0.01
