# Training hyperparameters.
# The code that consumes this file is not visible from here, so notes marked
# "presumably" or "NOTE(review)" are inferred from key names and should be
# confirmed against the consumer before being relied upon.

# Optimizer settings.
# lr is written with a decimal point and a signed exponent on purpose: YAML 1.1
# loaders (e.g. PyYAML) resolve this form to a float, whereas a bare "1e-4"
# would be loaded as a string.
lr: 1.0e-4
# presumably the per-step (per-device) batch size -- TODO confirm
batch_size: 4
# presumably the number of steps whose gradients are accumulated before each
# optimizer update, giving an effective batch of
# batch_size * gradient_accumulation_steps -- TODO confirm
gradient_accumulation_steps: 2
# presumably the number of full passes over the training data -- TODO confirm
num_epochs: 10

# Loss-term coefficients and related settings.
# presumably each *_coeff scales one term of a combined training objective;
# verify how the consumer combines them.
npo_coeff: 1.0
grad_diff_coeff: 1.0
# NOTE(review): this key uses different casing from the other snake_case keys;
# it is kept as-is because the consumer looks keys up by exact name.
KL_coeff: 1.0
# Plain string value. presumably selects which model acts as the reference
# policy; the set of accepted values is defined by the consumer -- TODO confirm
ref_policy: fine_tuned
# NOTE(review): meaning depends on the consumer (likely a temperature/scale
# used by one of the loss terms) -- confirm before tuning
beta: 0.1
# presumably the optimizer's weight-decay strength -- TODO confirm
weight_decay: 0.01
