# Configuration for the `reward` component.
# NOTE(review): `_target_` looks like a Hydra-style instantiation key, with the
# sibling keys passed to the target as arguments — confirm against the loader.
reward:
  _target_: alg.reward.pref_advantage.AdvantageFn
  # Written with an explicit mantissa point on purpose: YAML 1.1 loaders such
  # as stock PyYAML resolve a bare `3e-4` to the string "3e-4", not a float.
  lr: 3.0e-4
  # Options for the underlying network.
  model_params:
    type: mlp
    width: 512
    n_layers: 3
    # Explicit empty string (not null).
    # NOTE(review): presumably means "no final activation" — verify in the
    # model builder.
    final_activation: ""
    # NOTE(review): "ln" presumably selects layer normalization — confirm.
    net_norm: ln
  # NOTE(review): how the epoch and step budgets below interact (which one
  # bounds training first) is not visible from this file — check the consumer.
  reward_update_epochs: 1000
  reward_update_steps: 100000
  train_pref_batch_size: 4096
  weight_decay: 0.0
  # NOTE(review): presumably a preference-training accuracy threshold; whether
  # it acts as an early-stop or a required minimum must be confirmed in code.
  pref_train_acc_min: 0.995
