# Configuration for the reward component.
# NOTE(review): `_target_` looks like Hydra's instantiate convention, in which
# case the sibling keys are handed to that class -- confirm against the loader.
reward:
  _target_: alg.reward.pref_advantage.AdvantageFn
  # Written with an explicit mantissa dot so that YAML 1.1 loaders (e.g. plain
  # PyYAML) resolve it as a float; a bare `3e-4` is read as a string there.
  lr: 3.0e-4
  # Model architecture options.
  # NOTE(review): presumably forwarded to the model builder -- TODO confirm.
  model_params:
    type: transformer_ind_sa
    num_layers: 4
    num_heads: 8
    hidden_dim: 512
    dropout: 0.1
    # Output-layer sizing (hidden width and hidden-layer count, going by the
    # key names -- verify against the consumer).
    out_layer_hidden_dim: 512
    out_layer_n_hidden: 1
    max_len: 64
    pos_encoding: learned
  # NOTE(review): both an epoch count and a step count are configured; how the
  # two limits interact is decided by the consuming code -- confirm there.
  reward_update_epochs: 1000
  reward_update_steps: 100000
  train_pref_batch_size: 4096
  weight_decay: 0.0
  # NOTE(review): presumably a training-accuracy threshold in [0, 1]; whether it
  # acts as an early-stop or a minimum requirement is not visible here.
  pref_train_acc_min: 0.995
