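# Configuration for DPO-style preference-based training.
# NOTE(review): `name` below is Q_tbpo, not dpo - confirm this header still
#   describes what this config selects.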
# Identifier of this configuration.
# NOTE(review): presumably the key the consuming code dispatches on to pick
#   the loss/trainer variant - confirm against the loader.
name: Q_tbpo
# Settings for the Bregman loss term used by this configuration.
bregman_loss:
  # Which Bregman loss variant to use.
  # NOTE(review): the meaning of `sba` is defined by the consuming code and is
  #   not visible from this file - TODO document.
  name: sba
  # Hyperparameters of the selected Bregman loss.
  # NOTE(review): the exact roles of `lam` and `s` are not visible from this
  #   file - TODO document once confirmed against the loss implementation.
  lam: 0.0
  s: 4.0
  # Clipping bounds for log R: values are clipped to [l_logr, u_logr]
  #   (presumably l_ = lower bound, u_ = upper bound - confirm).
  l_logr: -30.0
  u_logr: 30.0

# Temperature parameter (beta) of the DPO loss; lower values mean the loss
#   cares less about staying close to the reference model.
beta: 0.1



# Noise parameter for conservative DPO, interpreted as the fraction of
#   preference pairs whose labels are flipped. When enabled it should be in
#   the range (0, 0.5).
#   label_smoothing=0 (written as eps=0 in the conservative-DPO notation)
#   disables smoothing and gives the original DPO loss from the DPO paper.
label_smoothing: 0
