# do DPO preference-based training
# NOTE(review): `rdpo` presumably names a DPO variant selected by the training
#   code; confirm what the consumer does with this identifier
name: rdpo

# the temperature parameter for DPO; lower values mean we care less about
#   the reference model
# `???` is not a number: presumably the OmegaConf/Hydra "mandatory value"
#   marker, i.e. beta must be supplied by an override -- TODO confirm
beta: ???

# the noise parameter for conservative DPO; should be in range [0, 0.5);
#   interpreted as the fraction of preference pairs that are flipped
#   label_smoothing=0 (the default here) is the original DPO loss in the
#   DPO paper
label_smoothing: 0

# if true, use a uniform (maximum entropy) reference model
reference_free: false

# NOTE(review): none of the options below is described in this file. The
#   per-key notes are guesses from the key names only; confirm each against
#   the code that reads this config before relying on it.
# Booleans are written in lowercase (`true` / `false`) to match
#   `reference_free` above; the parsed values are unchanged.

# presumably the step size of an update performed by the trainer -- TODO
#   confirm which quantity it scales
step_size: 0.01

# presumably toggles importance sampling -- TODO confirm
importance_sampling: false

# presumably toggles importance weights; verify how this differs from
#   `importance_sampling`
imp_weights: false

# NOTE(review): meaning not evident from the name (an adjustment term?) --
#   confirm against the consumer
adj: 0

# disabled option, kept for reference
#weight_decay: 0
# NOTE(review): presumably "DPO weights" -- confirm against the consumer
dpowts: false

# presumably normalizes some quantity by a total count -- TODO confirm which
divide_by_totalcount: true

# presumably makes the step size adaptive -- TODO confirm
adaptive_step_size: false

# NOTE(review): possibly the factor used when `adaptive_step_size` is
#   enabled -- confirm against the consumer
step_factor: 0.1