# Loss configuration for DPO-style preference-based training.
# NOTE(review): the name below is `scaledBT`, not `dpo` -- presumably a scaled
#   Bradley-Terry variant of the DPO loss; confirm against the training code.
name: scaledBT

# The temperature parameter for DPO; lower values mean we care less about
#   the reference model.
# `???` is presumably the Hydra/OmegaConf mandatory-value marker, i.e. beta
#   has no default here and must be supplied by the caller -- TODO confirm.
beta: ???

# Disabled by default. NOTE(review): presumably toggles a multi-objective
#   variant of the loss; verify against the code that reads this key.
multiObjective: false
# Unset (null) by default. NOTE(review): presumably only consulted when
#   multiObjective is true; the expected format is not visible here -- confirm.
preferences: null


# Optional regularization imposing an improved chosen reward; off by default.
# NOTE(review): how delta, lambda, and term enter the loss is not visible in
#   this file -- confirm their roles against the training code.
regularize: false
regularizationDelta: -10
regularizationLambda: 0.1
regularizationTerm: 0
