# Loss configuration named `ordinal`, for DPO preference-based training.
# NOTE(review): presumably the trainer selects the loss by this name — confirm.
name: ordinal

# the temperature parameter for DPO; lower values mean we care less about
#   the reference model
# `???` leaves the value unset here; presumably the OmegaConf/Hydra
#   mandatory-value marker, so beta must come from an override — TODO confirm
beta: ???

# number of ordinal levels
# (`???`: no default given in this file; presumably must be supplied by an
#   override — TODO confirm)
levels: ???
# NOTE(review): the three switches below are undocumented; their exact effect
#   is defined by the consuming code — verify before changing a default
symmetrize: false
makeScoresPositive: true
symmetrizeDataset: false
offset: 0  # to handle negative scores from the dataset


# presumably the learning rate used for the ordinal model's own parameters
#   — TODO confirm against the optimizer setup
ordinalLr: 0.001
# presumably the decay factor (gamma) of the matching learning-rate scheduler
#   — TODO confirm
schedulerGamma: 0.96

# multi-objective mode is off by default and no preferences are set
# NOTE(review): presumably `preferences` is only read when multiObjective is
#   true; its expected structure is not visible here — verify with the consumer
multiObjective: false
preferences: null


# imposing improved chosen reward (optional regularization, off by default)
# NOTE(review): delta / lambda / term presumably parameterize the regularizer;
#   their exact roles are defined by the consuming code — TODO confirm
regularize: false
regularizationDelta: -10
regularizationLambda: 0.1
regularizationTerm: 0

# NOTE(review): these two keys are snake_case while the rest of the file is
#   camelCase; left unchanged because renaming would presumably break whatever
#   reads them by name
ordinal_update_interval: 1 # how often to update the ordinal model
ordinal_l2_weight: 0.0 # L2 regularization weight for ordinal thresholds