# Ra-DPO loss configuration.
# Selects the Ra_DPOTrainer (see `trainer` below); judging by the name, this is
# a variant of Direct Preference Optimization (DPO).
# NOTE(review): the comments marked "presumably" below are inferred from key
# names only -- confirm them against the trainer implementation.

# The temperature parameter for Ra-DPO; lower values mean we care less about
# the reference model.
beta: 0.1
# NOTE(review): undocumented in this file; presumably a second loss
# coefficient read by Ra_DPOTrainer -- confirm its meaning and valid range.
alpha: 0.5

# Name of the trainer to use with this configuration.
trainer: Ra_DPOTrainer

# Name of the data loader to use; by its name it supplies paired preference
# examples -- TODO confirm.
dataloader: PairedPreferenceDataLoader

# NOTE(review): presumably switches to a second Ra-DPO variant ("ra_dpo2");
# disabled here -- confirm what changes when it is true.
if_ra_dpo2: false

# NOTE(review): presumably controls whether the risk ratio is split into
# separate terms; enabled here -- confirm against the trainer.
is_split_risk_ratio: true

# NOTE(review): "cal" presumably stands for "calculate", i.e. whether
# log-probabilities of the risk distribution are computed; disabled here --
# confirm against the trainer.
is_cal_risk_distribution_logps: false

# If true, use a uniform (maximum entropy) reference model.
reference_free: false

# NOTE(review): presumably the confidence level of the risk measure used by
# the trainer (a fraction in (0, 1)) -- confirm.
confidence_level: 0.95