# Token-level Direct Preference Optimization (TDPO)

# Temperature parameter for TDPO; lower values mean we care less about the
# reference model.
beta: 0.1
# NOTE(review): undocumented in this file. Presumably a TDPO-specific weighting
# coefficient, possibly only used when if_tdpo2 is true -- confirm against the
# trainer implementation.
alpha: 0.5

# Name of the trainer paired with this config.
# NOTE(review): presumably resolved to a class by the training entry point --
# confirm.
trainer: TDPOTrainer

# Name of the data loader paired with this config.
# NOTE(review): the name suggests it yields paired (preferred / dispreferred)
# examples -- confirm against the loader implementation.
dataloader: PairedPreferenceDataLoader

# NOTE(review): undocumented in this file. Presumably switches the trainer to
# the "TDPO2" variant of the loss when true -- confirm against the trainer
# implementation.
if_tdpo2: false

# If true, use a uniform (maximum entropy) reference model.
reference_free: false

