# Direct Preference Optimization (DPO) loss configuration.

# Identifier for this loss config. It must agree with the DPO trainer and
# dataloader selected below (it previously read `ppo`, which contradicted them).
name: dpo

# The temperature parameter for DPO; lower values mean we care less about
# the reference model.
beta: 0.1

# NOTE(review): `critic_coef` looks like a PPO-style (critic/value loss)
# setting and is presumably not read by DPOTrainer. Kept unchanged for
# backward compatibility; confirm against the trainer before removing.
critic_coef: 0.5

# Name of the trainer class used with this loss (presumably resolved by the
# training script; verify against the caller).
trainer: DPOTrainer

# Name of the dataloader class used with this loss; by its name it supplies
# paired preference examples.
dataloader: PairedPreferenceDataLoader

# Enable the reference model, which the `beta` setting above is defined
# relative to.
use_reference_model: true
