# Configuration for DPO (Direct Preference Optimization) preference-based
#   training.
# NOTE(review): `name` is presumably what the consuming code uses to select
#   this training mode; verify against the loader.
name: dpo

# The temperature parameter for DPO; lower values mean we care less about
#   the reference model.
# NOTE(review): `???` looks like the OmegaConf/Hydra "mandatory value"
#   marker, i.e. no default is provided here and a value must be supplied
#   elsewhere (e.g. by an override); confirm against the config loader.
beta: ???

# If true, use a uniform (maximum entropy) reference model.
#   Disabled (false) in this file.
reference_free: false