---
# DPO training configuration.
# NOTE(review): the four names below are presumably resolved by the config
# loader to a trainer class, an arguments class, a dataset class and a collate
# function respectively - confirm against the loader.
trainer_model: "DPOTrainer"
training_arguments: "DPOTrainingArguments"
dataset: "DPOHFDataset"
collate_fn: "dpo_hf_collate_fn"

# Dataset loading flag; false here.
data_in_memory: false

# Both paths are unset (null) in this configuration.
load_model_filename: null
resume_training_path: null

# NOTE(review): presumably forwarded as keyword arguments to the class named
# in `training_arguments` - confirm against the loader.
training_arguments_args:
  beta: 1.0
  do_preference_reg: false
  per_device_train_batch_size: 2
  logging_dir: "./logs"
  output_dir: "./outputs"
  bf16: true
  # Written with an explicit decimal point on purpose: YAML 1.1 resolvers
  # (e.g. PyYAML) load a mantissa without "." such as 5E-6 as a *string*,
  # not a float. 5.0e-6 is a float under both YAML 1.1 and 1.2.
  learning_rate: 5.0e-6
  label_names:
    - "input_ids"
    - "attention_mask"
    - "logp_masks"
    - "logps_y"
    - "sentences"
    - "energies"
  save_steps: 2000
  gradient_accumulation_steps: 1
  gradient_checkpointing: false
  num_train_epochs: 100
  logging_steps: 500
  # NOTE(review): if DPOTrainingArguments derives from Hugging Face
  # TrainingArguments, newer transformers releases rename this option to
  # `eval_strategy` - confirm against the pinned version before changing.
  evaluation_strategy: "epoch"