---
# Training-run configuration: a flat mapping of hyperparameters.
# Keys are grouped by concern for readability only; a mapping carries no
# ordering semantics, so the loaded values are the same as before.

# --- Model and output locations ---
model_name_or_path: allenai/Llama-3.1-Tulu-3-8B-SFT
output_dir: online_dpo_outputs

# --- Run mode ---
do_train: true
# NOTE(review): do_eval is true while eval_strategy is 'no' and
# split_eval_from_train is null — confirm that evaluation is really
# intended (and where the eval data comes from) in the consuming script.
do_eval: true
# Quoted on purpose: a bare `no` resolves to boolean false under YAML 1.1.
eval_strategy: 'no'
split_eval_from_train: null

# --- Data selection and preprocessing ---
max_training_samples: 50000
dataset_num_proc: 16
max_length: 2048
max_prompt_length: 1024
use_eos_padding: true

# --- Batching and duration ---
per_device_train_batch_size: 1
per_device_eval_batch_size: 2
gradient_accumulation_steps: 4
num_train_epochs: 2

# --- Optimizer schedule ---
# Written with a decimal point and a signed exponent so that YAML 1.1
# loaders resolve it to a float rather than a string.
learning_rate: 5.0e-7
lr_scheduler_type: cosine
warmup_ratio: 0.1

# --- Precision and memory ---
bf16: true
gradient_checkpointing: true

# --- Trainer and loss selection ---
# presumably interpreted by the project's own trainer factory — verify the
# accepted values for trainer_type / label_type against the consumer.
trainer_type: soft_dpo
loss_type: sigmoid
beta: 0.1
margin_scale: 1.0
label_type: null

# --- Logging and reproducibility ---
logging_steps: 2
manual_seed: 42
