---
# Flat key/value options for a preference-training run
# (trainer_type below selects "soft_dpo").
# NOTE(review): the script that consumes this file is not visible here; the
# stanza headings are inferred from key names -- confirm against the consumer.

# --- Model checkpoint, dataset, and output location ---
model_name_or_path: trl-lib/pythia-1b-deduped-tldr-sft
train_path: rm_annotated_data/dummyds
# presumably the number of training examples held out for evaluation
# -- TODO confirm
split_eval_from_train: 1024
output_dir: dpo_output/

# --- Run phases and reproducibility ---
do_train: true
do_eval: true
manual_seed: 42

# --- Batching ---
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 16

# --- Memory and numeric precision ---
gradient_checkpointing: true
bf16: true

# --- Optimisation schedule ---
num_train_epochs: 2
# written with a decimal point and signed exponent so that YAML 1.1 loaders
# also resolve it as a float rather than a string
learning_rate: 5.0e-7
lr_scheduler_type: cosine

# --- Evaluation and logging cadence ---
eval_strategy: steps
eval_steps: 16
logging_steps: 2

# --- Sequence length limits ---
max_length: 2048
max_prompt_length: 1024

# --- Trainer selection and loss settings ---
# NOTE(review): trainer_type, label_type and margin_scale look project-specific;
# their accepted values are not shown in this file -- verify against the
# consumer.
trainer_type: soft_dpo
loss_type: sigmoid
beta: 0.1
label_type: oracle
margin_scale: 1.0