---
# Training configuration for one round of an iterative preference-optimisation
# run (key names suggest DPO, iteration 3 starting from the iteration-2
# checkpoint). NOTE(review): the consuming script is not part of this file, so
# the per-key notes below describe presumed intent; confirm against the trainer.

# --- Data ---
# presumably selects how chosen/rejected pairs are built from scored samples
# (max-reward vs. min-reward) — TODO confirm against the data loader
choose_type: max_min
train_dir: ./iter_dpo/Test1_LLaMA3_iter3_reward.json
# NOTE(review): identical to train_dir, so evaluation would run on the
# training data — confirm this is intended.
eval_dir: ./iter_dpo/Test1_LLaMA3_iter3_reward.json

# --- Models and output ---
output_dir: "LLaMA3_iter3"
# Policy and reference model are both set to the same checkpoint name.
model_name_or_path: LLaMA3_iter2
ref_model: LLaMA3_iter2

# --- Precision and memory ---
bf16: true
gradient_checkpointing: true

# --- Batching ---
# 2 samples per device x 16 accumulation steps = 32 samples per device per
# optimizer step (multiply by device count for the global batch).
per_device_train_batch_size: 2
per_device_eval_batch_size: 1
gradient_accumulation_steps: 16

# --- Sequence lengths ---
max_length: 2048
max_prompt_length: 1000

# --- Loss ---
loss_type: sigmoid
label_smoothing: 0
# NOTE(review): meaning of alpha is defined by the consuming script — confirm.
alpha: 1.0e-5

# --- Optimisation schedule ---
learning_rate: 5.0e-7
lr_scheduler_type: cosine
warmup_ratio: 0.03
num_train_epochs: 2

# --- Run modes ---
do_train: true
do_eval: true

# --- Evaluation, checkpointing and logging ---
# NOTE(review): eval_steps and save_steps are very large intervals; whether a
# periodic eval/save ever triggers depends on the total number of optimizer
# steps in the run — confirm this is intended.
eval_strategy: steps
eval_steps: 10000
# save_steps was previously declared twice with the same value; duplicate
# mapping keys are invalid YAML, so only this single definition is kept.
save_steps: 9999
save_only_model: true
logging_steps: 2
report_to: wandb
