# Models covered by this config, written as `org/name` identifiers.
# Lines starting with `# -` are disabled entries kept for quick re-enabling;
# they sit at list-item indentation so removing the `# ` yields a valid item.
# NOTE(review): presumably Hugging Face Hub repo ids — verify each id resolves
# before re-enabling a disabled entry.
model_names:
  - Qwen/Qwen2.5-1.5B-Instruct
  - deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
  # - microsoft/Phi-3-mini-4k-instruct
  # - microsoft/Phi-3.5-MoE-instruct
  # - microsoft/Phi-3-1.5-mini-instruct
  - meta-llama/Llama-3.2-1B-Instruct
  - meta-llama/Llama-3.2-3B-Instruct
  - meta-llama/Meta-Llama-3.1-8B-Instruct
  # - allenai/OLMo-7B-Instruct-hf
  # - microsoft/Phi-3-medium-128k-instruct
  # - deepseek-ai/DeepSeek-R1-Distill-Qwen-7B

# Dataset files, given as relative paths.
# NOTE(review): both keys currently point at the same JSON file
# (`final_dpo_data.json`). Confirm this is intentional and that `sft_dataset`
# was not meant to reference a separate SFT-specific file.
sft_dataset: data/final_dpo_data.json
dpo_dataset: data/final_dpo_data.json

# Output locations (relative paths).
# NOTE(review): how these three directories relate to each other (nested vs.
# siblings) is decided by the consuming script — confirm there.
output_dir: ./test_outputs
model_dir: models
save_dir: dpo_results
# Training settings.
# Boolean flag; presumably enables bfloat16 precision — confirm with the trainer.
use_bf16: true
# NOTE(review): unclear from this file whether this is per-device or global.
batch_size: 2
# Presumably the learning rate. Keep the plain decimal form: YAML 1.1 loaders
# such as PyYAML read exponent notation without a dot (e.g. `5e-5`) as a
# string, not a float.
lr: 0.00005
epochs: 1
# Presumably the DPO beta coefficient — confirm against the training code.
beta: 0.1