#!/bin/bash
#
# Launches on_policy_dpo.py through the `deepspeed` launcher with one GPU.
#
# Environment:
#   RUN_NAME  Optional. Substituted into the output directory
#             (./results/<RUN_NAME>) and into --wandb_run_name. When unset or
#             empty it falls back to the literal string "RUN_NAME", which is
#             what this script previously hard-coded in both places.
#
# All remaining hyper-parameters are fixed below; edit the arrays to change
# them.

# Abort on command failure, on use of an unset variable, and on pipeline errors.
set -euo pipefail

# NOTE(review): the value exported here ("smollm2-online-dpo") differs from the
# one passed via --wandb_project below ("smollm2-dpo"). Which of the two ends
# up being used depends on on_policy_dpo.py, which is not visible from this
# script — confirm the intended project name and make the two agree.
export WANDB_PROJECT="smollm2-online-dpo"

# Single source of truth for the run identifier, so the results directory and
# the W&B run name cannot drift apart.
readonly run_name="${RUN_NAME:-RUN_NAME}"

# Options consumed by the deepspeed launcher itself (they precede the script).
launcher_args=(
  --num_gpus=1
  --master_port=29200
)

# Options forwarded to on_policy_dpo.py. Kept in an array so no backslash
# line-continuations are needed (trailing whitespace after a backslash would
# silently break the command).
training_args=(
  --seed 2
  --sft_model_path "./pretrained_models/smollm2_135m/seed0"
  --reward_model_path "./target_models/seed0"
  --output_dir "./results/${run_name}"
  --wandb_project "smollm2-dpo"
  --wandb_run_name "${run_name}"
  --model_size 135
  --online_iterations 1000
  --save_interval 25
  --batch_size 64
  --steps_per_online_batch 8
  --dpo_epochs 1
  --mini_batch_size 8
  --gradient_accumulation_steps 1
  --learning_rate 5e-6
  --beta 0.7
  --temperature 1.0
)

deepspeed "${launcher_args[@]}" on_policy_dpo.py "${training_args[@]}"
