#!/bin/bash
#
# Runs `swift rlhf` with `--rlhf_type dpo --train_type full` once for every
# (model, subset, method) combination listed below.
#
# All paths are relative to the current working directory:
#   model checkpoints : ../../../../models/<MODEL>
#   training data     : ../train_data/<MODEL>/<SUBSET>/swift_rl_data_<METHOD>.jsonl
#   outputs           : ../trainedmodels/<MODEL>-SWIFT-DPO-<SUBSET>-<METHOD>
#
# Usage: edit METHODS (and optionally MODELS / SUBSETS) and run the script from
# the directory those relative paths are meant to resolve from.

set -euo pipefail

# Print a message to stderr and abort the whole sweep.
die() {
    printf 'error: %s\n' "$*" >&2
    exit 1
}

nproc_per_node=8
train_epoch=1
train_method="zero2"

MODELS=(
    "models--meta-llama--Llama-3.1-8B-Instruct"
    "models--deepseek-ai--DeepSeek-R1-Distill-Llama-8B"
    "models--Qwen--Qwen3-8B"
)
SUBSETS=("train_NOANS")
METHODS=("TODO your method") # TODO: replace the placeholder with a method name (e.g. RLVR@T+T)

# Build "0,1,...,N-1" so the visible GPUs always match the process count
# instead of keeping two hard-coded values in sync by hand.
gpu_ids=""
for ((gpu = 0; gpu < nproc_per_node; gpu++)); do
    gpu_ids+="${gpu_ids:+,}${gpu}"
done

mkdir -p ../trainedmodels

for MODEL in "${MODELS[@]}"; do
    for SUBSET in "${SUBSETS[@]}"; do
        for METHOD in "${METHODS[@]}"; do
            # Map the checkpoint directory name to the value passed as --model_type.
            # Unknown models abort instead of silently being treated as a
            # DeepSeek distill checkpoint.
            case "$MODEL" in
                models--Qwen--Qwen3-8B)
                    model_type="qwen3"
                    ;;
                models--meta-llama--Llama-3.1-8B-Instruct)
                    model_type="llama3_1"
                    ;;
                models--deepseek-ai--DeepSeek-R1-Distill-*)
                    model_type="deepseek_r1_distill"
                    ;;
                *)
                    die "no model_type mapping for model '${MODEL}'"
                    ;;
            esac
            printf '%s %s %s %s %s\n' "$MODEL" "$SUBSET" "$METHOD" "$model_type" "$train_method"

            dataset="../train_data/${MODEL}/${SUBSET}/swift_rl_data_${METHOD}.jsonl"
            # Fail fast (before the launcher spins up) when the dataset is
            # missing, e.g. because METHODS still holds the placeholder.
            [[ -e "$dataset" ]] || die "dataset not found: ${dataset}"

            swift_args=(
                --rlhf_type dpo
                --train_type full
                --model "../../../../models/${MODEL}"
                --model_type "$model_type"
                --torch_dtype bfloat16
                --dataset "$dataset"
                --output_dir "../trainedmodels/${MODEL}-SWIFT-DPO-${SUBSET}-${METHOD}"
                --per_device_train_batch_size 1
                --gradient_accumulation_steps 16
                --deepspeed "$train_method"
                --num_train_epochs "$train_epoch"
                # NOTE(review): in Hugging Face TrainingArguments a positive
                # max_steps overrides num_train_epochs; confirm swift forwards
                # it unchanged and that the 100-step cap is intended.
                --max_steps 100
                --learning_rate 1e-6
                --logging_steps 1
                --max_length 16384
                --warmup_ratio 0.1
                --save_only_model true
                --save_strategy "epoch"
                --dataloader_num_workers 8
                --dataset_num_proc 8
                --report_to wandb
                --padding_free true
                --use_liger_kernel true
                --attn_impl "flash_attn"
                --add_version false
                --rpo_alpha 1
                --beta 0.1
                --loss_scale "ignore_empty_think"
            )

            # With `set -e`, a failed run stops the remaining combinations.
            CUDA_VISIBLE_DEVICES="$gpu_ids" \
            NPROC_PER_NODE="$nproc_per_node" \
                swift rlhf "${swift_args[@]}"
        done
    done
done

echo "all done"