

# Run configuration for the reward-model training launched further down.
# This script needs bash: it relies on arrays and a C-style for loop.
#
# data_root / model_root are intentionally left blank here and must be filled
# in before running; while they are empty, every path built from them
# starts at "/".
data_root=
model_root=
MODEL=llama-30b-hf
NUM_GPUS=1
BATCH_SIZE_PER_GPU=5
TOTAL_BATCH_SIZE=120

# Shell arithmetic is integer-only, so a TOTAL_BATCH_SIZE that is not an exact
# multiple of NUM_GPUS * BATCH_SIZE_PER_GPU is silently rounded down. Warn
# (without aborting) so the mismatch is visible in the log.
GRADIENT_ACC_STEPS=$(( TOTAL_BATCH_SIZE / NUM_GPUS / BATCH_SIZE_PER_GPU ))
if (( TOTAL_BATCH_SIZE % (NUM_GPUS * BATCH_SIZE_PER_GPU) != 0 )); then
    printf 'warning: TOTAL_BATCH_SIZE=%s is not a multiple of NUM_GPUS*BATCH_SIZE_PER_GPU=%s; effective batch size will be %s\n' \
        "$TOTAL_BATCH_SIZE" \
        "$(( NUM_GPUS * BATCH_SIZE_PER_GPU ))" \
        "$(( GRADIENT_ACC_STEPS * NUM_GPUS * BATCH_SIZE_PER_GPU ))" >&2
fi

# Report the model actually being trained (MODEL; there is no MODEL_SIZE
# variable in this script).
echo "Training llama model ${MODEL} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"

# Parallel arrays, matched by index:
#   datasets[i]  - directory under ${data_root}/data/raw_train/ holding
#                  train.jsonl and test.jsonl
#   save_name[i] - tag embedded in the output directory name for that run
#
# Alternative dataset list, kept for reference:
# datasets=(hh_rlhf_harmless/hh_rlhf_harmless_data_train BeaverTails/beavertails_data_train HarmfulQA/harmfulqa_data flan_v2/flan_v2_data)
# save_name=(hh_harmless beavertails harmfulqa flan)
datasets=(hh_rlhf_harmless)
save_name=(hh_rlhf_harmless_rm)
seed=1

# A dataset without a matching save_name would expand to an empty tag and
# every such run would write to the same "${MODEL}__lora" directory.
if (( ${#save_name[@]} < ${#datasets[@]} )); then
    printf 'error: %d datasets but only %d save_name entries\n' \
        "${#datasets[@]}" "${#save_name[@]}" >&2
    exit 1
fi

# LoRA training: one accelerate launch per dataset.
# All expansions are quoted so paths containing spaces or glob characters
# reach the training script as single arguments.
for (( i = 0; i < ${#datasets[@]}; ++i ))
do
    train_dir="${data_root}/data/raw_train/${datasets[i]}"
    model_path="${model_root}/${MODEL}"
    output_dir="${data_root}/output/${MODEL}_${save_name[i]}_lora"

    accelerate launch \
        --use_deepspeed \
        --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf \
        --mixed_precision bf16 \
        --num_machines 1 \
        --num_processes "$NUM_GPUS" \
        training/reward_modelling.py \
        --train_data_file "${train_dir}/train.jsonl" \
        --eval_data_file "${train_dir}/test.jsonl" \
        --model_name_or_path "$model_path" \
        --use_flash_attn \
        --use_peft \
        --tokenizer_name "$model_path" \
        --use_slow_tokenizer \
        --preprocessing_num_workers 64 \
        --peft_config.lora_alpha 128 \
        --peft_config.lora_dropout 0.1 \
        --peft_config.target_module "score" "k_proj" \
        --reward_config.max_length 2048 \
        --reward_config.save_strategy epoch \
        --reward_config.per_device_train_batch_size "$BATCH_SIZE_PER_GPU" \
        --reward_config.gradient_accumulation_steps "$GRADIENT_ACC_STEPS" \
        --reward_config.learning_rate 1e-4 \
        --reward_config.lr_scheduler_type linear \
        --reward_config.warmup_ratio 0.03 \
        --reward_config.weight_decay 0. \
        --reward_config.num_train_epochs 1 \
        --reward_config.output_dir "$output_dir" \
        --reward_config.report_to wandb \
        --reward_config.logging_steps 10 \
        --reward_config.seed "$seed" \
        --reward_config.gradient_checkpointing True
done