#!/bin/bash
# ---- Run configuration -------------------------------------------------------
# Dataset flavour; used as a path component under data/ and in the output name.
data_type=curated
# Number of training examples to generate.
# Sizes listed by the original author: 1 4 16 64 256 1000 4096
data_size=64
# Seed passed both to make_data.py and to the trainer.
seed=0
model_name_or_path=models/chinese-llama-13b
output_path_prefix=./
deepspeed_config_file=configs/deepspeed_config.json
# Used for both the per-device train and eval batch size.
batch_size=1
gradient_accumulation=8

# Build the dataset for this size/type/seed. Abort on failure so that training
# never starts on a missing or stale dataset.
python make_data.py \
    --data_size "${data_size}" \
    --data_type "${data_type}" \
    --seed "${seed}" \
    || { printf 'error: make_data.py failed; aborting\n' >&2; exit 1; }

# Dataset directories handed to the trainer.
# NOTE(review): presumably these are written by make_data.py — confirm.
train_dir="data/${data_type}/${data_size}_seed${seed}"
valid_dir="data/${data_type}/valid"
# Minute-resolution timestamp: two runs started within the same minute share
# the same output directory and log file.
TIME_STAMP=$(date "+%Y%m%d-%H%M")
output_path="${output_path_prefix}/${data_type}-${data_size}"

# Create the per-configuration output root; stop if it cannot be created,
# because both the checkpoints and the log file live beneath it.
mkdir -p -- "${output_path}" \
    || { printf 'error: cannot create output directory: %s\n' "${output_path}" >&2; exit 1; }
output_dir="${output_path}/${TIME_STAMP}"
log_file="${output_path}/${TIME_STAMP}.log"


# Choose the epoch count from the training-set size.
# Per the original author: training on a small data volume results in fewer
# epochs when using DeepSpeed, so the smaller sizes get a larger epoch count.
# Any size not listed below uses the default of 15.
case "${data_size}" in
    1 | 10)   num_train_epochs=25 ;;
    4 | 40)   num_train_epochs=20 ;;
    16 | 160) num_train_epochs=18 ;;
    *)        num_train_epochs=15 ;;
esac

# Arguments forwarded to train_sft.py. Kept in an array so every value is
# passed as exactly one word, even if it contains spaces or glob characters,
# and so an empty value is passed through instead of silently dropped.
train_args=(
    --deepspeed "${deepspeed_config_file}"
    --model_name_or_path "${model_name_or_path}"
    --train_dir "${train_dir}"
    --valid_dir "${valid_dir}"
    --per_device_train_batch_size "${batch_size}"
    --per_device_eval_batch_size "${batch_size}"
    --gradient_accumulation_steps "${gradient_accumulation}"
    --num_train_epochs "${num_train_epochs}"
    --save_strategy "epoch"
    --evaluation_strategy "epoch"
    --save_total_limit 15
    --learning_rate 1e-5
    --end_learning_rate 1e-6
    --weight_decay 0.1
    --adam_beta1 0.9
    --adam_beta2 0.95
    --logging_steps 10
    --fp16
    --seed "${seed}"
    --output_dir "${output_dir}"
)

# Launch training on 8 GPUs through the deepspeed launcher, with stdout and
# stderr both captured in the log file. nohup keeps the job alive if the
# terminal hangs up; there is no trailing '&', so this script blocks until
# deepspeed exits.
nohup deepspeed --num_gpus 8 \
    train_sft.py \
    "${train_args[@]}" \
    > "${log_file}" 2>&1