#!/bin/bash
# Launch one FastChat fine-tuning run for a single training-data file.
#
# Usage: bash <this_script> <train_data.json> [extra args...]
#
# Only the first argument is used (as the training data path); the full
# argument list is recorded in the experiment folder for reference.
#
# The experiment folder is exp_rule/<name>, where <name> is the basename of
# the data file with a trailing ".json" replaced by "_exp". The folder
# receives a copy of this script, a copy of fastchat/train/train.py, a symlink
# to the training data, a copy of the eval data, the training log and the
# extracted eval-loss lines.
set -xe
# Without pipefail the status of `python ... |& tee` is tee's status, so a
# crashed training run would look successful and the script would carry on.
set -o pipefail

if [[ $# -lt 1 || -z $1 ]]; then
    printf 'Usage: %s <train_data.json>\n' "$0" >&2
    exit 2
fi

data_path=$1
# Anchor and escape the extension so only a real ".json" suffix is rewritten
# (an unanchored '.json' would also match e.g. the "_json" in "my_json_x.json").
exp_name=$(basename -- "$data_path" | sed 's/\.json$/_exp/')
ngpu=8

exp_series=exp_rule

exp_folder=${exp_series}/${exp_name}
mkdir -p -- "$exp_folder"
# Snapshot the launcher and the training code next to the results.
cp -- "$0" "$exp_folder"
cp fastchat/train/train.py "$exp_folder"
# Resolve the data path in its own statement so that a realpath failure is
# caught by `set -e` instead of being hidden inside an argument.
data_abs_path=$(realpath -- "$data_path")
# NOTE: plain `ln -s` fails if the link already exists, so re-running with the
# same data file aborts here.
ln -s -- "$data_abs_path" "$exp_folder/train_data.json"
printf '%s\n' "$*" > "$exp_folder/execute_one_args.txt"

# Placeholders: the '#' is mid-word, so bash does NOT treat it as a comment;
# each variable literally holds the placeholder text. Replace both values with
# real paths before running.
model_path=#path_to_hf_llama_2_7B
eval_data_path=#path_to_eval_data
cp -- "$eval_data_path" "$exp_folder/eval_data.json"

# Run training on ${ngpu} processes; stdout and stderr are both shown and
# saved to train.log.
python -m torch.distributed.run --nproc_per_node="${ngpu}" --master_port=9827 fastchat/train/train.py \
    --model_name_or_path "$model_path" \
    --data_path "$data_path" \
    --eval_data_path "$eval_data_path" \
    --output_dir "$exp_folder" \
    --num_train_epochs 3 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 32 \
    --evaluation_strategy "epoch" \
    --save_strategy "no" \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to tensorboard \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --model_max_length 4096 \
    --gradient_checkpointing True \
    --lazy_preprocess True \
    --fp16 False \
    --bf16 True \
    --tf32 True \
    --use_8bit False \
    --disable_tqdm False |& tee "$exp_folder/train.log"

# Collect the eval-loss lines from the log. grep exits 1 when nothing matches,
# so the script's exit status is non-zero if no eval_loss line was logged.
grep eval_loss "$exp_folder/train.log" > "$exp_folder/eval_loss.log"