#!/usr/bin/env bash
#
# Launches a LoRA fine-tuning run of the model in MODEL_PATH on the dataset in
# DATSET_PATH using `accelerate launch` with DeepSpeed. The job is started in
# the background under nohup; stdout and stderr are both written to LOG_FILE
# in the current working directory.
set -euo pipefail

# GPUs made visible to the training processes; keep in sync with NUM_GPUS.
export CUDA_VISIBLE_DEVICES=0,1,2,3

BASE_OUTPUT_PATH=/datadrive/exp_artifacts

MODELS_PATH=/datadrive/models
MODEL_NAME=meta_llama_Llama_2_7b_hf
MODEL_PATH="${MODELS_PATH}/${MODEL_NAME}"

PROCESSES_DATA_PATH=/datadrive/data/processed
DATASET_NAME=oasst1/oasst1_data.jsonl
DATSET_PATH="${PROCESSES_DATA_PATH}/${DATASET_NAME}"

NUM_GPUS=4
BATCH_SIZE_PER_GPU=4
TOTAL_BATCH_SIZE=128

# Integer division would silently round down and change the effective batch
# size, so refuse to start when the numbers do not divide evenly.
if (( TOTAL_BATCH_SIZE % (NUM_GPUS * BATCH_SIZE_PER_GPU) != 0 )); then
  echo "error: TOTAL_BATCH_SIZE ($TOTAL_BATCH_SIZE) must be divisible by NUM_GPUS * BATCH_SIZE_PER_GPU ($(( NUM_GPUS * BATCH_SIZE_PER_GPU )))" >&2
  exit 1
fi
GRADIENT_ACC_STEPS=$(( TOTAL_BATCH_SIZE / NUM_GPUS / BATCH_SIZE_PER_GPU ))

OUTPUT_PATH="${BASE_OUTPUT_PATH}/${MODEL_NAME}_oasst1"
LOG_FILE=lama_7b_oassist_lora_ss.out

echo "Training llama model ${MODEL_NAME} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"

# Options consumed by `accelerate launch` itself (everything before the script path).
launcher_args=(
  --mixed_precision bf16
  --num_machines 1
  --num_processes "$NUM_GPUS"
  --use_deepspeed
  --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf
)

# Options forwarded to the fine-tuning script (LoRA training).
# NOTE(review): --warmup_ratio is passed together with a `constant` scheduler;
# presumably the warmup is ignored in that case -- confirm against finetune.py.
train_args=(
  --model_name_or_path "$MODEL_PATH"
  --use_flash_attn
  --use_lora
  --lora_rank 8
  --lora_alpha 32
  --lora_dropout 0.05
  --tokenizer_name "$MODEL_PATH"
  --use_slow_tokenizer
  --train_file "$DATSET_PATH"
  --max_seq_length 2048
  --preprocessing_num_workers 16
  --per_device_train_batch_size "$BATCH_SIZE_PER_GPU"
  --gradient_accumulation_steps "$GRADIENT_ACC_STEPS"
  --learning_rate 2e-5
  --lr_scheduler_type constant
  --warmup_ratio 0.03
  --weight_decay 0.
  --max_train_steps 41733
  --output_dir "$OUTPUT_PATH"
  --with_tracking
  --report_to wandb
  --logging_steps 100
)

# Redirect stdout to the log first, THEN point stderr at stdout. The reverse
# order (`2>&1 > file`) leaves stderr on the terminal and out of the log.
nohup accelerate launch \
  "${launcher_args[@]}" \
  open-instruct/open_instruct/finetune.py \
  "${train_args[@]}" \
  > "$LOG_FILE" 2>&1 &

echo "Started training in background (PID $!); logging to $LOG_FILE"
