# Usage: <script> TASK [TRY]
#   TASK  task name (required); used to build the training-data path and the
#         run/output names further down in this script.
#   TRY   attempt identifier appended to the run/output names (optional,
#         defaults to 1).
if [ -z "$1" ]; then
  # Diagnostics go to stderr so they stay visible when stdout is redirected.
  echo "No Task supplied" >&2
  echo "Usage: $0 TASK [TRY]" >&2
  exit 1
fi
TASK=$1

# Use 1 when the second argument is missing or empty.
TRY=${2:-1}

# GPUs exposed to the training processes. NUM_GPUS below is expected to match
# the number of device ids listed here; keep the two in sync when editing.
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Root directory for run outputs and location of the base model checkpoint.
BASE_OUTPUT_PATH="${HOME}/exp_artifacts"
MODELS_PATH="${HOME}/models"
MODEL_NAME=meta_llama_Llama_2_7b_hf
MODEL_PATH="${MODELS_PATH}/${MODEL_NAME}"

# Training data lives at ${HOME}/data/train/<TASK>/<TASK>_data.jsonl
PROCESSES_DATA_PATH="${HOME}/data/train"
DATASET_NAME="${TASK}/${TASK}_data.jsonl"
# NOTE: the misspelled name DATSET_PATH is kept on purpose; the launch command
# later in this script reads it under that name.
DATSET_PATH="${PROCESSES_DATA_PATH}/${DATASET_NAME}"

# Batch geometry:
#   TOTAL_BATCH_SIZE = NUM_GPUS * BATCH_SIZE_PER_GPU * GRADIENT_ACC_STEPS
NUM_GPUS=4
BATCH_SIZE_PER_GPU=4
TOTAL_BATCH_SIZE=128
PER_STEP_BATCH=$((NUM_GPUS * BATCH_SIZE_PER_GPU))

# Shell arithmetic is integer-only: a total that is not a multiple of the
# per-step batch would be truncated silently and change the effective batch
# size, so refuse to continue instead.
if [ "$PER_STEP_BATCH" -le 0 ] || [ $((TOTAL_BATCH_SIZE % PER_STEP_BATCH)) -ne 0 ]; then
  echo "TOTAL_BATCH_SIZE ($TOTAL_BATCH_SIZE) must be a positive multiple of NUM_GPUS * BATCH_SIZE_PER_GPU ($PER_STEP_BATCH)" >&2
  exit 1
fi
GRADIENT_ACC_STEPS=$((TOTAL_BATCH_SIZE / PER_STEP_BATCH))

# Per-run output directory; the same run name is exported for Weights & Biases.
OUTPUT_PATH="${BASE_OUTPUT_PATH}/baseline_${MODEL_NAME}_${TASK}_${TRY}"

export WANDB_NAME="baseline_${MODEL_NAME}_${TASK}_${TRY}"

# Report the model by MODEL_NAME (MODEL_SIZE is never defined in this script).
echo "Training llama model ${MODEL_NAME} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"


# LoRA training.
# The job is started with nohup and put in the background (&), so this script
# returns immediately while training keeps running.
# The DeepSpeed config and finetune.py paths are relative: run this script from
# the directory that contains ds_configs/ and open-instruct/.
# Both stdout and stderr are written to ${MODEL_NAME}_${TASK}_${TRY}.out in the
# current directory. The file redirection must come before 2>&1; in the
# opposite order stderr stays on the terminal and errors never reach the log.
nohup accelerate launch \
  --mixed_precision bf16 \
  --num_machines 1 \
  --num_processes "$NUM_GPUS" \
  --use_deepspeed \
  --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
  open-instruct/open_instruct/finetune.py \
  --model_name_or_path "${MODEL_PATH}" \
  --use_flash_attn \
  --use_lora \
  --lora_rank 8 \
  --lora_alpha 32 \
  --lora_dropout 0.05 \
  --tokenizer_name "${MODEL_PATH}" \
  --use_slow_tokenizer \
  --train_file "${DATSET_PATH}" \
  --max_seq_length 2048 \
  --preprocessing_num_workers 16 \
  --per_device_train_batch_size "$BATCH_SIZE_PER_GPU" \
  --gradient_accumulation_steps "$GRADIENT_ACC_STEPS" \
  --learning_rate 2e-5 \
  --lr_scheduler_type linear \
  --warmup_ratio 0.03 \
  --weight_decay 0. \
  --num_train_epochs 2 \
  --output_dir "${OUTPUT_PATH}" \
  --with_tracking \
  --report_to wandb \
  --logging_steps 10 > "${MODEL_NAME}_${TASK}_${TRY}.out" 2>&1 &
