# Root directories: DATA_DIR holds the tokenized training data and OUTPUT_DIR
# holds both the initial model and the training output (see the launch command).
# NOTE(review): the "xx" values look like placeholders — set real values before running.
DATA_DIR=xx
OUTPUT_DIR=xx
## *************************************
## Custom Input
export residual_num_layer=3 ## L=3
## *************************************
# Components that make up the job name.
export time=xx
export version=xx
export init_model_name=gpt-j-6b
export train_data=train
# Job name layout: <time>.<version>.<init_model_name>.layer-<residual_num_layer>
export train_job_name="${time}.${version}.${init_model_name}.layer-${residual_num_layer}"
infer_job_name="inference.${train_job_name}"
## *************************************
## Train Setup
export TOT_CUDA="0,1,2,3"
# Split the comma-separated device list into an array. Using `read -a` with a
# command-local IFS avoids the word-splitting and pathname globbing that an
# unquoted array assignment would be subject to.
IFS=',' read -r -a CUDAs <<< "${TOT_CUDA}"
# Number of devices listed in TOT_CUDA.
CUDA_NUM=${#CUDAs[@]}
PORT="1234"
# Maximum lengths passed to the trainer as --q_max_len / --p_max_len.
q_max_len=32
p_max_len=128
# Sub-directory of DATA_DIR that the training data is read from.
TOKENIZER_ID=gpt2
# # **********************************************
# # Train
# # **********************************************
# Launch ../ancetele/gpt-train.py through the DeepSpeed launcher, restricted to
# the localhost devices listed in TOT_CUDA and using PORT as the master port.
# Every expansion is quoted so that a value containing whitespace or glob
# characters is still passed as exactly one argument.
# NOTE(review): --residual_encoder_name_or_path is set to "xx", which looks like
# a placeholder — confirm the real path before running.
# NOTE(review): the final argument line keeps its trailing continuation
# backslash from the original; any further arguments must follow it directly.
deepspeed --include "localhost:${TOT_CUDA}" --master_port "${PORT}" ../ancetele/gpt-train.py \
--deepspeed ./deepspeed_configs/ds_config_zero2.json \
--output_dir "${OUTPUT_DIR}/${train_job_name}" \
--model_name_or_path "${OUTPUT_DIR}/${init_model_name}" \
--residual_encoder_name_or_path xx \
--train_dir "${DATA_DIR}/${TOKENIZER_ID}/${train_data}" \
--save_steps 4000 \
--fp16 \
--per_device_train_batch_size 2 \
--train_n_passages 16 \
--learning_rate 5e-6 \
--q_max_len "${q_max_len}" \
--p_max_len "${p_max_len}" \
--num_train_epochs 3 \
--dataloader_num_workers 1 \
--tensorboard \
--fix_gpt \
--save_total_limit 1 \
--residual_num_layer "${residual_num_layer}" \