#!/usr/bin/env bash
#
# Launches BERT_AdamA_pretrain.py through torch.distributed.launch with
# 8 worker processes on this machine (rendezvous at 127.0.0.1:23456),
# exposing GPUs 0-7 to the workers via CUDA_VISIBLE_DEVICES.
#
# Usage: run from the directory that contains BERT_AdamA_pretrain.py and
# BERTLarge_config.json. The training script is referenced by a relative
# path, and the config, summary and checkpoint locations are all derived
# from the current working directory.
#
# NOTE(review): newer PyTorch releases document torch.distributed.launch as
# deprecated in favour of torchrun; switching changes how the local rank is
# handed to the training script, so confirm against BERT_AdamA_pretrain.py
# before migrating.

# Directory the script is invoked from; every path below is anchored here.
base_dir=$(pwd)

# Passed to the training script as --job_name.
JOB_NAME=BERTLarge_grad8

# All path arguments are quoted so that a working directory containing
# spaces or glob characters is passed through as a single argument.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch \
  --nproc_per_node 8 \
  --master_addr="127.0.0.1" \
  --master_port=23456 \
  BERT_AdamA_pretrain.py \
  --cf "${base_dir}/BERTLarge_config.json" \
  --max_seq_length 128 \
  --job_name "${JOB_NAME}" \
  --grad_accumulation_step 8 \
  --train_micro_batch_size_per_gpu 16 \
  --output_dir "${base_dir}/BERTLarge_AdamA_summary" \
  --print_steps 200 \
  --saved_model_path "${base_dir}/BERTLarge_AdamA_checkpoint" \
  --max_steps 2000000
