
# ---------------------------------------------------------------------------
# Process environment exported to the training job.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous and
# is normally a debugging aid -- confirm it is intended for real runs.
# ---------------------------------------------------------------------------
export CUDA_LAUNCH_BLOCKING=1 \
       CUDA_DEVICE_MAX_CONNECTIONS=1 \
       OMP_NUM_THREADS=16
# export CUDA_VISIBLE_DEVICES=6

# ---------------------------------------------------------------------------
# Model shape: 12 layers / hidden size 768 / 12 attention heads, i.e. the
# dimensions of the original BERT-Base (110M) model. This configuration is
# not one of those listed in the Megatron-LM paper.
# ---------------------------------------------------------------------------
model_size="0.11"
num_layers="12"
hidden_size="768"
num_attn_heads="12"
init_std="0.02"

# Pipeline parallelism: a single stage, with pipeline parallelism disabled.
pp_size="1"
no_pp="true"

# Input data and tokenizer locations (relative to the working directory).
TRAIN_DATA_PATH="../squad/process/training"
VALID_DATA_PATH="../squad/process/testing"
TOKENIZER_PATH="../models/pythia"

# Output locations. Logs and checkpoints both live under ${output_home};
# the model that is loaded for fine-tuning comes from
# ${pretraining_model_path}.
output_home="../megatron_ckpt/dsuqad"
log_path="${output_home}/log/"
checkpoint_path="${output_home}/squad"
pretraining_model_path="../finetune_squad/lr2e-5_LP"

# NOTE(review): an earlier comment here recommended keeping the tensorboard
# path on NFS rather than Blob storage because tensorboard is logged by the
# last rank (a Microsoft-internal constraint). No tensorboard option is set
# in the visible part of this script -- confirm whether that still applies.
mkdir -p -- "${checkpoint_path}" "${log_path}"
###############################################################################
# Dataset / tokenizer flags. The list is assembled as an array and then
# flattened into the space-separated string that the launch command expands
# (unquoted) into individual arguments.
data_args=(
    --train-data-path "${TRAIN_DATA_PATH}"
    --valid-data-path "${VALID_DATA_PATH}"
    --tokenizer-model "${TOKENIZER_PATH}"
    --tokenizer-type HFTokenizer
    --data-impl mmap
)
data_options="${data_args[*]}"

# Batch sizes; the same two values are passed to the trainer below and are
# available to later parts of the script.
global_batch_size=48
batch_size=6

# Trainer flags, grouped by purpose. Order matches the original flag order.
megatron_args=(
    # optimizer / parallelism / LR schedule length
    --override-opt_param-scheduler
    --adam-beta1 0.9
    --adam-beta2 0.95
    --tensor-model-parallel-size 1
    --lr-decay-iters 25000
    --lr-warmup-iters 1000

    # batch and model shape
    --micro-batch-size "${batch_size}"
    --global-batch-size "${global_batch_size}"
    --num-layers "${num_layers}"
    --hidden-size "${hidden_size}"
    --num-attention-heads "${num_attn_heads}"
    --seq-length 2048
    --max-position-embeddings 2048

    # training length and learning rate
    --train-iters 25000
    --lr 1e-5
    --min-lr 1e-6
    --lr-decay-style linear

    # logging / evaluation / checkpoint cadence
    # (eval-interval is larger than train-iters as written)
    --log-interval 10
    --eval-interval 50000
    --eval-iters 1
    --save-interval 1578

    # regularisation
    --weight-decay 1e-2
    --clip-grad 0.0

    # checkpoint locations
    --load "${pretraining_model_path}"
    --save "${checkpoint_path}"

    # architecture switches and data loading
    --swiglu
    --use-rotary-position-embeddings
    --rotary-percent 0.25
    --num-workers 16

    # fine-tuning behaviour and seeding
    --finetune
    --no-load-rng
    --seed 6666

    # length-prediction options
    --length-predict
    --max-predict-length 2048
    --length-factor 0.1
    --load-LP-module
)
megatron_options="${megatron_args[*]}"
# Disabled DPO flags, kept for reference:
#     --dpo-training
#     --dpo-update-model-step 9468
#     --dpo-factor 5

# ---------------------------------------------------------------------------
# DeepSpeed JSON config generation.
#
# The config is produced by substituting placeholder tokens in
# ${template_json} and writing the result to ${config_json}.
# ---------------------------------------------------------------------------
log_interval=100
zero_stage=1
template_json="../ds_config/ds_config_bert_TEMPLATE.json"
config_json="../ds_config_finetuning_squad_dpo.json"

#######################################
# Print a DeepSpeed config rendered from a template.
# Each placeholder is replaced once per line (first occurrence), in the
# order listed below.
# Globals:
#   global_batch_size, batch_size, log_interval, zero_stage (all read)
# Arguments:
#   $1 - path to the template file
# Outputs:
#   the rendered config on stdout
# Returns:
#   the exit status of sed (non-zero if the template cannot be read)
#######################################
render_ds_config() {
    local template=$1

    # PRESCALE_GRAD is the only value that depends on the ZeRO stage:
    # "false" when ZeRO is enabled (stage > 0), "true" otherwise.
    local prescale_grad="true"
    if [[ ${zero_stage} -gt 0 ]]; then
        prescale_grad="false"
    fi

    # NOTE(review): "INSTIAL_SCALE_POWER" is spelled exactly as in the
    # original script; presumably it matches the token used in the template
    # -- verify before "correcting" it.
    sed \
        -e "s/CONFIG_BATCH_SIZE/${global_batch_size}/" \
        -e "s/CONFIG_MBSIZE/${batch_size}/" \
        -e "s/LOG_INTERVAL/${log_interval}/" \
        -e "s/ZERO_STAGE/${zero_stage}/" \
        -e "s/PRESCALE_GRAD/${prescale_grad}/" \
        -e "s/CONFIG_FP16_ENABLED/true/" \
        -e "s/CONFIG_BF16_ENABLED/false/" \
        -e "s/INSTIAL_SCALE_POWER/16/" \
        -- "${template}"
}

render_ds_config "${template_json}" > "${config_json}"

# DeepSpeed runtime flags, accumulated into one space-separated string that
# the launch command below expands (unquoted) into individual arguments.
deepspeed_options="--deepspeed"
deepspeed_options+=" --deepspeed_config ${config_json}"
deepspeed_options+=" --zero-stage ${zero_stage}"
deepspeed_options+=" --pipeline-model-parallel-size ${pp_size}"

# Pipeline parallelism is switched off only when no_pp is exactly "true".
case "${no_pp}" in
    true) deepspeed_options+=" --no-pipeline-parallel" ;;
esac

# NOTE(review): activation_checkpoint is not assigned anywhere in the visible
# script; presumably it is expected from the caller's environment. When it is
# unset or anything other than "true", the flag is simply omitted.
if [[ "${activation_checkpoint}" == "true" ]]; then
    deepspeed_options+=" --deepspeed-activation-checkpointing"
fi

# Launch on local GPUs 4-7. The three option strings are intentionally left
# unquoted so that each flag becomes its own argument.
# shellcheck disable=SC2086
deepspeed --include localhost:4,5,6,7 --master_port 22854 \
    ../pretrain_gebert.py \
    ${megatron_options} ${data_options} ${deepspeed_options}


# ,2,3,4,5,6,7
# 

# 