# Runtime environment for every child process started by this script
# (the torchrun launch below inherits all three).
#   CUDA_LAUNCH_BLOCKING        - CUDA runtime setting; 1 makes kernel launches
#                                 synchronous per NVIDIA's documentation.
#   CUDA_DEVICE_MAX_CONNECTIONS - CUDA runtime setting, pinned to 1 here.
#   OMP_NUM_THREADS             - OpenMP thread count per process.
CUDA_LAUNCH_BLOCKING=1
CUDA_DEVICE_MAX_CONNECTIONS=1
OMP_NUM_THREADS=16
export CUDA_LAUNCH_BLOCKING CUDA_DEVICE_MAX_CONNECTIONS OMP_NUM_THREADS

# Model architecture values forwarded to the evaluation script as
# --num-layers / --hidden-size / --num-attention-heads.
num_layers='12'
hidden_size='768'
num_attn_heads='12'

# Not referenced anywhere in the visible part of this script; kept in case
# another part of the file (or a sourcing script) reads them.
# NOTE(review): model_size is presumably in billions of parameters - confirm.
model_size='0.11'
init_std='0.02'

# Checkpoint directory: passed as --load, and its `latest` file is rewritten
# for each evaluated step.
MODEL_PATH='../model'

# Input data passed to the evaluation script as --inputfile.
inputfile='../xsum/testing.jsonl'

# NOTE(review): TOKENIZER_PATH is not read in the visible part of this script;
# the launch below passes a literal --tokenizer-model ../pythia instead.
# Confirm which of the two paths is the intended one.
TOKENIZER_PATH='./models/pythia'

# Value passed to the evaluation script as --length-beam.
LENGTH_beam='5'
# Run ../evaluate_generation.py once per checkpoint step listed in the loop.
#
# Reads:   MODEL_PATH, inputfile, LENGTH_beam, num_layers, hidden_size,
#          num_attn_heads (set earlier in this script).
# Honors:  outfile / extra_outfile if the caller provides them (e.g. via the
#          environment); otherwise per-step defaults in the current directory
#          are used. Previously these were expanded without ever being set in
#          this script, which left --outfile and --extra-outfile without a
#          value.
# Writes:  ${MODEL_PATH}/latest, plus whatever the evaluation script writes.
# Note:    the positional parameters of this script are overwritten, because
#          `set --` is used as a POSIX-portable argument array (the file has no
#          shebang, so bash-only arrays are avoided).

# Loop-invariant: restrict the launched process to GPU 0.
export CUDA_VISIBLE_DEVICES=0

for step in 2429
do
    # Record the step tag in the checkpoint directory's `latest` file.
    # NOTE(review): presumably this is what the checkpoint loader reads to pick
    # the step to load via --load - confirm against the loader.
    printf 'global_step%s\n' "${step}" > "${MODEL_PATH}/latest"

    # Resolve output paths without reassigning outfile/extra_outfile, so a
    # default computed for one step never leaks into the next iteration.
    gen_out=${outfile:-./generation_step${step}.jsonl}
    gen_extra_out=${extra_outfile:-./generation_step${step}.extra.jsonl}

    # One word per argument: values containing spaces or glob characters
    # survive intact. The duplicated --no-load-rng flag is passed only once.
    # NOTE(review): --tokenizer-model is a literal path and ignores
    # TOKENIZER_PATH defined earlier; left unchanged pending confirmation.
    set -- \
        --load "${MODEL_PATH}" \
        --seq-length 2048 \
        --micro-batch-size 2 \
        --tokenizer-type HFTokenizer \
        --tokenizer-model ../pythia \
        --tensor-model-parallel-size 1 \
        --num-layers "${num_layers}" \
        --hidden-size "${hidden_size}" \
        --num-attention-heads "${num_attn_heads}" \
        --max-position-embeddings 2048 \
        --deepspeed \
        --fp16 \
        --no-load-rng \
        --swiglu \
        --use-rotary-position-embeddings \
        --rotary-percent 0.25 \
        --attention-softmax-in-fp32 \
        --outfile "${gen_out}" \
        --inputfile "${inputfile}" \
        --max-iter 10 \
        --length-beam "${LENGTH_beam}" \
        --length-predict \
        --max-predict-length 2048 \
        --load-LP-module \
        --extra-outfile "${gen_extra_out}"

    torchrun --master_port 35413 ../evaluate_generation.py "$@"
done