#!/bin/bash
# Install the Python packages used by this training job.
# The commands run in order, and a later install may replace a version chosen
# by an earlier one, so the original ordering is preserved.
pip3 install mpu
pip3 install accelerate==0.34.2
# Package name corrected from "torchtypin"; torchtyping is also installed below.
pip3 install torchtyping
pip3 install transformers
pip3 install deepspeed==0.15.0
# NOTE(review): this tokenizers pin is followed by an `--upgrade tokenizers`
# further down, which may move it to a newer release — confirm which is wanted.
pip3 install tokenizers==0.14.1
pip install --upgrade --force-reinstall certifi
pip install --upgrade datasets huggingface_hub
pip install torchtyping rouge_score
pip install --upgrade transformers tokenizers
# Editable install of the local transformers-minillm checkout; it runs after
# the upstream transformers installs above.
pip3 install --no-cache-dir -e /opt/dpcvol/models/pkge/transformers-minillm/.
pip3 install thop
pip3 install pytorch_model_summary

# Remove and reinstall py-cpuinfo.
pip3 uninstall py-cpuinfo -y
pip3 install py-cpuinfo

# Append a Python 3.9 site-packages directory under /home/naie/.local to
# PYTHONPATH.
# NOTE(review): PYTHONPATH is assigned again later in this script
# (export PYTHONPATH=<BASE_PATH>), which replaces this value — confirm intent.
PYTHONPATH="${PYTHONPATH}:/home/naie/.local/lib/python3.9/site-packages"

# Launch layout handed to torchrun: one node, one process on that node.
GPUS_PER_NODE=1
NNODES=1
NODE_RANK=0
MASTER_ADDR='localhost'
MASTER_PORT=4097

# torchrun launcher flags, kept as a single space-separated string that is
# expanded into words when the command line is assembled.
DISTRIBUTED_ARGS="--nproc_per_node ${GPUS_PER_NODE}"
DISTRIBUTED_ARGS+=" --nnodes ${NNODES}"
DISTRIBUTED_ARGS+=" --node_rank ${NODE_RANK}"
DISTRIBUTED_ARGS+=" --master_addr ${MASTER_ADDR}"
DISTRIBUTED_ARGS+=" --master_port ${MASTER_PORT}"

# ---- model ----
# BASE_PATH is the working directory; the script later uses it for --base-path,
# for PYTHONPATH, and to locate minillm/finetune.py and the DeepSpeed config.
BASE_PATH='/home/naie/work/'
# Checkpoint directory given to --model-path and the name given to --ckpt-name.
CKPT='/opt/dpcvol/models/LLM_Distillation/des-sft/gpt2-xl/results/gpt2_138M-token_10B/'
CKPT_NAME='gpt2-base'
# Alternative checkpoint (download automatically):
# CKPT="gpt2"
# Teacher model settings, currently disabled:
# TEACHER_CKPT_NAME="xlarge-sft"
# TEACHER_CKPT="/opt/dpcvol/datasets/8625883998351850434/ckpt/minillm/minillm_official/gpt2/train/sft/gpt2-xlarge/"

# ---- data ----
DATA_DIR='/opt/dpcvol/datasets/8625883998351850434/datasets/llm/minillm/processed_data/dolly/pseudo/qwen3-4b_2_gpt2/'

# ---- hyper-parameters ----
LR=0.0005
BATCH_SIZE=2
EVAL_BATCH_SIZE=8
GRAD_ACC=1

# ---- sequence length ----
MAX_LENGTH=512

# ---- runtime ----
# Output directory; it is created just before the training command runs.
SAVE_PATH='/opt/dpcvol/datasets/8625883998351850434/ckpt/minillm/learngene/qwen3-4b/kd/Des-138M-Pre-10Btoken-seqkd-xl/'

# ---- seed ----
SEED=10


# Command-line options for finetune.py, listed one "flag [value]" entry per
# line and then joined into the single string OPTS. Every entry is prefixed
# with one space, so OPTS starts with a space and entries are space-separated.
opt_entries=(
  # model
  "--base-path ${BASE_PATH}"
  "--model-path ${CKPT}"
  "--tokenizer-path /opt/dpcvol/datasets/8625883998351850434/ckpt/minillm/minillm_official/gpt2/train/minillm/medium-init-xlarge-sft/"
  # "--teacher-model-path ${TEACHER_CKPT}"
  "--ckpt-name ${CKPT_NAME}"
  # "--teacher-ckpt-name ${TEACHER_CKPT_NAME}"
  # "--teacher-model-fp16"
  "--n-gpu ${GPUS_PER_NODE}"
  # "--gradient-checkpointing"

  # data
  "--data-dir ${DATA_DIR}"
  "--num-workers 4"
  "--dev-num 1000"

  # hp
  "--lr ${LR}"
  "--batch-size ${BATCH_SIZE}"
  "--eval-batch-size ${EVAL_BATCH_SIZE}"
  "--gradient-accumulation-steps ${GRAD_ACC}"
  "--warmup-iters 0"
  "--lr-decay-style cosine"
  "--weight-decay 1e-2"
  "--clip-grad 1.0"
  "--epochs 20"
  "--kd-ratio 0.5"

  # length
  "--max-length ${MAX_LENGTH}"
  "--max-prompt-length 256"

  # runtime
  "--do-train"
  "--do-valid"
  "--eval-gen"
  "--save-interval -1"
  "--eval-interval -1"
  "--log-interval 4"
  "--mid-log-num -1"
  "--save ${SAVE_PATH}"

  # seed
  "--seed ${SEED}"

  # deepspeed
  "--deepspeed"
  "--deepspeed_config ${BASE_PATH}/minillm/configs/deepspeed/ds_config.json"

  # type
  "--type kd"

  # gen
  "--do-sample"
  "--top-k 0"
  "--top-p 1.0"
  "--temperature 1.0"
)

OPTS=""
for opt_entry in "${opt_entries[@]}"; do
  OPTS+=" ${opt_entry}"
done
# The helper names are only needed while building OPTS.
unset opt_entry opt_entries


# Environment exported to the training processes.
export NCCL_DEBUG=""
export WANDB_DISABLED=True
export TF_CPP_MIN_LOG_LEVEL=3
# NOTE(review): this assignment replaces whatever PYTHONPATH held before this
# point rather than extending it — confirm that is intended.
export PYTHONPATH="${BASE_PATH}"

# Assemble the launch command as an array so that arguments forwarded from the
# caller ("$@") keep their original word boundaries and are not glob-expanded.
# DISTRIBUTED_ARGS and OPTS are flat space-separated strings, so they are
# expanded unquoted on purpose to split them back into words.
# shellcheck disable=SC2206
CMD=(torchrun ${DISTRIBUTED_ARGS} "${BASE_PATH}/minillm/finetune.py" ${OPTS} "$@")

# Log the command and the effective PYTHONPATH before starting.
echo "${CMD[*]}"
echo "PYTHONPATH=${PYTHONPATH}"
mkdir -p "${SAVE_PATH}"
# Last command of the script: its exit status becomes the script's status.
"${CMD[@]}"
