#!/bin/bash
# Abort on the first failing command so the training job is never launched
# on top of a half-installed environment. `-u` is deliberately omitted: the
# rest of the script may read variables that are unset in a fresh environment.
set -eo pipefail

# --- Python dependencies ----------------------------------------------------
# Installed one at a time, in this order, so that later pins win over whatever
# an earlier package pulled in as a transitive dependency.
pip3 install accelerate==0.34.2
pip3 install torchtyping  # was misspelled "torchtypin", which pip cannot resolve
pip3 install transformers
pip3 install deepspeed==0.15.0
# NOTE(review): transformers is unpinned while tokenizers is pinned to 0.15.0;
# recent transformers releases require a newer tokenizers — confirm that the
# transformers version present in the image accepts this pin.
pip3 install tokenizers==0.15.0

# Remove and reinstall py-cpuinfo to get a clean copy.
# NOTE(review): presumably a workaround for a broken py-cpuinfo in the base
# image — confirm whether this is still needed.
pip3 uninstall py-cpuinfo -y
pip3 install py-cpuinfo

# Make the user-level site-packages importable by the training process.
# The variable must be exported, otherwise the child `python` only sees it when
# the caller had already exported PYTHONPATH. The `${VAR:+...}` form avoids a
# leading ':' (an empty path entry) when PYTHONPATH is unset or empty.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/home/naie/.local/lib/python3.9/site-packages"

# Rendezvous settings for a single-process run on this machine.
export MASTER_ADDR=localhost
export MASTER_PORT=29500
export WORLD_SIZE=1  # total number of processes (a single process here)
export RANK=0        # rank of this process, i.e. the first one

# Command-line arguments for pretrain_hf.py.
# Held in an array that is initialised here, so that (a) nothing inherited from
# the caller's environment leaks into the command line and (b) each argument is
# passed as exactly one word without relying on unquoted word splitting.
train_args=(
  --dataset_name openwebtext
  --data_path /opt/dpcvol/datasets/8625883998351850434/datasets/llm/minillm/processed_data/openwebtext/gpt2/512/10M/
  # Per the original note: only takes effect when validation_file is none.
  --save_path /opt/dpcvol/models/LLM_Distillation/results/gpt2_138M-token_500M
  # Custom model architecture config (the original note said "llama", but the
  # path points at a GPT-2 config: NEmbed 768, NHead 12, NLayer 14).
  --config_path /home/work/user-job-dir/app/minillm/configs/learngene/gpt2/gpt2-NEmbed_768_NHead_12_NLayer_14.json
  --tokenizer_path /opt/dpcvol/datasets/8625883998351850434/ckpt/minillm/minillm_official/gpt2/train/minillm/medium-init-xlarge-sft/
  --max_tokens 500000000  # 500 million tokens
  --trust_remote_code True

  --max_length 1024
  --seed_lm 7
  --model_type gpt2
)

# Launch training; the script's exit status is that of the Python process.
python /home/work/user-job-dir/app/minillm/pretrain_hf.py "${train_args[@]}"