#!/bin/bash

# pip3 install accelerate==0.34.2
# pip3 install torchtyping
# pip3 install transformers
# pip3 install deepspeed==0.15.0
# pip3 install tokenizers==0.14.1
# pip install --upgrade --force-reinstall certifi
# pip install --upgrade datasets huggingface_hub
# pip install torchtyping rouge_score

# pip3 uninstall py-cpuinfo -y
# pip3 install py-cpuinfo
# pip3 install numerize

# Quiet TensorFlow's native (C++) logging.
export TF_CPP_MIN_LOG_LEVEL=3

# Project root: used both as the sole PYTHONPATH entry and as the working
# directory for everything that runs after this point.
app_dir=/home/work/user-job-dir/app

# PYTHONPATH=$PYTHONPATH:/home/naie/.local/lib/python3.9/site-packages
export PYTHONPATH="${app_dir}"

# Switch to the project root -- important! Abort if that fails, so later
# steps never run from an unexpected working directory.
cd "${app_dir}" || {
  printf 'error: cannot cd to %s\n' "${app_dir}" >&2
  exit 1
}

# Prompt-only processing of the Dolly data, for MiniLLM training.
# Arguments are collected in an array so each flag sits on its own line
# without backslash continuations.
dolly_vol=/opt/dpcvol/datasets/8625883998351850434
prompt_only_args=(
  --data-dir "${dolly_vol}/datasets/llm/dolly_raw/"
  --processed-data-dir "${dolly_vol}/datasets/llm/minillm/processed_data/dolly/prompt/"
  --model-path "${dolly_vol}/ckpt/minillm/minillm_official/qwen3-4b/"
  --data-process-workers 32
  --max-prompt-length 256
  --dev-num 1000
  --only-prompt
  --model-type qwen3
)
python3 -m minillm.tools.process_data_dolly "${prompt_only_args[@]}"

# Prompt-and-response processing of the Dolly data, for the baselines.
# Arguments are collected in an array so each flag sits on its own line
# without backslash continuations.
dolly_vol=/opt/dpcvol/datasets/8625883998351850434
full_args=(
  --data-dir "${dolly_vol}/datasets/llm/dolly_raw/"
  --processed-data-dir "${dolly_vol}/datasets/llm/minillm/processed_data/dolly/full/"
  --model-path "${dolly_vol}/ckpt/minillm/minillm_official/qwen3-4b/"
  --data-process-workers 32
  --max-prompt-length 256
  --dev-num 1000
  --model-type qwen3
)
python3 -m minillm.tools.process_data_dolly "${full_args[@]}"
