#!/usr/bin/env bash
#
# Launches sft_model_embed.py through the deepspeed launcher for one dataset,
# after checking that the paths the job depends on exist.

# Environment script sourced before anything else runs.
# NOTE(review): presumably it prepares the runtime environment and the
# distributed-launch variables used further down — confirm against that file.
ENV_SCRIPT="/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/train_regression/env_volcengine.sh"

# Without this guard a missing file only produces a warning from `source` and
# the script keeps running with nothing configured.
if [[ ! -r "${ENV_SCRIPT}" ]]; then
    echo "Error: environment script not found or not readable: ${ENV_SCRIPT}" >&2
    exit 1
fi

# shellcheck source=/dev/null
source "${ENV_SCRIPT}"


# Dataset names seen so far (presumably the candidate values for DATA_NAME below):
# suzuki_50
# arylation
# buchwald_Cc1ccc(Nc2ccc(C(F)(F)F)cc2)cc1.csv
# buchwald_Cc1ccc(Nc2ccccn2)cc1.csv
# buchwald_Cc1ccc(Nc2cccnc2)cc1.csv
# buchwald_CCc1ccc(Nc2ccc(C)cc2)cc1.csv
# buchwald_COc1ccc(Nc2ccc(C)cc2)cc1.csv

# Name of the dataset to use. The value is fixed here; it is not read from the
# command line.
DATA_NAME='buchwald_Cc1ccc(Nc2ccccn2)cc1.csv'

# Project directory; the regression data directory checked further down lives
# under it.
PROJECT_DIR='/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS'

# Pretrained model handed to the training script. Its existence is verified
# further down, before the job is launched.
BASE_MODEL_PATH='/mnt/shared-storage-user/caipengxiang/H200-share/models/share/step1_llama3_8b_0916_yearly_pistachio_ep3'
# Alternative base model, currently disabled:
# BASE_MODEL_PATH="/mnt/shared-storage-user/caipengxiang/H200-ai4chem/ChemBOMAS_Models/Intern-S1-mini"

# Checkpoint directory. The per-dataset variant is disabled in favour of the
# "grouped" one.
# CHECKPOINT_DIR="/mnt/shared-storage-user/caipengxiang/H200-ai4chem/ChemBOMAS_results/share/llama-3.1-8B/clustered/${DATA_NAME}"
# grouped
CHECKPOINT_DIR='/mnt/shared-storage-user/caipengxiang/H200-ai4chem/ChemBOMAS_results/share/llama-3.1-8B/clustered/grouped_exp'

# Directory passed to the training script as its save path; it must already
# exist (verified further down).
SAVE_RESULTS_DIR='/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/info_encoder/exp_embed_cluster_results'

# Paths that must exist before the job is launched: the base model, the
# regression data directory inside the project, and the results directory.
required_paths=(
    "${BASE_MODEL_PATH}"
    "${PROJECT_DIR}/train_regression/data4regression"
    "${SAVE_RESULTS_DIR}"
)

# Inspect every entry before aborting so that a single run reports all missing
# paths instead of only the first one. Diagnostics go to stderr.
missing_count=0
for path in "${required_paths[@]}"; do
    if [[ ! -e "$path" ]]; then
        echo "Error: Required path not found: $path" >&2
        missing_count=$((missing_count + 1))
    fi
done

if (( missing_count > 0 )); then
    exit 1
fi

# Disable tokenizer parallelism to avoid deadlocks.
export TOKENIZERS_PARALLELISM=false
# export NVIDIA_TF32_OVERRIDE=0 # Disable the automatic conversion from NVIDIA FP32 to TF32

# Alternative launcher, kept for reference:
# torchrun --nnodes=${NNODES} \
#             --node_rank=${RANK} \
#             --nproc_per_node=${GPUS_PER_NODE} \
#             --master_addr=${MASTER_ADDR} \
#             --master_port=${MASTER_PORT} \

# The distributed-launch settings are not assigned in this file; presumably the
# sourced environment script or the job scheduler provides them (TODO confirm).
# Stop early with a clear message when one of them is unset or empty.
# NOTE(review): MASTER_ADDR is deliberately not checked because some launcher
# setups may tolerate an empty address — confirm, and add it to the list if it
# is always required.
for launch_var in NNODES GPUS MASTER_PORT RANK; do
    if [[ -z "${!launch_var}" ]]; then
        echo "Error: required environment variable is not set: ${launch_var}" >&2
        exit 1
    fi
done

# Options consumed by the deepspeed launcher itself.
launcher_args=(
    --num_nodes="${NNODES}"
    --num_gpus="${GPUS}"
    --master_addr="${MASTER_ADDR}"
    --master_port="${MASTER_PORT}"
    --node_rank="${RANK}"
)

# Options forwarded to sft_model_embed.py. Every expansion is quoted so that a
# value containing whitespace or glob characters stays a single argument.
script_args=(
    --pretrained_model_path "${BASE_MODEL_PATH}"
    --searchspace_name "${DATA_NAME}"
    --lora 1
    --data_name "${DATA_NAME}"
    --batch_size 48
    --checkpoint_dir "${CHECKPOINT_DIR}"
    --load_by_torch 1
    --save_path "${SAVE_RESULTS_DIR}"
)

# sft_model_embed.py is given as a relative path, so it is resolved against the
# current working directory. Building the command from arrays also removes the
# trailing line continuation that followed the last argument.
deepspeed "${launcher_args[@]}" sft_model_embed.py "${script_args[@]}"
