#!/usr/bin/env bash
# Load the cluster environment settings before anything else.
# NOTE(review): presumably this file defines NNODES, GPUS, MASTER_ADDR,
# MASTER_PORT and RANK, which the deepspeed launch command reads — confirm.
env_setup_script="/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS/train_regression/env_volcengine.sh"

# A non-interactive bash only prints a warning when `source` cannot open its
# file and then keeps going, so check readability explicitly and stop here.
if [ ! -r "${env_setup_script}" ]; then
    echo "Error: cannot read environment script: ${env_setup_script}" >&2
    exit 1
fi

# shellcheck source=/dev/null
source "${env_setup_script}"

# Dataset names (presumably the valid values for DATA_NAME — confirm):
#   crossed_barrel
#   dye_lasers
#   lnp3
#   perovskites
#   redoxmers


# Read the dataset name from the first command-line argument.
# When no argument is given, fall back to "redoxmers" (the value that used to
# be hard-coded here), so running the script without arguments is unchanged.
DATA_NAME="${1:-redoxmers}" # name of the dataset to train on

# Base model location; passed to the trainer as --pretrained_model_path and
# verified to exist before launch.
BASE_MODEL_PATH="/mnt/shared-storage-user/caipengxiang/H200-ai4chem/Llama-3.1-8B"

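# Disabled sanity check, kept for reference.
# NOTE: as written the test can never be true, because -z is applied to a
# string that always contains the literal "./train_regression/" prefix.
# if [ -z "./train_regression/${DATA_NAME}" ]; then
#     echo "Error: DATA_NAME environment variable is required"
#     exit 1
# fi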

# Locations used by this run, followed by the list of paths that must all
# exist before training is launched.
SAVE_RESULTS_DIR='/mnt/shared-storage-user/caipengxiang/H200-ai4chem/Sci_results'
PROJECT_DIR='/mnt/shared-storage-user/caipengxiang/workspace/ChemBOMAS'
NOTES='cluster_config/method_notes.txt'

# Entries without a leading "/" are relative to the current working directory.
required_paths=()
required_paths+=("$BASE_MODEL_PATH")
required_paths+=("$PROJECT_DIR/train_regression/data4regression")
required_paths+=('yield_ft_ds_config.json')
required_paths+=("$SAVE_RESULTS_DIR" "$NOTES")

# Free-text note describing this run; forwarded to the trainer through
# --latest_method_notes.
latest="_sci_ : use llama in sci dataset"

# Quoted so the note is printed verbatim (no word splitting or globbing).
printf '%s\n' "$latest"

# Verify every required path before launching. All missing paths are reported
# on stderr, so they can be fixed in one pass, and the script then exits 1.
missing_count=0
for path in "${required_paths[@]}"; do
    if [ ! -e "$path" ]; then
        echo "Error: Required path not found: $path" >&2
        missing_count=$((missing_count + 1))
    fi
done

if [ "$missing_count" -ne 0 ]; then
    exit 1
fi

# NOTE(review): presumably read by the HuggingFace tokenizers library to turn
# off its internal parallelism — confirm.
export TOKENIZERS_PARALLELISM=false

# Fail fast with a clear message when a launcher setting is missing. Without
# these guards an unset variable would be handed to deepspeed as an empty
# value (e.g. `--num_nodes=`). `${VAR:?msg}` prints msg to stderr and makes a
# non-interactive shell exit with a non-zero status when VAR is unset or empty.
: "${NNODES:?NNODES must be set (passed to deepspeed as --num_nodes)}"
: "${GPUS:?GPUS must be set (passed to deepspeed as --num_gpus)}"
: "${MASTER_PORT:?MASTER_PORT must be set (passed to deepspeed as --master_port)}"
: "${RANK:?RANK must be set (passed to deepspeed as --node_rank)}"
# MASTER_ADDR is intentionally not checked: an empty value is still passed
# through as `--master_addr=` and left to the launcher to interpret.

# Options consumed by the deepspeed launcher itself.
launcher_args=(
    --num_nodes="${NNODES}"
    --num_gpus="${GPUS}"
    --master_addr="${MASTER_ADDR}"
    --master_port="${MASTER_PORT}"
    --node_rank="${RANK}"
)

# Options forwarded to yield_ft_ds_with_cluster.py. Every expansion is quoted
# so that a value containing whitespace stays a single argument.
train_args=(
    --pretrained_model_path "${BASE_MODEL_PATH}"
    --lora_adapter_path "None"
    --yield_predictor_path "None"
    --num_epoch 100
    --lr 1e-4
    --data_path "${PROJECT_DIR}/train_regression/data4regression"
    --data_name "${DATA_NAME}"
    --per_device_train_batch_size 24
    --save_root "${SAVE_RESULTS_DIR}/train_regression/saved_models/${DATA_NAME}"
    --gradient_accumulation_steps 1
    --use_lora 1
    --log_file "exp_train_${DATA_NAME}.log"
    --deepspeed_config "yield_ft_ds_config.json"
    --mlp_lr_multiplier 1
    --save_interval 30
    --save_lora_adapter 1
    --save_predictor 1
    --lora_adapter_save_path "${SAVE_RESULTS_DIR}/share/llama-3.1-8B/clustered/${DATA_NAME}"
    --run_test 0
    --eval_save_ckpt 0
    --pooling_method "last_token"
    --cluster_config "cluster_config/cluster_config.yaml"
    --wandb_offline 1
    --load_by_torch 0
    --latest_method_notes "${latest}"
    --use_cluster 0
)

# The training script path is relative, so this must be run from the
# directory that contains yield_ft_ds_with_cluster.py.
deepspeed "${launcher_args[@]}" yield_ft_ds_with_cluster.py "${train_args[@]}"