#!/usr/bin/env bash
#
# Start `vllm serve` for one locally stored model, selected by the first
# command-line argument. Every variant listens on 0.0.0.0:8080.
#
# Usage:  <this-script> <model-name>
# Models: llama3.1:8b | deepseek-ai/DeepSeek-R1-Distill-Llama-8B |
#         qwen2.5:32b | qwen3:32b | qwen3:0.6b | qwen3:8b | qwen3:30b-a3b
# Exit:   1 when the model name is missing or not recognised; otherwise the
#         exit status of `vllm serve`.

# Root directory that holds the model checkpoints.
MODEL_DIR=/hdd/zwj/models

# Human-readable list used in the "unsupported model" diagnostic; keep it in
# sync with the `case` arms below.
SUPPORTED_MODELS="llama3.1:8b, deepseek-ai/DeepSeek-R1-Distill-Llama-8B, qwen2.5:32b, qwen3:32b, qwen3:0.6b, qwen3:8b, qwen3:30b-a3b"

# Activate the .venv virtual environment found in the current working
# directory, if there is one. A missing .venv is only a warning.
if [ -d ".venv" ]; then
    # `.` is the portable spelling of `source`.
    . .venv/bin/activate
    echo "已激活当前目录下的 .venv 虚拟环境"
else
    echo "警告：当前目录下未找到 .venv 虚拟环境" >&2
fi

# NOTE(review): only one GPU index is exposed here, yet several branches
# below request --tensor-parallel-size 2 — confirm those branches can start
# with this setting.
export CUDA_VISIBLE_DEVICES="1"

if [ -z "$1" ]; then
    echo "Error: No argument provided. Please specify a model name." >&2
    exit 1
fi

# Launch a Qwen checkpoint with the flag set shared by all Qwen variants.
#   $1   - checkpoint directory, relative to MODEL_DIR
#   $2   - value for --tensor-parallel-size
#   $3   - value for --served-model-name
#   $4.. - optional extra vllm flags, inserted before --host/--port
# Returns the exit status of `vllm serve`.
_serve_qwen() {
    _qwen_path=$1
    _qwen_tp=$2
    _qwen_name=$3
    shift 3
    vllm serve "$MODEL_DIR/$_qwen_path" \
        --trust-remote-code \
        --device cuda --dtype auto \
        --tensor-parallel-size "$_qwen_tp" \
        --max_model_len 16384 \
        --served-model-name "$_qwen_name" \
        --enable-auto-tool-choice \
        --tool-call-parser hermes \
        "$@" \
        --host 0.0.0.0 --port 8080
}

# Dispatch on the requested model. A single `case` guarantees that exactly one
# server is started, that the script's exit status is the one `vllm serve`
# returned, and that an unknown name is reported instead of silently ignored.
case "$1" in
    "llama3.1:8b")
        # The chat template is given as a relative path, so the script is
        # presumably meant to be run from the directory that contains it.
        vllm serve "$MODEL_DIR/meta-llama/Meta-Llama-3.1-8B-Instruct" \
            --trust-remote-code \
            --device cuda --dtype auto \
            --max_model_len 16384 \
            --served-model-name "$1" \
            --enable-auto-tool-choice \
            --tool-call-parser llama3_json \
            --chat-template tool_chat_template_llama3.1_json.jinja \
            --host 0.0.0.0 --port 8080
        ;;
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
        # The served model name is a fixed string here, not the CLI argument.
        vllm serve "$MODEL_DIR/deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
            --enable-reasoning --reasoning-parser deepseek_r1 \
            --device auto --gpu-memory-utilization 0.9 --tensor-parallel-size 1 --dtype half \
            --enforce-eager --served-model-name deepSeek-r1-distill-llama-8b \
            --max_model_len 16384 \
            --host 0.0.0.0 --port 8080
        ;;
    "qwen2.5:32b")
        _serve_qwen "Qwen/Qwen2.5-32B-Instruct" 2 "$1"
        ;;
    "qwen3:32b")
        _serve_qwen "Qwen/Qwen3-32B" 2 "$1"
        ;;
    "qwen3:0.6b")
        _serve_qwen "Qwen/Qwen3-0.6B" 1 "$1"
        ;;
    "qwen3:8b")
        _serve_qwen "Qwen/Qwen3-8B" 1 "$1"
        ;;
    "qwen3:30b-a3b")
        # Same as the other Qwen variants, plus the reasoning flags.
        _serve_qwen "Qwen/Qwen3-30B-A3B" 2 "$1" \
            --enable-reasoning \
            --reasoning-parser deepseek_r1
        ;;
    *)
        printf 'Error: Unsupported model name: %s\n' "$1" >&2
        printf 'Supported models: %s\n' "$SUPPORTED_MODELS" >&2
        exit 1
        ;;
esac