#!/bin/bash


# ---------------------------------------------------------------------------
# Environment handed to the server process (exported so child processes
# started by this script inherit the values).
# NOTE(review): "/xxx" looks like a placeholder cache path — confirm the real
# location before deploying.
# ---------------------------------------------------------------------------
HF_HUB_CACHE='/xxx'
HF_HUB_OFFLINE='1'
VLLM_USE_MODELSCOPE='0'
export HF_HUB_CACHE HF_HUB_OFFLINE VLLM_USE_MODELSCOPE

# ---------------------------------------------------------------------------
# Launch settings. These are plain shell variables (not exported); each one is
# forwarded to the matching vLLM command-line flag further down the script.
# ---------------------------------------------------------------------------
MODEL_PATH='Qwen/Qwen3-8B'   # --model
MODEL_NAME='qwen3-8b'        # --served-model-name
TP='8'                       # --tensor-parallel-size
HOST='0.0.0.0'               # --host
PORT='23333'                 # --port
MAX_MODEL_LEN='40960'        # --max-model-len
GPU_MEM_UTILIZATION='0.9'    # --gpu-memory-utilization

# Choose the first address printed by `hostname -I` whose text starts with
# "10." or "192."; if nothing matches (or the command yields no output),
# fall back to the literal name "localhost".
# NOTE(review): LOCAL_IP is not referenced anywhere else in the visible part
# of this script — confirm whether it is still needed.
LOCAL_IP=$(
  hostname -I \
    | awk '
        {
          for (col = 1; col <= NF; col++) {
            if ($col ~ /^10\./ || $col ~ /^192\./) {
              print $col
              exit
            }
          }
        }'
)
LOCAL_IP=${LOCAL_IP:-localhost}

# Start the vLLM OpenAI-compatible API server using the settings defined
# earlier in this script (MODEL_PATH, TP, HOST, PORT, MAX_MODEL_LEN,
# GPU_MEM_UTILIZATION, MODEL_NAME).
#
# The arguments are collected in an array and every expansion is quoted, so
# a value containing whitespace or glob characters (for example a local model
# directory) reaches the server as one unmodified argument, and an empty value
# is passed as an empty argument instead of silently vanishing and shifting
# the next flag into its place.
server_args=(
  --model "$MODEL_PATH"
  --tensor-parallel-size "$TP"
  --host "$HOST"
  --port "$PORT"
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEM_UTILIZATION"
  --served-model-name "$MODEL_NAME"
  --trust-remote-code
)

python -m vllm.entrypoints.openai.api_server "${server_args[@]}"


