#!/usr/bin/env bash
# Starts `vllm serve` inside detached tmux sessions, one session per entry of
# the port -> GPU table defined further down in this script.
#
# Needs bash >= 4 (associative arrays), tmux, and nvidia-smi on PATH.

# Strict mode:
#   -e        abort as soon as a command fails
#   -u        treat expansion of an unset variable as an error
#   pipefail  a pipeline fails if any stage fails, not only the last one
#             (so a failing GPU probe is no longer masked by `wc`)
set -euo pipefail


# Value for --served-model-name; also printed in the final status message.
model_name="qwen3-8b"
# Model path handed to `vllm serve` as its positional argument.
model_path="/path/to/your/project/LLMs/Qwen3-8B"

# Rope-scaling settings passed to vLLM as a JSON string.
# factor 4.0 x 32768 original positions = 131072, the --max_model_len below.
rope_scaling='{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}'

# The finished command is typed into a tmux pane and parsed a second time by
# the shell running there, so the user-editable values are shell-escaped up
# front; otherwise a path containing spaces or metacharacters would be split
# or expanded. (%q emits bash-style escaping -- assumes the shell inside tmux
# accepts it; the default values need no escaping at all.)
printf -v model_path_q '%q' "$model_path"
printf -v model_name_q '%q' "$model_name"

# One option per line. The JSON is wrapped in single quotes so the receiving
# shell keeps it as a single word.
serve_args=(
    vllm serve "$model_path_q"
    --tensor-parallel-size 8
    --dtype bfloat16
    --max_model_len 131072
    --disable-log-requests
    --served-model-name "$model_name_q"
    --gpu-memory-utilization 0.8
    --rope-scaling "'${rope_scaling}'"
)

# Join the pieces with single spaces (independent of IFS) and drop the
# trailing separator.
printf -v command '%s ' "${serve_args[@]}"
command="${command% }"


# Number of lines printed by `nvidia-smi --list-gpus`.
# NOTE(review): num_gpus is not read anywhere in the visible part of this
# script -- confirm whether something else depends on it.
num_gpus="$(nvidia-smi --list-gpus | wc -l)"

# Table: port -> comma-separated GPU list. The key is used both as the tmux
# session name and as the --port value; the value is exported as
# CUDA_VISIBLE_DEVICES inside that session.
declare -A port_idx=(
    [8001]="0,1,2,3,4,5,6,7"
)

# Launch one detached tmux session per configured port. The session is named
# after the port. The GPU selection is exported in the pane first, and the
# serve command is then typed into the same pane so it runs with that
# CUDA_VISIBLE_DEVICES value.
# Every expansion is quoted so keys and values are passed to tmux as single,
# unglobbed words.
for port in "${!port_idx[@]}"; do
    command_cuda="export CUDA_VISIBLE_DEVICES=${port_idx[$port]}"
    command_port="$command --port $port"
    tmux new-session -d -s "$port"
    tmux send-keys -t "$port" "$command_cuda" ENTER
    tmux send-keys -t "$port" "$command_port" ENTER
done

# Report the port of every session that was launched. This runs right after
# the commands were typed into tmux; nothing here waits for or checks that
# the server is actually accepting requests.
for port in "${!port_idx[@]}"; do
    printf '%s server started at port %s\n' "$model_name" "$port"
done
