#!/usr/bin/env bash
#
# Launches `vllm serve` for the configured model inside detached tmux
# sessions, one session per port. Each session activates the `qwen3` conda
# environment and exports CUDA_VISIBLE_DEVICES before starting the server.
#
# Requires Bash >= 4 (associative arrays), tmux, conda and nvidia-smi.

# -e: abort on the first failing command; -u: treat unset variables as errors.
set -eu


model_name="Qwen3-8B-SLM"
model_path="/path/to/your/model"


command="vllm serve $model_path \
        --tensor-parallel-size 8 \
        --dtype float16 \
        --max_model_len 32768 \
        --disable-log-requests \
        --served-model-name $model_name \
        --gpu-memory-utilization 0.8 \
        --enable-auto-tool-choice \
        --tool-call-parser hermes \
        --reasoning-parser deepseek_r1 \
        --chat-template ./qwen3_nonthinking.jinja "


# Number of GPUs listed by nvidia-smi (it prints one line per GPU). Falls back
# to 0 if the pipeline reports failure, which disables the sanity check below.
num_gpus=$(nvidia-smi --list-gpus | wc -l) || num_gpus=0

# Server port -> comma-separated GPU indices exported as CUDA_VISIBLE_DEVICES.
# The port doubles as the tmux session name.
declare -A port_idx
port_idx=(
    ["8001"]="0,1,2,3,4,5,6,7"
)

#######################################
# Start one detached tmux session and type the server launch into it.
# Globals:
#   command  (read) - base command line; " --port <port>" is appended
#   num_gpus (read) - detected GPU count, used only for a warning
# Arguments:
#   $1 - port to serve on; also used as the tmux session name
#   $2 - comma-separated GPU indices for CUDA_VISIBLE_DEVICES
# Returns:
#   Non-zero if the session already exists or a tmux call fails.
#######################################
launch_server() {
    local port=$1 gpus=$2
    local -a gpu_list

    # Warn (do not fail) when more GPUs are requested than were detected.
    IFS=',' read -r -a gpu_list <<< "$gpus"
    if (( num_gpus > 0 && ${#gpu_list[@]} > num_gpus )); then
        printf 'warning: port %s requests %d GPUs but only %d detected\n' \
            "$port" "${#gpu_list[@]}" "$num_gpus" >&2
    fi

    # `tmux new` refuses duplicate session names; report that case clearly
    # instead of typing a second launch into a session that is already in use.
    if tmux has-session -t "=${port}" 2>/dev/null; then
        printf 'error: tmux session "%s" already exists; run: tmux kill-session -t %s\n' \
            "$port" "$port" >&2
        return 1
    fi

    tmux new -d -s "$port" || return
    # Keystrokes are executed in order by the shell inside the session:
    # activate the environment, pin the GPUs, then start the server.
    tmux send-keys -t "$port" "conda activate qwen3" ENTER || return
    tmux send-keys -t "$port" "export CUDA_VISIBLE_DEVICES=${gpus}" ENTER || return
    tmux send-keys -t "$port" "$command --port $port" ENTER || return
}

# Launch a tmux session for each port and GPU pair
for port in "${!port_idx[@]}"; do
    launch_server "$port" "${port_idx[$port]}"
done

# Optional, currently disabled: launch a tmux session for auto-holding the GPUs
# hold_command="python auto_hold.py --size 3000"
# tmux new -d -s hold
# tmux send-keys -t hold "conda activate o1" ENTER
# tmux send-keys -t hold "pip install GPUtil" ENTER
# # tmux send-keys -t hold "$hold_command" ENTER

# Print one status line per configured port. This only reports that the launch
# keystrokes were sent; it does not check that the server is actually serving.
for port in "${!port_idx[@]}"; do
    printf '%s server started at port %s\n' "$model_name" "$port"
done
