#!/usr/bin/env bash
#
# Launch `vllm serve` for a model with the hermes tool-call parser, automatic
# tool choice, and a custom chat template.
#
# Usage:
#   <script> <model> <cuda_visible_devices> <num_gpu> <port> [max_model_len]
#
# Arguments:
#   $1  model name or path, passed to `vllm serve`
#   $2  value exported as CUDA_VISIBLE_DEVICES (e.g. "0,1")
#   $3  tensor-parallel size (positive integer)
#   $4  port the server listens on
#   $5  optional --max-model-len (e.g. 64000, required for 72b)
#
# The chat template path is relative to the current working directory, so the
# script must be started from the directory that contains ./src.
#
# NOTE: `set -u` / `set -e` are deliberately not enabled globally; conda
# activation hooks may reference unset variables or return non-zero from
# internal commands. Critical steps are checked explicitly instead.

usage() {
    printf 'Usage: %s <model> <cuda_visible_devices> <num_gpu> <port> [max_model_len]\n' \
        "${0##*/}" >&2
    exit 2
}

die() {
    printf 'error: %s\n' "$*" >&2
    exit 1
}

(( $# >= 4 )) || usage

model=$1
gpu_ids=$2
num_gpu=$3
port=$4
max_model_len=${5:-}  # e.g. 64000, required for 72b

chat_template=./src/chat_template_qwen/25vl_tool_server.jinja
max_image=70
max_video=0

# Validate up front: an empty CUDA_VISIBLE_DEVICES would hide every GPU, and
# empty/garbled numeric options only fail later inside vllm.
[[ -n "$model" ]] || die "model (argument 1) must not be empty"
[[ -n "$gpu_ids" ]] || die "cuda_visible_devices (argument 2) must not be empty"
[[ "$num_gpu" =~ ^[1-9][0-9]*$ ]] \
    || die "num_gpu (argument 3) must be a positive integer, got '$num_gpu'"
[[ "$port" =~ ^[0-9]+$ ]] \
    || die "port (argument 4) must be an integer, got '$port'"
if [[ -n "$max_model_len" ]]; then
    [[ "$max_model_len" =~ ^[1-9][0-9]*$ ]] \
        || die "max_model_len (argument 5) must be a positive integer, got '$max_model_len'"
fi
[[ -f "$chat_template" ]] \
    || die "chat template not found: $chat_template (path is relative to the current directory: $PWD)"

export CUDA_VISIBLE_DEVICES="$gpu_ids"

# Capture the hook separately so a failing `conda` is not masked by `eval`.
command -v conda >/dev/null || die "conda not found on PATH"
conda_hook=$(conda shell.bash hook) || die "failed to obtain the conda shell hook"
eval "$conda_hook"
conda activate qwen25-0508 || die "failed to activate conda environment 'qwen25-0508'"

command -v vllm >/dev/null || die "vllm not found in conda environment 'qwen25-0508'"

serve_args=(
    --port "$port"
    --host 0.0.0.0
    --dtype bfloat16
    --limit-mm-per-prompt "image=${max_image},video=${max_video}"
    --tool-call-parser hermes
    --enable-auto-tool-choice
    --tensor-parallel-size "$num_gpu"
    --chat-template "$chat_template"
)
if [[ -n "$max_model_len" ]]; then
    serve_args+=(--max-model-len "$max_model_len")
fi

# exec replaces this shell so signals sent to the script's PID (e.g. SIGTERM
# from a supervisor) reach the server directly instead of orphaning it.
exec vllm serve "$model" "${serve_args[@]}"
