# Serve the Qwen2.5-VL-32B-Instruct checkpoint with `vllm serve`, sharded over
# eight GPUs with tensor parallelism, and mirror the server output to LOG_FILE.

# Local checkpoint directory handed to `vllm serve`.
MODEL_PATH=/data/model/models/Qwen2.5-VL-32B-Instruct
# Name passed to --served-model-name.
MODEL_NAME="Qwen2.5-VL-32B-Instruct"
# Port passed to --port.
PORT=8501
# Destination of the tee'd server output (truncated on every launch).
LOG_FILE=log_files/qwen_vl.log

# tee cannot create missing parent directories; without this the log is lost
# while the server keeps running.
mkdir -p "$(dirname "$LOG_FILE")" || exit 1

# By default a pipeline reports the exit status of its last command (tee),
# which hides a failed `vllm serve`. Enable pipefail when the running shell
# supports it; the probe runs in a subshell so shells without the option
# (e.g. older dash) simply skip it instead of aborting.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# To pass an API key, add `--api-key <key>` to the option list below.
# NOTE(review): --use-v2-block-manager and the `image=20` form of
# --limit-mm-per-prompt may be deprecated or changed in newer vLLM releases;
# confirm against the installed version.
# stderr is merged into stdout so error output is recorded in the log as well.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 vllm serve "${MODEL_PATH}" \
  --tensor-parallel-size 8 \
  --port "${PORT}" \
  --served-model-name "${MODEL_NAME}" \
  --cpu-offload-gb 0 \
  --swap-space 50 \
  --gpu-memory-utilization 0.96 \
  --max-model-len 65536 \
  --max-num-seqs 32 \
  --use-v2-block-manager \
  --limit-mm-per-prompt "image=20" \
  2>&1 | tee "${LOG_FILE}"
