#!/bin/bash

# Load the user's shell configuration, then switch to the conda environment
# named vllm-disagg.
# NOTE(review): presumably ~/.bashrc is what makes `conda` available in this
# non-interactive shell -- confirm on the target machines.
source ~/.bashrc
if ! conda activate vllm-disagg; then
  # Do not abort: the script has always continued past a failed activation.
  # Surface the failure so that a server started under the wrong python3 can
  # be diagnosed from the log.
  printf 'warning: could not activate conda environment vllm-disagg\n' >&2
fi

# Record the host name and the nvidia-smi report in the run output.
hostname
nvidia-smi

# Start the vllm.entrypoints.openai.api_server module with chunked prefill
# enabled, restricted to CUDA device 0.
#
# Arguments:
#   $1           value passed to --model (required).
# Environment:
#   PORT_OFFSET  optional integer added to the base port 8400; an unset or
#                empty value counts as 0 in the arithmetic expansion below.
if [[ -z "${1:-}" ]]; then
  printf 'Usage: %s MODEL\n' "${0##*/}" >&2
  exit 2
fi
model=$1

# The two assignments prefixing python3 are exported to that process only.
# "$model" is quoted so that a value containing spaces or glob characters
# reaches --model as a single, unmodified argument.
OUTLINES_CACHE_DIR=/tmp/outlines1 \
CUDA_VISIBLE_DEVICES=0 \
python3 \
  -m vllm.entrypoints.openai.api_server \
  --model "$model" \
  --port "$((8400 + PORT_OFFSET))" \
  -tp 1 \
  --max-model-len 10000 \
  --enable-chunked-prefill \
  --max-num-batched-tokens 256 \
  --disable-log-stats \
  --disable-log-requests \
  --gpu-memory-utilization 0.85
# Disabled option kept for reference:
#   --max-num-seqs 1
