#!/bin/bash
#
# Launch a vLLM OpenAI-compatible API server on GPU 0 with
# VLLM_DISTRIBUTED_KV_ROLE=producer.
# NOTE(review): presumably the prefill side of a disaggregated setup (the
# conda env is named "vllm-disagg") -- confirm against the paired launcher.
#
# Usage: <this script> MODEL
#
# Arguments:
#   $1           value forwarded to the server's --model option (required)
#
# Environment:
#   PORT_OFFSET  optional integer added to both base ports (default: 0):
#                VLLM_PORT = 12345 + PORT_OFFSET,
#                --port    = 8100  + PORT_OFFSET.

# Fail fast with a clear message instead of letting the server's argument
# parser choke on "--model --port ..." when the model is missing.
if [[ $# -lt 1 || -z "$1" ]]; then
	printf 'Usage: %s MODEL\n' "${0##*/}" >&2
	exit 2
fi
model=$1

# Make the offset's default explicit; an unset/empty value behaves as 0.
port_offset=${PORT_OFFSET:-0}

# Strict mode (set -eu) is intentionally not enabled: ~/.bashrc and
# "conda activate" are outside this script's control.
source ~/.bashrc
conda activate vllm-disagg

# Record which host and GPUs this run is using.
hostname
nvidia-smi

export VLLM_ENGINE_ITERATION_TIMEOUT_S=600

# The variables below are set only for the server process. The server runs in
# the foreground, so this script blocks here until it exits.
HOST_IP=localhost \
OUTLINES_CACHE_DIR=/tmp/outlines \
VLLM_PORT=$((12345 + port_offset)) \
VLLM_DISTRIBUTED_KV_ROLE=producer \
CUDA_VISIBLE_DEVICES=0 \
python3 \
	-m vllm.entrypoints.openai.api_server \
	--model "$model" \
	--port $((8100 + port_offset)) \
	-tp 1 \
	--max-model-len 8000 \
	--max-num-seqs 1 \
	--enforce-eager \
	--disable-log-stats \
	--disable-log-requests \
	--gpu-memory-utilization 0.8
	# Alternative setting kept for reference:
	#--gpu-memory-utilization 0.6

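# Disabled: wait (up to 1200 s) for the API server to answer, then log GPU
# utilization once per second to prefill_util.csv.
# NOTE: the server above runs in the foreground, so if these lines are simply
# uncommented they will only execute after the server has exited; the server
# would have to be started in the background (&) first.
#timeout 1200 bash -c "
#until curl -s localhost:$((8100+PORT_OFFSET))/v1/completions > /dev/null; do
#        sleep 1
#    done"
#nvidia-smi --query-gpu=timestamp,utilization.gpu,utilization.memory --format=csv -l 1 -f prefill_util.csv 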

