#!/bin/bash

# export VLLM_LOGGING_LEVEL=DEBUG


# Launch a vLLM OpenAI-compatible API server for the model named below.
#
# Environment set for the server process only (prefix assignments):
#   VLLM_PORT=12345                    - NOTE(review): presumably vLLM's internal
#                                        communication port, distinct from the
#                                        HTTP port below; confirm against vLLM docs.
#   VLLM_DISTRIBUTED_KV_ROLE=producer  - NOTE(review): presumably marks this
#                                        instance as the KV-cache producer in a
#                                        disaggregated setup; confirm.
#   CUDA_VISIBLE_DEVICES=0             - expose only GPU 0 to the process.
#
# Server flags: HTTP API on port 8100, tensor parallel size 1, context capped
# at 10000 tokens, at most one sequence at a time, eager mode enforced, stats
# and request logging disabled, GPU memory utilization set to 0.6.
model="meta-llama/Meta-Llama-3.1-8B-Instruct"

# "$model" is quoted so the value reaches --model as a single argument even if
# it is later changed to a local path containing spaces or glob characters.
VLLM_PORT=12345 \
VLLM_DISTRIBUTED_KV_ROLE=producer \
CUDA_VISIBLE_DEVICES=0 \
python3 \
	-m vllm.entrypoints.openai.api_server \
	--model "$model" \
	--port 8100 \
	-tp 1 \
	--max-model-len 10000 \
	--max-num-seqs 1 \
	--enforce-eager \
	--disable-log-stats \
	--disable-log-requests \
	--gpu-memory-utilization 0.6
