#!/bin/bash
#
# Launch the vLLM OpenAI-compatible API server
# (python module vllm.entrypoints.openai.api_server) for a single model.
#
# Usage: <this-script> [MODEL]
#   MODEL  value passed to --model (default: Qwen/Qwen2-7B)

set -euo pipefail

# Alternative model: meta-llama/Meta-Llama-3.1-8B-Instruct
model="${1:-Qwen/Qwen2-7B}"

# The arguments live in an array so that every value stays a single word
# regardless of its content, and so that optional flags can be switched on
# or off by (un)commenting one line, with no backslash continuations to
# keep in sync.
server_args=(
  --model "$model"
  --port 8000
  -tp 1
  --max-model-len 10000
  --enable-chunked-prefill
  --max-num-batched-tokens 512
  --max-num-seqs 512
  --disable-log-stats
  --disable-log-requests
  --gpu-memory-utilization 0.9
  # Optional flags, disabled by default:
  # --distributed-executor-backend ray
  # --enforce-eager
)

python3 -m vllm.entrypoints.openai.api_server "${server_args[@]}"
