#!/bin/bash
#
# Start a Ray head node and a vLLM OpenAI-compatible API server, run one
# benchmark script against that server, then tear everything down.
#
# Usage:
#   <this script> PP_LAYER_PARTITION RAY_PORT MODEL OUTPUT_FILENAME BENCHMARK_NAME
#
#   PP_LAYER_PARTITION  exported as VLLM_PP_LAYER_PARTITION for the server
#   RAY_PORT            port given to `ray start --head`
#   MODEL               passed to the server's --model and to the benchmark
#   OUTPUT_FILENAME     benchmark stdout goes to last_results/OUTPUT_FILENAME
#   BENCHMARK_NAME      selects launch_benchmark_<BENCHMARK_NAME>.sh, looked up
#                       relative to the current working directory
#
# Environment:
#   PORT_OFFSET   added to 8000 to obtain the server port (0 when unset)
#   SLURM_JOB_ID  when set, the job is cancelled with scancel during teardown
#
# Exit status: the benchmark script's status, 1 on a setup failure, 2 on a
# usage error.
#
# Strict mode (set -euo pipefail) is intentionally not enabled: ~/.bashrc and
# the conda activation hooks are outside this script's control, so failures
# that matter are checked explicitly instead.

source ~/.bashrc
conda activate vllm-pip6

# Record which node and GPUs this run landed on.
hostname
nvidia-smi

readonly READY_TIMEOUT_S=1200
readonly OUTPUT_FOLDER=last_results

server_pid=
ray_started=0

# Print an error message to stderr and exit with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Release everything this script started. Installed as the EXIT trap so it
# runs on the success path and on every failure path alike.
teardown() {
  if [[ -n "$server_pid" ]]; then
    kill "$server_pid" 2> /dev/null
  fi
  if (( ray_started )); then
    sleep 10
    ray stop
    sleep 10
  fi
  # Outside Slurm there is no job to cancel; a bare `scancel` would only fail.
  if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    scancel "$SLURM_JOB_ID"
  fi
}
trap teardown EXIT

# Poll a URL once per second until the server answers.
# Arguments: $1 - URL to probe
#            $2 - PID of the server process
#            $3 - maximum number of seconds to wait
# Returns:   0 once curl gets a response, 1 if the server process is gone,
#            2 if the deadline passed.
wait_for_server() {
  local url=$1 pid=$2 timeout_s=$3
  local deadline=$(( SECONDS + timeout_s ))
  # Without -f, curl exits 0 on any HTTP response, so an error status from
  # the endpoint still counts as "server is up".
  until curl -s --max-time 10 "$url" > /dev/null; do
    # Stop early when the server has died instead of waiting out the timeout.
    kill -0 "$pid" 2> /dev/null || return 1
    (( SECONDS < deadline )) || return 2
    sleep 1
  done
  return 0
}

if (( $# < 5 )); then
  printf 'usage: %s PP_LAYER_PARTITION RAY_PORT MODEL OUTPUT_FILENAME BENCHMARK_NAME\n' \
    "${0##*/}" >&2
  exit 2
fi

export VLLM_PP_LAYER_PARTITION=$1
ray_port=$2
model=$3
output_filename=$4
benchmark_name=$5
server_port=$(( 8000 + PORT_OFFSET ))

# Mark Ray as started before the call so a partial start is still cleaned up.
ray_started=1
ray start --head --port="$ray_port" || die "ray start failed on port $ray_port"

sleep 10

# -tp / -pp are vLLM's short flags for tensor- and pipeline-parallel size.
server_args=(
  --model "$model"
  -tp 1
  -pp 2
  --port "$server_port"
  --max-model-len 15000
  --enable-chunked-prefill
  --max-num-batched-tokens 512
  --disable-log-stats
  --disable-log-requests
  --gpu-memory-utilization 0.9
)
OUTLINES_CACHE_DIR=/tmp/outlines \
  python3 -m vllm.entrypoints.openai.api_server "${server_args[@]}" &
server_pid=$!

wait_for_server "localhost:${server_port}/v1/completions" \
  "$server_pid" "$READY_TIMEOUT_S"
case $? in
  0) ;;
  1) die "vLLM server exited before becoming ready" ;;
  *) die "vLLM server not ready after ${READY_TIMEOUT_S}s" ;;
esac

mkdir -p "$OUTPUT_FOLDER" || die "cannot create $OUTPUT_FOLDER"

# Trace the benchmark invocation and the teardown that follows.
set -x
bash "launch_benchmark_${benchmark_name}.sh" "$model" \
  > "$OUTPUT_FOLDER/$output_filename"
benchmark_status=$?

# The EXIT trap performs the teardown; report the benchmark's own status.
exit "$benchmark_status"


