#!/bin/bash
#
# Positional arguments read by this script:
#   $1  passed to the proxy as HOST2
#   $2  model, given to the API server's --model and to the benchmark script
#   $3  file name the benchmark output is written to
#   $4  benchmark name, selects launch_benchmark_<name>.sh

# Load the user's shell configuration, then switch to the conda environment.
. ~/.bashrc
conda activate vllm-disagg

# Record which node we are on and what the GPUs look like.
hostname
nvidia-smi

# Capture the positional arguments under descriptive names.
benchmark_name="$4"
output_file="$3"
model="$2"

# Ports used below. PORT_OFFSET is taken from the environment; inside $(( ))
# an unset variable evaluates to 0, so the bases 8300/8400 are used as-is then.
decode_port=$((8300 + PORT_OFFSET))
second_port=$((8400 + PORT_OFFSET))

# launch decode instance
OUTLINES_CACHE_DIR=/tmp/outlines1 \
CUDA_VISIBLE_DEVICES=0 \
python3 \
	-m vllm.entrypoints.openai.api_server \
	--model "$model" \
	--port "$decode_port" \
	-tp 1 \
	--max-model-len 15000 \
	--enable-chunked-prefill \
	--max-num-batched-tokens 512 \
	--disable-log-stats \
	--disable-log-requests \
	--gpu-memory-utilization 0.9 &
# Remember the server's PID so teardown can target it precisely.
decode_pid=$!

# launch frontend
HOST1=localhost \
PORT1="$decode_port" \
HOST2="$1" \
PORT2="$second_port" \
python weighted_round_robin_proxy.py &
proxy_pid=$!

output_folder=last_results
mkdir -p "$output_folder"

# Stop the two background processes started above (and their direct
# children), give them time to exit, then cancel the SLURM job.
# Only the recorded PIDs are signalled, so unrelated Python processes on the
# node are left alone and the proxy (started as "python", not "python3") is
# stopped as well.
shutdown_and_cancel() {
	local pid
	for pid in "$decode_pid" "$proxy_pid"; do
		# Either process may already be gone; that is not an error here.
		pkill -P "$pid" 2> /dev/null
		kill "$pid" 2> /dev/null
	done
	sleep 30
	# Without a job id there is nothing to cancel.
	if [[ -n "${SLURM_JOB_ID:-}" ]]; then
		scancel "$SLURM_JOB_ID"
	fi
}

# wait for server
# Poll until the decode server accepts HTTP connections, for at most 1200 s.
# The loop also gives up early when the server process has died, instead of
# polling a port that will never open. Port and PID are passed as arguments
# so the inner script needs no interpolation.
if ! timeout 1200 bash -c '
until curl -s "localhost:$1/v1/completions" > /dev/null; do
        kill -0 "$2" 2> /dev/null || exit 1
        sleep 1
    done' _ "$decode_port" "$decode_pid"; then
	echo "decode server on port ${decode_port} did not become ready; skipping benchmark" >&2
	shutdown_and_cancel
	exit 1
fi
sleep 10

set -x
bash "launch_benchmark_${benchmark_name}.sh" "$model" > "${output_folder}/${output_file}"

sleep 10
shutdown_and_cancel
