#!/bin/bash
#
# Starts a vLLM decode instance (KV-cache consumer) and the disaggregated
# prefill proxy frontend on this node, waits for the HTTP endpoint to accept
# connections, runs one benchmark script against it, then tears everything
# down and cancels the enclosing Slurm job.
#
# Usage:
#   <this script> <prefill_host_ip> <model> <output_filename> <benchmark_name>
#
# Arguments:
#   $1  exported as VLLM_HOST_IP to the decode instance and as PREFILL_HOST
#       to the frontend
#   $2  model passed to vLLM (--model) and to the benchmark script
#   $3  file name (inside ./last_results) receiving the benchmark's stdout
#   $4  selects the benchmark script: launch_benchmark_<name>.sh
#
# Environment:
#   PORT_OFFSET   optional integer added to every port used here; an unset
#                 value evaluates to 0 inside $(( ))
#   SLURM_JOB_ID  job to cancel when finished; skipped when unset
#
# Exit status: the benchmark's exit status, 1 if the server never became
# reachable or the output folder could not be created, 2 on a usage error.

# Cancel the surrounding Slurm job, if any. Guarded so that running the
# script outside Slurm does not invoke scancel without a job id.
release_allocation() {
  if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    scancel "$SLURM_JOB_ID"
  fi
}

if (( $# < 4 )); then
  printf 'Usage: %s <prefill_host_ip> <model> <output_filename> <benchmark_name>\n' \
    "${0##*/}" >&2
  # Still release the allocation: the script's normal path always ends with
  # scancel, and other job steps may be waiting on this one.
  release_allocation
  exit 2
fi

prefill_host=$1
model=$2
output_filename=$3
benchmark_name=$4

source ~/.bashrc
conda activate vllm-disagg

# Record which node and GPUs this run used.
hostname
nvidia-smi

export VLLM_ENGINE_ITERATION_TIMEOUT_S=600
export VLLM_RPC_TIMEOUT=100000

# NOTE(review): presumably read by the proxy frontend started below; confirm.
export FRONTEND_TCPSTORE_PORT=$((52345+PORT_OFFSET))

# Create the results folder before starting any server so a failure here
# does not leave background processes behind.
output_folder=last_results
if ! mkdir -p -- "$output_folder"; then
  printf 'error: cannot create output folder %s\n' "$output_folder" >&2
  release_allocation
  exit 1
fi

# launch decode instance
VLLM_HOST_IP="$prefill_host" \
OUTLINES_CACHE_DIR=/tmp/outlines \
VLLM_PORT=$((12345+PORT_OFFSET)) \
VLLM_DISTRIBUTED_KV_ROLE=consumer \
CUDA_VISIBLE_DEVICES=0 \
python3 \
	-m vllm.entrypoints.openai.api_server \
	--model "$model" \
	--port $((8200+PORT_OFFSET)) \
	-tp 1 \
	--max-model-len 15000 \
	--enable-chunked-prefill \
	--max-num-batched-tokens 512 \
	--disable-log-stats \
	--disable-log-requests \
	--gpu-memory-utilization 0.85 &
decode_pid=$!

# launch frontend
PREFILL_HOST="$prefill_host" \
DECODE_HOST=localhost \
python disagg_prefill_proxy_server_store_null.py &
frontend_pid=$!

# wait for server
# NOTE(review): port 8000+PORT_OFFSET differs from the decode instance's
# 8200+PORT_OFFSET; presumably this is the frontend's listen port - confirm.
# The port is computed here and interpolated into the child shell's command
# string, so the child does not need PORT_OFFSET in its environment.
ready_port=$((8000+PORT_OFFSET))
timeout 1200 bash -c "
until curl -s localhost:${ready_port}/v1/completions > /dev/null; do
        sleep 1
    done"
wait_status=$?

set -x
if (( wait_status == 0 )); then
  bash "launch_benchmark_${benchmark_name}.sh" "$model" > "$output_folder/$output_filename"
  status=$?
else
  # Do not benchmark an endpoint that never accepted a connection.
  printf 'error: server on port %s not reachable within 1200s (status %s)\n' \
    "$ready_port" "$wait_status" >&2
  status=1
fi

sleep 10
# Stop the two servers started above. The frontend runs as "python", which
# the "python3" pattern below does not match, so it is signalled by PID.
# Errors are silenced on purpose: either process may already have exited.
kill "$decode_pid" "$frontend_pid" 2>/dev/null
# Sweep any remaining python3 processes (e.g. children of the decode server).
pkill -f python3
sleep 30
release_allocation
exit "$status"

