#!/bin/bash
#
# Start a vLLM OpenAI-compatible API server in the KV "consumer" (decode) role
# plus the disaggregated-prefill proxy frontend, wait for the frontend port to
# answer, run one benchmark script against it, then tear everything down.
#
# Usage: <this script> <host> <model> <output_filename> <benchmark_name>
#   $1  host; exported as VLLM_HOST_IP to the decode server and as
#       PREFILL_HOST to the proxy
#   $2  model passed to the API server via --model (and to the benchmark)
#   $3  file name under last_results/ that receives the benchmark's stdout
#   $4  benchmark name; selects launch_benchmark_<name>.sh
#
# Environment:
#   PORT_OFFSET                optional integer added to every port used here
#                              (treated as 0 when unset)
#   SLURM_JOB_ID               when set, the job is cancelled after teardown
#   DECODE_DEVICE, MODEL_NAME  only echoed for logging
#
# NOTE: strict mode (set -euo pipefail) is intentionally not enabled because
# this script sources ~/.bashrc and activates a conda environment.

if (( $# < 4 )); then
  printf 'Usage: %s <host> <model> <output_filename> <benchmark_name>\n' \
    "${0##*/}" >&2
  exit 2
fi

host_ip=$1
model=$2
#model="meta-llama/Meta-Llama-3.1-8B-Instruct"
#model="meta-llama/Llama-3.2-3B"
output_filename=$3
benchmark_name=$4

# Unset PORT_OFFSET behaves like 0, matching shell arithmetic on unset names.
port_offset=${PORT_OFFSET:-0}
frontend_port=$((8000 + port_offset))
output_folder=last_results

source ~/.bashrc
conda activate vllm-disagg

hostname
nvidia-smi

#export VLLM_LOGGING_LEVEL=DEBUG

echo "begin env vars"
printf '%s\n' "$DECODE_DEVICE"
printf '%s\n' "$MODEL_NAME"
echo "end env vars"

export VLLM_ENGINE_ITERATION_TIMEOUT_S=600
export VLLM_RPC_TIMEOUT=100000
export FRONTEND_TCPSTORE_PORT=$((52345 + port_offset))

# Stop the background services and release the SLURM allocation.
# Globals: decode_pid, proxy_pid, SLURM_JOB_ID (all read)
teardown() {
  sleep 10
  # The proxy is launched as `python`, so its command line does not contain
  # "python3" and the pkill sweep below would leave it running; signal the
  # tracked PIDs explicitly. Errors are ignored because the processes may
  # already have exited.
  kill "$decode_pid" "$proxy_pid" 2>/dev/null
  # Sweep remaining python3 processes (e.g. helpers spawned by the server).
  # NOTE(review): this pattern matches every python3 process this user may
  # signal on the node, not only the ones started here.
  pkill -f python3
  sleep 10
  if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    scancel "$SLURM_JOB_ID"
  fi
}

# launch decode instance
VLLM_HOST_IP="$host_ip" \
OUTLINES_CACHE_DIR=/tmp/outlines \
VLLM_PORT=$((12345 + port_offset)) \
VLLM_DISTRIBUTED_KV_ROLE=consumer \
CUDA_VISIBLE_DEVICES=0 \
python3 \
	-m vllm.entrypoints.openai.api_server \
	--model "$model" \
	--port $((8200 + port_offset)) \
	-tp 1 \
	--max-model-len 15000 \
	--enable-chunked-prefill \
	--max-num-batched-tokens 512 \
	--disable-log-stats \
	--disable-log-requests \
	--gpu-memory-utilization 0.85 &
decode_pid=$!

# launch frontend
PREFILL_HOST="$host_ip" \
DECODE_HOST=localhost \
python disagg_prefill_proxy_server_store.py &
proxy_pid=$!

mkdir -p "$output_folder"

# wait for server: poll the frontend port once per second, for at most 1200 s.
# The port is handed to the inner shell as $1 instead of being spliced into
# the command string.
if ! timeout 1200 bash -c '
    until curl -s "localhost:$1/v1/completions" > /dev/null; do
        sleep 1
    done' _ "$frontend_port"; then
  # Without this check the benchmark would run against a dead endpoint.
  printf 'error: frontend on port %s did not come up within 1200 s\n' \
    "$frontend_port" >&2
  teardown
  exit 1
fi

set -x
bash "launch_benchmark_${benchmark_name}.sh" "$model" \
  > "$output_folder/$output_filename"

teardown

