#!/bin/bash
# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Launch an OpenAI-compatible vLLM server for the *bug-fixer* model.
#
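# Optional overrides (env); each falls back to its default when unset or empty:
# - FIX_MODEL_PATH (default Qwen/Qwen2.5-Coder-7B-Instruct)
# - FIX_SERVED_MODEL_NAME (default: the value of FIX_MODEL_PATH)
# - FIX_HOST (default 0.0.0.0)
# - FIX_PORT (default 30001)
# - FIX_TP (default 1), passed as --tensor-parallel-size
# - FIX_CUDA_VISIBLE_DEVICES (default 1), passed to vllm as CUDA_VISIBLE_DEVICES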

# Resolve launch settings: keep any non-empty value supplied by the caller's
# environment, otherwise assign the built-in default.
: "${FIX_MODEL_PATH:=Qwen/Qwen2.5-Coder-7B-Instruct}"
# Must follow FIX_MODEL_PATH: the served name mirrors the resolved model path
# unless it is overridden explicitly.
: "${FIX_SERVED_MODEL_NAME:=$FIX_MODEL_PATH}"
: "${FIX_HOST:=0.0.0.0}"
: "${FIX_PORT:=30001}"
: "${FIX_TP:=1}"
: "${FIX_CUDA_VISIBLE_DEVICES:=1}"

# Remove the ROCm/HIP device-visibility variables from the environment.
# NOTE(review): presumably so they cannot conflict with the CUDA_VISIBLE_DEVICES
# value applied at launch — confirm.
unset ROCR_VISIBLE_DEVICES
unset ROCM_VISIBLE_DEVICES
unset HIP_VISIBLE_DEVICES

# Runtime knobs for vLLM / PyTorch / CUDA. A non-empty value already present in
# the environment wins; otherwise the default below is assigned.
: "${VLLM_ATTENTION_BACKEND:=FLASH_ATTN}"
: "${PYTORCH_CUDA_ALLOC_CONF:=expandable_segments:False}"
: "${VLLM_USE_V1:=1}"
: "${VLLM_ALLOW_LONG_MAX_MODEL_LEN:=1}"
: "${VLLM_ENGINE_ITERATION_TIMEOUT_S:=1000000000}"
: "${CUDA_DEVICE_ORDER:=PCI_BUS_ID}"

# Export everything in one go so the server process inherits the settings.
export VLLM_ATTENTION_BACKEND \
  PYTORCH_CUDA_ALLOC_CONF \
  VLLM_USE_V1 \
  VLLM_ALLOW_LONG_MAX_MODEL_LEN \
  VLLM_ENGINE_ITERATION_TIMEOUT_S \
  CUDA_DEVICE_ORDER

# Fail early with a clear message if the vLLM CLI is not installed; 127 is the
# status the shell itself would return for a missing command.
if ! command -v vllm >/dev/null 2>&1; then
  echo "[fix] ERROR: 'vllm' executable not found in PATH" >&2
  exit 127
fi

# Print the effective configuration so it shows up at the top of the logs.
echo "[fix] Starting vLLM server..."
echo "[fix] MODEL_PATH=$FIX_MODEL_PATH"
echo "[fix] SERVED_MODEL_NAME=$FIX_SERVED_MODEL_NAME"
echo "[fix] CUDA_VISIBLE_DEVICES=$FIX_CUDA_VISIBLE_DEVICES  TP=$FIX_TP  HOST=$FIX_HOST  PORT=$FIX_PORT"

# Restrict the server process to the selected GPU(s).
export CUDA_VISIBLE_DEVICES="$FIX_CUDA_VISIBLE_DEVICES"

# exec replaces this wrapper shell with the server process, so signals sent to
# the script's PID (e.g. SIGTERM from a supervisor or container runtime) reach
# vllm directly instead of stopping only the shell and orphaning the server.
# The script's exit status is therefore the server's exit status.
exec vllm serve "$FIX_MODEL_PATH" \
  --host "$FIX_HOST" \
  --port "$FIX_PORT" \
  --served-model-name "$FIX_SERVED_MODEL_NAME" \
  --tensor-parallel-size "$FIX_TP"


