#!/bin/bash

# --- Environment bootstrap ---------------------------------------------------
# Activates the conda environment, enters the repository, loads .env, and
# prints the GPUs visible to the job. Strict mode (`set -euo pipefail`) is
# only switched on at the end of this section, so it does not apply to the
# sourced conda and .env scripts; the steps the server cannot run without are
# therefore checked explicitly here.

# Print an error message on stderr and abort the script.
die() {
  printf '[base] ERROR: %s\n' "$*" >&2
  exit 1
}

# Load conda environment
source /data/user/miniconda3/etc/profile.d/conda.sh \
  || die "cannot source /data/user/miniconda3/etc/profile.d/conda.sh"
conda activate rllm2 || die "cannot activate conda environment 'rllm2'"
cd /data/user/rllm || die "cannot cd to /data/user/rllm"

# Load env vars (HF token, etc.). `set -a` exports every variable the file
# assigns. This happens before `set -x` so the values are not echoed in the
# trace. A missing file is tolerated (warning only), as before.
if [[ -r /data/user/rllm/.env ]]; then
  set -a
  . /data/user/rllm/.env
  set +a
else
  printf '[base] WARNING: %s not readable; continuing without it\n' \
    "/data/user/rllm/.env" >&2
fi

set -x

# Print GPU info. Best-effort diagnostics: a failing srun must not prevent
# the server from starting.
srun -l bash -c 'echo "Node: $(hostname -s)"; nvidia-smi -L' \
  || printf '[base] WARNING: could not list GPUs via srun\n' >&2
set -euo pipefail

# --- Server configuration ----------------------------------------------------
# Every BASE_* value may be overridden from the environment; the defaults
# below apply when the variable is unset or empty.
BASE_MODEL_PATH="${BASE_MODEL_PATH:-Qwen/Qwen2.5-Coder-7B-Instruct}"
BASE_SERVED_MODEL_NAME="${BASE_SERVED_MODEL_NAME:-$BASE_MODEL_PATH}"
BASE_HOST="${BASE_HOST:-0.0.0.0}"
BASE_PORT="${BASE_PORT:-30000}"
BASE_TP="${BASE_TP:-1}"

# Fail fast on a malformed tensor-parallel size instead of letting the server
# reject it later; the value is also used in arithmetic just below.
if [[ ! "$BASE_TP" =~ ^[1-9][0-9]*$ ]]; then
  printf '[base] ERROR: BASE_TP must be a positive integer, got "%s"\n' \
    "$BASE_TP" >&2
  exit 1
fi

# Default device list: one device per tensor-parallel rank (0..BASE_TP-1), so
# raising BASE_TP alone is enough. An explicitly provided list always wins,
# and BASE_TP=1 still yields "0".
if [[ -z "${BASE_CUDA_VISIBLE_DEVICES:-}" ]]; then
  BASE_CUDA_VISIBLE_DEVICES=0
  for ((gpu_idx = 1; gpu_idx < BASE_TP; gpu_idx++)); do
    BASE_CUDA_VISIBLE_DEVICES+=",${gpu_idx}"
  done
  unset gpu_idx
fi

# Drop any inherited ROCm/HIP device masks (presumably so that only the CUDA
# mask set at launch governs device selection; confirm for your cluster).
unset ROCR_VISIBLE_DEVICES ROCM_VISIBLE_DEVICES HIP_VISIBLE_DEVICES

# Runtime knobs exported for the server process; a value already present in
# the environment takes precedence over the default given here.
export VLLM_ATTENTION_BACKEND="${VLLM_ATTENTION_BACKEND:-FLASH_ATTN}"
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:False}"
export VLLM_USE_V1="${VLLM_USE_V1:-1}"
export VLLM_ALLOW_LONG_MAX_MODEL_LEN="${VLLM_ALLOW_LONG_MAX_MODEL_LEN:-1}"
export VLLM_ENGINE_ITERATION_TIMEOUT_S="${VLLM_ENGINE_ITERATION_TIMEOUT_S:-1000000000}"
export CUDA_DEVICE_ORDER="${CUDA_DEVICE_ORDER:-PCI_BUS_ID}"

# Announce the effective launch configuration, one line per printf argument.
printf '%s\n' \
  "[base] Starting vLLM server..." \
  "[base] MODEL_PATH=$BASE_MODEL_PATH" \
  "[base] SERVED_MODEL_NAME=$BASE_SERVED_MODEL_NAME" \
  "[base] CUDA_VISIBLE_DEVICES=$BASE_CUDA_VISIBLE_DEVICES  TP=$BASE_TP  HOST=$BASE_HOST  PORT=$BASE_PORT"

# Arguments for `vllm serve`, kept in an array so each value stays one word.
serve_args=(
  "$BASE_MODEL_PATH"
  --host "$BASE_HOST"
  --port "$BASE_PORT"
  --served-model-name "$BASE_SERVED_MODEL_NAME"
  --tensor-parallel-size "$BASE_TP"
)

# The device mask is passed only to the server process (prefix assignment),
# not exported into the rest of the script. Runs in the foreground.
CUDA_VISIBLE_DEVICES="$BASE_CUDA_VISIBLE_DEVICES" vllm serve "${serve_args[@]}"


