#!/bin/bash
#
# Batch script that prepares the `rllm` conda environment, loads secrets from
# .env, and then (further down in this file) starts a vLLM server.

# Strict mode: abort on errors, on unset variables, and on failed pipelines.
set -euo pipefail

# Load conda environment.
# nounset is relaxed only while conda's shell hook and the environment's
# activation scripts run: they may read variables that are not set, which
# would otherwise abort the job with an "unbound variable" error.
set +u
source /data/user/miniconda3/etc/profile.d/conda.sh
conda activate rllm
set -u
cd /data/user/rllm

# Load env vars (HF token, etc.)
# allexport (-a) marks every variable assigned by .env as exported, so child
# processes inherit them; it is switched off again right after.
set -a
. /data/user/rllm/.env
set +a

# Trace every command from this point on. The .env file was sourced earlier,
# so its assignments are not echoed into the job log.
set -x

# Print GPU info: short hostname plus the list of NVIDIA devices, one report
# per task started by srun (-l prefixes each output line with the task number).
srun -l bash -c 'echo "Node: $(hostname -s)"; nvidia-smi -L'

# --- vLLM / torch env
# Clear the AMD/ROCm device-visibility variables, one at a time.
for amd_visibility_var in ROCR_VISIBLE_DEVICES ROCM_VISIBLE_DEVICES HIP_VISIBLE_DEVICES; do
  unset "$amd_visibility_var"
done
unset amd_visibility_var

# Assign the settings first, then export them together so the server process
# launched later inherits all of them.
VLLM_ATTENTION_BACKEND=FLASH_ATTN
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:False"
VLLM_USE_V1=1
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
VLLM_ENGINE_ITERATION_TIMEOUT_S=1000000000
CUDA_DEVICE_ORDER=PCI_BUS_ID
export VLLM_ATTENTION_BACKEND \
  PYTORCH_CUDA_ALLOC_CONF \
  VLLM_USE_V1 \
  VLLM_ALLOW_LONG_MAX_MODEL_LEN \
  VLLM_ENGINE_ITERATION_TIMEOUT_S \
  CUDA_DEVICE_ORDER

# ------------------------------
# Config (override via sbatch --export=ALL,VAR=...)
# ------------------------------
# Model to serve (HF repo id or local path)
MODEL_PATH=${MODEL_PATH:-"openai/gpt-oss-20b"}
# MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2.5-Coder-7B-Instruct"}
# Name exposed by the server (this is what clients pass as `model=...`)
SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-20b"}
# SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2.5-Coder-7B-Instruct"}
HOST=${HOST:-"0.0.0.0"}
PORT=${PORT:-30000}
# Tensor-parallel degree passed to --tensor-parallel-size.
TP=${TP:-1}
# Comma-separated GPU indices handed to the server via CUDA_VISIBLE_DEVICES.
# When empty, the first TP devices (0..TP-1) are used, so the default TP=1
# still pins the server to device 0.
GPU_IDS=${GPU_IDS:-}

#######################################
# Build the default device list for a tensor-parallel degree.
# Arguments: $1 - tensor-parallel size; must be a positive integer
# Outputs:   "0,1,...,N-1" on stdout; an error message on stderr if invalid
# Returns:   0 on success, 1 when $1 is not a positive integer
#######################################
gpu_ids_for_tp() {
  local tp=$1
  local ids=""
  local i
  local positive_int='^[1-9][0-9]*$'

  if [[ ! "$tp" =~ $positive_int ]]; then
    printf 'TP must be a positive integer, got: %s\n' "$tp" >&2
    return 1
  fi

  for ((i = 0; i < tp; i++)); do
    # Prepend a comma to every index except the first.
    ids+="${ids:+,}${i}"
  done
  printf '%s\n' "$ids"
}

# Expose as many GPUs as the tensor-parallel degree needs; a single hard-coded
# device would leave any TP > 1 without enough visible GPUs.
if [[ -z "$GPU_IDS" ]]; then
  GPU_IDS=$(gpu_ids_for_tp "$TP") || exit 1
fi

# ------------------------------
# Launch vLLM OpenAI-compatible server
# ------------------------------
CUDA_VISIBLE_DEVICES="$GPU_IDS" vllm serve "$MODEL_PATH" \
  --host "$HOST" \
  --port "$PORT" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --tensor-parallel-size "$TP"
