#!/bin/bash

# Load conda environment and move into the project checkout.
# Each step is checked explicitly: if conda cannot be initialised, the "rllm"
# env cannot be activated, or the project directory is missing, the job stops
# here instead of continuing with the wrong interpreter or working directory.
source /data/user/miniconda3/etc/profile.d/conda.sh || {
  echo "ERROR: cannot source /data/user/miniconda3/etc/profile.d/conda.sh" >&2
  exit 1
}
conda activate rllm || {
  echo "ERROR: cannot activate conda environment 'rllm'" >&2
  exit 1
}
cd /data/user/rllm || {
  echo "ERROR: cannot cd to /data/user/rllm" >&2
  exit 1
}

# Pull in the project's .env file (HF token, etc.).
# While "allexport" is on, every variable assigned by the sourced file is
# automatically exported, so child processes started later inherit it.
set -o allexport
source /data/user/rllm/.env
set +o allexport

# From here on, echo each command to stderr before it runs.
set -o xtrace

# Report the short hostname and the GPU list of every allocated task;
# --label prefixes each output line with the task number.
srun --label bash -c 'echo "Node: $(hostname -s)"; nvidia-smi -L'

# --- vLLM / torch env
# Drop the ROCR/ROCM/HIP device-visibility variables if they are set
# (presumably so that only the CUDA device selection applies -- confirm).
unset ROCR_VISIBLE_DEVICES \
  ROCM_VISIBLE_DEVICES \
  HIP_VISIBLE_DEVICES

# Runtime settings exported to the server process started later in this script.
export \
  CUDA_DEVICE_ORDER=PCI_BUS_ID \
  PYTORCH_CUDA_ALLOC_CONF='expandable_segments:False' \
  VLLM_ATTENTION_BACKEND=FLASH_ATTN \
  VLLM_USE_V1=1 \
  VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
  VLLM_ENGINE_ITERATION_TIMEOUT_S=1000000000

# ------------------------------
# Config (override via sbatch --export=ALL,VAR=...)
# ------------------------------
# Each setting keeps a value already present in the environment and falls
# back to the default shown here only when it is unset or empty.
: "${MODEL_PATH:=Qwen/Qwen2.5-Coder-7B-Instruct}"
: "${SERVED_MODEL_NAME:=Qwen/Qwen2.5-Coder-7B-Instruct}"
: "${BIND_HOST:=0.0.0.0}"
: "${PORT:=30001}"

# ------------------------------
# Launch vLLM OpenAI-compatible server
# ------------------------------
# This provides an OpenAI-style endpoint at:
#   http://<node>:${PORT}/v1
# and the training script should set:
#   rllm.workflow.workflow_args.solver_base_url=http://<node>:${PORT}/v1
#   rllm.workflow.workflow_args.solver_model=${SERVED_MODEL_NAME}
#
# GPU selection (override via sbatch --export=ALL,VAR=...):
#   GPU_IDS             comma-separated device indices passed to the server
#                       as CUDA_VISIBLE_DEVICES (default: 0,1,2,3)
#   DATA_PARALLEL_SIZE  value for --data-parallel-size
#                       (default: number of entries in GPU_IDS)
GPU_IDS=${GPU_IDS:-"0,1,2,3"}

# Count the listed devices so the replica count follows the GPU list unless
# DATA_PARALLEL_SIZE is given explicitly.
IFS=',' read -r -a gpu_id_list <<< "$GPU_IDS"
DATA_PARALLEL_SIZE=${DATA_PARALLEL_SIZE:-${#gpu_id_list[@]}}

export CUDA_VISIBLE_DEVICES="$GPU_IDS"

# exec replaces this shell with the server process, so signals delivered to
# the batch script reach vLLM directly and its exit status becomes the job's.
exec vllm serve "$MODEL_PATH" \
  --host "$BIND_HOST" \
  --port "$PORT" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --data-parallel-size "$DATA_PARALLEL_SIZE"