#!/bin/bash
# Start Solver vLLM services with Sleep Mode for GPU time-sharing
#
# In time-sharing mode, all GPUs are used by both Trainer and vLLM services.
# vLLM services start in sleep mode and are woken up only when needed.

# ---- Positional arguments ----
#   $1  model path, stored in model_path
#   $2  stored in run_id
#   $3  GPU memory utilization, stored in gpu_mem_util
#       (falls back to 0.8, the time-sharing default, when omitted or empty)
model_path="$1"
run_id="$2"
if [ -n "$3" ]; then
    gpu_mem_util="$3"
else
    gpu_mem_util="0.8"
fi

# Exported so that processes started from this script see it in their environment.
VLLM_DISABLE_COMPILE_CACHE=1
export VLLM_DISABLE_COMPILE_CACHE

# ============ Time-Sharing Mode Configuration ============
# Service i listens on BASE_PORT + i. The number of services is taken from
# TOTAL_GPU_COUNT (8 when that variable is unset or empty): every GPU hosts a
# vLLM service, which sleeps while it is not in use.
BASE_PORT=5000
N_SERVICES="${TOTAL_GPU_COUNT:-8}"

# Opening banner: rule, title, rule.
banner_rule="=========================================="
printf '%s\n' \
    "$banner_rule" \
    "[vLLM-Solver] Time-Sharing Mode Enabled" \
    "$banner_rule"

# ============ Clean up old processes ============
# Before starting new services, free every service port (BASE_PORT + i for
# each of the N_SERVICES services) by killing whatever still listens on it.
echo "[vLLM-Solver] Cleaning up old processes..."
killed_any=0
for ((i = 0; i < N_SERVICES; i++)); do
    port=$((BASE_PORT + i))
    # Select TCP listeners only. A bare `-i:PORT` also reports processes that
    # merely hold a connection involving that port (e.g. a client talking to
    # the service), and those must not be killed.
    # lsof prints nothing when no process matches, leaving the array empty.
    mapfile -t stale_pids < <(lsof -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null)
    if ((${#stale_pids[@]} > 0)); then
        echo "[vLLM-Solver] Killing old process on port $port (PID: ${stale_pids[*]})"
        # Several PIDs can share one listening socket; signal all of them.
        # Errors are silenced because a PID may already have exited.
        kill -9 "${stale_pids[@]}" 2>/dev/null
        killed_any=1
    fi
done
# Give the killed processes time to exit completely; nothing to wait for
# when no process was killed.
if ((killed_any)); then
    sleep 3
fi

# Extra cleanup: kill leftover Solver vLLM processes by command line.
# Note: processes running start_vllm_server_code.py must NOT be killed, so
# every match below requires the exact file name "start_vllm_server.py".
echo "[vLLM-Solver] Cleaning up any remaining vllm_server processes..."
# The escaped dot makes the regex require ".py" directly after "server",
# which excludes start_vllm_server_code.py. pkill returns non-zero when
# nothing matched; that is expected, hence `|| true`.
pkill -9 -f "start_vllm_server\.py" 2>/dev/null || true
# Second pass: inspect the full command line of each remaining candidate.
for pid in $(pgrep -f "start_vllm_server" 2>/dev/null); do
    # /proc/<pid>/cmdline is NUL-separated; flatten it to one line.
    # 2>/dev/null comes first so that a vanished process (missing file)
    # produces no error message.
    cmdline=$(tr '\0' ' ' 2>/dev/null < "/proc/$pid/cmdline")
    # Literal substring tests: the file name must appear exactly, and the
    # command line must not mention the _code variant.
    if [[ "$cmdline" == *"start_vllm_server.py"* \
        && "$cmdline" != *"start_vllm_server_code.py"* ]]; then
        echo "[vLLM-Solver] Killing stale Solver process (PID: $pid)"
        kill -9 "$pid" 2>/dev/null
    fi
done
# Short pause so killed processes can finish exiting.
sleep 2

# Refuse to launch without a model path: an empty value would only start
# N_SERVICES servers that cannot load anything.
if [[ -z "$model_path" ]]; then
    echo "[vLLM-Solver] ERROR: model path (first argument) is required" >&2
    exit 1
fi

echo "[vLLM-Solver] Starting $N_SERVICES solver services (will sleep immediately)"
echo "[vLLM-Solver] Model path: $model_path"
echo "[vLLM-Solver] GPU range: 0 to $((N_SERVICES - 1))"
echo "[vLLM-Solver] GPU memory utilization: $gpu_mem_util"
echo "=========================================="

# Start one vLLM service per GPU, each will enter sleep mode after loading (default).
# Service i is pinned to GPU i via CUDA_VISIBLE_DEVICES and listens on
# BASE_PORT + i. Each server runs in the background so all of them load in
# parallel.
for ((i = 0; i < N_SERVICES; i++)); do
    gpu_id=$i
    port=$((BASE_PORT + i))
    echo "[vLLM-Solver] Starting service on GPU $gpu_id, port $port"
    # Arguments are quoted so a model path containing spaces or glob
    # characters reaches the server as a single, unmodified argument.
    CUDA_VISIBLE_DEVICES="$gpu_id" python vllm_service_init/start_vllm_server.py \
        --port "$port" \
        --model_path "$model_path" \
        --gpu_mem_util "$gpu_mem_util" &
done

# Wait for services to initialize and enter sleep mode.
# Note: vLLM takes a while to load a model (torch.compile ~20s + model loading
# + sleep ~10s), so wait long enough for every service to reach the sleep state.
echo "[vLLM-Solver] Waiting for services to initialize and enter sleep mode..."
sleep 120  # Allow time for model loading (including torch.compile) and sleep

# Verify services are running by probing each service's /is_sleeping endpoint.
echo "[vLLM-Solver] Checking service health..."
not_ready=0
for ((i = 0; i < N_SERVICES; i++)); do
    port=$((BASE_PORT + i))
    # --max-time bounds each probe so a port that accepts the connection but
    # never answers cannot hang the script.
    # NOTE(review): any HTTP response counts as "ready" here; consider adding
    # --fail once the endpoint's status codes are confirmed.
    if curl -s --max-time 10 "http://127.0.0.1:$port/is_sleeping" > /dev/null 2>&1; then
        echo "[vLLM-Solver] Service on port $port is ready"
    else
        echo "[vLLM-Solver] WARNING: Service on port $port may not be ready"
        not_ready=$((not_ready + 1))
    fi
done

# Summary: only claim success when every probe succeeded. The exit status is
# deliberately left unchanged (0) so existing callers keep working.
echo "=========================================="
if ((not_ready == 0)); then
    echo "[vLLM-Solver] All $N_SERVICES services started (in sleep mode)"
    echo "[vLLM-Solver] GPU memory has been released for Trainer use"
else
    echo "[vLLM-Solver] WARNING: $not_ready of $N_SERVICES services did not respond"
    echo "[vLLM-Solver] GPU memory may not have been released for Trainer use"
fi
echo "=========================================="
