#!/bin/bash
# Start Question-to-Code vLLM services with Sleep Mode for GPU time-sharing
#
# In time-sharing mode, all GPUs are used by both Trainer and vLLM services.
# vLLM services start in sleep mode and are woken up only when needed.

# ---- Positional arguments ----
#   $1  model path, forwarded to each server via --model_path
#   $2  run id (stored here; not referenced again in this script)
#   $3  GPU memory utilization, forwarded via --gpu_mem_util (defaults to 0.8)
model_path="$1"
run_id="$2"
gpu_mem_util="${3:-0.8}"

# Exported so that every child process started by this script inherits it.
VLLM_DISABLE_COMPILE_CACHE=1
export VLLM_DISABLE_COMPILE_CACHE

# ============ Time-Sharing Mode Configuration ============
# Service i listens on BASE_PORT + i.
BASE_PORT=6000
# One service is started per GPU; the count comes from TOTAL_GPU_COUNT and
# falls back to 8 when that variable is unset or empty.
N_SERVICES="${TOTAL_GPU_COUNT:-8}"

echo "=========================================="
echo "[vLLM-Code] Time-Sharing Mode Enabled"
echo "=========================================="

# ============ Clean up old processes ============
# Before starting new services, free every port this script is about to use
# (BASE_PORT .. BASE_PORT + N_SERVICES - 1).
echo "[vLLM-Code] Cleaning up old processes..."
for ((i = 0; i < N_SERVICES; i++)); do
    port=$((BASE_PORT + i))
    # Select only processes LISTENING on the TCP port. A bare `lsof -ti:PORT`
    # also matches every process holding a connection whose local or remote
    # port equals PORT (e.g. a client of the service), and those must not be
    # killed. lsof errors (including lsof being absent) are discarded on
    # purpose: this cleanup is best-effort.
    stale_pids=$(lsof -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null)
    if [[ -n "$stale_pids" ]]; then
        echo "[vLLM-Code] Killing old process on port $port (PID: ${stale_pids//$'\n'/ })"
        # Intentionally unquoted: lsof prints one PID per line and each PID
        # must reach kill as a separate argument.
        # shellcheck disable=SC2086
        kill -9 $stale_pids 2>/dev/null
    fi
done
# Give the killed processes time to exit completely.
sleep 3

# Extra cleanup: kill every remaining start_vllm_server_code.py process,
# whether or not it currently holds one of the ports above. The dot is escaped
# because pkill treats the pattern as a regular expression. A non-zero status
# (no process matched) is expected and ignored.
echo "[vLLM-Code] Cleaning up any remaining vllm_server_code processes..."
pkill -9 -f 'start_vllm_server_code\.py' 2>/dev/null || true
sleep 2

# Fail fast when no model path was supplied: without it every server launch
# below would fail, and the script would still wait for them to come up.
if [[ -z "$model_path" ]]; then
    echo "[vLLM-Code] ERROR: model path (first argument) is required" >&2
    exit 2
fi

echo "[vLLM-Code] Starting $N_SERVICES code generation services (will sleep immediately)"
echo "[vLLM-Code] Model path: $model_path"
echo "[vLLM-Code] GPU range: 0 to $((N_SERVICES - 1))"
echo "[vLLM-Code] GPU memory utilization: $gpu_mem_util"
echo "=========================================="

# Start one vLLM service per GPU: service i is pinned to GPU i through
# CUDA_VISIBLE_DEVICES and listens on BASE_PORT + i. Each server runs in the
# background so that all of them load concurrently.
for ((i = 0; i < N_SERVICES; i++)); do
    gpu_id=$i
    port=$((BASE_PORT + i))
    echo "[vLLM-Code] Starting service on GPU $gpu_id, port $port"
    # Arguments are quoted so that a model path containing spaces or glob
    # characters reaches the server as a single, unmodified argument.
    CUDA_VISIBLE_DEVICES="$gpu_id" python vllm_service_init/start_vllm_server_code.py \
        --port "$port" \
        --model_path "$model_path" \
        --gpu_mem_util "$gpu_mem_util" &
done

# Wait a fixed 60 seconds for the background services to initialize before
# probing them.
echo "[vLLM-Code] Waiting for services to initialize and enter sleep mode..."
sleep 60  # Allow time for model loading and sleep

# Probe the /is_sleeping endpoint of every service and count the ones that
# do not answer successfully.
echo "[vLLM-Code] Checking service health..."
not_ready=0
for ((i = 0; i < N_SERVICES; i++)); do
    port=$((BASE_PORT + i))
    # -f: treat HTTP error statuses (>= 400) as failure; with plain -s curl
    #     exits 0 for any HTTP response, so a failing server looked "ready".
    # --max-time: bound each probe so that a server which accepts the
    #     connection but never responds cannot hang this script.
    if curl -sf --max-time 10 "http://127.0.0.1:$port/is_sleeping" > /dev/null 2>&1; then
        echo "[vLLM-Code] Service on port $port is ready"
    else
        echo "[vLLM-Code] WARNING: Service on port $port may not be ready"
        not_ready=$((not_ready + 1))
    fi
done

# Only announce success when every probe succeeded; otherwise report how many
# services failed the check. The exit status stays 0 in both cases, so callers
# that do not inspect the output behave as before.
echo "=========================================="
if ((not_ready == 0)); then
    echo "[vLLM-Code] All $N_SERVICES code generation services started (in sleep mode)"
    echo "[vLLM-Code] GPU memory has been released for Trainer use"
else
    echo "[vLLM-Code] WARNING: $not_ready of $N_SERVICES code generation services did not pass the health check"
fi
echo "=========================================="
