#!/bin/bash

# Bring conda into this shell, then switch to the project environment.
source /data/home/the/anaconda3/bin/activate
# Based on how train.sh uses this command, 'openr1' is assumed to be the
# correct conda environment.
conda activate openr1

# --- Server identity and logging -------------------------------------------
SERVICE_NAME='Solution Collection vLLM Server'
LOG_FILE='vllm_collect_server.log'

# --- Networking and devices --------------------------------------------------
# Assumes the trl vllm-serve command listens on port 8000 by default.
# Change PORT here if the service uses a different port.
PORT='8000'
# Comma-separated device indices used for the server.
GPU_ID='1,2,3,4,5,6'

# --- Model locations ---------------------------------------------------------
# Base model path used as the initial value of MODEL_PATH.
MODEL_PATH='/data/home/the/models/DeepSeek-R1-Distill-Qwen-1.5B'
# First positional argument of the script.
MODEL_DIR="$1"

#######################################
# Block until an HTTP service answers on localhost, or abort the script.
#
# Sends a HEAD request to http://localhost:<port><path> and retries every
# 5 seconds. On failure the function calls `exit 1` (not `return`), so the
# whole script stops instead of continuing with a server that never came up.
#
# Globals:
#   LOG_FILE (read) - only mentioned in the error messages.
# Arguments:
#   $1 - port to probe on localhost.
#   $2 - human-readable service name used in messages.
#   $3 - PID of the server process to watch while waiting.
#   $4 - optional URL path appended after the port (default: empty, i.e. the
#        root URL is probed exactly as before).
#        NOTE(review): if the server does not answer HEAD on the root URL,
#        pass its health-check path here - confirm against the server routes.
# Outputs:
#   Progress and success messages on stdout; error diagnostics on stderr.
# Returns:
#   0 once the probe succeeds; exits the script with status 1 otherwise.
#######################################
wait_for_service() {
    local port_to_check=$1
    local s_name=$2
    local pid_to_watch=$3
    local url_path=${4:-}
    # 60 attempts * 5 seconds of sleep = 300 seconds (5 minutes) of waiting,
    # plus at most probe_timeout seconds spent inside each probe.
    local max_retries=60
    local probe_timeout=5
    local attempt_num=0
    local url="http://localhost:${port_to_check}${url_path}"

    if [[ -z "$port_to_check" || -z "$pid_to_watch" ]]; then
        echo "Error: wait_for_service requires a port and a PID." >&2
        exit 1
    fi

    echo "Waiting for $s_name (PID: $pid_to_watch) on port $port_to_check to be ready..."
    # --max-time bounds every probe: without it a connection that is accepted
    # but never answered would hang curl and defeat the retry budget.
    while ! curl --output /dev/null --silent --head --fail \
        --max-time "$probe_timeout" "$url"; do
        if (( attempt_num >= max_retries )); then
            {
                echo "Error: $s_name on port $port_to_check did not start within the timeout period (${max_retries} attempts)."
                echo "Check the log file: ${LOG_FILE}"
                echo "To clean up, you might need to manually kill PID: $pid_to_watch."
            } >&2
            exit 1 # abort the script when the service fails to start
        fi

        # Stop early if the background process itself has already terminated.
        if ! ps -p "$pid_to_watch" > /dev/null; then
            {
                echo "Error: $s_name (PID: $pid_to_watch) process died before service became ready on port $port_to_check."
                echo "Check the log file: ${LOG_FILE}"
            } >&2
            exit 1 # abort the script when the process is gone
        fi

        printf '.'
        sleep 5
        attempt_num=$((attempt_num + 1))
    done
    echo "" # finish the line of progress dots
    echo "$s_name on port $port_to_check is ready."
}

# Ask the project helper for the most recent checkpoint under MODEL_DIR.
# The directory is passed as an argument (sys.argv[1]) instead of being pasted
# into the Python source, so quotes or backslashes in the path cannot break or
# alter the snippet. The import requires `src` to be importable, e.g. by
# running this script from the repository root.
if ! CHECKPOINT_PATH=$(python -c 'import sys; from src.open_r1.check_last_checkpoint import get_last_checkpoint; print(get_last_checkpoint(sys.argv[1]))' "${MODEL_DIR}"); then
  echo "Error: failed to look up the last checkpoint in '${MODEL_DIR}'." >&2
  exit 1
fi

# An empty result would otherwise pass the "None" comparison below and launch
# the server with an empty --model value, so treat it as a hard error.
if [[ -z "${CHECKPOINT_PATH}" ]]; then
  echo "Error: checkpoint lookup for '${MODEL_DIR}' produced no output." >&2
  exit 1
fi

# The literal output "None" is treated as "no checkpoint found": fall back to
# the base model in that case, otherwise serve the checkpoint.
if [[ "${CHECKPOINT_PATH}" != "None" ]]; then
  MODEL_PATH=${CHECKPOINT_PATH}
else
  MODEL_PATH="/data/home/the/models/DeepSeek-R1-Distill-Qwen-1.5B"
fi
echo "==================================================="
echo "Load model from ${MODEL_PATH}"
echo "==================================================="

# Start the server in the background, detached from the terminal, with stdout
# and stderr captured in LOG_FILE.
# NOTE(review): --data_parallel_size 6 presumably mirrors the six devices in
# GPU_ID; keep the two in sync if either changes.
CUDA_VISIBLE_DEVICES="${GPU_ID}" nohup trl vllm-serve --model "${MODEL_PATH}" --data_parallel_size 6 > "${LOG_FILE}" 2>&1 &
SERVER_PID=$!
echo "$SERVICE_NAME launched with PID: ${SERVER_PID}"

# Readiness wait is currently disabled; uncomment one of the lines below to
# block until the server is up (active probe) or for a fixed delay.
# wait_for_service "${PORT}" "$SERVICE_NAME" "${SERVER_PID}"
# sleep 240
