import os
import subprocess
import time
from pathlib import Path


def start_vllm_server(port, gpu_ids):
    """
    Start a vLLM OpenAI-compatible API server process on the specified port.

    The server's stdout and stderr are appended to
    ``<logging_dir>/omni_math_judge_server/vllm_<port>.out.log`` and
    ``vllm_<port>.err.log``, where ``<logging_dir>`` is the ``logging``
    directory located two levels above the directory containing this file.

    Args:
        port: TCP port the server listens on; also used in the served model
            name and in the log file names.
        gpu_ids: Value assigned to ``CUDA_VISIBLE_DEVICES`` for the child
            process (e.g. ``"0"`` or ``"0,1"``). Converted with ``str()``.

    Returns:
        The ``subprocess.Popen`` object for the launched server.
    """

    # Create logs directory if it doesn't exist. The location is anchored to
    # this file so it does not depend on the current working directory.
    logging_dir = Path(__file__).parents[2] / "logging"
    server_log_dir = logging_dir / "omni_math_judge_server"
    server_log_dir.mkdir(parents=True, exist_ok=True)

    # Copy current environment variables and set CUDA_VISIBLE_DEVICES.
    # Environment values must be strings, so coerce in case an int was given.
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = str(gpu_ids)

    cmd = [
        "python",
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        "KbsdJames/Omni-Judge",
        "--trust-remote-code",
        "--enable-prefix-caching",
        "--served-model-name",
        f"Omni-Judge-port{port}",
        "--port",
        str(port),
        "--tensor-parallel-size",
        "1",
    ]
    print(f"Starting vLLM server on port {port} with CUDA_VISIBLE_DEVICES={gpu_ids}")

    # Open the log files in append mode inside the directory created above.
    # The child receives its own copies of the descriptors, so the parent's
    # handles are closed as soon as Popen returns; otherwise every restart
    # would leak two open files in the supervising process.
    stdout_path = server_log_dir / f"vllm_{port}.out.log"
    stderr_path = server_log_dir / f"vllm_{port}.err.log"
    with open(stdout_path, "a") as stdout_log, open(stderr_path, "a") as stderr_log:
        return subprocess.Popen(cmd, stdout=stdout_log, stderr=stderr_log, env=env)


def is_process_running(proc):
    """
    Report whether a subprocess is still alive.

    Polls ``proc`` once: a poll result of ``None`` means no exit code is
    available yet, so the process is considered running and True is
    returned. Any other poll result yields False.
    """
    exit_code = proc.poll()
    if exit_code is None:
        return True
    return False


def main():
    """
    Launch the vLLM judge servers and keep them running until interrupted.

    One server is started per entry in ``server_specs``. Every 10 seconds
    each server is polled and restarted if its process has exited. On
    Ctrl-C, or on an unexpected error in the monitoring loop, all servers
    are terminated and reaped before the function returns. Errors raised
    while starting the initial servers propagate to the caller, but any
    server already started is still shut down first.
    """
    # (port, gpu_ids) for every server that should be kept alive.
    server_specs = [(8090, "0"), (8091, "1")]
    # Maps port -> the most recently started Popen for that port.
    procs = {}

    try:
        # 1. Start one server process per spec
        for port, gpu_ids in server_specs:
            procs[port] = start_vllm_server(port=port, gpu_ids=gpu_ids)

        try:
            while True:
                # 2. Restart any server whose process has exited
                for port, gpu_ids in server_specs:
                    if not is_process_running(procs[port]):
                        print(f"Server on port {port} has crashed. Restarting...")
                        procs[port] = start_vllm_server(port=port, gpu_ids=gpu_ids)

                # Wait 10 seconds before checking again
                time.sleep(10)
        except KeyboardInterrupt:
            print("User stopping vLLM servers...")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
    finally:
        # Signal every server first so they shut down in parallel ...
        for proc in procs.values():
            proc.terminate()
        # ... then reap each one, force-killing any that ignores the
        # termination request so no server process is left behind.
        for proc in procs.values():
            try:
                proc.wait(timeout=30)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()

if __name__ == "__main__":
    main()
