import os
import subprocess
import time
from pathlib import Path


def start_vllm_server(port, gpu_ids):
    """
    Start a vLLM server process on the specified port.

    The server's stdout and stderr are appended to
    ``vllm_<port>.out.log`` and ``vllm_<port>.err.log`` inside the
    ``logging/omni_math_rm_server`` directory located two levels above
    this file's directory.

    Args:
        port: Port passed to the server via ``--port``; it is also embedded
            in the served model name and in the log file names.
        gpu_ids: String assigned to ``CUDA_VISIBLE_DEVICES`` in the child's
            environment (e.g. ``"2,3"``).

    Returns:
        The ``subprocess.Popen`` object for the launched server.
    """

    # Create logs directory if it doesn't exist. The same absolute directory
    # is used for opening the log files below, so the function works
    # regardless of the current working directory.
    log_dir = Path(__file__).parents[2] / "logging" / "omni_math_rm_server"
    log_dir.mkdir(parents=True, exist_ok=True)

    # Copy current environment variables and set CUDA_VISIBLE_DEVICES
    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = gpu_ids

    cmd = [
        "python",
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        "Qwen/Qwen2.5-Math-RM-72B",
        "--trust-remote-code",
        "--served-model-name",
        f"Qwen2.5-Math-RM-72B-port{port}",
        "--port",
        str(port),
        "--tensor-parallel-size",
        "2",
    ]
    print(f"Starting vLLM server on port {port} with CUDA_VISIBLE_DEVICES={gpu_ids}")

    # Open log files in append mode. The child process inherits its own
    # copies of the descriptors, so the parent's handles are closed as soon
    # as Popen returns; otherwise every restart would leak two open files.
    with open(log_dir / f"vllm_{port}.out.log", "a") as stdout_log, open(
        log_dir / f"vllm_{port}.err.log", "a"
    ) as stderr_log:
        return subprocess.Popen(cmd, stdout=stdout_log, stderr=stderr_log, env=env)


def is_process_running(proc):
    """
    Report whether a subprocess is still alive.

    Polls ``proc`` once: a result of ``None`` means the process has not
    exited yet, so True is returned; any other result yields False.
    """
    exit_status = proc.poll()
    still_alive = exit_status is None
    return still_alive


def main():
    """
    Launch the vLLM servers and restart any that exit, until interrupted.

    Every 10 seconds each server process is polled; a process that is no
    longer running is started again with the same port and GPU assignment.
    On Ctrl-C the function returns normally after stopping the servers. Any
    other exception is reported and re-raised after the servers are stopped,
    so the failure is visible to the caller / exit status.
    """
    # (port, CUDA_VISIBLE_DEVICES) for every server to supervise.
    # TODO: the server on port 8080 (GPUs "0,1") is currently disabled.
    servers = [
        # (8080, "0,1"),
        (8081, "2,3"),
        (8082, "4,5"),
        (8083, "6,7"),
    ]
    # port -> Popen of the most recently started process for that port.
    procs = {}

    try:
        # 1. Start the server processes. This happens inside the try block
        # so that a failure while launching a later server still stops the
        # ones that were already started.
        for port, gpu_ids in servers:
            procs[port] = start_vllm_server(port=port, gpu_ids=gpu_ids)

        while True:
            # 2. Check whether any server has crashed and restart it
            for port, gpu_ids in servers:
                if not is_process_running(procs[port]):
                    print(f"Server on port {port} has crashed. Restarting...")
                    procs[port] = start_vllm_server(port=port, gpu_ids=gpu_ids)

            # Wait 10 seconds before checking again
            time.sleep(10)

    except KeyboardInterrupt:
        print("User stopping vLLM servers...")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        raise
    finally:
        # Ask every server to stop first so they shut down in parallel...
        for proc in procs.values():
            proc.terminate()
        # ...then reap each one, force-killing any that ignore the request.
        for proc in procs.values():
            try:
                proc.wait(timeout=30)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()


# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
