import sys
import subprocess
import time
import requests
import json


def start_vllm_server(
    model_to_serve_name: str,
    served_model_name: str,
    max_model_len: int = 8192,
    tensor_parallel_size: int = 2,
    port: int = 8000,
) -> subprocess.Popen:
    """
    Start a vLLM OpenAI-compatible API server and block until it responds.

    The server is launched as a child process bound to 127.0.0.1 and its
    ``/v1/models`` endpoint is polled until it returns HTTP 200.

    Args:
        model_to_serve_name: Value passed to ``--model``.
        served_model_name: Value passed to ``--served-model-name``.
        max_model_len: Value passed to ``--max-model-len``.
        tensor_parallel_size: Value passed to ``--tensor-parallel-size``.
        port: TCP port the server listens on and that is polled for readiness.

    Returns:
        Process object for the running server.

    Raises:
        SystemExit: With exit code 1 if the server process exits during
            start-up or does not become ready within the retry budget.
    """
    # fmt: off
    vllm_server_process = subprocess.Popen(
        [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", model_to_serve_name,
            "--host", "127.0.0.1",
            "--port", str(port),
            "--served-model-name", served_model_name,
            "--tensor-parallel-size", str(tensor_parallel_size),
            "--gpu-memory-utilization", "0.9",
            "--max-model-len", str(max_model_len),
            "--trust-remote-code"
        ],
        stdout=sys.stdout,  # Redirect to console to see logs
        stderr=sys.stderr,  # Redirect to console to see logs
    )
    # fmt: on

    # Wait for the vLLM server to be ready
    print("Waiting for vLLM server to initialize...")
    models_url = f"http://127.0.0.1:{port}/v1/models"
    server_ready = False
    max_retries = 30  # Maximum number of attempts
    retry_interval = 10  # Seconds between retries
    request_timeout = 5  # Seconds before a single readiness probe is abandoned

    for attempt in range(max_retries):
        # Fail fast if the child already died (e.g. bad model name, OOM)
        # instead of polling a dead process for the whole retry budget.
        exit_code = vllm_server_process.poll()
        if exit_code is not None:
            print(
                f"Error: vLLM server process exited with code {exit_code} during startup"
            )
            sys.exit(1)

        try:
            # Probe the model listing endpoint; a timeout keeps a hung socket
            # from blocking this loop forever.
            response = requests.get(models_url, timeout=request_timeout)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Server is not accepting/answering requests yet; retry below.
            response = None

        if response is not None and response.status_code == 200:
            models = response.json()
            print(
                f"vLLM server is ready! Available models: {json.dumps(models, indent=2)}"
            )
            server_ready = True
            break

        print(
            f"Waiting for vLLM server to initialize (attempt {attempt + 1}/{max_retries})..."
        )
        # No point sleeping after the final attempt; report the failure now.
        if attempt + 1 < max_retries:
            time.sleep(retry_interval)

    if not server_ready:
        print("Error: vLLM server did not start successfully after maximum retries")
        _stop_process(vllm_server_process)
        sys.exit(1)

    return vllm_server_process


def _stop_process(process: subprocess.Popen, grace_period: float = 10.0) -> None:
    """Terminate *process*, escalating to kill if it ignores the request, and reap it."""
    process.terminate()
    try:
        process.wait(timeout=grace_period)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()
