"""
VLM inference factory.

Supports different video-VLM backends without changing agent logic.

Currently supported:
- qwen: Qwen2.5-VL (default implementation)
- internvl3: InternVL3-8B (Transformers + trust_remote_code)
"""

from __future__ import annotations

from typing import Any, Dict, List, Optional, Tuple


def infer_vlm_backend_from_path(model_path: Optional[str]) -> str:
    """Guess the VLM backend name from a model path or identifier.

    The match is case-insensitive and looks at the whole path, not only the
    final component, so a checkpoint stored under an ``InternVL`` directory
    is still recognised.

    Args:
        model_path: Filesystem path or model identifier. ``None`` and the
            empty string are accepted. Path-like objects are tolerated and
            converted with ``str()``.

    Returns:
        ``"internvl3"`` if the text contains ``"internvl"``; otherwise
        ``"qwen"``, which is also the result for missing/empty input.
    """
    if not model_path:
        return "qwen"
    # A separate basename check would be redundant: the last path component
    # is always a substring of the full path.
    normalized = str(model_path).lower()
    return "internvl3" if "internvl" in normalized else "qwen"


def initialize_vlm_model_specific(model_path: str, device_map: str = "auto", backend: Optional[str] = None) -> Tuple[Any, Any]:
    """Load a VLM through the handler that matches the selected backend.

    Args:
        model_path: Path or identifier passed to the handler.
        device_map: Device placement setting passed to the handler.
        backend: Explicit backend name. When falsy, the backend is inferred
            from ``model_path`` via ``infer_vlm_backend_from_path``.

    Returns:
        Whatever the chosen handler's ``load_model()`` returns (annotated as
        a 2-tuple; presumably the model plus its processor or tokenizer --
        confirm against the handlers).
    """
    requested = backend if backend else infer_vlm_backend_from_path(model_path)
    backend_key = requested.lower().strip()

    # Anything that is not an InternVL alias is served by the Qwen handler.
    if backend_key not in ("internvl3", "internvl"):
        from .qwen_handler import QwenModelHandler

        qwen_handler = QwenModelHandler(model_path=model_path, device_map=device_map)
        return qwen_handler.load_model()

    from .internvl3_handler import InternVL3ModelHandler

    internvl_handler = InternVL3ModelHandler(model_path=model_path, device_map=device_map)
    return internvl_handler.load_model()


def initialize_vlm_model(model_path: Optional[str] = None, backend: Optional[str] = None) -> Tuple[Any, Any]:
    """Load a VLM with ``device_map="auto"``, using the configured default path if none is given.

    Args:
        model_path: Path or identifier of the model. When ``None``, the value
            of ``QWEN_CONFIG["default_model"]`` from ``config.settings`` is
            used.
        backend: Optional explicit backend name, forwarded unchanged to
            ``initialize_vlm_model_specific``.

    Returns:
        The result of ``initialize_vlm_model_specific`` for the resolved path.
    """
    if model_path is None:
        # Imported only on this path, so config.settings is not needed when
        # the caller supplies a model path.
        from config.settings import QWEN_CONFIG

        # NOTE(review): the default always comes from the Qwen config, even
        # when `backend` requests InternVL -- confirm this is intended.
        model_path = QWEN_CONFIG["default_model"]
    return initialize_vlm_model_specific(model_path=model_path, device_map="auto", backend=backend)


def get_vlm_response_generic(
    model: Any,
    processor_or_tokenizer: Any,
    prompt: str,
    video_path: str,
    frame_indices: List[int],
    generation_kwargs: Dict,
) -> Dict[str, Any]:
    """Run inference through the backend-specific response function.

    The backend is read from the model's ``_scope_backend`` attribute
    (presumably set when the model is loaded by its handler -- confirm).
    A missing, unreadable, or non-string marker selects the Qwen path.

    Args:
        model: Loaded model object.
        processor_or_tokenizer: Passed as ``tokenizer`` to the InternVL3
            function or as ``processor`` to the Qwen function.
        prompt: Text prompt forwarded to the backend.
        video_path: Video location forwarded to the backend.
        frame_indices: Frame indices forwarded to the backend.
        generation_kwargs: Generation options forwarded to the backend.

    Returns:
        The dictionary returned by the selected backend function.
    """
    try:
        marker = getattr(model, "_scope_backend", None)
    except Exception:
        # getattr's default only covers AttributeError; a custom attribute
        # hook may raise something else. Treat that as "no marker" so the
        # call still falls through to the default backend.
        marker = None

    # Normalise like initialize_vlm_model_specific does (strip + lower), and
    # ignore non-string markers rather than failing on `.lower()`.
    backend = marker.strip().lower() if isinstance(marker, str) else ""

    if backend in ("internvl3", "internvl"):
        from .internvl3_handler import get_internvl3_response_generic

        return get_internvl3_response_generic(
            model=model,
            tokenizer=processor_or_tokenizer,
            prompt=prompt,
            video_path=video_path,
            frame_indices=frame_indices,
            generation_kwargs=generation_kwargs,
        )

    from .qwen_handler import get_qwen_response_generic

    return get_qwen_response_generic(
        model=model,
        processor=processor_or_tokenizer,
        prompt=prompt,
        video_path=video_path,
        frame_indices=frame_indices,
        generation_kwargs=generation_kwargs,
    )
