"""
Vision/text similarity backbone factory.

The existing codebase often uses `clip_model_path` as the parameter name, but the underlying
handler can be switched to BLIP or other vision models. This module provides a unified entrypoint
that selects an implementation based on an explicit `backend` or model-path keywords.
"""

from __future__ import annotations

from typing import Optional

import torch


def resolve_device(device: Optional[str]) -> str:
    """
    Resolve a device specifier to a concrete device value.

    Args:
        device: Requested device. A falsy value (``None`` or ``""``) or the
            sentinel ``"auto"`` (matched case-insensitively, ignoring
            surrounding whitespace) requests automatic selection.

    Returns:
        ``"cuda"`` when automatic selection is requested and
        ``torch.cuda.is_available()`` is true, ``"cpu"`` when it is requested
        and CUDA is unavailable; otherwise ``device`` exactly as passed in.
    """
    # Only strings can spell the "auto" sentinel; any other truthy value is
    # handed back untouched so explicit device choices are never rewritten.
    wants_auto = not device or (
        isinstance(device, str) and device.strip().lower() == "auto"
    )
    if wants_auto:
        return "cuda" if torch.cuda.is_available() else "cpu"
    return device


def infer_backend_from_path(model_path: Optional[str]) -> str:
    """
    Guess the backend name from keywords in the model path.

    Returns ``"blip"`` when the path contains ``"blip"`` (case-insensitive);
    every other path, including ``None`` or an empty string, yields ``"clip"``.
    """
    mentions_blip = bool(model_path) and "blip" in model_path.lower()
    return "blip" if mentions_blip else "clip"


def create_vision_text_handler(
    model_path: Optional[str],
    device: Optional[str],
    backend: Optional[str] = None,
    **kwargs,
):
    """
    Build the vision/text similarity handler for the requested backend.

    Args:
        model_path: Model path or identifier forwarded to the handler. When no
            usable ``backend`` is given, it is also used to infer the backend
            via ``infer_backend_from_path``.
        device: Requested device; it is passed through ``resolve_device``
            before being handed to the handler.
        backend: Explicit backend name, matched case-insensitively after
            trimming whitespace. ``"blip"`` selects the BLIP handler. A
            missing, empty, or whitespace-only value triggers inference from
            ``model_path``. Any other value falls back to the CLIP handler.
        **kwargs: Extra keyword arguments forwarded unchanged to the handler
            constructor.

    Returns:
        A handler that is expected to expose the following interface:
        - encode_images(frames, ...)
        - encode_text(text)
        - encode_texts(texts)
        - clear_text_cache / clear_video_cache / clear_frame_cache
    """
    # Normalize *before* deciding whether a backend was supplied, so that a
    # blank value such as "  " still falls back to path-based inference
    # instead of silently selecting the default handler.
    requested = (backend or "").strip().lower()
    selected = requested or infer_backend_from_path(model_path)
    device = resolve_device(device)

    # Handler modules are imported inside the branches, so only the selected
    # backend's module is imported by this call.
    if selected == "blip":
        from .blip_handler import BLIPModelHandler

        return BLIPModelHandler(model_path=model_path, device=device, **kwargs)

    from .clip_handler import CLIPModelHandler

    return CLIPModelHandler(model_path=model_path, device=device, **kwargs)
