from .clip_encoder import CLIPVisionTower
from .detr_encoder import DETRVisionTower
from .detr_encoder_v2 import DETRVisionTower_v2

def build_vision_tower(vision_tower_cfg, **kwargs):
    """Instantiate the vision encoder selected by ``vision_tower_cfg``.

    The tower name is read from ``vision_tower_cfg.mm_vision_tower``; if that
    attribute does not exist, ``vision_tower_cfg.vision_tower`` is used.
    The encoder class is then chosen by matching on that name, most specific
    pattern first.

    Args:
        vision_tower_cfg: Config object carrying ``mm_vision_tower`` and/or
            ``vision_tower``. It is also forwarded to the encoder as ``args``.
        **kwargs: Extra keyword arguments forwarded to the encoder constructor.

    Returns:
        The constructed vision tower instance.

    Raises:
        ValueError: If no tower name is configured, or if the name matches
            none of the known encoder families.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    print(f"Building vision_tower: {vision_tower}")

    # Fail with a clear message instead of an AttributeError on None below.
    if vision_tower is None:
        raise ValueError(
            "No vision tower configured: expected 'mm_vision_tower' or "
            "'vision_tower' to be set on the config"
        )

    # Must be tested before the generic CLIP check: "clip-detr" contains
    # "clip", so the CLIP branch would otherwise shadow this one.
    if "clip-detr" in vision_tower:
        # TODO: enable this also in ./llava/model/builder.py
        # Imported lazily so the module is only loaded when this variant is used.
        from .clip_encoder_detr_head import CLIPVisionTower_DETRHead
        return CLIPVisionTower_DETRHead(vision_tower, args=vision_tower_cfg, **kwargs)

    if vision_tower.startswith(("openai", "laion")) or "clip" in vision_tower:
        # TODO: check if this is correct
        # Every CLIP-like name is replaced by this fixed checkpoint id.
        vision_tower = "openai/clip-vit-large-patch14"
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    # "detr-v2" must be tested before the plain "detr" substring.
    if "detr-v2" in vision_tower:
        return DETRVisionTower_v2(vision_tower, args=vision_tower_cfg, **kwargs)

    if "detr" in vision_tower:
        return DETRVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
