import clip
import torch
import json
import numpy as np
from pathlib import Path
# For scene text

def extract_scene_text_features(scene_texts, model_name="ViT-B/32", device=None):
    """Encode every distinct text in ``scene_texts`` with CLIP's text encoder.

    Args:
        scene_texts: Mapping whose values are iterables of text strings. Only
            the values are read; the keys are ignored.
        model_name: Model identifier passed to ``clip.load``.
        device: Device to run on. When falsy, ``"cuda"`` is chosen if
            ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.

    Returns:
        A dict mapping each distinct text to its L2-normalized feature vector
        as a plain list of floats, in first-seen order. The dict is empty
        when ``scene_texts`` contains no texts.
    """
    # De-duplicate while keeping first-seen order. A set() of strings iterates
    # in an order that changes between interpreter runs (hash randomization),
    # which would make the returned dict -- and any JSON dumped from it --
    # differ from run to run.
    unique_texts = list(dict.fromkeys(
        text for texts in scene_texts.values() for text in texts
    ))
    if not unique_texts:
        # Nothing to encode: skip the model load entirely.
        return {}

    # Auto-select device
    device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    # Load CLIP model (the second return value is unused here)
    model, _ = clip.load(model_name, device=device)

    # Text encoding
    text_tokens = clip.tokenize(unique_texts).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        # L2 normalization along the feature dimension
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Move to host memory and convert to numpy so rows can become plain lists
    text_features_np = text_features.cpu().numpy()

    # zip() pairs each text with the feature row at the same index
    return {
        text: feature.tolist()
        for text, feature in zip(unique_texts, text_features_np)
    }

def save_to_json(feature_dict, output_path="assets/text_features.json"):
    """Write ``feature_dict`` to ``output_path`` as JSON with 2-space indent.

    Missing parent directories of ``output_path`` are created first, and a
    confirmation line is printed after the file has been written.
    """
    destination = Path(output_path)

    # Make sure the containing directory exists before opening the file.
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open('w') as handle:
        json.dump(feature_dict, handle, indent=2)

    print(f"Saved to {output_path}")

if __name__ == "__main__":
    # Query texts grouped under one key per list (same as OpenGaussian).
    # Only the lists are consumed by extract_scene_text_features; the keys
    # serve as labels for the reader.
    scene_texts = {
        "dataset_name": [
            "Stainless steel pots", "dark cup", "refrigerator", "frog cup",
            "pot", "spatula", "plate", "spoon", "toaster", "ottolenghi",
            "plastic ladle", "sink", "ketchup", "cabinet", "red cup",
            "pour-over vessel", "knife", "yellow desk",
        ],
        "ramen": [
            "nori", "sake cup", "kamaboko", "corn", "spoon", "egg",
            "onion segments", "plate", "napkin", "bowl", "glass of water",
            "hand", "chopsticks", "wavy noodles",
        ],
        "figurines": [
            "jake", "pirate hat", "pikachu", "rubber duck with hat",
            "porcelain hand", "red apple", "tesla door handle", "waldo",
            "bag", "toy cat statue", "miffy", "green apple", "pumpkin",
            "rubics cube", "old camera", "rubber duck with buoy",
            "red toy chair", "pink ice cream", "spatula", "green toy chair",
            "toy elephant",
        ],
        "teatime": [
            "sheep", "yellow pouf", "stuffed bear", "coffee mug",
            "tea in a glass", "apple", "coffee", "hooves", "bear nose",
            "dall-e brand", "plate", "paper napkin", "three cookies",
            "bag of cookies",
        ],
    }

    # Encode all texts, then write them to the default JSON path.
    features = extract_scene_text_features(scene_texts)
    save_to_json(features)