import hashlib
from chromadb import HttpClient
from tqdm import tqdm
from config.config import COLLECTION_PREFIX
# --- CONFIG ---

# Model identifiers; each one is joined to COLLECTION_PREFIX to form the name
# of the collection that gets deduplicated.
MODEL_NAMES = [
    "dinov1_s",
    "dinov1_b",
    "dinov1_b8",
    "dinov2_s",
    "dinov2_b",
    "mobilenet_v2",
]
# Number of entries requested per page when reading a collection.
BATCH_SIZE = 10000
# Maximum number of ids sent in one delete request, to avoid overwhelming the DB.
DELETE_BATCH_SIZE = 1_000

# --- Connect to ChromaDB ---
# Module-level HTTP client for the ChromaDB server at localhost:8010. It is
# created when this module is loaded and is the handle remove_duplicates()
# uses to open collections.
client = HttpClient(host="localhost", port=8010)

def hash_embedding(embedding, decimals=16):
    """Return a SHA-256 hex digest that fingerprints an embedding vector.

    Every component is rounded to ``decimals`` places, the rounded values are
    packed into a tuple, and the tuple's string form is hashed. Two embeddings
    get the same digest exactly when their rounded tuples print identically.

    Args:
        embedding: Iterable of numeric components.
        decimals: Number of decimal places each component is rounded to.

    Returns:
        The 64-character hexadecimal SHA-256 digest.
    """
    quantized = tuple([round(component, decimals) for component in embedding])
    hasher = hashlib.sha256()
    hasher.update(str(quantized).encode())
    return hasher.hexdigest()

# --- Global set for tracking unique hashes ---
# Holds the hash_embedding() digest of every embedding that has been kept.
# remove_duplicates() reads from and adds to this set; nothing visible in this
# file clears it, so its contents carry over from one collection to the next
# within a single run.
seen_hashes = set()

def remove_duplicates(collection_name):
    """Delete entries whose embedding duplicates one that was already seen.

    The collection is read page by page (``BATCH_SIZE`` entries per request).
    Each embedding is fingerprinted with ``hash_embedding``: the first entry
    carrying a given fingerprint is kept, and every later one is queued for
    deletion. Deletions are issued only after the whole collection has been
    scanned, in chunks of ``DELETE_BATCH_SIZE`` ids.

    Args:
        collection_name: Name of the ChromaDB collection to deduplicate. It is
            opened with ``get_or_create_collection``, so a missing collection
            is created rather than reported as an error.

    Returns:
        The number of ids that were passed to ``collection.delete``.
    """
    # NOTE(review): seen_hashes is module-level and never cleared, so
    # fingerprints carry over between calls. Calling this function twice on
    # the same collection in one process would flag every surviving entry as
    # a duplicate -- confirm whether the set should be per-collection.
    collection = client.get_or_create_collection(name=collection_name)
    offset = 0
    ids_to_delete = []
    print(f"🔍 Checking: {collection_name}")

    # Phase 1: read-only scan. Deleting while paging by offset would shift the
    # remaining entries toward the front, so the next page would start past
    # entries that were never examined. Collect the ids first, delete later.
    while True:
        results = collection.get(
            limit=BATCH_SIZE,
            offset=offset,
            include=["embeddings"],
        )
        embeddings = results["embeddings"]
        ids = results["ids"]  # always returned

        # len() rather than truthiness: the check must also work when the
        # embeddings come back as an array-like object.
        if embeddings is None or len(embeddings) == 0:
            break

        for emb, doc_id in zip(embeddings, ids):
            emb_hash = hash_embedding(emb)
            if emb_hash in seen_hashes:
                ids_to_delete.append(doc_id)
            else:
                seen_hashes.add(emb_hash)

        # Advance by what was actually returned so the progress count stays
        # accurate even when a page is shorter than BATCH_SIZE.
        offset += len(ids)
        tqdm.write(
            f"📦 {collection_name} — processed {offset} entries, "
            f"duplicates found so far: {len(ids_to_delete)}"
        )

    # Phase 2: delete in chunks to avoid overwhelming the DB.
    total_deleted = 0
    for start in range(0, len(ids_to_delete), DELETE_BATCH_SIZE):
        batch = ids_to_delete[start:start + DELETE_BATCH_SIZE]
        collection.delete(ids=batch)
        total_deleted += len(batch)

    return total_deleted

# --- Run for all models ---
def main():
    """Deduplicate the collection of every model listed in ``MODEL_NAMES``.

    Each collection name is built as ``{COLLECTION_PREFIX}_{model name}`` and
    the number of removed duplicates is printed per collection.
    """
    for name in MODEL_NAMES:
        collection_name = f"{COLLECTION_PREFIX}_{name}"
        num_deleted = remove_duplicates(collection_name)
        print(f" {collection_name}: {num_deleted} duplicates removed")


# Only delete data when executed as a script; importing this module (for
# example to reuse hash_embedding) must not trigger the destructive run.
if __name__ == "__main__":
    main()
