"""Similarity and deduplication utilities."""
from typing import List
from rapidfuzz import fuzz


def add_if_not_similar(
    new_item: str,
    collection: List[str],
    threshold: int = 90,
) -> bool:
    """Append ``new_item`` to ``collection`` unless a near duplicate exists.

    ``new_item`` is scored against the members of ``collection`` in order
    using ``fuzz.ratio``. The first member whose score reaches
    ``threshold`` marks ``new_item`` as a near duplicate and stops the
    scan; in that case ``collection`` is left untouched. If no member
    reaches the threshold (including when ``collection`` is empty),
    ``new_item`` is appended to the end of ``collection``.

    Args:
        new_item: Candidate string to insert.
        collection: Strings accepted so far. Mutated in place: the
            candidate is appended when it is accepted.
        threshold: Minimum score (0~100 scale) at which two strings are
            treated as near duplicates. Higher values require the strings
            to be closer before the candidate is rejected. Defaults to 90.

    Returns:
        bool: True if ``new_item`` was appended; False if it was judged a
        near duplicate and therefore not added.
    """
    # any() stops at the first match, so later members are not scored
    # once a near duplicate has been found.
    is_near_duplicate = any(
        fuzz.ratio(new_item, member) >= threshold for member in collection
    )
    if is_near_duplicate:
        return False

    # Nothing reached the threshold: keep the candidate as a new entry.
    collection.append(new_item)
    return True


def dedup_by_similarity(items: List[str], threshold: int = 90) -> List[str]:
    """Collapse near-duplicate strings, keeping the first of each group.

    Walks ``items`` in order and builds a list of representatives. A
    candidate is kept only if its ``fuzz.ratio`` score against every
    representative kept so far is below ``threshold``; otherwise it is
    dropped. This is the same acceptance rule that ``add_if_not_similar``
    applies. ``items`` itself is not modified.

    Args:
        items: Strings to deduplicate.
        threshold: Minimum score at which a candidate is treated as a
            near duplicate of an already kept representative.

    Returns:
        List[str]: A new list holding the kept representatives, in the
        order they first appeared in ``items``.
    """
    representatives: List[str] = []
    for candidate in items:
        # Stop scoring as soon as one kept representative is close enough.
        is_near_duplicate = any(
            fuzz.ratio(candidate, representative) >= threshold
            for representative in representatives
        )
        if not is_near_duplicate:
            representatives.append(candidate)
    return representatives

