import re
import csv
from pathlib import Path
from difflib import SequenceMatcher
from tqdm import tqdm
from collections import defaultdict

# === Normalize label ===
def normalize(label):
    """Return a canonical comparison key for ``label``.

    The label is lower-cased, every run of characters outside ``a-z`` /
    ``0-9`` acts as a separator, and the remaining pieces are joined with
    single spaces (so there is no leading or trailing space).
    """
    lowered = label.lower()
    pieces = re.split(r"[^a-z0-9]+", lowered)
    # Splitting leaves empty pieces at the edges when the label starts or
    # ends with a separator; dropping them is what trims the result.
    return " ".join(piece for piece in pieces if piece)

# === Load and optionally clean NYU labels ===
def load_labels(file_path, clean_nyu=False):
    """Read one label per line from a UTF-8 text file.

    Args:
        file_path: Path of the text file to read.
        clean_nyu: When true, a line containing ``":"`` is reduced to the
            text after its first colon (surrounding whitespace stripped).
            Lines without a colon are kept as they are.

    Returns:
        The non-empty labels, in file order. Blank lines are skipped, and so
        are lines whose label is empty once the prefix has been removed
        (for example ``"12:"``), so callers never receive an empty label.
    """
    labels = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            label = raw_line.strip()
            if clean_nyu and ":" in label:
                # Split on the first colon only, so colons that belong to
                # the label text itself are preserved.
                label = label.split(":", 1)[1].strip()
            if label:
                labels.append(label)
    return labels

# === File paths ===
obj365_path = "object365_unique_category_labels.txt"
nyu_path = "nyu_class_labels.txt"
lvis_path = "lvis_unique_category_labels.txt"
output_path = "master_label_mapping_table.csv"

# === Load data ===
# Only the NYU file is read with clean_nyu=True, which keeps the text after
# the first ":" on each line.
object365_raw = load_labels(obj365_path)
nyu_raw = load_labels(nyu_path, clean_nyu=True)
lvis_raw = load_labels(lvis_path)

# === Normalize and build mapping ===
# Per dataset: normalized label -> raw label. When two raw labels normalize
# to the same key, the one that appears later in the file wins.
datasets = {
    "Object365": dict(zip(map(normalize, object365_raw), object365_raw)),
    "NYU": dict(zip(map(normalize, nyu_raw), nyu_raw)),
    "LVIS": dict(zip(map(normalize, lvis_raw), lvis_raw)),
}

# === All unique normalized labels ===
# Union of the normalized keys of every dataset.
all_norm_labels = set().union(*datasets.values())

# === Similarity helper
def is_similar(label, candidates, threshold=0.88):
    """Return the candidate most similar to ``label``, or ``None``.

    Similarity is ``SequenceMatcher(None, label, cand).ratio()``. Only
    candidates whose ratio is at least ``threshold`` qualify. Among those,
    the one with the highest ratio is returned; equal ratios are resolved by
    picking the smallest candidate under ``<``. The result therefore does
    not depend on the iteration order of ``candidates``, which matters when
    a ``set`` is passed.

    Args:
        label: The string to compare against.
        candidates: Iterable of strings to compare with ``label``.
        threshold: Minimum ratio (inclusive) for a candidate to qualify.

    Returns:
        The best qualifying candidate, or ``None`` when none qualifies
        (including when ``candidates`` is empty).
    """
    best_cand = None
    best_score = threshold
    for cand in candidates:
        matcher = SequenceMatcher(None, label, cand)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); if either is already below the score to beat, the
        # expensive ratio() cannot qualify and is skipped.
        if matcher.real_quick_ratio() < best_score:
            continue
        if matcher.quick_ratio() < best_score:
            continue
        score = matcher.ratio()
        if score < best_score:
            continue
        # At this point score >= best_score. Accept the first qualifier, a
        # strictly better score, or an equal score with a smaller candidate.
        if best_cand is None or score > best_score or cand < best_cand:
            best_cand = cand
            best_score = score
    return best_cand

# === Build master mapping table
master_table = []
used_similar = set()  # normalized labels already consumed by a "similar" pairing
match_counts = defaultdict(int)

# (CSV column, dataset key) pairs, in output column order.
label_columns = (
    ("object365_label", "Object365"),
    ("nyu_label", "NYU"),
    ("lvis_label", "LVIS"),
)

print("🔍 Processing and comparing labels...")
for label in tqdm(sorted(all_norm_labels), desc="🔗 Matching"):
    # Start from the raw label each dataset has for this exact normalized key.
    row = {"unified_label": label}
    for column, source in label_columns:
        row[column] = datasets[source].get(label, "")

    hits = sum(1 for column, _ in label_columns if row[column])
    if hits >= 2:
        # The same normalized label exists in at least two datasets.
        kind = "exact"
    elif label in used_similar:
        # Already merged into an earlier "similar" row; emit nothing.
        continue
    else:
        # Look for a fuzzy partner among every other label that has not
        # already been consumed by a similar pairing.
        pool = all_norm_labels - {label} - used_similar
        match = is_similar(label, pool)
        if match:
            kind = "similar"
            # Fill each column from this label first, falling back to the
            # partner's raw label when this label has none in that dataset.
            for column, source in label_columns:
                mapping = datasets[source]
                row[column] = mapping.get(label, "") or mapping.get(match, "")
            used_similar.update((label, match))
        else:
            kind = "unique"

    row["match_type"] = kind
    match_counts[kind] += 1
    master_table.append(row)

# === Write to CSV
# Column order of the output file; every row dict carries exactly these keys.
csv_columns = ["unified_label", "object365_label", "nyu_label", "lvis_label", "match_type"]
with open(output_path, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    writer.writeheader()
    for record in master_table:
        writer.writerow(record)

# === Summary
print(f"\n✅ Master label mapping saved to: '{output_path}'")
print("📊 Match Type Summary:")
# Always report the three categories in this fixed order; a category that
# never occurred reads as 0 from the defaultdict.
for match_type in ("exact", "similar", "unique"):
    heading = match_type.title()
    total = match_counts[match_type]
    print(f"  {heading:<7}: {total}")
