import os
# 1️⃣ Pick an absolute path that has enough space
# Resolve to an absolute path so the cache location stays the same even if
# the working directory changes later in the process.
BASE = os.path.abspath("./")

# Point both caches there. These variables must be set before any Hugging
# Face library is imported, because that is when they are read.
os.environ["HF_HOME"] = BASE  # default parent of <BASE>/hub and <BASE>/datasets
os.environ["HF_HUB_CACHE"] = os.path.join(BASE, "hub")  # optional, explicit
os.environ["HF_DATASETS_CACHE"] = os.path.join(BASE, "datasets")

import json
import glob
import unicodedata

# Helper: canonicalize labels for robust uniqueness
def _canon_label(s: str) -> str:
    """Return a comparison-friendly form of the label *s*.

    The text is NFKC-normalized, case-folded, and stripped of surrounding
    whitespace, so visually equivalent labels compare equal. Any value that
    is not a string yields the empty string.
    """
    if isinstance(s, str):
        normalized = unicodedata.normalize("NFKC", s)
        return normalized.casefold().strip()
    return ""

# Prefer QID for uniqueness; fall back to canonicalized label
def _unique_key(item: dict) -> str:
    """Return the deduplication key for *item*.

    The fields ``Q_number``, ``qid`` and ``QID`` are checked in that order and
    the first one holding a non-blank string is returned, stripped. A field
    that is missing, blank, or not a string is skipped, so it cannot hide a
    usable identifier stored under one of the later names.

    When no identifier is found, the canonicalized ``label`` is returned
    instead; this may be the empty string, which callers treat as "no key".
    """
    for field in ("Q_number", "qid", "QID"):
        candidate = item.get(field)
        if isinstance(candidate, str):
            candidate = candidate.strip()
            if candidate:
                return candidate
    return _canon_label(item.get("label", ""))


# -------------------------------
# Shared helpers for all cleaning stages
# -------------------------------
def _is_usable_tag(tag) -> bool:
    """Return True when *tag* should be kept in an item's ``related_tags``.

    A tag is rejected when it is not a string, is a bare QID-style token
    ("Q" followed only by digits), contains "_" or "/", or contains "wiki"
    in any letter case. The last rule also rejects the bookkeeping tags
    "Wikimedia category", "Wikimedia template" and
    "Wikimedia disambiguation page".
    """
    if not isinstance(tag, str):
        return False
    if tag.startswith("Q") and tag[1:].isdigit():
        return False
    if "_" in tag or "/" in tag:
        return False
    return "wiki" not in tag.lower()


def _filter_tags(tags) -> list:
    """Return the usable tags from *tags*, preserving their order.

    Anything that is not a list or tuple (for example ``None`` from a JSON
    ``null``) is treated as "no tags".
    """
    if not isinstance(tags, (list, tuple)):
        return []
    return [tag for tag in tags if _is_usable_tag(tag)]


def _load_jsonl(path: str) -> list:
    """Read the JSONL file at *path* and return its dict records in order.

    Blank lines are ignored. Lines that are not valid JSON, or whose value is
    not a JSON object, are skipped and their count is reported on stdout.
    """
    records = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                skipped += 1
                continue
            if isinstance(obj, dict):
                records.append(obj)
            else:
                skipped += 1
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {path}")
    return records


def _clean_items(items: list) -> list:
    """Filter tags and deduplicate *items*, returning the kept items in order.

    An item is dropped when none of its tags survive filtering, when it has
    no usable key, or when its key was already seen; the first occurrence of
    a key wins. Items without usable tags do not reserve their key. Kept
    items have ``related_tags`` replaced in place with the filtered list.
    """
    kept = []
    seen = set()
    for item in items:
        tags = _filter_tags(item.get("related_tags"))
        if not tags:
            continue
        key = _unique_key(item)
        if not key or key in seen:
            continue
        seen.add(key)
        item["related_tags"] = tags
        kept.append(item)
    return kept


def _write_jsonl(path: str, items: list) -> None:
    """Write *items* to *path*, one JSON document per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item) + "\n" for item in items)


def _clean_file(label: str, src: str, dst: str) -> list:
    """Clean the JSONL file *src* into *dst* and return the kept items."""
    kept = _clean_items(_load_jsonl(src))
    print(f"Unique items kept ({label}) by key (QID preferred): {len(kept)}")
    _write_jsonl(dst, kept)
    return kept


# -------------------------------
# Clean the tag-only shards
# -------------------------------
# glob returns matches in arbitrary order; sorting makes the load order, and
# therefore which duplicate is kept, reproducible between runs.
file_list = sorted(glob.glob("./1_Data_Gathering/temp_data/2_tag_only/*.jsonl"))

print(len(file_list))
print(file_list)

all_items = []
cleaned_items = []

if file_list:
    # Load and combine items from all files
    for file_path in file_list:
        all_items.extend(_load_jsonl(file_path))

    print(f"Total items loaded: {len(all_items)}")
    if all_items:
        print("Sample item:", all_items[0])

    cleaned_items = _clean_items(all_items)

    print(f"Total items kept after cleaning: {len(cleaned_items)}")
    if cleaned_items:
        print("Sample item:", cleaned_items[0])
    print(f"Unique items kept (main) by key (QID preferred): {len(cleaned_items)}")

    _write_jsonl(
        "./1_Data_Gathering/temp_data/3_cleaned_items_tag_only.jsonl",
        cleaned_items,
    )
else:
    # Mirror the landmark stages below: missing input skips the stage
    # instead of aborting the whole script.
    print("No tag-only input files found; skipping main cleaning stage")


# -------------------------------
# Clean landmarks_low_freq
# -------------------------------
low_freq_in = "./1_Data_Gathering/temp_data/2_landmarks_low_freq.jsonl"
low_freq_out = "./1_Data_Gathering/temp_data/3_landmarks_low_freq.jsonl"
if os.path.exists(low_freq_in):
    cleaned_low = _clean_file("low_freq", low_freq_in, low_freq_out)


# -------------------------------
# Clean landmarks_high_freq
# -------------------------------
high_freq_in = "./1_Data_Gathering/temp_data/2_landmarks_high_freq.jsonl"
high_freq_out = "./1_Data_Gathering/temp_data/3_landmarks_high_freq.jsonl"
if os.path.exists(high_freq_in):
    cleaned_high = _clean_file("high_freq", high_freq_in, high_freq_out)
