import json
import os
from tqdm import tqdm
# File paths. All are relative, so they resolve against the current working
# directory at run time.
# NOTE(review): main_dataset_path is not referenced anywhere in the visible
# part of this script — confirm whether it is still needed.
main_dataset_path = os.path.join("2_Emb_Calc", "temp_data", "8_main_dataset.jsonl")
# JSONL input: one Wikipedia page record per line; the 'qid' field is read below.
wikipedia_pages_path = os.path.join("1_Data_Gathering", "temp_data", "7_all_wikipedia_pages.jsonl")
# JSONL input: one record per line; the 'qid' and 'unrelevant' fields are read below.
unrelevant_tags_path = "1_Data_Gathering/temp_data/7_unrelevant_qids_sampled.jsonl"
# Parse every non-blank line of the unrelevant-tags file as one JSON record.
# A line that is not valid JSON raises json.JSONDecodeError and stops the script.
unrelevant_tags = []
with open(unrelevant_tags_path, "r", encoding="utf-8") as tags_file:
    for raw_line in tqdm(tags_file, desc="Loading unrelevant tags"):
        payload = raw_line.strip()
        if payload:
            unrelevant_tags.append(json.loads(payload))

# Load every Wikipedia page record from the JSONL file. Loading is best-effort:
# blank lines are ignored and lines that are not valid JSON are skipped, but
# the number of skipped lines is counted and reported rather than being
# dropped silently.
wikipedia_pages = []
malformed_wiki_lines = 0
with open(wikipedia_pages_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Keep the try body minimal: only the parse itself can raise here.
        try:
            page = json.loads(line)
        except json.JSONDecodeError:
            malformed_wiki_lines += 1
            continue
        wikipedia_pages.append(page)

print(f"Loaded {len(wikipedia_pages)} wikipedia pages from {wikipedia_pages_path}")
if malformed_wiki_lines:
    # Surface data loss so a truncated or corrupted input file is noticed.
    print(f"Skipped {malformed_wiki_lines} malformed JSON lines in {wikipedia_pages_path}")

# Distinct 'qid' values of all loaded Wikipedia pages, built directly as a set
# (no intermediate list) because it is only used for the set intersection later.
# A page record without a 'qid' key raises KeyError.
all_wiki_qids = {page['qid'] for page in wikipedia_pages}

# Flatten the 'unrelevant' lists of all records into one list (duplicates kept).
all_unrelevants = [
    tag_qid
    for record in tqdm(unrelevant_tags)
    for tag_qid in record['unrelevant']
]

# Deduplicate, then keep only the QIDs that also have a Wikipedia page.
all_unrelevants_set = set(all_unrelevants)
all_unrelevants_in_wiki = all_unrelevants_set & all_wiki_qids
print(f"Number of unrelevant tags that are also wikipedia qids: {len(all_unrelevants_in_wiki)}")


# For each input record, keep only the unrelevant QIDs that have a Wikipedia
# page. The kept QIDs stay in the order of their first appearance in the
# record and are deduplicated with dict.fromkeys, so the output is identical
# from run to run. Converting a set to a list instead would emit them in
# hash order, which for strings can change between interpreter runs.
results = []
for item in tqdm(unrelevant_tags):
    qid = item['qid']
    wiki_unrelevants = list(dict.fromkeys(
        candidate
        for candidate in item['unrelevant']
        if candidate in all_unrelevants_in_wiki
    ))
    results.append({
        'qid': qid,
        'wiki_unrelevants': wiki_unrelevants,
    })

output_path = "./1_Data_Gathering/temp_data/7_wiki_unrelevants_results.jsonl"

# Write the results as JSONL: each element of `results` (a dict) becomes one
# JSON object on its own line, with non-ASCII characters left unescaped.
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in results
    )

print(f"Saved results as JSONL to {output_path}")
