import json
import os
import pandas as pd

base_path = ''


def merge_and_clean_doc(url_dict, file_path):
    """Build empty article skeletons, keyed by URL, from one JSONL file.

    The file is read from ``<base_path>/viquae_wikipedia/<file_path>`` with one
    JSON object per line. For every record an article dict is created that
    carries the record's title and its URL, plus empty lists for the section,
    image and table fields.

    Args:
        url_dict: Mapping from a record's ``wikipedia_id`` value to its URL.
        file_path: Name of the JSONL file inside the ``viquae_wikipedia``
            directory.

    Returns:
        A dict mapping each article URL to its article dict. If several
        records resolve to the same URL, the last one read wins.

    Raises:
        KeyError: If a record lacks ``wikipedia_title`` or ``wikipedia_id``,
            or if its ``wikipedia_id`` is not present in ``url_dict``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    clean_doc = {}

    jsonl_path = os.path.join(base_path, 'viquae_wikipedia', file_path)
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line:
                continue

            pre_article = json.loads(line)

            article = {
                # Text
                'title': pre_article['wikipedia_title'],
                'section_titles': [],
                'section_texts': [],
                # Image
                'image_urls': [],
                'image_reference_descriptions': [],
                'image_section_indices': [],
                # Table
                'tables': [],
                'table_section_indices': [],
                'url': url_dict[pre_article['wikipedia_id']],
            }
            clean_doc[article['url']] = article

    return clean_doc


# 1. Merge the wikipedia_id -> url mapping produced by extract_old_wiki_urls.py.
# 2. Convert the ViQuAE KB structure into an Encyclopedic-VQA-like format.
# The original KB has about 1.5M documents, so a slight modification is needed: we only need a small subset
# that includes the documents for the queries plus some randomly selected documents.
if __name__ == '__main__':
    kb_dir = os.path.join(base_path, 'viquae_wikipedia')

    # ~6M
    # Load the id -> url mapping inside a context manager so the file handle
    # is closed as soon as parsing finishes instead of being leaked.
    with open(os.path.join(kb_dir, 'wikipedia_id_to_url.json'), 'r', encoding='utf-8') as f:
        wikidata_id_to_url = json.load(f)

    clean_documents = {}

    # Later files overwrite earlier ones when two records share the same URL.
    for jsonl_name in (
        'humans_with_faces.jsonl',
        'humans_without_faces.jsonl',
        'non_humans.jsonl',
    ):
        clean_documents.update(merge_and_clean_doc(wikidata_id_to_url, jsonl_name))

    # ~1.5M
    with open(os.path.join(kb_dir, 'viquae_kb_wiki_empty.json'), 'w') as f:  # Smaller than 6M?
        json.dump(clean_documents, f)
    
