import json
import os

# Root directory that the 'kilt/' input path and the 'viquae_wikipedia/'
# output path are joined onto; the empty default resolves both relative to
# the current working directory.
base_path = ''

# The viquae-kb does not contain the old (historical) Wikipedia URL of each article,
# so we build a mapping from wikipedia id to that URL to fill in the missing field.
# Download the KILT KB (kilt_knowledgesource.json) from https://github.com/facebookresearch/KILT
def build_wikipedia_id_to_url(kilt_kb_file_path):
    """Build a mapping from wikipedia id to the URL stored in the KILT KB.

    The file is read as JSON Lines: every non-blank line is parsed as one
    JSON object that must carry a ``wikipedia_id`` key and may carry a
    ``history`` object with a ``url`` key.

    Args:
        kilt_kb_file_path: Path to the KILT knowledge source file.

    Returns:
        A dict mapping each ``wikipedia_id`` to its ``history.url``, or to
        the empty string when the history or the URL is missing or null.

    Raises:
        ValueError: If the same ``wikipedia_id`` appears more than once.
        KeyError: If a record has no ``wikipedia_id`` key.
    """
    mapping_function = {}

    # Stream the file line by line; loading the whole KILT KB at once would
    # use an excessive amount of memory.
    with open(kilt_kb_file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            pre_article = json.loads(line)

            # A missing/null history is treated like a missing URL.
            history = pre_article.get("history") or {}
            url = history.get("url")
            if url is None:
                url = ""

            wikipedia_id = pre_article['wikipedia_id']
            # Explicit raise instead of `assert`: asserts are stripped under
            # `python -O`, which would let duplicates overwrite silently.
            if wikipedia_id in mapping_function:
                raise ValueError(f"Overlapping wikipedia id {wikipedia_id}")

            mapping_function[wikipedia_id] = url

    return mapping_function


def main():
    """Read the KILT KB under ``base_path`` and dump the id-to-URL mapping.

    Writes ``viquae_wikipedia/wikipedia_id_to_url.json`` under ``base_path``,
    creating the output directory if it does not exist yet.
    """
    kilt_kb_file_path = os.path.join(base_path, 'kilt/kilt_knowledgesource.json')
    mapping_function = build_wikipedia_id_to_url(kilt_kb_file_path)

    output_dir = os.path.join(base_path, 'viquae_wikipedia')
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'wikipedia_id_to_url.json')
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(mapping_function, f)

    print("Done!")


if __name__ == '__main__':
    main()