import os
import json
from tqdm import tqdm

# Root directory that contains the `infoseek_wikipedia` folder. With the empty
# string, every path below is relative to the current working directory.
base_path = ''


def _load_json(path):
    """Parse the JSON file at *path* and close the file handle afterwards."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def prune_document_images(document, omitted_image_urls, known_image_urls):
    """Remove unusable images from a single KB document, in place.

    An image is dropped when its URL is in ``omitted_image_urls`` or is not in
    ``known_image_urls``. The entries at the same positions in
    ``document['image_reference_descriptions']`` and
    ``document['image_section_indices']`` are dropped as well, so the three
    parallel lists stay aligned. The relative order of the kept images is
    unchanged.

    Neither URL collection is modified. This matters because the same URL can
    occur several times (within one document or across documents) and every
    occurrence has to be dropped, not only the first one encountered.

    Args:
        document: KB entry holding the lists ``image_urls``,
            ``image_reference_descriptions`` and ``image_section_indices``.
        omitted_image_urls: Container of URLs to drop (supports ``in``).
        known_image_urls: Container of URLs that may be kept (supports ``in``).

    Returns:
        The number of images removed from ``document``.

    Raises:
        KeyError: If ``document`` lacks one of the three lists.
        IndexError: If a parallel list is shorter than ``image_urls`` at a
            position that has to be removed.
    """
    drop_indices = [
        idx
        for idx, image_url in enumerate(document['image_urls'])
        if image_url in omitted_image_urls or image_url not in known_image_urls
    ]

    # The indices are ascending; delete from the back so that the positions
    # still to be deleted are not shifted by earlier deletions.
    for idx in reversed(drop_indices):
        del document['image_urls'][idx]
        del document['image_reference_descriptions'][idx]
        del document['image_section_indices'][idx]

    return len(drop_indices)


def main():
    """Rewrite the document KB without images that cannot be loaded.

    Reads ``infoseek_kb_wiki_800k.json``, ``omitted_image_urls.json`` and the
    keys of ``image_url_to_id.json`` from ``<base_path>/infoseek_wikipedia``,
    prunes every document with :func:`prune_document_images`, and writes the
    result back to ``infoseek_kb_wiki_800k.json``.
    """
    kb_dir = os.path.join(base_path, 'infoseek_wikipedia')
    kb_path = os.path.join(kb_dir, 'infoseek_kb_wiki_800k.json')

    # Load document KB.
    document_kb = _load_json(kb_path)
    omitted_image_urls = set(_load_json(os.path.join(kb_dir, 'omitted_image_urls.json')))
    # Only the keys (the image URLs) of the url -> id mapping are needed.
    doc_image_url_to_ids = set(_load_json(os.path.join(kb_dir, 'image_url_to_id.json')))

    for doc_url in tqdm(document_kb, total=len(document_kb)):
        prune_document_images(document_kb[doc_url], omitted_image_urls, doc_image_url_to_ids)

    # The KB is overwritten in place. Write to a sibling temporary file first
    # and swap it in afterwards, so that a failure while dumping does not leave
    # a truncated KB behind.
    tmp_path = kb_path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(document_kb, f)
    os.replace(tmp_path, kb_path)

    print('Done!')


# In download_omitted_images.py, we collect the image_urls that do not exist anymore.
# Hence, to facilitate the data loading during experiments, we modify the KB.
if __name__ == '__main__':
    main()