import copy
import os
import json
import pandas as pd
import random

# Directory that is joined with 'test_clean.csv' and the
# 'openwikitable_wikipedia/' JSON paths below. An empty string makes those
# paths relative to the current working directory.
base_path = ''
# Directory for the ViQuAE KB; only referenced by the commented-out merge
# step in the main block.
viquae_base_path = ''

# For fast evaluation of retrieval performance on the test dataset, we randomly extract a 100k-entry KB that contains the test dataset entities.
# Unlike other datasets, we will use all documents in test, since the KB size is small.
if __name__ == '__main__':
    # Build the KB file used for test-time retrieval evaluation. While the
    # ViQuAE merge below stays disabled, the Open-WikiTable KB is written
    # back out unchanged under the `_test` file name.

    # NOTE(review): `test_query_df` is not used by anything in this block;
    # reading it only checks that the test split exists and parses.
    test_query_df = pd.read_csv(os.path.join(base_path, 'test_clean.csv'))

    # Read through a context manager so the input handle is closed as soon as
    # parsing finishes (the bare `json.load(open(...))` form leaked it).
    kb_path = os.path.join(base_path, 'openwikitable_wikipedia/openwikitable_kb_wiki.json')
    with open(kb_path, 'r', encoding='utf-8') as f:
        document_kb = json.load(f)

    # Disabled: pad the KB up to 100k entries with randomly chosen ViQuAE
    # documents. If re-enabled, note that the mode 'r' belongs to open(), not
    # os.path.join(), and that random.sample() needs a sequence (sets are
    # rejected since Python 3.11), hence the sorted() call.
    #
    # viquae_path = os.path.join(viquae_base_path, 'viquae_wikipedia/viquae_kb_wiki_200k.json')
    # with open(viquae_path, 'r', encoding='utf-8') as f:
    #     viquae_kb = json.load(f)
    #
    # # Here, simply include all the Open-WikiTable KB, and merge it with a subset of the viquae KB
    # unseen_kb_urls = sorted(viquae_kb.keys())
    # unseen_subset_kb_urls = random.sample(unseen_kb_urls, 100*1000 - len(document_kb))
    #
    # for kb_url in unseen_subset_kb_urls:
    #     document_kb[kb_url] = copy.deepcopy(viquae_kb[kb_url])

    out_path = os.path.join(base_path, 'openwikitable_wikipedia/openwikitable_kb_wiki_test.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(document_kb, f)
