import copy
import os
import json
import pandas as pd
import random

# Root directory of the dataset. The script reads 'test_clean.csv' and the
# 'openwikitable_wikipedia/' KB files relative to it; an empty string makes
# those paths relative to the current working directory.
base_path = ''
# NOTE(review): not referenced anywhere in the visible part of this script --
# presumably the root of a ViQuAE dataset; confirm before relying on or removing it.
viquae_base_path = ''

# For fast evaluation of retrieval performance on the test dataset, we would normally randomly extract a 100k-entry KB subset that contains the test dataset entities.
# Unlike for other datasets, here we use all documents referenced by the test set, since the KB is small.
def select_multi_table_docs(wikipedia_urls, document_kb, min_table_sections=2):
    """Return the KB entries for the given URLs that have enough table sections.

    Args:
        wikipedia_urls: Iterable of Wikipedia URLs (duplicates are allowed and
            are collapsed; first-occurrence order is preserved).
        document_kb: Mapping from Wikipedia URL to a document dict that has a
            'table_section_indices' entry (an iterable of section indices).
        min_table_sections: Minimum number of *distinct* values required in
            'table_section_indices' for a document to be kept.

    Returns:
        A new dict mapping each qualifying URL to its document. The document
        values are the same objects as in ``document_kb`` (not copies).

    Raises:
        KeyError: If a URL is not present in ``document_kb``, or a document
            lacks the 'table_section_indices' key.
    """
    selected = {}
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output key order matches the order in which URLs first appear.
    for wiki_url in dict.fromkeys(wikipedia_urls):
        document = document_kb[wiki_url]
        # Several tables can live in the same section, hence the set():
        # we count distinct sections, not tables.
        if len(set(document['table_section_indices'])) >= min_table_sections:
            selected[wiki_url] = document
    return selected


def main():
    """Build the KB subset used for the test split and write it to disk.

    Reads 'test_clean.csv' and 'openwikitable_kb_wiki.json' under
    ``base_path``, keeps only the documents referenced by the test queries
    that have at least two distinct table sections, and writes the result to
    'openwikitable_kb_wiki_cls.json'.
    """
    kb_dir = os.path.join(base_path, 'openwikitable_wikipedia')

    test_query_df = pd.read_csv(os.path.join(base_path, 'test_clean.csv'))
    # Use a context manager so the (potentially large) KB file handle is
    # closed deterministically; JSON is UTF-8, so do not rely on the locale.
    with open(os.path.join(kb_dir, 'openwikitable_kb_wiki.json'), 'r', encoding='utf-8') as f:
        document_kb = json.load(f)

    # Keep only the documents with at least two distinct sections that
    # contain tables. No deep copies are needed: the subset is only serialized.
    subset_two_t_doc_kb = select_multi_table_docs(test_query_df['wikipedia_url'], document_kb)

    with open(os.path.join(kb_dir, 'openwikitable_kb_wiki_cls.json'), 'w', encoding='utf-8') as f:
        json.dump(subset_two_t_doc_kb, f)


if __name__ == '__main__':
    main()
