import copy
import os
import json
import pandas as pd
import random

base_path = ''


def extract_used_documents(urls, document_kb):
    """Return the subset of ``document_kb`` whose keys appear in ``urls``.

    Args:
        urls: Iterable of knowledge-base keys (duplicates are allowed).
        document_kb: Mapping from key to document.

    Returns:
        A new dict mapping each distinct key in ``urls`` to its document,
        in first-seen order. The documents themselves are not copied: the
        values are the same objects held by ``document_kb``.

    Raises:
        KeyError: If a key in ``urls`` is not present in ``document_kb``.
    """
    subset = {}
    for url in urls:
        # Many queries share a document; look each one up only once.
        if url not in subset:
            subset[url] = document_kb[url]
    return subset


# Loading the whole KB into memory puts a heavy burden on the system, so we
# extract only the documents that are referenced by the train/val queries.
if __name__ == '__main__':

    train_query_df = pd.read_csv(os.path.join(base_path, 'train_clean.csv'))
    val_query_df = pd.read_csv(os.path.join(base_path, 'val_clean.csv'))
    train_val_query_df = pd.concat([train_query_df, val_query_df])

    # Use context managers so the file handles are closed deterministically,
    # and read/write JSON as UTF-8 regardless of the platform default.
    kb_path = os.path.join(base_path, 'encyclopedic_kb_wiki/encyclopedic_kb_wiki_cleaned_table.json')
    with open(kb_path, 'r', encoding='utf-8') as f:
        document_kb = json.load(f)

    # The subset is only serialized below and never mutated, so sharing the
    # document objects with the full KB is safe and avoids deep-copying them.
    subset_doc_kb = extract_used_documents(train_val_query_df['wikipedia_url'], document_kb)

    subset_path = os.path.join(base_path, 'encyclopedic_kb_wiki/encyclopedic_kb_wiki_cleaned_table_train_val.json')
    with open(subset_path, 'w', encoding='utf-8') as f:
        json.dump(subset_doc_kb, f)
