import os
import json
import pandas as pd
import random
from tqdm import tqdm

base_path = ''


# Since the KB is far larger than the query dataset, we extract only a subset of it
# and use that subset together with the query dataset (InfoSeek or Encyclopedic-VQA).
# Here, we extract 800k entities: every query-related entity plus randomly selected ones.
# NOTE: the KB is based on old Wikipedia URLs. The original comment here was left
# unfinished ("we remove the ..."); no removal step is performed in this script.
def build_entity_subset(kb_entities, query_entities, target_size):
    """Pick a KB entity subset that contains every query entity.

    Query entities are de-duplicated (first-seen order is kept) and always
    included. The remaining slots, up to ``target_size``, are filled with
    entities drawn at random from ``kb_entities`` that are not query entities.

    Args:
        kb_entities: Iterable of hashable entity names available in the KB.
        query_entities: Iterable of hashable entity names referenced by the
            queries; may contain repeats.
        target_size: Desired number of entities in the result.

    Returns:
        A list of unique entities: the randomly sampled KB entities first,
        followed by the query entities. The list is shorter than
        ``target_size`` when the KB does not hold enough other entities, and
        longer when there are more unique query entities than ``target_size``.
    """
    # dict.fromkeys de-duplicates while preserving order, so a seeded RNG
    # yields reproducible output (set iteration order is not stable for str).
    required = list(dict.fromkeys(query_entities))
    required_lookup = set(required)
    candidates = [entity for entity in dict.fromkeys(kb_entities)
                  if entity not in required_lookup]

    # Clamp the sample size: random.sample raises ValueError for a negative
    # count or a count larger than the population.
    n_extra = min(max(target_size - len(required), 0), len(candidates))

    subset = random.sample(candidates, n_extra)
    subset.extend(required)
    return subset


if __name__ == '__main__':

    # Size of the reduced KB; matches the "800k" in the output file name.
    target_kb_size = 800 * 1000

    # All splits are pooled: every entity referenced by any query must stay in the KB.
    split_files = ('train_clean.csv', 'val_clean.csv', 'test_clean.csv')
    query_df = pd.concat([pd.read_csv(os.path.join(base_path, name)) for name in split_files])

    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(base_path, 'infoseek_wikipedia', 'infoseek_valid_entities.json'),
              encoding='utf-8') as f:
        valid_entity_list = json.load(f)

    kb_subset = build_entity_subset(
        valid_entity_list,
        query_df['wikipedia_title'].tolist(),
        target_kb_size,
    )

    with open(os.path.join(base_path, 'infoseek_wikipedia/infoseek_valid_entities_800k.json'),
              'w', encoding='utf-8') as f:
        json.dump(kb_subset, f)
    
