import json
import os
import pandas as pd

# Root directory of the Open-WikiTable data. The script reads
# 'openwikitable_wikipedia/splitted_tables.json' from it and writes
# 'openwikitable_wikipedia/ori_table_id_to_urls.json' back into it.
# Left empty, paths resolve relative to the current working directory.
base_path = ''
# Root directory of the per-table metadata files, looked up as
# 'page/<page_id>-page/<table_id>.json'.
# NOTE(review): presumably the WikiTableQuestions release directory -- confirm.
refer_base_path = ''

# Keep only the tables that originate from WikiTableQuestions (WikiTQ)
# out of the Open-WikiTable table collection.
def build_table_url_index(table_df, refer_dir):
    """Map each original table ID to its source URL and section title.

    Args:
        table_df: DataFrame with at least the columns ``original_table_id``
            (a string of the form ``'<page_id>-<table_id>'``) and
            ``section_title``.
        refer_dir: Directory containing the metadata files, laid out as
            ``page/<page_id>-page/<table_id>.json``. Each file must be a JSON
            object with a ``'url'`` key.

    Returns:
        A dict ``{original_table_id: (url, section_title)}``. Empty when
        ``table_df`` has no rows.

    Raises:
        ValueError: If an ``original_table_id`` does not split into exactly
            two parts on ``'-'``.
        FileNotFoundError: If a metadata file is missing.
        KeyError: If a metadata file has no ``'url'`` entry.
    """
    ori_table_id_to_urls = {}

    for table in table_df.itertuples(index=False):
        page_id, table_id = table.original_table_id.split('-')
        meta_path = os.path.join(refer_dir, 'page', f'{page_id}-page', f'{table_id}.json')

        # Use a context manager so the handle is closed after every row
        # instead of leaking one open file per table; JSON is UTF-8, so do
        # not depend on the platform's default encoding.
        with open(meta_path, 'r', encoding='utf-8') as meta_file:
            table_meta = json.load(meta_file)

        ori_table_id_to_urls[table.original_table_id] = (table_meta['url'], table.section_title)

    return ori_table_id_to_urls


def main():
    """Build and save the table-ID -> (url, section title) mapping for WikiTQ tables."""
    data_dir = os.path.join(base_path, 'openwikitable_wikipedia')

    # Restrict the Open-WikiTable tables to those taken from WikiTQ.
    table_df = pd.read_json(os.path.join(data_dir, 'splitted_tables.json'))
    table_df = table_df[table_df['dataset'] == 'WikiTQ']

    # Lookup table used for efficient processing of the query datasets:
    # original table ID -> (page URL, section title).
    ori_table_id_to_urls = build_table_url_index(table_df, refer_base_path)

    os.makedirs(data_dir, exist_ok=True)
    # The (url, section_title) tuples are written as two-element JSON arrays.
    with open(os.path.join(data_dir, 'ori_table_id_to_urls.json'), 'w') as f:
        json.dump(ori_table_id_to_urls, f)

    print("Done!")


if __name__ == '__main__':
    main()