import json
import os
import pandas as pd
from tqdm import tqdm

base_path = ''


# Characters ignored when comparing section names: double quotes, spaces,
# en dashes and hyphens.
_SECTION_NAME_STRIP_TABLE = str.maketrans('', '', '" –-')

# Manual corrections applied to *normalized* evidence section names before
# they are looked up in the document KB.
_EVIDENCE_SECTION_OVERRIDES = {
    # Exception case where the query evidence section name is slightly wrong.
    'Career.:Postwarcareer,19501959.': 'Career.:Postwarcareer,19461959.',
    # The document seems to have been changed, so we manually assign it.
    # 'https://en.wikipedia.org/w/index.php?title=Ludwig%20van%20Beethoven&oldid=947564916'
    'Lifeandcareer.:Personalandfamilydifficulties.':
        'Lifeandcareer.:Vienna17921802.',
    'Firsttermasprimeminister:19401945.:Mentalandphysicalhealth.':
        'Personallife.:Marriageandchildren.',
}


def _normalize_section_name(name):
    """Return *name* with double quotes, spaces, en dashes and hyphens removed."""
    return name.translate(_SECTION_NAME_STRIP_TABLE)


def replace_section_name_to_id(doc_kb, split='train'):
    """Replace each query's evidence section name with its section id.

    Reads ``{split}_name.csv`` from ``base_path``, looks up every row's
    ``evidence_section_name`` among the ``section_hie_titles`` of the document
    stored in ``doc_kb`` under the row's ``wikipedia_url``, and writes
    ``{split}_clean.csv`` (same directory) in which the
    ``evidence_section_name`` column is replaced by ``evidence_section_id``.
    The id is the index of the first section whose normalized title equals
    the normalized evidence section name.

    Args:
        doc_kb: Mapping from wikipedia URL to a document dict providing the
            ``'section_hie_titles'`` and ``'section_titles'`` sequences.
            NOTE: a document whose first hierarchical title is empty is
            patched in place (its ``'section_hie_titles'`` entry is replaced
            by its ``'section_titles'``).
        split: Dataset split name used to build the input/output file names.

    Raises:
        ValueError: If no section of the document matches a query's evidence
            section name.
    """
    queries_df = pd.read_csv(os.path.join(base_path, f'{split}_name.csv'))
    evidence_section_id_list = []

    for query in tqdm(queries_df.itertuples(index=False), total=len(queries_df)):
        wiki_url = query.wikipedia_url
        document = doc_kb[wiki_url]

        # Temporal remedy on 'https://en.wikipedia.org/w/index.php?title=Gillespie%20Road&oldid=896710914'
        if document['section_hie_titles'][0] == '':
            document['section_hie_titles'] = document['section_titles']

        section_names = document['section_hie_titles']

        evidence_sec_name = _normalize_section_name(query.evidence_section_name)
        # Apply the manual correction, if any, for this normalized name.
        evidence_sec_name = _EVIDENCE_SECTION_OVERRIDES.get(
            evidence_sec_name, evidence_sec_name)

        # Index of the first section whose normalized title matches.
        matching_id = next(
            (sec_id for sec_id, section_name in enumerate(section_names)
             if _normalize_section_name(section_name) == evidence_sec_name),
            None,
        )

        # Explicit check instead of `assert`, which is skipped under
        # `python -O` and would let a missing id end up in the output file.
        if matching_id is None:
            raise ValueError(
                f'No section matching evidence section name '
                f'{query.evidence_section_name!r} (normalized: '
                f'{evidence_sec_name!r}) found for {wiki_url!r} '
                f'in split {split!r}.'
            )
        evidence_section_id_list.append(matching_id)

    queries_df = queries_df.drop(columns='evidence_section_name')
    queries_df['evidence_section_id'] = evidence_section_id_list

    queries_df.to_csv(os.path.join(base_path, f'{split}_clean.csv'), index=False)


# Replace each query's evidence section name with the id of the matching
# section in the document KB. Section names are simplified (quotes, spaces
# and dashes removed) before they are compared.
if __name__ == '__main__':

    kb_path = os.path.join(base_path, 'viquae_wikipedia', 'viquae_kb_wiki_200k.json')
    # Open in binary mode inside a context manager: the handle is closed
    # deterministically, and `json.load` detects the UTF encoding itself
    # instead of depending on the platform's default text encoding.
    with open(kb_path, 'rb') as kb_file:
        document_kb = json.load(kb_file)

    # Convert every dataset split against the same document KB.
    for split_name in ('train', 'validation', 'test'):
        replace_section_name_to_id(doc_kb=document_kb, split=split_name)

    print('Done!')