import argparse
import json
import os
import re

def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. Leaving it as ``None`` keeps the usual
            command-line behaviour; passing a list allows the parser to be
            driven programmatically.

    Returns:
        argparse.Namespace with the attributes ``path_to_mquake`` and
        ``dataset_name``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--path_to_mquake',
        type=str,
        default='../MQuAKE/',
        help='Location of the dataset JSON file, ideally with a trailing '
             'path separator (default: %(default)s).',
    )
    parser.add_argument(
        '--dataset_name',
        type=str,
        default='MQuAKE-CF-3k-V2',
        help='Dataset file name without the .json extension '
             '(default: %(default)s).',
    )
    return parser.parse_args(argv)

def manual_change(current_word, q, current_span):
    """Append the span of ``current_word`` inside ``q`` to ``current_span``.

    The lookup is case-insensitive and treats ``current_word`` as literal
    text, so characters such as ``(`` or ``.`` are not interpreted as regular
    expression syntax.

    Args:
        current_word: Word or phrase to locate.
        q: Question text to search in.
        current_span: List of ``(start, end)`` tuples collected so far. It is
            extended in place.

    Returns:
        The same ``current_span`` list. The span of the first occurrence is
        appended when one exists; otherwise the list is returned unchanged.
        Offsets refer to the lower-cased question.
    """
    # Escape after lower-casing so the needle is matched literally.
    found = re.search(re.escape(current_word.lower()), q.lower())
    if found is not None:
        current_span.append(found.span())
    return current_span

# Matches a slash together with any spaces directly around it.
_SLASH_RE = re.compile(r' */ *')

# Ordered fallback rules, consulted when a relationship word does not occur
# verbatim in a question. Each rule is
#     (trigger kind, triggers, key kind, accepted keys)
# * trigger kind 'word':   fires when the current question word is one of the
#                          triggers; the word itself is located in the question.
# * trigger kind 'phrase': fires when a trigger is a substring of the question;
#                          that phrase is located in the question.
# * key kind 'pattern':    the missing relationship word must be an accepted key.
# * key kind 'r':          the full relationship name must be an accepted key.
# Order matters: for every question word only the first matching rule is used.
_FALLBACK_RULES = (
    ('word', ('created', 'developed', 'developer', 'author', 'founding'),
     'pattern', ('creator',)),
    ('phrase', ('born in',), 'r', ('place of birth',)),
    ('word', ('born',), 'r', ('place of birth',)),
    ('word', ('profession', 'professional', 'job', 'role'),
     'pattern', ('occupation',)),
    ('word', ('manufactured', 'produced', 'production', 'produces',
              'created', 'producing', 'made'),
     'pattern', ('manufacturer',)),
    ('word', ('citizen', 'nationality'), 'pattern', ('citizenship',)),
    ('phrase', ('for a living',), 'pattern', ('occupation',)),
    ('phrase', ('literary category',), 'pattern', ('genre',)),
    ('word', ('specializes', 'specialized', 'specialize'),
     'pattern', ('speciality',)),
    ('word', ('partner',), 'pattern', ('spouse',)),
    ('word', ('Where',), 'pattern', ('location',)),
    ('word', ('language', 'language(s)'), 'pattern', ('languages',)),
    ('word', ('founder', 'established', 'originate', 'formed'),
     'pattern', ('founded',)),
    ('phrase', ('religious organization',), 'r', ('religion or worldview',)),
    ('phrase', ('religious leader',), 'r', ('religion or worldview',)),
    ('word', ('place', 'headquarters', 'established', 'originate'),
     'r', ('location of formation',)),
    # 'Chief Executive Officer' cannot equal a single space-separated word;
    # it is kept so the rule list mirrors the original conditions.
    ('word', ('CEO', 'Chief Executive Officer'),
     'r', ('chief executive officer',)),
    ('phrase', ('highest office',), 'r', ('head of state',)),
    ('phrase', ('chief executive',), 'r', ('head of government',)),
    ('word', ('leader',), 'r', ('head of state', 'head of government')),
    ('word', ('employed', 'worked', 'work', 'employs'),
     'pattern', ('employer',)),
    ('word', ('performed',), 'pattern', ('performer',)),
    ('phrase', ('mind behind',), 'pattern', ('creator',)),
    ('phrase', ('in charge of',), 'pattern', ('manager',)),
    ('word', ('manages',), 'pattern', ('manager',)),
    ('phrase', ('religious faith',), 'r', ('religion or worldview',)),
    ('phrase', ('religious affiliation',), 'r', ('religion or worldview',)),
    ('word', ('city',), 'pattern', ('capital',)),
    ('word', ('die', 'Where'), 'r', ('place of death',)),
    ('phrase', ('pass away',), 'r', ('place of death',)),
    ('phrase', ('passed away',), 'r', ('place of death',)),
    ('phrase', ('specialize in',), 'pattern', ('genre',)),
    ('word', ('job', 'creator'), 'pattern', ('author',)),
    ('phrase', ('institution location',), 'pattern', ('genre',)),
    ('phrase', ('developed by',), 'pattern', ('developer',)),
    ('word', ('creator', 'developer', 'developed'),
     'pattern', ('developer',)),
    ('word', ('chaired',), 'pattern', ('chairperson',)),
    # Questions are slash-normalised before matching, so the two unspaced
    # variants below are not expected to fire; they are kept for fidelity.
    ('phrase', ('manager/director',), 'r', ('director / manager',)),
    ('phrase', ('director/manager',), 'r', ('director / manager',)),
    ('phrase', ('manager / director',), 'r', ('director / manager',)),
    ('word', ('institution', 'school', 'education'),
     'pattern', ('educated',)),
)


def _literal_span(text, q):
    """Return the span of the first case-insensitive hit of ``text`` in ``q``.

    ``text`` is matched literally (no regular-expression syntax). The result
    is a ``(start, end)`` tuple of offsets into the lower-cased question, or
    ``None`` when ``text`` does not occur.
    """
    needle = text.lower()
    start = q.lower().find(needle)
    if start < 0:
        return None
    return (start, start + len(needle))


def _fallback_text(word, q, r, pattern):
    """Return the text of the first fallback rule that applies, else ``None``.

    Args:
        word: Current word of the question (commas and ``?`` removed).
        q: Full question text.
        r: Full relationship name.
        pattern: Relationship word that was not found verbatim in ``q``.
    """
    for trigger_kind, triggers, key_kind, keys in _FALLBACK_RULES:
        key = pattern if key_kind == 'pattern' else r
        if key not in keys:
            continue
        if trigger_kind == 'word':
            if word in triggers:
                return word
        else:
            for phrase in triggers:
                if phrase in q:
                    return phrase
    return None


def _relationship_spans(r, q):
    """Return the sorted, de-duplicated spans of relationship ``r`` in ``q``.

    A verbatim (case-insensitive) occurrence of ``r`` yields a single span.
    Otherwise ``r`` is reduced to its content words and each word is located
    separately, using ``_FALLBACK_RULES`` for words that do not occur.
    An empty list means nothing could be located.
    """
    whole = _literal_span(r, q)
    if whole is not None:
        return [whole]

    # Strip filler words and punctuation, in the same order as before.
    reduced = r
    for filler in (' of ', ' / ', ' or ', ' on ', ' at '):
        reduced = reduced.replace(filler, ' ')
    reduced = reduced.replace(',', '')
    for filler in (' by', 'original ', ' on', ' at'):
        reduced = reduced.replace(filler, '')

    q_words = q.replace(',', '').replace('?', '').split(' ')
    current_span = []
    for pattern in reduced.split(' '):
        if not pattern:
            # An empty pattern would "match" at (0, 0) and hide a real miss.
            continue
        partial = _literal_span(pattern, q)
        if partial is not None:
            current_span.append(partial)
            continue
        for word in q_words:
            text = _fallback_text(word, q, r, pattern)
            if text is None:
                continue
            extra = 0
            if text == 'language(s)':
                # Look up the bare word and widen the span afterwards.
                # NOTE(review): '(s)' is three characters, so +2 leaves the
                # closing parenthesis outside the span - confirm the intent.
                text = 'language'
                extra = 2
            current_span = manual_change(text, q, current_span)
            if extra:
                start, end = current_span[-1]
                current_span[-1] = (start, end + extra)
    return sorted(set(current_span))


def main(args):
    """Locate relationship mentions in the dataset questions and save them.

    Reads ``<args.path_to_mquake>/<args.dataset_name>.json``. For every case
    it takes the relationship names from ``c['orig']['triples_labeled']``
    (second element of each triple) and looks for them in ``c['questions']``.
    Questions in which not every relationship was located are dropped, and a
    case whose questions were all dropped is skipped entirely. The result is
    written to ``<args.dataset_name>_train.json`` in the current working
    directory and the number of kept cases is printed.

    Args:
        args: Namespace with ``path_to_mquake`` and ``dataset_name``.
    """
    # os.path.join works with or without a trailing separator on the directory.
    source_path = os.path.join(args.path_to_mquake,
                               f'{args.dataset_name}.json')
    with open(source_path, 'r', encoding='utf-8') as file:
        ds = json.load(file)

    new_ds = []
    # The case number counts every input case, including skipped ones.
    for case_count, c in enumerate(ds, start=1):
        relationships = [t[1] for t in c['orig']['triples_labeled']]

        questions = c['questions']
        for i, q in enumerate(questions):
            # Put exactly one space on each side of every slash.
            questions[i] = _SLASH_RE.sub(' / ', q)

        rq = {f'q{i + 1}': {} for i in range(len(questions))}
        for j, r in enumerate(relationships):
            for i, q in enumerate(questions):
                spans = _relationship_spans(r, q)
                if spans:
                    rq[f'q{i + 1}'][f'r{j + 1}'] = spans

        # 1-based numbers of questions that miss at least one relationship.
        remove_idx = [
            i + 1 for i in range(len(questions))
            if len(rq[f'q{i + 1}']) != len(relationships)
        ]
        if len(remove_idx) == len(questions):
            continue
        # Pop from the back so earlier positions stay valid. The keys of the
        # remaining rq entries keep their original question numbers.
        for idx in reversed(remove_idx):
            questions.pop(idx - 1)
            del rq[f'q{idx}']

        new_ds.append({'case': case_count,
                       'questions': questions,
                       'relationships': relationships,
                       'relationship_indices': rq})

    print(f'Dataset Size: {len(new_ds)}')

    # Mode 'w' already truncates the file, so no explicit truncate is needed.
    with open(f'{args.dataset_name}_train.json', 'w', encoding='utf-8') as file:
        json.dump(new_ds, file, indent=4)

if __name__ == '__main__':
    # Run the conversion only when executed as a script, not on import.
    cli_args = parse_args()
    main(cli_args)
