# -*- coding: utf-8 -*-
from . import file_utils
import re


# Key for Wikipedia eval is the question id. Key for web eval is the
# 'question_id--filename' string built by get_question_doc_string.
def get_key_to_ground_truth(data):
    """Map each evaluation key of ``data`` to its ground-truth answer.

    When ``data['Domain']`` is ``'Wikipedia'`` the key is the question id of
    each entry in ``data['Data']``. For any other domain the mapping is
    delegated to ``get_qd_to_answer``.
    """
    if data['Domain'] != 'Wikipedia':
        return get_qd_to_answer(data)

    ground_truth = {}
    for entry in data['Data']:
        question_id = entry['QuestionId']
        ground_truth[question_id] = entry['Answer']
    return ground_truth


def get_question_doc_string(qid, doc_name):
    """Return the ``<qid>--<doc_name>`` key for a question/document pair."""
    key_template = '{}--{}'
    return key_template.format(qid, doc_name)

def get_qd_to_answer(data):
    """Map every question/document key in ``data['Data']`` to its answer.

    For each entry, the pages listed under ``'EntityPages'`` and
    ``'SearchResults'`` (either may be absent) are visited in that order.
    The key for a page is built by ``get_question_doc_string`` from the
    entry's ``'QuestionId'`` and the page's ``'Filename'``; the value is the
    entry's ``'Answer'``.
    """
    answers_by_key = {}
    for entry in data['Data']:
        entity_pages = entry.get('EntityPages', [])
        search_pages = entry.get('SearchResults', [])
        for doc in entity_pages + search_pages:
            key = get_question_doc_string(entry['QuestionId'], doc['Filename'])
            answers_by_key[key] = entry['Answer']
    return answers_by_key


def read_clean_part(datum):
    """Keep only the pages of ``datum`` flagged as part of the verified eval.

    The ``'EntityPages'`` and ``'SearchResults'`` entries are overwritten in
    place (a missing key becomes an empty list) with the pages whose
    ``'DocPartOfVerifiedEval'`` value is truthy. Asserts that at least one
    page remains, then returns the same ``datum`` object.
    """
    for field in ('EntityPages', 'SearchResults'):
        datum[field] = [
            doc for doc in datum.get(field, []) if doc['DocPartOfVerifiedEval']
        ]
    remaining = len(datum['EntityPages']) + len(datum['SearchResults'])
    assert remaining > 0
    return datum


def read_triviaqa_data(qajson):
    """Load the QA json at ``qajson`` and restrict it to the verified subset.

    The file is read with ``file_utils.read_json``. When its
    ``'VerifiedEval'`` value is truthy, ``data['Data']`` is replaced by the
    entries whose ``'QuestionPartOfVerifiedEval'`` value is truthy; for the
    ``'Web'`` domain each kept entry is additionally passed through
    ``read_clean_part``. Otherwise the data is returned as loaded.
    """
    data = file_utils.read_json(qajson)
    if not data['VerifiedEval']:
        return data

    # Verified eval: keep only questions (and, for Web, documents) that
    # belong to the clean data set.
    verified_entries = []
    for entry in data['Data']:
        if not entry['QuestionPartOfVerifiedEval']:
            continue
        if data['Domain'] == 'Web':
            entry = read_clean_part(entry)
        verified_entries.append(entry)
    data['Data'] = verified_entries
    return data

'''
"Answer": {
                "Aliases": [
                    "The Swiss Miss",
                    "Martina hingis",
                    "Martina Hingisová",
                    "Martina Hingis",
                    "MartinaHingis",
                    "Martina Hingisova",
                    "Hingis"
                ],
                "MatchedWikiEntityName": "Martina Hingis",
                "NormalizedAliases": [
                    "hingis",
                    "swiss miss",
                    "martina hingis",
                    "martina hingisova",
                    "martinahingis",
                    "martina hingisová"
                ],
                "NormalizedMatchedWikiEntityName": "martina hingis",
                "NormalizedValue": "martina hingis",
                "Type": "WikipediaEntity",
                "Value": "Martina Hingis"
            },
'''
def answer_index_in_document(answer, document):
    """Locate every occurrence of the answer's normalized aliases in ``document``.

    Args:
        answer: Answer dict; only its ``'NormalizedAliases'`` list of strings
            is read.
        document: Text to search.

    Returns:
        A list of ``{'text': alias, 'answer_start': index}`` dicts, one per
        non-overlapping, case-insensitive occurrence. Results are grouped by
        alias in alias-list order and, within an alias, ordered by position.
        The list is empty when no alias occurs in the document.
    """
    answers_in_doc = []
    for alias in answer['NormalizedAliases']:
        if not alias:
            # An empty pattern matches at every offset of the document and
            # would flood the result with meaningless zero-length answers.
            continue
        # Aliases are literal text, not regular expressions: escape them so
        # metacharacters (e.g. "c++", "a.b", "(") neither raise re.error nor
        # match unintended text.
        literal_pattern = re.escape(alias)
        # The original author notes that, per the TriviaQA paper, each
        # document contains the answer; if no alias occurs in the document
        # the loop below simply adds nothing.
        for match in re.finditer(literal_pattern, document, flags=re.IGNORECASE):
            answers_in_doc.append({
                'text': alias,
                'answer_start': match.start(),
            })
    return answers_in_doc
