import urllib.request as request
import re, orjson, os
import numpy as np

# Directory in which main() creates the JSONL output file.
ANSWER_DIR = "/fs/cml-projects/E2H/AMC/problems"


def _years(first, stop):
    """Return the years ``first`` .. ``stop - 1`` as a fresh list of NumPy integers."""
    return list(np.arange(first, stop))


# Contest name as it appears in the wiki URL -> [prefix used when building
# result names, list of years to crawl].  The AMC_8 list skips 2021.
contests = {
    "AMC_8": ["AMC8", _years(1999, 2021) + _years(2022, 2023)],
    "AMC_10A": ["AMC10A", _years(2003, 2023)],
    "AMC_10B": ["AMC10B", _years(2003, 2023)],
    "AMC_12A": ["AMC12A", _years(2003, 2023)],
    "AMC_12B": ["AMC12B", _years(2003, 2023)],
    "Fall_AMC_10A": ["AMC10A", _years(2021, 2022)],
    "Fall_AMC_10B": ["AMC10B", _years(2021, 2022)],
    "Fall_AMC_12A": ["AMC12A", _years(2021, 2022)],
    "Fall_AMC_12B": ["AMC12B", _years(2021, 2022)],
}


def parse_options(option_str):
    """Split a LaTeX answer-choice string into its individual options.

    The separators ``\\qquad`` and ``\\quad`` are tried in that order.  The
    first one that occurs exactly 4 or 5 times in ``option_str`` is used:
    every ``$`` is stripped from the string, which is then split on that
    separator.

    Returns:
        The first five pieces of the split, or ``None`` when neither
        separator occurs 4 or 5 times.
    """
    for separator in ("\\qquad", "\\quad"):
        occurrences = option_str.count(separator)
        if occurrences == 4 or occurrences == 5:
            without_dollars = option_str.replace("$", "")
            return without_dollars.split(separator)[:5]
    return None


def _locate_problem_spans(str_lines):
    """Find, for every problem on a problems page, the lines holding its text.

    Args:
        str_lines: The page HTML split into lines.

    Returns:
        A list with one ``(start, end)`` tuple per problem; position ``k``
        describes problem ``k + 1``.  ``start`` is the line index of the
        "Problem N" heading and ``end`` the line index that closes the
        problem: its "Solution" link if one was seen, otherwise the next
        problem's heading, otherwise the next problem's "Solution" link.
        Either value is ``None`` when it could not be determined; problems
        whose heading never appears are reported as ``(None, None)``.

    Raises:
        ValueError: If a heading id of the form ``Problem_<x>`` has an ``<x>``
            that is not an integer.
    """
    start_pattern = r"<h.><span class=\"mw-headline\" id=\"Problem_(?P<index0>.*?)\">Problem (?P<index1>.*?)</span></h.>"
    solution_pattern = r"<a href=\".*?\">\s*Solution"

    # Both lists are indexed by (problem number - 1); None marks "not seen".
    start_index = []
    solution_index = []
    for idx, str_line in enumerate(str_lines):
        heading = re.match(start_pattern, str_line)
        if heading:
            problem_number = int(heading.group("index0"))
            if problem_number <= len(start_index):
                # Repeated or out-of-order heading: keep the slot already
                # assigned so later problems stay aligned with their number.
                continue
            # Pad for skipped problem numbers BEFORE storing this heading so
            # that it lands in the slot matching its own number.
            start_index += [None] * (problem_number - len(start_index) - 1)
            start_index.append(idx)
        elif re.search(solution_pattern, str_line):
            missing = len(start_index) - len(solution_index) - 1
            # missing < 0 means there is no heading still waiting for a
            # solution link (link before any heading, or a second link for
            # the same problem); ignoring it keeps the two lists aligned.
            if missing >= 0:
                solution_index += [None] * missing
                solution_index.append(idx)

    def line_at(indexes, position):
        # Bounds-safe lookup: positions past the end count as "not seen".
        return indexes[position] if position < len(indexes) else None

    spans = []
    for position, start in enumerate(start_index):
        if start is None:
            spans.append((None, None))
            continue
        candidates = (
            line_at(solution_index, position),
            line_at(start_index, position + 1),
            line_at(solution_index, position + 1),
        )
        end = next((line for line in candidates if line is not None), None)
        spans.append((start, end))
    return spans


def crawl_answer_option(url, name):
    """Download a problems page and extract the answer options of each problem.

    The page is fetched from ``url`` (which is also printed), decoded as
    UTF-8 and split into lines.  For every problem the lines strictly between
    its heading and its closing line are scanned from the bottom up for
    ``latex.artofproblemsolving.com`` images; the ``alt`` text of each image
    is passed to ``parse_options`` and the first non-``None`` result is kept.

    Args:
        url: Address of the problems page.
        name: Prefix for the result keys.

    Returns:
        A dict mapping ``f"{name}_{n}"`` (``n`` being the 1-based problem
        number) to the parsed list of options, or to ``None`` when the
        problem or its options could not be located.
    """
    print(url)
    # Close the response as soon as the body has been read.
    with request.urlopen(url) as page:
        strings = str(page.read(), encoding='utf-8')
    str_lines = strings.split("\n")

    pattern = r".*?<img src=\"//latex.artofproblemsolving.com(?P<source>.*?)\" class=\"latex\" alt=\"\$(?P<options>.*?)\$\""

    answer_keys = {}
    for position, (start, end) in enumerate(_locate_problem_spans(str_lines)):
        key = f'{name}_{position+1}'
        answer_keys[key] = None
        if start is None or end is None:
            continue
        # Walk upward from the line just above `end`, stopping before the
        # heading line itself.
        for line_number in range(end - 1, start, -1):
            parsed = [
                parse_options(found.group("options"))
                for found in re.finditer(pattern, str_lines[line_number])
            ]
            usable = [options for options in parsed if options is not None]
            if usable:
                answer_keys[key] = usable[0]
                break
    return answer_keys


def crawl_answer_key(url, name):
    """Download an answer-key page and return the answer of each problem.

    The page is fetched from ``url`` and decoded as UTF-8.  The answers are
    read from the ``<li>`` items of the first ``<ol>...</ol>`` block, in
    document order.

    Args:
        url: Address of the answer-key page.
        name: Prefix for the result keys.

    Returns:
        A dict mapping ``f"{name}_{n}"`` (``n`` being the 1-based position of
        the item in the list) to the text of that ``<li>`` item.

    Raises:
        IndexError: If the page contains no ``<ol>...</ol>`` block.
    """
    # Close the response as soon as the body has been read.
    with request.urlopen(url) as page:
        strings = str(page.read(), encoding='utf-8')

    answer_block = re.findall(r"<ol>.*?</ol>", strings, re.DOTALL)[0]

    # Collect the items from the whole block instead of assuming one <li> per
    # line, so an "<ol>" or "</ol>" tag on a line of its own is harmless.
    answer_keys = {}
    for number, item in enumerate(re.findall(r"<li>.*?</li>", answer_block), start=1):
        answer_keys[f'{name}_{number}'] = item.replace('<li>', '').replace('</li>', '')

    return answer_keys


def crawl_data_process(contest, year):
    """Crawl both wiki pages of one contest edition.

    Args:
        contest: A key of the module-level ``contests`` mapping; it is also
            inserted into the wiki URL.
        year: The contest year, inserted into the URL and the result names.

    Returns:
        A tuple ``(answer_key_dict, answer_option_dict)``: the result of
        ``crawl_answer_key`` on the ``_Answer_Key`` page and the result of
        ``crawl_answer_option`` on the ``_Problems`` page.
    """
    # Contests whose name starts with "Fall" get a "_Fall" marker appended to
    # their result names.
    suffix = "_Fall" if contest.startswith("Fall") else ""
    result_prefix = f"{contests[contest][0]}_{year}{suffix}"
    url_base = f"https://artofproblemsolving.com/wiki/index.php/{year}_{contest}"

    # The problems page is fetched first, then the answer key.
    answer_option_dict = crawl_answer_option(f"{url_base}_Problems", result_prefix)
    answer_key_dict = crawl_answer_key(f"{url_base}_Answer_Key", result_prefix)
    return answer_key_dict, answer_option_dict
    

def main():
    """Crawl every contest/year in ``contests`` and write the results as JSONL.

    One JSON object per problem is written to
    ``AMC_answer_option_line.jsonl`` inside ``ANSWER_DIR``, with the fields
    ``result_name``, ``answer_key`` and ``option``.  The set of problems is
    taken from the answer key; ``option`` is ``None`` when the problems page
    produced no entry for that problem.
    """
    output_path = f"{ANSWER_DIR}/AMC_answer_option_line.jsonl"
    # orjson produces UTF-8, so pin the file encoding instead of relying on
    # the platform default.
    with open(output_path, "w", encoding="utf-8") as wf:
        for contest, contest_value in contests.items():
            for year in contest_value[1]:
                answer_key_dict, answer_option_dict = crawl_data_process(contest, year)
                for key, answer in answer_key_dict.items():
                    problem_dict = {
                        "result_name": key,
                        "answer_key": answer,
                        # The problems page may yield fewer entries than the
                        # answer key; a missing entry must not abort the crawl.
                        "option": answer_option_dict.get(key),
                    }
                    json_line = orjson.dumps(problem_dict, option=orjson.OPT_NAIVE_UTC | orjson.OPT_SERIALIZE_NUMPY)
                    wf.write(f"{str(json_line, encoding='utf-8')}\n")


# Start the crawl only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()