import urllib.request as request
import re, json, os

# Maps a contest name as it appears in wiki page URLs (see the URL built in
# crawl_data_process) to the short name used for the output directory, the
# output file name and the "contest" field of each saved record.
# The "Fall_*" keys share a short name with the regular contest of the same
# level; crawl_data_process tells their files apart with a "_Fall" infix.
# main() iterates this dict, so key order is also the crawl order.
contests = {
    "AMC_8": "AMC8",
    "AMC_10A": "AMC10A",
    "AMC_10B": "AMC10B",
    "AMC_12A": "AMC12A",
    "AMC_12B": "AMC12B",
    "AIME_I": "AIME1",
    "AIME_II": "AIME2",
    "Fall_AMC_10A": "AMC10A",
    "Fall_AMC_10B": "AMC10B",
    "Fall_AMC_12A": "AMC12A",
    "Fall_AMC_12B": "AMC12B",
}


# One rendered-LaTeX <img> tag served from the AoPS LaTeX host. The dots in
# the host name are escaped so they only match literal dots.
_LATEX_IMG_PATTERN = re.compile(
    r"<img src=\"//latex\.artofproblemsolving\.com/.*?/>"
)
# Captures the value of the first alt="..." attribute inside such a tag.
_ALT_TEXT_PATTERN = re.compile(r"alt=\"(.*?)\"")


def latex_plain(line):
    """Replace rendered-LaTeX image tags in *line* with their alt text.

    Every ``<img src="//latex.artofproblemsolving.com/... />`` tag is swapped
    for the content of its ``alt`` attribute. All other text is returned
    unchanged.

    Args:
        line: Text (possibly spanning several lines) that may contain
            LaTeX image tags.

    Returns:
        The text with each matching tag replaced by its alt text. A matching
        tag that has no ``alt`` attribute is left in place instead of raising,
        so one odd image cannot cause the caller to discard the whole text.
    """

    def _alt_or_tag(match):
        tag = match.group(0)
        alt = _ALT_TEXT_PATTERN.search(tag)
        return alt.group(1) if alt else tag

    # A callable replacement is inserted verbatim, so backslashes in the
    # LaTeX source are not interpreted as regex escapes.
    return _LATEX_IMG_PATTERN.sub(_alt_or_tag, line)


def _section_to_text(section_html):
    """Convert one regex-captured page section into newline-joined text.

    Paragraph tags are removed, the first and last lines are dropped, empty
    lines are discarded and LaTeX image tags are replaced via latex_plain.
    Returns "" if that LaTeX conversion fails.
    """
    without_paragraph_tags = section_html.replace("</p>", "").replace("<p>", "")
    # The captured text starts inside the headline <span> and ends with the
    # "<h2>" that terminated the match, so the first and last lines are
    # heading markup rather than section content.
    content_lines = [
        line for line in without_paragraph_tags.split("\n")[1:-1] if line
    ]
    try:
        return latex_plain("\n".join(content_lines))
    except Exception:
        # Best effort: keep the record with an empty field rather than
        # failing the whole page. Unlike a bare except, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return ""


def crawl_data(url):
    """Download one wiki problem page and extract its problem and solutions.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict with the key "problem" and one "solution_<i>" key (0-based)
        for every solution section matched on the page. Each value is the
        section text with LaTeX images replaced by their alt text, or "" if
        that conversion failed.

    Raises:
        IndexError: If the page contains no "Problem" headline section.
        Exception: Whatever urlopen or UTF-8 decoding raises is propagated.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url) as page:
        html_text = page.read().decode("utf-8")

    problem_pattern = r"<span class=\"mw-headline\" id=\"Problem.*?<h2>"
    problem_sections = re.findall(problem_pattern, html_text, re.DOTALL)
    if not problem_sections:
        # Same exception type the unchecked [0] lookup used to raise, but
        # with a message that says what was missing.
        raise IndexError(f"no Problem section found at {url}")
    extract_data = {"problem": _section_to_text(problem_sections[0])}

    solution_pattern = r"<span class=\"mw-headline\" id=\"Solution.*?<h2>"
    solution_sections = re.findall(solution_pattern, html_text, re.DOTALL)
    # Progress output: number of solution sections found on this page.
    print(len(solution_sections))
    for idx, section in enumerate(solution_sections):
        extract_data[f"solution_{idx}"] = _section_to_text(section)

    return extract_data


def crawl_data_process(contest, year, problem_index):
    """Crawl one problem page and save the extracted record as JSON.

    The record returned by crawl_data is extended with "year", "contest"
    (the short name from ``contests``) and "problem_index", then written to
    ``data/<short>/<short>_<year>_<problem_index>.json``. Contests whose key
    starts with "Fall" get a ``_Fall`` infix after the year, so they do not
    overwrite the regular contest that shares the same short name.

    Args:
        contest: Key of ``contests``, as used in the wiki URL.
        year: Contest year used in the URL and the file name.
        problem_index: Problem number used in the URL and the file name.

    Returns:
        None. Any ordinary failure (download, parsing, unknown contest key,
        file write) is reported by printing a "Fail" line and is not raised.
        KeyboardInterrupt and SystemExit propagate, so a long crawl can be
        interrupted.
    """
    url = f"https://artofproblemsolving.com/wiki/index.php/{year}_{contest}_Problems/Problem_{problem_index}"
    try:
        extract_data = crawl_data(url)
        short_name = contests[contest]
        extract_data["year"] = year
        extract_data["contest"] = short_name
        extract_data["problem_index"] = problem_index

        season_infix = "_Fall" if contest.startswith("Fall") else ""
        out_path = (
            f"data/{short_name}/"
            f"{short_name}_{year}{season_infix}_{problem_index}.json"
        )
        with open(out_path, "w") as outfile:
            json.dump(extract_data, outfile)
    except Exception:
        # Deliberately broad: one missing or malformed page must not stop the
        # crawl. "except Exception" (not a bare except) keeps Ctrl-C working.
        print(f"Fail: {year} {contest} {problem_index}")


def main():
    """Crawl every configured contest, year and problem number.

    For each entry of ``contests`` the output directory ``data/<short name>``
    is created if needed, then problems 1 through 25 of every year from 2010
    through 2022 are passed to crawl_data_process.
    """
    years = range(2010, 2023)
    problem_numbers = range(1, 26)
    for wiki_name, short_name in contests.items():
        os.makedirs(f"data/{short_name}", exist_ok=True)
        for year in years:
            for number in problem_numbers:
                crawl_data_process(wiki_name, year, number)


# Start the full crawl only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
