"""
This script extracts the imports, formal statement, and formal proof of each
theorem from Lean sources (a single file, a folder, an import list, or a JSON dump).
"""

import json
import json
import os
import re

HEAD = "import Mathlib\nimport Aesop\n\nset_option maxHeartbeats 0\n\nopen BigOperators Real Nat Topology Rat\n"

# Old identifier -> replacement identifier, applied by ``upgrade_theorem``.
# NOTE(review): presumably tracks lemma renames in the targeted Mathlib
# version -- verify the entries whenever that version changes.
_UPGRADE_TABLE = {
    "Real.sqrt_eq_iff_sq_eq": "Real.sqrt_eq_iff_eq_sq",
    "le_div_iff": "le_div_iff₀",
    "div_le_iff": "div_le_iff₀",
    "Int.coe_nat_dvd": "Int.natCast_dvd_natCast",
    "add_right_neg": "add_neg_cancel",
    "add_left_neg": "neg_add_cancel",
    "true_and_iff": "true_and",
    "div_le_div_of_le_of_nonneg": "div_le_div_of_nonneg_right",
    "div_le_div_of_le_left": "div_le_div_of_nonneg_left",
    "ZMod.nat_cast_self": "ZMod.natCast_self",
    "Real.rpow_nat_cast": "Real.rpow_natCast",
    "one_le_pow_of_one_le": "one_le_pow₀",
    "Rat.ext_iff": "NNRat.ext_iff",
    "ZMod.val_nat_cast": "ZMod.val_natCast",
    "Nat.pow_lt_pow_of_lt_right": "pow_lt_pow_right",
    "pi_gt_3141592": "pi_gt_d6",
    "div_le_div_of_le": "div_le_div_of_nonneg_right",
    "pow_lt_pow_of_lt_left": "pow_lt_pow_left",
    "Finset.exists_smaller_set": "Finset.exists_subset_card_eq",
    "exists_smaller_set": "exists_subset_card_eq",
    "Set.subset_iff": "subset_iff",
    "Set.card_eq_fintype_card": "Nat.card_eq_fintype_card",
}

# (compiled pattern, replacement) pairs, built once at import time instead of
# on every call. Longest keys come first so a fully qualified key is rewritten
# as a whole before any shorter key is tried; the sort is stable, so keys of
# equal length keep their order from the table.
_UPGRADE_RULES = [
    (re.compile(r'\b' + re.escape(old_name) + r'\b'), new_name)
    for old_name, new_name in sorted(
        _UPGRADE_TABLE.items(), key=lambda entry: len(entry[0]), reverse=True
    )
]


def upgrade_theorem(theorem: str) -> str:
    """Rewrite old lemma identifiers in ``theorem`` to their replacements.

    Every key of the rename table is substituted wherever it appears
    delimited by regex word boundaries. A key embedded in a longer
    identifier (for example ``my_le_div_iff_x``) is therefore left alone,
    while a key that directly follows or precedes a ``.`` still matches,
    because ``.`` is not a word character.

    Args:
        theorem: Lean source text (a statement or a proof).

    Returns:
        The text with all table entries applied, in longest-key-first order.
    """
    for pattern, replacement in _UPGRADE_RULES:
        theorem = pattern.sub(replacement, theorem)
    return theorem

def parse_theorem(name: str, lines: list[str]) -> dict:
    """Split Lean source lines into preamble, theorem statement and proof.

    Lines before the first line whose first word is ``theorem`` form the
    preamble. From that line up to and including the first line that ends
    with ``:= by`` (any amount of whitespace, including none, between ``:=``
    and ``by``) is the statement. Everything after that is the proof. If no
    line ends that way, all remaining lines stay in the statement and the
    proof is empty.

    Args:
        name: Identifier stored unchanged under the ``"name"`` key.
        lines: Source lines, each expected to keep its line ending, since the
            sections are rebuilt with a plain ``"".join``.

    Returns:
        A dict with keys ``"name"``, ``"formal_statement"``, ``"formal_proof"``
        and ``"imports"``. Statement and proof are passed through
        ``upgrade_theorem``. ``"imports"`` is the preamble text, or ``HEAD``
        when the preamble is empty or contains only whitespace.
    """
    preamble: list[str] = []
    statement: list[str] = []
    proof: list[str] = []
    sections = (preamble, statement, proof)
    current = 0
    for line in lines:
        stripped = line.strip()
        # Match ``theorem`` as a whole word so that e.g. ``theorems ...`` in a
        # comment or an identifier like ``theorem_foo`` does not start a statement.
        if re.match(r"theorem\b", stripped):
            current = 1
        sections[current].append(line)
        # The line carrying ``:= by`` still belongs to the statement; the proof
        # starts on the following line.
        if current == 1 and re.search(r":=\s*by$", stripped):
            current = 2
    header = "".join(preamble)
    # A blank-only preamble carries no imports, so use the default header.
    imports = header if header.strip() else HEAD
    return {
        "name": name,
        "formal_statement": upgrade_theorem("".join(statement)),
        "formal_proof": upgrade_theorem("".join(proof)),
        "imports": imports,
    }

def parse_from_leanfile(file_path: str) -> dict:
    """Parse a single ``.lean`` file with ``parse_theorem``.

    The entry name is the file's base name with a trailing ``.lean`` removed.

    Args:
        file_path: Path of the Lean file to read.

    Returns:
        The dict produced by ``parse_theorem`` for the file's lines.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # basename honours the platform separator (paths built with os.path.join),
    # and removesuffix strips only the extension, not every ".lean" substring.
    name = os.path.basename(file_path).removesuffix(".lean")
    # Lean sources use non-ASCII symbols; do not rely on the locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    return parse_theorem(name, lines)

def parse_from_import(path: str) -> list[dict]:
    """Parse every module listed by the ``import`` lines of a Lean file.

    Each line of the form ``import A.B.C`` is mapped to the relative path
    ``A/B/C.lean`` (resolved against the current working directory) and
    parsed with ``parse_from_leanfile``. All other lines are ignored.

    Args:
        path: Path of the file containing the ``import`` lines.

    Returns:
        One parsed entry per import line, in file order.

    Raises:
        OSError: If ``path`` or one of the referenced files cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    data = []
    for line in lines:
        if not line.startswith("import"):
            continue
        # Split on any whitespace run so tabs or repeated spaces do not yield
        # an empty module name; skip a bare "import" and words such as
        # "important" that merely share the prefix.
        tokens = line.split()
        if len(tokens) < 2 or tokens[0] != "import":
            continue
        file_path = tokens[1].replace(".", "/") + ".lean"
        data.append(parse_from_leanfile(file_path))
    return data

def parse_from_folder(path: str) -> list[dict]:
    """Parse every ``.lean`` file found under ``path``, recursively.

    Directories and files are visited in sorted order so the result does not
    depend on the order in which the filesystem lists entries.

    Args:
        path: Root directory to walk.

    Returns:
        One entry per ``.lean`` file, as produced by ``parse_from_leanfile``.
    """
    data = []
    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend into subdirectories in order.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.endswith(".lean"):
                data.append(parse_from_leanfile(os.path.join(root, file_name)))
    return data

def parse_from_json(path: str) -> list[dict]:
    """Parse theorems stored in a JSON file.

    The file must contain a sequence of objects, each with a ``"name"`` and a
    ``"full_code"`` key. ``full_code`` is split into lines (line endings
    kept) and passed to ``parse_theorem``.

    Args:
        path: Path of the JSON file.

    Returns:
        One parsed entry per item, in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If an item lacks ``"name"`` or ``"full_code"``.
    """
    # Read as UTF-8 explicitly: the embedded Lean code contains non-ASCII
    # symbols and the locale default encoding is not guaranteed to handle them.
    with open(path, "r", encoding="utf-8") as f:
        items = json.load(f)
    return [
        parse_theorem(item["name"], item["full_code"].splitlines(keepends=True))
        for item in items
    ]
    
# Disabled alternative run: extract from an import list.
# data = parse_from_import("data/Leanworkbook.lean")
# with open("data/Leanworkbook.json", "w") as f:
#     json.dump(data, f, indent=2)

# Active run: extract the miniF2F solutions and write them as JSON.
# NOTE(review): these statements execute on import as well as when the file is
# run as a script; consider an ``if __name__ == "__main__":`` guard once it is
# confirmed that nothing imports this module for its side effect.
data = parse_from_json("datasets/minif2f-solutions/minif2f_v1.json")
# Create the output directory up front so the write cannot fail on a fresh checkout.
os.makedirs("datasets/extracted", exist_ok=True)
# ensure_ascii=False emits raw Unicode, so the file handle must be UTF-8.
with open("datasets/extracted/minif2f_theorems.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)


# Disabled alternative run: extract the Putnam solutions.
# data = parse_from_json("datasets/putnam-solutions/putnam_v1.json")
# with open("datasets/extracted/putnam_theorems.json", "w") as f:
#     json.dump(data, f, indent=2, ensure_ascii=False)