import os
import re
import html

# Matches an unescaped script snippet that starts with the literal text
# "[require([ 'jquery'" and runs up to the nearest following "</pre><div>".
#   group 1: the snippet text itself (lazy, so it stops at the first terminator)
#   group 2: the "</pre><div>" terminator
# DOTALL lets "." match newlines, so a snippet may span multiple lines.
snippet_pattern = re.compile(r"(\[require\(\[ \'jquery\'.*?)(</pre><div>)", re.DOTALL)


def fix_html_file(filepath):
    """Return the text of ``filepath`` with matched script snippets HTML-escaped.

    The file is read as UTF-8 and is not modified here; the corrected text is
    only returned. Within each "<h2>New Page</h2>"-delimited section, every
    match of the module-level ``snippet_pattern`` has its first group passed
    through ``html.escape`` while its second group is kept verbatim.

    Args:
        filepath: Path of the HTML file to read.

    Returns:
        The full document text with the replacements applied.
    """
    page_marker = "<h2>New Page</h2>"

    def _escape_snippet(match):
        # Escape only the snippet body; the trailing terminator stays markup.
        return html.escape(match.group(1)) + match.group(2)

    with open(filepath, "r", encoding="utf-8") as handle:
        sections = handle.read().split(page_marker)

    # Substituting section by section means a match can never span a marker.
    return page_marker.join(
        snippet_pattern.sub(_escape_snippet, section) for section in sections
    )


def fix_all_html_in_folder(folder):
    """Rewrite every ``.html`` file directly inside ``folder`` in place.

    Each matching regular file is passed to ``fix_html_file`` and then
    overwritten (UTF-8) with the text it returns. The extension check is
    case-insensitive. Subfolders are not descended into.

    Args:
        folder: Path of the directory to scan.

    Raises:
        OSError: If ``folder`` cannot be listed, or a file cannot be opened
            for writing.
    """
    for filename in os.listdir(folder):
        # Adjust the check below if your files have a different extension.
        if not filename.lower().endswith(".html"):
            continue

        path = os.path.join(folder, filename)
        # listdir() also returns directories; one named like "x.html" would
        # make open() raise and abort the run with only some files rewritten.
        if not os.path.isfile(path):
            continue

        fixed = fix_html_file(path)
        with open(path, "w", encoding="utf-8") as f:
            f.write(fixed)


# Script entry point: run the fixer over the saved HTML pages of one
# experiment run. The path is relative, so it resolves against the current
# working directory.
# NOTE(review): this call executes on import as well as on direct execution —
# confirm whether an `if __name__ == "__main__":` guard is wanted.
fix_all_html_in_folder(
    "experiments/gpt-4o-2024-08-06/____EXEC____mem=text-a_t=100____CRIT____m=2p_eval=tri_no_reset_1thread/shopping/htmls"
)
