import glob
import json
import multiprocessing
import os
import random
import re

import htmlmin
import tqdm

from utils import request  # as in our root utils.py

# Extra keyword arguments forwarded verbatim to every `request(...)` call made
# by this script.
request_kwargs = dict()  # TODO: fill in yours

# Prompt template, read once at import time from the `prompts/` directory that
# sits next to this file. `summarize_functionality` substitutes the page HTML
# for the `{{{HTML}}}` placeholder. The encoding is pinned so that loading does
# not depend on the platform's default locale (assumes the template is UTF-8).
with open(
    os.path.join(os.path.dirname(__file__), "prompts/summarize_functionality.md"),
    "r",
    encoding="utf-8",
) as f:
    PROMPT_SUMMARIZE_FUNCTIONALITY = f.read()


# Captures the rest of the line that follows the "Website purpose:" label.
_PURPOSE_PATTERN = re.compile(r"Website purpose:\s*(.+)")

# Captures each numbered item ("1. ...", "2. ...") on a single line. An item
# only matches when it is followed by another numbered item or by the end of
# the text, so trailing non-list lines make the last item fail to match.
_FEATURES_PATTERN = re.compile(r"(?m)^\s*\d+\.\s+(.*?)(?=\n\s*\d+\.|\Z)")

# Number of features a well-formed reply must list.
_EXPECTED_FEATURE_COUNT = 3


def _parse_website_structure(text):
    """Extract the purpose line and the numbered features from a model reply.

    Only the section between the first and second ``---`` delimiter (or from
    the first delimiter to the end of the text) is inspected.

    Args:
        text: Raw reply text.

    Returns:
        ``{"purpose": str | None, "features": list[str]}`` where ``purpose``
        is ``None`` when no "Website purpose:" line is found and ``features``
        always holds exactly three stripped strings.

    Raises:
        ValueError: If the reply has no ``---`` delimiter or does not list
            exactly three numbered features.
    """
    sections = text.split('---')
    if len(sections) < 2:
        raise ValueError("response does not contain a '---' delimited section")
    body = sections[1].strip()

    purpose_match = _PURPOSE_PATTERN.search(body)
    features = [item.strip() for item in _FEATURES_PATTERN.findall(body)]

    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would let malformed replies through to callers that
    # index three features.
    if len(features) != _EXPECTED_FEATURE_COUNT:
        raise ValueError(
            "expected {} numbered features, found {}".format(
                _EXPECTED_FEATURE_COUNT, len(features)
            )
        )

    return {
        "purpose": purpose_match.group(1).strip() if purpose_match else None,
        "features": features,
    }


def summarize_functionality(src):
    """Ask the model to summarize a page's purpose and features.

    The HTML is substituted for the ``{{{HTML}}}`` placeholder of
    ``PROMPT_SUMMARIZE_FUNCTIONALITY`` and sent through ``request`` as a
    single user message containing one text part.

    Args:
        src: HTML source of the page, as a string.

    Returns:
        ``{"purpose": str | None, "features": [str, str, str]}`` parsed from
        the reply.

    Raises:
        ValueError: If the reply does not follow the expected layout (see
            ``_parse_website_structure``).
        Exception: Whatever ``request`` itself raises is propagated unchanged.
    """
    prompt = PROMPT_SUMMARIZE_FUNCTIONALITY.replace("{{{HTML}}}", src)
    messages = [{"role": "user", "content": [
        {"type": "text", "text": prompt},
    ]}]
    # NOTE(review): wait_if_fail=0 / n_retry=1 presumably mean "one attempt,
    # no back-off" -- confirm against utils.request.
    response = request(messages, wait_if_fail=0, n_retry=1, **request_kwargs)
    return _parse_website_structure(response)


def main_single_file(fname):
    """Summarize one HTML file and format the result as plain text.

    Args:
        fname: Path of the HTML file to read.

    Returns:
        A tuple ``(basename, summary, text)``. ``summary`` is the dict
        returned by ``summarize_functionality`` and ``text`` is the purpose
        followed by a three-item "Features:" bullet list. Both are ``None``
        when summarization fails or yields fewer than three features.
    """
    name = os.path.basename(fname)

    with open(fname) as f:
        src = f.read().strip()

    # Minification is best-effort: if it fails, the raw HTML is sent as-is.
    # `Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit
    # still propagate instead of being silently ignored.
    try:
        src = htmlmin.minify(src)
    except Exception:
        pass

    # Any summarization failure is reported as (name, None, None) so that one
    # bad file does not abort the surrounding batch.
    try:
        summary = summarize_functionality(src)
    except Exception:
        return name, None, None

    features = summary['features']
    if len(features) < 3:
        # The text layout below needs three features; treat a short list as a
        # failed summary rather than raising IndexError.
        return name, None, None

    text = '{}\nFeatures:\n* {}\n* {}\n* {}'.format(
        summary['purpose'], features[0], features[1], features[2]
    )
    return name, summary, text


def main(max_files=10000, num_workers=32):
    """Summarize crawled HTML files and append the results to a JSONL file.

    Each line of ``./outputs/c4/summary.jsonl`` is a JSON array
    ``[basename, summary, text]`` as returned by ``main_single_file``. Files
    whose basename already appears in that file are skipped, so the script
    can be re-run to resume; entries recorded with ``None`` results count as
    done and are not retried.

    Args:
        max_files: How many files to keep from the front of the shuffled
            list before already-processed ones are filtered out.
        num_workers: Number of worker processes in the pool.
    """
    fname = './outputs/c4/summary.jsonl'

    # Basenames already present in the output file.
    done_names = set()
    if os.path.exists(fname):
        with open(fname) as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the output file
                k, _summary, _text = json.loads(line)
                done_names.add(k)

    files = glob.glob('./outputs/c4/crawl/*.html')
    # Fixed seed so the shuffle is repeatable for a given glob ordering.
    random.seed(42)
    random.shuffle(files)

    files = files[:max_files]  # cap the run to the first `max_files` shuffled files
    files = [f for f in files if os.path.basename(f) not in done_names]
    if not files:
        return  # nothing left to do; avoid creating the pool or the output file

    # The output file is opened once and flushed after every record so that
    # finished results are written out even if the run is interrupted.
    with open(fname, 'a') as out, multiprocessing.Pool(num_workers) as p:
        for k, v, t in tqdm.tqdm(p.imap(main_single_file, files), total=len(files)):
            out.write(json.dumps([k, v, t]) + '\n')
            out.flush()


# Run the pipeline only when this file is executed as a script. The guard also
# matters for multiprocessing start methods that re-import the main module in
# each worker: without it every worker would call main() again.
if __name__ == "__main__":
    main()
