import argparse
import json
import re
from pathlib import Path
from typing import Dict, List, Optional

HEADER_RE = re.compile(r"^\s{0,3}#{1,6}\s+.*$")
FENCE_JSON_START_RE = re.compile(r"^\s*```json\s*$", re.IGNORECASE)
FENCE_END_RE = re.compile(r"^\s*```\s*$")
CHECK_MARK = "✅"
CROSS_MARK = "❌"


def extract_json_sections_with_check(
    readme_path: str | Path, markdown_text: Optional[str] = None
) -> List[Dict]:
    """
    Extract all JSON code blocks that reside inside markdown sections whose header contains a ✅ symbol.

    A "section" is considered to start at a markdown header line (e.g., "## Section ...") and
    extends until the next header of the same or higher level. If the header line contains ✅, all
    fenced ```json code blocks inside that region will be parsed and returned.

    Args:
        readme_path: Path to the README.md file.
        markdown_text: Optional raw markdown text; if provided, this is used instead of reading the file.

    Returns:
        A list of dictionaries parsed from each qualifying JSON code block. Malformed JSON blocks are skipped.
    """
    if markdown_text is None:
        readme_path = Path(readme_path)
        text = readme_path.read_text(encoding="utf-8")
    else:
        text = markdown_text

    lines = text.splitlines()

    results: List[Dict] = []
    in_check_section = False

    i = 0
    while i < len(lines):
        line = lines[i]

        # Detect headers and set whether we're within a ✅ section
        if HEADER_RE.match(line):
            # New section encountered; decide flag based on header text
            header_text = line
            in_check_section = (
                CHECK_MARK in header_text and CROSS_MARK not in header_text
            )
            i += 1
            continue

        # If we're inside a ✅ section, look for fenced JSON blocks
        if in_check_section and FENCE_JSON_START_RE.match(line):
            i += 1  # move past the opening fence
            block_lines: List[str] = []
            while i < len(lines) and not FENCE_END_RE.match(lines[i]):
                block_lines.append(lines[i])
                i += 1
            # i is at closing fence or EOF; advance one more to skip the fence if present
            # (if EOF without closing fence, we'll just try to parse whatever we have)
            if i < len(lines) and FENCE_END_RE.match(lines[i]):
                i += 1

            block_text = "\n".join(block_lines).strip()
            if block_text:
                try:
                    parsed = json.loads(block_text)
                    if isinstance(parsed, dict):
                        results.append(parsed)
                    else:
                        # Only collect dict blocks, ignore arrays/primitives
                        pass
                except json.JSONDecodeError:
                    # Skip malformed JSON blocks quietly
                    pass
            continue

        i += 1

    return results


def _upgrade_task(task: Dict) -> Dict:
    """Return *task* in the upgraded layout, or unchanged if it lacks the legacy keys."""
    if "expected_data" not in task or "eval_function" not in task:
        return task

    upgraded = {
        "task_id": task["task_id"],
        "expected_backend_state": [
            {
                "eval_func": {
                    "name": task["eval_function"],
                    "eval_params": task["expected_data"],
                }
            }
        ],
    }
    # "changelogs" is the only other key carried over from the legacy entry.
    if "changelogs" in task:
        upgraded["changelogs"] = task["changelogs"]
    return upgraded


def upgrade_format():
    """Read gitlab.tasks.override.json, upgrade each task, and write gitlab.tasks.override2.json.

    Tasks that carry both "expected_data" and "eval_function" are rewritten so that
    those values live under "expected_backend_state"; every other task is copied
    through unchanged. Both files are located in the "assets" directory five levels
    above this file, and the output path is printed when done.

    Raises:
        FileNotFoundError: If the input file does not exist.
        json.JSONDecodeError: If the input file is not valid JSON.
        KeyError: If a legacy-format task has no "task_id".
    """
    assets_dir = Path(__file__).parent.parent.parent.parent.parent / "assets"
    input_path = assets_dir / "gitlab.tasks.override.json"
    output_path = assets_dir / "gitlab.tasks.override2.json"

    # Explicit UTF-8: the platform default encoding is not reliable for JSON.
    with open(input_path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    transformed_tasks = [_upgrade_task(task) for task in tasks]

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(transformed_tasks, f, indent=4)

    print(f"Transformed file written to {output_path}")


def _print_untriaged_tasks() -> None:
    """Print (intent_template_id, task_id) for README tasks in ✅ sections that are not triaged."""
    readme_path = str(Path(__file__).parent / "README.md")
    try:
        blocks = extract_json_sections_with_check(readme_path)
    except FileNotFoundError:
        print(
            f"README not found at {readme_path}. Provide the file path to extract_json_sections_with_check()."
        )
        return

    triaged_task_ids = {
        389,
        # ...
    }

    # Map every integer task id to the template id of the block that lists it;
    # if a task id appears in several blocks, the last block wins.
    task_to_template: Dict[int, int] = {}
    for block in blocks:
        intent_template_id = block.get("intent_template_id")
        task_ids = block.get("task_ids")
        if intent_template_id is None or not isinstance(task_ids, list):
            continue
        for tid in task_ids:
            if isinstance(tid, int):
                task_to_template[tid] = int(intent_template_id)

    # Tuples sort by template id first, then task id.
    not_triaged_pairs = sorted(
        (template_id, tid)
        for tid, template_id in task_to_template.items()
        if tid not in triaged_task_ids
    )
    for intent_id, task_id in not_triaged_pairs:
        print(f"intent_template_id={intent_id}, task_id={task_id}")


def _print_unreferenced_gitlab_tasks() -> None:
    """Print gitlab-only "program_html" tasks from test.raw.json that have no override entry."""
    assets_dir = Path(__file__).parent.parent.parent.parent.parent / "assets"

    with open(assets_dir / "gitlab.tasks.override.json", "r", encoding="utf-8") as f:
        override_task_ids = {task["task_id"] for task in json.load(f)}

    with open(assets_dir / "test.raw.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    unreferenced_tasks = []
    for item in data:
        # Missing or null "sites" / "eval" / "eval_types" are treated as empty
        # instead of crashing on len(None) or None.get(...).
        sites = item.get("sites") or []
        eval_types = (item.get("eval") or {}).get("eval_types") or []
        if (
            len(sites) == 1
            and "gitlab" in sites
            and item.get("task_id") not in override_task_ids
            and "program_html" in eval_types
        ):
            unreferenced_tasks.append(item)

    if unreferenced_tasks:
        print(json.dumps(unreferenced_tasks, indent=2))
    else:
        print("No unreferenced tasks with 'program_html' eval_type found.")


def main():
    """Command-line entry point: run the action selected by the given flag.

    Exactly one action runs per invocation. When several flags are passed the
    precedence is --readme-to-json, then --upgrade-format, then
    --raw-json-checker. With no flag the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description="Scripts for processing GitLab evaluation data."
    )
    parser.add_argument(
        "--readme-to-json",
        action="store_true",
        help="Extract JSON from README and print non-triaged task IDs.",
    )
    parser.add_argument(
        "--raw-json-checker",
        action="store_true",
        help='Read test.raw.json and filter for objects with a "gitlab" key.',
    )
    parser.add_argument(
        "--upgrade-format",
        action="store_true",
        help="Upgrade the format of gitlab.tasks.override.json.",
    )

    args = parser.parse_args()

    if args.readme_to_json:
        _print_untriaged_tasks()
    elif args.upgrade_format:
        upgrade_format()
    elif args.raw_json_checker:
        _print_unreferenced_gitlab_tasks()
    else:
        parser.print_help()


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
