#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
from collections import Counter, defaultdict
from copy import deepcopy
from pathlib import Path
from typing import Any


# Directory one level above the folder that contains this script
# (parents[1] of the resolved file path) -- presumably the repository root.
ROOT = Path(__file__).resolve().parents[1]
# Default folder holding the raw chapter JSON files.
# NOTE(review): hard-coded absolute path; pass --raw-file to read from elsewhere.
DEFAULT_RAW_DIR = Path("/root/workspace/lcy/Nesterov/raw_json")
# Default destination directory for the normalized JSON outputs.
DEFAULT_OUTPUT_DIR = ROOT / "data" / "Nesterov"


def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the normalization script.

    Returns:
        A namespace with ``raw_file`` (Path), ``output_dir`` (Path) and
        ``write_combined`` (bool).
    """
    summary = (
        "Normalize Nesterov raw_json files into orchestrator-friendly JSON files "
        "without changing the original mathematical content."
    )
    cli = argparse.ArgumentParser(description=summary)

    # Declared as (flag, keyword options) pairs; registration order is kept
    # so the generated --help output lists the options in the same order.
    option_table: tuple[tuple[str, dict[str, Any]], ...] = (
        (
            "--raw-file",
            {
                "type": Path,
                "default": DEFAULT_RAW_DIR / "Chapter1.json",
                "help": "Source raw JSON file. Default: /root/workspace/lcy/Nesterov/raw_json/Chapter1.json",
            },
        ),
        (
            "--output-dir",
            {
                "type": Path,
                "default": DEFAULT_OUTPUT_DIR,
                "help": "Directory for normalized outputs. Default: data/Nesterov",
            },
        ),
        (
            "--write-combined",
            {
                "action": "store_true",
                "help": "Also write a combined chapter-level JSON file.",
            },
        ),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    return cli.parse_args()


def _load_raw_items(path: Path) -> list[dict[str, Any]]:
    """Read *path* and return its top-level JSON array of objects.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark is
    tolerated.

    Raises:
        ValueError: If the document is not a JSON array, or if any element
            of the array is not a JSON object (the 1-based position of the
            first offender is reported).
    """
    payload = json.loads(path.read_text(encoding="utf-8-sig"))
    if not isinstance(payload, list):
        raise ValueError(f"Expected a JSON array in {path}, got {type(payload).__name__}")

    # Locate the first element that is not an object, if there is one.
    first_bad = next(
        (position for position, element in enumerate(payload, start=1) if not isinstance(element, dict)),
        None,
    )
    if first_bad is not None:
        raise ValueError(f"Entry #{first_bad} in {path} is not an object")

    # Return a fresh list so callers do not share the parsed container.
    return list(payload)


def _drop_blank_proof_fields(entry: dict[str, Any]) -> None:
    """Delete proof-like keys from *entry* whose value is an empty or whitespace-only string.

    Only ``proof``, ``proof_text``, ``formal_proof`` and ``informal_proof``
    are inspected; non-string values are left alone. *entry* is modified in
    place.
    """
    proof_keys = ("proof", "proof_text", "formal_proof", "informal_proof")
    # Decide first, delete afterwards, so the dict is not mutated mid-scan.
    blank_keys = [
        name
        for name in proof_keys
        if isinstance(entry.get(name), str) and not entry[name].strip()
    ]
    for name in blank_keys:
        del entry[name]


def _normalize_entry(entry: dict[str, Any], *, source_file: str) -> dict[str, Any]:
    """Return a normalized deep copy of *entry*; the input is not modified.

    The copy has blank proof fields removed, ``dependencies`` forced to a
    list (replaced by ``[]`` when absent or of another type), and two
    provenance keys added: ``source_file`` and ``source_raw_index`` (the
    original entry's ``index`` value, or ``None`` when it has none).
    """
    result = deepcopy(entry)
    _drop_blank_proof_fields(result)

    if not isinstance(result.get("dependencies"), list):
        result["dependencies"] = []

    # Provenance fields, added in this order.
    result.update(source_file=source_file, source_raw_index=entry.get("index"))
    return result


def _major_label_number(entry: dict[str, Any]) -> int | None:
    """Return the second element of ``entry["number_components"]`` when it is an int.

    Returns ``None`` when ``number_components`` is absent, is not a list,
    has fewer than two elements, or its second element is not an ``int``.
    """
    parts = entry.get("number_components")
    if not isinstance(parts, list) or len(parts) < 2:
        return None
    second = parts[1]
    return second if isinstance(second, int) else None


def _section_number(entry: dict[str, Any]) -> int:
    """Return the integer section number stored in ``entry["context"]``.

    ``context["section_number"]`` may be an ``int`` (returned as is) or a
    non-blank string, in which case the part before the first ``"."`` is
    converted with ``int`` (so ``"2.1"`` yields ``2``).

    Raises:
        ValueError: If ``context`` is absent or not a dict, if the section
            number is absent or blank ("Missing ..."), or if it is present
            but cannot be interpreted as an integer ("Invalid ...").
    """
    context = entry.get("context")
    # A context of any other JSON type is treated like a missing one, so the
    # caller gets the ValueError below rather than an AttributeError on .get().
    raw_value = context.get("section_number") if isinstance(context, dict) else None

    # bool is a subclass of int; JSON true/false is not a section number.
    if isinstance(raw_value, int) and not isinstance(raw_value, bool):
        return raw_value
    if isinstance(raw_value, str) and raw_value.strip():
        try:
            return int(raw_value.strip().split(".")[0])
        except ValueError as exc:
            raise ValueError(f"Invalid context.section_number={raw_value!r}") from exc
    if raw_value is not None and not isinstance(raw_value, str):
        # Present but of an unsupported type (bool, float, list, ...).
        raise ValueError(f"Invalid context.section_number={raw_value!r}")
    raise ValueError(f"Missing context.section_number in entry {entry.get('label', '<unknown>')}")


def _chapter_number(items: list[dict[str, Any]], raw_file: Path) -> int:
    """Return the chapter number recorded in the first item's ``context``.

    Only ``items[0]`` is consulted. ``context["chapter_number"]`` may be an
    ``int`` or a non-blank string convertible with ``int``.

    Args:
        items: Raw entries loaded from *raw_file*.
        raw_file: Path of the source file; used only in error messages.

    Raises:
        ValueError: If *items* is empty, if the first item's ``context`` is
            absent or not a dict, if the chapter number is absent or blank
            ("Missing ..."), or if it is present but cannot be interpreted
            as an integer ("Invalid ...").
    """
    if not items:
        raise ValueError(f"No items found in {raw_file}")

    context = items[0].get("context")
    # A non-dict context is treated like a missing one, so the caller gets
    # the ValueError below rather than an AttributeError on .get().
    raw_value = context.get("chapter_number") if isinstance(context, dict) else None

    # bool is a subclass of int; JSON true/false is not a chapter number.
    if isinstance(raw_value, int) and not isinstance(raw_value, bool):
        return raw_value
    if isinstance(raw_value, str) and raw_value.strip():
        try:
            return int(raw_value.strip())
        except ValueError as exc:
            raise ValueError(f"Invalid context.chapter_number={raw_value!r}") from exc
    if raw_value is not None and not isinstance(raw_value, str):
        # Present but of an unsupported type (bool, float, list, ...).
        raise ValueError(f"Invalid context.chapter_number={raw_value!r}")
    raise ValueError(f"Missing context.chapter_number in {raw_file}")


def _major_to_section_majority(items: list[dict[str, Any]]) -> dict[int, int]:
    """Map each major label number to the section number it appears under most often.

    Items for which ``_major_label_number`` returns ``None`` are ignored.
    For every other item the section number is obtained from
    ``_section_number`` (whose ``ValueError`` propagates) and counted as one
    vote for that item's major number.
    """
    tallies: dict[int, Counter[int]] = {}
    for record in items:
        label_major = _major_label_number(record)
        if label_major is not None:
            section = _section_number(record)
            tallies.setdefault(label_major, Counter())[section] += 1

    # most_common(1) yields the single (section, votes) pair with the top count.
    return {label_major: tally.most_common(1)[0][0] for label_major, tally in tallies.items()}


def _repair_section_number(entry: dict[str, Any], *, majority_by_major: dict[int, int]) -> None:
    """Overwrite the entry's section number with the majority value for its major label.

    Nothing happens when the entry has no major label number, when that
    number has no entry in *majority_by_major*, or when ``entry["context"]``
    is not a dict. Otherwise, if the stored ``section_number`` is not equal
    to the majority value, the stored value is saved under
    ``source_original_section_number`` and replaced. *entry* is modified in
    place.

    NOTE(review): the check is a plain ``==`` against the majority value, so
    a stored string such as ``"2"`` counts as different from the integer
    ``2`` and is replaced -- confirm this normalization is intended.
    """
    label_major = _major_label_number(entry)
    ctx = entry.get("context")

    can_repair = (
        label_major is not None
        and label_major in majority_by_major
        and isinstance(ctx, dict)
    )
    if not can_repair:
        return

    expected = majority_by_major[label_major]
    found = ctx.get("section_number")
    if found == expected:
        return

    # Keep the original value for traceability before overwriting it.
    entry["source_original_section_number"] = found
    ctx["section_number"] = expected


def _write_json(path: Path, items: list[dict[str, Any]]) -> None:
    """Serialize *items* to *path* as indented UTF-8 JSON followed by a newline.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is (``ensure_ascii=False``) with a two-space indent.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)

    rendered = json.dumps(items, indent=2, ensure_ascii=False)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(rendered + "\n")


def _with_reindexed_items(items: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return deep copies of *items* with ``index`` renumbered 1, 2, 3, ... in list order.

    The input list and its dicts are left untouched.
    """

    def _clone_with_index(position: int, original: dict[str, Any]) -> dict[str, Any]:
        clone = deepcopy(original)
        clone["index"] = position
        return clone

    return [_clone_with_index(position, original) for position, original in enumerate(items, start=1)]


def main() -> None:
    """Normalize one raw chapter file and write one JSON file per section.

    Steps: parse the CLI options, load the raw entries, read the chapter
    number, compute the majority section per major label number, normalize
    the entries in ascending ``index`` order (entries without ``index`` sort
    as 0), repair their section numbers, group them by section, and write
    ``sectionNN.json`` files with per-file renumbered indices. With
    ``--write-combined`` a ``ChapterNN.json`` containing all entries is also
    written. Each written path is reported on stdout.
    """
    cli = _parse_args()
    source_path = cli.raw_file.resolve()
    target_dir = cli.output_dir.resolve()

    raw_items = _load_raw_items(source_path)
    chapter = _chapter_number(raw_items, source_path)
    section_votes = _major_to_section_majority(raw_items)

    def _raw_index(record: dict[str, Any]) -> Any:
        return record.get("index", 0)

    # Normalize each entry, then repair its section number on the copy.
    cleaned: list[dict[str, Any]] = []
    for record in sorted(raw_items, key=_raw_index):
        tidy = _normalize_entry(record, source_file=source_path.name)
        _repair_section_number(tidy, majority_by_major=section_votes)
        cleaned.append(tidy)

    grouped: dict[int, list[dict[str, Any]]] = {}
    for tidy in cleaned:
        grouped.setdefault(_section_number(tidy), []).append(tidy)

    for number in sorted(grouped):
        members = grouped[number]
        destination = target_dir / f"section{number:02d}.json"
        _write_json(destination, _with_reindexed_items(members))
        print(f"Wrote {destination} ({len(members)} items)")

    if not cli.write_combined:
        return

    combined_path = target_dir / f"Chapter{chapter:02d}.json"
    _write_json(combined_path, _with_reindexed_items(cleaned))
    print(f"Wrote {combined_path} ({len(cleaned)} items)")


# Run the normalization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
