"""Extract the syntactic region of a Lean output that a perturbation targets.

For number/symbol edits the perturbation lives either in the theorem statement
or the proof body. The LLM judge for FR/RR/OUR should only ever see that
region — exposing the rest of the Lean output to the judge is the dominant
failure mode we observed in validation:

  - The model output usually starts with a ``/- ... -/`` comment block that
    re-states the original NL problem. That block contains old/new value
    candidates and tempts the judge to extract from there.
  - For a statement edit, the proof body is irrelevant and can contain
    arbitrary intermediate numbers (in ``norm_num`` steps, etc.) that
    confuse the judge.
  - Symmetrically, for a proof edit the signature is irrelevant.

This module provides ``extract_region(lean_output, edit_source)`` returning
the substring the judge should consider — empty if the output is degenerate
(no theorem declaration found, or no proof body for a proof edit).
"""
from __future__ import annotations

import re

# Either comment opener Lean recognises when *not* already inside a comment.
_COMMENT_START_RE = re.compile(r"/-|--")
# The only two tokens that matter inside a block comment (they nest in Lean 4).
_BLOCK_DELIM_RE = re.compile(r"/-|-/")
_BY_RE = re.compile(r":=\s*by\b")
# The look-arounds stop the keyword from matching inside a longer identifier
# such as ``helper_lemma``, ``Foo.theorem`` or ``lemma'``.
_DECL_RE = re.compile(r"(?<![\w.])(?:theorem|lemma|example)(?![\w'])")


def _strip_comments(lean: str) -> str:
    """Return ``lean`` with block and line comments removed.

    Block comments (``/- ... -/``, including ``/-- ... -/`` docstrings) are
    tracked with a depth counter so nested comments are removed as a unit.
    Line comments (``-- ...``) are removed up to, but not including, the
    newline so line structure is preserved. An unterminated block comment
    swallows the rest of the input.
    """
    kept: list[str] = []
    pos = 0
    while True:
        opener = _COMMENT_START_RE.search(lean, pos)
        if opener is None:
            kept.append(lean[pos:])
            break
        kept.append(lean[pos:opener.start()])
        if opener.group() == "--":
            newline = lean.find("\n", opener.end())
            if newline == -1:
                # Comment runs to end of input; nothing left to keep.
                break
            pos = newline
            continue
        # Block comment: skip forward until the matching close delimiter.
        depth = 1
        pos = opener.end()
        while depth:
            delim = _BLOCK_DELIM_RE.search(lean, pos)
            if delim is None:
                return "".join(kept)
            depth += 1 if delim.group() == "/-" else -1
            pos = delim.end()
    return "".join(kept)


def split_signature_proof(lean: str) -> tuple[str, str]:
    """Return ``(signature, proof_body)`` for a Lean 4 declaration.

    All comments are removed first (nested ``/- ... -/`` blocks and ``--``
    line comments), so neither the NL restatement at the top of the output
    nor commentary inside the proof can leak into either component. The
    first ``theorem|lemma|example`` keyword that stands alone as a word
    starts the signature; the text is then split at the first ``:= by``.

    The signature runs from the keyword up to (excluding) ``:= by``; the
    proof body is everything after it. Neither component is whitespace
    stripped. If there is no ``:= by`` the whole remainder is returned as the
    signature with an empty proof body. ``("", "")`` is returned when ``lean``
    is empty or contains no declaration keyword outside comments.
    """
    if not lean:
        return "", ""
    stripped = _strip_comments(lean)
    decl = _DECL_RE.search(stripped)
    if decl is None:
        return "", ""
    region = stripped[decl.start():]
    by = _BY_RE.search(region)
    if by is None:
        return region, ""
    return region[:by.start()], region[by.end():]


def extract_region(lean_output: str, edit_source: str) -> str:
    """Pick the part of a Lean output that the FR/RR/OUR judge may inspect.

    ``lean_output`` is handed to ``split_signature_proof``. When
    ``edit_source`` equals ``"statement"`` the signature is selected; any
    other value (``"proof"`` is the expected one) selects the proof body.

    The selected text is returned with surrounding whitespace stripped so it
    can be inlined into a judge prompt. An empty string marks a degenerate
    output; callers should map it to a deterministic "other" without calling
    the judge.
    """
    signature, proof_body = split_signature_proof(lean_output)
    if edit_source == "statement":
        selected = signature
    else:
        selected = proof_body
    return selected.strip()
