"""
Judge-disagreement taxonomy (v2).

For each Opus-equiv & elab-pass & GTED<0.5 case on V4-Pro x ProofNet#
under {Vanilla, Lean-Retry, SAF, Sample-Filter}, classify the rewrite
that drives GTED's false negative.

Taxonomy (three structural categories + residual):

  N. NOTATIONAL: surface-form rewrites that preserve definitional shape
                 - A -> B  vs  forall _ : A, B  (same Pi type)
                 - Real^n  vs  Fin n -> Real
                 - K^x     vs  Finset.univ \ {0}
                 - deriv^[n] vs iteratedDeriv n
                 - ZMod 2  vs  arbitrary field of size 2 (representative)
                 - different open/namespace prefix
                 - Function.Injective vs Injective

  I. IDIOMATIC: definitional alias / Mathlib formulation swap
                 - IsConj          vs  exists g, b = g*a*g^{-1}
                 - Summable        vs  Tendsto of partial sums
                 - CompactSpace    vs  IsCompact Set.univ
                 - LinearEquiv     vs  Nonempty (LinearEquiv ...) typeclass framing
                 - cofinal predicate vs Set.Infinite

  S. STRUCTURAL: binder / quantifier / typeclass restructuring
                 - let-binding vs explicit hypothesis
                 - instance binder vs hypothesis binder
                 - quantifier moved inside/outside existential
                 - negation of existence vs universal-with-False
                 - Iff direction swap, contrapositive

  Z. OTHER:     unclassified / NL-only mismatch / multi-category.

This version refines the regex rules and dumps the Z-cases of the
Lean-Retry method for manual inspection.
"""
from __future__ import annotations
import json, re, sys
import os
from pathlib import Path
from collections import Counter, defaultdict

# Emit UTF-8 on stdout: the category labels printed in the report contain
# non-ASCII symbols.
sys.stdout.reconfigure(encoding="utf-8")

# Input directories; each one can be overridden through an environment
# variable of the given name.
ROOT_RUNS = Path(os.getenv("RUNS_DIR", "../data/runs/proofnet_186/v4_pro"))
ROOT_JUDGE = Path(os.getenv("JUDGE_DIR", "../data/judge"))

# (display name, tag of the run file, tag used in the judge file names)
METHODS = [
    ("Vanilla", "B1", "b1"),
    ("Lean-Retry", "B3", "b3"),
    ("SAF", "B4", "b4"),
    ("Sample-Filter", "B5", "b5"),
]

def jl(path, bom=False):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the file; anything ``open`` accepts.
        bom: When true the file is decoded with ``utf-8-sig`` (a leading
            byte-order mark is dropped); otherwise with plain ``utf-8``.

    Returns:
        A list with one ``json.loads`` result per non-blank line, in file
        order.  Lines that are empty or whitespace-only are skipped.
    """
    enc = "utf-8-sig" if bom else "utf-8"
    # Read under a context manager so the handle is closed deterministically,
    # including when a line fails to parse.
    with open(path, encoding=enc) as fh:
        return [json.loads(line) for line in fh if line.strip()]

# For every method, index three JSONL tables by problem_id: the raw runs,
# the Opus judge verdicts (read with BOM tolerance) and the GTED scores.
per_method = {}
for method, run_tag, judge_tag in METHODS:
    run_by_pid = {}
    for rec in jl(ROOT_RUNS / f"{run_tag}.jsonl"):
        run_by_pid[rec["problem_id"]] = rec
    opus_by_pid = {}
    for rec in jl(ROOT_JUDGE / f"v4pro_{judge_tag}_opus.jsonl", bom=True):
        opus_by_pid[rec["problem_id"]] = rec
    gted_by_pid = {}
    for rec in jl(ROOT_JUDGE / f"v4pro_{judge_tag}_gted.jsonl"):
        gted_by_pid[rec["problem_id"]] = rec
    per_method[method] = (run_by_pid, opus_by_pid, gted_by_pid)

# Keep the cases where the judge marks the output as elaborating and
# equivalent while the GTED similarity is present and below 0.5.
candidates = defaultdict(list)
for method, (run_by_pid, opus_by_pid, gted_by_pid) in per_method.items():
    for pid, verdict in opus_by_pid.items():
        # "elab_ok" is read first; "equiv" is only read when it is truthy.
        if not (verdict["elab_ok"] and verdict["equiv"]):
            continue
        score = gted_by_pid.get(pid)
        if not score or score.get("ted_similarity") is None:
            continue
        sim = float(score["ted_similarity"])
        if sim >= 0.5:
            continue
        run = run_by_pid.get(pid, {})
        # Fall back to the raw model output when no Lean code was recorded.
        lean = run.get("lean_code", "") or run.get("raw_llm_output", "")
        candidates[method].append({
            "pid": pid,
            "sim": sim,
            "reason": verdict.get("reason", ""),
            "lean": lean,
        })

# Ordered rule table consumed by classify().  Each entry is
# (category code, short description, list of regex patterns).
# classify() lowercases its input before searching, so every pattern is
# written in lower case, and it returns the category of the first pattern
# that matches, so the group order I -> N -> S is also the priority order.
RULES = [
    ("I", "definitional alias / Mathlib formulation swap", [
        r"\bisconj\b",
        r"\bsummable\b.{0,80}(tendsto|partial sums|exists.*l|sum.*converge)",
        r"\btendsto\b.{0,80}(summable|series|sum)",
        r"compactspace.{0,80}univ",
        r"iscompact\s*(set\.)?\s*univ",
        r"locallycompactspace.{0,80}(not|nonempty)",
        r"linearequiv.{0,80}≃|≃.{0,80}linearequiv",
        r"\bnonempty\b.{0,80}(linearequiv|equiv|iso|exists)",
        r"set\.infinite.{0,80}cofinal|cofinal.{0,80}set\.infinite",
        r"(unfolds? to|reduces? to|defeq to|definitionally equal)",
        r"different (mathlib )?(idiom|formulation)",
        r"alternative (formulation|definition|idiom|characterization)",
        r"equivalent characterization",
        r"\b(commgroup|abelian)\b.{0,80}(pointwise|commut)",
        r"\bcharacteristic\b.{0,80}(auto|∀ ?φ|forall ?φ)",
        r"`?tendstouniformly`?",
        r"\bgenerated\s*from\b.{0,80}\bsinf\b|\bsinf\b.{0,80}\bgenerated\s*from\b",
        r"\bsinter\b.{0,80}\bgenerated\s*from\b|\bgenerated\s*from\b.{0,80}\bsinter\b",
        # NOTE(review): the module docstring lists deriv^[n]/iteratedDeriv and
        # K^x/Finset under N, but the next three patterns live in the I group
        # (and I is tried first) -- confirm which category is intended.
        r"deriv\^?\[\s*n\s*\]|iterated\s*deriv",
        r"(units|kˣ).{0,80}(finset|nonzero)",
        r"(finset|nonzero).{0,80}(units|kˣ)",
    ]),

    ("N", "notational / coercion / alias", [
        r"(same|equivalent|identical) (pi|π) (type|family)",
        r"(arrow|->|→).{0,40}same.{0,40}(forall|∀)",
        r"(forall|∀).{0,40}same.{0,40}(arrow|->|→)",
        r"`?a -> b`?.{0,40}notation for.{0,40}forall",
        # The power-notation alternatives are grouped so that they must be
        # followed by `fin n -> ℝ` (mirroring the next pattern).  Ungrouped,
        # the top-level `|` let a bare `ℝ^n` or `real^n` match on its own.
        r"`?(ℝ\^n|real\^n|\^n)`?.{0,40}fin n\s*(->|→)\s*(ℝ|real)",
        r"fin n\s*(->|→)\s*(ℝ|real).{0,40}(ℝ\^n|real\^n|\^n)",
        r"(zmod 2|𝔽\s*2).{0,40}(any |2-element |order 2|field of size 2)",
        r"(any |2-element |order 2|field of size 2).{0,40}(zmod 2|𝔽\s*2)",
        r"(gaussianint|gaussian integer|ℤ\[i\]|z\[i\])",
        r"function\.injective.{0,40}injective",
        r"injective.{0,40}function\.injective",
        r"\b(open|namespace)\b.{0,80}(prefix|different|alternative)",
        r"\bdifferent\b.{0,30}(notation|prefix|namespace)",
        r"\bnotation\b.{0,30}(differ|swap|alternative)",
        r"\b(coercion|coerce)\b",
        r"only difference is.{0,40}(notation|prefix|name|coerc)",
        r"token-identical|cosmetic (renaming|differ|differ)",
        r"variable order differs|variable renaming",
        r"identical statement.{0,40}(differ|notation)",
        r"both denote the same complex number",
        r"same complex number",
        r"same number\b",
        r"\bsame.{0,15}(literal|constant)\b",
    ]),

    ("S", "binder/quantifier/typeclass restructuring", [
        r"let-binding|let bindings?|inline let",
        r"hypothes(is|es).{0,80}(inline|hoist|moved|reorder|restructur|to premise|to binder)",
        r"(inline|hoist|moved|reorder|restructur|premise).{0,80}hypothes",
        r"(instance|typeclass|\[fact\b|\[gcdmonoid|\[field\b|\[ring\b|\[comm[a-z]+\b|\[module\b|\[fintype\b|\[group\b)",
        r"explicit\s+`?\[",
        r"(explicit|implicit) (typeclass|instance|argument|binder)",
        r"\[\w+\][^\]]{0,80}(hypothesis|premise|binder|argument)",
        r"(quantifier|∀|forall|∃|exists).{0,80}(inside|outside|moved|swap|reorder|permut)",
        r"(inside|outside|moved|swap|reorder|permut).{0,80}(quantifier|∀|forall|∃|exists)",
        r"per[- ]pair.{0,40}(weaker|stronger|equivalent)",
        r"(weaker|stronger) formulation",
        r"denies existence|negation of existence",
        r"cofinal(ity)?",
        r"universal with.{0,40}false",
        r"contrapositive",
        r"\biff\b.{0,40}(direction|swap|reverse|flip|biconditional)",
        r"different directions of the (same )?(iff|biconditional)",
        r"\bbiconditional\b.{0,40}(collapses?|direction|implication)",
        r"\bimplication\b.{0,40}biconditional|biconditional.{0,40}\bimplication\b",
        r"split.{0,40}(case|disjunct)",
        r"case split|case-split",
        r"\bif-then\b.{0,40}(split|case|≡)",
        # Encoding / subtype / indicator
        r"\bsubtype\b",
        r"\bindicator\b",
        r"continuouson",
        r"\b(directed|ℕ-indexed|nat-indexed|indexed family)\b",
        r"\b(preorder|partialorder|totalorder)\b",
        r"(⊆|⊂|⊇|subset|contained).{0,40}(equality|=|equal)",
        r"(equality|=|equal).{0,40}(⊆|⊂|⊇|subset|contained)",
        r"vacuous(ly true)?",
        r"(omit|drop|add)s? (the )?(hypothesis|premise|condition|requirement)",
        r"derive[ds]?\s+(fintype|existence|premise|automatic)",
        r"auto[- ]?bind",
        r"binder restructur",
        r"candidate.{0,200}(uses|states|adds|omits|spells out|expresses|gives).{0,200}gold.{0,200}(uses|states|adds|omits|spells out|expresses|gives).{0,80}(equivalent|same theorem|same nl|same statement|both formalize)",
        r"gold.{0,200}(uses|states|adds|omits|spells out).{0,200}candidate.{0,200}(uses|states|adds|omits|spells out).{0,80}(equivalent|same theorem|same nl)",
    ]),
]

# Human-readable label for each category code, used as the description
# column of the report tables.
CAT_NAMES = {
    "N": (
        "Notational "
        "(Pi/arrow; ℝ^n/Fin; coercion; namespace; cosmetic)"
    ),
    "I": (
        "Idiomatic "
        "(Mathlib def alias: IsConj/∃; Summable/Tendsto; "
        "CompactSpace/IsCompact univ)"
    ),
    "S": (
        "Structural "
        "(binder/quantifier/typeclass restructuring)"
    ),
    "Z": "Unclassified residual",
}

def classify(reason: str, lean_code: str) -> str:
    """Return the category code of the first rule matching a case.

    The judge's reason and the Lean code are joined with a newline and
    lowercased; the patterns in ``RULES`` are then searched in table order
    with ``re.DOTALL``.

    Args:
        reason: Free-text justification attached to the case.
        lean_code: Lean source (or raw model output) attached to the case.

    Returns:
        The category code of the first rule group containing a matching
        pattern, or ``"Z"`` when no pattern matches.
    """
    haystack = "\n".join((reason, lean_code)).lower()
    for label, _description, patterns in RULES:
        if any(re.search(pattern, haystack, flags=re.DOTALL) for pattern in patterns):
            return label
    return "Z"

# Tally the category of every candidate per method, and keep the matching
# (pid, sim, reason) tuples so individual cases can be listed afterwards.
result = defaultdict(Counter)
detail = defaultdict(lambda: defaultdict(list))
for name, _, _ in METHODS:
    for c in candidates[name]:
        cat = classify(c["reason"], c["lean"])
        result[name][cat] += 1
        detail[name][cat].append((c["pid"], c["sim"], c["reason"]))

CATS = ["N", "I", "S", "Z"]

# Size the description column to the longest category label (never narrower
# than the previous fixed width of 58).  Two of the labels are longer than
# 58 characters, which pushed the count columns of their rows out of line.
_desc_w = max([58] + [len(CAT_NAMES[cat]) for cat in CATS])
_header = (
    f"{'Cat':<4} {'Description':<{_desc_w}} "
    + " ".join(f"{m[:9]:>9}" for m, _, _ in METHODS)
)
_rule = "-" * max(108, len(_header))

print("=" * 78)
print("GTED false negatives (Opus equiv & elab_ok & GTED<0.5)")
print("=" * 78)
for name, _, _ in METHODS:
    print(f"  {name:<16}  n = {len(candidates[name])}")
print()

print("=" * 78)
print("Per-method counts by category")
print("=" * 78)
print()
print(_header)
print(_rule)
for cat in CATS:
    print(
        f"{cat:<4} {CAT_NAMES[cat]:<{_desc_w}} "
        + " ".join(f"{result[m][cat]:>9}" for m, _, _ in METHODS)
    )
print(
    f"{'TOT':<4} {'':<{_desc_w}} "
    + " ".join(f"{sum(result[m].values()):>9}" for m, _, _ in METHODS)
)

print()
print("Per-method share (% of method's false negatives):")
print()
print(_header)
print(_rule)
# Denominator per method; "or 1" avoids dividing by zero when a method has
# no false negatives (all its shares then print as 0.0%).
_totals = {m: sum(result[m].values()) or 1 for m, _, _ in METHODS}
for cat in CATS:
    row = f"{cat:<4} {CAT_NAMES[cat]:<{_desc_w}} "
    for m, _, _ in METHODS:
        row += f"{100.0*result[m][cat]/_totals[m]:>8.1f}% "
    print(row)

# Only the Lean-Retry residual is listed; reasons are cut to 300 characters.
print()
print("=" * 78)
print("Z-residual cases (Lean-Retry) -- to refine the rules or label manually")
print("=" * 78)
for pid, sim, reason in detail["Lean-Retry"]["Z"]:
    print(f"\n  {pid:<42} sim={sim:.2f}")
    print(f"    {reason[:300]}")
