"""Auto-eval every classifier_*.joblib and emit a leaderboard.md.

Walks the classifier results directory, runs in-dist + calibration + OOD
math evaluations on each bundle, parses the metrics, and writes a single
markdown ranking table.
"""
from __future__ import annotations

import argparse
import json
import re
import subprocess
import sys
from pathlib import Path

import joblib

# Classifier results directory: the bundles are globbed from here, and the
# silver data files, per-bundle eval reports and default leaderboard path are
# all built relative to it.
RD = Path("<PROJECT_DIR>/results/exploration_analysis/llm_validation/classifier")


# Patterns for the headline numbers in an eval report. Each one expects the
# metric label, exactly one separator character (":", "=" or a space), optional
# whitespace, and then a decimal number with a fractional part. The macro-F1
# and kappa patterns additionally tolerate markdown bold markers ("**") around
# the number.
# NOTE(review): none of these match a signed value or an integer without a
# decimal point, and a " = " separator (space, equals, space) does not match
# either -- confirm that the reports never use those forms.
METRIC_RE = re.compile(r"macro-F1[:= ]\s*\*?\*?(\d+\.\d+)\*?\*?")
WEIGHTED_RE = re.compile(r"weighted-F1[:= ]\s*(\d+\.\d+)")
KAPPA_RE = re.compile(r"kappa[:= ]\s*\*?\*?(\d+\.\d+)\*?\*?")


def parse_md_metric(md_path: Path, key: str = "macro") -> float | None:
    """Return the first occurrence of a metric in a markdown report.

    Args:
        md_path: Report file to scan.
        key: Which metric to extract: "macro", "weighted" or "kappa".

    Returns:
        The parsed value, or None when the file does not exist, ``key`` is
        not one of the supported names, or the pattern is not found.
    """
    if not md_path.exists():
        return None
    report = md_path.read_text()

    pattern_by_key = {
        "macro": METRIC_RE,
        "weighted": WEIGHTED_RE,
        "kappa": KAPPA_RE,
    }
    pattern = pattern_by_key.get(key)
    if pattern is None:
        return None

    hit = pattern.search(report)
    if hit is None:
        return None
    return float(hit.group(1))


def parse_calibration(md_path: Path) -> dict:
    """Pull one score per row out of the pairwise summary table.

    A row is any line that contains both "|" and "↔", has at least four
    non-empty cells, and whose second cell ends in "%". The first decimal
    number found in the third cell is stored under the row's first cell.

    Example row: | classifier ↔ R1-SC | 71.6% | 0.6775 | 0.6649 |

    Args:
        md_path: Calibration report to scan.

    Returns:
        Mapping of row label to the third-column value; empty when the file
        does not exist or no row matches.
    """
    if not md_path.exists():
        return {}
    # Rows are recognised by the non-ASCII "↔" marker, so decode explicitly:
    # with a non-UTF-8 locale default the marker would turn into mojibake and
    # every row would be skipped without any error.
    text = md_path.read_text(encoding="utf-8")
    rows = {}
    for line in text.split("\n"):
        if "|" not in line or "↔" not in line:
            continue
        # Empty cells (including the ones produced by the outer pipes) are
        # dropped, so a row with a blank cell falls short of four cells.
        cells = [c.strip() for c in line.split("|") if c.strip()]
        if len(cells) < 4 or not cells[1].endswith("%"):
            continue
        score = re.search(r"(\d+\.\d+)", cells[2])
        if score:
            rows[cells[0]] = float(score.group(1))
    return rows


def _build_combined_silver(silver: Path) -> None:
    """Concatenate the available silver JSONL parts into ``silver``."""
    parts = [
        RD / "silver_train.jsonl",
        RD / "silver_topup.jsonl",
        RD / "silver_active.jsonl",
    ]
    # Write to a sibling temp file and rename at the end, so an interrupted
    # build cannot leave a truncated combined file that later runs would
    # treat as complete.
    tmp = silver.with_name(silver.name + ".tmp")
    found_any = False
    with open(tmp, "w", encoding="utf-8") as f:
        for part in parts:
            if not part.exists():
                continue
            found_any = True
            text = part.read_text(encoding="utf-8")
            f.write(text)
            # Without this, the last record of a part that lacks a trailing
            # newline would be glued onto the first record of the next part.
            if text and not text.endswith("\n"):
                f.write("\n")
    tmp.replace(silver)
    if not found_any:
        print(f"  [warn] no silver parts found; {silver.name} is empty", file=sys.stderr)


def _run_eval_step(label: str, cmd: list[str], out_path: Path) -> None:
    """Run one evaluation subprocess unless its report already exists."""
    if out_path.exists():
        return
    try:
        proc = subprocess.run(cmd, cwd="<PROJECT_DIR>", check=False, timeout=300)
    except subprocess.TimeoutExpired:
        # Evaluation is best-effort (check=False); a single slow bundle must
        # not abort the whole leaderboard run.
        print(
            f"  [warn] {label} eval timed out after 300s (report: {out_path.name})",
            file=sys.stderr,
        )
        return
    if proc.returncode != 0:
        print(
            f"  [warn] {label} eval exited with code {proc.returncode} "
            f"(report: {out_path.name})",
            file=sys.stderr,
        )


def run_evals(bundle_path: Path) -> dict:
    """Run the in-dist, calibration and OOD-math evals for one bundle.

    Each evaluation is launched as a subprocess only when its markdown report
    is not already present in ``RD``; existing reports are reused as-is. A
    step that times out or exits non-zero is reported on stderr and its
    metrics then parse as missing.

    Args:
        bundle_path: Path to a ``classifier_*.joblib`` bundle. A sibling
            ``<stem>.splits.json`` file must exist.

    Returns:
        ``{"name", "error"}`` when the splits file is missing; otherwise a
        dict with ``name``, ``in_dist_macro``, ``in_dist_weighted``,
        ``calib_pairs`` and ``ood_macro``. Metric values are None (or ``{}``
        for ``calib_pairs``) when the report is absent or cannot be parsed.
    """
    name = bundle_path.stem  # e.g. classifier_v4_deberta
    variant = name.replace("classifier_", "")
    in_dist = RD / f"eval_in_dist_{variant}.md"
    calib = RD / f"eval_calibration_{variant}.md"
    ood = RD / f"eval_ood_math_{variant}.md"

    py = "<PROJECT_DIR>/sdft-venv/bin/python"
    silver = RD / "silver_combined_v3.jsonl"
    if not silver.exists():
        _build_combined_silver(silver)

    splits = bundle_path.with_suffix(".splits.json")
    if not splits.exists():
        return {"name": name, "error": "no splits.json"}

    _run_eval_step("in-dist", [
        py, "-m", "analysis.exploration.llm_validation.classifier.evaluate",
        "--mode", "in_dist", "--bundle", str(bundle_path),
        "--silver", str(silver), "--splits", str(splits),
        "--out", str(in_dist),
    ], in_dist)

    # The calibration inputs are relative paths; they resolve against the
    # subprocess working directory set in _run_eval_step.
    _run_eval_step("calibration", [
        py, "-m", "analysis.exploration.llm_validation.classifier.evaluate",
        "--mode", "calibration", "--bundle", str(bundle_path),
        "--spans", "results/exploration_analysis/llm_validation/sampled_spans.jsonl",
        "--r1-judgments", "results/exploration_analysis/llm_validation/llm_judgments_sc_v4.jsonl",
        "--v3-judgments", "results/exploration_analysis/llm_validation/llm_judgments_v3_sc_v4.jsonl",
        "--out", str(calib),
    ], calib)

    _run_eval_step("OOD math", [
        py, "-m", "analysis.exploration.llm_validation.classifier.ood_math.eval_math",
        "--spans", str(RD / "silver_math_spans.jsonl"),
        "--judgments", str(RD / "silver_math.jsonl"),
        "--bundle", str(bundle_path),
        "--out", str(ood),
    ], ood)

    return {
        "name": name,
        "in_dist_macro": parse_md_metric(in_dist, "macro"),
        "in_dist_weighted": parse_md_metric(in_dist, "weighted"),
        "calib_pairs": parse_calibration(calib),
        "ood_macro": parse_md_metric(ood, "macro"),
    }


def _fmt_metric(value: float | None) -> str:
    """Render a metric with four decimals, or "?" when it is missing."""
    return "?" if value is None else f"{value:.4f}"


def main():
    """Evaluate every bundle in ``RD`` and write the markdown leaderboard.

    Bundles matching ``classifier_v*.joblib`` are evaluated via ``run_evals``,
    ranked by in-dist macro-F1 (descending, rows without a score last), and
    written as a markdown table to ``--out`` (default ``RD/LEADERBOARD.md``).
    Missing metrics are shown as "?"; a bundle that ``run_evals`` rejected is
    still listed, with the reason appended to its name.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", type=Path, default=RD / "LEADERBOARD.md")
    args = ap.parse_args()

    bundles = sorted(RD.glob("classifier_v*.joblib"))
    bundles = [b for b in bundles if "splits" not in b.stem]
    print(f"Found {len(bundles)} bundles")
    rows = []
    for b in bundles:
        r = run_evals(b)
        rows.append(r)
        if "error" in r:
            print(f"  {r['name']}: skipped ({r['error']})")
        else:
            print(f"  {r['name']}: in_dist={r.get('in_dist_macro')}, ood={r.get('ood_macro')}")

    def rank(r: dict) -> float:
        # Test against None explicitly so a genuine 0.0 is not confused with
        # a missing score; rows without a score sort after every scored row.
        score = r.get("in_dist_macro")
        return float("-inf") if score is None else score

    # Sort by in-dist macro-F1 descending
    rows.sort(key=rank, reverse=True)

    md = ["# Classifier leaderboard (in-dist macro-F1 ranking)\n"]
    md.append("| Variant | In-dist macro-F1 | In-dist weighted-F1 | Classifier↔R1-SC | OOD math F1 |")
    md.append("|---|---|---|---|---|")
    for r in rows:
        cp = r.get("calib_pairs", {})
        # First calibration row that mentions R1-SC and is not the
        # V3-SC ↔ R1-SC judge-vs-judge pair.
        c_r1 = next((v for k, v in cp.items() if "R1-SC" in k and "V3-SC ↔" not in k), None)
        variant = f"{r['name']} ({r['error']})" if "error" in r else r["name"]
        # A float format spec cannot be applied to the "?" placeholder, so
        # every cell goes through _fmt_metric instead of an inline ":.4f".
        md.append(
            f"| {variant} | "
            f"{_fmt_metric(r.get('in_dist_macro'))} | "
            f"{_fmt_metric(r.get('in_dist_weighted'))} | "
            f"{_fmt_metric(c_r1)} | "
            f"{_fmt_metric(r.get('ood_macro'))} |"
        )

    md.append("")
    md.append("V3-SC ↔ R1-SC baseline (LLM-judge pair): 0.6781")
    md.append("Heuristic baseline: 0.196 in-dist, 0.18-0.20 OOD")

    # The table contains "↔", so do not depend on the locale's default encoding.
    args.out.write_text("\n".join(md) + "\n", encoding="utf-8")
    print(f"\nWrote {args.out}")
    for line in md[:3] + md[-3:]:
        print(line)


# Build the leaderboard only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
