from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from .agent_settings import resolve_stage_agents_settings
from .axiom_guard import build_axiom_cleanup_instructions, find_axiom_decls, format_axiom_report
from .codex_client import (
    CodexCallResult,
    ITEM_PROOF_AGENT_A_PROMPT,
    ITEM_PROOF_AGENT_B_PROMPT,
    ITEM_PROOF_AGENT_C_PROMPT,
    PROOF_AGENT_D_PROMPT,
    _agent_extra_args,
    _assemble_prompt,
    run_codex,
)
from .config import LEAN_ROOT, ROOT
from .lean_runner import lake_env_lean
from .log_utils import build_log_filename, file_snapshot, slugify, snapshot_delta
from .metrics import finish_run, log_event, start_run
from .history_store import append_history, load_recent_history
from .protocol import (
    AGENT_A_FEEDBACK_END,
    AGENT_A_FEEDBACK_START,
    AGENT_C_PLAN_END,
    AGENT_C_PLAN_START,
    extract_marked_json,
)
from .state import load_state, save_state


@dataclass(frozen=True, slots=True)
class ItemProof:
    """One normalized entry of the item data file, as built by ``_load_items``.

    Instances are immutable (``frozen=True``); the untouched source entry is
    kept in ``raw`` for anything the normalized fields do not cover.
    """

    # Ordering key: the entry's own "index" when usable, otherwise its 1-based
    # position in the data file. Items are processed in ascending index order.
    index: int
    # Non-blank item identifier, stripped of surrounding whitespace.
    label: str
    # Stripped "env" value from the entry; "thm" when missing or blank.
    env: str
    # The entry's "content" string exactly as stored (validated non-blank, not stripped).
    content: str
    # Stripped "proof" text from the entry, or None when absent/blank.
    nl_answer: str | None
    # The entry's "dependencies" list as given; [] when missing or not a list.
    dependencies: list[Any]
    # Stripped "target_file" from the entry, or None when missing or blank.
    target_file: str | None
    # The entry's "notes" value (falling back to "note"), passed through unvalidated.
    notes: Any | None
    # The original JSON object this item was built from.
    raw: dict[str, Any]


def _sha256_file(path: Path) -> str:
    return hashlib.sha256(path.read_bytes()).hexdigest()


def _load_items(data_file: Path) -> list[ItemProof]:
    raw = json.loads(data_file.read_text(encoding="utf-8"))
    if not isinstance(raw, list):
        raise ValueError(f"Expected a JSON list at {data_file}, got {type(raw).__name__}")
    items: list[ItemProof] = []
    for pos, entry in enumerate(raw):
        if not isinstance(entry, dict):
            continue
        label = entry.get("label")
        content = entry.get("content")
        if not isinstance(label, str) or not label.strip():
            raise ValueError(f"Missing/invalid label at position {pos}: {entry!r}")
        if not isinstance(content, str) or not content.strip():
            raise ValueError(f"Missing/invalid content for label={label} at position {pos}")
        idx = entry.get("index")
        if not isinstance(idx, int):
            idx = pos + 1
        deps = entry.get("dependencies")
        if not isinstance(deps, list):
            deps = []
        proof = entry.get("proof")
        nl_answer = proof.strip() if isinstance(proof, str) and proof.strip() else None
        notes = entry.get("notes")
        if notes is None:
            notes = entry.get("note")
        env = entry.get("env")
        if not isinstance(env, str) or not env.strip():
            # Default: treat as theorem-ish; proof stage uses label to find target block anyway.
            env = "thm"
        target_file = entry.get("target_file")
        if not isinstance(target_file, str) or not target_file.strip():
            target_file = None
        items.append(
            ItemProof(
                index=int(idx),
                label=label.strip(),
                env=env.strip(),
                content=content,
                nl_answer=nl_answer,
                dependencies=deps,
                target_file=target_file.strip() if target_file else None,
                notes=notes,
                raw=entry,
            )
        )
    items.sort(key=lambda it: it.index)
    return items


def _non_sorry_warning_blocks(lean_output: str) -> list[str]:
    if not lean_output:
        return []
    lines = lean_output.splitlines()
    blocks: list[str] = []
    i = 0
    while i < len(lines):
        lowered = lines[i].lower()
        if "warning" in lowered:
            block_lines = [lines[i]]
            block_lower = [lowered]
            i += 1
            while i < len(lines):
                nxt = lines[i]
                nxt_lower = nxt.lower()
                if "warning" in nxt_lower:
                    break
                block_lines.append(nxt)
                block_lower.append(nxt_lower)
                i += 1
            if "sorry" not in "\n".join(block_lower):
                blocks.append("\n".join(block_lines).strip())
            continue
        i += 1
    return blocks


def _maybe_report_math_blocker(*, stdout: str, agent: str, label: str) -> None:
    """
    Best-effort terminal hint: if an agent reports "mathematically unprovable/false", surface it prominently.
    """
    if not stdout:
        return
    lowered = stdout.lower()
    if "mathematically unprovable" in lowered or "mathematically false" in lowered:
        print(f"[REPORT] Agent {agent} indicates label={label} may be mathematically unprovable/false.")


_SORRY_TOKEN_RE = re.compile(r"\bsorry\b")
_DECL_NAME_RE = re.compile(r"^\s*(theorem|lemma|def|abbrev|example|instance)\s+([A-Za-z0-9_'.]+)\b")


def _count_sorry_tokens(text: str) -> int:
    if not text:
        return 0
    return len(_SORRY_TOKEN_RE.findall(text))


def _named_decl_names(text: str) -> set[str]:
    names: set[str] = set()
    if not text:
        return names
    for line in text.splitlines():
        m = _DECL_NAME_RE.match(line)
        if not m:
            continue
        name = (m.group(2) or "").strip()
        if name:
            names.add(name)
    return names


def _find_decl_line_by_name(lines: list[str], *, name: str) -> int | None:
    if not name:
        return None
    name_re = re.compile(rf"^\s*(theorem|lemma|def|abbrev|example|instance)\s+{re.escape(name)}\b")
    for i, line in enumerate(lines, start=1):
        if name_re.match(line):
            return i
    return None


def _decl_snippet_from_lines(lines: list[str], *, line: int, max_chars: int = 8000) -> str:
    if not lines:
        return ""
    idx = max(0, min(len(lines) - 1, line - 1))
    start = idx
    while start > 0 and lines[start].strip() != "":
        start -= 1
    if lines[start].strip() == "" and start < idx:
        start += 1
    end = idx
    while end + 1 < len(lines) and lines[end + 1].strip() != "":
        end += 1
    snippet = "\n".join(lines[start : end + 1])
    if len(snippet) > max_chars:
        return snippet[:max_chars]
    return snippet


def _count_sorry_tokens_in_named_decl_text(*, file_text: str, name: str) -> int | None:
    """Count ``sorry`` tokens in the snippet around the declaration called *name*.

    Returns:
        The token count for the snippet located via
        ``_find_decl_line_by_name`` / ``_decl_snippet_from_lines``, or
        ``None`` when either argument is empty or no such declaration line
        is found in *file_text*.
    """
    if not (file_text and name):
        return None
    all_lines = file_text.splitlines()
    hit = _find_decl_line_by_name(all_lines, name=name)
    if not hit:
        return None
    return _count_sorry_tokens(_decl_snippet_from_lines(all_lines, line=hit))


def _new_proved_named_decls(*, before_text: str, after_text: str) -> list[str]:
    """List declarations that are new in *after_text* and contain no ``sorry``.

    A name is "new" when ``_named_decl_names`` reports it for *after_text*
    but not for *before_text*. It counts as proved when the snippet around
    its declaration line has zero ``sorry`` tokens. Names whose declaration
    line cannot be located are left out.

    Returns:
        The qualifying names in sorted order (possibly empty).
    """
    known = _named_decl_names(before_text)
    fresh = sorted(name for name in _named_decl_names(after_text) if name not in known)
    if not fresh:
        return []

    after_lines = after_text.splitlines()

    def _is_sorry_free(decl_name: str) -> bool:
        where = _find_decl_line_by_name(after_lines, name=decl_name)
        if not where:
            return False
        return _count_sorry_tokens(_decl_snippet_from_lines(after_lines, line=where)) == 0

    return [decl_name for decl_name in fresh if _is_sorry_free(decl_name)]


def _combine_extra_instructions(*parts: str | None) -> str | None:
    cleaned = [p.strip() for p in parts if p and p.strip()]
    if not cleaned:
        return None
    return "\n\n".join(cleaned)


def _format_plan_history(history: list[str]) -> str | None:
    if not history:
        return None
    return "PLAN HISTORY (past proof attempts):\n" + "\n".join(history)


def _summarize_plan(plan: dict[str, Any] | None) -> str | None:
    if not isinstance(plan, dict):
        return None
    parts: list[str] = []
    status = plan.get("status")
    if isinstance(status, str) and status.strip():
        parts.append(f"status={status.strip()}")
    main_decl = plan.get("main_declaration")
    if isinstance(main_decl, dict):
        name = main_decl.get("name")
        if isinstance(name, str) and name.strip():
            parts.append(f"main={name.strip()}")
    lemma_names: list[str] = []
    lemma_plan = plan.get("lemma_plan")
    if isinstance(lemma_plan, list):
        for entry in lemma_plan[:8]:
            if not isinstance(entry, dict):
                continue
            name = entry.get("name")
            if isinstance(name, str) and name.strip():
                lemma_names.append(name.strip())
    if lemma_names:
        parts.append("lemmas=" + ", ".join(lemma_names))
    failure_reason = plan.get("failure_reason")
    if isinstance(failure_reason, str) and failure_reason.strip():
        parts.append("failure_reason=" + failure_reason.strip()[:300])
    return " | ".join(parts) if parts else None


def _summarize_feedback(feedback: dict[str, Any] | None) -> str | None:
    if not isinstance(feedback, dict):
        return None
    parts: list[str] = []
    status = feedback.get("status")
    if isinstance(status, str) and status.strip():
        parts.append(f"status={status.strip()}")
    reason = feedback.get("reason")
    if isinstance(reason, str) and reason.strip():
        parts.append("reason=" + reason.strip()[:400])
    pending = feedback.get("pending_goals")
    if isinstance(pending, list) and pending:
        parts.append(f"pending_goals={len(pending)}")
    return " | ".join(parts) if parts else None


def _print_parsed_agent_summary(*, agent: str, summary: str | None) -> None:
    if not summary:
        return
    text = summary.strip()
    if not text:
        return
    print(f"[Agent {agent}] [item_proof] parsed_summary: {text}")


def _format_retry_feedback_for_agent_a(feedback: dict[str, Any] | None) -> str | None:
    if not isinstance(feedback, dict):
        return None
    lines: list[str] = ["RETRY CONTEXT FROM PREVIOUS AGENT A FEEDBACK:"]
    status = feedback.get("status")
    if isinstance(status, str) and status.strip():
        lines.append(f"- previous_status: {status.strip()}")
    reason = feedback.get("reason")
    if isinstance(reason, str) and reason.strip():
        lines.append(f"- previous_reason: {reason.strip()[:500]}")
    pending = feedback.get("pending_goals")
    if isinstance(pending, list) and pending:
        lines.append("- previous_pending_goals:")
        for goal in pending[:8]:
            if isinstance(goal, str) and goal.strip():
                lines.append(f"  - {goal.strip()[:300]}")
    requested = feedback.get("requested_lemmas")
    if isinstance(requested, list) and requested:
        lines.append("- previous_requested_lemmas:")
        for entry in requested[:8]:
            if not isinstance(entry, dict):
                continue
            name = entry.get("name")
            statement_hint = entry.get("statement_hint")
            lemma_reason = entry.get("reason")
            detail_parts: list[str] = []
            if isinstance(name, str) and name.strip():
                detail_parts.append(f"name={name.strip()}")
            if isinstance(statement_hint, str) and statement_hint.strip():
                detail_parts.append(f"statement={statement_hint.strip()[:260]}")
            if isinstance(lemma_reason, str) and lemma_reason.strip():
                detail_parts.append(f"reason={lemma_reason.strip()[:260]}")
            if detail_parts:
                lines.append("  - " + " | ".join(detail_parts))
    if len(lines) == 1:
        return None
    lines.append(
        "Address this feedback concretely in this retry; do not return another no-op or purely narrative re-plan."
    )
    return "\n".join(lines)


def _history_line_from_record(rec: dict[str, Any]) -> str | None:
    kind = rec.get("kind")
    summary = rec.get("summary")
    if not isinstance(kind, str) or not kind.strip():
        return None
    if isinstance(summary, str) and summary.strip():
        return f"- (prev) {kind.strip()}: {summary.strip()}"
    payload = rec.get("payload")
    if isinstance(payload, dict):
        reason = payload.get("reason")
        if isinstance(reason, str) and reason.strip():
            return f"- (prev) {kind.strip()}: reason={reason.strip()[:300]}"
    return f"- (prev) {kind.strip()}"


def _build_proof_agent_c_prompt(
    *,
    item: ItemProof,
    target_file: Path,
    task_id: str,
    feedback_from_agent_a: dict[str, Any] | None,
    prior_plan: dict[str, Any] | None,
    extra_instructions: str | None = None,
) -> str:
    """Assemble the Agent C prompt for *item*.

    The metadata handed to ``_assemble_prompt`` carries the item description,
    the target file (as a string), the latest Agent A feedback and the
    previous plan; *extra_instructions* is forwarded unchanged.
    """
    item_payload = {
        "index": item.index,
        "label": item.label,
        "env": item.env,
        "number_components": [0, 0, item.index],
        "content": item.content,
        "context": {"task_id": task_id, "source_mode": "item_per_file"},
        "dependencies": item.dependencies,
        "nl_answer": item.nl_answer,
        "notes": item.notes,
    }
    prompt_meta = {
        "item": item_payload,
        "target_file": str(target_file),
        "feedback_from_agent_a": feedback_from_agent_a,
        "prior_plan": prior_plan,
    }
    return _assemble_prompt(
        ITEM_PROOF_AGENT_C_PROMPT,
        prompt_meta,
        extra_instructions=extra_instructions,
    )


def _build_proof_agent_a_meta(
    *,
    item: ItemProof,
    target_file: Path,
    task_id: str,
    plan: dict[str, Any] | None,
    plan_raw: str | None,
    attempt: int,
) -> dict[str, Any]:
    meta = {
        "item": {
            "index": item.index,
            "label": item.label,
            "env": item.env,
            "number_components": [0, 0, item.index],
            "content": item.content,
            "context": {"task_id": task_id, "source_mode": "item_per_file"},
            "dependencies": item.dependencies,
            "nl_answer": item.nl_answer,
            "notes": item.notes,
        },
        "target_file": str(target_file),
        "plan_from_agent_c": plan,
        "plan_raw_from_agent_c": plan_raw,
        "agent_a_attempt": attempt,
    }
    return meta


def _build_proof_agent_b_prompt(
    *,
    item: ItemProof,
    lean_file: Path,
    task_id: str,
    error_log: str,
) -> str:
    """Assemble the Agent B prompt for *item* from *error_log*.

    *lean_file* is reported under both the ``target_file`` and ``lean_file``
    metadata keys; no extra instructions are attached.
    """
    lean_path = str(lean_file)
    prompt_meta = {
        "item_index": item.index,
        "label": item.label,
        "target_file": lean_path,
        "lean_file": lean_path,
        "error_log": error_log,
        "content": item.content,
        "dependencies": item.dependencies,
        "context": {"task_id": task_id, "source_mode": "item_per_file"},
        "notes": item.notes,
    }
    return _assemble_prompt(
        ITEM_PROOF_AGENT_B_PROMPT,
        prompt_meta,
        extra_instructions=None,
    )


def _build_proof_agent_d_prompt(
    *,
    lean_file: Path,
    max_lines: int,
) -> str:
    """Assemble the Agent D prompt carrying the Lean file path and the line limit."""
    return _assemble_prompt(
        PROOF_AGENT_D_PROMPT,
        {"lean_file": str(lean_file), "max_lines": max_lines},
        extra_instructions=None,
    )


def _run_agent(
    *,
    pipeline: str,
    agent: str,
    stage: str,
    prompt: str,
    log_dir: Path,
    log_name: str,
    model: str | None,
    reasoning_effort: str | None,
    log_meta: dict[str, Any],
) -> CodexCallResult:
    """Run one agent call through ``run_codex`` with ``LEAN_ROOT`` as working dir.

    The model / reasoning-effort pair is translated by ``_agent_extra_args``;
    an empty result is passed on as ``None``. The logged metadata starts
    with ``pipeline`` and ``agent`` and is then extended by *log_meta*,
    whose entries win on key clashes.
    """
    cli_args = _agent_extra_args(model=model, reasoning_effort=reasoning_effort)
    merged_meta: dict[str, Any] = {"pipeline": pipeline, "agent": agent}
    merged_meta.update(log_meta)
    return run_codex(
        prompt,
        extra_args=cli_args or None,
        log_name=log_name,
        log_dir=log_dir,
        cwd=LEAN_ROOT,
        stage=stage,
        log_meta=merged_meta,
    )


def main() -> None:
    parser = argparse.ArgumentParser(description="Run PROOF stage in item-per-file mode.")
    parser.add_argument(
        "--project",
        type=str,
        default="P2512_19197",
        help="Lean subdir under M2F/ (default: P2512_19197).",
    )
    parser.add_argument("--data-file", type=Path, required=True, help="Path to a JSON list of items.")
    parser.add_argument(
        "--stage",
        type=str,
        default="proof",
        help="Codex stage for AGENTS.md swapping (default: proof). Use 'infra_proof' for infra proof pipelines.",
    )
    parser.add_argument(
        "--task-id",
        type=str,
        default=None,
        help="Optional task id (defaults to the data file stem). Used for log partitioning only.",
    )
    parser.add_argument("--start-index", type=int, default=None, help="Start processing from this item index (overrides saved state).")
    parser.add_argument("--only-label", type=str, default=None, help="Process only this item label (e.g. n000001).")
    parser.add_argument("--max-items", type=int, default=None, help="Process at most this many items.")
    parser.add_argument("--max-b-retries", type=int, default=3, help="Max retries for Agent B when Lean still fails (default: 3).")
    parser.add_argument("--max-c-replans", type=int, default=1, help="Max additional Agent C replans (default: 1).")
    parser.add_argument(
        "--max-no-progress-retries",
        type=int,
        default=2,
        help=(
            "If Agent A makes no meaningful progress (no sorry reduction, no proved helper, "
            "and no target-decl sorry reduction), retry this plan round up to N times (default: 2)."
        ),
    )
    parser.add_argument(
        "--clean-warnings-with-agent-b",
        action="store_true",
        help="If set, invoke Agent B to remove non-sorry warnings (default: false).",
    )
    parser.add_argument(
        "--proof-agent-config",
        type=Path,
        default=None,
        help="Optional TOML controlling per-agent model/reasoning for PROOF stage.",
    )
    parser.add_argument(
        "--write-history",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Persist proof Agent C/A plan & feedback history to JSONL (default: true).",
    )
    parser.add_argument(
        "--use-history",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Load prior proof history for the same Lean file and pass it into Agent prompts (default: true).",
    )
    parser.add_argument(
        "--history-file",
        type=Path,
        default=None,
        help="Optional path for proof history JSONL (default: log/<project>/item_proof_logs/<task>/item_proof_history.jsonl).",
    )
    parser.add_argument(
        "--history-max-records",
        type=int,
        default=5,
        help="How many recent history records to load per Lean file (default: 5).",
    )
    parser.add_argument(
        "--split-with-agent-d",
        action=argparse.BooleanOptionalAction,
        default=None,
        help=(
            "Enable Agent D file splitting for oversized Lean files. "
            "Defaults to true for --stage infra_proof, else false."
        ),
    )
    parser.add_argument(
        "--max-lines-per-part",
        type=int,
        default=int(os.getenv("MAX_LINES_PER_PART", "2500")),
        help="When splitting with Agent D, target at most this many lines per part file (default: 2500).",
    )
    args = parser.parse_args()
    stage = (args.stage or "proof").strip().lower()
    if args.split_with_agent_d is None:
        split_env = os.getenv("ITEM_PROOF_SPLIT_WITH_AGENT_D", "").strip().lower()
        if split_env:
            args.split_with_agent_d = split_env in {"1", "true", "yes", "on"}
        else:
            args.split_with_agent_d = stage == "infra_proof"

    project = args.project.strip()
    if not project:
        raise SystemExit("--project must be non-empty")
    task_id = (args.task_id or args.data_file.stem).strip()
    if not task_id:
        raise SystemExit("--task-id must be non-empty (or provide a non-empty data file stem)")

    items = _load_items(args.data_file)
    if args.only_label:
        want = args.only_label.strip()
        items = [it for it in items if it.label == want]
        if not items:
            raise SystemExit(f"--only-label={want} not found in {args.data_file}")

    project_slug = slugify(project, max_len=80)
    task_slug = slugify(task_id, max_len=120)
    logs_dir = ROOT / "log" / project_slug / "item_proof_logs" / task_slug
    agent_a_logs_dir = logs_dir / "agent_a"
    agent_b_logs_dir = logs_dir / "agent_b"
    agent_c_logs_dir = logs_dir / "agent_c"
    agent_d_logs_dir = logs_dir / "agent_d"
    progress_file = logs_dir / "progress.json"
    history_file = args.history_file or (logs_dir / "item_proof_history.jsonl")
    agent_a_logs_dir.mkdir(parents=True, exist_ok=True)
    agent_b_logs_dir.mkdir(parents=True, exist_ok=True)
    agent_c_logs_dir.mkdir(parents=True, exist_ok=True)
    if args.split_with_agent_d:
        agent_d_logs_dir.mkdir(parents=True, exist_ok=True)

    default_cfg = ROOT / "agent_configs/proof_agents.toml"
    cfg_from_env = Path(os.environ["PROOF_AGENT_CONFIG_FILE"]) if os.environ.get("PROOF_AGENT_CONFIG_FILE") else None
    proof_agent_cfg = args.proof_agent_config or cfg_from_env
    agent_keys = ["A", "B", "C"] + (["D"] if args.split_with_agent_d else [])
    agent_settings = resolve_stage_agents_settings(
        stage_prefix="PROOF_AGENT",
        agent_keys=agent_keys,
        config_path=proof_agent_cfg,
        default_config_path=(default_cfg if default_cfg.exists() else None),
    )
    agent_a_strict = agent_settings.strict_agents.get("A")

    run_id = start_run(
        "item_proof",
        stage=2,
        name_tag=f"{project_slug}_{task_slug}",
        data_file=str(args.data_file),
        extra={
            "project": project,
            "task_id": task_id,
            "start_index_arg": args.start_index,
            "clean_warnings_with_agent_b": bool(args.clean_warnings_with_agent_b),
            "split_with_agent_d": bool(args.split_with_agent_d),
            "max_lines_per_part": int(args.max_lines_per_part),
            "proof_agent_config": str(agent_settings.source_path) if agent_settings.source_path else None,
            "proof_agent_a_model": agent_settings.agents["A"].model,
            "proof_agent_a_reasoning_effort": agent_settings.agents["A"].reasoning_effort,
            "proof_agent_a_strict_retry_model": (agent_a_strict.model if agent_a_strict else None),
            "proof_agent_a_strict_retry_reasoning_effort": (
                agent_a_strict.reasoning_effort if agent_a_strict else None
            ),
            "proof_agent_b_model": agent_settings.agents["B"].model,
            "proof_agent_b_reasoning_effort": agent_settings.agents["B"].reasoning_effort,
            "proof_agent_c_model": agent_settings.agents["C"].model,
            "proof_agent_c_reasoning_effort": agent_settings.agents["C"].reasoning_effort,
            "proof_agent_d_model": (agent_settings.agents.get("D").model if agent_settings.agents.get("D") else None),
            "proof_agent_d_reasoning_effort": (
                agent_settings.agents.get("D").reasoning_effort if agent_settings.agents.get("D") else None
            ),
            "history_file": str(history_file),
            "write_history": bool(args.write_history),
            "use_history": bool(args.use_history),
            "history_max_records": int(args.history_max_records),
            "max_no_progress_retries": int(args.max_no_progress_retries),
        },
    )

    state = load_state(progress_file)
    plan_sha256 = _sha256_file(args.data_file)
    prev_plan_sha256 = str(state.get("plan_sha256", "") or "").strip()
    state_changed = False
    if prev_plan_sha256 and prev_plan_sha256 != plan_sha256:
        prev_next = int(state.get("next_index", 1) or 1)
        state["next_index"] = 1
        state_changed = True
        print(
            "[progress] plan hash changed; reset item_proof progress "
            f"from next_index={prev_next} to next_index=1."
        )
        log_event(
            run_id,
            "progress_reset_plan_changed",
            {
                "task_id": task_id,
                "progress_file": str(progress_file),
                "data_file": str(args.data_file),
                "previous_next_index": prev_next,
                "previous_plan_sha256": prev_plan_sha256,
                "plan_sha256": plan_sha256,
            },
        )
    if prev_plan_sha256 != plan_sha256:
        state["plan_sha256"] = plan_sha256
        state_changed = True
    if args.start_index is not None:
        state["next_index"] = int(args.start_index)
        state_changed = True
    if state_changed:
        save_state(state, progress_file, run_id=run_id)
    next_index = int(state.get("next_index", 1) or 1)

    processed = 0
    last_success = next_index - 1
    total_tokens = 0
    if not args.use_history:
        print("[History] disabled (--no-use-history)")
    total_failed = 0
    total_bad_statement = 0
    run_start = time.monotonic()
    print(
        "[Agent A strict retry] "
        f"model={agent_a_strict.model if agent_a_strict and agent_a_strict.model else (agent_settings.agents['A'].model or 'default')}, "
        f"reasoning_effort={agent_a_strict.reasoning_effort if agent_a_strict and agent_a_strict.reasoning_effort else (agent_settings.agents['A'].reasoning_effort or 'default')}"
    )

    def _split_and_validate(*, item: ItemProof, target_rel: Path, target_abs: Path) -> tuple[bool, str | None]:
        nonlocal total_tokens
        if not args.split_with_agent_d:
            return True, None
        if int(args.max_lines_per_part or 0) <= 0:
            return True, None

        try:
            line_count = len(target_abs.read_text(encoding="utf-8").splitlines())
        except FileNotFoundError:
            return True, None
        if line_count <= int(args.max_lines_per_part):
            return True, None

        if not PROOF_AGENT_D_PROMPT.strip():
            missing_msg = "Missing prompts/proof/agent_d_prompt.txt; cannot run Agent D splitter."
            print(f"Proof Agent D unavailable: {missing_msg}")
            return False, missing_msg

        settings_d = agent_settings.agents.get("D")
        print(f"[Proof Agent D] splitting oversized file ({line_count} lines): {target_rel}")
        d_prompt = _build_proof_agent_d_prompt(lean_file=target_rel, max_lines=int(args.max_lines_per_part))
        d_log_name = build_log_filename("item_proof", "agent_d", task_id, f"idx{item.index}", target_rel.as_posix())
        d_start = time.monotonic()
        d_res = _run_agent(
            pipeline="item_proof",
            agent="D",
            stage=stage,
            prompt=d_prompt,
            log_dir=agent_d_logs_dir,
            log_name=d_log_name,
            model=(settings_d.model if settings_d else None),
            reasoning_effort=(settings_d.reasoning_effort if settings_d else None),
            log_meta={
                "task_id": task_id,
                "item_index": item.index,
                "label": item.label,
                "target_file": str(target_rel),
                "max_lines_per_part": int(args.max_lines_per_part),
            },
        )
        d_seconds = time.monotonic() - d_start
        total_tokens += d_res.tokens_used or 0
        log_event(
            run_id,
            "agent_d_result",
            {
                "index": item.index,
                "label": item.label,
                "task_id": task_id,
                "code": d_res.code,
                "seconds": d_seconds,
                "tokens_used": d_res.tokens_used,
                "log_path": str(d_res.log_path) if d_res.log_path else None,
                "line_count": line_count,
                "max_lines_per_part": int(args.max_lines_per_part),
            },
        )
        if d_res.code != 0:
            print(f"Proof Agent D failed with code {d_res.code}. Stopping.\n{d_res.stderr}")
            return False, (d_res.stderr or "").strip() or "agent_d_failed"

        print(f"[lean check] running: lake env lean {target_rel} (cwd={LEAN_ROOT})")
        code, out, err = lake_env_lean(target_rel)
        lean_output = "\n".join(part for part in (err, out) if part)
        non_sorry = _non_sorry_warning_blocks(lean_output)
        log_event(
            run_id,
            "lean_check",
            {
                "index": item.index,
                "label": item.label,
                "task_id": task_id,
                "phase": "post_agent_d_split",
                "code": code,
                "seconds": None,
                "has_non_sorry_warnings": bool(non_sorry),
                "compiled_file": str(target_rel),
            },
        )
        if code == 0 and (not non_sorry or not args.clean_warnings_with_agent_b):
            if non_sorry and not args.clean_warnings_with_agent_b:
                print(
                    "Lean OK after proof Agent D split, but has non-sorry warnings (--no-clean-warnings-with-agent-b); continuing."
                )
            else:
                print("Lean OK after proof Agent D split.")
            return True, None

        if code != 0:
            err_text = lean_output
        else:
            err_text = "Lean produced the following non-sorry warnings. Please remove them:\n\n" + "\n\n".join(non_sorry)
        print("Post-split check failed; calling proof Agent B...")

        for attempt in range(1, int(args.max_b_retries) + 1):
            print(f"[Proof Agent B (post-split) attempt {attempt}/{args.max_b_retries}]")
            file_before_b = file_snapshot(target_abs)
            b_prompt = _build_proof_agent_b_prompt(item=item, lean_file=target_rel, task_id=task_id, error_log=err_text)
            print(
                f"[Agent B] [item_proof] start idx={item.index} label={item.label} "
                f"target={target_rel} model={agent_settings.agents['B'].model or 'default'}/{agent_settings.agents['B'].reasoning_effort or 'default'}"
            )
            b_log_name = build_log_filename(
                "item_proof", "agent_b_post_split", task_id, f"idx{item.index}", target_rel.as_posix(), f"attempt{attempt}"
            )
            b_start = time.monotonic()
            b_res = _run_agent(
                pipeline="item_proof",
                agent="B",
                stage=stage,
                prompt=b_prompt,
                log_dir=agent_b_logs_dir,
                log_name=b_log_name,
                model=agent_settings.agents["B"].model,
                reasoning_effort=agent_settings.agents["B"].reasoning_effort,
                log_meta={
                    "task_id": task_id,
                    "item_index": item.index,
                    "label": item.label,
                    "target_file": str(target_rel),
                    "attempt": attempt,
                    "mode": "post_split",
                },
            )
            b_seconds = time.monotonic() - b_start
            total_tokens += b_res.tokens_used or 0
            file_after_b = file_snapshot(target_abs)
            log_event(
                run_id,
                "agent_b_post_split_result",
                {
                    "index": item.index,
                    "label": item.label,
                    "task_id": task_id,
                    "attempt": attempt,
                    "code": b_res.code,
                    "seconds": b_seconds,
                    "tokens_used": b_res.tokens_used,
                    "log_path": str(b_res.log_path) if b_res.log_path else None,
                    "file_before": file_before_b,
                    "file_after": file_after_b,
                    "file_delta": snapshot_delta(file_before_b, file_after_b),
                },
            )
            if b_res.code != 0:
                print(f"Proof Agent B failed with code {b_res.code}. Stopping.\n{b_res.stderr}")
                return False, (b_res.stderr or "").strip() or "agent_b_post_split_failed"

            print(f"[lean check] running: lake env lean {target_rel} (cwd={LEAN_ROOT})")
            code2, out2, err2 = lake_env_lean(target_rel)
            lean_output2 = "\n".join(part for part in (err2, out2) if part)
            non_sorry2 = _non_sorry_warning_blocks(lean_output2)
            log_event(
                run_id,
                "lean_check",
                {
                    "index": item.index,
                    "label": item.label,
                    "task_id": task_id,
                    "phase": f"post_agent_d_b_attempt{attempt}",
                    "code": code2,
                    "seconds": None,
                    "has_non_sorry_warnings": bool(non_sorry2),
                    "compiled_file": str(target_rel),
                },
            )
            if code2 == 0 and (not non_sorry2 or not args.clean_warnings_with_agent_b):
                if non_sorry2 and not args.clean_warnings_with_agent_b:
                    print(
                        "Lean OK after proof Agent B (post-split), but has non-sorry warnings (--no-clean-warnings-with-agent-b); continuing."
                    )
                else:
                    print("Lean OK after proof Agent B (post-split).")
                return True, None

            if code2 != 0:
                err_text = lean_output2
                print("Post-split check still failing after proof Agent B; will retry if attempts remain.")
            else:
                err_text = "Lean still reports these non-sorry warnings. Please remove them:\n\n" + "\n\n".join(non_sorry2)
                print("Post-split check still has non-sorry warnings after proof Agent B; will retry if attempts remain.")

        print("Lean still failing after post-split Agent B retries.")
        return False, err_text

    for item in items:
        if item.index < next_index:
            continue
        print(f"=== item-proof index={item.index} label={item.label} (task={task_id}) ===")
        item_start = time.monotonic()
        log_event(run_id, "item_start", {"index": item.index, "label": item.label, "task_id": task_id})

        if not item.nl_answer and stage != "infra_proof":
            print(f"Skipping proof item index={item.index} label={item.label}: empty `proof` in data.")
            state["next_index"] = item.index + 1
            save_state(state, progress_file, run_id=run_id)
            processed += 1
            last_success = item.index
            log_event(
                run_id,
                "item_end",
                {"index": item.index, "label": item.label, "task_id": task_id, "status": "skipped_empty_proof"},
            )
            continue

        if item.target_file:
            target_rel = Path(item.target_file)
        else:
            target_rel = Path(project) / f"{item.label}.lean"
        target_abs = LEAN_ROOT / target_rel
        target_abs.parent.mkdir(parents=True, exist_ok=True)
        if not target_abs.exists():
            print(f"Target file {target_rel} does not exist; skipping.")
            state["next_index"] = item.index + 1
            save_state(state, progress_file, run_id=run_id)
            processed += 1
            last_success = item.index
            log_event(
                run_id,
                "item_end",
                {"index": item.index, "label": item.label, "task_id": task_id, "status": "skipped_missing_file"},
            )
            continue

        plan_history: list[str] = []
        if args.use_history and args.history_max_records > 0:
            history_exists = history_file.exists()
            recs = load_recent_history(
                history_file,
                lean_file=str(target_rel),
                max_records=int(args.history_max_records),
                kinds={"agent_c_plan", "agent_a_feedback", "item_end"},
                item_index=int(item.index),
                label=item.label,
            )
            for rec in recs:
                line = _history_line_from_record(rec)
                if line:
                    plan_history.append(line)
            print(
                f"[History] loaded {len(recs)} record(s) from {history_file} for {target_rel}"
                + ("" if history_exists else " (file not found; starting fresh)")
            )

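        # Step 1: Agent C produces a plan, then Agent A executes it. When Agent A
        # reports status "needs_replan", its feedback is handed back to Agent C for
        # another planning round, up to int(args.max_c_replans) + 1 rounds in total.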
        plan_data: dict[str, Any] | None = None
        plan_raw_block: str | None = None
        feedback_for_c: dict[str, Any] | None = None
        abandon_item = False
        abandon_status: str | None = None
        abandon_detail: dict[str, Any] = {}
        skip_current_item = False
        max_plan_rounds = int(args.max_c_replans) + 1

        for plan_round in range(1, max_plan_rounds + 1):
            print(f"[Proof Agent C planning {plan_round}/{max_plan_rounds}]")
            print(
                f"[Agent C] [item_proof] start idx={item.index} label={item.label} "
                f"target={target_rel} model={agent_settings.agents['C'].model or 'default'}/{agent_settings.agents['C'].reasoning_effort or 'default'} "
                f"attempt={plan_round}"
            )
            history_text = _format_plan_history(plan_history)
            c_prompt = _build_proof_agent_c_prompt(
                item=item,
                target_file=target_rel,
                task_id=task_id,
                feedback_from_agent_a=feedback_for_c,
                prior_plan=plan_data,
                extra_instructions=history_text,
            )
            c_log_name = build_log_filename("item_proof", "agent_c", task_id, f"idx{item.index}", item.label)
            c_start = time.monotonic()
            c_res = _run_agent(
                pipeline="item_proof",
                agent="C",
                stage=stage,
                prompt=c_prompt,
                log_dir=agent_c_logs_dir,
                log_name=c_log_name,
                model=agent_settings.agents["C"].model,
                reasoning_effort=agent_settings.agents["C"].reasoning_effort,
                log_meta={"task_id": task_id, "item_index": item.index, "label": item.label, "target_file": str(target_rel)},
            )
            c_seconds = time.monotonic() - c_start
            total_tokens += c_res.tokens_used or 0
            log_event(
                run_id,
                "agent_c_result",
                {
                    "index": item.index,
                    "label": item.label,
                    "task_id": task_id,
                    "plan_round": plan_round,
                    "code": c_res.code,
                    "seconds": c_seconds,
                    "tokens_used": c_res.tokens_used,
                    "log_path": str(c_res.log_path) if c_res.log_path else None,
                },
            )
            if c_res.code != 0:
                print(f"Proof Agent C failed with code {c_res.code}. Stopping.\n{c_res.stderr}")
                abandon_item = True
                abandon_status = "agent_c_failed"
                abandon_detail = {"agent": "C", "code": c_res.code, "stderr": (c_res.stderr or "")[:2000]}
                break

            plan_data, plan_raw_block = extract_marked_json(c_res.stdout, AGENT_C_PLAN_START, AGENT_C_PLAN_END)
            _maybe_report_math_blocker(stdout=c_res.stdout, agent="C", label=item.label)
            if plan_raw_block is None:
                plan_raw_block = c_res.stdout
            if plan_data is None:
                print("Warning: could not parse Agent C plan JSON; passing raw output to Agent A.")
            plan_summary = _summarize_plan(plan_data) or "unparsed_plan"
            _print_parsed_agent_summary(agent="C", summary=plan_summary)
            plan_history.append(f"- round {plan_round} C: {plan_summary}")
            main_decl_name = None
            if isinstance(plan_data, dict):
                main_decl = plan_data.get("main_declaration")
                if isinstance(main_decl, dict):
                    main_name = main_decl.get("name")
                    if isinstance(main_name, str) and main_name.strip():
                        main_decl_name = main_name.strip()
            if args.write_history:
                append_history(
                    history_file,
                    pipeline="item_proof",
                    run_id=run_id,
                    lean_file=str(target_rel),
                    task_id=task_id,
                    kind="agent_c_plan",
                    summary=plan_summary,
                    log_path=(str(c_res.log_path) if c_res.log_path else None),
                    payload={
                        "index": item.index,
                        "label": item.label,
                        "plan_round": plan_round,
                        "status": (plan_data.get("status") if isinstance(plan_data, dict) else None),
                        "main_declaration": main_decl_name,
                    },
                )
            if isinstance(plan_data, dict) and plan_data.get("status") == "failed":
                failure_reason = plan_data.get("failure_reason") or "no failure_reason provided"
                print(f"[REPORT] Agent C flagged label={item.label} as likely mathematically unprovable/blocked: {failure_reason}")
                log_event(
                    run_id,
                    "item_end",
                    {
                        "index": item.index,
                        "label": item.label,
                        "task_id": task_id,
                        "status": "failed_bad_statement",
                        "failure_reason": failure_reason,
                    },
                )
                state["next_index"] = item.index + 1
                save_state(state, progress_file, run_id=run_id)
                processed += 1
                last_success = item.index
                total_bad_statement += 1
                skip_current_item = True
                break

            feedback_for_c = None
            request_replan = False
            replan_feedback: dict[str, Any] | None = None
            a_meta = _build_proof_agent_a_meta(
                item=item,
                target_file=target_rel,
                task_id=task_id,
                plan=plan_data,
                plan_raw=plan_raw_block,
                attempt=plan_round,
            )
            prior_no_progress_feedback: dict[str, Any] | None = None

            for progress_attempt in range(0, int(args.max_no_progress_retries) + 1):
                stricter_no_progress = None
                retry_feedback_context = None
                agent_a_model = agent_settings.agents["A"].model
                agent_a_reasoning_effort = agent_settings.agents["A"].reasoning_effort
                if progress_attempt > 0:
                    stricter_no_progress = (
                        "NO-PROGRESS RETRY: previous attempt did not make meaningful proof progress. "
                        "This retry must produce at least one of: "
                        "(1) reduce total `sorry` tokens in file; "
                        "(2) reduce `sorry` tokens in target declaration; "
                        "(3) add and fully prove at least one new helper lemma. "
                        "Do not make cosmetic edits."
                    )
                    retry_feedback_context = _format_retry_feedback_for_agent_a(prior_no_progress_feedback)
                    if agent_a_strict and agent_a_strict.model:
                        agent_a_model = agent_a_strict.model
                    if agent_a_strict and agent_a_strict.reasoning_effort:
                        agent_a_reasoning_effort = agent_a_strict.reasoning_effort

                file_before_a = file_snapshot(target_abs)
                before_text_a = target_abs.read_text(encoding="utf-8")
                before_sorry_total = _count_sorry_tokens(before_text_a)
                before_target_sorry = (
                    _count_sorry_tokens_in_named_decl_text(
                        file_text=before_text_a,
                        name=main_decl_name,
                    )
                    if main_decl_name
                    else None
                )

                print(
                    f"[Agent A] [item_proof] start idx={item.index} label={item.label} "
                    f"target={target_rel} model={agent_a_model or 'default'}/{agent_a_reasoning_effort or 'default'} "
                    f"attempt={plan_round} progress_try={progress_attempt + 1}"
                )
                a_prompt = _assemble_prompt(
                    ITEM_PROOF_AGENT_A_PROMPT,
                    a_meta,
                    extra_instructions=_combine_extra_instructions(
                        _format_plan_history(plan_history),
                        stricter_no_progress,
                        retry_feedback_context,
                    ),
                )
                a_log_name = build_log_filename(
                    "item_proof", "agent_a", task_id, f"idx{item.index}", item.label, f"attempt{plan_round}_try{progress_attempt + 1}"
                )
                a_start = time.monotonic()
                a_res = _run_agent(
                    pipeline="item_proof",
                    agent="A",
                    stage=stage,
                    prompt=a_prompt,
                    log_dir=agent_a_logs_dir,
                    log_name=a_log_name,
                    model=agent_a_model,
                    reasoning_effort=agent_a_reasoning_effort,
                    log_meta={
                        "task_id": task_id,
                        "item_index": item.index,
                        "label": item.label,
                        "target_file": str(target_rel),
                        "attempt": plan_round,
                        "progress_attempt": progress_attempt + 1,
                    },
                )
                a_seconds = time.monotonic() - a_start
                file_after_a = file_snapshot(target_abs)
                after_text_a = target_abs.read_text(encoding="utf-8")
                after_sorry_total = _count_sorry_tokens(after_text_a)
                after_target_sorry = (
                    _count_sorry_tokens_in_named_decl_text(
                        file_text=after_text_a,
                        name=main_decl_name,
                    )
                    if main_decl_name
                    else None
                )
                total_tokens += a_res.tokens_used or 0
                log_event(
                    run_id,
                    "agent_a_result",
                    {
                        "index": item.index,
                        "label": item.label,
                        "task_id": task_id,
                        "plan_round": plan_round,
                        "progress_attempt": progress_attempt + 1,
                        "code": a_res.code,
                        "seconds": a_seconds,
                        "tokens_used": a_res.tokens_used,
                        "log_path": str(a_res.log_path) if a_res.log_path else None,
                        "model": agent_a_model,
                        "reasoning_effort": agent_a_reasoning_effort,
                        "file_before": file_before_a,
                        "file_after": file_after_a,
                        "file_delta": snapshot_delta(file_before_a, file_after_a),
                    },
                )
                if a_res.code != 0:
                    print(f"Proof Agent A failed with code {a_res.code}. Stopping.\n{a_res.stderr}")
                    abandon_item = True
                    abandon_status = "agent_a_failed"
                    abandon_detail = {"agent": "A", "code": a_res.code, "stderr": (a_res.stderr or "")[:2000]}
                    break
                _maybe_report_math_blocker(stdout=a_res.stdout, agent="A", label=item.label)

                axiom_decls = find_axiom_decls(target_abs)
                if axiom_decls:
                    print("Detected forbidden `axiom` declarations after proof Agent A; requesting cleanup.")
                    cleanup_instructions = build_axiom_cleanup_instructions(target_rel, axiom_decls)
                    cleanup_prompt = _assemble_prompt(ITEM_PROOF_AGENT_A_PROMPT, a_meta, cleanup_instructions)
                    cleanup_log_name = build_log_filename(
                        "item_proof", "agent_a_axiom_cleanup", task_id, f"idx{item.index}", item.label, f"attempt{plan_round}_try{progress_attempt + 1}"
                    )
                    cleanup_start = time.monotonic()
                    cleanup_res = _run_agent(
                        pipeline="item_proof",
                        agent="A",
                        stage=stage,
                        prompt=cleanup_prompt,
                        log_dir=agent_a_logs_dir,
                        log_name=cleanup_log_name,
                        model=agent_a_model,
                        reasoning_effort=agent_a_reasoning_effort,
                        log_meta={
                            "task_id": task_id,
                            "item_index": item.index,
                            "label": item.label,
                            "target_file": str(target_rel),
                            "attempt": plan_round,
                            "progress_attempt": progress_attempt + 1,
                            "mode": "axiom_cleanup",
                        },
                    )
                    _ = time.monotonic() - cleanup_start
                    total_tokens += cleanup_res.tokens_used or 0
                    log_event(
                        run_id,
                        "agent_a_axiom_cleanup_result",
                        {
                            "index": item.index,
                            "label": item.label,
                            "task_id": task_id,
                            "plan_round": plan_round,
                            "progress_attempt": progress_attempt + 1,
                            "code": cleanup_res.code,
                            "tokens_used": cleanup_res.tokens_used,
                            "log_path": str(cleanup_res.log_path) if cleanup_res.log_path else None,
                            "model": agent_a_model,
                            "reasoning_effort": agent_a_reasoning_effort,
                        },
                    )
                    if cleanup_res.code != 0:
                        print(f"Proof Agent A axiom cleanup failed with code {cleanup_res.code}. Stopping.\n{cleanup_res.stderr}")
                        abandon_item = True
                        abandon_status = "agent_a_axiom_cleanup_failed"
                        abandon_detail = {
                            "agent": "A",
                            "mode": "axiom_cleanup",
                            "code": cleanup_res.code,
                            "stderr": (cleanup_res.stderr or "")[:2000],
                        }
                        break
                    axiom_decls = find_axiom_decls(target_abs)
                    if axiom_decls:
                        print("Axiom cleanup failed; `axiom` declarations still present.")
                        print(format_axiom_report(axiom_decls))
                        abandon_item = True
                        abandon_status = "axiom_cleanup_incomplete"
                        abandon_detail = {"axiom_count": len(axiom_decls)}
                        break
                    after_text_a = target_abs.read_text(encoding="utf-8")
                    after_sorry_total = _count_sorry_tokens(after_text_a)
                    after_target_sorry = (
                        _count_sorry_tokens_in_named_decl_text(
                            file_text=after_text_a,
                            name=main_decl_name,
                        )
                        if main_decl_name
                        else None
                    )
                a_feedback, _ = extract_marked_json(a_res.stdout, AGENT_A_FEEDBACK_START, AGENT_A_FEEDBACK_END)
                a_feedback_summary = _summarize_feedback(a_feedback)
                _print_parsed_agent_summary(agent="A", summary=a_feedback_summary)
                if a_feedback and a_feedback.get("status") == "failed_bad_statement":
                    reason = (a_feedback.get("reason") or "no reason provided").strip()
                    print(f"[REPORT] Agent A flagged item as failed_bad_statement: {reason}")
                    if args.write_history:
                        append_history(
                            history_file,
                            pipeline="item_proof",
                            run_id=run_id,
                            lean_file=str(target_rel),
                            task_id=task_id,
                            kind="agent_a_feedback",
                            summary=f"status=failed_bad_statement | reason={reason[:300]}",
                            log_path=(str(a_res.log_path) if a_res.log_path else None),
                            payload={
                                "index": item.index,
                                "label": item.label,
                                "plan_round": plan_round,
                                "feedback": a_feedback,
                            },
                        )
                        append_history(
                            history_file,
                            pipeline="item_proof",
                            run_id=run_id,
                            lean_file=str(target_rel),
                            task_id=task_id,
                            kind="item_end",
                            summary="failed_bad_statement",
                            payload={
                                "index": item.index,
                                "label": item.label,
                                "status": "failed_bad_statement",
                                "reason": reason,
                            },
                        )
                    log_event(
                        run_id,
                        "item_end",
                        {
                            "index": item.index,
                            "label": item.label,
                            "task_id": task_id,
                            "status": "failed_bad_statement",
                            "failure_reason": reason,
                        },
                    )
                    state["next_index"] = item.index + 1
                    save_state(state, progress_file, run_id=run_id)
                    processed += 1
                    last_success = item.index
                    total_bad_statement += 1
                    skip_current_item = True
                    break

                proved_new_decls = _new_proved_named_decls(
                    before_text=before_text_a,
                    after_text=after_text_a,
                )
                target_sorry_shrunk = (
                    before_target_sorry is not None
                    and after_target_sorry is not None
                    and after_target_sorry < before_target_sorry
                )
                meaningful_progress = (
                    after_sorry_total < before_sorry_total
                    or bool(proved_new_decls)
                    or target_sorry_shrunk
                )
                if not meaningful_progress:
                    prior_no_progress_feedback = a_feedback if isinstance(a_feedback, dict) else None
                    log_event(
                        run_id,
                        "no_progress_after_agent_a",
                        {
                            "index": item.index,
                            "label": item.label,
                            "task_id": task_id,
                            "plan_round": plan_round,
                            "progress_attempt": progress_attempt + 1,
                            "before_sorry_total": before_sorry_total,
                            "after_sorry_total": after_sorry_total,
                            "target_before": before_target_sorry,
                            "target_after": after_target_sorry,
                            "proved_new_decls": proved_new_decls,
                        },
                    )
                    if progress_attempt < int(args.max_no_progress_retries):
                        print(
                            "Warning: no meaningful progress after proof Agent A; "
                            "retrying with stricter instructions."
                        )
                        continue
                    print(
                        "Warning: no meaningful progress after proof Agent A after retries; "
                        "recording failure and continuing."
                    )
                    if args.write_history:
                        append_history(
                            history_file,
                            pipeline="item_proof",
                            run_id=run_id,
                            lean_file=str(target_rel),
                            task_id=task_id,
                            kind="item_end",
                            summary="failed_no_progress",
                            payload={
                                "index": item.index,
                                "label": item.label,
                                "status": "failed_no_progress",
                            },
                        )
                    log_event(
                        run_id,
                        "item_end",
                        {
                            "index": item.index,
                            "label": item.label,
                            "task_id": task_id,
                            "status": "failed_no_progress",
                        },
                    )
                    state["next_index"] = item.index + 1
                    save_state(state, progress_file, run_id=run_id)
                    processed += 1
                    last_success = item.index
                    total_failed += 1
                    skip_current_item = True
                    break

                if a_feedback and a_feedback.get("status") == "needs_replan":
                    reason = a_feedback.get("reason") or "no reason given"
                    print(f"Agent A requested re-plan: {reason}")
                    feedback_summary = a_feedback_summary or reason
                    plan_history.append(f"- round {plan_round} A: {feedback_summary}")
                    if args.write_history:
                        append_history(
                            history_file,
                            pipeline="item_proof",
                            run_id=run_id,
                            lean_file=str(target_rel),
                            task_id=task_id,
                            kind="agent_a_feedback",
                            summary=feedback_summary,
                            log_path=(str(a_res.log_path) if a_res.log_path else None),
                            payload={
                                "index": item.index,
                                "label": item.label,
                                "plan_round": plan_round,
                                "feedback": a_feedback,
                            },
                        )
                    request_replan = True
                    replan_feedback = a_feedback
                    break

                break

            if skip_current_item:
                break
            if abandon_item:
                break
            if request_replan:
                if plan_round >= max_plan_rounds:
                    print("Reached max Agent C re-plan attempts; stopping.")
                    abandon_item = True
                    abandon_status = "max_replans_reached"
                    abandon_detail = {"reason": (replan_feedback or {}).get("reason") if isinstance(replan_feedback, dict) else None}
                    break
                feedback_for_c = replan_feedback
                continue

            break

        if skip_current_item:
            if args.max_items is not None and processed >= int(args.max_items):
                print(f"Reached max-items={args.max_items}, stopping batch.")
                break
            continue

        if abandon_item:
            total_failed += 1
            total_tokens += 0
            if args.write_history:
                append_history(
                    history_file,
                    pipeline="item_proof",
                    run_id=run_id,
                    lean_file=str(target_rel),
                    task_id=task_id,
                    kind="item_end",
                    summary=(abandon_status or "abandoned_in_planning"),
                    payload={
                        "index": item.index,
                        "label": item.label,
                        "status": (abandon_status or "abandoned_in_planning"),
                        **(abandon_detail or {}),
                    },
                )
            log_event(
                run_id,
                "item_end",
                {
                    "index": item.index,
                    "label": item.label,
                    "task_id": task_id,
                    "status": (abandon_status or "abandoned_in_planning"),
                    **(abandon_detail or {}),
                },
            )
            break

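        # Step 2: compile check. Run `lake env lean` on the target file. A non-zero
        # exit code, or non-sorry warnings while args.clean_warnings_with_agent_b is
        # set, marks the item as failed and hands it to proof Agent B below.
        # Otherwise split validation runs and, if it passes, progress is saved and
        # the loop moves on to the next item.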
        failed = False
        failure_kind = ""
        err_text = ""
        lean_start = time.monotonic()
        print(f"[lean check] running: lake env lean {target_rel} (cwd={LEAN_ROOT})")
        code, out, err = lake_env_lean(target_rel)
        lean_seconds = time.monotonic() - lean_start
        lean_output = "\n".join(part for part in (err, out) if part)
        non_sorry = _non_sorry_warning_blocks(lean_output)
        log_event(
            run_id,
            "lean_check",
            {
                "index": item.index,
                "label": item.label,
                "task_id": task_id,
                "phase": "post_agent_a",
                "code": code,
                "seconds": lean_seconds,
                "has_non_sorry_warnings": bool(non_sorry),
                "compiled_file": str(target_rel),
            },
        )
        if code != 0:
            failed = True
            failure_kind = "lean_error"
            err_text = lean_output
        elif non_sorry and args.clean_warnings_with_agent_b:
            failed = True
            failure_kind = "non_sorry_warnings"
            err_text = "Lean produced the following non-sorry warnings. Please remove them:\n\n" + "\n\n".join(non_sorry)
        if not failed:
            status = "ok_with_warnings" if non_sorry and not args.clean_warnings_with_agent_b else "ok"
            if status == "ok_with_warnings":
                print("Lean OK (proof), but has non-sorry warnings (--no-clean-warnings-with-agent-b); continuing.")
            else:
                print("Lean OK (proof).")
            split_ok, split_err = _split_and_validate(item=item, target_rel=target_rel, target_abs=target_abs)
            if not split_ok:
                print("Split validation failed after proof Agent A; recording failure and stopping batch.")
                total_failed += 1
                log_event(
                    run_id,
                    "item_end",
                    {
                        "index": item.index,
                        "label": item.label,
                        "task_id": task_id,
                        "status": "failed_after_split",
                        "seconds": time.monotonic() - item_start,
                        "error_snippet": (split_err[:1200] if split_err else None),
                    },
                )
                break
            state["next_index"] = item.index + 1
            save_state(state, progress_file, run_id=run_id)
            processed += 1
            last_success = item.index
            log_event(
                run_id,
                "item_end",
                {"index": item.index, "label": item.label, "task_id": task_id, "status": status, "seconds": time.monotonic() - item_start},
            )
            if args.max_items is not None and processed >= int(args.max_items):
                print(f"Reached max-items={args.max_items}, stopping batch.")
                break
            continue

        if failure_kind == "non_sorry_warnings":
            print("Lean produced non-sorry warnings after proof Agent A; calling proof Agent B to clean.")
        else:
            print("Lean failed after proof Agent A, calling proof Agent B...")
        success_after_b = False
        for attempt in range(1, int(args.max_b_retries) + 1):
            print(f"[Proof Agent B attempt {attempt}/{args.max_b_retries}]")
            file_before_b = file_snapshot(target_abs)
            b_prompt = _build_proof_agent_b_prompt(item=item, lean_file=target_rel, task_id=task_id, error_log=err_text)
            print(
                f"[Agent B] [item_proof] start idx={item.index} label={item.label} "
                f"target={target_rel} model={agent_settings.agents['B'].model or 'default'}/{agent_settings.agents['B'].reasoning_effort or 'default'}"
            )
            b_log_name = build_log_filename("item_proof", "agent_b", task_id, f"idx{item.index}", target_rel.as_posix(), f"attempt{attempt}")
            b_start = time.monotonic()
            b_res = _run_agent(
                pipeline="item_proof",
                agent="B",
                stage=stage,
                prompt=b_prompt,
                log_dir=agent_b_logs_dir,
                log_name=b_log_name,
                model=agent_settings.agents["B"].model,
                reasoning_effort=agent_settings.agents["B"].reasoning_effort,
                log_meta={
                    "task_id": task_id,
                    "item_index": item.index,
                    "label": item.label,
                    "target_file": str(target_rel),
                    "attempt": attempt,
                },
            )
            b_seconds = time.monotonic() - b_start
            total_tokens += b_res.tokens_used or 0
            file_after_b = file_snapshot(target_abs)
            log_event(
                run_id,
                "agent_b_result",
                {
                    "index": item.index,
                    "label": item.label,
                    "task_id": task_id,
                    "attempt": attempt,
                    "code": b_res.code,
                    "seconds": b_seconds,
                    "tokens_used": b_res.tokens_used,
                    "log_path": str(b_res.log_path) if b_res.log_path else None,
                    "file_before": file_before_b,
                    "file_after": file_after_b,
                    "file_delta": snapshot_delta(file_before_b, file_after_b),
                },
            )
            if b_res.code != 0:
                print(f"Proof Agent B failed with code {b_res.code}. Stopping.\n{b_res.stderr}")
                break

            print(f"[lean check] running: lake env lean {target_rel} (cwd={LEAN_ROOT})")
            code, out, err = lake_env_lean(target_rel)
            lean_output = "\n".join(part for part in (err, out) if part)
            non_sorry2 = _non_sorry_warning_blocks(lean_output)
            log_event(
                run_id,
                "lean_check",
                {
                    "index": item.index,
                    "label": item.label,
                    "task_id": task_id,
                    "phase": f"post_agent_b_attempt{attempt}",
                    "code": code,
                    "seconds": None,
                    "has_non_sorry_warnings": bool(non_sorry2),
                    "compiled_file": str(target_rel),
                },
            )
            if code == 0 and (not non_sorry2 or not args.clean_warnings_with_agent_b):
                if non_sorry2 and not args.clean_warnings_with_agent_b:
                    print(
                        "Lean OK after proof Agent B, but has non-sorry warnings (--no-clean-warnings-with-agent-b); continuing."
                    )
                else:
                    print("Lean OK after proof Agent B.")
                success_after_b = True
                break
            retry_failure_kind = "lean_error"
            if code != 0:
                retry_failure_kind = "lean_error"
                err_text = lean_output
            elif non_sorry2 and args.clean_warnings_with_agent_b:
                retry_failure_kind = "non_sorry_warnings"
                err_text = "Lean still reports these non-sorry warnings. Please remove them:\n\n" + "\n\n".join(non_sorry2)
            if retry_failure_kind == "non_sorry_warnings":
                print("Lean still has non-sorry warnings after proof Agent B; will retry if attempts remain.")
            else:
                print("Lean still failing after proof Agent B attempt; will retry if attempts remain.")

        if success_after_b:
            split_ok, split_err = _split_and_validate(item=item, target_rel=target_rel, target_abs=target_abs)
            if not split_ok:
                print("Split validation failed after proof Agent B; recording failure and stopping batch.")
                total_failed += 1
                log_event(
                    run_id,
                    "item_end",
                    {
                        "index": item.index,
                        "label": item.label,
                        "task_id": task_id,
                        "status": "failed_after_split",
                        "seconds": time.monotonic() - item_start,
                        "error_snippet": (split_err[:1200] if split_err else None),
                    },
                )
                break
            state["next_index"] = item.index + 1
            save_state(state, progress_file, run_id=run_id)
            processed += 1
            last_success = item.index
            log_event(
                run_id,
                "item_end",
                {"index": item.index, "label": item.label, "task_id": task_id, "status": "ok_after_b", "seconds": time.monotonic() - item_start},
            )
            if args.max_items is not None and processed >= int(args.max_items):
                print(f"Reached max-items={args.max_items}, stopping batch.")
                break
            continue

        print("Lean still failing after proof Agent B retries; recording failure and stopping batch.")
        total_failed += 1
        log_event(
            run_id,
            "item_end",
            {
                "index": item.index,
                "label": item.label,
                "task_id": task_id,
                "status": "failed_after_b",
                "seconds": time.monotonic() - item_start,
                "error_snippet": (err_text[:1200] if err_text else None),
            },
        )
        break

    summary = {
        "pipeline": "item_proof",
        "stage": 2,
        "run_id": run_id,
        "project": project,
        "task_id": task_id,
        "data_file": str(args.data_file),
        "processed": processed,
        "items_failed": total_failed + total_bad_statement,
        "items_failed_bad_statement": total_bad_statement,
        "last_success_index": last_success,
        "next_index": int(state.get("next_index", next_index)),
        "tokens_used_total": total_tokens,
        "seconds_total": time.monotonic() - run_start,
        "paths": {"progress_file": str(progress_file), "history_file": str(history_file), "logs_dir": str(logs_dir)},
    }
    finish_run(run_id, summary)
    print(
        f"Processed {processed} proof items. Last successful index: {last_success}. "
        f"Next index: {int(state.get('next_index', next_index))}."
    )


# Script entry point: run the pipeline only when this module is executed
# directly, never as a side effect of being imported.
# NOTE: this module uses package-relative imports (``from .config import ...``),
# so it has to be launched in package context (``python -m <package>.<module>``)
# rather than by file path.
if __name__ == "__main__":
    main()
