{
  "condition": "pdf_to_chat_baseline",
  "started_at": "2026-05-08T23:10:42.971333+00:00",
  "finished_at": "2026-05-08T23:13:55.607995+00:00",
  "wall_seconds": 192.644,
  "returncode": 0,
  "claude_duration_ms": 191842,
  "claude_api_duration_ms": 191011,
  "num_turns": 9,
  "total_cost_usd": 0.31631525000000005,
  "stop_reason": "end_turn",
  "terminal_reason": "completed",
  "permission_denials": [],
  "tool_commands": [
    "curl -L -sS -o /tmp/proofnet.pdf https://arxiv.org/pdf/2302.12433 && pdftotext -f 1 -l 10 -layout /tmp/proofnet.pdf - | head -c 50000",
    "cat <claude_tool_result_path>",
    "pdftotext -f 1 -l 10 -layout /tmp/proofnet.pdf - | head -c 50000 | strings | head -200",
    "curl -L -sS -o /tmp/fimo.pdf https://arxiv.org/pdf/2309.04295 && pdftotext -f 1 -l 10 -layout /tmp/fimo.pdf - | head -c 50000 | strings | head -300",
    "pdftotext -f 1 -l 10 -layout /tmp/fimo.pdf - | head -c 50000 | strings | sed -n '50,300p'",
    "curl -L -sS \"https://stepfun.ai/research/en/stepfun-prover-preview\" | head -c 50000",
    "curl -L -sS \"https://stepfun.ai/research/en/stepfun-prover-preview\" | python3 -c \"\nimport sys, re\nhtml = sys.stdin.read()\n# Remove scripts, styles, head\nhtml = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)\nhtml = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)\nhtml = re.sub(r'<head[^>]*>.*?</head>', '', html, flags=re.DOTALL)\n# Remove tags\ntext = re.sub(r'<[^>]+>', ' ', html)\n# Clean whitespace\ntext = re.sub(r'[ \\t]+', ' ', text)\ntext = re.sub(r'\\n\\s*\\n+', '\\n\\n', text)\nprint(text[:20000])\n\"",
    "curl -L -sS -o /tmp/verina.pdf https://arxiv.org/pdf/2505.23135 && pdftotext -f 1 -l 10 -layout /tmp/verina.pdf - | head -c 50000 | strings | head -300"
  ],
  "tool_result_bytes": 91059,
  "primary_model_output_tokens": 10691,
  "primary_model_input_tokens": 10,
  "primary_model_cache_read_tokens": 152540,
  "primary_model_cache_creation_tokens": 29043,
  "result": {
    "condition": "pdf_to_chat_baseline",
    "route": [
      {
        "url": "https://arxiv.org/pdf/2302.12433",
        "reason": "ProofNet establishes the benchmark landscape for autoformalization and formal proving at undergraduate level, identifying the informal-to-formal translation gap as the field's central infrastructure problem."
      },
      {
        "url": "https://arxiv.org/pdf/2309.04295",
        "reason": "FIMO escalates difficulty to IMO Shortlisted Problems, confirming that GPT-4 cannot prove any formally stated Olympiad problems and revealing the formalization pipeline's dependence on human semantic verification."
      },
      {
        "url": "https://stepfun.ai/research/en/stepfun-prover-preview",
        "reason": "StepFun-Prover demonstrates that training LLMs via RL to conduct open-ended Lean 4 REPL interactions \u2014 rather than receiving verifier feedback only at test time \u2014 achieves new state-of-the-art on miniF2F, establishing verifier-in-the-loop RL as the dominant paradigm shift."
      },
      {
        "url": "https://arxiv.org/pdf/2505.23135",
        "reason": "VERINA extends evaluation to verifiable software code generation, showing proof generation remains critically limited (4.9% single-trial success for o3; 20.1% with 64 iterative refinements), and proving that the formal gap at software-verification difficulty is a distinct, harder challenge than math competition proving."
      }
    ],
    "pages_read_count": 4,
    "paper_summaries": [
      {
        "paper": "ProofNet (arXiv:2302.12433)",
        "summary": "Introduces ProofNet, a 371-example benchmark for autoformalization and formal proving of undergraduate-level mathematics in Lean 3, covering real and complex analysis, linear algebra, abstract algebra, and topology. Also releases PROOF GPT models (1.3B and 6.7B params) trained on an 8B-token proof-pile. Proposes two novel autoformalization methods: prompt retrieval (nearest-neighbor search over mathlib declarations) and distilled backtranslation (unsupervised finetuning without parallel data). Central finding: lack of parallel informal-formal benchmarks was blocking progress; ProofNet fills this gap but baseline models remain far from human performance.",
        "decision": "use"
      },
      {
        "paper": "FIMO (arXiv:2309.04295)",
        "summary": "Presents FIMO, a dataset of 149 human-verified Lean formal statements sourced from IMO Shortlisted Problems (2006\u20132021), focused on Algebra and Number Theory. Constructs the dataset via a three-stage pipeline: OCR (Mathpix) \u2192 GPT-4 auto-formalization with iterative Lean error feedback (up to 5 reflection rounds) \u2192 human semantic verification. Key finding: GPT-4 fails to prove any IMO-level formal statements, confirming that competition-level mathematical reasoning is qualitatively beyond current LLMs. The feedback-augmented auto-formalization loop is promising but insufficient at this difficulty, and human verification remains unavoidable.",
        "decision": "use"
      },
      {
        "paper": "StepFun-Prover Preview (stepfun.ai/research/en/stepfun-prover-preview)",
        "summary": "Releases StepFun-Prover-Preview-7B and 32B \u2014 Lean 4 theorem provers trained via tool-integrated reinforcement learning. The training pipeline proceeds through cold-start data curation (multi-turn trajectories collected with Claude Sonnet 4 plus Kimina-Prover-72B outputs), two-stage SFT, response pattern fusion, and tool-integrated GRPO RL in which the model learns to decide when to invoke the Lean 4 REPL sandbox, interpret error messages, and adaptively restructure proofs without a fixed interaction limit. StepFun-Prover-Preview-32B achieves 70.0% pass@1 on miniF2F-test, outperforming DeepSeek-Prover-V2-671B (61.9%) and Kimina-Prover-72B (63.9%) with a much smaller model. Test-time scaling is monotonic: pass@1 rises from 58.3% at 4K to 70.0% at 20K maximum generation length, confirming verifier interaction as the core proof mechanism.",
        "decision": "use"
      },
      {
        "paper": "VERINA (arXiv:2505.23135, ICLR 2026)",
        "summary": "Introduces VERINA (Verifiable Code Generation Arena), a 189-task Lean benchmark for holistic evaluation of verifiable code generation across three compositional subtasks: code generation, specification generation (pre/post-conditions), and proof generation. Manually curated from MBPP-DFY-50, CloverBench, and university course submissions. Best general-purpose model (OpenAI o3): 72.6% code correctness, 52.3% spec soundness/completeness, 4.9% proof success (single trial). Best theorem-proving specialist (Goedel Prover V2 32B): 11.2% proof success. Iterative Lean compiler feedback raises proof success to 20.1% with 64 refinement steps, but at high computational cost and with 80% of proofs still failing. Identifies proof generation \u2014 not code or specification generation \u2014 as the binding bottleneck in the verifiable code pipeline.",
        "decision": "use"
      }
    ],
    "source_checks": [
      {
        "claim": "StepFun-Prover-Preview-32B achieves 70.0% pass@1 on miniF2F-test, surpassing DeepSeek-Prover-V2-671B (61.9%) and Kimina-Prover-72B (63.9%)",
        "checked_against": "StepFun-Prover source article Table 1 (Performance comparison on miniF2F-test)",
        "status": "supported"
      },
      {
        "claim": "OpenAI o3 achieves only 4.9% proof success rate (single trial) on VERINA; iterative refinement with 64 steps raises this to 20.1%",
        "checked_against": "VERINA abstract and Section 1 introduction",
        "status": "supported"
      },
      {
        "claim": "ProofNet contains 371 parallel formal/informal theorem pairs in Lean 3 drawn from undergraduate pure mathematics textbooks",
        "checked_against": "ProofNet abstract and Section 2 dataset collection",
        "status": "supported"
      },
      {
        "claim": "GPT-4 (gpt-4-0314) cannot prove any IMO-level formal statements in the FIMO benchmark",
        "checked_against": "FIMO abstract ('highlights GPT-4's limited capacity to yield satisfactory results') and experimental discussion",
        "status": "supported"
      },
      {
        "claim": "Test-time scaling in StepFun-Prover is monotonic: pass@1 increases from 58.3% (4K tokens) to 70.0% (20K tokens)",
        "checked_against": "StepFun-Prover article Table 2 (Performance with various maximum generation lengths)",
        "status": "supported"
      },
      {
        "claim": "Autoformalization of formal mathematical statements still requires human semantic verification even with LLM-plus-Lean-feedback loops",
        "checked_against": "FIMO dataset construction section (Manual Verification stage) and ProofNet dataset collection criteria",
        "status": "supported"
      }
    ],
    "two_page_synthesis": "The four sources trace a coherent arc from benchmark construction through state-of-the-art proof generation to software verification, revealing why the formal reasoning gap between human mathematicians and automated theorem provers remains large \u2014 and pointing to one structural intervention, tight iterative coupling between LLM generation and formal verifier feedback, as the key lever for progress.\n\nProofNet (2023) identified the core infrastructure problem: no parallel benchmark aligned informal undergraduate mathematics with formal Lean 3 statements, making progress in autoformalization essentially unmeasurable. Its 371-example dataset, spanning real and complex analysis, linear algebra, abstract algebra, and topology, exposed how difficult it is to bridge informal mathematical intuition and formal rigor even at the undergraduate level. PROOF GPT models, trained on an 8B-token proof-pile, outperformed base models on perplexity metrics but the paper's two novel autoformalization techniques \u2014 prompt retrieval via nearest-neighbor search over mathlib declarations, and distilled backtranslation \u2014 represent incremental gains on the translation subtask, not yet a path to full end-to-end proving.\n\nFIMO (2023) escalated the difficulty to genuine Olympiad problems and found a steep cliff. Sourcing 149 problems from IMO Shortlisted Problems and formalizing them through a three-stage pipeline (OCR, GPT-4 auto-formalization with iterative Lean error feedback, human semantic verification), the authors showed that GPT-4 could produce syntactically valid Lean statements but could not prove a single one. The critical design choice \u2014 exposing the model to Lean error messages and allowing iterative reflection \u2014 is the same pattern that would later define the StepFun-Prover training regime, but at IMO difficulty it cannot eliminate the need for human semantic validation of the formal statements themselves. This reveals a two-layer problem: autoformalization correctness and proof search capability are separate failure modes.\n\nStepFun-Prover (2025) operationalizes the verifier-feedback loop at training time rather than inference time, and the results are dramatic. The training pipeline begins with cold-start SFT data including multi-turn trajectories collected with Claude Sonnet 4, followed by two-stage fine-tuning, response pattern fusion to reconcile different reasoning styles, and finally group relative policy optimization (GRPO) in which models learn to decide when to invoke the Lean 4 REPL sandbox, how to interpret error and warning messages, and when to restructure the proof entirely. Crucially, there is no fixed limit on the number of REPL interactions: the model learns to self-terminate when it believes the proof is complete. StepFun-Prover-Preview-32B achieves 70.0% pass@1 on miniF2F-test, outperforming DeepSeek-Prover-V2-671B (61.9%) and Kimina-Prover-72B (63.9%) with a far smaller model. Test-time scaling is monotonic: extending maximum generation length from 4K to 20K tokens raises pass@1 from 58.3% to 70.0%, and the REPL interaction frequency distribution for successful proofs confirms that many solutions emerge only after multiple rounds of error diagnosis and adaptive restructuring. The verifier feedback loop is not a fallback mechanism \u2014 it is the primary proof generation process.\n\nVERINA (2026) applies this framing to software verification and reveals a harder bottleneck. The benchmark's 189 manually-curated Lean tasks require jointly generating code, formal pre/post-condition specifications, and correctness proofs \u2014 a more compositional challenge than pure math proving, and one that demands precise semantic alignment between three formal artifacts rather than one. Even with Lean compiler feedback available, the best general-purpose model (OpenAI o3) achieves only 4.9% proof success in a single trial. The best specialist theorem prover (Goedel Prover V2 32B) reaches 11.2%. Iterative refinement over 64 steps raises proof success to 20.1% \u2014 meaningful progress, but leaving 80% of tasks unsolved and incurring substantial computational cost. Critically, code correctness (72.6%) and specification soundness (52.3%) are far higher than proof success, confirming that proof construction \u2014 not code generation or informal-to-formal translation \u2014 is the binding constraint in the verifiable software pipeline.\n\nThe route reveals a field in transition. The dominant paradigm \u2014 generate formal proofs, check them with an ITP, and learn from the feedback signal \u2014 has been validated at the miniF2F competition level and is now being pushed toward software verification and genuine Olympiad difficulty. The trajectory from 0% (GPT-4 on FIMO IMO problems) to 70% (StepFun-Prover on miniF2F) and to 4\u201320% (best models on VERINA proofs) exposes both the power and the limits of this paradigm. The informal-to-formal translation bottleneck (autoformalization) identified by ProofNet and FIMO remains present even in 2025\u20132026 systems: VERINA's manually curated ground-truth formal specifications are required because LLMs cannot yet reliably generate faithful specifications without human intervention. Progress on proof search (via RL from verifier feedback) has outpaced progress on autoformalization, creating an asymmetry that will need to be addressed for truly end-to-end automated theorem proving.",
    "final_question": "To what extent does reinforcement learning from proof-assistant verifier feedback (Lean REPL interactions) generalize across difficulty regimes \u2014 from undergraduate-level and competition-level math to software verification \u2014 and does the informal-to-formal autoformalization step constitute an independent bottleneck that limits end-to-end automated theorem proving regardless of proof search capability?",
    "observations": [
      "Iterative verifier-feedback loops during RL training \u2014 not just at inference time \u2014 yield qualitative performance gains: StepFun-Prover-Preview-32B achieves 70.0% pass@1 on miniF2F-test by learning to conduct open-ended multi-turn Lean 4 REPL interactions, outperforming models 20\u00d7 its size (DeepSeek-Prover-V2-671B at 61.9%), with test-time scaling showing monotonic improvement from 58.3% at 4K to 70.0% at 20K generation tokens. [Source: StepFun-Prover article Table 1 and Table 2]",
      "Proof generation is the binding bottleneck in verifiable code pipelines, not code or specification generation: VERINA shows that even OpenAI o3 achieves 72.6% code correctness but only 4.9% proof success in a single trial, and iterative Lean compiler feedback over 64 steps improves this to only 20.1% \u2014 confirming that formal reasoning about code correctness remains qualitatively harder than generating the code itself. [Source: VERINA abstract and Section 1]",
      "The informal-to-formal autoformalization gap is a persistent, difficulty-dependent bottleneck that operates independently of proof search capability: ProofNet (2023) introduced prompt retrieval and distilled backtranslation to bridge the gap at undergraduate level, while FIMO (2023) showed that even GPT-4 with iterative Lean feedback cannot autoformalize and prove any IMO Shortlisted Problems without human semantic verification \u2014 implying that as benchmark difficulty rises, autoformalization failure precedes and compounds proof search failure. [Sources: ProofNet Section 4.1, FIMO abstract and dataset construction section]"
    ],
    "limitation": "Cross-benchmark evaluation is essentially absent in the current literature, making it impossible to attribute observed performance gaps to specific failure modes. StepFun-Prover achieves 70% on miniF2F but its performance on FIMO IMO-level tasks or VERINA software-verification proofs is unreported; VERINA evaluates Goedel Prover V2 but not StepFun-Prover or the models from ProofNet and FIMO. The steep drop between miniF2F pass rates (70%) and VERINA proof rates (4.9\u201311.2%) is inferred by comparing different models on different benchmarks designed by different teams, not from controlled ablations holding the model fixed. Without a unified evaluation across difficulty regimes \u2014 undergraduate autoformalization, competition proving, and software verification proofs \u2014 it is unclear whether the verifier-feedback RL paradigm represents a general capability gain or a benchmark-specific one, and whether improvements in proof search (as in StepFun-Prover) would transfer to the harder autoformalization-plus-proving pipeline that FIMO and VERINA require.",
    "blocking_model_calls_needed": 6,
    "notes": "All four source papers were successfully fetched and processed. ProofNet and FIMO PDFs were extracted with pdftotext (first 10 pages each). The StepFun-Prover page is a Next.js app that renders content client-side; the article text was recoverable via HTML tag stripping (Python regex) from the raw server-rendered HTML, yielding sufficient detail including the methodology, training pipeline, experimental tables, and full reference list. VERINA PDF was extracted with pdftotext. No sources were unavailable. VERINA is published as a conference paper at ICLR 2026 (per the PDF header), which is a future date relative to the knowledge cutoff \u2014 the arXiv preprint (v3, 2026-03-16) was used as the source. The blocking_model_calls_needed estimate of 6 reflects: 1 planning call + 1 call per paper summary (4 calls) + 1 final synthesis call, which is the minimum sequential workflow for a PDF-to-chat baseline without parallelization."
  }
}