\begin{table}[!htbp]
\centering
\footnotesize
\caption{Paired Lean checking comparison against unguided retrieval on Accept@5 (A@5). Differences are strategy minus unguided over the same 500 states.}
\label{tab:execution_significance}
\begin{tabular}{lrrr}
\toprule
Strategy & $\Delta$A@5 & 95\% CI & McNemar $p$ \\
\midrule
Soft family & $-0.020$ & $[-0.042, 0.002]$ & 0.121 \\
Hard family & $-0.092$ & $[-0.132, -0.052]$ & $<0.001$ \\
Family RRF & $-0.050$ & $[-0.080, -0.020]$ & 0.002 \\
Top-m family & $-0.082$ & $[-0.118, -0.046]$ & $<0.001$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[!htbp]
\centering
\scriptsize
\caption{Lean-accepted alternatives that differ from the traced tactic. All rates use the same query-level denominator as the 500-state execution set. A-no-E is acceptance without a trace hit; E-no-A is a trace hit without Lean acceptance in the reconstructed state.}
\label{tab:accepted_alternatives}
\begin{tabular}{lrrrrr}
\toprule
Strategy & Trace & Lean & Non-trace & A-no-E & E-no-A \\
\midrule
Unguided & 0.168 & 0.672 & 0.628 & 0.534 & 0.030 \\
Soft family & 0.170 & 0.652 & 0.596 & 0.516 & 0.034 \\
Hard family & 0.090 & 0.580 & 0.518 & 0.502 & 0.012 \\
Family RRF & 0.160 & 0.622 & 0.570 & 0.492 & 0.030 \\
Top-m family & 0.124 & 0.590 & 0.564 & 0.490 & 0.024 \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[!htbp]
\centering
\footnotesize
\caption{Reconstruction and checking failures in the 500-state Lean check. The 245 reconstruction failures are separated from normal Lean rejections; timeouts and other tool failures did not occur.}
\label{tab:execution_failure_classes}
\begin{tabular}{lrr}
\toprule
Class & Count & Cand. frac. \\
\midrule
Unknown identifier & 198 & 0.016 \\
Elab./type class & 47 & 0.004 \\
Parse error & 0 & 0.000 \\
Timeout & 0 & 0.000 \\
Other tool failure & 0 & 0.000 \\
\bottomrule
\end{tabular}
\end{table}
