% Summary statistics of the controlled LeanDojo subset (label: tab:dataset).
% Horizontal rules (\toprule, \midrule, \bottomrule) are the booktabs ones.
\begin{table}[t]
  \centering
  \caption{Controlled LeanDojo subset characteristics. The dataset is extracted from a curated mathlib4 subset rather than a full-mathlib trace.}
  \label{tab:dataset}
  \begin{tabular}{lr}
    \toprule
    Statistic          & Value  \\
    \midrule
    % Raw counts.
    Proof steps        & 3723   \\
    Theorems           & 1702   \\
    Tactic families    & 81     \\
    % Averages; 3723 / 1702 rounds to the steps/theorem value reported below.
    Mean steps/theorem & 2.187  \\
    Mean goal tokens   & 11.084 \\
    Mean context size  & 5.784  \\
    \bottomrule
  \end{tabular}
\end{table}

% Tactic-family classification results (label: tab:classification).
% Four representation blocks, each listing the same five models in the same
% order; \multirow (multirow package) spans the block label over its five rows.
% The Majority and Keyword rows carry identical numbers in every block.
\begin{table*}[t]
  \centering
  \footnotesize
  \caption{Tactic-family prediction results under by-theorem splitting. Accuracy, macro-F1 and top-3 accuracy compare the same model families across proof-state representations. Bold marks the strongest state-visible entry in each metric; future premise is a hindsight comparison.}
  \label{tab:classification}
  \begin{tabular}{clrrr}
    \toprule
    Representation & Model & Accuracy & Macro-F1 & Top-3 \\
    \midrule
    % --- State-only ---
    \multirow{5}{*}{State-only}
      & Majority    & 0.262          & 0.007          & 0.495          \\
      & Keyword     & 0.168          & 0.015          & 0.449          \\
      & Naive Bayes & 0.335          & 0.065          & 0.588          \\
      & TF-IDF LR   & 0.109          & 0.066          & 0.245          \\
      & TF-IDF SVM  & 0.214          & 0.096          & 0.456          \\
    \cmidrule(lr){1-5}
    % --- State+metadata (holds the bold Accuracy entry) ---
    \multirow{5}{*}{State+metadata}
      & Majority    & 0.262          & 0.007          & 0.495          \\
      & Keyword     & 0.168          & 0.015          & 0.449          \\
      & Naive Bayes & \textbf{0.352} & 0.063          & 0.592          \\
      & TF-IDF LR   & 0.117          & 0.073          & 0.252          \\
      & TF-IDF SVM  & 0.217          & 0.099          & 0.456          \\
    \cmidrule(lr){1-5}
    % --- Retrieved-premise (holds the bold Macro-F1 and Top-3 entries) ---
    \multirow{5}{*}{Retrieved-premise}
      & Majority    & 0.262          & 0.007          & 0.495          \\
      & Keyword     & 0.168          & 0.015          & 0.449          \\
      & Naive Bayes & 0.337          & 0.073          & \textbf{0.594} \\
      & TF-IDF LR   & 0.124          & 0.072          & 0.278          \\
      & TF-IDF SVM  & 0.237          & \textbf{0.103} & 0.469          \\
    \cmidrule(lr){1-5}
    % --- Future premise: hindsight comparison, so nothing here is bold ---
    \multirow{5}{*}{Future premise}
      & Majority    & 0.262          & 0.007          & 0.495          \\
      & Keyword     & 0.168          & 0.015          & 0.449          \\
      & Naive Bayes & 0.360          & 0.051          & 0.586          \\
      & TF-IDF LR   & 0.130          & 0.076          & 0.288          \\
      & TF-IDF SVM  & 0.296          & 0.124          & 0.546          \\
    \bottomrule
  \end{tabular}
\end{table*}

% Split-seed stability of the two strongest model families (label: tab:stability).
% Representation blocks follow the same order as the classification table in
% this file (State-only, State+metadata, Retrieved-premise, Future premise) so
% the two tables can be compared row group by row group.
% Only the mean is wrapped in \textbf; the "$\pm$ std" part stays in regular weight.
\begin{table*}[t]
  \centering
  \footnotesize
  \caption{Split-seed stability on the controlled subset. Values are mean $\pm$ population standard deviation over six by-theorem random splits. Bold marks the strongest state-visible mean in each metric (tied means are both bold); future premise is a hindsight comparison and is excluded from bolding.}
  \label{tab:stability}
  \begin{tabular}{clrrr}
    \toprule
    Representation & Model & Accuracy & Macro-F1 & Top-3 \\
    \midrule
    % --- State-only (shares the bold Macro-F1 mean with State+metadata) ---
    \multirow{2}{*}{State-only}
      & Naive Bayes & 0.329 $\pm$ 0.009          & 0.066 $\pm$ 0.007          & 0.586 $\pm$ 0.013          \\
      & TF-IDF SVM  & 0.219 $\pm$ 0.010          & \textbf{0.101} $\pm$ 0.010 & 0.449 $\pm$ 0.016          \\
    \cmidrule(lr){1-5}
    % --- State+metadata (bold Accuracy and Top-3 means) ---
    \multirow{2}{*}{State+metadata}
      & Naive Bayes & \textbf{0.338} $\pm$ 0.010 & 0.059 $\pm$ 0.004          & \textbf{0.596} $\pm$ 0.014 \\
      & TF-IDF SVM  & 0.229 $\pm$ 0.012          & \textbf{0.101} $\pm$ 0.008 & 0.449 $\pm$ 0.020          \\
    \cmidrule(lr){1-5}
    % --- Retrieved-premise ---
    \multirow{2}{*}{Retrieved-premise}
      & Naive Bayes & 0.329 $\pm$ 0.007          & 0.073 $\pm$ 0.006          & 0.585 $\pm$ 0.008          \\
      & TF-IDF SVM  & 0.240 $\pm$ 0.016          & 0.097 $\pm$ 0.010          & 0.469 $\pm$ 0.016          \\
    \cmidrule(lr){1-5}
    % --- Future premise: hindsight comparison, never bold even where it is largest ---
    \multirow{2}{*}{Future premise}
      & Naive Bayes & 0.352 $\pm$ 0.004          & 0.051 $\pm$ 0.005          & 0.589 $\pm$ 0.012          \\
      & TF-IDF SVM  & 0.301 $\pm$ 0.009          & 0.129 $\pm$ 0.008          & 0.547 $\pm$ 0.020          \\
    \bottomrule
  \end{tabular}
\end{table*}

% Retrieval ranking metrics per guidance strategy (label: tab:search).
% The True family row has the largest value in every column but is never bold:
% it is the ceiling the caption refers to, so it is set off by its own \midrule.
% Exact@5 values of 0.169 and 0.170 are all bold because the caption treats
% near-equal Exact@5 values as ties.
\begin{table*}[t]
  \centering
  \footnotesize
  \caption{Retrieval ranking before Lean checking. The table reports family-level and trace-tactic success at fixed ranks. Bold marks leading non-ceiling entries (the True family row is the ceiling); near-equal Exact@5 values are treated as ties.}
  \label{tab:search}
  \begin{tabular}{lrrrr}
    \toprule
    Strategy                   & Family@1       & Family@5       & Exact@1        & Exact@5        \\
    \midrule
    Unguided                   & \textbf{0.354} & \textbf{0.600} & 0.120          & \textbf{0.169} \\
    Soft family (val-selected) & 0.337          & 0.589          & \textbf{0.123} & \textbf{0.170} \\
    Hard family                & 0.113          & 0.130          & 0.069          & 0.082          \\
    Family RRF                 & 0.261          & 0.546          & 0.111          & \textbf{0.169} \\
    Top-m family               & 0.108          & 0.231          & 0.065          & 0.111          \\
    \midrule
    % Ceiling row, excluded from bolding.
    True family                & 0.988          & 0.988          & 0.199          & 0.245          \\
    \bottomrule
  \end{tabular}
\end{table*}

% Lean checking outcomes per guidance strategy (label: tab:micro_execution).
% "Recon.\ fail" uses a control space after the abbreviation period so TeX does
% not insert end-of-sentence spacing inside the header or the caption.
% Coverage + Recon. fail = 1.000 in every row of this table.
% NOTE(review): "Recon. fail" is read here as the state-reconstruction failure
% rate, based on the caption's wording and the row sums -- confirm.
\begin{table*}[t]
  \centering
  \footnotesize
  \caption{Lean checking results on 500 held-out S4 states. Each strategy checks 2500 candidate tactics. Accept@k uses all queries as the denominator; coverage is the fraction of candidate tactics that reached ordinary Lean acceptance or rejection after state reconstruction, and Recon.\ fail is the complementary fraction. Bold marks the best value in each column (highest, except lowest for Recon.\ fail).}
  \label{tab:micro_execution}
  \begin{tabular}{lrrrrr}
    \toprule
    Strategy     & Accept@1       & Accept@3       & Accept@5       & Coverage       & Recon.\ fail   \\
    \midrule
    Unguided     & 0.424          & 0.588          & \textbf{0.672} & \textbf{0.988} & \textbf{0.012} \\
    Soft family  & \textbf{0.428} & \textbf{0.594} & 0.652          & 0.987          & 0.013          \\
    Hard family  & 0.424          & 0.516          & 0.580          & 0.980          & 0.020          \\
    Family RRF   & 0.412          & 0.562          & 0.622          & 0.982          & 0.018          \\
    Top-m family & 0.372          & 0.558          & 0.590          & 0.964          & 0.036          \\
    \bottomrule
  \end{tabular}
\end{table*}
