% CI Generalization: Near-Gold vs Above-Gold (auto-generated)
% Shows YES holdout exact-match rate by formula complexity
% Table tab:ci_generalization -- held-out generalization of valid
% (train-correct) formulas, split by AST size relative to the gold formula:
%   near-gold  : AST <= gold+1
%   above-gold : AST >  gold+1
% Columns: model, number of valid formulas, YES-holdout exact-match rate for
% each group, and Delta = near-gold minus above-gold.
% Bold marks the column maximum in #Valid and in the two Gen% columns; the
% Delta column is intentionally left without bold here (no bold in the data).
% NOTE(review): the Gemini 3 row reports Delta = +8.7 although the displayed
% rates give 10.5 - 1.9 = 8.6; presumably Delta is computed from unrounded
% rates -- confirm against the generating script.
% Requires the booktabs package (\toprule, \midrule, \bottomrule).
\begin{table}[t]
  \centering
  \caption{\textbf{Held-out generalization by formula complexity (CI).}
    For valid (train-correct) formulas, we compare YES holdout exact-match rates
    between \emph{near-gold} formulas (AST $\leq$ gold+1) and \emph{above-gold}
    formulas (AST $>$ gold+1). $\Delta$ = near-gold $-$ above-gold; positive values
    indicate near-gold formulas generalize better to new YES worlds.
    Bold marks the highest value in the \#Valid column and in each Gen\% column.}
  \label{tab:ci_generalization}
  \small
  \begin{tabular}{@{}lrrrr@{}}
    \toprule
    Model    & \#Valid      & \shortstack{Near-Gold\\Gen\%} & \shortstack{Above-Gold\\Gen\%} & $\Delta$ \\
    \midrule
    GPT-5.2  & \textbf{165} & 5.4\%                         & 1.6\%                          & +3.8\%   \\
    Grok4    & 156          & \textbf{13.8\%}               & \textbf{3.7\%}                 & +10.1\%  \\
    Grok4.1f & 121          & 8.7\%                         & 1.6\%                          & +7.1\%   \\
    Gemini 3 & 110          & 10.5\%                        & 1.9\%                          & +8.7\%   \\
    DSR      & 83           & 9.5\%                         & 0.0\%                          & +9.5\%   \\
    Opus 4.5 & 68           & 6.9\%                         & 3.3\%                          & +3.6\%   \\
    \bottomrule
  \end{tabular}
\end{table}