% FullObs Generalization: Near-Gold vs Above-Gold (auto-generated)
% Main finding: above-gold valid formulas generalize worse to held-out worlds
\begin{table}[t]
% Per-model held-out generalization rate (Gen%) of valid formulas in the
% FullObs setting, split at the AST threshold given in the caption.
\centering
\caption{\textbf{Held-out generalization by formula complexity (FullObs).}
Near-gold: AST $\leq$ gold+1; above-gold: AST $>$ gold+1.
$\Delta$ = near-gold $-$ above-gold, in percentage points (pp).}
\label{tab:fo_generalization}
\small
% One left-aligned label column followed by four right-aligned numeric
% columns; @{} removes the outer column padding.
\begin{tabular}{@{}lrrrr@{}}
\toprule
Model & \#Valid & \shortstack{Near-Gold\\Gen\%} & \shortstack{Above-Gold\\Gen\%} & \shortstack{$\Delta$\\(pp)} \\
\midrule
% Rows are listed in descending order of #Valid. The bold cells coincide with
% the column maxima of #Valid and both Gen% columns.
% Delta is a difference of two percentages, so it is reported in percentage
% points rather than with a percent sign.
% NOTE(review): Delta for GPT-5.2 (90.3 - 15.0 = 75.3) and DSR
% (95.3 - 37.8 = 57.5) differs by 0.1 from the displayed columns; presumably
% it is computed from unrounded rates -- confirm against the generator.
Grok4 & \textbf{181} & \textbf{98.1\%} & \textbf{40.5\%} & +57.6 \\
GPT-5.2 & 156 & 90.3\% & 15.0\% & +75.2 \\
Grok4.1f & 60 & 94.1\% & 20.0\% & +74.1 \\
Gemini 3 & 53 & 78.0\% & 26.1\% & +51.9 \\
DSR & 39 & 95.3\% & 37.8\% & +57.6 \\
Opus 4.5 & 32 & 78.4\% & 31.4\% & +47.0 \\
\bottomrule
\end{tabular}
\end{table}