% FullObs Generalization vs Bloat Bins Table (auto-generated)
\begin{table}[t]
\centering
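% NOTE(review): the parenthesized integer in each cell is not defined in the caption.
% It is presumably a per-bin count, but several rates are not k/n for the shown n
% (e.g. 20.0 with 3, 20.0 with 6), so it may not be the rate's denominator.
% TODO: confirm against the generator and define it in the caption.
\caption{\textbf{FullObs generalization by AST delta bin.}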
For train-correct predictions, we report holdout exact-match rate (\%).
Bins: $\leq$+1 (compact), +2..+25 (moderate), $>$+25 (bloated).
A dash (---) indicates no train-correct instances in that bin.}
\label{tab:fullobs_generalization_bins}
\small
\begin{tabular}{@{}lrrr@{}}
\toprule
Model & $\leq$+1 & +2..+25 & $>$+25 \\
\midrule
Grok4 & 98.1 (144) & 50.0 (22) & 26.7 (15) \\
GPT-5.2 & 90.3 (39) & 34.3 (28) & 9.0 (89) \\
Grok4.1f & 94.1 (54) & 20.0 (6) & --- \\
Gemini3 & 78.0 (30) & 27.0 (20) & 20.0 (3) \\
DSR & 95.3 (30) & 37.8 (9) & --- \\
Opus4.5 & 78.4 (25) & 31.4 (7) & --- \\
Hermes4 & 100.0 (10) & --- & --- \\
\bottomrule
\end{tabular}
\end{table}