% Within-problem bloat control (auto-generated)
\begin{table}[tb]
\centering
\caption{\textbf{Within-problem bloat control (CI).}
For problems with $\geq 2$ train-correct predictions (excluding exact gold matches) across models,
we compare the holdout generalization of the shortest vs.\ the longest formulas
(and of near-gold vs.\ above-gold formulas when both exist). This controls for instance difficulty.
$\Delta$ (in percentage points) = short/near-gold $-$ long/above-gold; positive values indicate
shorter formulas generalize better on the \emph{same} problem.
Fraction of problems with $\Delta < 0$: Short--Long 1\%, Near--Above 4\%;
problems with neither $\Delta > 0$ nor $\Delta < 0$ are ties ($\Delta = 0$).
95\% CIs via percentile bootstrap (2000 resamples); $p$-values from one-sided binomial test.}
\label{tab:within_problem_bloat_ci}
\small
% Column layout: one left-aligned label column followed by six centred
% columns, each preceded by a fixed 4pt gap; outer padding is removed with @{}.
\begin{tabular}{@{}l*{6}{@{\hspace{4pt}}c}@{}}
  \toprule
  % Spanning header over columns 3-4, underlined by the partial rule below.
  & & \multicolumn{2}{c}{Holdout Gen} & & & \\
  \cmidrule(lr){3-4}
  Comparison  & $n$ & Short  & Long   & $\Delta$ [CI] & $\Delta>0$ & $p$   \\
  \midrule
  % One row per comparison.
  Short--Long & 157 & 6.2\% & 3.6\% & +2.7 [1, 5]   & 8\%        & 0.004 \\
  Near--Above & 80  & 5.7\% & 1.1\% & +4.6 [2, 8]   & 14\%       & 0.03  \\
  \bottomrule
\end{tabular}
\end{table}