% Difficulty Correlations Table (auto-generated)
% Table float: Spearman correlations between the two difficulty metrics
% (VS_final, Kill_mean) and per-model correctness, with one column group per
% setting (FullObs, EC) and one row per model.
% NOTE(review): every cell in this snapshot is a placeholder (`---` or `N/A`);
% the caption does not say what either placeholder means, and no cell carries
% the `*` marker explained in the legend row -- confirm against the generator.
\begin{table}[t]
\centering
\caption{\textbf{Correlation between generation diagnostics and model success.}
Spearman $\rho$ between difficulty metrics and correctness (0/1).
VS\_final = number of hypotheses remaining after all worlds;
Kill\_mean = mean hypotheses killed per world.
Negative correlations indicate harder instances (higher metric) have lower accuracy.}
\label{tab:difficulty_correlations}
\small
% No vertical rules: booktabs rules insert vertical space that leaves visible
% gaps in `|` separators. The two column groups are delimited by the trimmed
% \cmidrule lines under the spanning headers instead.
\begin{tabular}{@{}lrrrr@{}}
\toprule
 & \multicolumn{2}{c}{FullObs} & \multicolumn{2}{c}{EC} \\
\cmidrule(lr){2-3} \cmidrule(l){4-5}
Model & VS\_final & Kill\_mean & VS\_final & Kill\_mean \\
\midrule
Grok4 & --- & --- & N/A & N/A \\
GPT-5.2 & --- & --- & N/A & N/A \\
Grok4.1f & --- & --- & N/A & N/A \\
Gemini3 & --- & --- & N/A & N/A \\
DSR & --- & --- & N/A & N/A \\
Opus4.5 & --- & --- & N/A & N/A \\
Hermes4 & --- & --- & N/A & N/A \\
GPT-4o & --- & --- & N/A & N/A \\
\bottomrule
% Significance legend spanning all columns; @{}l@{} suppresses the column
% padding so the legend stays flush with the left edge of the Model column.
\multicolumn{5}{@{}l@{}}{\footnotesize * $p < 0.05$}
\end{tabular}
\end{table}