% Lift-Hard Breakdown Table (auto-generated)
% Two-column-wide float reporting Acc@+25 per model, split into lift-hard
% ("Lift") and non-lift ("Non") instances for each of the three tasks
% (FullObs, CI, EC). Requires the booktabs package (\toprule, \cmidrule, ...).
% NOTE: the file header marks this table as auto-generated; mirror any layout
% change in the generator so it is not overwritten on the next run.
\begin{table*}[t]
\centering
\caption{\textbf{Lift-hard breakdown across tasks.}
Lift-hard instances contain cross-relational patterns (using both R and S predicates)
that empirically prove harder for models. We report Acc@+25 (budgeted accuracy)
separately for lift-hard and non-lift instances. Numbers in parentheses in the
column headers are instance counts. EC v1 contains no lift-hard instances
by construction (N/A).}
\label{tab:lift_hard_breakdown}
\small
% No vertical rules: booktabs rules add vertical padding, so `|` separators
% would render with gaps. The task groups are delimited by trimmed
% \cmidrule segments under the spanning headers instead.
\begin{tabular}{@{}lrrrrrr@{}}
\toprule
 & \multicolumn{2}{c}{FullObs} & \multicolumn{2}{c}{CI} & \multicolumn{2}{c}{EC} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(l){6-7}
Model & Lift (61) & Non (314) & Lift (28) & Non (172) & Lift (0) & Non (200) \\
\midrule
Grok4 & 13.1\% & 53.2\% & 71.4\% & 76.2\% & N/A & 53.0\% \\
GPT-5.2 & 0.0\% & 23.2\% & 50.0\% & 76.7\% & N/A & 59.5\% \\
Grok4.1f & 0.0\% & 21.3\% & 46.4\% & 62.8\% & N/A & 41.0\% \\
Gemini3 & 0.0\% & 18.2\% & 46.4\% & 56.4\% & N/A & 52.0\% \\
DSR & 0.0\% & 12.4\% & 46.4\% & 40.7\% & N/A & 33.0\% \\
Opus4.5 & 0.0\% & 10.2\% & 32.1\% & 34.3\% & N/A & 30.0\% \\
Hermes4 & 0.0\% & 3.2\% & 0.0\% & 2.9\% & N/A & 15.5\% \\
GPT-4o & 0.0\% & 0.0\% & 0.0\% & 0.6\% & N/A & 2.0\% \\
\bottomrule
\end{tabular}
\end{table*}