% Correlation table: one row per base-LLM benchmark, one column per
% VirtualHome action-sequencing metric (see caption).
% Cell styling follows the legend in the table note:
%   \textbf  -> |r| >= 0.7,   \textit -> 0.5 <= |r| < 0.7,   plain otherwise.
% Negative coefficients are set in math mode so they get a true minus sign
% instead of a text hyphen.
% NOTE(review): 14 columns at \footnotesize may still exceed \textwidth --
% check the compiled output.
\begin{table}[htbp]
\centering
\caption{Correlation between Base LLM Benchmarks and VirtualHome Action Sequencing Task Performance}
\label{tab:virtualhome_action_sequencing_correlation}
\footnotesize
% One left-aligned label column followed by 13 centred metric columns;
% @{} drops the outer padding to save horizontal space.
\begin{tabular}{@{}l*{13}{c}@{}}
\toprule
Base LLM Metrics & Task Success & State Goal & Relation Goal & Action Goal & Total Goal & Execution Success & Parsing & Hallucination & Predicate Arg & Wrong Order & Missing Step & Affordance & Additional Step \\
\midrule
GPQA & \textit{0.525} & \textit{0.554} & \textit{0.521} & \textit{0.514} & \textit{0.552} & \textit{0.507} & 0.480 & 0.081 & $-0.204$ & $-0.369$ & $-0.360$ & $-0.198$ & $-0.317$ \\
MUSR & \textit{0.558} & \textit{0.564} & \textit{0.558} & \textit{0.531} & \textit{0.571} & \textit{0.532} & 0.382 & 0.212 & $-0.085$ & $-0.350$ & $-0.329$ & $-0.102$ & $-0.286$ \\
IFEval & \textit{0.618} & \textit{0.589} & \textit{0.577} & \textit{0.622} & \textit{0.608} & \textit{0.648} & \textit{0.530} & 0.152 & $-0.345$ & $-0.016$ & $-0.386$ & $-0.189$ & 0.112 \\
MMLU-PRO & \textbf{0.714} & \textbf{0.706} & \textbf{0.707} & \textbf{0.704} & \textbf{0.725} & \textbf{0.701} & \textit{0.535} & 0.217 & $-0.308$ & $-0.485$ & $-0.355$ & $-0.199$ & $-0.304$ \\
BBH & \textbf{0.754} & \textbf{0.742} & \textbf{0.743} & \textbf{0.746} & \textbf{0.764} & \textbf{0.747} & \textit{0.574} & 0.217 & $-0.382$ & $-0.388$ & $-0.380$ & $-0.205$ & $-0.277$ \\
MATH Lvl 5 & \textbf{0.782} & \textbf{0.761} & \textbf{0.783} & \textbf{0.779} & \textbf{0.792} & \textbf{0.773} & 0.496 & 0.227 & $-0.080$ & $-0.417$ & $-0.255$ & $-0.122$ & $-0.190$ \\
\bottomrule
\end{tabular}
% NOTE(review): tablenotes is normally used inside a threeparttable
% environment so the note width matches the tabular; the preamble is not
% visible here, so the structure is left as-is -- confirm and wrap if the
% package is loaded.
\begin{tablenotes}
\footnotesize
\item Note: Bold values indicate strong correlations ($|r| \geq 0.7$), italic values indicate moderate correlations ($0.5 \leq |r| < 0.7$).
\end{tablenotes}
\end{table}
