% LaTeX document with correlation tables
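% Required packages: booktabs (provides \toprule, \midrule, \bottomrule).
% Also listed by the generator: threeparttable, longtable, array -- the tables
% below do not use them directly; keep them only if other parts of the
% document need them.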

% Correlation coefficients (r) between base-LLM benchmark scores (columns)
% and Virtualhome action-sequencing metrics (rows).
% Cell styling follows the key stated in the caption:
%   \textbf{...} for |r| >= 0.7, \textit{...} for 0.5 <= |r| < 0.7,
%   unstyled otherwise.
% Negative values are written as $-$0.xxx: a bare "-" in text mode is
% typeset as a hyphen, not as a minus sign.
\begin{table}[htbp]
  \centering
  \caption{Correlation between Base LLM Benchmarks and Virtualhome Action Sequencing Task Performance. Bold values indicate strong correlations ($|r| \geq 0.7$), italic values indicate moderate correlations ($0.5 \leq |r| < 0.7$).}
  \label{tab:virtualhome_action_sequencing_correlation}
  \footnotesize
  \begin{tabular}{lcccccc}
    \toprule
    EAI Task Metrics & GPQA & MUSR & IFEval & MMLU-PRO & BBH & MATH Lvl 5 \\
    \midrule
    Task Success & \textit{0.525} & \textit{0.558} & \textit{0.618} & \textbf{0.714} & \textbf{0.754} & \textbf{0.782} \\
    State Goal & \textit{0.554} & \textit{0.564} & \textit{0.589} & \textbf{0.706} & \textbf{0.742} & \textbf{0.761} \\
    Relation Goal & \textit{0.521} & \textit{0.558} & \textit{0.577} & \textbf{0.707} & \textbf{0.743} & \textbf{0.783} \\
    Action Goal & \textit{0.514} & \textit{0.531} & \textit{0.622} & \textbf{0.704} & \textbf{0.746} & \textbf{0.779} \\
    Total Goal & \textit{0.552} & \textit{0.571} & \textit{0.608} & \textbf{0.725} & \textbf{0.764} & \textbf{0.792} \\
    Execution Success & \textit{0.507} & \textit{0.532} & \textit{0.648} & \textbf{0.701} & \textbf{0.747} & \textbf{0.773} \\
    Parsing & 0.480 & 0.382 & \textit{0.530} & \textit{0.535} & \textit{0.574} & 0.496 \\
    Hallucination & 0.081 & 0.212 & 0.152 & 0.217 & 0.217 & 0.227 \\
    Predicate Arg & $-$0.204 & $-$0.085 & $-$0.345 & $-$0.308 & $-$0.382 & $-$0.080 \\
    Wrong Order & $-$0.369 & $-$0.350 & $-$0.016 & $-$0.485 & $-$0.388 & $-$0.417 \\
    Missing Step & $-$0.360 & $-$0.329 & $-$0.386 & $-$0.355 & $-$0.380 & $-$0.255 \\
    Affordance & $-$0.198 & $-$0.102 & $-$0.189 & $-$0.199 & $-$0.205 & $-$0.122 \\
    Additional Step & $-$0.317 & $-$0.286 & 0.112 & $-$0.304 & $-$0.277 & $-$0.190 \\
    \bottomrule
  \end{tabular}
\end{table}

% Correlation coefficients (r) between base-LLM benchmark scores (columns)
% and Behavior action-sequencing metrics (rows).
% Cell styling follows the key stated in the caption:
%   \textbf{...} for |r| >= 0.7, \textit{...} for 0.5 <= |r| < 0.7,
%   unstyled otherwise. The key is on |r|, so strongly negative values
%   (e.g. Wrong Order vs. IFEval) are styled as well.
% Negative values are written as $-$0.xxx: a bare "-" in text mode is
% typeset as a hyphen, not as a minus sign.
\begin{table}[htbp]
  \centering
  \caption{Correlation between Base LLM Benchmarks and Behavior Action Sequencing Task Performance. Bold values indicate strong correlations ($|r| \geq 0.7$), italic values indicate moderate correlations ($0.5 \leq |r| < 0.7$).}
  \label{tab:behavior_action_sequencing_correlation}
  \footnotesize
  \begin{tabular}{lcccccc}
    \toprule
    EAI Task Metrics & GPQA & MUSR & IFEval & MMLU-PRO & BBH & MATH Lvl 5 \\
    \midrule
    Task Success & 0.311 & 0.203 & \textit{0.689} & \textit{0.601} & \textit{0.613} & \textit{0.604} \\
    State Goal & 0.282 & 0.264 & \textbf{0.702} & \textit{0.613} & \textit{0.581} & \textit{0.649} \\
    Relation Goal & 0.355 & 0.148 & \textbf{0.709} & \textit{0.526} & \textit{0.620} & \textit{0.510} \\
    Total Goal & 0.340 & 0.196 & \textbf{0.740} & \textit{0.578} & \textit{0.629} & \textit{0.589} \\
    Execution Success & 0.333 & 0.184 & \textbf{0.726} & \textit{0.587} & \textit{0.615} & \textit{0.574} \\
    Parsing & 0.081 & 0.062 & \textbf{0.838} & 0.307 & 0.372 & 0.295 \\
    Hallucination & 0.137 & $-$0.035 & $-$0.187 & 0.120 & 0.111 & 0.225 \\
    Predicate Arg & 0.201 & 0.016 & $-$0.112 & 0.256 & 0.220 & 0.247 \\
    Wrong Order & $-$0.171 & $-$0.131 & \textbf{$-$0.816} & $-$0.356 & $-$0.469 & $-$0.444 \\
    Missing Step & $-$0.010 & 0.069 & \textbf{$-$0.783} & $-$0.229 & $-$0.296 & $-$0.254 \\
    Additional Step & 0.063 & 0.027 & \textit{$-$0.515} & $-$0.147 & $-$0.180 & $-$0.208 \\
    \bottomrule
  \end{tabular}
\end{table}

% Correlation coefficients (r): base-LLM benchmarks (one column each)
% against the four Virtualhome goal-interpretation F1 metrics (one row each).
% Styling key, as stated in the caption: \textbf for |r| >= 0.7,
% \textit for 0.5 <= |r| < 0.7, plain otherwise. No value in this table
% reaches the bold threshold.
\begin{table}[htbp]
  \centering
  \caption{Correlation between Base LLM Benchmarks and Virtualhome Goal Interpretation Task Performance. Bold values indicate strong correlations ($|r| \geq 0.7$), italic values indicate moderate correlations ($0.5 \leq |r| < 0.7$).}
  \label{tab:virtualhome_goal_interpretation_correlation}
  \footnotesize
  % One left-aligned label column followed by six centred value columns.
  \begin{tabular}{l*{6}{c}}
    \toprule
    EAI Task Metrics & GPQA           & MUSR  & IFEval & MMLU-PRO & BBH            & MATH Lvl 5 \\
    \midrule
    Node F1          & 0.495          & 0.237 & 0.087  & 0.372    & 0.459          & 0.136      \\
    Edge F1          & \textit{0.531} & 0.305 & 0.163  & 0.456    & \textit{0.567} & 0.274      \\
    Action F1        & 0.339          & 0.175 & 0.314  & 0.250    & 0.343          & 0.163      \\
    All F1           & \textit{0.554} & 0.289 & 0.162  & 0.427    & \textit{0.526} & 0.187      \\
    \bottomrule
  \end{tabular}
\end{table}

% Correlation coefficients (r) between base-LLM benchmark scores (columns)
% and Behavior goal-interpretation metrics (rows).
% Cell styling follows the key stated in the caption:
%   \textbf{...} for |r| >= 0.7, \textit{...} for 0.5 <= |r| < 0.7,
%   unstyled otherwise.
% Negative values are written as $-$0.xxx: a bare "-" in text mode is
% typeset as a hyphen, not as a minus sign.
\begin{table}[htbp]
  \centering
  \caption{Correlation between Base LLM Benchmarks and Behavior Goal Interpretation Task Performance. Bold values indicate strong correlations ($|r| \geq 0.7$), italic values indicate moderate correlations ($0.5 \leq |r| < 0.7$).}
  \label{tab:behavior_goal_interpretation_correlation}
  \footnotesize
  \begin{tabular}{lcccccc}
    \toprule
    EAI Task Metrics & GPQA & MUSR & IFEval & MMLU-PRO & BBH & MATH Lvl 5 \\
    \midrule
    Overall F1 & \textit{0.627} & \textit{0.642} & 0.484 & \textbf{0.821} & \textbf{0.742} & \textbf{0.761} \\
    State Goal F1 & \textit{0.576} & \textit{0.605} & 0.459 & \textbf{0.777} & \textbf{0.707} & \textbf{0.780} \\
    Relation Goal F1 & \textit{0.646} & \textit{0.652} & \textit{0.500} & \textbf{0.837} & \textbf{0.765} & \textbf{0.727} \\
    % NOTE(review): the next two labels and the "Grammatically Valid" label
    % were cut off at 17 characters plus "..." in the generated source
    % ("State Hallucinati...", "Object Hallucinat...", "Grammatically Val...").
    % They are restored to their unambiguous stems here; the original metric
    % names may carry a suffix (e.g. "Rate") -- TODO confirm against the
    % data that produced this table.
    State Hallucination & 0.309 & 0.300 & 0.402 & 0.493 & 0.403 & 0.466 \\
    Object Hallucination & 0.325 & 0.324 & 0.420 & \textit{0.511} & 0.422 & 0.490 \\
    Format Error Rate & 0.295 & 0.280 & 0.419 & 0.473 & 0.384 & 0.460 \\
    Grammatically Valid & $-$0.446 & $-$0.412 & \textit{$-$0.515} & \textit{$-$0.634} & \textit{$-$0.518} & \textit{$-$0.574} \\
    \bottomrule
  \end{tabular}
\end{table}

% Summary:
% Total tables generated: 4
% Tables included:
% - Virtualhome Action Sequencing
% - Behavior Action Sequencing
% - Virtualhome Goal Interpretation
% - Behavior Goal Interpretation
