\review{
\begin{table}[h!]
\centering
\begin{tabular}{lcc}
\toprule
\textbf{Model} & \textbf{\believability (Mean $\pm$ Std)} & \textbf{\goalcompletion (Mean $\pm$ Std)} \\
\midrule
GPT-4o \textbf{(no memory)}     & $8.1 \pm 0.32$  & $5.9 \pm 0.46$  \\
GPT-4o + memory        & $9.1 \pm 0.54$  & $6.9 \pm 0.49$  \\
Gemini-1.5 + memory    & $8.55 \pm 0.56$ & $6.6 \pm 0.47$  \\
Llama-3.1 + memory     & $8.0 \pm 1.0$   & $6.8 \pm 0.56$  \\
Llama-3.2 + memory     & $8.2 \pm 0.93$  & $6.7 \pm 0.46$  \\
\bottomrule
\end{tabular}
\caption{Performance comparison of different models on the \believability and \goalcompletion metrics, including a model without memory of past interactions. The results show that, while performance on the \believability dimension is similar across models, performance on the \goalcompletion dimension is slightly worse for the model without memory than for those equipped with the advanced memory module.}
\label{tab:no_memory}
\end{table}
}