% Figure: believability (left) and goal-completion (right) scores across
% episodes, as evaluated with Llama-3.1.
\begin{figure}[!htbp]
    \centering
    \includegraphics[width=0.88\textwidth]{figures/llama_eval/bel_goal_combined.pdf}
    \caption{Performance of language agents and humans across multiple episodes evaluated using \textbf{Llama-3.1}. (Left) Evolution of \believability scores with an increasing number of episodes. (Right) Evolution of \goalcompletion scores. We observe that Llama-3.1 is not able to properly distinguish between conversations in which the agents perform well and those in which they do not, thus making it unsuitable for use.}
    \label{fig:llama:main}
\end{figure}