% Validation table: per-checkpoint confusion-matrix counts (TP/FP/TN/FN)
% followed by the precision, recall, and F1 score reported for each checkpoint.
\begin{table}[t]
% Row stretch is set inside the float so it stays local to this environment's
% group; the surrounding document keeps whatever \arraystretch it had before,
% instead of being force-reset to 1.0 afterwards.
\renewcommand{\arraystretch}{1.4}
\centering
% Scale the tabular to the text width. The trailing % signs keep the line-end
% spaces out of the scaled box (they would otherwise be typeset as glue).
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccccc}
\hline \hline
\textbf{Checkpoint} & \textbf{True Positive} & \textbf{False Positive} & \textbf{True Negative} & \textbf{False Negative} & \textbf{Precision} & \textbf{Recall} & \textbf{F1 Score} \\
\hline
Repetition of Sentences & 48 & 2 & 42 & 8 & 0.96 & 0.85 & 0.90 \\
Consistency with Character Traits & 43 & 7 & 44 & 6 & 0.86 & 0.87 & 0.86 \\
Consistency with Environment Goals & 48 & 2 & 37 & 13 & 0.96 & 0.78 & 0.86 \\
Agent Leaves Promptly After Goal Resolution & 36 & 14 & 42 & 8 & 0.72 & 0.81 & 0.76 \\
Repetition of Exact Goal & 33 & 17 & 48 & 2 & 0.66 & 0.94 & 0.77 \\
Stalling in a Conversation & 39 & 11 & 45 & 5 & 0.78 & 0.88 & 0.82 \\
Character Responses & 49 & 1 & 37 & 13 & 0.98 & 0.79 & 0.87 \\
% NOTE(review): the counts in the next row give recall 47/(47+17) ~ 0.73 and
% F1 ~ 0.82, not the listed 0.78 / 0.85. The listed metrics would match
% TN = 37, FN = 13. Confirm against the raw annotations before publication.
Episode Beginning & 47 & 3 & 33 & 17 & 0.94 & 0.78 & 0.85 \\
\hline \hline
\end{tabular}%
}
% NOTE(review): TeX swallows the space after a control word, so unless
% \believabilityextended supplies its own trailing space (e.g. via \xspace)
% the name will run into "dimension" -- confirm against the macro definition.
\caption{Performance of GPT-4 as an evaluator for the \believabilityextended dimension, with the evaluation results validated manually.}
\label{tab:bel_extended:validation}
\end{table}