\begin{table*}[b!]
\centering
\begin{tabular}{c c | c | c c | c c }
\toprule
\textbf{Model}        & \textbf{Metric}   & \textbf{Uncalibrated} & \textbf{HB}     & \textbf{IGHB}     & \textbf{PS}   & \textbf{GCULR} \\ 
\midrule
\multirow{3}{*}{Llama 2 7B Chat} 
& marginal   & 0.26830     & 0.00951   & \textbf{0.00229} & 0.00164 & \textbf{0.00125}* \\ 
& group max  & 0.48594     & 0.07208   & \textbf{0.04088} & 0.05017 & \textbf{0.03519}* \\ 
& group mean & 0.29983     & 0.02848   & \textbf{0.01108} & 0.01659 & \textbf{0.00858}* \\ 
\midrule
\multirow{3}{*}{Mistral 7B Instruct} 
& marginal   & 0.25496     & 0.01032   & \textbf{0.00268} & \textbf{0.00093}* & 0.00146 \\ 
& group max  & 0.54701     & 0.08436   & \textbf{0.04585}* & 0.07043 & \textbf{0.04931} \\ 
& group mean & 0.29435     & 0.03226   & \textbf{0.01143} & 0.01848 & \textbf{0.00911}* \\ 
\bottomrule
\end{tabular}
\caption{We generate biographies for entities from \textsc{Bio-FActScore} and compare each calibration method (HB, PS) against its multicalibration counterpart (IGHB, GCULR) on \textbf{ASCE}, \textbf{max gASCE}, and \textbf{average gASCE} ($\downarrow$ better). We bold the better-performing method for each pairing. * denotes the best-performing method across all methods evaluated. All methods use \textit{self-consistency} as their base scoring function.}
\label{tab:multicalibration_asce_fs}
\end{table*}