% Brier-score comparison (lower is better) for biographies generated from Bio-FActScore entities.
% Columns: the uncalibrated baseline, then each calibration method next to its
% multicalibration counterpart (HB vs. IGHB, PS vs. GCULR).
% Rows: marginal, group-max and group-mean scores, one three-row group per model.
% NOTE(review): [b!] on a starred float -- in two-column layouts standard LaTeX
% places table* only at the top of a page or on a float page unless a package
% such as stfloats is loaded; confirm the placement behaves as intended here.
\begin{table*}[b!]
  \centering
  \begin{tabular}{c c | c | c c | c c}
    \toprule
    \textbf{Model} & \textbf{Metric} & \textbf{Uncalibrated} & \textbf{HB} & \textbf{IGHB} & \textbf{PS} & \textbf{GCULR} \\
    \midrule
    % Bold = better method within each pairing; * = best across all methods evaluated.
    \multirow{3}{*}{Llama 2 7B Chat}
      & marginal   & 0.475 & 0.169 & \textbf{0.148} & 0.152 & \textbf{0.143}* \\
      & group max  & 0.535 & 0.323 & \textbf{0.247} & 0.285 & \textbf{0.235}* \\
      & group mean & 0.479 & 0.169 & \textbf{0.148} & 0.152 & \textbf{0.142}* \\
    \midrule
    \multirow{3}{*}{Mistral 7B Instruct}
      & marginal   & 0.471 & 0.186 & \textbf{0.159} & 0.164 & \textbf{0.152}* \\
      & group max  & 0.554 & 0.333 & \textbf{0.250} & 0.285 & \textbf{0.235}* \\
      & group mean & 0.477 & 0.186 & \textbf{0.158} & 0.164 & \textbf{0.152}* \\
    \bottomrule
  \end{tabular}
  \caption{We generate biographies for entities from \textsc{Bio-FActScore} and compare each calibration method (HB, PS) against its multicalibration counterpart (IGHB, GCULR) on \textbf{Brier score} ($\downarrow$ better) \textbf{marginally} across the entire dataset, as well as within each subgroup (in terms of \textbf{max} and \textbf{mean} over all groups). We bold the better-performing method for each pairing. * denotes the best-performing method across all methods evaluated. All methods use \textit{self-consistency} as their base scoring function.}
  \label{tab:multicalibration_brier_fs}
\end{table*}