
%%%%% BRIER %%%%%

\begin{table*}[htb!]
\centering
\begin{tabular}{c c c | c | c c | c c }
\toprule
\textbf{Model} & \textbf{Base Score} & \textbf{Metric} & \textbf{Uncalibrated} & \textbf{HB} & \textbf{IGHB} & \textbf{PS} & \textbf{GCULR} \\
\midrule
%
\multirow{9}{*}{Llama} 
& \multirow{3}{*}{self-consistency} 
& marginal     & 0.475     & 0.169   & \textbf{0.148} & 0.152 & \textbf{0.143}* \\
& & group max  & 0.535     & 0.323   & \textbf{0.247} & 0.285 & \textbf{0.235}* \\
& & group mean & 0.479     & 0.169   & \textbf{0.148} & 0.152 & \textbf{0.143}* \\
\cmidrule{2-8}
& \multirow{3}{*}{P(True)} 
& marginal     & 0.274     & 0.165   & \textbf{0.152} & 0.157 & \textbf{0.149}* \\
& & group max  & 0.341     & 0.315   & \textbf{0.261} & 0.305 & \textbf{0.250}* \\
& & group mean & 0.277     & 0.165   & \textbf{0.152} & 0.157 & \textbf{0.148}* \\
\cmidrule{2-8}
& \multirow{3}{*}{verb. conf.} 
& marginal     & 0.177     & 0.161   & \textbf{0.152} & 0.161 & \textbf{0.150}* \\
& & group max  & 0.270     & 0.311   & \textbf{0.253} & 0.311 & \textbf{0.248}* \\
& & group mean & 0.177     & 0.160   & \textbf{0.152} & 0.160 & \textbf{0.149}* \\
%%%%
\midrule
%%%%
%
\multirow{9}{*}{Mistral} 
& \multirow{3}{*}{self-consistency} 
& marginal     & 0.471     & 0.186   & \textbf{0.164} & 0.159 & \textbf{0.152}* \\ 
& & group max  & 0.554     & 0.333   & \textbf{0.285} & 0.250 & \textbf{0.235}* \\ 
& & group mean & 0.477     & 0.186   & \textbf{0.164} & 0.158 & \textbf{0.152}* \\ 
\cmidrule{2-8}
& \multirow{3}{*}{P(True)} 
& marginal     & 0.237     & 0.175   & \textbf{0.164} & 0.174 & \textbf{0.161}* \\
& & group max  & 0.304     & 0.318   & \textbf{0.259} & 0.317 & \textbf{0.249}* \\
& & group mean & 0.237     & 0.175   & \textbf{0.164} & 0.174 & \textbf{0.160}* \\
\cmidrule{2-8}
& \multirow{3}{*}{verb. conf.} 
& marginal     & 0.397     & 0.175   & \textbf{0.164} & 0.175 & \textbf{0.161}* \\
& & group max  & 0.427     & 0.318   & \textbf{0.259} & 0.318 & \textbf{0.249}* \\
& & group mean & 0.398     & 0.175   & \textbf{0.164} & 0.174 & \textbf{0.160}* \\
\bottomrule
\end{tabular}
\caption{We generate biographies using Llama 2 7B Chat and Mistral 7B Instruct for entities from \textsc{Bio-NQ} and compare each calibration method (HB, PS) against its multicalibration counterpart (IGHB, GCULR) on \textbf{Brier score} ($\downarrow$ is better). We report the score \textbf{marginally} across the entire dataset, as well as within each subgroup (in terms of the \textbf{max} and \textbf{mean} over all groups). We test each method using three base scores: self-consistency, P(True), and verbalized confidence (verb.\ conf.). We bold the better-performing method in each pairing and use * to denote the best-performing method across all methods in each row.}
\label{tab:multicalibration_brier_nq}
\end{table*}