%%%%% ASCE %%%%%

\begin{table*}[hbt!]
\centering
\begin{tabular}{c c c | c | c c | c c }
% Column layout: model, base score, metric, the uncalibrated baseline, and
% then two method pairs (HB, IGHB) and (PS, GCULR).
% Within each pair the better (lower) value is bold; * marks the best value
% in the row (conventions as stated in the caption).
\toprule
\textbf{Model} & \textbf{Base Score} & \textbf{Metric} & \textbf{Uncalibrated} & \textbf{HB} & \textbf{IGHB} & \textbf{PS} & \textbf{GCULR} \\
% Header/body separator: \midrule gives the header row \aboverulesep of
% clearance, whereas a second \toprule would use \abovetopsep (0pt by default).
\midrule
% ---- Llama: three base scores x three metrics = nine rows ----
\multirow{9}{*}{Llama}
& \multirow{3}{*}{self-consistency}
& marginal     & 0.32291     & 0.00875   & \textbf{0.00038} & 0.00022 & \textbf{0.00015}* \\
& & group max  & 0.42343     & 0.07711   & \textbf{0.01481} & 0.05791 & \textbf{0.00628}* \\
& & group mean & 0.33352     & 0.01597   & \textbf{0.00289} & 0.00636 & \textbf{0.00111}* \\
\cmidrule{2-8}
& \multirow{3}{*}{P(True)}
& marginal     & 0.11768     & 0.00451   & \textbf{0.00021}* & 0.00036 & \textbf{0.00022} \\
& & group max  & 0.20701     & 0.06988   & \textbf{0.01798} & 0.06025 & \textbf{0.00697}* \\
& & group mean & 0.12682     & 0.01176   & \textbf{0.00328} & 0.00654 & \textbf{0.00145}* \\
\cmidrule{2-8}
& \multirow{3}{*}{verb. conf.}
& marginal     & 0.01642     & \textbf{0.00014}   & 0.00055 & \textbf{0.00013}* & 0.00023 \\
& & group max  & 0.06645     & 0.06634   & \textbf{0.01315} & 0.06709 & \textbf{0.00730}* \\
& & group mean & 0.02447     & 0.00738   & \textbf{0.00357} & 0.00750 & \textbf{0.00154}* \\
% ---- Mistral: same row structure as the Llama group above ----
\midrule
\multirow{9}{*}{Mistral}
& \multirow{3}{*}{self-consistency}
& marginal     & 0.30706     & 0.01163   & \textbf{0.00026} & 0.00029 & \textbf{0.00013}* \\
& & group max  & 0.43659     & 0.08372   & \textbf{0.01660} & 0.05729 & \textbf{0.00487}* \\
& & group mean & 0.32067     & 0.01988   & \textbf{0.00269} & 0.00726 & \textbf{0.00097}* \\
\cmidrule{2-8}
& \multirow{3}{*}{P(True)}
& marginal     & 0.06291     & 0.00074   & \textbf{0.00031} & 0.00047 & \textbf{0.00015}* \\
& & group max  & 0.12293     & 0.06942   & \textbf{0.01417} & 0.07054 & \textbf{0.00587}* \\
& & group mean & 0.07173     & 0.00896   & \textbf{0.00309} & 0.00886 & \textbf{0.00132}* \\
\cmidrule{2-8}
& \multirow{3}{*}{verb. conf.}
& marginal     & 0.22229     & 0.00047   & \textbf{0.00036} & 0.00034 & \textbf{0.00015}* \\
& & group max  & 0.33763     & 0.06922   & \textbf{0.01453} & 0.07080 & \textbf{0.00653}* \\
& & group mean & 0.23230     & 0.00869   & \textbf{0.00315} & 0.00878 & \textbf{0.00128}* \\
\bottomrule
\end{tabular}
\caption{We generate biographies using Llama 2 7B Chat and Mistral 7B Instruct for entities from \textsc{Bio-NQ} and compare each calibration method (HB, PS) against its multicalibration counterpart (IGHB, GCULR) on \textbf{ASCE} (marginal), \textbf{max gASCE} (group max), and \textbf{average gASCE} (group mean); lower is better ($\downarrow$). We test each method using three base scores: self-consistency, P(True), and verbalized confidence (verb.\ conf.). We bold the better-performing method within each pairing and use * to denote the best-performing method across all methods in each row.}
\label{tab:multicalibration_asce_nq}
\end{table*}
