% Per-group ASCE for HB vs. IGHB (Llama 2 7B Chat outputs on Bio-NQ).
% Rows: the five groups with the most negative Delta, then the group with the smallest change.
% In every row below, Delta equals the IGHB column minus the HB column.
\begin{table*}[b!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{HB} & \textbf{IGHB} & $\Delta$ \\
\midrule % header/body separator; \toprule is kept for the top edge of the table only
\multirow{5}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{EU/ME} & 0.0771 & 0.0148 & -0.0623 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{has IMDb ID} = \textit{True} & 0.0540 & 0.0055 & -0.0485 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport} = \textit{False} & 0.0429 & 0.0031 & -0.0398 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0395 & 0.0033 & -0.0362 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{NA} & 0.0379 & 0.0042 & -0.0337 \\
\midrule
Min $\Delta$
& \textbf{\# Wiki prop.} = \textit{Medium} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{APAC} & 0.0114 & 0.0088 & -0.0026 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
% The trailing % after \caption{ keeps the line break from adding a leading space to the caption text.
\caption{%
[\textbf{Calibration} on \textit{self-consistency} scores] Using outputs from \textbf{Llama 2 7B Chat} on \textsc{Bio-NQ}, we calculate the ASCE for each group using \textbf{HB} and \textbf{IGHB} as well as the difference in ASCE ($\Delta$) between the two methods. We then present the top 5 groups according to $\Delta$, where \textbf{top} corresponds to groups for which the multicalibration method achieves the biggest improvement (most negative change $\Delta$). In our experiments, we find that IGHB improves over HB for all groups, so as a reference we also present the group with the minimum amount of change between IGHB and HB.
}
\label{tab:asce_hb_best_worst_llama2}
\end{table*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Per-group ASCE for HB vs. IGHB (Mistral 7B Instruct outputs on Bio-NQ).
% Rows: the five groups with the most negative Delta, then the group with the smallest change.
% Delta is the IGHB column minus the HB column (displayed values are rounded to four decimals).
\begin{table*}[btp!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{HB} & \textbf{IGHB} & $\Delta$ \\
\midrule % header/body separator; \toprule is kept for the top edge of the table only
\multirow{5}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{EU/ME} & 0.0837 & 0.0166 & -0.0671 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{has IMDb ID} = \textit{True} & 0.0718 & 0.0097 & -0.0621 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport} = \textit{False} & 0.0505 & 0.0059 & -0.0446 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{NA} & 0.0458 & 0.0088 & -0.0370 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0414 & 0.0045 & -0.0369 \\
\midrule
Min $\Delta$
& \textbf{\# Wiki prop.} = \textit{Medium} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{APAC} & 0.0118 & 0.0098 & -0.0021 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
% The trailing % after \caption{ keeps the line break from adding a leading space to the caption text.
\caption{%
[\textbf{Calibration} on \textit{self-consistency} scores] Using outputs from \textbf{Mistral 7B Instruct} on \textsc{Bio-NQ}, we calculate the ASCE for each group using \textbf{HB} and \textbf{IGHB} as well as the difference in ASCE ($\Delta$) between the two methods. We then present the top 5 groups according to $\Delta$, where \textbf{top} corresponds to groups for which the multicalibration method achieves the biggest improvement (most negative change $\Delta$). In our experiments, we find that IGHB improves over HB for all groups, so as a reference we also present the group with the minimum amount of change between IGHB and HB.
}
\label{tab:asce_hb_best_worst_mistral}
\end{table*}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


% Per-group ASCE for PS vs. GCULR (Llama 2 7B Chat outputs on Bio-NQ).
% Two blocks of five groups (most negative Delta / largest Delta), each followed by a Mean row.
% Delta is the GCULR column minus the PS column (displayed values are rounded to four decimals).
\begin{table*}[btp!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{PS} & \textbf{GCULR} & $\Delta$ \\
\midrule % header/body separator; \toprule is kept for the top edge of the table only
\multirow{6}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{EU/ME} & 0.0579 & 0.0063 & -0.0516 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{has IMDb ID} = \textit{True} & 0.0375 & 0.0030 & -0.0345 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport} = \textit{False} & 0.0266 & 0.0013 & -0.0253 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0243 & 0.0034 & -0.0209 \\
% TODO(review): the row below carries exactly the same numbers as the Mean row that follows
% (0.0338 / 0.0031 / -0.0306) and breaks the ordering by Delta (-0.0306 listed after -0.0209).
% This looks like a copy-paste of the Mean row. If the Mean row is correct, the other four rows
% imply roughly PS ~ 0.0227, GCULR ~ 0.0015, Delta ~ -0.0208 for this group.
% Confirm against the raw results before publishing; values are left unchanged here.
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{NA} & 0.0338 & 0.0031 & -0.0306 \\
\cmidrule{2-5} % booktabs partial rule, matching the other rules in this table
& Mean & 0.0338 & 0.0031 & -0.0306 \\
\midrule
\multirow{6}{*}{Bottom 5 $\Delta$}
& \textbf{has IMDb ID} = \textit{True} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{APAC} & 0.0015 & 0.0018 & 0.0003 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport} = \textit{True} & 0.0028 & 0.0033 & 0.0005 \\
& \textbf{has IMDb ID} = \textit{False} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{NA} & 0.0005 & 0.0012 & 0.0007 \\
& \textbf{has IMDb ID} = \textit{False} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0011 & 0.0019 & 0.0008 \\
% TODO(review): the row below is identical to the Mean row that follows (0.0016 / 0.0024 / 0.0008),
% and that Mean is not the average of the five rows as displayed (they average to about
% 0.0015 / 0.0021 / 0.0006). If the Mean row is correct, the other four rows imply roughly
% PS ~ 0.0021, GCULR ~ 0.0038, Delta ~ 0.0017 for this group.
% Confirm against the raw results before publishing; values are left unchanged here.
& \textbf{nationality} = \textit{APAC} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0016 & 0.0024 & 0.0008 \\
\cmidrule{2-5} % booktabs partial rule, matching the other rules in this table
& Mean & 0.0016 & 0.0024 & 0.0008 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
% The trailing % after \caption{ keeps the line break from adding a leading space to the caption text.
\caption{%
[\textbf{Calibration} on \textit{self-consistency} scores] Using outputs from \textbf{Llama 2 7B Chat} on \textsc{Bio-NQ}, we calculate the ASCE for each group using \textbf{PS} and \textbf{GCULR} as well as the difference in ASCE ($\Delta$) between the two methods. We then present the top and bottom 5 groups according to $\Delta$, where \textbf{top} corresponds to groups for which the multivalid method achieves the biggest improvement (most negative change $\Delta$). In addition, we calculate the mean values for the top and bottom 5. We observe that GCULR greatly improves over PS among the top 5 groups, and even in the cases where GCULR worsens ASCE compared to PS, we find that the errors are already extremely small for both PS and GCULR.
}
\label{tab:asce_reg_best_worst_llama2}
\end{table*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Per-group ASCE for PS vs. GCULR (Mistral 7B Instruct outputs on Bio-NQ).
% Two blocks of five groups (most negative Delta / largest Delta), each followed by a Mean row.
% Delta is the GCULR column minus the PS column (displayed values are rounded to four decimals).
\begin{table*}[btp!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{PS} & \textbf{GCULR} & $\Delta$ \\
\midrule % header/body separator; \toprule is kept for the top edge of the table only
\multirow{6}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{EU/ME} & 0.0573 & 0.0049 & -0.0524 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{has IMDb ID} = \textit{True} & 0.0488 & 0.0035 & -0.0453 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport} = \textit{False} & 0.0291 & 0.0011 & -0.0280 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{NA} & 0.0253 & 0.0018 & -0.0235 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0226 & 0.0026 & -0.0200 \\
\cmidrule{2-5} % booktabs partial rule, matching the other rules in this table
& Mean & 0.0366 & 0.0028 & -0.0338 \\
\midrule
\multirow{6}{*}{Bottom 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Medium} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{APAC} & 0.0018 & 0.0019 & 0.0001 \\
& \textbf{nationality} = \textit{APAC} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0020 & 0.0027 & 0.0008 \\
& \textbf{has IMDb ID} = \textit{False} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{female} & 0.0011 & 0.0022 & 0.0011 \\
& \textbf{has IMDb ID} = \textit{True} \textcolor{ForestGreen}{\&} \textbf{nationality} = \textit{APAC} & 0.0010 & 0.0022 & 0.0012 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport} = \textit{True} & 0.0021 & 0.0041 & 0.0020 \\
\cmidrule{2-5} % booktabs partial rule, matching the other rules in this table
& Mean & 0.0016 & 0.0026 & 0.0011 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
% The trailing % after \caption{ keeps the line break from adding a leading space to the caption text.
\caption{%
[\textbf{Calibration} on \textit{self-consistency} scores] Using outputs from \textbf{Mistral 7B Instruct} on \textsc{Bio-NQ}, we calculate the ASCE for each group using \textbf{PS} and \textbf{GCULR} as well as the difference in ASCE ($\Delta$) between the two methods. We then present the top and bottom 5 groups according to $\Delta$, where \textbf{top} corresponds to groups for which the multivalid method achieves the biggest improvement (most negative change $\Delta$). In addition, we calculate the mean values for the top and bottom 5. We observe that GCULR greatly improves over PS among the top 5 groups, and even in the cases where GCULR worsens ASCE compared to PS, we find that the errors are already extremely small for both PS and GCULR.
}
\label{tab:asce_reg_best_worst_mistral}
\end{table*}