\begin{table*}[btp!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{SC} & \textbf{MVSC} & $\Delta$ \\
\midrule
\multirow{6}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Very High} \textcolor{ForestGreen}{\&} \textbf{has IMDb ID}     = \textit{True}    & 0.0318 & 0.0137 & -0.0182 \\
& \textbf{\# Wiki prop.} = \textit{Low}       \textcolor{ForestGreen}{\&} \textbf{nationality}     = \textit{NA}      & 0.0325 & 0.0153 & -0.0171 \\
& \textbf{\# Wiki prop.} = \textit{High}      \textcolor{ForestGreen}{\&} \textbf{sex or gender}   = \textit{Female}  & 0.0312 & 0.0183 & -0.0129 \\
& \textbf{has IMDb ID}   = \textit{True}      \textcolor{ForestGreen}{\&} \textbf{plays pro sport} = \textit{True}    & 0.0427 & 0.0298 & -0.0129 \\
& \textbf{\# Wiki prop.} = \textit{Very High}                                                                         & 0.0256 & 0.0131 & -0.0125 \\
% & \textbf{\# Wiki prop.} = \textit{Medium}    \textcolor{ForestGreen}{\&} \textbf{nationality}     = \textit{NA}      & 0.0333 & 0.0215 & -0.0117 \\
\cline{2-5}
& Mean                                                                                                                & 0.0328 & 0.0180 & -0.0147 \\
\midrule
\multirow{6}{*}{Bottom 5 $\Delta$}
& \textbf{has IMDb ID}   = \textit{False}     \textcolor{ForestGreen}{\&} \textbf{nationality}     = \textit{APAC}   & 0.0272 & 0.0401 & 0.0129 \\
& \textbf{\# Wiki prop.} = \textit{High}      \textcolor{ForestGreen}{\&} \textbf{has IMDb ID}     = \textit{False}  & 0.0180 & 0.0292 & 0.0112 \\
& \textbf{\# Wiki prop.} = \textit{Low}       \textcolor{ForestGreen}{\&} \textbf{has IMDb ID}     = \textit{True}   & 0.0183 & 0.0216 & 0.0033 \\
& \textbf{\# Wiki prop.} = \textit{Very High} \textcolor{ForestGreen}{\&} \textbf{has IMDb ID}     = \textit{False}  & 0.0259 & 0.0283 & 0.0024 \\
& \textbf{nationality}   = \textit{APAC}      \textcolor{ForestGreen}{\&} \textbf{plays pro sport} = \textit{False}  & 0.0231 & 0.0244 & 0.0013 \\
% & \textbf{has IMDb ID}   = \textit{False}     \textcolor{ForestGreen}{\&} \textbf{plays pro sport} = \textit{True}   & 0.0249 & 0.0253 & 0.0003 \\
\cline{2-5}
% & Mean                                                                                                               & 0.0229 & 0.0282 & 0.0052 \\
& Mean                                                                                                               & 0.0225 & 0.0287 & 0.0062 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
\caption{[\textbf{Conformal} on \textit{self-consistency} scores] Using outputs from \textbf{Llama 2 7B Chat} on \textsc{Bio-NQ}, we calculate the coverage error (for a target coverage of $90\%$) for each group using \textbf{SC} and \textbf{MVSC}, as well as the difference in coverage error between the two methods ($\Delta$, \textbf{MVSC} minus \textbf{SC}). We then present the top and bottom 5 groups according to $\Delta$, where \textbf{top} corresponds to the groups for which the multivalid method achieves the biggest improvement (most negative $\Delta$) and \textbf{bottom} to the groups with the most positive $\Delta$. In addition, we report the mean of each column over the top 5 and over the bottom 5 groups.}
\label{tab:conformal_sc_best_worst_llama2}
\end{table*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{table*}[btp!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{SC} & \textbf{MVSC} & $\Delta$ \\
\midrule
\multirow{6}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Very High} \textcolor{ForestGreen}{\&} \textbf{nationality}     = \textit{EU/ME}  & 0.0423 & 0.0235 & -0.0188 \\
& \textbf{\# Wiki prop.} = \textit{Low}       \textcolor{ForestGreen}{\&} \textbf{has IMDb ID}     = \textit{True}   & 0.0282 & 0.0112 & -0.0170 \\
& \textbf{\# Wiki prop.} = \textit{Very High} \textcolor{ForestGreen}{\&} \textbf{sex or gender}   = \textit{Male}   & 0.0335 & 0.0190 & -0.0145 \\
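% NOTE(review): "has IMDb ID" is True/False in every other row, while "Medium" is a "# Wiki prop." level -- confirm whether this group should read "has IMDb ID = True" or "# Wiki prop. = Medium".
& \textbf{has IMDb ID}   = \textit{Medium}    \textcolor{ForestGreen}{\&} \textbf{plays pro sport} = \textit{True}   & 0.0404 & 0.0269 & -0.0135 \\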
& \textbf{\# Wiki prop.} = \textit{Very High} \textcolor{ForestGreen}{\&} \textbf{has IMDb ID}     = \textit{False}  & 0.0371 & 0.0240 & -0.0131 \\
% & \textbf{\# Wiki prop.} = \textit{Very High}                                                                        & 0.0305 & 0.0192 & -0.0114 \\
\cline{2-5}
% & Mean                                                                                                               & 0.0354 & 0.0206 & -0.0147 \\
& Mean                                                                                                               & 0.0363 & 0.0209 & -0.0154 \\
\midrule
\multirow{6}{*}{Bottom 5 $\Delta$}
& \textbf{nationality}     = \textit{NA}     \textcolor{ForestGreen}{\&} \textbf{plays pro sport} = \textit{True}    & 0.0173 & 0.0282 & 0.0109 \\
& \textbf{has IMDb ID}     = \textit{False}  \textcolor{ForestGreen}{\&} \textbf{plays pro sport} = \textit{True}    & 0.0221 & 0.0279 & 0.0058 \\
& \textbf{plays pro sport} = \textit{False}  \textcolor{ForestGreen}{\&} \textbf{sex or gender}   = \textit{Female}  & 0.0138 & 0.0192 & 0.0055 \\
& \textbf{sex or gender}   = \textit{Female}                                                                         & 0.0143 & 0.0193 & 0.0050 \\
& \textbf{has IMDb ID}     = \textit{True}   \textcolor{ForestGreen}{\&} \textbf{sex or gender}   = \textit{Female}  & 0.0145 & 0.0190 & 0.0044 \\
% & \textbf{has IMDb ID}     = \textit{True}   \textcolor{ForestGreen}{\&} \textbf{nationality}     = \textit{EU/ME}   & 0.0144 & 0.0177 & 0.0032 \\
\cline{2-5}
% & Mean                                                                                                               & 0.0161 & 0.0219 & 0.0058 \\
& Mean                                                                                                               & 0.0164 & 0.0227 & 0.0063 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
\caption{[\textbf{Conformal} on \textit{self-consistency} scores] Using outputs from \textbf{Mistral 7B Instruct} on \textsc{Bio-NQ}, we calculate the coverage error (for a target coverage of $90\%$) for each group using \textbf{SC} and \textbf{MVSC}, as well as the difference in coverage error between the two methods ($\Delta$, \textbf{MVSC} minus \textbf{SC}). We then present the top and bottom 5 groups according to $\Delta$, where \textbf{top} corresponds to the groups for which the multivalid method achieves the biggest improvement (most negative $\Delta$) and \textbf{bottom} to the groups with the most positive $\Delta$. In addition, we report the mean of each column over the top 5 and over the bottom 5 groups.}
\label{tab:conformal_sc_best_worst_mistral}
\end{table*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{table*}[btp!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{CQR} & \textbf{GCCQR} & $\Delta$ \\
\midrule
\multirow{6}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport}         = \textit{False} & 0.0652 & 0.0190 & -0.0463 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{IMDb ID}       = \textit{True}  & 0.0564 & 0.0153 & -0.0411 \\
& \textbf{\# Wiki prop.} = \textit{Low}                                                                     & 0.0556 & 0.0167 & -0.0389 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{IMDb ID}       = \textit{False} & 0.0565 & 0.0226 & -0.0339 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{Male}  & 0.0556 & 0.0222 & -0.0334 \\
% & \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality}   = \textit{NA}    & 0.0502 & 0.0229 & -0.0273 \\
\cline{2-5}
% & Mean                                                                                                      & 0.0566 & 0.0198 & -0.0368 \\
& Mean                                                                                                      & 0.0579 & 0.0192 & -0.0387 \\
\midrule
\multirow{6}{*}{Bottom 5 $\Delta$}
& \textbf{nationality}   = \textit{APAC}      \textcolor{ForestGreen}{\&} \textbf{sport}         = \textit{False}  & 0.0150 & 0.0294 & 0.0143 \\
& \textbf{\# Wiki prop.} = \textit{Very High} \textcolor{ForestGreen}{\&} \textbf{IMDb ID}       = \textit{False}  & 0.0193 & 0.0335 & 0.0142 \\
& \textbf{nationality}   = \textit{APAC}      \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{Male}   & 0.0151 & 0.0227 & 0.0076 \\
& \textbf{\# Wiki prop.} = \textit{Medium}    \textcolor{ForestGreen}{\&} \textbf{nationality}   = \textit{EU/ME}  & 0.0196 & 0.0268 & 0.0072 \\
& \textbf{\# Wiki prop.} = \textit{Medium}    \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{Female} & 0.0183 & 0.0241 & 0.0058 \\
% & \textbf{nationality}   = \textit{APAC}                                                                           & 0.0117 & 0.0174 & 0.0057 \\
\cline{2-5}
% & Mean                                                                                                             & 0.0165 & 0.0256 & 0.0091 \\
& Mean                                                                                                             & 0.0175 & 0.0273 & 0.0098 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
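% NOTE(review): the bracketed tag says "self-consistency scores", but this table reports CQR/GCCQR and is labelled conformal_qreg -- confirm the tag was not copied from the SC/MVSC tables.
\caption{[\textbf{Conformal} on \textit{self-consistency} scores] Using outputs from \textbf{Llama 2 7B Chat} on \textsc{Bio-NQ}, we calculate the coverage error (for a target coverage of $90\%$) for each group using \textbf{CQR} and \textbf{GCCQR}, as well as the difference in coverage error between the two methods ($\Delta$, \textbf{GCCQR} minus \textbf{CQR}). We then present the top and bottom 5 groups according to $\Delta$, where \textbf{top} corresponds to the groups for which the multivalid method achieves the biggest improvement (most negative $\Delta$) and \textbf{bottom} to the groups with the most positive $\Delta$. In addition, we report the mean of each column over the top 5 and over the bottom 5 groups.}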
\label{tab:conformal_qreg_best_worst_llama2}
\end{table*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{table*}[btp!]
\centering
\renewcommand{\arraystretch}{1.1}
\begin{tabular}{l l c c c}
\toprule
& & \textbf{CQR} & \textbf{GCCQR} & $\Delta$ \\
\midrule
\multirow{6}{*}{Top 5 $\Delta$}
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{nationality}   = \textit{NA}    & 0.0795 & 0.0135 & -0.0661 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{IMDb ID}       = \textit{True}  & 0.0758 & 0.0173 & -0.0585 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sport}         = \textit{False} & 0.0746 & 0.0175 & -0.0571 \\
& \textbf{\# Wiki prop.} = \textit{Low}                                                                     & 0.0662 & 0.0121 & -0.0541 \\
& \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{Male}  & 0.0698 & 0.0244 & -0.0454 \\
% & \textbf{\# Wiki prop.} = \textit{Low} \textcolor{ForestGreen}{\&} \textbf{IMDb ID}       = \textit{False} & 0.0605 & 0.0213 & -0.0391 \\
\cline{2-5}
% & Mean                                                                                                      & 0.0711 & 0.0177 & -0.0534 \\
& Mean                                                                                                      & 0.0732 & 0.0170 & -0.0562 \\
\midrule
\multirow{6}{*}{Bottom 5 $\Delta$}
& \textbf{IMDb ID}       = \textit{False}  \textcolor{ForestGreen}{\&} \textbf{nationality}   = \textit{EU/ME}  & 0.0193 & 0.0331 & 0.0138 \\
& \textbf{nationality}   = \textit{NA}     \textcolor{ForestGreen}{\&} \textbf{sport}         = \textit{True}   & 0.0155 & 0.0283 & 0.0128 \\
& \textbf{IMDb ID}       = \textit{False}  \textcolor{ForestGreen}{\&} \textbf{sport}         = \textit{False}  & 0.0114 & 0.0194 & 0.0080 \\
& \textbf{nationality}   = \textit{EU/ME}  \textcolor{ForestGreen}{\&} \textbf{sex or gender} = \textit{Female} & 0.0275 & 0.0352 & 0.0077 \\
& \textbf{IMDb ID}       = \textit{False}  \textcolor{ForestGreen}{\&} \textbf{sport}         = \textit{True}   & 0.0267 & 0.0325 & 0.0057 \\
% & \textbf{sex or gender} = \textit{Female} \textcolor{ForestGreen}{\&} \textbf{sport}         = \textit{False}  & 0.0126 & 0.0181 & 0.0055 \\
\cline{2-5}
% & Mean                                                                                                          & 0.0188 & 0.0278 & 0.0089 \\
& Mean                                                                                                          & 0.0201 & 0.0297 & 0.0096 \\
\bottomrule
\end{tabular}
\renewcommand{\arraystretch}{1}
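% NOTE(review): the bracketed tag says "self-consistency scores", but this table reports CQR/GCCQR and is labelled conformal_qreg -- confirm the tag was not copied from the SC/MVSC tables.
\caption{[\textbf{Conformal} on \textit{self-consistency} scores] Using outputs from \textbf{Mistral 7B Instruct} on \textsc{Bio-NQ}, we calculate the coverage error (for a target coverage of $90\%$) for each group using \textbf{CQR} and \textbf{GCCQR}, as well as the difference in coverage error between the two methods ($\Delta$, \textbf{GCCQR} minus \textbf{CQR}). We then present the top and bottom 5 groups according to $\Delta$, where \textbf{top} corresponds to the groups for which the multivalid method achieves the biggest improvement (most negative $\Delta$) and \textbf{bottom} to the groups with the most positive $\Delta$. In addition, we report the mean of each column over the top 5 and over the bottom 5 groups.}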
\label{tab:conformal_qreg_best_worst_mistral}
\end{table*}