\begin{figure*}[tb!]
    \centering
    %
    % Panel (a): self-consistency scores, Llama 2 7B Chat generations.
    \begin{subfigure}[t]{\columnwidth}
        \centering
        % \linewidth is the width of the enclosing subfigure box, so the image fills the panel.
        \includegraphics[width=\linewidth]{figures/images/conformal_nq_llama_selfconsistency.png}
        \caption{Llama 2 7B Chat; self-consistency}
        \label{fig:conformal_nq_llama_selfconsistency}
    \end{subfigure}
    %
    % Panel (b): self-consistency scores, Mistral 7B Instruct generations.
    \begin{subfigure}[t]{\columnwidth}
        \centering
        % Scale the plot to the full width of this subfigure box.
        \includegraphics[width=\linewidth]{figures/images/conformal_nq_mistral_selfconsistency.png}
        \caption{Mistral 7B Instruct; self-consistency}
        \label{fig:conformal_nq_mistral_selfconsistency}
    \end{subfigure}
    %
    % Panel (c): P(True) scores, Llama 2 7B Chat generations.
    \begin{subfigure}[t]{\columnwidth}
        \centering
        % \linewidth is the width of the enclosing subfigure box, so the image fills the panel.
        \includegraphics[width=\linewidth]{figures/images/conformal_nq_llama_ptrue.png}
        \caption{Llama 2 7B Chat; P(True)}
        \label{fig:conformal_nq_llama_ptrue}
    \end{subfigure}
    %
    % Panel (d): P(True) scores, Mistral 7B Instruct generations.
    \begin{subfigure}[t]{\columnwidth}
        \centering
        % Scale the plot to the full width of this subfigure box.
        \includegraphics[width=\linewidth]{figures/images/conformal_nq_mistral_ptrue.png}
        \caption{Mistral 7B Instruct; P(True)}
        \label{fig:conformal_nq_mistral_ptrue}
    \end{subfigure}
    %
    % Panel (e): verbalized-confidence scores, Llama 2 7B Chat generations.
    \begin{subfigure}[t]{\columnwidth}
        \centering
        % \linewidth is the width of the enclosing subfigure box, so the image fills the panel.
        \includegraphics[width=\linewidth]{figures/images/conformal_nq_llama_verbconf.png}
        \caption{Llama 2 7B Chat; verbalized confidence}
        \label{fig:conformal_nq_llama_verbconf}
    \end{subfigure}
    %
    % Panel (f): verbalized-confidence scores, Mistral 7B Instruct generations.
    \begin{subfigure}[t]{\columnwidth}
        \centering
        % Scale the plot to the full width of this subfigure box.
        \includegraphics[width=\linewidth]{figures/images/conformal_nq_mistral_verbconf.png}
        \caption{Mistral 7B Instruct; verbalized confidence}
        \label{fig:conformal_nq_mistral_verbconf}
    \end{subfigure}
    %
    \caption{For each target coverage, we run conformal methods (\textcolor{mplblue}{\textbf{blue}}: SC, CQR) and their multigroup counterparts (\textcolor{mplorange}{\textbf{orange}}: MVSC, GCCQR) on \textsc{Bio-NQ} using the following base uncertainty scoring functions: \textbf{(a, b)} self-consistency, \textbf{(c, d)} P(True), and \textbf{(e, f)} verbalized confidence. We evaluate on generations from \textbf{(a, c, e)} Llama 2 7B Chat and \textbf{(b, d, f)} Mistral 7B Instruct. We calculate the average coverage error across all groups and plot the resulting errors side by side for each pairing.}
    \label{fig:conformal_nq}
\end{figure*}