\onecolumn
\section{Proofs}
\label{app:proofs}

\subsection{Proof for Prop 3.1}

Recall that $K_{i} = \inf\bigl(\{j: \mathcal{A}(X_i, Y_{ij}) = 1\} \cup \{M+1\}\bigr)$, and $\hat{C}(X_{n+1}) = \{Y_{j} \sim \hat{\pi}(\cdot|X=X_{n+1})\}_{j=1}^{\hat{K}(X_{n+1})}$. From the standard CP proof, we have that:
\[
    P\{K_{n+1} \leq \hat{K}(X_{n+1})\} \geq 1-\alpha
\] Consider the event $\{K_{n+1} \leq \hat{K}(X_{n+1})\}$:
\begin{equation}
    \label{app:eq:coverage-event}
        \{K_{n+1} \leq \hat{K}(X_{n+1}) \} = \{K_{n+1} \leq \hat{K}(X_{n+1}) \} \{ \hat{K}(X_{n+1}) < \infty\} \cup \{K_{n+1} \leq \hat{K}(X_{n+1}) \} \{ \hat{K}(X_{n+1}) = \infty\}
\end{equation}

On $\{\hat{K}(X_{n+1}) = \infty\}$, we have $\hat{C}(X_{n+1}) = \mathcal{Y}$, so the event $\{\exists Y \in \hat{C}(X_{n+1}): \mathcal{A}(X_{n+1}, Y) = 1\}$ holds.

On $\{\hat{K}(X_{n+1}) < \infty\}$:
\[
\begin{aligned}
    \{K_{n+1} \leq \hat{K}(X_{n+1}) \} &= \left\{\inf\{j: \mathcal{A}(X_{n+1}, Y_{n+1,j}) = 1\} \leq \hat{K}(X_{n+1}) \right\} \\
    &= \left\{ \exists j \leq \hat{K}(X_{n+1}):  \mathcal{A}(X_{n+1}, Y_{n+1,j}) = 1 \right\} \\
    &= \left\{\exists Y \in \hat{C}(X_{n+1}): \mathcal{A}(X_{n+1}, Y) = 1 \right\} \\
    \end{aligned}
\]
Substituting in Eq.~\ref{app:eq:coverage-event}, we obtain:
\[
\{K_{n+1} \leq \hat{K}(X_{n+1})\} = \{\exists Y \in \hat{C}(X_{n+1}): \mathcal{A}(X_{n+1}, Y) = 1\}
\] and plugging this into the probability statement gives us the desired coverage guarantee.\qed


% \subsection{Proof for Corollary 1}
% \label{sec:proof-cor-1}
% Let $V:=\{\exists Y \in \hat{C}(X_{n+1}) \text{ such that } \mathcal{A}(X_{n+1}, Y_{n+1}) = 1\}$, be the event that we obtain an admissible solution in the prediction set. Recall from Eq~\ref{eq:k-estimate} that predicted set size, $\hat{K}(X_{n+1})$ is the quantile estimate plus the conformal adjustment term. Since our predictor is the oracle predictor, the first term is the true quantile, $k_0(X_{n+1})$, and define $\mathcal{E}:=Q_{1-\alpha}(\{S_i\}_{i=1}^n)$ be the conformal adjustment term.  
% Then, the conditional coverage for $X_{n+1}$ is:
% \[
% P(V | X_{n+1}) = P(K(X_{n+1}) < k_0(X_{n+1}) + \mathcal{E})
% \]
% Since $S_i$ is positive, $\mathcal{E} \ge 0$, we have:
% \[
% P(V | X_{n+1}) \ge P(K(X_{n+1}) < k_0(X_{n+1})) \geq 1 - \alpha
% \]
% \qed

%\subsection{Proof for Lemma 3.2}
%Let $\mathcal{D}=\mathcal{D}_{\text{cal}} \cup \{X_{n+1}\}$ and $Z = \mathds{1}\{\exists Y \in \hat{C}(X_{n+1}) \text{ such that } \mathcal{A}(X_{n+1}, Y_{n+1}) = 1\}$. We start with the upper bound for CP stated in Eq.~\ref{eq:conformal-guarantee-ub}. 
%\[
%\mathbb{E}\{ Z\} \leq 1-\alpha + \frac{1}{n+1}
%\]
%Note that $\mathbb{E}\{Z|\mathcal{D}\}$ is the CDF of a geometric distribution with success probability $f(X_{n+1})$ at $\hat{K}(X_{n+1})$, we can use the tower rule along with the definitions from Sec.~\ref{sec:proof-cor-1}:
%\[
%\mathbb{E}\{1 - (1-f(X_{n+1}))^{k_0(X_{n+1}) + \mathcal{E}} \} \le 1 - \alpha + \frac{1}{n+1}
%\]
%By expanding the definition of $k_0(X_{n+1})$ and simplifying, we get:
%
%\[
%\mathbb{E}\{1-(1-f(X_{n+1}))^{\mathcal{E}}\} \leq \frac{1}{\alpha (n+1)}
%\]
%Note that the term inside the expectation is bounded in [0, 1]. Then, by applyiny Markov's inequality and simplifying, we get the following for all $t \in (0, 1)$:
%\begin{equation*}
%\label{eq:markovs}
%P\Bigl\{\mathcal{E} \geq \frac{\ln(1-t)}{\ln{(1-f(X_{n+1}))}}\Bigr\} \leq \frac{1}{t \alpha (n+1)}
%\end{equation*}
%
%By setting $t:=1-\alpha^\epsilon$ and $n_0 \geq \frac{1}{\delta \alpha (1-\alpha^\epsilon)} - 1$, we arrive at the statement of the Lemma.\qed

\section{Equating theoretical guarantees between CLM and GPS}
\label{app:equate-clm-gps}
The marginal guarantee of conformal prediction in Eq.~\ref{eq:conformal-guarantee} and CLM's guarantee in Eq.~\ref{equation:clm-pac-style}
are qualitatively different. In this paper, whenever we mention $\alpha$ for comparing our method, it refers to $\alpha$ in
Eq.~\ref{equation:clm-pac-style}, and the conformal value is adjusted accordingly. We use the calibration-conditional coverage properties of CP to perform this adjustment.


Assuming non-conformity scores are almost surely distinct, the coverage of a conformal predictor at level $\alpha_0$, conditional on the calibration set, follows a Beta distribution~(\cite{angelopoulos2024theoretical}):
\begin{equation}
\label{app:eq:calibration-cond-cov}
P\bigl\{Y_{n+1} \in C(X_{n+1}) \mid \mathcal{D}_\text{cal} \bigr\} \sim \text{Beta}(k(\alpha_0), n + 1 - k(\alpha_0))
\end{equation}
where $k(\alpha_0) := \left \lceil (1 - \alpha_0) (n + 1) \right \rceil$. Even if scores are not distinct, the coverage probability stochastically dominates the Beta$(k(\alpha_0), n+1-k(\alpha_0))$ distribution~(\cite[Theorem 4.1]{angelopoulos2024theoretical}).
% \end{theorem}

Following that, for any value of $\alpha$, this conformal method has the following guarantee:
\[
P\Bigl(P(\exists Y \in \hat{C}(X_{n+1}): \mathcal{A}(X_{n+1}, Y) = 1 \mid \mathcal{D}_{\text{cal}})
\geq 1-\alpha\Bigr) \geq 1-\delta
\]

where $\delta$ is the CDF of the Beta distribution with parameters from Eq.~\ref{app:eq:calibration-cond-cov}, evaluated at $1 - \alpha$:
\begin{equation}
\label{appendix:equation:beta-cdf}
\delta = \text{BetaCDF}_{k(\alpha_0), n + 1 - k(\alpha_0)}(1 - \alpha)
\end{equation}



Thus, for any $\alpha$ and $\delta$ satisfying the LTT guarantee with the form of Eq.~\ref{equation:clm-pac-style}, we perform a grid search in the $\alpha_0$ space to find values such that the $\delta$ in Eq.~\ref{appendix:equation:beta-cdf} is lower than the $\delta$ in Eq.~\ref{equation:clm-pac-style}. Table~\ref{appendix:table:alpha_values} shows the adjusted confidence levels, $\alpha_0$, corresponding to the non-adjusted $\alpha$-values in Figure~\ref{fig:main-results}.

\begin{table}[!h]
    \centering
    %\renewcommand{\arraystretch}{1.2}
    \begin{tabular}{|c|c|c|c|c|c|}
        \hline
        \multirow{1}{*}{$\alpha$} & \multicolumn{1}{c|}{DS1000 ($n=200$)} & \multicolumn{1}{c|}{GSM8k ($n=330$)} & \multicolumn{1}{c|}{MATH ($n=1250$)} & \multicolumn{1}{c|}{MBPP ($n=113$)} & \multicolumn{1}{c|}{TriviaQA ($n=4486$)} \\
        \hline
        0.10 & 0.0594  & 0.0662  & 0.0814  & 0.0526  & 0.0900  \\
        0.15 & 0.0990  & 0.1084  & 0.1277  & 0.0877  & 0.1381  \\
        0.20 & 0.1485  & 0.1566  & 0.1757  & 0.1228  & 0.1867  \\
        0.25 & 0.1881  & 0.1987  & 0.2236  & 0.1754  & 0.2352  \\
        0.30 & 0.2376  & 0.2469  & 0.2715  & 0.2105  & 0.2843  \\
        0.35 & 0.2772  & 0.2951  & 0.3194  & 0.2631  & 0.3337  \\
        0.40 & 0.3267  & 0.3433  & 0.3690  & 0.2982  & 0.3832  \\
        0.45 & 0.3762  & 0.3915  & 0.4185  & 0.3508  & 0.4331  \\
        0.50 & 0.4257  & 0.4397  & 0.4680  & 0.4035  & 0.4830  \\
        \hline
    \end{tabular}
    \caption{Values of $\alpha$ and the corresponding adjusted $\alpha_0$ for the different tasks used in this paper.}
    \label{appendix:table:alpha_values}
\end{table}

\section{Experimental Details}
\label{app:experiment}

\subsection{Compute Infrastructure and Software}

All inference except for GPT-4o-mini is performed on a single node with 3 H100 GPUs (80GB VRAM), 8 CPUs and 400GB RAM. We utilize \texttt{vllm}\footnote{https://github.com/vllm-project/vllm} to generate samples, and perform a separate forward pass using Huggingface Transformers\footnote{https://huggingface.co/docs/transformers/en/index} to collect logits and hidden states. GPT-4o-mini inference is conducted using the OpenAI client library\footnote{https://github.com/openai/openai-python} for python. 

\subsection{Dataset Details}
All the datasets we use are publicly available from the Hugging Face Hub. For GSM\footnote{https://huggingface.co/datasets/openai/gsm8k} we use the test split from \texttt{main} subset. For Math\footnote{https://huggingface.co/datasets/EleutherAI/hendrycks\_math} we use the test split. For TriviaQA\footnote{https://huggingface.co/datasets/mandarjoshi/trivia\_qa} we use the validation split in the \texttt{rc.nocontext} subset of the data. DS1000\footnote{https://huggingface.co/datasets/xlangai/DS-1000} has only a single split, and we use the entire dataset. For MBPP\footnote{https://huggingface.co/datasets/google-research-datasets/mbpp}, we use both the train and test splits from the \texttt{sanitized} subset, due to the small number of examples in the data. Details regarding the size of each split and generation settings are shown in Table~\ref{fig:data-details}.
\begin{table}[!h]
    \centering
\begin{tabular}{ |c||c|c|c|c| } 
\hline
\textbf{Dataset} & \textbf{Train Split Size} & \textbf{Test Split Size} & \textbf{Temperature} & \textbf{Top-p} \\
 \hline
DS1000 & 600 & 400 & 0.2 & 0.95 \\
MBPP & 150 & 227 & 0.2 & 0.95 \\
TriviaQA & 8972 & 8972 & 0.2 & 0.95 \\
GSM8k & 659 & 660 & 0.2 & 0.95 \\
Math & 2500 & 2500 & 0.6 & 0.95 \\
\hline
\end{tabular}
    \caption{Dataset splits and generation settings.}
    \label{fig:data-details}
\end{table}



\subsubsection{Prompts} Here we list the prompts used for our experiments. 
Text enclosed in \textcolor{violet}{\{\dots\}} refers to columns in the original dataset.
\textcolor{teal}{Teal}-colored text denotes the system prompt used by GPT-4o-mini. \textcolor{red}{Red}-colored text is only added for Gemma 2 27b.

% \subsubsection{Prompts}
\lstset{
  basicstyle=\ttfamily\small,
  breaklines=true,
  frame=single,
  rulecolor=\color{black},
  backgroundcolor=\color{gray!10},
  commentstyle=\color{green!50!black},
  keywordstyle=\color{blue},
  stringstyle=\color{red},
   showlines=true,
   % numbers=none,
   escapeinside={(*@}{@*)},
  moredelim=**[is][\color{teal}]{@gpt@}{@end@},  % teal: system prompt used by GPT-4o-mini
  moredelim=**[is][\color{red}]{@gemma@}{@end@},  % red: text added only for Gemma 2 27b
  moredelim=**[is][\color{violet}]{@bold@}{@end@},  % violet: dataset column placeholders
  % \bfseries
}

\begin{lstlisting}[caption=Prompt for MBPP]
@gpt@Answer the following question. In your response, only write the raw code, do not use markdown and do not add explanations.@end@

@bold@{description}@end@
@bold@{test_list[0]}@end@

\end{lstlisting}


\begin{lstlisting}[caption=Prompt for DS-1000]
@gpt@Answer the following question. In your response, only write the raw code, do not use markdown and do not add explanations@end@
@bold@{prompt}@end@
\end{lstlisting}

\begin{lstlisting}[caption=Prompt for GSM]
Q: @bold@{question}@end@
A: Let's think step by step.
\end{lstlisting}


\begin{lstlisting}[caption=Prompt for Trivia]
Answer these questions:
Q: Which former major league baseball pitcher, known as The Big Unit, now pitches Geico?
A: Randy Johnson
Q: The Philippines were named after which king of Spain?
A: King Philip II
Q: US Vice-President Joe Biden represents which state?
A: DELAWARE
Q: Which, now defunct, political party was founded by Declan Ganley in April 2009?
A: Libertas Ireland
Q: Sept 30, 1966 saw the public unveiling of which popular model of Boeing aircraft?
A: 747
Q: @bold@{question}@end@
A:
\end{lstlisting}





\begin{lstlisting}[caption=Prompt for Math]
@gemma@Answer the following questions. Your final answer should always follow the same format: 'Final Answer: The final answer is [answer]. I hope it is correct.@end@
@gpt@Answer the following questions. Use the answer format provided in the examples. All of your latex expressions must be wrapped in $...$ (for example, to write 'x=2' as latex, write $x=2$).@end@
Problem:
Find the domain of the expression  $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}

Solution: The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$

Solution: We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.

Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?

Solution: If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight.  If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight.  Equating this to 480 pounds, we can solve for $n$:
\begin{align*}
30n&=480\
\Rightarrow\qquad n&=480/30=\boxed{16}
\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations

\begin{align*}
6x-4y&=a,\
6y-9x &=b.
\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,
find $\frac{a}{b},$ assuming $b$ is nonzero.

Solution: If we multiply the first equation by $-\frac{3}{2}$, we obtain

$$6y-9x=-\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have

$$-\frac{3}{2}a=b\Rightarrow\frac{a}{b}=\boxed{-\frac{2}{3}}.$$
Final Answer: The final answer is $-\frac{2}{3}$. I hope it is correct.

Problem: @bold@{problem}@end@
Answer:
\end{lstlisting}


\subsection{Results}

We provide extended evaluation results for \methodname\ and CLM for $\alpha \in \{0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50\}$ in the figures below. Additional models include Llama 3 8b, Phi-2, and two variants of Gemma 2 27b---the base model and the instruction-tuned version. The figures below follow the same format as Figure~\ref{fig:main-results}.

For \texttt{\methodname-L}, we use a linear regression model from \texttt{scikit-learn} that takes as input the length-normalized log probability of the input prompt under the base LLM. For \texttt{\methodname-HS}, we use a multi-layer perceptron implemented in \texttt{torch} that takes hidden state activations of the last token in the input prompt from the last layer of the underlying model in the case of Llama and Phi-2. For Gemma 2 and GPT-4o-mini, we use hidden state activations from the last token and layer of Phi-2. We perform a grid search to determine the number of layers ([1, 2, 4]), number of hidden units ([256, 512, 1024]) and training epochs ([15, 20, 25]). Details are provided in the supplementary code.

\subsubsection{Figures}
We provide plots similar to Figure~\ref{fig:main-results} for each model and dataset.
\begin{figure*}[!h]
     \centering
     \includegraphics[width=\textwidth]{figures/appendix/per-model-plots-main-figure/meta-llama_Meta-Llama-3-8B.jpg}
     \caption{Llama 3 8b results on all datasets.}
\end{figure*}


\begin{figure*}[!h]
     \centering
     \includegraphics[width=\textwidth]{figures/appendix/per-model-plots-main-figure/microsoft_phi-2.jpg}
     \caption{Phi-2 results on all datasets.}
     \vspace{-3mm}
\end{figure*}

\begin{figure*}[!h]
     \centering
     \includegraphics[width=\textwidth]{figures/appendix/per-model-plots-main-figure/google_gemma-2-27b.jpg}
     \caption{Gemma 2 27b results on DS-1000}
     \vspace{-3mm}
\end{figure*}

\begin{figure*}[!h]
     \centering
     \includegraphics[width=\textwidth]{figures/appendix/per-model-plots-main-figure/google_gemma-2-27b-it.jpg}
     \caption{Gemma 2 27b (Instruction Tuned) on DS-1000}
     \vspace{-3mm}
\end{figure*}

\begin{figure*}[!h]
     \centering
     \includegraphics[width=\textwidth]{figures/appendix/per-model-plots-main-figure/gpt-4o-mini.jpg}
     \caption{GPT-4o-mini results on all datasets.}
\end{figure*}


\subsubsection{Tables}

Here we provide a comprehensive set of tables including empirical coverage rates, in addition to the metrics considered prior. Blank entries (-) indicate complete abstention.
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/gemma_2_27b_ds1000}
    }
    \caption{Results for Gemma 2 27B on DS1000}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/gpt_4o_mini_ds1000}
    }
    \caption{Results for GPT 4o mini on DS1000}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/gpt_4o_mini_gsm8k}
    }
    \caption{Results for GPT 4o mini on GSM8K}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/llama_3_8b_gsm8k}
    }
    \caption{Results for Llama 3 8B on GSM8K}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/phi_2_gsm8k}
    }
    \caption{Results for Phi 2 on GSM8K}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/gemma_27b-it_math}
    }
    \caption{Results for Gemma 2 27B-IT on MATH}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/gpt_4o_mini_math}
    }
    \caption{Results for GPT 4o mini on MATH}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/gpt_4o_mini_mbpp}
    }
    \caption{Results for GPT 4o mini on MBPP}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/llama_3_8b_mbpp}
    }
    \caption{Results for Llama 3 8B on MBPP}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/phi_2_mbpp}
    }
    \caption{Results for Phi 2 on MBPP}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/gpt_4o_mini_triviaqa}
    }
    \caption{Results for GPT 4o mini on TriviaQA}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/llama_3_8b_triviaqa}
    }
    \caption{Results for Llama 3 8B on TriviaQA}
\end{figure}
\begin{figure}[htbp]
    \centering
    \resizebox{\textwidth}{!}{  % The '!' preserves aspect ratio
        \input{figures/appendix/tables/phi_2_triviaqa}
    }
    \caption{Results for Phi 2 on TriviaQA}
\end{figure}