%\vspace{-2ex}

\section{Generative Prediction Sets}
\label{sec:methodology}

This section describes our {\em Generative Prediction Sets} (GPS) framework to construct prediction sets from a given DGM. 
%First, we provide a high-evel overview of GPS and its advantages over CLM. Next, we describe the conformal calibration algorithm of GPS and coverage guarantee. %. Finally, we summarize the theoretical guarantees and their significance. 

\subsection{Overview and Advantages of GPS}

 
\paragraph{Overview of \methodname.} \methodname\ constructs adaptive prediction sets with valid coverage guarantees as illustrated in Figure~\ref{fig:gps-overview} (see Algorithm~\ref{alg:gps} for a complete pseudocode). The core idea behind \methodname\ is to reformulate the prediction set construction problem into estimating the number of samples needed to achieve admissible outputs, operating in two key phases.


\begin{algorithm}
\caption{Generative Prediction Sets (GPS) Framework}\label{alg:gps}
\begin{algorithmic}[1]
\Require generative model $\hat{\pi}(\cdot|X)$, calibration data $\mathcal{D}_X = \{X_i\}_{i=1}^n$, admissibility function $\mathcal{A}$, admissibility estimator $\hat{f}$, sampling budget $M$, test input $X_{n+1}$
\Ensure prediction set $\mathcal{C}(X_{n+1})$ 
\vspace{1ex}
%\State
\State {\em // Construct augmented calibration data}
\label{alg:gps-construct-data-start}
\State Initialize $\mathcal{D}_{\text{cal}} = \varnothing$
\For{each $X_i \in \mathcal{D}_X$}
    % \State $\hat{K}_i \leftarrow \min\{k \in [M] : \mathcal{A}(X_i, Y_k) = 1, Y_k \sim \hat{\pi}(\cdot|X_i)\} \cup \{\infty\}$
    \State $K_i \leftarrow$ minimum number of samples needed to obtain an admissible output within the maximum budget $M$
    %\inf\{k \in [M] : \mathcal{A}(X_i, Y_k) = 1, Y_k \sim \hat{\pi}(\cdot|X_i)\}$
    \State If no admissible output is found, then $K_i \leftarrow M+1$
    \State $\mathcal{D}_{\text{cal}} \leftarrow \mathcal{D}_{\text{cal}} \cup \{(X_i, K_i)\}$
\EndFor
% \EndFor
\label{alg:gps-construct-data-end}
%\State
\vspace{1ex}
\State {\em // Calibration on minimum number of samples} \label{alg:gps-algorithm-step-calibration-start}
\State Execute CP for regression using  $\mathcal{D}_{\text{cal}}$ and estimator $\hat{f}$ 
\vspace{1ex}
\State {\em // Conformal inference on minimum number of samples}
\State Use conformal regressor on $X_{n+1}$ to get $\hat{K}(X_{n+1})$ as per Eq. \ref{eq:method-conformal-set} (i.e., upper-bound of prediction interval)
\State {\em // Generate $\hat{K}(X_{n+1})$ samples to create prediction set}
\State $\mathcal{C}(X_{n+1}) \leftarrow \{ Y_j \sim \hat{\pi}(\cdot | X_{n+1})  \}_{j=1}^{\hat{K}(X_{n+1})}$ 

%$\boldsymbol{\hat{Y}} \leftarrow \{ Y_j \sim \hat{\pi}(\cdot | X_{n+1})  \}_{j=1}^{\hat{K}_{n+1}}$
\State \textbf{return} Prediction set $\mathcal{C}(X_{n+1})$
\end{algorithmic}
\end{algorithm}

First, \methodname\ creates an augmented calibration dataset by sampling multiple outputs from the given generative model for each calibration input $X_i$ in $\mathcal{D}_{X}$ (steps \ref{alg:gps-construct-data-start}--\ref{alg:gps-construct-data-end} in Algorithm~\ref{alg:gps}). For each $X_i$, we sample from $\hat{\pi}(\cdot|X_i)$ using an arbitrary, independent sampling procedure to obtain a sequence of samples $\{Y_{ij}\}_{j=1}^M$ up to a maximum budget $M$. We define $K_i$ to be the number of samples needed to obtain the first admissible output from $\hat{\pi}$ for input $X_i$. This gives us an augmented calibration dataset, $\{(X_i, K_i)\}_{i=1}^n$.

Second, \methodname\ develops a conformal procedure around this augmented data (from step \ref{alg:gps-algorithm-step-calibration-start} onward in Algorithm~\ref{alg:gps}). Notice that if we construct a one-sided prediction interval $[0, \hat{K}_i]$ for $K_i$, we will achieve our desired coverage guarantee due to the following equivalence of events:

\begin{equation}
    \label{eq:y-k-space-equivalence}
    \{K_{i} \leq \hat{K}_i\} \Leftrightarrow \left\{ \sum_{j=1}^{\hat{K}_i} \mathcal{A}(X_i, Y_{ij}) > 0 \right\}
\end{equation}

At test time, given a new input $X_{n+1}$, we compute $\hat{K}(X_{n+1})$, the upper bound of the conformal interval for $K_{n+1}$, and sample from $\hat{\pi}(\cdot|X_{n+1})$ exactly $\hat{K}(X_{n+1})$ times to construct the prediction set $\mathcal{C}(X_{n+1})$, achieving the guarantee stated in Eq.~(\ref{eq:gaps-target-guarantee}).

\noindent \textbf{Key advantages of GPS.} \methodname\ has three qualitative benefits over CLM. First, \methodname\ reduces the problem of generating prediction sets for DGMs to a vanilla CP problem. This reduction allows us to bring to bear the full body of machinery developed in vanilla CP to this problem setting. For example, the approximate conditional coverage method of \citet{Gibbs2023-ax} can be applied off-the-shelf to \methodname. Achieving such conditional coverage guarantees with CLM would require a non-trivial extension of the LTT framework.  Second, since \methodname\ calibrates only a single parameter, it has substantially lower computational complexity than CLM. Third, \methodname\ works in batch mode; it specifies how many samples are to be collected from the DGM a~priori (see Figure~\ref{fig:gps-overview}). In contrast, CLM works sequentially, one sample at a time. Thus, \methodname\ can easily be used in modern batch inference pipelines, where it is desirable to produce all samples at once in a single batch to maximize hardware utilization, or in cost-sensitive applications, where users might want to make a trade-off between abstention rate and sampling cost.

% \vspace{-1ex}

% \subsection{Sampling Calibration Data}

% We start by generating samples from $\hat{\pi}_{Y|X}$ to augment the calibration dataset using some fixed, arbitrary sampling procedure. 
% For each $X_i$, we generate a set of samples, $\mathbf{\hat{Y}}_i=\{\hat{Y}_{ij}\}_{j=1}^{k_i}$. 
% Here, $k_i$ is the minimum number of samples needed to obtain an admissible sample under $\mathcal{A}$, bounded above by a pre-specified budget $\omega^{\max}$. 
% We can then compute a sample statistic $\hat{T}_i$ as follows:
% \begin{equation*}
%     \hat{W}_i = \begin{cases}
%         \mathcal{H}(X_i, \mathbf{\hat{Y}}_i) & \text{, if } \sum_{j=1}^{k_i} \mathcal{A}(X_i, \mathbf{\hat{Y}}_{ij}) > 0 \\
%         \infty, & \text{otherwise}
%     \end{cases}
% \end{equation*}
% Thus, $\hat{W}_i$ represents the minimum cost incurred for a calibration point $X_i$ to produce a set of samples with at least one admissible output.
\subsection{Conformal Calibration Algorithm}
\label{sec:calibration-algorithm}

% If we do not observe any correct solutions within our sampling budget, we simply set it to $M+1$. This is the main modification that allows \methodname\ to selectively abstain. Formally, we define $K_i = \inf \{j: \mathcal{A}(X_i, Y_{ij}) = 1\} \cup \{M+1\}$. 

% With the augmented calibration data, $\{(X_i, K_i)\}_{i=1}^{n}$ in hand via steps \ref{alg:gps-construct-data-start}-\ref{alg:gps-construct-data-end} in Algorithm~\ref{alg:gps}, we can use any CP procedure for regression to calibrate an estimator to predict the $K_{n+1}$. 

% The key challenge in designing a calibration procedure for in a sampling based regime is that $\hat{\pi}$ might not have produced an admissible solution in the first $M$ samples. 
% In these instances, how do we compute the non-conformity score? Prior work in CP for generative models either sets these corresponding scores to $\infty$ (\cite{su2024api}), or provide calibration assuming this doesn't happen (\cite{wang2024conu}. Of course, if there are more than $\ceil{(1-\alpha)(n+1)}$ such examples in our calibration data, our the conformal quantile of our scores, $Q_{1-\alpha}(\{S_i\}_{i=1}^n)$ will also be $\infty$, resulting in trivial prediction sets (i.e. $\mathcal{Y}$). This where \methodname\ differs from prior work. 

% For calibration examples where $K_i$ isn't observed in the set sampling budget, we set $K_i=M+1$. Formally, we define $K_i = \inf \{j: \mathcal{A}(X_i, Y_{ij}) = 1\} \cup \{M+1\}$. Here,  $M+1$ is a sentinel value indicating that an admissible solution cannot be obtained in $M$ samples. Intuitively, while we do not know what value $K_i$ actually takes, we know that it exceeds $M$; the closer an estimate is to $M$, the smaller radius we require to capture $K_i$. This definition ensures that our $K_i$ are always finite, so we can utilize common non-conformity scores for regression as is to calibrate $\{(X_i, K_i)\}_{i=1}^{n}$. Moreover, any estimator can produce an estimate $> M$ to signal abstention on specific input examples.  
A key challenge in designing calibration procedures for sampling-based regimes is handling cases where $\hat{\pi}$ fails to produce an admissible solution within $M$ samples. Prior work in CP for generative models has addressed this by either setting non-conformity scores to $\infty$~\citep{su2024api} or assuming such failures do not occur~\citep{wang2024conu}. However, if at least $\lfloor \alpha(n+1) \rfloor$ calibration examples lack admissible solutions, the conformal quantile $Q_{1-\alpha}(\{S_i\}_{i=1}^n)$, i.e., the $\ceil{(1-\alpha)(n+1)}$-th smallest calibration score, becomes $\infty$, yielding trivial prediction sets ($\mathcal{Y}$). \methodname\ addresses this challenge via a different approach.

For calibration examples where no admissible solution is found within the sampling budget, we set $K_i=M+1$; formally, $K_i = \min \bigl( \{j \leq M : \mathcal{A}(X_i, Y_{ij}) = 1\} \cup \{M+1\} \bigr)$. This sentinel value $M+1$ indicates the absence of an admissible solution in $M$ samples. While the true number of samples required is unknown in such cases, we know it exceeds $M$. This definition ensures finite $K_i$ values, enabling the use of any standard regression-based non-conformity scores to calibrate $\{(X_i, K_i)\}_{i=1}^{n}$. %As we will see later,
This choice also allows us to handle abstention in a principled way.

To develop this calibration procedure, we first characterize the distribution of $K_i$. For our problem setting, the samples are collected using an i.i.d.\ sampling process, and thus, $K_i$ follows a geometric distribution (censored at $M+1$) conditional on $X_i$, with its success probability an unknown function of $X_i$, i.e., $K_i$ given $X_i=x$ follows $\mathrm{Geom}(f(x))$. While $f$ is unknown, we can train an estimator $\hat{f}$ to estimate the success probability conditional on $X_i$, on a separate training split. We then use $\hat{f}$ to produce an estimate of the $1-\alpha$ conditional quantile for $K_{n+1}$, being careful to account for finite sampling:
\begin{equation}\label{eq:qhat}
    \hat{q}_{\alpha}(x) := \ceil{\frac{\ln{\alpha}}{\ln{\{ 1-\hat{f}(x) \}}}}
\end{equation}
While this quantile estimate is only an estimate and will not, in general, yield $1-\alpha$ coverage in finite samples, we can conformalize it to obtain valid finite-sample coverage guarantees. For this, we use the Conformalized Quantile Regression (CQR) score from~\citet{romano2019conformalized}:
\[
S(x, k) = \max\{0, k - \hat{q}_{\alpha}(x)\}.
\]
% \begin{equation}
%     \begin{aligned}
%         S(x, k) &= \max\{0, k - \hat{q}_{\alpha}(x) \}  \\
%         S_i &= S(X_i, K_i)
%     \end{aligned}
% \end{equation}

The score function is asymmetric, since we are building a one-sided interval of the form $[0, \hat{K}]$. Next, using the calibration scores $S_i = S(X_i, K_i)$, we can construct a one-sided interval for $K_{n+1}$ by applying the conformal adjustment term: %, as follows:
\begin{equation}
\label{eq:k-estimate}
    \hat{K}(X_{n+1}) = \hat{q}_{\alpha}(X_{n+1}) + Q_{1-\alpha}(\{S_i\}_{i=1}^n) 
\end{equation}

Recall that $Q_{1-\alpha}(\{S_i\}_{i=1}^n)$ is the conformal quantile over the calibration scores $S_i = S(X_i, K_i)$.
Now, because of our definition of $K_i$, $\hat{K}$ will always produce finite values, although they might exceed $M$. If this does happen, we must allow the model to abstain; it must output infinite sets. We can do this using the below scheme: %with the following adjustment: %as follows:
\begin{equation}
    \label{eq:method-conformal-set}
    \hat{K}(X_{n+1}) \leftarrow
    \begin{cases}
        \hat{K}(X_{n+1}), & \text{if } \hat{K}(X_{n+1}) \leq M, \\
        \infty, & \text{otherwise.}
    \end{cases}
\end{equation}

% Note that the truncation above corresponds to an abstention; if $\hat{K}(X_{n+1})$ is larger than $M$, the calibration algorithm asserts that the underlying generative model is not able to produce an admissible output under $M$ samples. 
With the above $\hat{K}$, we have the below guarantee due to CP:
\begin{equation}
    P\{K_{n+1} \leq \hat{K}(X_{n+1})\} \geq 1-\alpha
\end{equation}
This can then be translated back to the original output space, $\mathcal{Y}$, resulting in the following coverage guarantee for GPS:
\begin{proposition}
Let $\hat{C}(X_{n+1}) = \{Y_j \sim \hat{\pi}(\cdot|X=X_{n+1})\}_{j=1}^{\hat{K}(X_{n+1})}$ be the prediction set generated according to Algorithm~\ref{alg:gps}. Then we have the following coverage guarantee:
\begin{equation*}
    P\{\exists Y \in \hat{C}(X_{n+1}): \mathcal{A}(X_{n+1}, Y) = 1\} \geq 1-\alpha
\end{equation*}
\end{proposition}

This statement is a direct result of the equivalence of events stated in Eq.~\ref{eq:y-k-space-equivalence}. We include a proof in the Appendix.

% To elucidate $\hat{K}$'s abstention behavior, consider a dataset where we have a large number of abstentions. If $\hat{f}$ is a good in that it abstains when $\hat{\pi}$ abstains, a large number of scores will be 0. Thus conformal term in Eq.~\ref{eq:k-estimate} will likely be small, and at test time, $\hat{f}$ determines whether to abstain or not based on specific input examples. On the other extreme, if $\hat{f}$ is a fixed constant such that $\hat{q}_\alpha=1$, we will obtain a score of $M$ on every calibration example where $\hat{\pi}$ abstains, pushing the quantile term to a high value. With enough of these terms, the quantile will be $M$ and our calibrated predictor will take a constant value, $\hat{K}=M+1$, thereby abstaining \textit{always}. Thus, our calibration procedure adjusts abstention based on the quality of the estimator for $K_i$ as well as the generative model's abstention rate.
To understand $\hat{K}$'s abstention behavior, consider two extremes. When $\hat{f}$ accurately predicts $\hat{\pi}$'s abstentions, many calibration scores will be small, resulting in a modest conformal adjustment in Eq.~\ref{eq:k-estimate}. Thus, abstention decisions are primarily driven by $\hat{f}$'s input-specific predictions. Conversely, with a poor estimator $\hat{f}$ that outputs a constant probability (yielding $\hat{q}_{\alpha}=1$), calibration examples where $\hat{\pi}$ abstains will have scores of $M$. With enough such examples, this forces $\hat{K}=M+1$ universally, leading to constant abstention. Our calibration procedure thus adapts based on both $\hat{f}$'s quality and $\hat{\pi}$'s abstention rate.

% This tells us that if we generate $\hat{K}(X_{n+1})$ samples from the generative model, we will obtain at least one admissible solution with a marginal probability of $1-\alpha$.
We have now obtained a stopping rule for sampling, $\hat{K}(X_{n+1})$, without having to generate any samples from $\hat{\pi}(\cdot | X)$. At this point, the user has the option to abstain, if $\hat{K}$ is either $\infty$ or deemed too high for their usage scenario. To construct the prediction set from combinatorial space $\mathcal{Y}$, we take $\hat{K}(X_{n+1})$ samples from the generative model:
\begin{equation}
{\hat{C}(X_{n+1})} = \{Y_{j} \sim \hat{\pi}(\cdot|X=X_{n+1})\}_{j=1}^{\hat{K}(X_{n+1})}
\end{equation}
$\hat{C}(X_{n+1})$ contains only distinct outputs. Due to potential duplicate samples from the generative model, the cardinality of $\hat{C}(X_{n+1})$ is at most $\hat{K}(X_{n+1})$ and may be strictly smaller.

% This ensures that our prediction set accurately represents unique outcomes while maintaining the coverage guarantee.

% \vspace{1ex}
\noindent {\bf Construction of admissibility estimator $\hat{f}$.} 
While \methodname\ guarantees valid prediction sets with the desired coverage level, the admissibility predictor $\hat{f}$ determines three key performance characteristics: empirical set sizes, sampling efficiency, and adaptivity of the prediction sets.

The construction of $\hat{f}$ requires collecting multiple samples from the underlying deep generative model using a separate training set. For each training example $X_i$, we obtain samples $\{Y_{ij}\}_{j=1}^{M_{\text{train}}}$, where $M_{\text{train}}$ need not equal the calibration parameter $M$. The probability of success ($f(X_i)$) can be estimated, for example, using a MAP estimate under a $\mathrm{Beta}(2,2)$ prior (equivalently, the posterior mean under a uniform prior):
\begin{equation}
    p_i = \frac{\sum_{j=1}^{M_{\text{train}}}\mathcal{A}(X_i, Y_{ij}) + 1}{M_{\text{train}} + 2},
\end{equation}
yielding training data pairs $(X_i, p_i)$.

Training $\hat{f}$ to predict success probabilities requires input signals derived solely from $X_i$. The primary options are log probabilities and latent space representations from the generative model. When log probabilities are inaccessible (as in API-based models), one can utilize either a separate embedding model or hidden states from a smaller surrogate model. In Section~\ref{sec:experimental-results}, we demonstrate this approach using both input log probabilities and hidden states from a smaller model (Phi 2) to train $\hat{f}$ for a larger model (GPT4o-mini). A constant predictor serves as a final alternative when no input signals are available---while preserving marginal validity, this yields fixed-size prediction sets.
% \paragraph{Optimality} \devjeet{We might want to include this discussion in theoretical section rather than here.}

% \vspace{-2ex}
% \subsection{Theoretical Properties of GPS}

% In this section, we discuss the theoretical properties of \methodname. Formal proofs are provided in the Appendix (section~\ref{app:proofs}). 

%\vspace{1ex}


% \vspace{1ex} 

% \noindent {\bf Oracle properties.} The following corollary justifies our choice of score function, which provides finite sample conditional coverage if we have access to the oracle $f$:
% \begin{corollary}
% Given the oracle estimate, i.e. $\hat{f}(X) := f(X)$, the sets generated using GPS have conditional coverage:
% \[
% P\{\exists Y \in \hat{C}(X_{n+1}): \mathcal{A}(X_{n+1}, Y) = 1 | X_{n+1}\} \geq 1-\alpha
% \]
% \end{corollary}

% As a consequence of the one-sided interval on $K$, the coverage here is \textit{conservative}, since the second term in Eq~\ref{eq:k-estimate} is positive, and need not be 0 even for the oracle $\hat{f}$. 

% Next, using the continuous version of the \methodname\ (obtained by removing the ceiling functions in the quantile estimates), and the upper bound of the coverage guarantee for CP (Eq. \ref{eq:conformal-guarantee-ub}), we show that asymptotically, the set sizes for \methodname\ are close to oracle set sizes with high probability:

% \begin{lemma} For the oracle predictor $\hat{f} = f$, let $k_0(X_{n+1}) = \ln(\alpha)/\ln(1-f(X_{n+1}))$ (the continuous oracle quantile estimate). Then, for any $\epsilon > 0, 0 < \delta < 1$, there exists a $n_0 \in \mathbb{N}$, such that if $|D_{cal}| > n_0$, \methodname\ with the oracle predictor produces set sizes close to oracle set sizes, such that:
% \[
% P\Bigl\{\frac{\hat{K}(X_{n+1}) - k_0(X_{n+1})}{k_0(X_{n+1})} \ge \epsilon \Bigr\} \le \delta
% \]
% \end{lemma}
% $n_0$ is given by
% \[
% n_0 = \frac{1}{\alpha \delta (1 - \alpha^\epsilon)}
% \]

% As an example, if we pick $\alpha=.2$, $\delta = .2$ and $\epsilon = 5\%$, we can achieve this bound with $\approx 350$ calibration examples.
