
\section{Background}
\label{sec:background}

Let us consider the supervised machine learning setting where we have an input variable $X \in \mathcal{X}$ and a target variable $Y \in \mathcal{Y}$ jointly distributed according to an unknown distribution $X,Y \sim p(X)p(Y|X)$. Given a prediction function $f:\mathcal{X} \rightarrow {\mathcal{Y}'}$, where ${\mathcal{Y}'}$ is an output space in which $f(X)$ approximates some statistic of $p(Y|X)$\footnote{E.g.,
${\mathcal{Y}'} = \Delta ^{|\mathcal{Y}|-1}$ if $f$ outputs a probability vector over labels in the classification setting, or $\mathcal{Y}' = \mathcal{Y}$ for regression.}, we consider a non-conformity score function $S_{f}: \mathcal{X}\times \mathcal{Y}  \rightarrow \mathbb{R}$ that depends on $f$ and measures the discrepancy between the prediction $f(X)$ and the corresponding target $Y$. We use $S$ to denote the non-conformity random variable $S = S_{f}(X,Y)$, which depends on the input variable $X$, the target variable $Y$, and the model $f$.

In the split conformal setting we assume access to an i.i.d. calibration dataset $\mathcal{D}^{cal} = \{(X_i,Y_i)\}^n_{i=1} \sim p(X,Y)^{\otimes n}$ that is independent of $f$. Then, the non-conformity scores $\{s_i = S_f(X_i,Y_i)\}_{i=1}^n$ are exchangeable with the non-conformity score of any unseen sample, $S_{n+1} = S_{f}(X_{n+1},Y_{n+1})$. This exchangeability property implies the following for any new sample $X_{n+1},Y_{n+1} \sim p(X)p(Y|X)$ (the upper bound additionally requires the scores to be almost surely distinct):
\begin{equation}
\label{eq:quantile}
\begin{array}{l}
     1- \alpha \le P\Big(S_{n+1} \le \textsc{q}_{1-\alpha}(\{s_i\}_{i=1}^n) \Big) \le 1- \alpha + \frac{1}{n+1}, \\
     \textsc{q}_{1-\alpha}(\{s_i\}_{i=1}^n) = \mathbb{Q}_{1-\alpha}\Big(\sum^n_{i=1} \frac{1}{n+1}\delta_{s_i} + \frac{1}{n+1} \delta_{\infty}\Big),
\end{array}
    % P\Big(S_{n+1} \le \mathbb{Q}_{1-\alpha}(\sum^n_{i=1} \frac{1}{n+1}\delta_{s_i} + \frac{1}{n+1} \delta_{\infty}) \Big) \ge 1- \alpha
\end{equation}
where $\mathbb{Q}_{1-\alpha}(\cdot)$ denotes the $1-\alpha$ quantile operator of its input (in Eq.~\ref{eq:quantile}, this is the
$\lceil{(1-\alpha)(n+1)}\rceil$-th smallest value in $\{s_i\}_{i=1}^n \cup \{\infty\}$), and $\alpha \in (0,1)$ is a pre-specified mis-coverage level. Accordingly, we can define the conformal prediction set for a given mis-coverage level $\alpha$ based on the non-conformity score function as
\begin{equation}
\begin{array}{l}
         C_f(X_{n+1}) = \{y \in \mathcal{Y}: S_f(X_{n+1},y) \le \textsc{q}_{1-\alpha}(\{s_i\}_{i=1}^n)\}.
    % S_{n+1} = S_f(X_{n+1},Y_{n+1})
\end{array}
\label{eq:average_coverage}
\end{equation}
By Eq.~\ref{eq:quantile}, this set satisfies the marginal coverage guarantee $P(Y_{n+1} \in C_f(X_{n+1})) \ge 1- \alpha$.

\paragraph{Conditional and Local Coverage Guarantees} The set described in Eq.~\ref{eq:average_coverage} provides a guarantee on average across the entire data distribution, but not for any particular value of $X$; that is, it does not ensure $P(Y_{n+1} \in C_f(X_{n+1})| X_{n+1}=x) \ge 1- \alpha$ for all $x \in \mathcal{X}$, a property known as conditional coverage. This desirable guarantee cannot be achieved in practice without distributional assumptions~\cite{vovk2012conditional,lei2014distribution,foygel2021limits}, since it would require $C_f(x)$ to have infinite expected length at any non-atom $x$. A relaxation of this setting is to consider local coverage over a partition of the support of $P(X)$, described by a map $g:\mathcal{X} \rightarrow \mathcal{G}$ with $\mathcal{G}$ a finite discrete set. Then, a local conditional guarantee requires $P(Y_{n+1} \in C_f(X_{n+1})| X_{n+1} \in g_j) \ge 1- \alpha$ for all $j \in \mathcal{G}$, where $g_j =\{x: g(x)=j\}$.

\paragraph{Pinball Loss in the Infinite Sample Regime} In the ideal case where the conditional distribution of the non-conformity scores, $P(S|X)$, is known, the most efficient prediction interval for a given $X$ and mis-coverage level $\alpha$ is
\begin{equation}
\begin{array}{r}
C(X) = \{y \in \mathcal{Y}: S_f(X,y) \le F^{-1}_{S|X}(1-\alpha) \},
\end{array}
\end{equation} 
with $F^{-1}_{S|X}(1-\alpha) = \inf\{\hat{s} \in \mathrm{supp}(P_{S|X}): P(S \le \hat{s}|X) \ge 1-\alpha\}$. We can approximate the $1-\alpha$ conditional quantile by minimizing the expected pinball loss
\begin{equation}
\label{eq:gen_pinball_objective}
    \begin{array}{r}
        F^{-1}_{S|X}(1-\alpha) = \arg\min\limits_{q \in \mathcal{Q}} \mathbb{E}_{p(X,S)}\big[\ell_{1-\alpha}(q(X),S)\big]
    \end{array}
\end{equation}
where $\mathcal{Q}$ represents a universal class of functions and $\ell_{1-\alpha}(\cdot,\cdot)$ is the pinball loss, defined as
% \begin{equation}
% \begin{array}{rl}
% \ell_{1-\alpha}(q,s) &= (1-\alpha)(s-q)\mathbf{1}[s \ge q]  \\
% &+ \alpha(q-s) \mathbf{1}[s < q]. 
% \end{array}
% \end{equation}
\begin{equation}
\begin{array}{rl}
\ell_{1-\alpha}(q,s) &= \max\{(1-\alpha)(s-q), \alpha(q-s) \}. 
\end{array}
\end{equation} 

Section~\ref{sec:region_identification} leverages the pinball loss, in addition to a worst-case generalization objective, to identify a set of disjoint regions in the input space where the $1-\alpha$ quantile of the non-conformity score differs significantly. We then use the grouping discovered in this step as input to a group-conditional split conformal approach, which provides local conditional guarantees on the identified groups. Section~\ref{sec:region_conformal} presents an implementation of this approach based on decision trees, which provide an interpretable clustering of the input space based on the input features (or an interpretable embedding thereof).

% \section{Background}
% \label{sec:background}

% Let us consider the supervised machine learning setting where we have an input variable $X \in \mathcal{X}$ and a target variable $Y \in \mathcal{Y}$ jointly distributed according to an unknown distribution $X,Y \sim p(X)p(Y|X)$. Given a prediction function $f:\mathcal{X} \rightarrow {\mathcal{Y}'}$, where ${\mathcal{Y}'}$ is an output space that approximates some statistic of $p(Y|X)$, \footnote{ e.g.,
% ${\mathcal{Y}'} = \Delta ^{|\mathcal{Y}|-1}$ if $f$ outputs a probability vector over labels in the classification setting,  or $\mathcal{Y}' = \mathcal{Y}$ for regression}, we consider a non-conformity score function $S_{f}: \mathcal{X}\times \mathcal{Y}  \rightarrow \mathbb{R}$, that depends on $f$ and measures the proximity between the prediction $f(X)$ and the corresponding target $Y$. We use $S$ to denote the non-conformity random variable $S = S_{f}(X,Y)$ that depends on the input variable $X$, target variable $Y$ and model $f$.

% In the split conformal setting we assume we have access to an i.i.d. calibration dataset $\mathcal{D}^{cal} = \{(X_i,Y_i)\}^n_{i=1} \sim p(X,Y)^{\otimes n}$ that is independent of $f$. Then, the set of non-conformity scores $\{s_i = S_f(X_i,Y_i)\}_{i=1}^n$ are exchangeable with any unseen non-conformity score sample $S_{n+1} = S_{f}(X_{n+1},Y_{n+1})$. This exchangeabilty property implies the following for any new sample $X_{n+1},Y_{n+1} \sim P(X)P(Y|X)$
% \begin{equation}
% \label{eq:quantile}
% \begin{array}{l}
%      1- \alpha \le P\Big(S_{n+1} \le \textsc{q}_{1-\alpha}(\{s_i\}_{i=1}^n) \Big) \le 1- \alpha + \frac{1}{n+1} \\
%      \textsc{q}_{1-\alpha}(\{s_i\}_{i=1}^n) = \mathbb{Q}_{1-\alpha}(\sum^n_{i=1} \frac{1}{n+1}\delta_{s_i} + \frac{1}{n+1} \delta_{\infty}) 
% \end{array}
%     % P\Big(S_{n+1} \le \mathbb{Q}_{1-\alpha}(\sum^n_{i=1} \frac{1}{n+1}\delta_{s_i} + \frac{1}{n+1} \delta_{\infty}) \Big) \ge 1- \alpha
% \end{equation}

% where $ \mathbb{Q}_{1-\alpha}(\cdot)$ denotes the $1-\alpha$ quantile operator of its input (for Eq.\ref{eq:quantile}  this is the  
%  $ \lceil{(1-\alpha)(n+1)}\rceil$-th smallest $\{s_i\}_{i=1}^n$), and $\alpha \in (0,1)$ is a pre-specified mis-coverage level. Conversely, we can define the conformity set for a given mis-coverage level $\alpha$ based on the non-conformity score function as 
% \begin{equation}
% \begin{array}{l}
%          C_f(X_{n+1}) = \{Y_{n+1} \in \mathcal{Y}: S_{n+1} \le \textsc{q}_{1-\alpha}(\{s_i\}_{i=1}^n)\}. \\
%     % S_{n+1} = S_f(X_{n+1},Y_{n+1})
% \end{array}
% \label{eq:average_coverage}
% \end{equation}
% This satisfies $P(Y_{n+1} \in C_f(X_{n+1})) \ge 1- \alpha$.

% \paragraph{Conditional and Local Coverage Guarantees} The set described in Eq.~\ref{eq:average_coverage} provides guarantees on average across the entire data distribution, but not for any particular value of $X$, i.e., $P(Y_{n+1} \in C_f(X_{n+1})| X_{n+1}=x) \ge 1- \alpha, \forall x \in \mathcal{X}$, also known as conditional coverage. This desirable guarantee cannot be achieved in practice \cite{vovk2012conditional,lei2014distribution,foygel2021limits}, since it would require $C_f(x)$ to have infinite expected length at any non-atom $x$. A relaxation of this setting is to consider local coverage over a partition of the support of $P(X)$ denoted as $g:\mathcal{X} \rightarrow \mathcal{G}$ with $\mathcal{G}$ a discrete finite set. Then, local conditional guarantees implies $P(Y_{n+1} \in C_f(X_{n+1})| X_{n+1} \in g_j) \ge 1- \alpha$ with $g_j =\{x: g(x)=j\}$ $\forall j \in \mathcal{G}$.

% \paragraph{Pinball Loss in the Infinite Sample Regime} In the ideal case were the conditional distribution of the non-conformity scores ( $P(S|X)$) is known, the most efficient prediction interval for a given $X$ and mis-coverage level $\alpha$ is
% \begin{equation}
% \begin{array}{r}
% C(X) = \{y \in \mathcal{Y}: S(X,y) \le F^{-1}_{S|X}(1-\alpha) \}\\
% \end{array}
% \end{equation} 
% with $F^{-1}_{S|X}(1-\alpha) = \inf\{\hat{s} \in supp(P_{S|X}): P(S \le \hat{S}|X) \ge 1-\alpha\}$. We can approximate the $1-\alpha$ conditional quantile by minimizing the expected pinball loss
% \begin{equation}
% \label{eq:gen_pinball_objective}
%     \begin{array}{r}
%         F^{-1}_{S|X}(1-\alpha) = \arg\min\limits_{q \in \mathcal{Q}} \mathbb{E}_{p(X,S)}\big[\ell_{1-\alpha}(q(X),S)\big]
%     \end{array}
% \end{equation}
% where $\mathcal{Q}$ represents a universal class of functions and $\ell_{1-\alpha}(\cdot,\cdot)$ is the pinball loss, defined as
% % \begin{equation}
% % \begin{array}{rl}
% % \ell_{1-\alpha}(q,s) &= (1-\alpha)(s-q)\mathbf{1}[s \ge q]  \\
% % &+ \alpha(q-s) \mathbf{1}[s < q]. 
% % \end{array}
% % \end{equation}
% \begin{equation}
% \begin{array}{rl}
% \ell_{1-\alpha}(q,s) &= \max\{(1-\alpha)(s-q), \alpha(q-s) \}. 
% \end{array}
% \end{equation} 

% Section \ref{sec:region_identification} leverages pinball loss, in addition to a worst case generalization objective, to identify a set of disjoint regions in the input space where the $1-\alpha$ quantile of the non-conformity score differs significantly. We use the discovered grouping in this prior step as an input to a group-conditional split conformal approach which now holds local conditional guarantees on the identified groups. Section \ref{sec:region_conformal} presents an implementation of this approach based on decision trees, which provide an interpretable clustering of the input space based on the input features (or an interpretable embedding of the same).