\documentclass[accepted]{uai2024}
\usepackage[american]{babel}
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\newcommand{\swap}[3][-]{#3#1#2} % just an example
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}
\newcommand{\score}{conformity score}
\usepackage[textsize=tiny]{todonotes}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}


%\usepackage{algorithm}
%\usepackage{algorithmic}
%\usepackage{amsmath}
\usepackage{amssymb}
%\usepackage{amsthm}
\usepackage[formats]{listings}
\lstset{basicstyle=\ttfamily,numberstyle=\small,language=C,frame=single}
\lstset{
    % numbers=left,
    breaklines=true,
    tabsize=2,
    float,
    basicstyle={\ttfamily\small},
    stringstyle=\ttfamily,
    showstringspaces=false,
    literate={\ \ }{{\ }}1
}



\title{Normalizing Flows for Conformal Regression}
\author[1]{Nicolo~Colombo}
% Add affiliations after the authors
\affil[1]{%
    {\tt nicolo.colombo@rhul.ac.uk}\\
    
    Computer Science Department\\
    Royal Holloway, University of London\\
    Egham, Surrey, UK
}
\newcommand{\supplementaryMaterial}{{\tt Supplementary Material.pdf}}


\begin{document}
\maketitle

\begin{abstract}
Conformal Prediction (CP) algorithms estimate the uncertainty of a prediction model by calibrating its outputs on labeled data.  The same calibration scheme usually applies to any model and data without modifications. The obtained prediction intervals are valid by construction but could be inefficient, i.e. unnecessarily big, if the prediction errors are not uniformly distributed over the input space.

We present a general scheme to localize the intervals by training the calibration process. The standard prediction error is replaced by an optimized distance metric that depends explicitly on the object attributes.  Learning the optimal metric is equivalent to training a Normalizing Flow that acts on the joint distribution of the errors and the inputs.  Unlike the Error Re-weighting CP algorithm of \cite{papadopoulos2008normalized}, the framework allows estimating the gap between nominal and empirical conditional validity. The approach is compatible with existing locally-adaptive CP strategies based on re-weighting the calibration samples and applies to any point-prediction model without retraining.
%

\end{abstract}
\section{Introduction}
\label{section introduction}
In natural sciences, calibration often refers to comparing measurements of the same quantity made by a new device and a reference instrument.\footnote{The International Bureau of Weights and Measurements defines calibration as the
\emph{"operation that, under specified conditions, in a first step, establishes a relation between the quantity values with measurement uncertainties provided by measurement standards and corresponding indications with associated measurement uncertainties (of the calibrated instrument or secondary standard) and, in a second step, uses this information to establish a relation for obtaining a measurement result from an indication."}
}  
In data science, calibrating a model means quantifying the uncertainty of its predictions.
%
Parametric and non-parametric methods for model calibration have been proposed in the past.
%
Examples of trainable post hoc approaches are Platt scaling \citep{platt1999probabilistic}, Isotonic regression \citep{zadrozny2002transforming}, and Bayesian Binning \citep{naeini2015obtaining}. 
%
Here we focus on regression problems, where data objects have an attribute, $X \in {\cal X}$, and a real-valued label, $Y \in {\mathbb R}$.
%
The model output is a \emph{point-like} prediction of the most likely label given its attribute, i.e. $f(X) \approx {\rm E}(Y|X)$.
%
Calibrating $f$ would promote $f(X)$ to a Prediction Interval (PI), i.e. a \emph{subset of the label space}, $C \subseteq {\mathbb R}$, that contains the unknown label, $Y$, with lower-bounded probability.
%
Given a target \emph{confidence level}, $1 - \alpha \in (0, 1)$, $C$ is \emph{valid} if it contains the unknown label with probability at least $1 - \alpha$, i.e. if ${\rm Prob}(Y \in C) \geq 1-\alpha$. 
%
Conformal Prediction (CP) is a frequentist approach for producing valid PIs without making assumptions on the data-generating distribution, $P_{XY}$, or the prediction model, $f$ \citep{vovk2005algorithmic, shafer2008tutorial}.
%
PIs are obtained by evaluating the \emph{conformity} between predictions and labels of a \emph{calibration set}. 
%
The evaluation is based on a \emph{conformity function}, e.g. the absolute residual, $|Y - f(X)|$.
%
Validity guarantees come automatically from the probabilistic properties of empirical quantile estimation. 

Different conformity functions, however, may produce non-equivalent PIs.
%
Several criteria have been proposed to assess their \emph{efficiency} \citep{vovk2016criteria}.
%
For real-valued labels, a straightforward criterion is the average size, ${\rm E}(|C|)$.
%
Making the PIs locally adaptive (over the input space) may increase efficiency if the data are heteroscedastic. 
%
Intuitively, this happens because the size of adaptive PIs changes according to the performance of $f$, i.e. the prediction band shrinks where $|Y - f(X)|$ is small and grows where $|Y-f(X)|$ is large.


\subsection{Outline}
We aim to \emph{localize} conformal PIs by letting the calibration function depend on the object attributes explicitly.
%
We assume the calibration samples, $(X_1, Y_1), \dots, (X_N, Y_N)$, and the test object, $(X_{N+1}, Y_{N+1})$, are independently drawn from the same \emph{joint distribution}, i.e. $(X_n, Y_n) \sim P_{XY}$.
%
Given a conformity function, $a:{\mathbb R}^2 \to {\mathbb R}$, we compute a series of calibration scores, $A_n=a(Y_n, f(X_n))$, $n=1, \dots, N$, and evaluate the conformity of all possible test labels, $y \in {\mathbb R}$, using the same function, $a(y, f(X_{N+1}))$. 
A common choice is $A_n = |Y_n-f(X_n)|$.
The PIs are defined by a threshold, $Q$, which establishes a \emph{validity condition} for the test-time evaluation function, $a(y, f(X_{N+1}))\geq Q$.
If $A_n = |Y_n-f(X_n)|$, for example, we include a candidate label in the PI if $|y - f(X_{N+1})|$ is smaller than $Q$.
%
At confidence level $1-\alpha$, $Q$ should be such that $a(y, f(X_{N+1}))\leq Q$ guarantees ${\rm Prob}(Y_{N+1} \in C) \geq 1 - \alpha$.
%
In CP, $Q = Q_A$ is the \emph{sample quantile} of the calibration scores, $\{ A_n\}_{n=1}^N$.
%
The obtained PIs, $C_A$, are valid by construction and \emph{marginal} because $Q_A$ approximates the quantile of the marginal distribution $P_A = \sum_{XY}P_{AXY}$.
%
As the threshold is obtained from $(X_1, Y_1), \dots, (X_{N}, Y_N)$, depends on the joint distribution of the calibration and test samples, i.e. ${\rm Prob}(Y_{N+1} \in C_A) = P_{X_{N+1}Y_{N+1}X_1Y_1\dots X_NY_N}(Y_{N+1} \in C_A)$.
%
In particular, there is no conditioning on the test object attribute, $X_{N+1}$ \citep{vovk2012conditional}.
%
PIs with input-conditional coverage, ${\rm Prob}(Y_{N+1} \in C_{A|X{N+1}}|X_{N+1})$ \emph{cannot} be obtained with finite data and without certain regularity assumptions on the data distribution \citep{lei2012distribution, vovk2012conditional, foygel2021limits}.
%
Approximating ideal distribution-free conditionally-valid PIs is the goal of an active research stream (see Section \ref{section related work}). 
%
Unlike most existing methods based on sample-specific re-weighting \citep{lin2021locally, tibshirani2019conformal, guan2023localized}\footnote{In \cite{lin2021locally, tibshirani2019conformal, guan2023localized}, the sample quantile of $\{ A_n\}_{n=1}^N$ is replaced by the quantile of an importance-sampling estimate of the empirical input-conditional distribution, $P_{A|X_{N+1}} \approx \sum_{n=1}^N w_n(X_{N+1}) {\bf 1}(A =A_n)$, where $\sum_{n=1}^N w_n(X_{N+1})  = 1$ and $w_n(X_{N+1})$ depends on $X_{N+1}$ through a given function.}, our strategy is to change the definition of the conformity function, i.e. replace $a$ with $b = b(a(Y, f(X)), X)$. 
We then apply $b$ unconditionally to \emph{all} calibration and test samples.
%
Data exchangeability holds automatically, provided $b$ is trained on a separate set.
%
As we use all the available \emph{transformed calibration samples}, $B_n = b(a(Y_n, f(X_n)), X_n)$, to compute the transformed-space threshold, $Q = Q_B$, the obtained PIs are marginally valid.
%
Local adaptability arises when the validity requirement, $b(a(y, f(X_{N+1})), X_{N+1}) \leq Q_B$, is \emph{mapped back} to the label space (by inverting $b$).
%

In this work we address the following problem, 
\begin{center}
    \emph{What transformations of the conformity function improve CP adaptivity? How can we optimize a transformation using a separate training set from the task?}
\end{center}
We start by interpreting $b$ as a \emph{Normalizing Flow} (NF), i.e. a coordinate transformation that maps a \emph{source} distribution, $P$, into a \emph{target} distribution, $P'$ \citep{papamakarios2021normalizing}.
%
In our case, the source distribution is the joint distribution of the conformity scores and the object attributes, $P_{AX}$.
%
The target is a factorized distribution, $P_{BX} = U_{B} P_X$, where $U_{B}$ is an arbitrary univariate distribution.
%
In the $(B, X)$-space, the PIs are marginally valid and have a constant size. 
Their efficiency is guaranteed because the joint distribution factorizes, which implies $P_{B|X} = U_B$ for all $X$ and hence the equivalence between marginally and conditionally valid PIs.
%
To enforce the factorization, we train $b$ by maximizing the likelihood of the transformed samples under $U_{B}$.
%
The $(B, X)$-space PIs are defined by the standard validity condition, $b(a(y, f(X_{N+1})), X_{N+1}) \leq Q_B$, 
When $b$ is invertible (in its first argument and for any $X_{N+1}$), we can obtain an equivalent condition for the label-space, $a(y, f(X_{N+1}) \leq b^{-1}(Q_B, X_{N+1})$.
%
Intuitively, this produces locally adaptive PIs because $b^{-1}(Q_B, X_{N+1})$ approximates the unavailable conditional quantile $Q_{A|X}$.
%
The approximation error depends on the calibration set size and the distribution distance between $P_{AX}$ and $U_{b(A, X), X} P_X$. 
%


\subsection{An example}
Let $P_X={\rm Uniform}({\cal X})$ be the uniform distribution over ${\cal X}=[0, 1]$ and $(X_{1}, Y_{1}), \dots, (X_{N}, Y_{N}), (X_{N+1}, Y_{N+1})$ a collection of i.i.d. random variables from $P_{XY}=P_{Y|X}P_X$ where
\begin{align}
\label{toy model}
    &P_{Y|X} \sim  {\bf 1}(X < 0.5) \varepsilon_1 + {\bf 1}(X > 0.5) \varepsilon_5  
\end{align}
with $\varepsilon_1 \sim {\cal N}(0,1)$ and $\varepsilon_5 \sim {\cal N}(0,5)$.
%
Assume the prediction model is $f(X) = {\rm E}(Y|X) = 0$, for all $X\in {\cal X}$, and let the conformity measure be $a(Y, f(X)) = |Y- f(X)|=|Y|$.
%
Use $f$ and $a$ to form $\{ (A_n, X_n)= (|Y_n|, X_n) \}_{n=1}^N$.
%
As the conformity function, $a$ is deterministic, the \emph{conformity scores} are i.i.d. random variables.
%
Let $1-\alpha$ be the target confidence level and $Q_A$ the $(1-\alpha)$-th sample quantile of $\{ (A_n, X_n)\}_{n=1}^N$.
%
Assuming the scores are i.i.d., $Q_A$ is the $m_*$-th smallest element of $\{ (A_n, X_n)\}_{n=1}^N$, where $m_*=\lceil(1 - \alpha)(N+1)\rceil$.
For example, if $N=100$ and $\alpha=0.05$, $m_*= 96$.
%
For any $X_{N+1}$, the marginal PI is $C_{A}=[f(X_{N+1})-Q_A, f(X_{N+1}) + Q_A] =  [-Q_A, Q_A]$.
%
The exchangeability of $(A_n, X_n)$, $n=1, \dots, N+1$, implies ${\rm Prob}(Y_{N+1} \in C_{A})=\frac{m*}{N+1}$.
%
Since $Q_A$ is a constant, $C_{A}$ has the same width over the entire input space ${\cal X} = [0, 1]$.

According to \eqref{toy model}, the model prediction error depends on $X$, which makes the data heteroscedastic and the marginal PIs inefficient (see Figure \ref{figure toy example 1}).
%
In particular, $C_{A}$ is too large when $X_{N+1}<0.5$ and too small when $X_{N+1}>0.5$.
%
An adaptive CP algorithm may improve efficiency by producing PIs that are smaller or larger than $C_{A}$ when $X_{N+1}<0.5$ and $X_{N+1}>0.5$.
%
Our strategy is to learn \emph{a locally adaptive conformity functions}, $b = b(A, X)$, that produces these adaptive PIs automatically. Technically, $b$ needs to be such that $b^{-1}(Q_B, X_{N+1})$, where $Q_B$ is the sample quantile of $\{ B_n = b(A_n, X_n)\}_{n=1}^N$, is smaller than $Q_A$ for $X_{N+1}<0.5$ and larger for $X_{N+1}>0.5$.
%
As $Q_B$ is computed using all samples, the condition $b(a(y, f(X_{N+1})), X_{N+1})\leq Q_B$  will produce constant-size valid PIs in the $(B, X)$-space.
%

Let $C_B = \{ y \in {\mathbb R}, b(a(y, f(X_{N+1})), X_{N+1})\leq Q_B \}$ be the marginal PIs obtained through $b$. 
%
Assuming \eqref{toy model}, we can compute the exact conditionally valid PIs for all $X$ and compare them with $C_B$ and $C_{A}$.
%  
Split  $\{(A_n, X_n)\}_{n=1}^N$ into $D_{X<0.5} = \{ (A_n, X_n), X_{n} < 0.5 \}_{n=1}^N$ and $D_{X>0.5}=\{ (A_{n}, X_n), X_n>0.5 \}_{n=1}^N$.
%
Assuming $N$  is large enough, the sample quantile of each subset, $Q_{A|X<0.5}$ and $ Q_{A|X>0.5}$, would be a good approximation of the quantiles of $P_{A|X<0.5}$ and $P_{A|X>0.5}$.
%
Let $Q_{A|X_{N+1}} = {\bf 1}(X_{N+1}<0.5) Q_{A|X<0.5} + {\bf 1}(X_{N+1}>0.5) Q_{A|X>0.5}$, where $Q_{A|X<0.5}$ and $ Q_{A|X>0.5}$, are the $m_{+}$ and $m_{-}$-th smallest elements of $D_{X<0.5}$ and $D_{X>0.5}$, with $m_{+}=\lceil(1 - \alpha)(|D_{X>0.5}|+1)\rceil$ (idem for $X<0.5$). 
%
The conditionally-valid PIs, $C_{A|X_{N+1}} = [-Q_A|X_{N+1}, Q_A|X_{N+1}]$ will depend on the location of $X_{N+1}$ through $Q_{A|X_{N+1}}$, i.e. we will have $C_{A|X_{N+1}}=[-Q_{A|X_{N+1}}, Q_{A|X_{N+1}}]$. 
%
In words, the conditionally valid PIs for $X<0.5$ and $X>0.5$ are the marginal PIs of the regions $[0,0.5] \subset {\cal X}$ and $[0.5, 1]\subset {\cal X}$. 

%
Let 
\begin{align}
\label{toy phi}
   b_{\theta}(A, X) = \frac{A}{\theta_1 + \theta_2 \ \sigma(M X)}
\end{align} 
where $\theta = (\theta_1,\theta_2)$ is a non-negative free parameter, $\sigma(t)=(1 + e^{-t})^{-1}$, and $M = 30$.
%
For any $X$ and $\theta$, $b_\theta(A, X)$ is a monotonic (and hence invertible) function of $A$.
%
$b_\theta$ is the conformity function of  \cite{papadopoulos2008normalized} with $\gamma = \theta_1$ and $g^2(X) = \theta_2 \sigma(M X)$.
%
Let $\{ B_n=b_\theta(A_n,X_n)\}_{n=1}^N$ be the \emph{transformed calibration set}.
%
In Figure \ref{figure toy example 2}, we compare a sample of $\{ A_n,X_n)\}_{n=1}^N$  and a sample of $\{ B_n=b(A_n,X_n)\}_{n=1}^N$ for two different choices of $\theta$.
%

Let $Q_{B}$ be the $(1-\alpha)$-th (marginal) sample quantile of $\{ B_n=b_\theta(A_n,X_n)\}_{n=1}^N$, i.e. $Q_{B} = A_{n_*}(\theta_1 + \theta_2 \sigma(M X_{n_*}))^{-1}$, with $n_*$ such that there are $n_*$ scores elements of $\{ B_n\}_{n=1}^N$ smaller than or equal to $B_{n_*}$ .

The exchangeability of $B_1, \dots, B_N$ and $B_{N+1}=b_{\theta}(A_{N+1}, X_{N+1})$ implies ${\rm Prob}(B_{N+1} \leq Q_{B}) = \frac{n_*}{N+1}$. 
%
Thanks to the monotonicity of $b_\theta^{-1}$, we can convert the transformed PIs back to the original space through 
${\rm Prob}(B_{N+1} \leq Q_{B})  = {\rm Prob}(A_{N+1} \leq b^{-1}(Q_{B}, X_{N+1}) = {\rm Prob}(|Y_{N+1}| \leq b^{-1}(Q_{B}, X_{N+1})) = {\rm Prob}(|Y_{N+1}| \leq Q_{B}(\theta_1 + \theta_2 \sigma(M X_{N+1})))= {\rm Prob}(Y_{N+1} \in C_B)$. 
%
If $M \to \infty$, $\theta = (1, 5)$, and $|D_{X<0.5}| = |D_{X>0.5}|$ (we assume the samples are equally split between the two regions), $Q_{B}$ is equivalent to $C_{A|X_{N+1}}$.  

If $P_{Y|X}$ is unknown, we need an optimization strategy to find $\theta$.
In the Error Re-weighted (ER) approach of \cite{papadopoulos2008normalized}, $\theta_1$ is a hyper-parameter and  $\theta_2 \sigma(M X)$ is a model of the conditional residuals, i.e. $\theta_2 = \theta_{ER} = \arg \min_t \sum_{(X, Y) \in D} |Y^2 - t^2\sigma(M X) |^2$. 
The results in Figures \ref{figure toy example 1} and \ref{figure toy example 2} are for $\theta_1=0.5$.
%
In the proposed approach, we interpret $b_\theta$ as an NF acting on $(A, X) \sim P_{AX}$ and train it by maximizing the likelihood of the transformed scores under a target factorized distribution, $U_{B}P_X$.
Choosing $U_B={\cal N}(0, 1)$, we obtain 
\begin{align}
\label{toy density}
    u_{b_\theta(A)}(A, X) = \frac{\exp\left( -\frac12 A^{2} (\theta_1 + \theta_2 \sigma(M X))^{-2} \right)}{ \sqrt{2\pi}(\theta_1 + \theta_2 \sigma(MX))}
\end{align}
where the Jacobian of $b$ is added because we evaluate the density $u_B$ at $B=b(A)$.
%
Assuming we have a separate \emph{training set} of labeled samples, $\{(A_{n'},X_{n'})\}_{n'=1}^N$, we find an optimal $b$ by minimizing $ - \sum_{n'=1}^N \log (u_{b_\theta(A)}(A_{n'}, X_{n'}))$ over $\theta$.
%
Figure \ref{figure toy example 1} shows the PIs obtained through the above procedure, $C_{flow}$, in red, and the ER approach, $C_{ER}$, in blue.

\begin{figure}[t]
    \centering
    \includegraphics[scale=.5]{plots/toyModel.pdf}
    \caption{Samples and marginal and conditional PIs for the example of Section \ref{section introduction}. 
    $C_{ER}$ and $C_{flow}$ are the (marginal) PIs obtained through the ER approach of \cite{papadopoulos2008normalized} and the proposed method.
    }
    \label{figure toy example 1}
\end{figure}
\begin{figure}[t]
    \centering
    \includegraphics[scale=.5]{plots/bSamples.pdf}
    \caption{Calibration samples of the original and transformed random variables, $A_n=|Y_n|$, $B_{ER n}=A_n(0.5 + \theta_{ER}\sigma( 30 X_n))^{-1}$, and $B_{flow n} = A_n(\theta_{flow 1} + \theta_{flow 2}\sigma( 30 X_n))^{-1}$, where 
    $\theta_{ER} = \arg \min_t {\rm E}|Y^2 - t^2 \sigma^2(M X) |^2$, $\theta_{flow}=\arg \max_\theta {\rm E}\log(u_{b_\theta}(|Y|, X))$, and $u_{b_\theta}(A, X)$ is defined in \eqref{toy density}.
    }
    \label{figure toy example 2}
\end{figure}

\section{Theory}
\label{section theory}
In this section, 
${\cal X}$ is an arbitrary attribute space and $\{ (X_{n}, Y_n) \in {\cal X} \times {\mathbb R}\}_{n=1}^{N+1}$ a collection of i.i.d. random variables from an unknown joint distribution, $P_{XY} = P_{Y|X} P_X$.
%
The regression model, $f(X_n) \approx {\rm E}(Y_n|X_n)$, $n=1, \dots, N + 1$, is assumed to be pre-trained on separate data.
%

\subsection{Quantiles}
\label{section quantiles}
Given a random variable, $Z \in {\cal Z}$, a distribution, $P_{Z}$, let $F_{Z}(z) = P_{Z}(Z \leq z)$ be the Cumulative Distribution Function of $P_Z$. 
The $(1 - \alpha)$-th quantile of $Z \sim P_Z$ is 
\begin{align}
\label{theoretical quantile definition}
\bar Q_Z = \inf_q\{q \in {\cal Z}: F_Z(q) \geq (1 - \alpha) \}    
\end{align}
When $Z$ is continuous, $F_Z$ is strictly increasing and  $\bar Q_Z = F_Z^{-1}(1 - \alpha)$.
%
The $(1-\alpha)$-th sample quantile of a collection of i.i.d. random variables, $\{ Z_n \sim P_Z\}_{n=1}^N$, is the $(1-\alpha)$-th quantile of the empirical distribution $P_Z \approx \frac{1}{N}\sum_{n=1}^N {\bf 1}(Z = Z_n)$, i.e. 
\begin{align}
\label{sample quantile definition}
    &Q_Z = \inf_q \{q \in {\cal Z}, |\{ Z_n \leq q\}_{n=1}^N | \geq n_*\} \\
    &n_* = \lceil(N + 1)(1 - \alpha)\rceil\nonumber
\end{align}
where $|S|$ is the cardinality of set $S$ and $\lceil s \rceil$ the smallest integer greater than or equal to $s \in {\mathbb R}$.
%
Assuming ties occur with probability 0, i.e. ${\rm Prob}(Z_n=Z_{n'})=0$ for any $n\neq n'$, $Q_Z$ is the  $n_*$-th smallest element of $\{ Z_n \sim P_Z\}_{n=1}^N$.
%
CP validity is a direct consequence of 
\begin{lemma}[Quantile Lemma \cite{tibshirani2019conformal}]
\label{lemma quantile}
Let $Z_1, \dots, Z_{N}, Z_{N+1} \in {\mathbb R}$ be a collection of i.i.d. random variables and $Q_Z$ be the $(1-\alpha)$-th sample quantile of $\{ Z_n \}_{n=1}^N$ defined in \eqref{sample quantile definition}. 
If ties occur with probability 0,  
\begin{align}
{\rm Prob} \left(Z_{N+1} \leq Q_Z \right) = \frac{\lceil (1 - \alpha) (N + 1)\rceil}{N +1}
\end{align}
\end{lemma}
The lemma first appeared in \cite{papadopoulos2002inductive}. 
Slightly different proofs can be found in  
\cite{lei2014distribution,tibshirani2019conformal, angelopoulos2021gentle}. 
%
The standard CP bounds, $1-\alpha \leq {\rm Prob}(Z_{N+1} \leq Q_Z) \leq 1-\alpha + \frac{1}{N+1}$, follows from $\lceil s \rceil - s\geq 0$ and $(1 - \alpha)(N+1) \leq \lceil (1-\alpha)(N+1)\rceil \leq (1 - \alpha) (N+1) + 1$. 
%
Asymptotically, $Q_Z$ is normally distributed around $\bar Q_Z$ with variance $\sigma^2 = \frac{(1-\alpha)\alpha}{N p_Z(\bar Q_Z)}$, where $p_Z(\bar Q_Z)$ is the density of $P_Z$ evaluated at $Z = \bar Q_Z$, with $\bar Q_Z$ defined in \eqref{theoretical quantile definition}. 

\subsection{Conformity scores}
A conformity score is a random variable, $A= a(f(X), Y)$, that describes the conformity between a prediction, $f(X)$, and the corresponding label, $Y$.
%
A standard choice is $a=|Y - f(X)|$. 
Let $P_{AX}$ be the distribution of the i.i.d. random variables $\{(A_{n}=|Y_n - f(X_n)|, X_{n})\}_{n=1}^{N+1}=$.
%
Lemma \ref{lemma quantile} guarantees the validity of the symmetric PI, 
\begin{align}
\label{marginal prediction intervals}
    C_A = [f(X_{N+1}) - Q_A,  f(X_{N+1}) + Q_A] 
\end{align}
when $Q_{A}$ is the $(1-\alpha)$-th sample quantile of $\{ A_n\}_{n=1}^N$.
%
We may also let the conformity scores be $B=b(A)$, where $b$ is a \emph{global monotonic function} of its argument, e.g.  $b(s)=-s^{-1}$ or $b(s) = \log s$.
%
In that case, we obtain the PIs by inverting $b$ and letting $C_{B} = [f(X_{N+1}) - b^{-1}(Q_{B}), f(X_{N+1}) + b^{-1}(Q_{B})]$, where $Q_{B}$ is the $(1-\alpha)$-th sample quantile of $\{ B_n=b(A_n)\}_{n=1}^N$ and $b^{-1} \circ b(A) = A$.
For example, $b^{-1}(Q_{B}) = - \frac{1}{Q_{B}}$ if $B=-\frac{1}{A}$  and $b^{-1}(Q_{B})  = \exp(Q_{B})$ if $B = \log A$. 
%
Assuming ties occur with probability 0, $Q_{A}$ is the $\lceil (1-\alpha) (N + 1)\rceil$-th smallest element of $\{A_n\}_{n=1}^N$.
%
Let $A_{n_*}$ be that element.
%
The $(1-\alpha)$-th sample quantile of the transformed scores, $Q_{B}$, is the $\lceil (1-\alpha) (N + 1)\rceil$-th smallest element of $\{b(A_n)\}_{n=1}^N$.
%
If $b$ is monotonic and applies globally to all samples, $b(A_n) < b(A_{n'})$ if and only if $A_n<A_{n'}$, for any $n\neq n'$.
%
Then $Q_{B} = b(A_{n_*})$ and $b^{-1}(Q_{B})= Q_{A}$, i.e. the size of the PIs does not depend on $b$.
%
If $b$ depends on the input, $b(A_n, X_n) < b(A_{n'}, X_{n'})$ does not imply $A_n<A_{n'}$, for any $n\neq n'$, i.e. the PIs depends on $b$.

%
\subsection{Normalizing Flows}
This work is about finding an input-dependent transformation $b =b(A, X)$ that changes the PIs to make them locally adaptive and more efficient.
%
In what follows, we assume $b$ always satisfies 
\begin{assumption}
    \label{assumption phi}
    Let ${\cal A}, {\cal B} \subseteq {\mathbb R}$ and $b: {\cal A} \times {\cal X} \to {\cal B}$. Then,
    \begin{enumerate}
    \item the domain and co-domain of $b$ are the same for all $X \in {\cal X}$ and 
    \item $b$ is strictly increasing on its first argument, i.e. $J_{b}(A, X) = \frac{\partial }{\partial A} b(A, X) > 0$ for all $(A, X)$.
    \end{enumerate}
\end{assumption}
Let $b^{-1}(B, X)$ be defined by $b^{-1}(b(A, X), X) = A$.
%
The assumption on the domain and co-domain of $b$ guarantees $b^{-1}(b(A, X'), X)$ is well defined for any $X \neq X'$.
%
We avoid over-fitting by letting $b$ be smooth in $X$ and $A$.
%
Since $b$ acts on random variables, we interpret it as (part of) an NF. 
%
Let $P_Z$ and $U_Z$ be two distributions with the same support, ${\cal Z}$. 
An NF is an invertible coordinate transformation from ${\cal Z}$ to ${\cal Z}$ such that 
\begin{align}
    Z' = \phi_b(Z)  \sim U_{Z'},  \quad Z = \phi_b^{-1}(Z') \sim P_Z
\end{align}
In our case, $Z = (A, X)$, $Z'=(B, X)$, and $\phi_b(A, X) = (b(A, X), X)$. 
%
The Jacobian of $\phi_b$ is a $(|{\cal X}|+1)$-dimensional squared matrix, $J_{\phi_b}$, such that $J_{\phi_b ij} = 0$ for all $i, j > 1$ and $i\neq j$, $J_{\phi_b ii} = 1$ for all $i> 1$, $J_{\phi_b 1 i} = \frac{\partial}{\partial X_i} b(A, X)$ for all $i>1$, and $J_{\phi_b 11} = \frac{\partial}{\partial A} b(A, X)$.
We often use $J_b(A, X)$ instead of $J_{\phi_b 11}$.
Assumption \ref{assumption phi} implies $J_{\phi_b 11}> 0$ and guarantees the invertibility of $\phi_b$ because,
for any $(A, X)$, ${\rm det} (J_{\phi_b}(A, X)) = \prod_{i=1}^{|{\cal X}|+1} J_{\phi_b \ ii}(A,X) = J_{\phi_b \ 11}(A, X)$ is strictly positive.
%
When not explicitly required, we drop the trivial part of $\phi_{b}$ and use $b$ for either $\phi_b$ or $\phi_{b1}$ depending on the context.
%
See \cite{papamakarios2021normalizing} for a review of using NFs in inference tasks. 
%


\subsection{Validity}
Given an NF, $b$, we let the associated marginal PI at $X_{N+1}$ be  
\begin{align}
\label{prediction interval phi}
    &C_{B} = [f(X_{N+1})- \delta, f(X_{N+1})+ \delta] \\ 
    &\delta = b^{-1}(Q_B, X_{N+1}) \nonumber 
\end{align}
where $Q_{B}$ is the $(1-\alpha)$-th sample quantile of $\{ B_n= b(A_n, X_n) \}_{n=1}^N$.
%
If ties occur with probability 0, the validity of $C_B$ defined in \eqref{prediction interval phi} is guaranteed by  
\begin{lemma}
\label{lemma validity cphi}
    Let $b$ satisfy Assumption \ref{assumption phi} and $C_B$ be the PI defined in \eqref{prediction interval phi}.
    Then 
    \begin{align}
        {\rm Prob}(Y_{N+1} \in C_B) =  \frac{\lceil (1-\alpha)(N+1)\rceil}{N+1}  
    \end{align}
\end{lemma}
%
%
The transformation is globally defined but \emph{acts differently} on the samples, e.g. we may have $b(A, X_n) \neq b(A, X_{n'})$ for some $A\in {\cal A}$ and $n\neq n'$.
%
The ranking of the \emph{original scores}, $\{A_n\}_{n=1}^N$, may differ from the ranking of the.\emph{transformed scores}, $\{B_n\}_{n=1}^N$, i.e. $A_1<A_2< \dots< A_N$  may not imply $B_1<B_2< \dots< B_N$.
%
This happens if $A_n < A_{n'}$ and $b(A_n, X_n) > b(A_{n'}, X_{n'})$ for some $n\neq n'$. 
%
While validity is automatically guaranteed because calibration and test samples remain exchangeable, we may have $C_A \neq C_B$, e.g. when $b$ changes the ranking of the calibration samples.
%
Under further mild assumptions on $b$, Lemma  \ref{lemma size is different} shows that we can find a test object for which $|C_B| \neq |C_A|$. 
\begin{lemma}
\label{lemma size is different}
Let $\{ A_{n}\}_{n=1}^{N+1}$ be a collection of i.i.d. continuous random variables.
Assume $b$ satisfies Assumption \ref{assumption phi}.
Then, if $b(A_n, X_{N+1}) \neq b(A_n, X_n)$ for any $n=1, \dots, N$,
\begin{align}
    |C_{B}| \neq |C_A|
\end{align}
with $C_B$ and $C_A$ defined in \eqref{prediction interval phi} and \eqref{marginal prediction intervals}.
\end{lemma}


\subsection{Exact Normalizing Flows}
In some cases, marginally valid PIs are also conditionally valid for any $X_{N+1} \in {\cal X}$, i.e. $C_A$ defined in \eqref{marginal prediction intervals} obeys
\begin{align}
\label{special case}
    {\rm Prob}(Y_{N+1} \in C_{A} |X_{N+1}) \geq 1 - \alpha
\end{align}
which occurs when $P_{AX}$ has a specific form.
%
For example, when the data are \emph{not} heteroscedastic, i.e. $P_{AX} =  P_{A|X} P_X = P_A P_X$.
% 
The equivalence of marginal and conditional PIs in this case is proven in %
\begin{theorem}
    \label{theorem marginal and conditional}
    Let $P_{AX} =  P_A P_X$ for any $X \in {\cal X}$.
    For any $X_{N+1} \in {\cal X}$, $C_A$ defined in \eqref{marginal prediction intervals}, obeys 
    \begin{align}
    \label{claim marginal and conditional}
        {\rm Prob}(Y_{N+1}\leq C_A|X_{N+1}) = \frac{\lceil (N+1) (1 - \alpha) \rceil}{N+1}
    \end{align}
   \end{theorem}
Theorem \ref{theorem marginal and conditional} is a straightforward consequence of the Bayesian theorem and Lemma \ref{lemma quantile}.
We include it here because it suggests we can find an NF that localizes the PIs.
%
The idea is to train the NF, $b$, to make $C_{B} = C_{b(A)}$ \emph{approximately} conditionally valid through Theorem \ref{theorem marginal and conditional}, i.e. to make $b$ such that $(b(A_n, X_n), X_n) = (B_n, X_n) \sim P_{BX} \approx P_B P_X$.
%
Interpreting $b$ as an NF, we find a near-optimal $b$ by \emph{maximizing the likelihood of the transformed scores under an arbitrary target distribution, $U_{B}$, that does not depend on the input}. 
Given samples from $A$, we need the composition between the target distribution and the score transformation, $b$.
%
$\int_{x}^{x'} dx p(f(x)) = \int_{f(x)}^{f(x')} \frac{dy}{f'(f^{-1}(y))} p(y)$ implies the density of the composition is $p(B, X) = u(b(A, X)) J_{b}(A, X)) p(X)$.
%
The objective function is  
\begin{align}
\label{likelihood}
\ell(b) &=  {\rm E} \left(\log u(B)p(X)  \right)\\
&= {\rm E}\left( \log\left(u(b(A, X) \ |J_b(A, X)|\right) \right) + \ell_0 \nonumber 
\end{align}
where $u$ is the density of the (arbitrary) target distribution $U_B$ and $\ell_0 = {\rm E} (\log p(X))$ does not depend on $b$.
%
Fix a given target distribution, $U_B$, e.g. let $U_B$ be the univariate Gauss distribution or $U_B\sim {\rm Uniform}([0, 1])$.
%
Assume there exists an NF, $b$, that satisfies Assumption \ref{assumption phi} and is such that $P_{BX} = U_{B} P_X$ for any $(A, X)$ when $B=b(A, X)$.
%
Then, $C_B$ defined in \eqref{prediction interval phi} is conditionally valid at $X_{N+1}$, as we show in 
%
\begin{corollary}
    \label{corollary exact flow intervals}
    Let $U_B$ be an arbitrary univariate distribution and $b$ an NF satisfying Assumption \ref{assumption phi}.
    If $(B, X) = (b(A, X), X) \sim P_{BX} = U_B P_X$ for any $(A, X)$, $C_B$ defined in \eqref{prediction interval phi} obeys
    \begin{align}
        {\rm Prob}(Y_{N+1} \in C_{B}|X_{N+1}) = \frac{\lceil(1 - \alpha) (N + 1) \rceil}{N+1}
    \end{align}
\end{corollary}
%
Corollary \ref{corollary exact flow intervals} follows from Lemma \ref{lemma validity cphi} and the monotonicity of $b$.
%
There is no contradiction with the negative results of \cite{lei2012distribution, vovk2012conditional} because exact factorization can not be achieved with finite data. 

\subsection{Non-exact Normalizing Flows}
Let $\hat b$ be an NF trained by maximizing a finite-sample empirical estimation of the likelihood defined in \eqref{likelihood}.
%
We should not expect $\hat b$ to factorize $P_{BX}$ exactly but assume it approximates the ideal optimal transformation, $b$, defined in Corollary \ref{corollary exact flow intervals} in the Huber sense.
%
More precisely, we let $\epsilon >0$ quantify the discrepancy between the two transformations and
\begin{align}
\label{huber expansion}
    &\hat b = (1 - \epsilon) b + \epsilon \delta, 
    \end{align}
where $\delta = \delta(A, X)$ is an unknown error term that depends on $(A, X)$.
%
The assumption is technical and used to prove the error bounds below. 
%
The density of the perturbed distribution is $p(\hat B, X) = |J_{\hat b}(A, X)| u(\hat b(A, X)))p(X)$, 
which may be expanded in $\epsilon$ under the assumption $\epsilon <<1$. 
%
Theorem \ref{theorem approximate validity} characterizes the validity of $C_{\hat B}$, i.e. the PIs defined in \eqref{prediction interval phi} with $b$ replaced by $\hat b$, up to $o(\epsilon^2)$ errors.
%
We assume $b$ and $\hat b$ fulfill the requirements of Assumption \ref{assumption phi}, $b$ satisfies the assumption of Corollary \ref{corollary exact flow intervals}, and $\hat b$ is the minimizer of \eqref{likelihood} for a given target distribution $U_B$.
%
To simplify the notation, we let $B=b_X(A)$ where $b_X = b(A, X)$ (idem $b_{X}^{-1}$,  $\hat b_X$, and $\hat b^{-1}_X$) and define $\tilde B = \psi_X(A)$, where $\psi_{X} = b_{X_{N+1}}^{-1} \circ \hat b_{X_{N+1}}\circ \hat b_{X}^{-1} \circ b_{X}$.  
We bound the validity gap of $C_{\hat B}$ in terms of the variation distance between the distributions of $B$ and $\tilde B$, i.e. 
\begin{align}
d_{\rm TV}(P_{BX}, P_{\tilde  BX})  &=  \sup_{(A, X)} \| p(B, X) - p(\tilde B, X) \| 
\end{align}
where $p(B, X) $ and $p(\tilde B, X) $ are the densities of $P_{BX} = U_B P_X$ and $P_{\tilde BX}$ and $B$ and $\tilde B$ depend on $(A, X)$ through $b$ and $\psi$.
We use the Maximal Coupling Theorem to link the CP validity bound in \eqref{lemma validity cphi} and the total variation distance above. 
See \cite{lindvall2002lectures} or \cite{ross2023second} for an overview of coupling methods.
%
Up to  $o(\epsilon^2)$ correction an explicit lower bound of the gap is given in 
\begin{theorem}
\label{theorem approximate validity}
Let $b(A, X)$ and $\hat b(A, X)$ obey Assumption \ref{assumption phi} and $U_B = {\rm Uniform}([0, 1])$.
Assume $\hat b(A, X) =  (1 - \epsilon)b(A, X)  + \epsilon \delta(A, X)$ for all $(A,X)$. 
Then,   
\begin{align}
    &{\rm Prob}(B_{N+1} \leq Q_{\hat B}) \\
    &\geq \frac{\lceil (N+1)(1 - \alpha)\rceil }{N+1} - \frac{1}{2} d_{\rm TV}(P_B, P_{\tilde B}) \\
    &\geq \frac{\lceil (N+1)(1 - \alpha)\rceil }{N+1} - \epsilon \sup_{x} \|p_X(x) \| L_\delta L_{b^{-1}} + o(\epsilon^2) \nonumber 
\end{align}
where $Q_{\hat B}$ is the sample quantile of $ \{\hat b(A_n, X_n)\}_{n=1}^N$ defined in \eqref{sample quantile definition} with $b$ replaced by $\hat b$, $\tilde B = \psi_X(B)$, with $\psi_{X} = b_{X_{N+1}}^{-1} \circ \hat b_{X_{N+1}}\circ \hat b_{X}^{-1} \circ b_{X}$ and $b_{X}(A) = b(A, X)$ (idem $b_{X}^{-1}$,  $\hat b_X$, and $\hat b^{-1}_X$), $L_\delta$ and $L_{b^{-1}}$ are the Lipshitz constants of $\delta(B, X)$ and  $b^{-1}$, and $p(X)$ is the marginal density of the covariates.
\end{theorem}
Theorem \ref{theorem approximate validity} connects our work with the non-exchangeability gaps obtained in  \cite{barber2022conformal} in a different framework.

\section{Implementation}
\label{section implementation}
We compare two models trained with the proposed scheme with a standard CP algorithm and the ER model of \cite{papadopoulos2008normalized}. 
For simplicity, we focus on Split CP, where the regression mode, $f$, is pre-trained on separate data and kept fixed.


\subsection{Data}
We generate 4 synthetic data sets by perturbing the output of a polynomial regression model of order 2 with four types of heteroscedastic noise.
Each data sets consist of 1000 samples from the following generative model
\begin{align}
&Y = X^T w + \epsilon_i,\\
    &X = [1, X_1, X_1^2], \quad X_1 \sim {\rm Uniform}([-1, 1]), \nonumber \\
    &\epsilon_i = 0.1 + \sigma_{\tt synth-i}(X) E, \quad E \sim {\cal N}(0, 1) \nonumber
\end{align}
where $w \in {\mathbb R}^3$ is a randomly generated fixed parameter,  ${\tt i} \in \{ {\tt cos}, {\tt squared}, {\tt inverse}, {\tt linear} \}$, and 
\begin{align}
    &\sigma_{\tt synth-cos}(X) = 2 \cos\left(\frac{\pi}{2} X_1 \right) {\bf 1}(X_1 < 0.5) \\
    &\sigma_{\tt synth-squared}(X) = 2 X^2_1 {\bf 1}(X_1 > 0.5)\\
    &\sigma_{\tt synth-inverse}(X) = 2 \frac{1}{0.1 + |X_1|}{\bf 1}(X_1 < 0.5)\\
    &\sigma_{\tt synth-linear}(X) = 2 |X_1|{\bf 1}(X_1 > 0.5)
\end{align}

For the real-data experiments, we use the following 6 public benchmark data sets from the UCI database:
{\tt bike}, the Bike Sharing Data Set \citep{bikeData}, {\tt CASP}, the Physicochemical Properties of Protein Tertiary Structure Data Set \citep{caspData}, {\tt community}, Community and Crime Data Set\citep{communityData}, {\tt concrete}, the Concrete Compressive Strength Data Set \citep{concreteData}, {\tt energy}, the Energy Efficiency Data Set \citep{energyData}, and {\tt facebook\_1}, the Facebook Comment Volume Data Set \cite{facebookData}. 

All data sets are split into two subsets.
We use the first subset to train a Random Forest (RF) regressor and the second subset to train and test the conformity functions.
For stability, we limit the dimensionality of the object attributes to 10 (with PCA) and normalize the label before training the RF models.
The Mean Absolute Error of the RF regressor is reported in Table \ref{table mae}.
%%%
\begin{table}
    \centering
    \begin{small}
    \begin{tabular}{*{2}{l}} 
    \toprule
    data set & MAE \\ 
    \midrule
{\tt synth-cos}& 0.051(0.033)\\
{\tt synth-inverse}& 0.056(0.007)\\ 
{\tt synth-linear}& 0.157(0.054)\\
{\tt synth-squared} &0.125(0.038)\\
{\tt bike}& 0.028(0.001)\\ 
{\tt CASP}& 0.14(0.003)\\ 
{\tt community}& 0.007(0.001)\\ 
{\tt concrete}& 0.051(0.002)\\ 
{\tt energy}&0.019(0.001)\\ 
{\tt  facebook\_1}&0.003(0.0)\\
\bottomrule
    \end{tabular}
    \caption{Averages and standard deviation over 5 runs of the MAE of the RF regression model on the synthetic and real data sets.}
    \label{table mae}
    \end{small}
\end{table}
%
%
To make the performance comparable across different data sets, we reduce the size of the second subset to 1000 (except for {\tt community}, {\tt concrete}, and {\tt energy} that have size 997, 515, and 384), split it into two equal parts, and use the first to train the conformity measures and the second for calibrating and testing the optimized models.


\subsection{Models}
We let $A = |Y - f(X)|$ and consider four model classes,
\begin{align}
    &b_{\tt baseline} = A \\
    &b_{\tt ER} = \frac{A}{\gamma + |g(X)|} \\
    &b_{\tt Gauss} = \log\left(\frac{A}{\gamma + |g(X)|}\right) \\
    &b_{\tt Uniform} = \sigma\left(\frac{A}{\gamma + |g(X)|}\right) 
\end{align}
where $\gamma = 0.001$ and $g$ is a fully connected ReLU neural network with 5 layers of 100 hidden units per layer. 
The network parameters of {\tt ER} are trained as in \cite{papadopoulos2008normalized} by minimizing $\ell_{ER} = {\rm E}(|g(X)|-|f(X)-Y|)^2$. 
{\tt Gauss} and {\tt Uniform} are trained with the proposed approach by maximizing the log-likelihood in \eqref{likelihood} where $U_B = {\cal N}(0, 1)$ for {\tt Gauss} and $U_B = {\rm Uniform}([0, 1])$ for {\tt Uniform}.
The model functional form guarantees $b$ belongs to the distribution support for any $(A, X)$.
We use the ADAM gradient descent algorithm of \cite{adam1904understanding} to solve all optimization problems with standard parameters.
The learning rate is 0.01 for {\tt ER}, $10^{-4}$ for {\tt Gauss}, and $10^{-5}$ for {\tt Uniform}.

\subsection{Results}
To evaluate the PIs, we consider their empirical validity, ${\rm E}({\bf 1}(Y_{N+1} \in C_B))$, average size, ${\rm E}(|C_B|)$, and empirical input-conditional coverage, which we approximate with the Worse-Slab Coverage (WSC) algorithm of \cite{cauchois2020knowing}. % 
Table \ref{table all exps} summarizes our numerical results across the 4 synthetic and 6 real data sets for three values of the confidence level, $1-\alpha \in \{ 0.95, 0.90, 0.65 \}$. 
Tables \ref{table data sets synth} and \ref{table data sets real} show the model performances at $\alpha = 0.05$ on each data set. 
The figures are the averages and standard deviations over 5 random train-test splits.

{\tt baseline} is the best method for $\alpha = 0.35$, on synthetic and real data, but is generally outperformed by the trained models at higher confidence levels.
{\tt Gauss} seems to outperform all other models on synthetic data. 
This may be due to the Gaussianity of the noise in the generation of the synthetic samples. 
On synthetic data, {\tt ER} is the second best model, probably because we generate the data using $Y \sim f(X) + g(X) \epsilon$, $\epsilon \sim {\cal N}(0, 1)$, which implies the ER assumptions are \emph{exact}.
{\tt Uniform} is the best model on real data. 
Interestingly, the model with the conditional coverage closest to the nominal is not the same at all confidence levels.
Table \ref{table data sets real} suggests that the optimized models are outperformed by {\tt baseline} when data are not heteroscedastic, i.e. when {\tt baseline} has good conditional coverage.
This seems to be a shared problem of re-weighting methods, as already observed in \cite{romano2019conformalized}.
The code for reproducing all numerical simulations is available in this \url{https://github.com/nicoloRHUL/ConformalCalibrationTraining}. 
\begin{table}
    \centering
    \begin{small}
    \begin{tabular}{*{4}{l}}
\toprule
\multicolumn{4}{c}{{\bf synthetic data} (all data sets)}\\ 
&coverage&size&WSC\\ \midrule
\multicolumn{4}{c}{$\alpha=0.05$} \\ 
baseline                &0.954(0.021)   &0.783(0.086)   &0.798(0.198)\\
ER              &0.954(0.019)   &0.57(0.122)    &0.915(0.094)\\
Gauss           &0.953(0.02)    &0.506(0.048)   &0.883(0.118)\\
Uniform         &0.955(0.025)   &0.624(0.077)   &0.881(0.131)\\
\midrule
\multicolumn{4}{c}{$\alpha=0.1$} \\ 
baseline                &0.904(0.035)   &0.546(0.063)   &0.637(0.239)\\
ER              &0.902(0.027)   &0.429(0.067)   &0.833(0.147)\\
Gauss           &0.904(0.024)   &0.382(0.035)   &0.8(0.119)\\
Uniform         &0.906(0.036)   &0.451(0.041)   &0.737(0.153)\\
\midrule
\multicolumn{4}{c}{$\alpha=0.35$} \\ 
baseline                &0.661(0.042)   &0.183(0.018)   &0.244(0.146)\\
ER              &0.677(0.052)   &0.209(0.029)   &0.443(0.134)\\
Gauss           &0.673(0.05)    &0.2(0.022)     &0.461(0.139)\\
Uniform         &0.668(0.05)    &0.22(0.029)    &0.449(0.13)\\
\bottomrule
\vspace{0.5cm}
\\\toprule
\multicolumn{4}{c}{{\bf real data} (all data sets)}\\ 
&coverage&size&WSC\\ \midrule
\multicolumn{4}{c}{$\alpha=0.05$} \\ 
baseline                &0.955(0.016)   &0.141(0.013)   &0.929(0.066)\\
ER              &0.955(0.02)    &0.175(0.056)   &0.94(0.079)\\
Gauss           &0.957(0.013)   &0.17(0.03)     &0.913(0.078)\\
Uniform         &0.952(0.017)   &0.137(0.015)   &0.944(0.076)\\
\midrule
\multicolumn{4}{c}{$\alpha=0.1$} \\ 
baseline                &0.901(0.029)   &0.103(0.009)   &0.853(0.134)\\
ER              &0.9(0.032)     &0.117(0.024)   &0.832(0.114)\\
Gauss           &0.911(0.025)   &0.113(0.011)   &0.889(0.093)\\
Uniform         &0.901(0.028)   &0.102(0.009)   &0.869(0.11)\\
\midrule
\multicolumn{4}{c}{$\alpha=0.35$} \\ 
baseline                &0.681(0.044)   &0.045(0.005)   &0.512(0.157)\\
ER              &0.659(0.039)   &0.049(0.006)   &0.645(0.143)\\
Gauss           &0.658(0.041)   &0.048(0.003)   &0.574(0.144)\\
Uniform         &0.672(0.054)   &0.046(0.005)   &0.643(0.122)\\
\bottomrule
    \end{tabular}
    \caption{Average efficiency of the model PIs (coverage, size, and Worst Slab Coverage (WSC) estimate of the conditional coverage \citep{cauchois2020knowing}) over the data sets listed in Table \ref{table mae}.
    The reported averages and standard deviation are computed over 5 random training-test splits.}
    \label{table all exps}
    \end{small}
\end{table}

\begin{table}
    \centering
    \begin{small}
    \begin{tabular}{*{4}{l}}
\toprule
\multicolumn{4}{c}{\bf synthetic data  ($\alpha = 0.05$)}\\
&coverage&size&WSC\\ \midrule
\multicolumn{4}{c}{\tt synth-cos} \\ 
%--------------------------------------
baseline                &0.944(0.029)   &0.274(0.039)   &0.749(0.293)\\
ER              &0.945(0.022)   &0.208(0.123)   &0.926(0.062)\\
Gauss           &0.945(0.024)   &0.128(0.015)   &0.833(0.136)\\
Uniform         &0.945(0.035)   &0.148(0.03)    &0.828(0.188)\\
\midrule
\multicolumn{4}{c}{\tt synth-inverse} \\ 
%+++++++++++++++++++++++synth-inverse
baseline                &0.953(0.02)    &0.321(0.048)   &0.758(0.231)\\
ER              &0.96(0.017)    &0.165(0.019)   &0.942(0.064)\\
Gauss           &0.944(0.024)   &0.124(0.021)   &0.787(0.225)\\
Uniform         &0.951(0.021)   &0.195(0.024)   &0.814(0.233)\\
\midrule
%
\multicolumn{4}{c}{\tt synth-linear} \\
%+++++++++++++++++++++++synth-linear
baseline                &0.967(0.018)   &1.923(0.157)   &0.826(0.131)\\
ER              &0.96(0.011)    &1.54(0.309)    &0.941(0.063)\\
Gauss           &0.967(0.008)   &1.402(0.109)   &0.927(0.09)\\
Uniform         &0.968(0.014)   &1.795(0.201)   &0.898(0.071)\\
\midrule
%+++++++++++++++++++++++synth-squared
\multicolumn{4}{c}{\tt synth-squared} \\ 
baseline                &0.953(0.016)   &0.614(0.099)   &0.858(0.137)\\
ER              &0.949(0.026)   &0.366(0.037)   &0.849(0.185)\\
Gauss           &0.956(0.024)   &0.37(0.048)    &0.987(0.018)\\
Uniform         &0.956(0.029)   &0.359(0.054)   &0.983(0.03)\\
\bottomrule
\end{tabular}
    \caption{Efficiency of the model PIs at $\alpha=0.05$ on the synthetic data sets listed in Table \ref{table mae}.
    The reported averages and standard deviation are computed over 5 random training-test splits.}
    \label{table data sets synth}
\end{small}
\end{table}
\begin{table}
    \centering
    \begin{small}
    \begin{tabular}{*{4}{l}}
\\\toprule
\multicolumn{4}{c}{\bf real data ($\alpha = 0.05$)}\\
&coverage&size&WSC\\ \midrule
\multicolumn{4}{c}{\tt bike} \\ 
%+++++++++++++++++++++++bike
baseline                &0.98(0.011)    &0.112(0.011)   &0.995(0.011)\\
ER              &0.969(0.019)   &0.181(0.095)   &0.995(0.011)\\
Gauss           &0.972(0.007)   &0.127(0.026)   &0.98(0.017)\\
Uniform         &0.976(0.012)   &0.119(0.021)   &0.982(0.02)\\
\midrule
%
\multicolumn{4}{c}{\tt CASP} \\ 
%+++++++++++++++++++++++CASP
baseline                &0.944(0.021)   &0.413(0.026)   &0.969(0.041)\\
ER              &0.959(0.011)   &0.435(0.036)   &0.883(0.192)\\
Gauss           &0.973(0.009)   &0.525(0.063)   &0.959(0.045)\\
Uniform         &0.955(0.011)   &0.433(0.03)    &0.89(0.196)\\
\midrule
%
\multicolumn{4}{c}{\tt community} \\ 
%+++++++++++++++++++++++ community
baseline                &0.965(0.011)   &0.059(0.018)   &0.888(0.097)\\
ER              &0.967(0.009)   &0.035(0.016)   &0.967(0.02)\\
Gauss           &0.968(0.011)   &0.071(0.028)   &0.924(0.06)\\
Uniform         &0.963(0.013)   &0.03(0.007)    &0.958(0.052)\\
\midrule
%
%+++++++++++++++++++++++concrete
\multicolumn{4}{c}{\tt concrete} \\ 
baseline                &0.96(0.023)    &0.154(0.016)   &0.938(0.064)\\
ER              &0.947(0.031)   &0.288(0.134)   &0.948(0.078)\\
Gauss           &0.95(0.016)    &0.201(0.045)   &0.954(0.043)\\
Uniform         &0.958(0.021)   &0.159(0.014)   &0.972(0.039)\\
\midrule
%+++++++++++++++++++++++energy
\multicolumn{4}{c}{\tt energy} \\ 
baseline                &0.942(0.016)   &0.092(0.005)   &0.965(0.046)\\
ER              &0.944(0.041)   &0.098(0.05)    &0.897(0.131)\\
Gauss           &0.944(0.017)   &0.079(0.015)   &0.793(0.18)\\
Uniform         &0.933(0.028)   &0.071(0.019)   &0.975(0.033)\\
\midrule
%
%+++++++++++++++++++++++facebook_1
\multicolumn{4}{c}{\tt facebook\_1} \\ 
baseline                &0.94(0.014)    &0.014(0.003)   &0.82(0.139)\\
ER              &0.947(0.011)   &0.016(0.005)   &0.953(0.045)\\
Gauss           &0.935(0.02)    &0.015(0.004)   &0.87(0.126)\\
Uniform         &0.929(0.015)   &0.01(0.002)    &0.885(0.114)\\
\bottomrule
\end{tabular}
    \caption{Efficiency of the model PIs at $\alpha=0.05$ on the real data sets listed in Table \ref{table mae}.
    The reported averages and standard deviation are computed over 5 random training-test splits.}
    \label{table data sets real}
    \end{small}
\end{table}





\section{Related work}
\label{section related work}
%%%%%%%%%%%%%%%%
{\bf Calibration training}
In CP, learning a conformity function from data is fairly new. 
%
To the best of our knowledge, the only example of a trained conformity function is the ER algorithm of \cite{papadopoulos2008normalized, papadopoulos2011regression, lei2012distribution}, where localization is achieved by re-weighting $|Y - f(X)|$ with a pre-trained model of the conditional residual, $|g(X)| \approx {\rm E}(|Y-f(X)|\ |X)$. 
%
Outside CP, there are several examples of calibration optimization for data science applications \citep{platt1999probabilistic, zadrozny2002transforming,naeini2015obtaining}.
See \cite{guo2017calibration} for an introduction and empirical comparison of different calibration methods for neural networks. 
%
{\bf Object-dependent conformity measures.}  \cite{papadopoulos2008normalized, papadopoulos2011regression, lei2012distribution} use different versions of re-weighted conformity measures.
The localization function is either fixed, e.g. a KNN-based variance estimator, or pre-trained using \emph{ad-hoc} strategies. 
Section 5 of \cite{romano2019conformalized} contains a detailed discussion on the limitations of ER.
%
Despite its intuitive and empirical efficiency, ER has been poorly investigated or justified from a theoretical perspective.
%
Our work provides a conceptual framework to explain why it works well for approximating conditional validity  \citep{lei2012distribution, foygel2021limits}.
Recent work about ER includes \cite{vovk2020conformal}, which is a theoretical study on the validity of oracle conformity measures, and \cite{bellotti2021approximation}, where the conformity score is iteratively updated to make the PI conditionally valid. 
Similar to \cite{gibbs2021adaptive}, coverage is corrected by minimizing an empirical estimation of the validity gap.
Besides \cite{papadopoulos2008normalized, bellotti2020constructing}, conformity scores other than $A = |f(X)-Y|$ have been rarely used.
%
In \cite{romano2019conformalized}, the conformity function is redesigned to mimic the pinball loss of quantile regression problems.
%
In \cite{colombo2023training}, a series of trained conformity functions are tested empirically.
Compared to this work, the learning scheme is not analyzed theoretically and uses a different learning loss.
We are unaware of other works where the conformity measure is explicitly optimized.
%
{\bf CP localization and conditional validity.}
%
Except for \cite{papadopoulos2008normalized}, the scheme can be combined with other localization methods because it applies to any base conformity score.
\cite{papadopoulos2008normalized} is an exception because the conformity function is trained by minimizing ${\rm E}_{XY}|A^2 - g^2(X)|^2$. 
%
In \cite{lei2014distribution, vovk2012conditional, lin2021locally, guan2023localized, deutschmann2023adaptive}, locally adaptive PI are constructed by re-weighting the calibration samples and temporarily breaking data exchangeability.
The weights transform the marginal distribution into an estimate of the object-conditional distribution.
Often, computing the localizing weights requires a density estimation step based on one or more hyper-parameters \citep{lei2014distribution, vovk2012conditional, guan2023localized, deutschmann2023adaptive}.  
This may cause technical issues and can be unreliable if data is scarce.
Our approach avoids an explicit estimation, because $b$ is a globally defined functional, and does not require splitting the calibration set.
%
Conditional validity gaps can be viewed as a non-exchangeability problem.
\cite{barber2022conformal} is a study of CP under general non-exchangeability but does not make an explicit connection to local adaptivity.
%
\cite{xu2023sequential} exploits the bounds of \cite{barber2022conformal} for proving the asymptotic convergence of the estimated PIs to the exact conditional PIs.  
Theorem 4 in \cite{guan2023localized} guarantees exact conditional coverage for a sample re-weighting method, up to corrections on the estimated PI.
The NF setup allows more explicit bounds on the validity of the algorithm outputs (Theorem \ref{theorem approximate validity} in Section \ref{section theory}).
% 
In \cite{einbinder2022training}, a point-prediction model is trained to guarantee $P_{AX}=U_A P_X$, where $U_A={\rm Uniform}([0, 1])$.
It is unclear whether tuning the point-prediction model or the conformity function produces equivalent PIs.
%
This work is intuitively close to conformity-aware training, which aims to optimize the output of a standard CP algorithm by tuning the underlying model \citep{colombo2020training, bellotti2020constructing, stutz2021learning, einbinder2022training}. 
The two ideas are compatible and could be implemented simultaneously. 
We leave this for future work.
%

\section{Discussion and limitations}
\label{section discussion and limitations}
This is mainly a theoretical and methodological work.
We recognize our numerical simulations are limited, especially regarding the model complexity. 
We also miss a full comparison with existing localization approaches. 
We focus on conformity functions similar to ER to underline the efficiency of the learning strategy, without bias coming from the definition of more or less suitable model classes.
Generalizing the approach to more complex NF is possible, provided $b(A, X)$ remains invertible, i.e. monotonic in $A$.
%
A comparison with other localization methods goes beyond our scope because calibration training is orthogonal to many existing strategies, e.g. algorithms based on re-weighting the calibration samples.
The proposed scheme could be used on top of them to provide theoretical guarantees. 
As mentioned in Section \ref{section related work}, CP-aware retraining of the prediction model could also be combined with calibration training.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\bibliography{refs.bib}

%\newpage
%\onecolumn
%%\begin{center}
%\large{{\bf Normalizing Flows for Conformal Regression} \\
%Supplementary Material }    
%\end{center}
%\appendix
\section{Proofs}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\paragraph{Proof of Lemma \ref{lemma quantile}}.
Assume ties occur with probability 0.
According to \eqref{sample quantile definition}, $Q_Z$ is the $n* = \lceil (1-\alpha)(N+1)\rceil$-th smallest element of $\{ Z_n \}_{n=1}^N$.
Assume the calibration samples have been labeled so that $Z_1 < Z_2 \dots < Z_{N-1} < Z_N$.
By assumption, $Z_1, \dots, Z_n$, and $Z_{N+1}$ are exchangeable.
This implies $Z_{N+1}$ falls with equal probability in any of the $N+1$ intervals 
\begin{align}
    (-\infty, Z_1), &[Z_1,Z_2) \dots, [Z_{n*-1}, Q_Z), \nonumber\\&(Q_Z, Z_{n*+1})\dots %\\ \dots  
    (Z_{N-1}, Z_{N}), [Z_{N}, \infty)
\end{align}
i.e.
%\begin{align}
${\rm Prob}(Z_{N+1} \leq Q_Z) = \frac{n*}{N+1} = \frac{\lceil (1-\alpha)(N+1)\rceil}{N+1}.$
%\end{align}
$\square$


\paragraph{Proof of Lemma \ref{lemma validity cphi}}
$\{ B_n\}_{n=1}^N$ are i.i.d. random variable because $b$ is deterministic and  $\{ A_n\}_{n=1}^N$ are i.i.d.
When $b$ satisfies Assumption \ref{assumption phi}, ${\rm Prob}(A_n=A_{n'}) = 0$ for any $n\neq n'$ implies 
${\rm Prob}(B_n=B_{n'})= {\rm Prob}(A_n = b^{-1}(b(A_{n'},X_{n'}), X_n) = 0$ for any $n\neq n'$, i.e. there are no ties in $\{ B_n\}_{n=1}^N$.
Let $Q_{B}$ be the $(1-\alpha)$-th sample quantile $\{ B_n\}_{n=1}^N$ defined in \eqref{sample quantile definition}.
From Lemma \ref{lemma quantile}, ${\rm Prob}(B_{N+1} \leq Q_{B}) = \frac{n_*}{N+1}$, with $n_* = \lceil (1-\alpha)(N+1)\rceil$.
Let $b_X(A) = b(A, X)$ and $b^{-1}_X(B) = b^{-1}(B, X)$, with $b^{-1}$ defined by $b(b^{-1}(B, X), X) = b_X \circ b_X^{-1}(B)$, and
$\partial_A b_X(A) = J_{b\ 1 1}(A, X)= \frac{\partial}{\partial A}b(A, X) = \frac{\partial}{\partial A}b_X(A)$.
By Assumption \ref{assumption phi},  $\partial_A b_X>0$ for all $X$.
Let $\frac{d}{ds} h(s, g(s)) = \partial_s h + \partial_g h \partial_s g$ be the total derivative of $h$.
From $1 = \frac{d}{dB} b_X\circ b_X^{-1}(B) = \partial_A b_X(b_X^{-1}(B)) \frac{d}{dB} b_X^{-1}(B)$, we obtain $\frac{d}{dB}b^{-1}_X(B) = \left( \partial_A b_X(b^{-1}_X(B))\right)^{-1} > 0$, i.e. $b^{-1}_{X_{N+1}}(B)$ is a monotonic function of $B$.
%
Therefore, 
\begin{align}
    &{\rm Prob}\left( B_{N+1} \leq Q_{B}\right)\\ 
    &={\rm Prob}\left(b^{-1}_{X_{N+1}}(B_{N+1}) \leq b_{X_{N+1}}^{-1}(Q_{B})\right) \\
    & ={\rm Prob}\left(b^{-1}_{X_{N+1}}\circ b_{X_{N+1}}(A_{N+1}) \leq b_{X_{N+1}}^{-1}(Q_{B})\right) \\
    & ={\rm Prob}\left(A_{N+1} \leq b_{X_{N+1}}^{-1}(Q_{B})\right) \\
    & ={\rm Prob}\left( |f(X_{N+1})-Y_{N+1}| \leq b_{X_{N+1}}^{-1}(Q_{B})\right) \\
    & ={\rm Prob}\left( Y_{N+1} \in C_B\right) 
\end{align}
where $C_B$ is defined in \eqref{marginal prediction intervals}.
$\square$
%
\paragraph{Proof of Lemma \ref{lemma size is different}}
Let $\{ B_n=b_{X_n}(A_n)\}_{n=1}^{N+1}$, where $b_X(A) = b(A, X)$, and $C_A$ and $C_B$ be the PIs in \eqref{marginal prediction intervals} and \eqref{prediction interval phi}.
From \eqref{sample quantile definition}, there are $m_*$ and $n_*$ such that $Q_A = A_{m*}$ and  $Q_{\hat B} = b_{X_{n_*}}(A_{n*})$.
Then, when $b_{X_{N+1}}(A_{n}) \neq  b_{X_{n}}(A_{n})$ for any $n$,  we have  $b_{X_{N+1}}(A_{n_*}) \neq  b_{X_{n_*}}(A_{n_*})$ and 
\begin{align}
        &|C_{B}| = b_{X_{N+1}}^{-1}\circ b_{X_{m_*}}(A_{m_*}) \neq  A_{n_*} =|C_A| 
\end{align}
The claim holds because $A_{n*}= b_{X_{N+1}}^{-1}\circ b_{X_{m_*}}(A_{m_*})$ occurs with probability 0 if $A_n$ are continuous.
$\square$
%%%
\paragraph{Proof of Theorem\ref{theorem marginal and conditional}}
    Let $\{A_n \sim P_{A} \}_{n=1}^{N}$ $\{ \tilde A_n \sim P_{A|X_{N+1}} \}_{n=1}^{N}$ be two collections of i.i.d random variables distributed according to the marginal and $X_{N+1}$-conditional distributions.
    Let $Q_{A}$ and $Q_{\tilde A}$ be the sample quantiles of the two collections defined in \eqref{sample quantile definition}.
    Let $C_{A}$ be the PI defined in \eqref{marginal prediction intervals} and $C_{\tilde A}$ be obtained analogously with $Q_A$ replaced by $Q_{\tilde A}$. 
    Assume ties occur with probability 0. 
    By the Bayesian theorem, $P_{AX} = P_A P_X$ implies $P_{A|X} = P_A = \sum_{X} P_{AX}$.
    Then, for any $X_{N+1}$, $\tilde A_n \sim P_A \sim A_n$ and, from Lemma \ref{lemma quantile},  ${\rm Prob}(\tilde A_{N+1} \leq Q_{\tilde A}) = {\rm Prob}(\tilde A_{N+1} \leq Q_{A}) = {\rm Prob}(Y_{N+1} \in C_A)$.
    $\square$
    %%%
\paragraph{Proof of Corollary \ref{corollary exact flow intervals}}
Let $Q_{A|X_{N+1}}$ and $C_{A|X_{N+1}}$ be the conditional sample quantile of $\{ \tilde A_n \sim P_{A|X_{N+1}} \}$ and the corresponding PI defined as in \eqref{marginal prediction intervals} with $Q_A$ replaced by $Q_{A|X_{N+1}}$. 
By construction, $C_{A|X_{N+1}}$ is conditionally valid PI at $X_{N+1}$, i.e. it obeys ${\rm Prob}(Y_{N+1} \in C_{A|X_{N+1}}|X_{N+1}) = \frac{m_*}{N+1}$, $m_*=\lceil(1 - \alpha) (N + 1) \rceil$. 
%
Let $(B, X) = (b(A, X), X) = (b_X(A), X)$.
Then, if $b$ obeys Assumption \ref{assumption phi} and $P_{BX} = P_{B|X} P_X = U_B P_X$, 
\begin{align}
    Q_{A|X_{N+1}} 
    &= Q_{b^{-1}_{X_{N+1}}(B)|X_{N+1}} \\
    &= b_{X_{N+1}}^{-1}(Q_{B|X_{N+1}}) = b_{X_{N+1}}^{-1}(Q_{B})\label{line3}
\end{align}
because $b^{-1}_{X_{N+1}}$ is monotonic and we apply it globally to all samples (second equality) and $P_{BX} =  U_B P_X$ implies $Q_{B|X_{N+1}} = Q_{B}$ (last equality).
The claim follows from Lemma \ref{lemma quantile}, $b_{X_{N+1}}(A)=b(A, X_{N+1})$, and the PI definition in \eqref{prediction interval phi}.
$\square$    
%%
\paragraph{Proof of Theorem \ref{theorem approximate validity}.}
Let $\{ A_n\}_{n=1}^{N+1}$ be a collection of i.i.d. conformity scores and  $\{ B_n = b(A_n, X_n)\}_{n=1}^{N+1}$ and $\{ \hat B_n = \hat b(A_n, X_n)\}_{n=1}^{N+1}$ the conformity scores transformed by $b$ and $\hat b = (1 - \epsilon) b + \epsilon \delta$.
Let $C_B$ be the PI defined in \eqref{prediction interval phi} and $C_{\hat B}$  defined analogously by replacing $b$ with $\hat b$.
Let $b_X(B) = b(A, X)$ (idem $\hat b_X$, $b_X^{-1}$, and $\hat b_X^{-1}$).
Assumption \ref{assumption phi} and Corollary \ref{corollary exact flow intervals} imply 
\begin{align}
&{\rm Prob}(Y_{N+1} \in C_{\hat B}|X_{N+1}) \\
& = {\rm Prob}(A_{N+1} \leq \hat b_{X_{N+1}}^{-1}(Q_{\hat B})|X_{N+1}) \\
& = {\rm Prob}(b_{X_{N+1}}^{-1}(B_{N+1}) \leq \hat b_{X_{N+1}}^{-1}(Q_{\hat B})|X_{N+1}) \\
& = {\rm Prob}(B_{N+1} \leq b_{X_{N+1}}\circ \hat b_{X_{N+1}}^{-1}(Q_{\hat B}))
\end{align}
where we drop the conditioning in the last line because, by assumption, $(B_n, X_n) \sim U_B P_X$ for all $X_n$.
%
The monotonicity of $b_{X_{N+1}}\circ \hat b_{X_{N+1}}^{-1}(B)$ 
%and $Q_{\hat B} = \hat b_{X_{n*}}(A_{n_*})$ for some $n_* \in \{1, \dots, N\}$, we have 
implies $\hat b_{X_{N+1}}^{-1}(Q_{\hat B}) = Q_{\tilde B}$, where $Q_{\tilde B}$ is the sample quantile of $\{\tilde B_n\}$, $\tilde B_n = b_{X_{N+1}}\circ \hat b_{X_{N+1}}^{-1}(\hat B_n)$.
Test and calibration data are not exchangeable because, $B_{N+1}$ and $\tilde B_n$, $n=1, \dots N$, come from different distributions.
%
The coverage gap can be bounded in terms of the total variation distance between their distributions,  $P_B$ and  $P_{\tilde B}$, i.e.
${\rm d}_{{\rm TV}}(P_{B}, P_{\tilde B}) = \sup_{Z}|P_{B}(Z)- P_{\tilde B}(Z)|$.
Let $(\hat {\rm P}, \tilde B, B')$ define a \emph{maximal coupling} between $\tilde B_1, \dots, \tilde B_N$ and $B_{N+1}$ defined by ${\rm Prob}(\tilde B_n) = \hat {\rm P}(\tilde B)$, $n=1, \dots, N$, and ${\rm Prob}(B_{N+1}) = \hat {\rm P}(B')$.
Then, 
\begin{align}
    &{\rm Prob}(B_{N+1} \leq  Q_{\tilde B}) \\
    & = \hat {\rm P}(B'\leq Q_{\tilde B}, B' = \tilde B) %\nonumber\\& 
    + \hat {\rm P}(B' \leq Q_{\tilde B}, B'\neq \tilde B) \\
    & \geq  \frac{\lceil (N+1)(1 - \alpha)\rceil }{N+1} - \hat {\rm P}(B' \neq \tilde B) ) 
 \end{align}
where the Maximal Coupling Theorem implies (see \cite{lindvall2002lectures, ross2023second} for a proof) 
\begin{align}
\hat {\rm P}(B' \neq \tilde B) = \frac12 {\rm d}_{{\rm TV}}(P_{B_{N+1}}, P_{\tilde B_{n}})
\end{align}
which, in this case, holds for any $n \in \{ 1, \dots, N\}$ because we assume the data objects are i.i.d. 
%

Assume $\hat b= (1 - \epsilon) b + \epsilon \delta$ and $\hat b^{-1} = (1 - \epsilon) b^{-1} + \epsilon \delta^{-1}$, for all $(A, X)$.
%
The invertibility of $\hat b$ implies ${\rm Id} = \hat b \circ \hat b^{-1} = (1 - 2\epsilon) {\rm Id} + \epsilon (b \circ\delta^{-1} + b^{-1} \circ\delta ) + \epsilon^2 ({\rm Id} +  \delta \circ\delta^{-1})$, where ${\rm Id}(B) = B$.
%
Neglecting second-order terms, we have $b \circ\delta^{-1} + \delta \circ b^{-1} = 2 {\rm Id}$, i.e. $\delta^{-1} = 2 b^{-1}  - b^{-1}\circ\delta \circ b^{-1}$ and $\hat b^{-1} = (1 - \epsilon) b^{-1} + \epsilon (2 b^{-1} -b^{-1}\circ\delta \circ b^{-1}) = (1 + \epsilon) b^{-1} - b^{-1}\circ\delta \circ b^{-1}$.
%
Let $b_X(A) = b(A, X)$ (idem $b^{-1}$, $\hat b$, $\hat b^{-1}$, and $\delta$).
Since $\psi_X(B) = b_{X_{N+1}}  \circ \hat b^{-1}_{X_{N+1}} \circ \hat b_{X} \circ b^{-1}_{X}$ is monotonic, we may interpret it as an NF.
The density of $(\tilde B_n, X_n) \sim P_{\tilde BX}$ is 
\begin{align}
    p(\psi_X(B), X) = \frac{p(B, X)}{|{\rm det} J_{\psi}(B, X)|} =  
    \frac{u(B) p(X)}{|\partial_B \psi_X(B)|} 
\end{align}
where $|\partial_B \psi_X(B)| = \partial_B \psi_X(B)$ because $\psi_X$ is monotonic.   
Then, up to $o(\epsilon^2)$ errors,
\begin{align}
    &{\rm d}_{{\rm TV}}(P_{B}, P_{\tilde B}) \\
    &= \sup_{(B, X)} \left\|u(B)p(B) \left(1 - \frac{1}{\partial_B \psi_X(B)}\right)\right \| \\
    & \leq  \epsilon \sup_{(B, X)} \|u(B)p(X) \| \sup_{(B, X)} \|1 - \partial_B \psi^{-1}_X(B)\| \\
    & = \epsilon \sup_{(B, X)} \|u(B)p(X) \| \\ \nonumber 
    &\quad \times \sup_{(B, X)} \| \partial_A \delta_X \circ \partial_B b^{-1}_X - \partial_A \delta_{X_{N+1}} \circ \partial_B b_{X_{N+1}}^{-1} \| \\
 & \leq  2 \epsilon \sup_{(B, x)} \|u(B)p(X) \| L_\delta L_{b^{-1}} 
\end{align}
where $L_\delta$  and $L_{b^{-1}}$ are the Lipshitz constants of $\delta_X$ and $b^{-1}_X$.
%
If $U_B = {\rm Uniform}([0, 1])$, $\sup_{(B, X)} \|u(B)p(X) \| = \sup_{X} \| p(X) \|$, which only depends on the marginal density of the covariates over the attribute space.
Hence,    
\begin{align}
    &{\rm Prob}(B_{N+1} \leq Q_{\hat B}) \\
    &\geq \frac{\lceil (N+1)(1 - \alpha)\rceil }{N+1} - \epsilon \sup_{X} \|p(X) \| L_\delta L_{b^{-1}}
\end{align}
$\square$


\end{document}
