% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\usepackage{hyperref}
\usepackage{url}

\usepackage{wrapfig}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
% \usepackage{booktabs}
\usepackage{amsthm}
% \usepackage{tikz}
\usepackage{float}
\usepackage{subfig}

\usepackage{setspace}
\usepackage{xcolor}
\usepackage[ruled, noend]{algorithm2e}
\usepackage{cancel}



\usepackage{hyperref}
\usepackage{mathtools}
\usepackage{bm}
%% Self-defined macros
\newcommand{\X}{\mathcal{X}}
\newcommand{\Y}{\mathcal{Y}}
\newcommand{\D}{\mathcal{D}}
% \newcommand{\R}{\mathbb{R}}
\newcommand{\pr}{\mathbb{P}}
\newcommand{\F}{\mathcal{F}}
\newcommand{\N}{\mathcal{N}}
\newcommand{\W}{\boldsymbol{W}}
\newcommand{\U}{\boldsymbol{U}}
\newcommand{\Hb}{\boldsymbol{H}}
\newcommand{\pmu}{\boldsymbol{\bar{\mu}}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bpi}{\boldsymbol{\pi}}
\newcommand{\psig}{\boldsymbol{\bar{\Sigma}}}
\newcommand{\base}{EENN-Bayes }
\newcommand{\our}{EENN-AVCS }

\newcommand{\rvx}{\mathbf{x}}
\newcommand{\rvy}{\mathbf{y}}

\newcommand{\vx}{\bm{x}}
\newcommand{\vy}{\bm{y}}
\newcommand{\mV}{\bm{V}}
\newcommand{\mW}{\bm{W}}
\newcommand{\mU}{\bm{U}}

\newcommand{\rmU}{\mathbf{U}}
\newcommand{\rmV}{\mathbf{V}}
\newcommand{\rmW}{\mathbf{W}}

\newcommand{\ry}{\textnormal{y}}
\newcommand{\rx}{\textnormal{x}}



\newtheorem{remark}{Proposition} 
\newtheorem{assump}{Assumption} 

\definecolor{mplblue}{rgb}{0.12156862745098039, 0.4666666666666667, 0.7058823529411765}
\definecolor{mplorange}{rgb}{1.0, 0.4980392156862745, 0.054901960784313725}
\definecolor{mplred}{rgb}{0.8392156862745098, 0.15294117647058825, 0.1568627450980392}
\definecolor{mplgreen}{rgb}{0.17254901960784313, 0.6274509803921569, 0.17254901960784313}
\definecolor{mplgrey}{rgb}{0.4980392156862745, 0.4980392156862745, 0.4980392156862745}

\usepackage{tikz}
\usetikzlibrary{bayesnet, arrows, tikzmark, calc, matrix, fit, patterns}
\usepackage{scalerel}

\newsavebox{\boxline}
\sbox{\boxline}{\tikz[baseline=-0.5ex] \draw[mplblue!90, thick] (0., 0.) -- (0.5, 0.);}

\newsavebox{\boxblue}
\newsavebox{\boxorange}
\newsavebox{\boxgrey}
% \newsavebox{\boxred}

\sbox{\boxblue}{\tikz \fill[mplblue!50] (0,0) rectangle (1,1);}
\sbox{\boxorange}{\tikz \fill[mplorange!50] (0,0) rectangle (1,1);}
\sbox{\boxgrey}{\tikz \fill[mplgrey!50] (0,0) rectangle (1,1);}
% \sbox{\boxred}{
%   \tikz {
%     \fill[mplred, opacity=0.9] (0,0) rectangle (1,1);
%     \fill[pattern=north east lines, pattern color=black] (0,0) rectangle (1,1);
%   }
% }

\newcommand{\reddashedline}{{\tikz[baseline=-0.5ex] \draw[mplred, very thick, dashed] (0., 0.) -- (0.3, 0.);}}

\newcommand{\hatchline}{{\tikz[baseline=-0.5ex] \draw[very thick] (0., -0.12) -- (0.3, 0.12);}}

\newcommand{\orangeline}{{\tikz[baseline=-0.5ex] \draw[mplorange, very thick, solid] (0., 0.) -- (0.3, 0.);}}

\newcommand{\blueline}{{\tikz[baseline=-0.5ex] \draw[mplblue, very thick, solid] (0., 0.) -- (0.3, 0.);}}

\newcommand{\redline}{{\tikz[baseline=-0.5ex] \draw[mplred,  thick, solid, opacity=0.5] (0., 0.) -- (0.3, 0.);}}

\newcommand{\redbline}{{\tikz[baseline=-0.5ex] \draw[mplred, very thick, solid] (0., 0.) -- (0.3, 0.);}}

\newcommand{\tmlr}[1]{{\color{mplblue}#1}}



\title{Early-Exit Neural Networks with Nested Prediction Sets}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Metod~Jazbec\thanks{Corresponding author: \href{mailto:m.jazbec@uva.nl}{m.jazbec@uva.nl} }}
\author[2]{Patrick~Forré}
\author[3]{Stephan~Mandt}
\author[4]{Dan~Zhang}
\author[1,2]{Eric~Nalisnick}
% Add affiliations after the authors
\affil[1]{
    UvA-Bosch Delta Lab,
    University of Amsterdam
}
\affil[2]{
    Amsterdam Machine Learning Lab,
    University of Amsterdam
}
\affil[3]{
    Department of Computer Science, University of California, Irvine  
  }
\affil[4]{
    Bosch Center for AI \& University of Tübingen
  }
\begin{document}
\maketitle
\begin{abstract}
Early-exit neural networks (EENNs) enable adaptive and efficient inference by providing predictions at multiple stages during the forward pass. In safety-critical applications, these predictions are meaningful only when accompanied by reliable uncertainty estimates. A popular method for quantifying the uncertainty of predictive models is the use of prediction sets. However, we demonstrate that standard techniques such as conformal prediction and Bayesian credible sets are not suitable for EENNs. They tend to generate non-nested sets across exits, meaning that labels deemed improbable at one exit may reappear in the prediction set of a subsequent exit. To address this issue, we investigate anytime-valid confidence sequences (AVCSs), an extension of traditional confidence intervals tailored for data-streaming scenarios. These sequences are inherently nested and thus well-suited for an EENN's sequential predictions. We explore the theoretical and practical challenges of using AVCSs in EENNs and show that they indeed yield nested sets across exits. Thus our work presents a promising approach towards fast, yet still safe, predictive modeling.
\end{abstract}

\section{Introduction}
\label{sec:intro}

Modern predictive models are increasingly deployed to environments in which computational resources are either constrained or dynamic. In the constrained setting, the available resources are fixed and often modest. For example, when models are deployed on low-resource devices such as mobile phones, they need to make fast yet accurate predictions for the sake of the user experience. On the other hand, in the dynamic setting, the available resources can vary due to external conditions. Consider an autonomous vehicle: when it is moving at high speeds, the model must make rapid predictions. However, as the vehicle slows down, the model can afford more time to process information or `think'. Early-exit neural networks (EENNs) \citep{branchy2016, huang2018} present a promising solution to challenges arising in both of these settings.  As the name implies, these architectures have multiple exits that allow a prediction to be generated at an arbitrary stopping time. This is in contrast to traditional NNs that yield a single prediction after processing all layers or model components.

To employ EENNs in safety-critical applications such as autonomous driving, it is necessary to estimate the predictive uncertainty at each exit \citep{mcallister2017concrete}. One prominent approach to capture a model's predictive uncertainty is constructing prediction sets or intervals.\footnote{We use the terms prediction \emph{sets} and prediction \emph{intervals} interchangeably, unless otherwise specified.} Prediction sets aim to cover the ground-truth label with high probability, and their size measures the model's certainty in its prediction.  Prediction sets based on Bayesian methods \citep{meronen2023} and conformal prediction \citep{cat2021} have been explored for EENNs. However, no prior work has accounted for the fact that prediction sets computed at neighboring exits are \emph{dependent}.  A prediction interval at a given exit should be \textit{nested} within the intervals at the preceding exits (see Figure~\ref{fig:intro}).  In other words, if a candidate prediction $y_0$ is in the interval at exit $t-1$ and drops out of the interval at exit $t$, $y_0$ should not re-enter the interval at exit $t+1$.  An even worse case would be if the intervals at exits $t$ and $t+1$ were disjoint. Such non-nested behavior limits the decisions that can be made at the initial exits of an EENN, thereby undermining their anytime properties \citep{zilbertstein1996using}. 

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.9\linewidth]{figures/fig_intro_hatch.pdf}

    \caption{Illustrative example of a 1-dimensional regression problem using an Early-Exit neural network (EENN) with $T=5$ exits. \textit{Upper}: At each exit, the EENN produces a prediction interval $C_t$ nested within its previous estimates, i.e., $C_t \subseteq C_{t-1}$. \textit{Lower}: An example of non-nested prediction intervals across different exits, e.g., $C_2$ contains candidate labels $y$ not included in $C_1$ (area denoted with (\protect\hatchline) lines). Such behavior often results from an EENN becoming overconfident, i.e., exhibiting low uncertainty, too early.}
   \label{fig:intro}
\end{figure}


We address this open problem by applying \textit{anytime-valid confidence sequences} (AVCSs) \citep{darling1967confidence, robbins1970statistical, lai1976confidence} to the task of constructing prediction sets across the exits of an EENN. AVCSs extend traditional, point-wise confidence intervals to streaming data scenarios \citep{maharaj2023anytime}. Importantly, AVCSs are guaranteed to have a non-increasing interval width \citep{howard2021time} and are therefore nested by definition. Our main insight is that AVCSs can be applied (with assumptions) when only one data point is observed, as is the case when constructing the prediction set for a single test point. To achieve this for EENNs, we consider the model parameters (e.g., the output weights) to be `streaming' across exits. %, instead of using streaming data as is traditionally done. % Our key insight that allows AVCSs to be applied to EENNs (where a single data points is observed at test time) is that we can consider the EENN's parameters to be `streaming' across exits, thereby inverting how AVCSs are usually applied in the case of streaming data. 
We detail the approximations necessary to make AVCSs applicable for the sequential prediction setting of EENNs and provide bounds on the errors introduced by our approximations.  %However, using AVCSs for predictive inference requires an approximation, but we provide bounds on the error it introduces.  
In our experiments across various classification and regression tasks, we demonstrate that our AVCS-based procedure yields nested estimates of predictive uncertainty across the exits of EENNs. 



\section{Background}
\label{sec:bg}


\paragraph{Data} Let $\mathcal{X} \subseteq \mathbb{R}^D$ denote a $D$-dimensional feature space and $\mathcal{Y}$ the response (output) space. In the case of regression, we have $\mathcal{Y} \subseteq \mathbb{R}$, and for classification $\mathcal{Y} = \{1, \ldots, K\}$.  We assume $\vx$ and $y$ are realizations of the random variables $\rvx$ and $\rvy$, drawn from the unknown data distribution $\mathbb{P}(\rvx, \rvy) = \mathbb{P}(\rvy | \rvx) \; \mathbb{P}(\rvx)$.  The training data consists of $N$ feature-response pairs $\D = \{(\vx_n, y_n)\}_{n=1}^N$. Lastly, let $(\vx^*, y^*)$ denote a test point, which may be drawn from a different distribution than the one used for training.

\paragraph{Early-Exit Neural Networks} EENNs \citep{branchy2016, huang2018} generate predictions at various depths by having several prediction heads branch out from a shared backbone network. Specifically, an EENN defines a sequence of predictive models: $ f(\vx; \mW_t, \mU_{1:t}), \  t=1,\ldots,T$, where  $\mW_t$ represents the parameters of the predictive head at exit $t$ and $\mU_t$ denotes the parameters of the $t$-th block in the backbone architecture. EENNs are usually trained by fitting all exits at once 
% \begin{flalign*}
$
\mathcal{L}(\mW_{1:T}, \mU_{1:T} ; \D) \coloneqq \sum_{n=1}^N \frac{1}{T}\sum_{t=1}^T \ell \big(y_n, f(\vx_n; \mW_t, \mU_{1:t})\big) 
$
% \end{flalign*}
% $\mathcal{L}(\mW_{1:T}, \mU_{1:T} ; \D) \coloneqq \sum_{n=1}^N \frac{1}{T}\sum_{t=1}^T \ell \big(y_n, f(\vx_n; \mW_t, \mU_{1:t})\big) 
% $
where $\ell$ is a suitable loss function such as negative log-likelihood. 

At test time, we can utilize the intermediate predictions of EENNs in various ways. For instance, if the model is deemed sufficiently confident at exit $t$, we can halt computation without propagating through blocks $t+1,\ldots,T$, thus speeding up prediction time. Naturally, the merit of such an approach relies on quality estimates of the EENN's uncertainty at every exit. EENNs can also be employed as anytime predictors \citep{zilbertstein1996using, jazbec2023}: the aim is to quickly provide an approximate prediction---ideally with its associated uncertainty---and continuously improve upon it as long as the environment permits.

\paragraph{Prediction Sets}  Quantifying the uncertainty of a  predictive model $f_{\theta} : \X \rightarrow \Y$ is crucial for its robustness and reliability. A popular approach, which is the focus of this study, augments the model output in the form of a prediction set (or interval, in the case of regression) $C_{\theta} : \X \rightarrow 2^{\Y}$. For a given test point, $C_{\theta}(\vx^*)$ should include (or \emph{cover}) the ground-truth $y^*$ with high probability. The \emph{size} of $C_{\theta}(\vx^*)$ can be interpreted as a proxy for the model's confidence---a smaller set indicates certainty, a larger set indicates uncertainty. Conformal prediction \citep{vovk2005algorithmic, shafer2008tutorial} is a popular method to construct prediction sets. Requiring only a calibration dataset $\D_{cal}$, it can generate prediction sets for a given model \emph{post hoc} and with finite-sample, distribution-free guarantees on the coverage of the ground-truth label.  See \cite{angel2021conformal} for an introduction to conformal prediction. Alternatively, one can employ Bayesian modeling \citep{gelman1995bayesian} to first obtain a posterior predictive distribution $p(\ry | \vx^*, \D)$ and then construct a credible set/interval based on it.

\paragraph{Anytime-Valid Confidence Sequences} Consider a streaming setting in which new data arrives at every time point $t$ via sampling from an unknown (parametric) model $\vx_t \sim p(\rvx | \theta^{*})$. Here $\theta^* \in \mathbb{R}$ represents the parameter of the data-generating distribution for which we want to perform statistical inference.  An anytime-valid confidence sequence (AVCS) \citep{darling1967confidence, robbins1970statistical, lai1976confidence} for $\theta^*$ is a sequence of confidence intervals $C_t = (l_t, r_t) \subseteq \mathbb{R}$ that have time-uniform and non-asymptotic coverage guarantees: 
%\begin{flalign*}
$\pr( \forall t,  \: \theta^* \in C_t  )  \ge  1- \alpha,$
%\end{flalign*}
where $\alpha \in (0, 1)$ represents the level of significance. The anytime (i.e.~time-uniform) property allows the user to stop the experiment, `peek' at the current results, and choose to continue or not, all while preserving the validity of the statistical inference.  This is in contrast with standard confidence intervals based on the central limit theorem (CLT), which are valid only pointwise (i.e.~for a fixed time / sample size). The stronger theoretical properties of AVCSs come at a cost, as they are typically larger than CLT-based intervals \citep{howard2021time}.   

An AVCS is constructed by first specifying a family of stochastic processes $\{R_t(\theta) : \theta \in \Theta \}$ that depends only on observations $\vx_1, \ldots, \vx_t$ available at time $t$. Next, we require that when evaluated at the parameter of interest, $R_t(\theta^*)$ forms a discrete-time, non-negative \textit{martingale} \citep{ramdas2020admissible}---a stochastic process that remains constant in expectation:\footnote{It is also common to define AVCSs in terms of \emph{supermartingales}, which are stochastic processes that do not increase in expectation over time: $\mathbb{E}_{\rvx_{t+1}}[R_{t+1}(\theta^*)  | \rvx_{1},\ldots,\rvx_{t}] \le R_{t}(\theta^*), \forall t$.} $\mathbb{E}_{\rvx_{t+1}}[R_{t+1}(\theta^*)  | \rvx_{1},\ldots,\rvx_{t}] = R_{t}(\theta^*), \forall t$. Additionally, $R_0(\theta^*)$ should have an initial value that is constant (usually one). Once such a martingale is constructed, the AVCS at a given $t$ is implemented by computing $R_t(\theta)$ for all $\theta \in \Theta$ and adding to the set the values for which $R_t$ does not exceed $1/\alpha$: $C_t \coloneqq \{ \theta : R_t(\theta) \le 1/\alpha \}$.  Strong theoretical properties (i.e., time-uniformity) then follow from Ville's inequality %\footnote{It is a generalization of more widely known Markov's inequality from non-negative random variables to non-negative supermartingales.} 
for nonnegative (super)martingales: $\pr\left( \exists t: R_t(\theta^{*}) \ge 1/\alpha \right) \le \alpha.$ 
One example of a random variable $R_t$ from which we can construct an AVCS is the prior-posterior ratio: $R_{t}(\theta) = p(\theta) / p(\theta | \vx_{1},\ldots,\vx_{t} )$ \citep{smith2020}.  The time-uniform nature of AVCSs enables one to consider the intersection of all previous intervals---$C_t = \cap_{s \le t} C_s$, at time $t$---without sacrificing statistical validity \citep{pmlr-v202-shekhar23a}. This results in nested intervals/sets, i.e., $C_t \subseteq C_{t-1}$.  We wish to exploit this pivotal property of AVCSs to ensure that the prediction sets of EENNs remain nested across exits.

\section{Confidence Sequences for Early-Exit Neural Networks}
\label{sec:avcs_eens}
Our contribution is to apply AVCSs to perform inference over the predictions generated by each exit of an EENN.  As we will see, this is not a straightforward synthesis: AVCSs have been exclusively used in streaming-data settings, where the goal at every time step is to produce a \textit{confidence interval} covering the parameter $\theta^*$ of the data-generating distribution.  On the other hand, we want to apply them to EENNs that see just one feature vector $\vx^{*}$ at test time. Moreover, we are interested in obtaining a \textit{prediction set/interval} at every exit that contains the ground-truth label $y^*$ with high probability. We overcome these differences by considering the parameters of the EENN's exits $\mW_t$ as the sequence of random variables for which the martingale is defined.  Below we first give a general recipe for constructing AVCSs for EENNs and then describe practical implementations for regression (Section \ref{sec:eenn_blm}) and classification (Section \ref{sec:class}).

\paragraph{Bayesian EENN} We begin by positing a (last-layer) Bayesian predictive model at  every exit:\footnote{In this section, we work with Bayesian predictive models at every exit for ease of exposition. Yet our approach is more general. It can also accommodate models for which the `randomness' does not come from placing a distribution over weights $\rmW_t$. We will provide a concrete example of this later in Section \ref{sec:class}, where we use an evidential approach \citep{malinin2018predictive, sensoy2018evidential} instead of a Bayesian one.}
\begin{flalign}
\label{eq:bayes_eenn}
p_t(\ry | \vx^*, \D) = \int p(\ry | \vx^*, \rmW_t, \mU_{1:t}) \ p( \rmW_{t} | \D , \mU_{1:t}) \ d\rmW_{t}
\end{flalign}
for $t = 1, \ldots, T$, with $T$ representing the total number of exits. $p(\ry | \vx^*, \rmW_t, \mU_{1:t})$ and $p( \rmW_{t} | \D , \mU_{1:t})$ correspond to the likelihood and (exact) posterior distribution, respectively. To ensure minimal overhead of our approach at test time, we treat the backbone parameters $\mU_{1:t}$ as point estimates (e.g.~found through pre-training) that are held constant when constructing the AVCS. To reduce notational clutter, we omit these parameters from here forward. While Bayesian predictives $p_t(\ry | \vx^*, \D)$ can be used `as is' to get uncertainty estimates at each exit (e.g., by constructing a credible interval), we show in Section \ref{sec:exp} that this results in a non-nested sequence of uncertainty estimates. We next present an approach based on AVCSs to rectify such behavior.

\paragraph{Idealized Construction}
We first consider an idealized construction that, while impossible to implement exactly, will serve as the foundation of our approach.  At test time, upon seeing a new feature vector $\vx^{*}$, we wish to compute an interval $C_t$ for its label such that $y^* \in C_t \ \forall t$ with high probability.  Assume that we also have observed the true label $y^*$.  For the moment, ignore the circular reasoning that this is the very quantity for which we wish to perform inference.  Furthermore, with $(\vx^{*}, y^{*})$ in hand, assume we can compute (exactly) the posterior for any exit's parameters: $p( \rmW_{t} | \D \cup \{(\vx^*, y^*)\})$. This distribution is the posterior update we would perform after observing the new feature-response pair. For notational brevity, we will denote $\mathcal{D}_* \coloneqq \D \cup \{(\vx^*, y^*)\}$ from here forward. 

To prepare for the proposition that follows, we define for a given $y \in \Y$ the \textit{predictive-likelihood ratio} 
\begin{equation}
\label{eq:eenn_avcs_ratio}
\begin{split}
    & R_t^*(y) \  \coloneqq \ \prod_{l=1}^t \frac{p_l(y | \vx^*, \D)}{p(y | \vx^*, \mW_l)}, \;\; \mW_l \sim p( \rmW_{l} | \D_*) \:.
\end{split}
\end{equation}
 Note that only the likelihood terms in the denominator depend on the updated posterior (via samples $\mW_l$), whereas the predictive terms in the numerator rely solely on training data (via $p( \rmW_{l} | \D )$). The above ratio in (\ref{eq:eenn_avcs_ratio}) is inspired by the aforementioned prior-posterior martingale \citep{smith2020} yet modified for the predictive setting.  We next state our key proposition that will serve as an inspiration for constructing AVCS for $y^*$ in
EENNs:
% Assume we have a sequence of samples $\mW_{t} \sim p( \rmW_{t} | \D \cup (\vx^*, y^*)) $ for $t\in [1, T]$.  Moreover let $p_l(\ry | \vx^*, \D) = \int p(\ry | \vx^*, \rmW_l) \ p( \rmW_{l} | \D ) \ d\rmW_{l} $  denote the predictive distribution for the $l$th exit. To prepare for the propositiong that follows, define the \textit{predictive-likelihood ratio} w.r.t.~these posterior samples: 

% \begin{equation}
% \begin{split}
%     & R_t(y) \  \coloneqq \ \prod_{l=1}^t \frac{p_l(y | \vx^*, \D)}{p(y | \vx^*, \W_l))}, \ \ \ \text{ such that } \\  &    \ \ \ \ \ \  \ \ \mathbb{E}\left[ R_t(y) \ | \  \mW_{t},\ldots, \mW_{1}  \right] = R_{t-1}(y), \\ &   \ \ \ \ \ \  \ \ \mathbb{E}\left[ R_1(y) \  | \ \mW_{1}  \right] = R_{0}(y) = 1.
% \end{split}
% \end{equation}
% Invoking Ville’s inequality, as described in Section \ref{sec:bg}, allows us to construct an AVCS for $y^{*}$: 
\begin{remark}\label{prop:avcs}
For a given test point $(\vx^*, y^*)$, the predictive-likelihood ratio $R_t^*(y)$ in (\ref{eq:eenn_avcs_ratio})  is a non-negative martingale with $R_0^* = 1$ when evaluated at $y= y^*$. Moreover, the prediction sets of the form $C_t^* \coloneqq \{ y \in \Y \: |  \: R_t^*(y) \le 1 / \alpha\}$ are $(1 - \alpha)$-confidence sequences for $y^*$, meaning that
$\pr(\forall t, y^* \in C_t^*) \ge 1 - \alpha \:$.
\end{remark} The proof follows the standard procedure for deriving parametric confidence sequences; see Appendix \ref{app:sec_theory}. We term the resulting confidence sequence an \textit{EENN-AVCS}.  

% Other ratios  %: $$\frac{p(\theta)}{ p(\theta | \vx_{1},\ldots,\vx_{t} )} = \frac{p(\theta) \ p(\vx_{1},\ldots,\vx_{t})}{p(\vx_{1},\ldots,\vx_{t} | \theta ) \ p(\theta)} = \frac{p(\vx_{1},\ldots,\vx_{t})}{ p(\vx_{1},\ldots,\vx_{t} | \theta )}$$ where the numerator has the parameters marginalized away and the denominator is the likelihood.

\paragraph{Realizable Relaxation} Now we return to the aforementioned circular reasoning: we are performing inference for $y^{*}$ while assuming we have access to it. In practice, we do not have access to $y^*$ at test time; hence we cannot compute $R_t^*(y)$ (and consequently $C^*_t$). As a workaround, we propose to approximate the updated posterior with the one based on only the training data at every exit $t=1,\ldots,T$:
\begin{equation}
\label{eq:assumption1}
   p( \rmW_{t} | \D_*) \approx p( \rmW_{t} | \D ). 
\end{equation}
With $R_t(y)$ and $C_t$, we denote the resulting predictive-likelihood ratio and confidence sequence based on $p( \rmW_{t} | \D )$, respectively. While $C_t$ is now computable in a real-world scenario (since it is independent of $y^*$),  it unfortunately does not inherit the statistical validity of $C_t^*$. Naturally, the degree to which $C_t$ violates validity depends on the quality of approximation in (\ref{eq:assumption1}). If the posterior distribution $p( \rmW_{t} | \D )$ is stable---meaning that adding a single new data point $(\vx^*, y^*)$ would have minimal effect---the approximation is well-justified, and only minor validity violations can be expected. Such stability in the posterior is likely when the training dataset $\mathcal{D}$ is large and the new test datapoint originates from the same distribution. Conversely, if the posterior is unstable, the approximation will likely be poor, leading to larger violations of validity. This intuition can be formalized via the following proposition:
\begin{remark}\label{prop:TV_bound}
Assume $C_t^*$ is a valid $(1 - \alpha)$ confidence sequence for a given test datapoint $(\vx^*, y^*)$ (cf.\ Proposition \ref{prop:avcs}). Then the miscoverage probability of the confidence sequence $C_t \coloneqq \{ y \in \Y \: |  \: R_t(y) \le 1 / \alpha\}$ can be upper bounded by 
\begin{flalign*}
\pr(\exists l \in \{1,\ldots,t\} : y^* \notin C_l) \le \\  \alpha + \sqrt{1 - e^{-\sum_{l=1}^t \mathrm{KL}\big(p( \rmW_{l} | \D), \: p( \rmW_{l} | \D_*)\big)}}
\end{flalign*}
$\forall t=1,\ldots,T$, where KL denotes the Kullback-Leibler divergence between probability distributions.
\end{remark}
See Appendix \ref{sec:app_TV_bound_proof} for the derivation. Based on the bound in Proposition \ref{prop:TV_bound}, it is clear that when the posteriors at different exits are stable, i.e. the KL divergence between $p( \rmW_{l} | \D)$ and $p( \rmW_{l} | \D_*)$ is small, the validity violation is minor. As a result, $C_t$ will be a good approximation of  $C_t^*$.

\paragraph{Detecting Violations of Posterior Stability} It is evident from Proposition \ref{prop:TV_bound} that when the approximation in (\ref{eq:assumption1}) is poor---i.e., the KL divergence between $p( \rmW_{l} | \D)$ and $p( \rmW_{l} | \D_*)$ is large---the validity of $C_t$ will quickly degrade. As mentioned above, this could happen for a particular $\vx^*$ if either (i) $\mathcal{D}$ is small and the posterior is not stable yet or (ii) $\vx^*$ is not drawn from the training distribution. 
 The method should fail gracefully in such cases. Fortunately, the behavior of invalid AVCSs---ones for which $R_t(y)$ is not a martingale for all $y \in \Y$---has been previously studied for change-point detection \citep{pmlr-v202-shekhar23a}. Based on their theoretical and empirical results, our procedure should collapse to the empty interval if the approximation (\ref{eq:assumption1}) is poor: there exists $t_{0}$ such that $C_{t} = \emptyset$ for all $t \ge t_{0}$. Encouragingly, in Section \ref{sec:exp_synthetic}, we experimentally validate that such collapses occur for out-of-distribution points for a reasonably small $t_0$. However, there will be times at which the interval width will be small---which the user might interpret as high confidence---only to later collapse to the empty set (meaning maximum uncertainty). In Section \ref{sec:exp_synthetic}, we explore using epistemic uncertainty as a measure of stability in our regression models, and we leave to future work a more general method for diagnosing when an EENN-AVCS has not yet collapsed but is likely to.

% \paragraph{Speeding up convergence of EENN-AVCS} In our original formulation (c.f. Eq. (\ref{eq:eenn_avcs_ratio})), we draw a single weights sample $\mW_t$ at each exit. Hence, in the first few exits, our confidence sequence will be based on only a few samples, and is thus expected to be large and potentially uninformative. This is analogous to AVCSs being large for the first few datapoints in the data-streaming scenario \citep{howard2021time}. As a workaround, we propose constructing multiple AVCSs in parallel for a given test datapoint $\vx^*$, and then considering their intersection at a given exit. Importantly, due to its fully parallel nature, such an approach does not introduce additional time overhead. We have also considered alternative approaches, like constructing a single AVCS based on multiple samples at each exit; however, we found that they perform worse in terms of marginal coverage and efficiency. For a further discussion on speeding up the convergence of EENN-AVCS, refer to Appendix \ref{sec:app_speed}.

\section{EENN-AVCS for Regression}
\label{sec:eenn_blm}
We next consider a concrete instantiation of our EENN-AVCS procedure proposed in the previous section. We focus on the case of one-dimensional Bayesian regression as it allows for exact inference due to conjugacy.  This allows us to assess the quality of approximation  (\ref{eq:assumption1})
without introducing the additional challenge of approximate inference. We summarize our approach for obtaining AVCSs in EENNs in Algorithm \ref{algo}. 

\paragraph{Bayesian Linear Regression} Recall from Section \ref{sec:avcs_eens} that since we require fast and exact Bayesian inference, we keep the EENN's backbone parameters $\mU_{1:t}$ fixed and give only the weights $\mW_t$ of the prediction heads a Bayesian treatment. We define the predictive model at the $t$th exit as a linear model $f(\vx; \mW_t, \mU_{1:t}) = h_t(\vx)^T \mW_t$ where $h_t(\cdot\: ; \mU_{1:t}): \X \rightarrow \mathbb{R}^H$ represents the output of the first $t$ backbone layers or blocks. % From here on, we denote this hidden representation as $\vh_t$.  Thus each exit is a linear model whose features are $\vh_t$.  
  We use a Gaussian likelihood and prior:
 \[
  \ry \sim \N\left(\ry; h_t(\vx)^T \mW_t, \sigma_{t}^2\right), \;  \mW_t \sim \N\left(\rmW_t;  \hat{\mW}_t, \sigma_{w, t}^2 \mathbb{I}_H\right),
\]
  where $\sigma_{t}^2$ is the observation noise, $\sigma_{w, t}^2$ is the prior's variance, and $\hat{\mW}_t$ are the prediction weights obtained during (pre)training of the EENN.  Due to conjugacy, we can obtain a closed form for the posterior and predictive distributions:
  \begin{flalign}
  \label{eq:blr_main}
      p(\rmW_t | \D) = \N \left(\rmW_t ; \pmu_t, \psig_t \right), \notag \\ 
      p_t(\ry | \vx^*, \D) = \N \left(\ry ; h_t(\vx^*)^T \pmu_t,  v_* + \sigma_{t}^2\right),
  \end{flalign}
  where $v_* \coloneqq h_t(\vx^*)^T \psig_t  h_t(\vx^*)$. See Appendix \ref{app:sec_bayes_lin_reg} for exact expressions for posterior parameters $\pmu_t, \psig_t$.  To estimate $\sigma_{t}^2$ and $\sigma_{w,t}^2$, we optimize the (exact) marginal likelihood on the training data (type-II maximum likelihood). Combining the obtained Bayesian quantities, we can compute the predictive-likelihood ratio in (\ref{eq:eenn_avcs_ratio}) at every exit.


\paragraph{Solving for Interval Endpoints} To construct $C_t$, we next have to evaluate $R_t$ at every $y \in \Y$ and discard those where the ratio exceeds $1/\alpha$, with $\alpha$ representing a significance level (e.g., $0.05$). However, in the case of regression, where the output space is continuous, the method of evaluation is not immediately clear. One possible approach would be to define a grid of points over $\Y$ and then evaluate the predictive-likelihood ratio using a finite number of labels. Fortunately, the Bayesian linear regression model above allows us to obtain the endpoints of the prediction interval, at all exits, via a closed-form expression: $C_t = [y_L^t, y_R^t]$. This is computationally valuable since it eliminates the overhead of iterating over $\mathcal{Y}$, which could be prohibitively expensive in the low-resource settings in which EENNs typically operate.  To arrive at the analytical form, we first observe that $\log R_t$ represents a convex quadratic function in $y$:
\begin{flalign*}
    \log R_t(y) = \alpha_t (\vx^*) \cdot y^2 + \beta_t (\vx^*, \mW_{1:t}) \cdot y + \gamma_t(\vx^*, \mW_{1:t}) \: .
\end{flalign*}
Expressions for the coefficients $\alpha_t, \beta_t, \gamma_t$ are provided in Appendix \ref{app:sec_quad_roots}. To obtain the bounds $y_L^t, y_R^t$ of the prediction interval at the $t$th exit, we then simply need to find the roots of the quadratic equation $\log R_t(y) - \log (1 / \alpha) = 0$.  If the discriminant $\beta_t^2 - 4 \alpha_t (\gamma_t + \log \alpha)$ is negative, the equation has no real-valued roots, resulting in an empty prediction interval. In such cases, we interpret $\vx^*$ as an out-of-distribution sample, as mentioned in Section \ref{sec:avcs_eens}. 

\paragraph{Epistemic Uncertainty as a Measure of Stability}  In our assumed Bayesian linear regression scenario, both the posterior and updated posterior are Gaussian. This allows us to derive a closed-form expression for the KLD term   $\mathrm{KL}\big(p( \rmW_{t} | \D), \: p( \rmW_{t} | \D^*)\big)$ in the upper bound from Proposition \ref{prop:TV_bound}. See Appendix \ref{sec:epi_KL} for the derivation.  Recall that $v_*$ represents the epistemic uncertainty (cf.\ Eq.~(\ref{eq:blr_main})), which is the uncertainty that stems from observing limited data.  In turn, the KLD is small for a given $\vx^*$ when $v_*$ is small.  The uncertainty decreases as we collect more data\footnote{$\lim_{N \rightarrow \infty} v_* = 0$ where $N$ represents the number of training data points (cf.\ Section 3.3.2 in \cite{bishop2006pattern}).}, which, together with Proposition \ref{prop:TV_bound}, implies that the statistical coverage of our EENN-AVCS will improve as the dataset size increases. Moreover, $v_*$ is independent of the test label $y^*$. Thus, we can employ it as a measure of the stability of an EENN-AVCS: for a given $\vx^*$, a higher $v_*$ can signal to the user that the resulting confidence sequence may not be reliable. 
 We illustrate this in Section \ref{sec:exp_synthetic}. 

\section{EENN-AVCS for Classification}
\label{sec:class}

In this section, we propose a concrete instantiation of our \our for classification. Unlike the regression scenario in the previous section, an additional challenge is presented by a lack of conjugacy. Specifically, we cannot obtain a closed-form expression for the Bayesian predictive posterior (see Eq. (\ref{eq:bayes_eenn})) at every exit when using the usual Gaussian assumption for the posterior over parameters. To circumvent this, we depart from the Bayesian predictive model and utilize instead Dirichlet Prior Networks \citep{malinin2018predictive}, which enable analytically tractable predictive distributions at each exit. Our EENN-AVCS approach for classification is summarized in Algorithm \ref{algo_class}.

\paragraph{Dirichlet Prior Networks} Instead of positing a distribution over (last-layer) weights $\rmW_t$ at every exit, we posit a distribution over categorical distributions $p(\bpi_t | \D, \vx^*), \: \bpi_t \in \Delta^K $ \footnote{$\Delta^K := \{\bpi \in \mathbb{R}^K | \sum_{k=1}^K \pi_k = 1, \pi_k \ge 0\}$} for a given test datapoint $\vx^*$. %This gives rise to the following predictive distribution
% \begin{flalign*}
% p_t(\ry | \vx^*, \D) = \int p(\ry | \bmu_t(\vx^*)) \ p( \bmu_t(\vx^*) | \D ) \ d\bmu_t \: .
% \end{flalign*}
Assuming a categorical likelihood, the posterior is Dirichlet via conjugacy:
\begin{flalign*}
    p(\rvy | \bpi_t) = \texttt{Cat}(\rvy | \bpi_t), \; p( \bpi_t | \vx^*, \D ) = \texttt{Dir}( \bpi_t | \boldsymbol{\alpha}_t(\vx^* ; \D) )
\end{flalign*}
where $\boldsymbol{\alpha}_t \in \mathbb{R}_{> 0}^K$ are the concentration parameters.  The predictive distribution also has a closed form:
\begin{flalign*}
    p_t(\rvy = y |\vx^*, \D) = \\ \int p(\rvy = y  | \bpi_t) \ p( \bpi_t | \vx^*, \D ) \ d\bpi_t = \frac{\alpha_{t, y}}{\sum_{y' \in \mathcal{Y}} \alpha_{t, y'}} \: \: .
\end{flalign*}
\citet{malinin2018predictive} propose to parameterize the Dirichlet concentration parameters via the outputs of a neural network,
% \begin{flalign*}
%     \boldsymbol{\alpha}_t(\vx^* ; \D) = f(\vx; \mW_t, \mU_{1:t})
% \end{flalign*}
$
    \boldsymbol{\alpha}_t(\vx^* ; \D) = f(\vx^*; \mW_t, \mU_{1:t}),
$
and term this model a \textit{Dirichlet Prior Network} (DPN).  In DPNs, the aim is to capture the \emph{distributional uncertainty} that arises due to the mismatch between test and training distributions, in addition to the \emph{data uncertainty} (often referred to as aleatoric uncertainty). This is in contrast to Bayesian models, which focus on the \emph{model uncertainty} (or epistemic uncertainty). We refer the reader to \cite{malinin2018predictive} for an in-depth discussion of the different sources of uncertainty.

\paragraph{Classification EENN-AVCS} Having a closed-form predictive distribution, we can define the following \emph{predictive-likelihood} ratio for a given $y \in \mathcal{Y}$:
\begin{equation*}
\begin{split}
    & R_t^*(y) \  \coloneqq \ \prod_{l=1}^t \frac{p_l(y | \vx^*, \D)}{p(y | \bpi_l)}, \;\; \bpi_l \sim p( \bpi_l | \D^* ) \:.
\end{split}
\end{equation*}
Our result from Proposition \ref{prop:avcs} applies here as well\footnote{The only difference in the proof being that the martingale is defined with respect to the sequence of categorical distributions $\bpi_t$ instead of the sequence of weights $\rmW_t$.}, hence it follows that 
% \begin{flalign*}
%     C_t^* \coloneqq \{ y \in \Y \: |  \: R_t^*(y) \le 1 / \alpha\}
% \end{flalign*}
$
    C_t^* \coloneqq \{ y \in \Y \: |  \: R_t^*(y) \le 1 / \alpha\}
$
is a valid $(1 - \alpha)$-confidence sequence for $y^*$. As in the regression case, $R_t^*$ cannot be realized in practice as it depends on the unknown label $y^*$. We again approximate this oracle posterior with the one based solely on the training data 
% \begin{flalign*}
%     p( \bmu_l(\vx^*) | \D^* ) \approx p( \bmu_l(\vx^*) | \D)
% \end{flalign*}
$
    p( \bpi_l | \D^* ) \approx p( \bpi_l |\vx^*, \D)
$
and denote the resulting predictive-likelihood ratio and confidence sequence as $R_t$ and $C_t$, respectively. To reason about the quality of this approximation, we can again rely on Proposition \ref{prop:TV_bound}.

\paragraph{Post-Hoc Implementation} The original DPN formulation \citep{malinin2018predictive} requires a specialized training procedure to ensure that the NN's outputs represent meaningful concentration parameters.  We instead opt for a simpler \emph{post-hoc} approach as we have found it to yield satisfactory results. Specifically, to obtain the concentration parameters, we start with a pretrained (classification) EENN and pass the logits at each exit through an activation function $a_t: \mathbb{R} \rightarrow \mathbb{R}_{>0}$. We found that a simple choice of ReLU activation $a_t(x) = \texttt{ReLU}(x, \tau_t)$ with a different threshold $\tau_t \ge 1$ at each exit works well in practice.\footnote{We restrict concentration parameters to be larger than one due to the Dirichlet concentrating towards the simplex's edges for parameter values smaller than one.} To obtain the ReLU thresholds, we use a validation dataset and pick the largest $\tau_t$ such that $100(1 - \alpha)\%$ of validation datapoints are still contained in the resulting prediction sets at each exit. Lastly, since $\mathcal{Y}$ has a finite support (unlike the regression case), we iterate over all of $\mathcal{Y}$ when constructing a prediction set $C_t$. % at a given exit.

\section{Related Work}
\label{sec:related_work}

\textbf{Early-Exit Neural Networks} (EENNs) enable faster inference in deep models by allowing predictions to be made at intermediate layers \citep{branchy2016, huang2018, laskaridis2021adaptive}. They have been extensively explored for computer vision \citep{li2019improved, kaya_shallow_deep_2019, yang2023} and natural language processing \citep{schwartz2020right, zhou2020bert, xu2022survey}. The majority of these studies aimed to improve the accuracy-speed trade-off, i.e., ensuring the model exits as early as possible while maintaining high accuracy. However, uncertainty quantification (UQ) within EENNs has so far received relatively little attention \citep{cat2021, meronen2023, regol2023jointly}. When it has, UQ has primarily been used to improve EENN termination criteria.  \citet{meronen2023} employ a Bayesian predictive model at each exit to enhance the calibration of EENNs. 
 \citet{cat2021} propose a conformal prediction scheme with the goal of generating sets/intervals that are (marginally) guaranteed to contain the prediction of the full EENN.  Yet none of the preceding works address the fact that uncertainty estimates at successive exits are dependent, which is the main focus of our work.  Perhaps the closest related work is by \citet{jazbec2023}, who adapt EENNs for the anytime setting \citep{zilbertstein1996using}.  Their method promotes \emph{conditional monotonicity}: the EENN's performance improves across exits for every test sample. Our idea of nested prediction sets can be seen as an extension of conditional monotonicity to EENNs that yield prediction sets, not only point predictions as done by \cite{jazbec2023}.

\textbf{Anytime-Valid Confidence Sequences} (AVCSs)  are sequences of confidence intervals designed for streaming data settings, providing time-uniform and non-asymptotic coverage guarantees \citep{darling1967confidence, lai1976confidence, howard2021time}. They allow for adaptive experimentation that permits one to `peek' at the data at any time, make decisions, yet still maintain the validity of the statistical inferences. Recently, AVCSs have found applications in A/B testing that is resistant to `p-hacking' \citep{maharaj2023anytime},  Bayesian optimization \citep{neiswanger2021}, and change-point detection \citep{pmlr-v202-shekhar23a}.  AVCSs have not been previously considered for sequential estimation of predictive uncertainty in EENNs.
 

\section{Experiments}
\label{sec:exp}


We conduct three sets of experiments, which can be reproduced using the code at \texttt{\url{https://github.com/metodj/EENN-AVCS}}. Firstly, in Section \ref{sec:exp_synthetic}, we explore our method (EENN-AVCS) on synthetic datasets to empirically verify its correctness and assess its feasibility. In the subsequent set of experiments, detailed in Section \ref{sec:exp_nlp}, we check that our findings extend to practical scenarios, applying EENN-AVCS to a textual semantic similarity regression task using a transformer backbone model \citep{zhou2020bert}. Lastly, in Section \ref{sec:exp_msdnet}, we report results on image classification tasks (CIFAR-10/100, ImageNet) using a multi-scale dense net (MSDNet) \citep{huang2018}.

\paragraph{Evaluation Metrics} To assess the quality of the prediction sets at each exit, we utilize the standard combination of \emph{marginal coverage} and \emph{efficiency}, i.e., average interval size, on the test dataset \citep{angel2021conformal}:
\begin{flalign*}
    \textrm{size}(t) := \frac{1}{n_{test}}\sum_{n=1}^{n_{test}} |C_t(\vx_n)|, \\ \textrm{coverage}(t) := \frac{1}{n_{test}}\sum_{n=1}^{n_{test}} \big[y_n \in C_t(\vx_n)\big],
\end{flalign*}
where $C_t$ is a prediction set at the $t$-th exit and $[\cdot]$ is the indicator function. Marginal coverage serves as a proxy for the statistical validity of the approach, measuring how frequently the ground-truth falls within the predicted interval on average. Among two methods with similar marginal coverage, the one with smaller interval sizes is preferred.  To assess the nestedness of prediction sets across exits, we define a \emph{nestedness} metric: at each exit $t$, we compute
\begin{flalign*}
\mathfrak{N}(t) = |\cap_{s \le t} C_s| / |C_t|
\end{flalign*}
and report its mean across test data points.  A model with perfectly nested prediction sets will have $\mathfrak{N}(t)=1$, exactly.  Otherwise, $\mathfrak{N}(t)$ will be less than one and zero only in the case of disjoint sets.

%While AVCSs usually aim for conditional coverage, the approximations we introduce (c.f. Section \ref{sec:avcs_eens}) is a reason why we focus on the marginal level in our evaluations.

% \begin{wrapfigure}{r}{0.5\textwidth}
%   \centering
%    \vspace{1\baselineskip}
%    \includegraphics[width=0.48\textwidth]{figures/fig_blr_vs_avcs_all_metrics_S_parallel_10.pdf}
%   \caption{We compare our \our with \base baseline based on average nestedness (\textit{top}), marginal coverage (\textit{middle}), and average interval size (\textit{bottom}). EENN-AVCS is the only approach that yields perfect nestedness while maintaining reasonably high marginal coverage across exits. The nestedness comes at a price of larger intervals in the initial exits, though. Note that in the \emph{top} plot, the nestedness curves of EENN-AVCS (\protect\orangeline) and EENN-Bayes-intersection (\protect\blueline) overlap at $\mathfrak{N}(t) = 1$.}
%   \label{fig:toy_data_metrics}
%   \vspace{-2\baselineskip}
% \end{wrapfigure}

\begin{figure}[htbp]
  \centering
  % \includegraphics[width=0.9\linewidth]{figures/fig_blr_vs_avcs_all_metrics_S_parallel_10.pdf}
  \includegraphics[width=0.9\linewidth]{figures/fig_toy_regression.pdf}

    \caption{We compare our \our with \base baseline based on average nestedness (\textit{top}), marginal coverage (\textit{middle}), and average interval size (\textit{bottom}). EENN-AVCS is the only approach that yields perfect nestedness while maintaining reasonably high marginal coverage across exits. The nestedness comes at a price of larger intervals in the initial exits, though. Note that in the \emph{top} plot, the nestedness curves of EENN-AVCS (\protect\orangeline) and EENN-Bayes-intersection (\protect\blueline) overlap at $\mathfrak{N}(t) = 1$.}
   \label{fig:toy_data_metrics}
   \vspace{-1\baselineskip}
\end{figure}

\begin{figure}[t]
  \centering
   \vspace{-1\baselineskip}
  % \includegraphics[width=0.86\linewidth]{figures/fig_blr_vs_avcs_wiggle_and_3clustershomo_S_10_parallel_OOD_line.pdf}
    \includegraphics[width=0.9\linewidth]{figures/fig2_no_line_ood.pdf}
    % \caption{Prediction intervals (\protect\scalerel*{\usebox{\boxorange}}{\square}) for \base (\textit{left}) and our EENN-AVCS (\textit{right}) on two simulated regression tasks \cite{antoran2020depth}: wiggle (\textit{up}) and 3-clusters (\textit{bottom}). Blue points denote training data. In cases where the EENN-AVCS collapses to an empty set (out-of-distribution) we depict the mean prediction by a red dashed line (\protect\reddashedline). We set the significance level to $\alpha = 0.05$ for EENN-AVCS, while for EENN-Bayes, we plot intervals that capture 2 standard deviations away from the predicted mean (\protect\orangeline). With different background colors we denote different regions of data distribution, see Section \ref{sec:exp_synthetic}.}
    \caption{Prediction intervals (\protect\scalerel*{\usebox{\boxorange}}{\square}) for \base (\textit{left}) and our EENN-AVCS (\textit{right}) on two simulated regression tasks \citep{antoran2020depth}: wiggle (\textit{top}) and 3-clusters (\textit{bottom}). Blue points denote training data.  In cases where the EENN-AVCS collapses to an empty set (out-of-distribution), we do not depict anything, which explains the gaps in EENN-AVCS predictions. We set the significance level to $\alpha = 0.05$ for EENN-AVCS, while for EENN-Bayes, we plot intervals that capture 2 standard deviations away from the predicted mean (\protect\orangeline). With different background colors we denote different regions of data distribution, see Section \ref{sec:exp_synthetic}.}
   \label{fig:toy_data_intervals}
   \vspace{-2\baselineskip}
\end{figure}

% \begin{figure*}[t]
%   \centering
%   % \includegraphics[width=0.9\linewidth]{figures/fig_blr_vs_avcs_all_metrics_S_parallel_10.pdf}
%   \includegraphics[width=0.9\linewidth]{figures/fig_toy_regression_horizontal.pdf}

%     \caption{We compare our \our with \base baseline based on average nestedness (\textit{top}), marginal coverage (\textit{middle}), and average interval size (\textit{bottom}). EENN-AVCS is the only approach that yields perfect nestedness while maintaining reasonably high marginal coverage across exits. The nestedness comes at a price of larger intervals in the initial exits, though. Note that in the \emph{top} plot, the nestedness curves of EENN-AVCS (\protect\orangeline) and EENN-Bayes-intersection (\protect\blueline) overlap at $\mathfrak{N}(t) = 1$.}
%    \label{fig:toy_data_metrics}
%    \vspace{-1\baselineskip}
% \end{figure*}

\paragraph{Baselines} We compare EENN-AVCS against standard UQ techniques---namely Bayesian methods and conformal prediction. As a Bayesian baseline, we use the same underlying Bayesian EENN but without applying the AVCS. We term this approach \textit{\base} since it uses the Bayesian predictive distribution at each exit to perform UQ. \base can be seen as an adaptation of the last-layer Laplace approach for early-exiting \citep{meronen2023}.  For the conformal baselines, we perform conformal inference independently at every exit. Specifically, we use the Regularized Adaptive Prediction Sets algorithm \citep[RAPS;][]{angelopoulos2020uncertainty} for the classification experiments (cf.\ Section~\ref{sec:exp_msdnet}) and Conformalized Quantile Regression  \citep[CQR;][]{romano2019conformalized} for the NLP regression experiments (cf.\ Section~\ref{sec:exp_nlp}).  The primary difference between our approach and the baselines should be that EENN-AVCS has nested intervals, without sacrificing coverage, whereas the baselines have no such guarantee. %, which could lead to non-overlapping intervals.

%This has been previously proposed for classification tasks \citep{meronen2023}.

\subsection{Synthetic Regression Data}
\label{sec:exp_synthetic}

We use two non-linear regression simulations \citep{antoran2020depth}: \textit{wiggle}
 and \textit{3-clusters}. The EENN used in this experiment has a backbone architecture of $T=15$ feed-forward layers with residual connections. Each layer consists of $M=20$ hidden units, and we attach an output layer on top of it to enable early-exiting. We fit the (last-layer) Bayesian linear regression model at each exit using the training data and construct $S=10$ confidence sequences in parallel at test time for each datapoint (see Appendix \ref{sec:app_speed} for more details on the parallel construction). We set the significance level to $\alpha = 0.05$ for EENN-AVCS, while for EENN-Bayes, we plot intervals that capture two standard deviations away from the predicted mean. Further details regarding data generation, the model architecture, and the training can be found in Appendix \ref{app:sec_implement}.

 


 In the \emph{top} row of Figure \ref{fig:toy_data_metrics}, we compare our \our ({\tikz[baseline=-0.5ex] \draw[mplorange, very thick, solid] (0., 0.) -- (0.3, 0.);}) against the \base ({\tikz[baseline=-0.5ex] \draw[mplblue, very thick, dotted] (0., 0.) -- (0.3, 0.);}) baseline on the test dataset based on how nested the prediction intervals are across exits. We observe that, due to their theoretical foundation, EENN-AVCSs attain perfect nestedness. In contrast, EENN-Bayes's nestedness deteriorates over time on both datasets considered, indicating that there are labels that re-enter the \base prediction intervals after being ruled out at some earlier exit(s). In the \textit{top} row, we additionally observe that perfect nestedness can be achieved in \base by considering a running intersection of all previous prediction intervals at each exit (denoted by the {\tikz[baseline=-0.5ex] \draw[mplblue, very thick, solid] (0., 0.) -- (0.3, 0.);} line), similar to \our (the nestedness lines of the two intersection-based methods overlap at $\mathfrak{N}(t) = 1$). However, as shown in the \textit{middle} row, this approach leads to a decrease in marginal coverage, indicating that fewer data points are covered by the intersection of \base intervals as more exits are evaluated. In contrast, \our maintains high marginal coverage despite utilizing an intersection of intervals at each exit. This is a direct consequence of the time-uniform nature of AVCS. The nestedness of EENN-AVCS comes at a price, though, as the interval size tends to be larger than that of EENN-Bayes at the initial exits (\emph{bottom} plot). This observation is in line with existing work on AVCSs \citep{howard2021time}. 



% \begin{figure*}[t]
%   \centering
%   % \includegraphics[width=0.86\linewidth]{figures/fig_blr_vs_avcs_wiggle_and_3clustershomo_S_10_parallel_OOD_line.pdf}
%     \includegraphics[width=0.86\linewidth]{figures/fig2_no_line_ood_horizontal.pdf}
%     % \caption{Prediction intervals (\protect\scalerel*{\usebox{\boxorange}}{\square}) for \base (\textit{left}) and our EENN-AVCS (\textit{right}) on two simulated regression tasks \cite{antoran2020depth}: wiggle (\textit{up}) and 3-clusters (\textit{bottom}). Blue points denote training data. In cases where the EENN-AVCS collapses to an empty set (out-of-distribution) we depict the mean prediction by a red dashed line (\protect\reddashedline). We set the significance level to $\alpha = 0.05$ for EENN-AVCS, while for EENN-Bayes, we plot intervals that capture 2 standard deviations away from the predicted mean (\protect\orangeline). With different background colors we denote different regions of data distribution, see Section \ref{sec:exp_synthetic}.}
%     \caption{Prediction intervals (\protect\scalerel*{\usebox{\boxorange}}{\square}) for \base (\textit{left}) and our EENN-AVCS (\textit{right}) on two simulated regression tasks \cite{antoran2020depth}: wiggle (\textit{up}) and 3-clusters (\textit{bottom}). Blue points denote training data.  In cases where the EENN-AVCS collapses to an empty set (out-of-distribution), we do not depict anything, which explains the gaps in EENN-AVCS predictions. We set the significance level to $\alpha = 0.05$ for EENN-AVCS, while for EENN-Bayes, we plot intervals that capture 2 standard deviations away from the predicted mean (\protect\orangeline). With different background colors we denote different regions of data distribution, see Section \ref{sec:exp_synthetic}.}
%    \label{fig:toy_data_intervals}
%    \vspace{-1\baselineskip}
% \end{figure*}

To better understand our method's behavior on in-distribution (ID) vs out-of-distribution (OOD) points, we construct a new test dataset by considering equidistantly spaced points across the entire $\mathcal{X}$ space\footnote{Specifically, for $\mathcal{X} = [L, R]$, we construct $X_{test} = \texttt{np.linspace}(L - \epsilon, R + \epsilon, N_{test})$ for $\epsilon > 0$.}. We report results for both datasets considered in Figure \ref{fig:toy_data_intervals}. Initially, we observe that for ID datapoints (with ID regions of $\mathcal{X}$ depicted using \protect\scalerel*{\usebox{\boxblue}}{\square} background), our method satisfactorily covers the data distribution, especially at later exits. Encouragingly, AVCSs are also observed to quickly collapse to empty intervals outside of the data distribution (OOD regions are depicted with a white background).  Whenever the AVCS collapses to an empty interval, %we highlight this by altering the mean prediction line from ({\tikz[baseline=-0.5ex] \draw[mplorange, very thick, solid] (0., 0.) -- (0.3, 0.);}) to ({\tikz[baseline=-0.5ex] \draw[mplred, very thick, dashed] (0., 0.) -- (0.3, 0.);})).
we omit plotting the EENN-AVCS's predictions, showing the collapse via gaps in Figure \ref{fig:toy_data_intervals}. Recall that in our setting, an empty interval represents that a distribution shift has been detected (i.e.~maximal predictive uncertainty), which is exactly the desired behavior in OOD regions.

%  \begin{wrapfigure}{r}{0.5\textwidth}
%   \centering
%    \vspace{-2\baselineskip}
%    \includegraphics[width=0.48\textwidth]{figures/fig_epistemic_uncertainty.pdf}

%    \caption{Average epistemic uncertainty $v_*$ (\protect\redline) across Bayesian linear regression models at different exits. As expected, $v_*$ is larger in the regions where we observe less training data: \emph{out-of-distribution} (denoted with a white background) and \emph{in-between} (denoted with a  grey background (\protect\scalerel*{\usebox{\boxgrey}}{\square})). Hence, $v_*$ can serve as an indicator for assessing the reliability of EENN-AVCSs.} 
%    \label{fig:epi_uncertainty}
%   % \vspace{1\baselineskip}
% \end{wrapfigure}



 On the \emph{wiggle} dataset, we also have the opportunity to study the behavior on the so-called in-between (IB) datapoints that reside between ID and OOD regions.  We depict the IB region with a \protect\scalerel*{\usebox{\boxgrey}}{\square} background. We observe that our method encounters challenges in this regime to some extent, as the prediction intervals are, counterintuitively, smaller compared to those in the ID region despite the density of observed training datapoints being lower in the IB area. A partial remedy is provided by the epistemic uncertainty $v_*$ (see Eq.~(\ref{eq:blr_main})), which in our framework can be interpreted as a proxy for the stability of posterior distributions at different exits as explained in Section \ref{sec:eenn_blm}. As depicted in Figure \ref{fig:epi_uncertainty}, $v_*$ is larger for IB points compared to the ID ones (as expected). Thus, a higher $v_*$ can serve as a warning that the resulting confidence sequence should not be blindly relied upon.\footnote{The IB region also poses challenges for other UQ methods; a similar behavior was reported for Gaussian processes \citep{lin2023sampling}, with the in-between region being referred to as the \emph{extrapolation region}.}

 \begin{figure}[htbp]
  \centering
  \includegraphics[width=1.\linewidth]{figures/fig_epistemic_uncertainty.pdf}

    \caption{Average epistemic uncertainty $v_*$ (\protect\redline) across Bayesian linear regression models at different exits. As expected, $v_*$ is larger in the regions where we observe less training data: \emph{out-of-distribution} (denoted with a white background) and \emph{in-between} (denoted with a  grey background \protect\scalerel*{\usebox{\boxgrey}}{\square}). Hence, $v_*$ can serve as an indicator for assessing the reliability of EENN-AVCSs.} 
   \label{fig:epi_uncertainty}
\end{figure}

 \subsection{Semantic Similarity using ALBERT}
\label{sec:exp_nlp}

% \begin{wrapfigure}{r}{0.5\textwidth}
%   \centering
%    \vspace{-2\baselineskip}
%    \includegraphics[width=0.48\textwidth]{figures/fig_nlp.pdf}

%    \caption{\tmlr{Comparison of our \our with CQR \citep{romano2019conformalized} and \base baselines on the NLP regression datasets.} Similar to findings on the synthetic data (c.f., Figure \ref{fig:toy_data_metrics}), EENN-AVCS attains perfect nestedness (\emph{upper} plot) while maintaining reasonably high marginal coverage across exits (\emph{middle} plot). However, the intervals generated by EENN-AVCS at each exit are larger compared to the baseline (\emph{bottom} row). Note that in the \emph{upper} plot, the nestedness curves of EENN-AVCS (\protect\orangeline), EENN-Bayes-intersection (\protect\blueline), and EENN-CQR-intersection (\protect\redbline)  overlap at $\mathfrak{N}(t) = 1$.}
%    \label{fig:sts_b_main}
%   \vspace{-1\baselineskip}
% \end{wrapfigure}

In this experiment, we examine the STS-B dataset from the GLUE Benchmark \citep{wang2018glue} and the SICK dataset \citep{marelli2014semeval}. For both, the task is  predicting the degree of semantic similarity between two input sentences. The similarity score is a continuous label ranging between 0 and 5, denoted as $\mathcal{Y} = [0, 5]$. %\footnote{Note that this is the only regression dataset that we have seen considered in the NLP EENN literature thus far \citep{zhou2020bert, cat2021}. }
As the backbone model, we employ ALBERT with 24 transformer layers \citep{lan2019albert}, providing the model an option to early exit after every layer. Bayesian linear regression models are fitted on the development set. At test time, we construct a single AVCS ($S=1$) with $\alpha = 0.05$.  We observed that constructing multiple AVCSs in parallel leads to a quicker decay of marginal coverage on this dataset. Since we know that the true label is within $[0, 5]$, we clip the resulting prediction intervals for all approaches to this region (if they should extend beyond it). Refer to Appendix \ref{app:sts_b_implement} for additional details on data, model, and training for this experiment.


Results are presented in Figure \ref{fig:sts_b_main}. Encouragingly, the observations here align qualitatively with those made on synthetic datasets in Section \ref{sec:exp_synthetic}. In the \emph{top} plot, considering only the current Bayesian ({\tikz[baseline=-0.5ex] \draw[mplblue, very thick, dotted] (0., 0.) -- (0.3, 0.);})  or conformal ({\tikz[baseline=-0.5ex] \draw[mplred, very thick, dotted] (0., 0.) -- (0.3, 0.);})  interval at each exit again results in non-nested uncertainty estimates.  Using the running intersection of \base's ({\tikz[baseline=-0.5ex] \draw[mplblue, very thick, solid] (0., 0.) -- (0.3, 0.);}) and CQR's ({\tikz[baseline=-0.5ex] \draw[mplred, very thick, solid] (0., 0.) -- (0.3, 0.);}) intervals rectifies this non-nestedness.  However, as shown in the \emph{middle} plot, using the running intersection results in a larger decay in marginal coverage.  EENN-AVCS's ({\tikz[baseline=-0.5ex] \draw[mplorange, very thick, solid] (0., 0.) -- (0.3, 0.);}) coverage does not suffer nearly to the same extent.  The marginal coverage in the case of the STS-B dataset is worse across all approaches when compared to the coverage observed on synthetic data experiments (cf.\ Figure \ref{fig:toy_data_metrics}). We attribute this to there being a larger shift between training, development, and test data splits for the STS-B dataset, as evidenced by the difference in model performance on each of those splits (see Appendix \ref{app:sts_b_implement} for further details). Finally, the \emph{bottom} plot reaffirms that the nestedness of EENN-AVCS comes at the expense of larger intervals.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=1.\linewidth]{figures/fig_nlp.pdf}

    \caption{Comparison of our \our with CQR \citep{romano2019conformalized} and \base baselines on the NLP regression datasets. Similar to findings on the synthetic data (c.f., Figure \ref{fig:toy_data_metrics}), EENN-AVCS attains perfect nestedness (\emph{upper} plot) while maintaining reasonably high marginal coverage across exits (\emph{middle} plot). However, the intervals generated by EENN-AVCS at each exit are larger compared to the baseline (\emph{bottom} row). Note that in the \emph{upper} plot, the nestedness curves of EENN-AVCS (\protect\orangeline), EENN-Bayes-intersection (\protect\blueline), and EENN-CQR-intersection (\protect\redbline)  overlap at $\mathfrak{N}(t) = 1$.}
   \label{fig:sts_b_main}
\end{figure}

%%While intersecting the \base ({\tikz[baseline=-0.5ex] \draw[mplblue, very thick, solid] (0., 0.) -- (0.3, 0.);}) and CQR ({\tikz[baseline=-0.5ex] \draw[mplred, very thick, solid] (0., 0.) -- (0.3, 0.);}) intervals rectifies this non-nestedness, it leads to a larger decay in marginal coverage compared to our EENN-AVCS ({\tikz[baseline=-0.5ex] \draw[mplorange, very thick, solid] (0., 0.) -- (0.3, 0.);}), as depicted in the \emph{middle} plot. 

\subsection{Image Classification with MSDNet}
\label{sec:exp_msdnet}

In the last experiment, we quantify uncertainty at every exit on an image classification task.  We consider CIFAR-10/100 \citep{krizhevsky2009learning} and ILSVRC 2012 \citep[ImageNet;][]{deng2009imagenet}. As our backbone EENN, we employ a Multi-Scale Dense Network \citep[MSDNet;][]{huang2018}, which consists of stacked convolutional blocks. At each exit, we map the logits to concentration parameters of the Dirichlet distribution using the ReLU activation function, as discussed in Section \ref{sec:class}. To find the exact ReLU thresholds at each exit, we allocate 20\% of the test dataset as a validation dataset and evaluate the performance on the remaining 80\%. We construct a single AVCS $(S=1)$ at each exit. We use significance level $\alpha=0.05$ for \our as well as for both baselines. % , i.e., \base and RAPS. 

\begin{figure}[htbp]
  \centering
  % \includegraphics[width=0.9\linewidth]{figures/fig_classification_multiple_threshold_laplace_baseline.pdf}
  \includegraphics[width=0.9\linewidth]{figures/fig_img_cls.pdf}

    \caption{Comparison of our \our with RAPS \citep{angelopoulos2020uncertainty} and \base baselines based on average nestedness (\textit{top}), marginal coverage (\textit{middle}), and average interval size (\textit{bottom}) for our image classification experiments using MSDNet as a backbone. EENN-AVCS is the only approach that attains perfect nestedness (\emph{top}) while maintaining high marginal coverage across different exits (\emph{middle}). Nestedness comes at a price, though, as EENN-AVCS sets are larger compared to the baseline ones (\emph{bottom}). Note that in the \emph{top} plot, the nestedness curves of EENN-AVCS (\protect\orangeline), RAPS-intersection (\protect\redbline), and \base-intersection (\protect\blueline) overlap at $\mathfrak{N}(t) = 1$.}
   \label{fig:class}
    % \vspace{-1\baselineskip}
\end{figure}

In Figure \ref{fig:class}, we observe that constructing conformal RAPS ({\tikz[baseline=-0.5ex] \draw[mplred, very thick, dotted] (0., 0.) -- (0.3, 0.);}) or Bayesian credible ({\tikz[baseline=-0.5ex] \draw[mplblue, very thick, dotted] (0., 0.) -- (0.3, 0.);}) sets at every exit independently leads to non-nested behavior (see \emph{top} row). Taking the intersection of RAPS sets ({\tikz[baseline=-0.5ex] \draw[mplred, very thick, solid] (0., 0.) -- (0.3, 0.);}) corrects this; however, as expected this leads to a violation of conformal marginal coverage guarantees (see \emph{middle} row). The same observations hold for the intersection of \base sets ({\tikz[baseline=-0.5ex] \draw[mplblue, very thick, solid] (0., 0.) -- (0.3, 0.);}). Encouragingly, as in our regression experiments, our \our based on the Dirichlet Prior Network ({\tikz[baseline=-0.5ex] \draw[mplorange, very thick, solid] (0., 0.) -- (0.3, 0.);}) yields perfect nestedness while maintaining high marginal coverage. In the \emph{bottom} row, we also see that \our sets are roughly two times (or less) larger than the sets from both baselines, which might be a reasonable price to pay for the nestedness.

\section{Conclusion}
We proposed using anytime-valid confidence sequences for predictive uncertainty quantification in EENNs. We showed that our approach yields nested prediction sets across exits---a property that is lacking in prior work, yet is crucial when deploying EENNs in safety-critical applications. We described the theoretical and practical challenges associated with using AVCSs for predictive tasks.  Moreover, we empirically validated our approach across a range of EENNs and datasets.  Our work is an important step towards models that are not only fast but also safe.

\paragraph{Limitations and Future Work} For future work, it is paramount to improve the efficiency of EENN-AVCSs, aiming for smaller intervals.  This is especially crucial for the initial exits, which are of the highest practical interest for resource-constrained settings. While we explored ways to reduce the set size (c.f., Appendix \ref{sec:app_speed}), further efforts are necessary to ensure faster convergence without sacrificing marginal coverage in the process. Additionally, studying alternatives to the predictive-likelihood ratio (c.f.,  Eq. (\ref{eq:eenn_avcs_ratio})) for constructing confidence sequences might be a promising way to improve efficiency.  Finally, from a theoretical standpoint, it would be interesting to study the behaviour of EENN-AVCS as the number of exits goes to infinity. Implicit deep models \citep{chen2018neural, bai2020multiscale} could be used to this end.

\section{Acknowledgments}

We thank Alexander Timans, Rajeev Verma, and Mona Schirmer for helpful discussions. We are also grateful to the anonymous reviewers who helped us improve our work with their constructive feedback. MJ and EN are generously supported by the Bosch Center for Artificial Intelligence. SM acknowledges support by the IARPA WRIVA program, the National Science Foundation (NSF) under the NSF CAREER Award 2047418; NSF Grants 2003237 and 2007719, the Department of Energy, Office of Science under grant DE-SC0022331, as well as gifts from Disney and Qualcomm.


% References
\bibliography{references}

\newpage

\onecolumn


\appendix

\section{Additional Results}

\subsection{Speeding up convergence of EENN-AVCS}
\label{sec:app_speed}

 In our original formulation in Section \ref{sec:avcs_eens}, we draw a single sample of the weights $\mW_t$ (or predictive distribution $\bmu_t$ in the case of classification) at each exit. This invariably leads to large prediction intervals/sets at the initial exits---a phenomenon analogous to AVCSs being large for the initial few observed data points in the conventional data streaming scenario \citep{howard2020time}. In this section, we explore two distinct approaches to mitigate this issue, aiming to attain more efficient confidence estimates right from the initial exits.

In the first approach, we simply take multiple samples  $S_t > 1$ at each exit. Consequently, the predictive likelihood ratio for a given test point $\vx^*$ takes the following form:
\begin{equation*}
\begin{split}
    & R_t(y ) \  \coloneqq \ \prod_{l=1}^t \prod_{s=1}^{S_l} \frac{p_l(y | \vx^*, \D)}{p(y | \vx^*, \mW_l^{(s)})}, \;\; \mW_l^{(s)} \sim p( \rmW_{l} | \D) \:.
\end{split}
\end{equation*}
We term this approach \emph{Multiple-Samples AVCS}. As an alternative, we construct multiple AVCSs $\{C_t^{(s)}\}_{s=1}^{S_t}$ based on a single sample in parallel. At each exit, we then consider their intersection $C_t^{\cap} = \bigcap_{s=1}^{S_t} C_t^{(s)}$ and pass it on to the next exit. We refer to this method as \emph{Parallel AVCS}.

We present the results for both approaches in Figure \ref{fig:app_speed} using synthetic datasets from Section \ref{sec:exp_synthetic}. While both methods yield more efficient, i.e., smaller, intervals in the initial exits (\emph{top} row), it is interesting to observe that the \emph{Multiple-Samples} approach leads to a much faster decay in marginal coverage compared to the \emph{Parallel} one (see \emph{bottom} row). We attribute this to the fact that by sampling multiple samples within a single confidence sequence at each exit, we are essentially `committing' more to our approximation of the updated posterior (c.f., Eq. (\ref{eq:assumption1})), which results in larger coverage violations. Hence, we recommend using the \emph{Parallel} approach when attempting to speed up the convergence of our \our. Nonetheless, we acknowledge that this area warrants further investigation, and we consider this an important direction for future work.


\begin{figure}[h]
    \centering
    \subfloat[\centering Wiggle]{{\includegraphics[width=0.43\textwidth]{figures/convergence_speed_wiggle.pdf} }}%
    \qquad
    \subfloat[\centering 3-Clusters]{{\includegraphics[width=0.43\textwidth]{figures/convergence_speed_3_clusters.pdf} }}%
    \caption{Average interval size and marginal coverage for regression synthetic datasets. While both of the considered approaches yield more efficient intervals (\emph{top} row), the \emph{Parallel} method is better at preserving high marginal coverage (\emph{bottom} row). AVCS$(S)$ denotes a confidence sequence based on $S$ samples at each exit in the case of \emph{Multiple-Samples}, and the sequence based on $S$ parallel ones in the case of \emph{Parallel}.}%
    \label{fig:app_speed}%
\end{figure}

\newpage

% \subsection{Epistemic uncertainty as a measure of stability}
% \label{sec:epi_uncertainty}
%  In our assumed Bayesian linear regression scenario (Section \ref{sec:eenn_blm}), both the posterior and updated posterior are Gaussian. This allows us to derive a closed-form expression for the KL term   $KL\big(p( \rmW_{t} | \D), \: p( \rmW_{t} | \D_*)\big)$ in the upper bound from Proposition \ref{prop:TV_bound}. See Appendix \ref{sec:epi_KL} for the derivation. We observe that KL remains small for a given $\vx_*$ when $v_*$  is small. Recall that $v_*$ represents the epistemic uncertainty in a Bayesian regression model (c.f. Eq. (\ref{eq:blr_main})), meaning the uncertainty stemming from the fact that we observe limited data. Such uncertainty decreases the more data we collect\footnote{$\lim_{N \rightarrow \infty} v_* = 0$ where $N$ represents the number of training datapoints (c.f. Section 3.3.2 in \cite{bishop2006pattern}).}, which, together with Proposition \ref{prop:TV_bound}, implies that the statistical coverage of our EENN-AVCS will improve with the increasing dataset size. Additionally,  $v^*$ is independent of the test label $y^*$. Thus, we can employ it as a measure of the stability of EENN-AVCS: for a given $\vx_*$, a higher $v^*$ can signal to the user that the resulting confidence sequence may not be entirely reliable. 

%  \begin{figure}[htbp]
%   \centering
%   \includegraphics[width=0.75\linewidth]{figures/fig_epistemic_uncertainty.pdf}

%     \caption{Average epistemic uncertainty $v_*$ (\protect\redline) across Bayesian linear regression models at different exits. As expected, $v_*$ is larger in the regions where we observe less training data: \emph{out-of-distribution} (denoted with a white background) and \emph{in-between} (denoted with a  grey background (\protect\scalerel*{\usebox{\boxgrey}}{\square})). Hence, $v_*$ can serve as an indicator for assessing the reliability of EENN-AVCSs.} 
%    \label{fig:epi_uncertainty}
% \end{figure}



\section{Supporting Derivations}

\subsection{Proof of Proposition \ref{prop:avcs}}
\label{app:sec_theory}
The proof can be divided into two steps. In the first step, we demonstrate that the predictive-likelihood ratio $R_t^*(y)$ in (\ref{eq:eenn_avcs_ratio}) is a non-negative martingale when evaluated at the true value $y^*$, with an initial value of one. In the second step, we utilize Ville's inequality to construct AVCS. Throughout this process, we closely adhere to the proof technique outlined in \cite{smith2020} (refer to Appendix B.1 in that work).

We begin the first step by showing that the expectation of the predictive-likelihood ratio evaluated at $y^*$ remains constant over time:
\begin{flalign*}
\mathbb{E}_{\rmW_{t+1}}[R_{t+1}^*(y^*) \: | \: \rmW_1, \ldots, \rmW_t] = \\ \int R_{t+1}^*(y^*) \: p (\rmW_{t+1} | \D \cup (\vx^*, y^*)) \: d\rmW_{t+1} \stackrel{(i)}{=} \\
\int R_{t+1}^*(y^*) \: \frac{p(y^* | \vx^*, \rmW_{t+1}) p(\rmW_{t+1} | \D)}{p_{t+1}(y^*| \vx^*,  \D)} \: d\rmW_{t+1} = \\
\int\prod_{l=1}^{t+1} \frac{p_l(y^* | \vx^*, \D)}{p(y^* | \vx^*, \rmW_l)} \: \frac{p(y^* | \vx^*, \rmW_{t+1}) p(\rmW_{t+1} | \D)}{p_{t+1}(y^*| \vx^*,  \D)} \: d\rmW_{t+1} = \\
\int \underbrace{\prod_{l=1}^{t} \frac{p_l(y^* | \vx^*, \D)}{p(y^* | \vx^*, \rmW_l)}}_{R_{t}^*(y^*)} \: \frac{\cancel{p_{t+1}(y^* | \vx^*, \D)}}{\cancel{p(y^* | \vx^*, \rmW_{t+1})}} \: \frac{\cancel{p(y^* | \vx^*, \rmW_{t+1})} \: p(\rmW_{t+1} | \D)}{\cancel{p_{t+1}(y^*| \vx^*,  \D)}} \: d\rmW_{t+1} = \\
\int R_{t}^*(y^*) \: p(\rmW_{t+1} | \D) \: d\rmW_{t+1} = \\ 
R_{t}^*(y^*) \int \: p(\rmW_{t+1} |  \D) \: d\rmW_{t+1} =  \\
R_{t}^*(y^*) \; ,
\end{flalign*}
where the step $(i)$ follows from the (sequential) Bayesian updating of the current posterior $p(\rmW_{t+1} |  \D)$ based on the new data-point $(\vx^*, y^*)$.


To show that the initial value is equal to one, we proceed similarly:
\begin{flalign*}
\mathbb{E}_{\rmW_{1}}[R_{1}^*(y^*) ] = \\ \int R_{1}^*(y^*) \: p (\rmW_{1} | \D \cup (\vx^*, y^*)) \: d\rmW_{1} = \\ 
\int R_{1}^*(y^*) \: \frac{p(y^* | \vx^*, \rmW_{1}) p(\rmW_{1} | \D)}{p_{1}(y^*| \vx^*,  \D)} \: d\rmW_{1} = \\
\int p(\rmW_{1} | \D) \: d\rmW_{1} = 1 =: R_0^* \; .
\end{flalign*}

In the second step, we make use of Ville's inequality, which provides a bound on the probability that a non-negative supermartingale exceeds a threshold $\beta > 0$.
\begin{flalign*}
    \pr\left( \exists t: R_t^*(y^{*}) \ge \beta \right) \le \mathbb{E}[R_0^*(y^*)] \:/ \: \beta \; .
\end{flalign*}
Since every martingale is also a supermartingale,  Ville's inequality is applicable in our case. Then, for a particular threshold $\alpha \in (0,1)$ and since we have a constant initial value (one), Ville's inequality implies: $\pr\left( \exists t: R_t^*(y^{*}) \ge 1/\alpha \right) \le \alpha$. If we define the sequence of sets as $C_t^* := \{ y \in \Y \: |  \: R_t^*(y) \le 1 / \alpha\}$, their validity can be shown as
\begin{flalign*}
    \pr(\forall t, y^* \in C_t^*) =  \pr(\forall t, R_t^*(y^*) \le 1 / \alpha) = \\ 1 - \pr(\exists t: R_t^*(y^*) \ge 1 / \alpha) \ge 1 - \alpha \;,
\end{flalign*}
which concludes the proof.

\subsection{Proof of Proposition \ref{prop:TV_bound}}
\label{sec:app_TV_bound_proof}
We first note that due to $C_t^*$ being a valid $(1-\alpha)$ confidence sequence, we have
\begin{flalign}
\label{eq:app_valid_cs}
    P(\exists l \in [t], y^* \notin C_l^*) \le  P(\exists l \in [T], y^* \notin C_l^*) \le  \alpha \: ,
\end{flalign}
where we adopt the notation $[t] := \{1,\ldots,t\}$ for brevity. Additionally, we observe that the randomness in $P(\exists l \in [t], y^* \notin C_l)$ and $P(\exists l \in [t], y^* \notin C_l^*)$ comes from $p(\rmW_1,\ldots, \rmW_{t} | \D)$ and $p( \rmW_1,\ldots, \rmW_{t} | \D_*)$, respectively. Hence, we can use the total variation (TV) distance to upper bound the difference:
\begin{flalign*}
    P(\exists l \in [t], y^* \notin C_l)  - P(\exists l \in [t], y^* \notin C_l^*) \le \\ \big|P(\exists l \in [t], y^* \notin C_l)  - P(\exists l \in [t], y^* \notin C_l^*)\big| \le \\
    TV\big(p( \rmW_1,\ldots, \rmW_{t} | \D), \: p( \rmW_1,\ldots, \rmW_{t} | \D_*)\big) \:.
\end{flalign*}

Next, we apply the Bretagnolle--Huber inequality \citep{bretagnolle1979estimation} to upper bound the TV distance in terms of the KL divergence and use the fact that the weights at different exits are independent, which gives rise to a factorized joint distribution:

\begin{flalign*}
    TV\big(p( \rmW_1,\ldots, \rmW_{t} | \D), \: p( \rmW_1,\ldots, \rmW_{t} | \D_*)\big) \le \\ 
    \sqrt{1 - e^{- KL\big(p( \rmW_1,\ldots, \rmW_{t} | \D), \: p( \rmW_1,\ldots, \rmW_{t}| \D_*)\big)}} \le \\
    \sqrt{1 - e^{-\sum_{l=1}^t KL\big(p( \rmW_{l} | \D), \: p( \rmW_{l} | \D_*)\big)}}
\end{flalign*}
 Rearranging the terms and using (\ref{eq:app_valid_cs}), the proposition follows
 \begin{flalign*}
     P(\exists l \in [t], y^* \notin C_l) \le \\
     P(\exists l \in [t], y^* \notin C_l^*) + \sqrt{1 - e^{-\sum_{l=1}^t KL_l}} \le \\
     \alpha + \sqrt{1 - e^{-\sum_{l=1}^t KL_l}}
 \end{flalign*}
 where $KL_l := KL\big(p( \rmW_{l} | \D), \: p( \rmW_{l} | \D_*)\big)$.

\subsection{Bayesian Linear Regression}
\label{app:sec_bayes_lin_reg}
 In Section \ref{sec:eenn_blm}, we define the predictive model at the $t$th exit as a linear model $f(\vx; \mW_t, \mU_{1:t}) = h(\vx; \mU_{1:t})^T \mW_t$. For notational brevity, we omit $\mU_{1:t}$  and denote $h(\vx; \mU_{1:t})$ as $h_t(\vx)$ in this section. Additionally, let $\vy = [y_1, \ldots, y_N]^T \in \mathbb{R}^N$ and $\Hb_t = [h_t(\vx_1), \ldots, h_t(\vx_N)]^T \in \mathbb{R}^{N \times H}$ represent a concatenation of training labels and (deep) features, respectively. Assuming a Gaussian likelihood $\N\left(\ry ; h_t(\vx)^T \rmW_t, \sigma_{t}^2\right)$ and a prior $ \N\left(\rmW_t;  \boldsymbol{0}, \sigma_{w, t}^2 \mathbb{I}_H\right)$, the posterior over weights $\rmW_t$ has the following form \citep{bishop2006pattern}:
 \begin{flalign*}
     p(\rmW_t | \D) = \N \left(\rmW_t ; \pmu_t, \psig_t \right) \: , \\
     \pmu_t = \frac{1}{\sigma_{t}^2} \psig_t \Hb_t^T \vy \: ,\\
     \psig_t^{-1} = \frac{1}{\sigma_{t}^2} \Hb_t^T \Hb_t + \frac{1}{\sigma_{w, t}^2}  \mathbb{I}_H \: .
 \end{flalign*}
Similarly, for a new test point $\vx^*$, the posterior predictive can be obtained in a closed-form:
 \begin{flalign*}
p_t(\ry | \vx^*, \D) = \N \left(\ry ; h_t(\vx^*)^T \pmu_t,  h_t(\vx^*)^T \psig_t  h_t(\vx^*) + \sigma_{t}^2\right).
\end{flalign*}

For the exact derivation of both distributions above, we refer the interested reader to Section 3.3 of \cite{bishop2006pattern}.



\subsection{Solving for Interval Endpoints}
\label{app:sec_quad_roots}
Due to the assumed Bayesian linear regression model at each exit $t$, $\log R_t$ is a convex quadratic function in $y$:
\begin{flalign*}
    \log R_t(y) = \\ \sum_{l=1}^t \big( \log p_l(y | \vx^*, \D) - \log p(y | \vx^*, \mW_l) \big) = \\ \alpha_t (\vx^*) \cdot y^2 + \beta_t (\vx^*, \mW_{1:t}) \cdot y + \gamma_t(\vx^*, \mW_{1:t}) \: .
\end{flalign*}
Coefficients have the following form:
\begin{flalign*}
    \alpha_t(\vx^*) = \frac{1}{2} \sum_{l=1}^t \bigg(\frac{1}{\sigma_{l}^2} -  \frac{1}{v_{l}^* + \sigma_{l}^2} \bigg) \; , \\
    \beta_t(\vx^*, \mW_{1:t}) =  \sum_{l=1}^t \bigg( \frac{h_l(\vx^*)^T \pmu_l}{v_{l}^* + \sigma_{l}^2} -  \frac{h_l(\vx^*)^T \mW_l}{\sigma_{l}^2} \bigg) \; , \\
    \gamma_t(\vx^*, \mW_{1:t}) = \\ \frac{1}{2} \sum_{l=1}^t \bigg( \frac{(h_l(\vx^*)^T \mW_l)^2}{\sigma_{l}^2} -  \frac{(h_l(\vx^*)^T \pmu_l)^2}{v_{l}^* + \sigma_{l}^2} + \log \frac{\sigma_{l}^2}{v_{l}^* + \sigma_{l}^2} \bigg) \; ,
\end{flalign*}
where $v_{l}^* := h_l(\vx^*)^T \psig_l  h_l(\vx^*)$, and we provide expressions for $h_l, \pmu_l, \psig_l$ in Appendix \ref{app:sec_bayes_lin_reg}. It is easy to show that $\alpha_t \ge 0$, from which the convexity follows.


To find AVCS $C_t = \{ y \in \Y \: |  \: R_t(y) \le 1 / \alpha\}$, we look for the roots of the equation $\log R_t(y) - \log (1  / \alpha) = 0$. This yields an analytical expression for $C_t = [y_L^t, y_R^t]$ : 
\begin{flalign*}
    y_{L,R}^t = \frac{-\beta_t \pm \sqrt{\beta_t^2 - 4 \alpha_t \tilde{\gamma}_t }}{2 \alpha_t}
\end{flalign*}
where $\tilde{\gamma}_t = \gamma_t + \log \alpha$. See Figure \ref{fig:log_R} for a concrete example of log-ratios. 

% \begin{wrapfigure}{r}{0.5\textwidth}
%   \centering
%    \vspace{-2\baselineskip}
%    \includegraphics[width=0.48\textwidth]{figures/fig_log_R_quadratic.pdf}

%    \caption{Plot of $\log R_t(y)$ at various exits $t$ for a randomly selected test data point $(\vx^*, y^*)$ from the \textit{3-clusters} dataset. As described in Appendix \ref{app:sec_quad_roots}, we observe that the log-ratios exhibit a quadratic shape, allowing for an analytical solution for the endpoints of confidence intervals $C_t$.}
%    \label{fig:log_R}
%   \vspace{-5\baselineskip}
% \end{wrapfigure}

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.5\linewidth]{figures/fig_log_R_quadratic.pdf}

    \caption{Plot of $\log R_t(y)$ at various exits $t$ for a randomly selected test data point $(\vx^*, y^*)$ from the \textit{3-clusters} dataset. As described in Appendix \ref{app:sec_quad_roots}, we observe that the log-ratios exhibit a quadratic shape, allowing for an analytical solution for the endpoints of prediction intervals $C_t$.}
   \label{fig:log_R}
\end{figure}


\subsection{Epistemic Uncertainty and KL Divergence}
\label{sec:epi_KL}
To compute the KL divergence between the posterior and the updated posterior in the Bayesian linear regression model (c.f. Appendix \ref{app:sec_bayes_lin_reg}), we first use Bayes' rule to rewrite the latter as:
\begin{flalign*}
    p( \rmW_{t} | \D_*) = \frac{p(y^* | \vx^*, \rmW_t) \: p(\rmW_t | \D)}{p_t(y^* | \vx^*, \D)} \: .
\end{flalign*}
Using the definition of the KL divergence together with the formulas for posterior predictive and posterior distributions from Appendix \ref{app:sec_bayes_lin_reg}, we proceed as
\begin{flalign*}
    KL\big(p( \rmW_{t} | \D), \: p( \rmW_{t} | \D_*)\big) = \\
    \mathbb{E}_{p( \rmW_{t} | \D)} \bigg[\log \frac{p( \rmW_{t} | \D)}{p( \rmW_{t} | \D_*)} \bigg] = \\
    \log p_t(y^* | \vx^*, \D) - \mathbb{E}_{p(\rmW_t | \D)} \big[\log p(y^* | \vx^*, \rmW_t) \big] = \\
    0.5 \bigg( \log \big( \frac{\sigma_t^2}{\sigma_t^2 +  v^*_t}\big) + \big(\frac{1}{\sigma^2_t} - \frac{1}{\sigma_t^2 + v^*_t}\big) r_*^2 + \frac{v^*_t}{\sigma^2_t} \bigg)
\end{flalign*}
where $r_* = y^* - \pmu_t^T h_t(\vx^*)$ represents the residual, $v^*_t = h_t(\vx^*)^T \psig_t h_t(\vx^*)$ denotes the epistemic uncertainty, and $\sigma_t^2$ is the variance of the Gaussian likelihood at exit $t$. Based on the obtained expression, it is evident that a small $v^*_t$ implies a small KL divergence.

\newpage

\section{Implementation Details}
\subsection{Synthetic Data Experiments}
\label{app:sec_implement}
\paragraph{Data Generation} We closely follow the data-generation process from \cite{antoran2020depth}. Specifically, for the \textit{wiggle} dataset, we sample $N$ points from \[ y = \sin(\pi x) + 0.2 \cos(4 \pi x) - 0.3 x + \epsilon , \] where $\epsilon \sim \N(0, 0.25)$ and $x \sim \N(5, 2.5)$. For the \textit{3-clusters} dataset, we simulate data via \[ y = x - 0.1x^2 + \cos(x \pi / 2) + \epsilon , \] where $\epsilon \sim \N(0, 0.25)$ and we sample $N/3$ values of $x$ from each of the intervals $[-1, 0]$, $[1.5, 2.5]$, and $[4, 5]$. For both datasets, we sample a total of $N=900$ points and allocate $80\%$ of the data for training, while the remaining $20\%$ constitutes the test dataset.
\paragraph{Model Architecture} Our EENN is composed of an input layer and $T=15$ residual blocks. The residual blocks consist of a \texttt{Dense} layer (with $M=20$ hidden units), followed by a \texttt{ReLU} activation and \texttt{BatchNorm} (with default \texttt{PyTorch} parameters). We attach an output layer at each residual block to facilitate early exiting. 

\paragraph{Training} We train our EENN for $500$ epochs using \texttt{SGD} with a learning rate of $1 \times 10^{-3}$, a momentum of $0.9$, and a weight decay of $1 \times 10^{-4}$. For the loss function, we use the average mean-square error (\texttt{MSE}) across all exits.

\newpage

\subsection{Semantic Textual Similarity Experiment}
\label{app:sts_b_implement}
\begin{wrapfigure}{r}{0.5\textwidth}
  \centering
   \vspace{-2\baselineskip}
   \includegraphics[width=0.48\textwidth]{figures/albert.pdf}

   \caption{Mean Absolute Error (MAE) performance of the \texttt{ALBERT-large} model across different datasets: train, development (dev), and test. A large performance gap between the train and dev/test datasets is observed. Note that in our work, we reuse the exact model and training setup from previous approaches \citep{zhou2020bert}.}
   \label{fig:albert_errors}
  \vspace{-1\baselineskip}
\end{wrapfigure}

\paragraph{Datasets} We use the STS-B dataset, the only regression dataset in the GLUE benchmark \citep{wang2018glue}, as well as the SICK dataset \citep{marelli2014semeval}. The task is to measure the semantic similarity $y \in [0, 5]$ between the two input sentences. For STS-B, the training, development, and test datasets consist of 5.7K, 1.5K, and 1.4K datapoints, respectively. For SICK, the training, development, and test datasets consist of 4.4K, 2.7K, and 2.7K datapoints, respectively.

\paragraph{Model Architecture and Training} For the model architecture and training, we reuse the code from \cite{zhou2020bert}. Specifically, we work with \texttt{ALBERT-large}, which is a 24-layer transformer model. To facilitate early exiting, a regression head is attached after every transformer block.

\paragraph{EENN-AVCS} In the results presented in the main text, we construct a single ($S=1$) AVCS at test time with $\alpha=0.05$. To fit the Bayesian linear regression models (i.e., empirical Bayes) at every exit, we use the development set. Note that this contrasts with our experiments on the synthetic dataset (c.f., Section \ref{sec:exp_synthetic}) where we utilized the training dataset for this purpose. We observed that when fitting the regression model on the training dataset for STS-B, the noise parameters $\hat{\sigma}_t$ get underestimated, resulting in a rapid decay of marginal coverage for both \our and \base. We attribute this to a distribution shift present in the STS-B dataset, which is evident based on the different performances (MAE) that the ALBERT model achieves on different datasets, as seen in Figure \ref{fig:albert_errors}.





\newpage
\section{EENN-AVCS Algorithm}
Here, we outline in detail the implementation of our EENN-AVCS model. In Algorithm~\ref{algo}, we present EENN-AVCS for regression tasks. We start by fitting a Bayesian posterior model $p(\rmW_t | \D)$ at every exit using the training data $\D$ (c.f. Appendix \ref{app:sec_bayes_lin_reg}). To estimate the observation noise $\hat{\sigma}_t$ at every exit, we perform empirical Bayes (type-II maximum likelihood). Then, for a given test point $\vx^*$, we first sample the weights from the posterior and compute the epistemic uncertainty $v_t^*$ at every exit. Next, we use the obtained quantities to update the coefficients of the (logarithm of) predictive-likelihood ratio $R_t$ (c.f. Appendix \ref{app:sec_quad_roots}). To get the prediction interval at a given exit, we then solve the quadratic equation based on the updated coefficients from the previous step (c.f. Appendix \ref{app:sec_quad_roots}). Finally, we take the running intersection with the intervals obtained at the previous exits. In case the intersection results in an empty interval, we stop evaluating exits and label the given test point $\vx^*$ as an out-of-distribution (OOD) example (c.f. \textit{Detecting Violations of Posterior Stability} in Section \ref{sec:avcs_eens}). 

In Algorithm~\ref{algo_class}, we present EENN-AVCS for classification tasks. To determine the concentration parameters $\boldsymbol{\alpha}_t$ of the Dirichlet distribution at each exit for a given test point $\vx^*$, we apply a ReLU activation to the logits from the backbone EENN, retaining only the classes that ``survive'' the ReLU. We then sample from the Dirichlet distribution to obtain the denominator part of the predictive-likelihood ratio $R_t$ (refer to Section \ref{sec:class}). For the numerator part of $R_t$, we calculate the (closed-form) posterior distribution using the concentration parameters at a specific exit. To create a predictive set at a given exit, we iterate over classes and include only those classes in the set for which the predictive-likelihood ratio
 $R_t$ is at most $1/ \alpha_S$. Finally, as in the regression case, we consider the running intersection with all sets computed at previous exits. We label the test example $\vx^*$ as out-of-distribution (OOD) if the set collapses to an empty set.
% \begin{algorithm}
% \setstretch{1.2}
%  \caption{EENN-AVCS Regression\label{algo}}
% \DontPrintSemicolon
% \SetAlgoLined
% \SetNoFillComment
% \SetKwInOut{Input}{input}
% \SetKwInOut{Output}{output}
% \Input{Backbone EENN $\{h(\cdot | \mU_{1:t})\}_{t=1}^T$, Regression models $\{p(\rmW_t | \D), \: \hat{\sigma}_{ t}^2\}_{t=1}^T$, \\ test datapoint $\vx^*$, significance level $\alpha$}
% \Output{AVCS for $\vx^*$}
% \BlankLine

% $C_0 = \mathcal{Y}$ \;
% $\alpha_0, \beta_0, \gamma_0 = 0, 0, \log \alpha$ \;
%  \For{$t=1,..., T$}{
%     $\mW_t \sim p(\rmW_t | \D) = \mathcal{N} (\rmW_t | \pmu_t, \psig_t)$ \;
%     $v_{t}^* := h_t(\vx^*)^T \psig_t  h_t(\vx^*)$ \;
%     \textcolor{mplgreen}{\texttt{\#} update coefficients of $ \log R_t(y)$ } \;
%     $\alpha_t \mathrel{+}= \frac{1}{2} \: (\frac{1}{\hat{\sigma}_{t}^2} - \frac{1}{v_{t}^* + \hat{\sigma}_{t}^2}) $ \;
%     $\beta_t \mathrel{+}= \frac{h_t(\vx^*)^T \pmu_t}{v_{t}^* + \hat{\sigma}_{t}^2} -  \frac{h_t(\vx^*)^T \mW_t}{\hat{\sigma}_{t}^2}$ \;
%     $\gamma_t \mathrel{+}= \frac{1}{2} \big( \frac{(h_t(\vx^*)^T \mW_t)^2}{\hat{\sigma}_{t}^2} -  \frac{(h_t(\vx^*)^T \pmu_t)^2}{v_{t}^*+ \hat{\sigma}_{t}^2} + \log \frac{\hat{\sigma}_{t}^2}{v_{t}^* + \hat{\sigma}_{t}^2} \big) $ \;
%     \textcolor{mplgreen}{\texttt{\#} find the roots of quadratic equation}
%     $y_{L,R}^t = \frac{-\beta_t \pm \sqrt{\beta_t^2 - 4 \alpha_t \gamma_t }}{2 \alpha_t}$ \;
%      $C_t = C_{t - 1} \cap [y_L^t, y_R^t]$ \;
%      \If{$C_t = \emptyset$}
% 		{\Return $\emptyset$ \textcolor{mplgreen}{\texttt{\#} OOD}}
     

%  }
%  \Return{$\{C_t\}_{t=1}^T$}

% \end{algorithm}



\begin{figure}[htbp]
  \centering
  % \vspace{-2\baselineskip}
  % \vspace{-3em}
  \begin{minipage}[t]{0.48\textwidth}
    \begin{algorithm}[H]
        \setstretch{1.2}
         \caption{EENN-AVCS Regression\label{algo}}
        \DontPrintSemicolon
        \SetAlgoLined
        \SetNoFillComment
        \SetKwInOut{Input}{input}
        \SetKwInOut{Output}{output}
        \Input{Backbone EENN $\{h(\cdot | \mU_{1:t})\}_{t=1}^T$, Regression models $\{p(\rmW_t | \D), \: \hat{\sigma}_{ t}^2\}_{t=1}^T$, \\ test datapoint $\vx^*$, significance level $\alpha_S$}
        \Output{AVCS for $\vx^*$}
        \BlankLine
        
        $C_0 = \mathcal{Y}$ \;
        $\alpha, \beta, \gamma = 0, 0, \log \alpha_S$ \;
         \For{$t=1,..., T$}{
            $\mW_t \sim p(\rmW_t | \D) = \mathcal{N} (\rmW_t | \pmu_t, \psig_t)$ \;
            $v_{t}^* := h_t(\vx^*)^T \psig_t  h_t(\vx^*)$ \;
            \textcolor{mplgreen}{\texttt{\#} update coefficients of $ \log R_t(y)$ } \;
            $\alpha \mathrel{+}= \frac{1}{2} \: (\frac{1}{\hat{\sigma}_{t}^2} - \frac{1}{v_{t}^* + \hat{\sigma}_{t}^2}) $ \;
            $\beta \mathrel{+}= \frac{h_t(\vx^*)^T \pmu_t}{v_{t}^* + \hat{\sigma}_{t}^2} -  \frac{h_t(\vx^*)^T \mW_t}{\hat{\sigma}_{t}^2}$ \;
            $\gamma \mathrel{+}= \frac{1}{2} \big( \frac{(h_t(\vx^*)^T \mW_t)^2}{\hat{\sigma}_{t}^2} -  \frac{(h_t(\vx^*)^T \pmu_t)^2}{v_{t}^*+ \hat{\sigma}_{t}^2} + \log \frac{\hat{\sigma}_{t}^2}{v_{t}^* + \hat{\sigma}_{t}^2} \big) $ \;
            \textcolor{mplgreen}{\texttt{\#} find the roots of quadratic equation} \;
            $y_{L,R}^t = \frac{-\beta \pm \sqrt{\beta^2 - 4 \alpha \gamma }}{2 \alpha}$ \;
             $C_t = C_{t - 1} \cap [y_L^t, y_R^t]$ \;
             \If{$C_t = \emptyset$}
        		{\Return $\emptyset$ \textcolor{mplgreen}{\texttt{\#} OOD}}
             
        
         }
         \Return{$\{C_t\}_{t=1}^T$}
        
        \end{algorithm}
  \end{minipage}
  \hfill  % This adds a space between the two minipages
  \begin{minipage}[t]{0.48\textwidth}
    \begin{algorithm}[H]
        \setstretch{1.2}
         \caption{EENN-AVCS Classification\label{algo_class}}
        \DontPrintSemicolon
        \SetAlgoLined
        \SetNoFillComment
        \SetKwInOut{Input}{input}
        \SetKwInOut{Output}{output}
        \Input{Backbone EENN $\{f(\cdot | \mU_{1:t}, \mW_t)\}_{t=1}^T$, ReLU thresholds $\{\tau_t\}_{t=1}^T$, \\ test datapoint $\vx^*$, significance level $\alpha_S$}
        \Output{AVCS for $\vx^*$}
        \BlankLine
        
        $C_0 = \mathcal{Y}$ \;
        $R = [1, \ldots, 1]$ \;
         \For{$t=1,..., T$}{
         \textcolor{mplgreen}{\texttt{\#} get concentration parameters, only keep classes that ``survive'' \texttt{ReLU} } \;
         $\boldsymbol{\alpha}_t = \texttt{ReLU}(f(\vx^* | \mU_{1:t}, \mW_t), \tau_t)$ \;
            $\tilde{\boldsymbol{\alpha}}_t = \boldsymbol{\alpha}_t[\boldsymbol{\alpha}_t > 0]$ \;
            \BlankLine
            $\bpi_t \sim \texttt{Dir}(\tilde{\boldsymbol{\alpha}}_t)$ \;
            $S_t = \sum_k \alpha_{t, k}$ \;
            $C_t = [\:] $ \;
            \textcolor{mplgreen}{\texttt{\#} update the predictive-likelihood ratio} \;
            \For{$k = 1, \ldots, K$}{
                \If{$\alpha_{t, k} > 0$}{$R[k] \mathrel{*}= \frac{\alpha_{t, k} / S_t}{\pi_{t, k}}$}
                \Else{$R[k] = \infty$}
                \If{$R[k] \le \frac{1}  {\alpha_S}$}{
                $C_t\texttt{.append}(k)$ \;
                }
            }
            $C_t = C_{t} \cap C_{t - 1}$ \;
             \If{$C_t = \emptyset$}
        		{\Return $\emptyset$ \textcolor{mplgreen}{\texttt{\#} OOD}}
             
        
         }
         \Return{$\{C_t\}_{t=1}^T$}
        \end{algorithm}
  \end{minipage}
  %\vspace{-2\baselineskip}
   % \vspace{-1em}
\end{figure}






\end{document}
