\newpage
\onecolumn

%\vspace{12cm}
%\let\cleardoublepage\clearpage
\title{Calibrated Regression Against An Adversary
Without Regret (Supplementary Material)}
\maketitle
% \section*{\centering Calibrated Regression Against An Adversary
% Without Regret (Supplementary Material)}
\appendix


\section{Correctness of the Recalibration Procedure}


\label{app:proofs}

In the appendix, we provide the proofs of the theorems from the main part of the paper.

\paragraph{Notation}
We use $\Ind_E$ to denote the indicator function of $E$, $[N]$ and $[N]_0$ to (respectively) denote the sets $\{1,2,...,N\}$ and $\{0,1,2,...,N\}$, and $\Delta_d$ to denote a $d$-dimensional simplex.

\paragraph{Setup}

We place our work in the framework of online learning 
\citep{shalev2007phd}.
%
At each time step $t = 1,2,...$, we are given features $x_t \in \mathcal{X} $. We use a forecaster $H : \mathcal{X} \to \mathcal{P}$ to produce a prediction $p_t = H(x_t)$, $p_t \in \mathcal{P}$ in the set of distributions $\mathcal{P}$ over a target $y \in \mathcal{Y}$.
Nature then reveals the true target $y_t \in \mathcal{Y}$ and we incur a loss of $\ell(y_t, p_t)$, where $\ell : \mathcal{Y} \times \mathcal{P}  \to \Rb^+$ is a loss function.
The forecaster $H$ updates itself based on $x_t, y_t$, and we proceed to time $t+1$.

Unlike in classical machine learning, we do not assume that the $x_t, y_t$ are i.i.d.: they can be random, deterministic or even chosen by an adversary. Online learning algorithms feature strong performance guarantees in this regime, where performance is usually measured in terms of regret $R_T(q)$ relative to a constant prediction $q$, $R_T(q) = \sum_{t=1}^T \ell(y_t, p_t)  - \ell(y_t, q). $
The worst-case regret at time $T$ equals $R_T = \max_{q \in \mathcal{P}} R_T(q)$.

In this paper, the predictions $p_t$ are probability distributions over the outcome $y_t$. We focus on regression, where $y_t \in \mathbb{R}$ and the prediction $p_t$ can be represented by a cumulative distribution function (CDF), denoted $F_t : \mathbb{R} \to [0,1]$ and defined as $F_t(z) = p_t(y \leq z)$.

\paragraph{Learning with expert advice}

A special case of this framework arises when each $x_t$ represents advice from $N$ {\em experts}, and $H$ outputs $p_t \in \Delta_{N-1}$, a distribution over experts. Nature reveals an outcome $y_t$, resulting in an expected loss of $\sum_{i=1}^N p_{ti} \ell(y_t, a_{ti})$, where $\ell(y_t, a_{ti})$ is the loss under expert $i$'s advice $a_{ti}$. Performance in this setting is measured using two notions of regret.
%Note that this is a special case of the online optimization setting with $f_t(w_t) = \sum_{i=1}^N w_{ti} \ell(i, y_t)$.
%
%In the setting of learning with expert advice, we use slightly different notions of regret.
%The first type of regret --- external regret --- is an analogue of online optimization regret. Internal regret is a stronger notion needed to construct calibrated algorithms.
%
\begin{defn}
The external regret $\extR_T$ and the internal regret $\intR_T$ are defined as
\begin{align*}
\extR_T = \sum_{t=1}^T \ellavg(y_t, p_t)  - \min_{i \in [N]} \sum_{t=1}^T \ell(y_t, a_{ti}) &&
\intR_T  = \max_{i,j \in [N]} \sum_{t=1}^T p_{ti} \left( \ell(y_t, a_{ti})  - \ell(y_t, a_{tj}) \right),
\end{align*}
where $ \ellavg(y, p) = \sum_{i=1}^N p_i \ell(y, a_{ti}) $ is the expected loss.
\end{defn}
\paragraph{Calibration for online binary calibration}

For now, we focus on the $\ell_1$ norm, and we define the calibration error of a forecaster $\Fcal$ as
\begin{equation}
C_{T} = \sum_{i=0}^N \left| \rho_T(i/N) - \frac{i}{N} \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{\{p_t = \frac{i}{N}\}} \right),
\end{equation}
where $\rho_T(p) = \frac{\sum_{t=1}^T y_t \Ind_{p_t = p}}{\sum_{t=1}^T \Ind_{p_t = p}}$ denotes the frequency at which event $y = 1$ occurred over the times when we predicted $p$.

We further define the calibration error when $\Fcal_j$ predicts $i/N$ as 
\begin{align*}
C^{(j)}_{T,i} & = \left| \rjt(i/N) - \frac{i}{N} \right| \left( \frac{1}{T_j} \sum_{t=1}^T \wsupj_{t,i} \right) 
% &
% C_{T,i} & = \left|  \rho_T(i/N) - \frac{i}{N} \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{t,i} \right),
\end{align*}
where $ \wsupj_{t,i} = \Ind\{p_t = \frac{i}{N} \cap \palg_t \in [\frac{j-1}{M},\frac{j}{M})\} $ is an indicator for the event that $\Fcal_j$ is triggered at time $t$ and predicts $i/N$.
Similarly, $ \Ind_{t,i} = \Ind\{p_t = i/N\} = \sum_{j=1}^M \wsupj_{t,i} $ indicates that $i/N$ was predicted at time $t$, and $T_j = \sum_{t=1}^T \sum_{i=0}^N \wsupj_{t,i}$ is the number of calls to $\Fcal_j$.
Also,
\begin{align*}
& \rjt(i/N) = \frac{\sum_{t=1}^T \wsupj_{t,i} y_t}{\sum_{t=1}^T \wsupj_{t,i}}
% & \rho_T(i/N) = \frac{\sum_{t=1}^T \Ind_{t,i} y_t}{\sum_{t=1}^T \Ind_{t,i}}. 
%= \frac{\sum_{i=1}^N \sum_{t=1}^T \wsupj_{t,j} y_t}{\sum_{i=1}^N \sum_{t=1}^T \wsupj_{t,j}} 
\end{align*}
is the empirical success rate for $\Fcal_j$. 
% and \algorithmref{recal} respectively.

Note that with these definitions, we may write the calibration losses of $\Fcal_j$
% and \algorithmref{recal} 
as $ C^{(j)}_{T} = \sum_{i=0}^N C^{(j)}_{T,i}$.
% and $ C_{T} = \sum_{i=0}^N C_{T,i}$.

\paragraph{Calibration for regression}

A sequence of forecasts $F_t$ achieves online quantile calibration if, for all $y \in \mathcal{Y}$ and all $p \in \mathcal{P}$,
$ \rho_T(y, p) \to p $ a.s.~as $T \to \infty$, where
\begin{equation}
\rho_T(y, p) = \dfrac{\sum_{t=1}^T \Ind_{y_t \leq y, F_t(y) = p}}{\sum_{t=1}^T \Ind_{F_t(y) = p}}
\end{equation}
% $$ \rho_T(y, p) = \dfrac{\sum_{t=1}^T \Ind_{y_t \leq y, F_t(y) = p}}{\sum_{t=1}^T \Ind_{F_t(y) = p}}$$
In other words, out of the times when the predicted probability $F_t(y')$ for $\{y_t \leq y'\}$ is $p$, the event $\{y_t \leq y'\}$ holds a fraction $p$ of the time.

We also seek to quantify more precisely the calibration of Algorithm \ref{algo:recal}, specifically to compare $\rho_T(y,p)$ with $p$. We define for this the quantity
\begin{align*}
C_{T,i}(y) = \left|  \rho_T(y, i/N) - \frac{i}{N} \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{t,i} \right),
\end{align*}
and we define the calibration loss of Algorithm \ref{algo:recal} at $y$ as $C_{T}(y) = \sum_{i=0}^N C_{T,i}(y)$.

% We define the the quantile calibration error with resolution $N$ of of a sequence of forecasts $F_t$ as
% \begin{align}
% C_T^p & = \sum_{i=0}^N \left| \rho_T(i/N) - \frac{i}{N} \right|^p \left( \frac{1}{T} \sum_{t=1}^T \Ind_{\{p_t \leq \frac{i}{N}\}} \right). \label{eqn:cal_loss}
% \end{align}
% where $\rho_T(p) = \dfrac{\sum_{t=1}^T \Ind_{F_t(y_t) \leq p}}{T}$ is the empirical frequency of the event $F_t(y_t) \leq p$.

\paragraph{Proper losses}

The quality of probabilistic forecasts is evaluated using {\em proper} losses $\ell$. Formally, 
a loss $\ell(y, p)$ is proper if
$p \in \arg\min_{q \in \mathcal{P}} \Exp_{y \sim (p)} \ell(y, q) \; \forall p \in \mathcal{P}.$ 
%And example is the log-loss $\ell_\text{log}(y,p) = y\log(p) + (1-y)\log(1-p)$. 
An important proper loss for CDF predictions $F$ is the continuous ranked probability score, defined as
\begin{equation}
    \ell_\text{CRPS}(y, F) = \int_{-\infty}^\infty (F(z) - \mathbb{I}_{y \leq z})^2 dz.
\end{equation}
% $$\ell_\text{CRPS}(y, F) = \int_{-\infty}^\infty (F(z) - \mathbb{I}_{y \leq z})^2 dz.$$

\subsection{Assumptions}

We assume that each subroutine $\Fcal$ is an instance of a binary calibrated forecasting algorithm (e.g., the methods introduced in Chapter 4 in \cite{cesabianchi2006prediction}) that produces predictions in $[0,1]$ that are $(\e, \ell_2)$-calibrated and that $C^2_{T} \leq R_{T} + \e$ uniformly ($R_{T} = o(1)$ as $T \to \infty$; $T$ is the number of calls to instance $\Fcal_j$).
%
We also assume that for each $t$, the target $y_t$ lies in some bounded interval $\mathcal{Y}$ of $\mathbb{R}$ of length at most $B$.

\subsection{Online Calibrated Regression}

First, we look at algorithms for online calibrated regression (without covariates). Our algorithms leverage classical online binary calibration \citep{foster98asymptoticcalibration} as a subroutine.
Formally, \algorithmref{recal} 
partitions $[-\frac{B}{2},\frac{B}{2}]$ into $M$ intervals $\Ic = \{[\frac{-B}{2},\frac{-B}{2} + \frac{B}{M}), ..., [\frac{B}{2} - \frac{B}{M},\frac{B}{2}]\}$; each interval is associated with an instance of an online binary recalibration subroutine $\Fcal$ \citep{foster98asymptoticcalibration,cesabianchi2006prediction}. In order to compute $G_t(y \leq z)$, we invoke the subroutine $\Fcal_j$ associated with interval $I_j$ containing $z$.
% on the data $\{\palg_t, y_t \mid \palg_t \in I_j \}$ belonging to each bucket $I_j \in \Ic$; at prediction time, it calls the instance of $\Fcal$ associated with the bucket of the uncalibrated forecast $\palg_t$.
After observing $y_t$, each $\Fcal_j$ observes whether $y_t$ falls in its interval and updates its state.

\begin{theorem}
    Let $\mathcal{Y}_\mathcal{I}$ be the set of upper bounds of the intervals $\mathcal{I}$ and let $\mathcal{P}_S$ be the output space of $\Fcal$. Algorithm \ref{algo:recal} achieves online calibration and for all $y \in \mathcal{Y}_\mathcal{I}, p \in \mathcal{P}_S$ we have $\rho_T(y,p) \to p$ a.s. as $T \to \infty$.
\end{theorem}

\begin{proof}
The above theorem follows directly from the construction of Algorithm \ref{algo:recal}: for each $y \in \mathcal{Y}$, we run an online binary calibration algorithm to target the event $y_t \leq y$. 

Specifically, note that for each $y \in \mathcal{Y}_\mathcal{I}$, the empirical frequency $\rho_T(y,p)$ reduces to the definition of the empirical frequency of a classical binary calibration algorithm targeting probability $p$ and the binary outcome that $y_t \leq y$. The output of the algorithm for $F_t(y)$ is also a prediction for the binary outcome $y_t \leq y$ produced by a classical online binary calibration algorithm. Thus, by construction, we have the desired result.
\end{proof}

Algorithms $\Fcal$ for online binary calibration are randomized. Our procedure needs to be randomized as well and this is a fundamental property of our task.

\begin{theorem}
    There does not exist a deterministic online calibrated regression algorithm that achieves online calibration.
\end{theorem}

\begin{proof}
This claim follows because we can encode a standard online binary calibration problem as calibrated regression. 
Specifically, given a non-randomized online calibrated regression algorithm, we could solve an online binary classification problem.
Suppose the adversary chooses a binary $y_t \in \{0,1\} \subseteq [0,1]$ that defines one of two classes. Then we can define an instance of calibrated regression with two buckets $[0,0.5)$ and $[0.5,1]$. We use the forecast $F_t(0.5)$ as our prediction for $y_t=0$ and one minus that as the prediction for $y_t=1$. Then, the error on the ratio $\rho_T(0.5,p)$ yields the definition of calibration in binary classification. If our deterministic online calibration regression algorithm works, then we have $\rho_T(0.5,p) \to p$, which means that the empirical ratio for the binary algorithm goes to the predicted frequency $p$ as well. But that would yield a deterministic algorithm for online binary calibration, which we know cannot exist.
\end{proof}


\subsection{Proving the Calibration of \algorithmref{recal}}

First, we will provide a proof of \lemmaref{calibration}; this proof holds for any norm $\ell_p$.

%\begin{lemma}\label{lem:calibration2}
%If each $\Fcal_j$ is $(\e, \ell)$-calibrated with convex loss $\ell$ and with
%$ C^{(j)}_{T, \ell} \leq R_{T_j} + \e $
%for all $T$,
%where $R_{T_j} = o(1)$ as $T_j \to \infty$, then \algorithmref{recal} is also $(\e, \ell)$-calibrated and
%\begin{align}
%C_{T, \ell} \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e. \label{eqn:rate}
%\end{align}
%This bound holds uniformly over time $T$.
%%where $T_j$ is the number of plays of $\Fcal_j$ at time $T$.
%%$$C_T \leq R_T + \e.$$
%\end{lemma}

\begin{lemma}[Preserving calibration]
If each $\Fcal_j$ is $(\e, \ell_p)$-calibrated,
then \algorithmref{recal} is also $(\e, \ell_p)$-calibrated and the following bound holds uniformly over $T$:
\begin{align}
C_T \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e. \label{eqn:rate1}
\end{align}\vspace{-4mm}
\end{lemma}

\begin{proof}
Let $\wsupj_i = \sum_{t=1}^T \wsupj_{t,i}$ and note that $\sum_{t=1}^T \Ind_{t,i} = \sum_{j=1}^M \wsupj_i$. We may write
\begin{align*}
  C_{T,i}(y)
& = \frac{\sum_{t=1}^T \Ind_{t,i}}{T} \left|  \rho_T(y, i/N) - \frac{i}{N} \right|^p 
 = \frac{\sum_{j=1}^M \wsupj_i }{T} \left| \sum_{j=1}^M \frac{ \sum_{t=1}^T \wsupj_{t,i} o_{tj}}{\sum_{j=1}^M \wsupj_i }  - \frac{i}{N} \right|^p \\
& = \frac{\sum_{j=1}^M \wsupj_i}{T} \left| \sum_{j=1}^M \frac{ \wsupj_i \rjt(y, i/N) }{\sum_{j=1}^M \wsupj_i }  - \frac{i}{N} \right|^p 
 \leq \sum_{j=1}^M \frac{\wsupj_i}{T} \left| \rjt(y, i/N) - \frac{i}{N} \right|^p = \sum_{j=1}^M  \frac{T_j}{T} C^{(j)}_{T, i},
\end{align*}
where in the last line we used Jensen's inequality. 
Plugging in this bound in the definition of $C_T$, we find that 
%$ C_T = \sum_{i=1}^N  C_{T,i} \leq \sum_{j=1}^M \sum_{i=1}^N \frac{T_j}{T} C^{(j)}_{T,i} $ which in turn can be bounded as
%\begin{align}
% C_T \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e. \label{eqn:rate}
%\end{align}
\begin{align}
 C_T 
& = \sum_{i=0}^N  C_{T,i}
\leq \sum_{j=1}^M \sum_{i=0}^N \frac{T_j}{T} C^{(j)}_{T,i} 
 \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e. \nonumber
\end{align}
Since each $R_{T_j} \to 0$, \algorithmref{recal} will be $\e$-calibrated.
\end{proof}

\subsection{Recalibrated Forecasts Have Low Regret Under the CRPS Loss}

%Next, we would like to prove an analogue of \lemmaref{accuracy} for the popular $\ell_2$ norm. Although calibration follows from the $\ell_1$ norm version, here, we use a slightly different argument to derive simpler and better convergence rates. To obtain this result, we make the additional assumption that the proper loss that we are using to measure accuracy is the $\ell_2$ loss.

\begin{lemma}[Recalibration preserves accuracy]
Consider \algorithmref{recal} with
parameters $M \geq N > 1/\e$. Suppose that the $\Fcal$ are $(\e, \ell_2)$-calibrated.
Then the recalibrated $G_t$ a.s.~have vanishing $\ell_\text{CRPS}$-regret relative to $F_t$:
\begin{equation}
 \frac{1}{T} \sum_{t=1}^T \ell_\text{CRPS} (y_t , G_t) - \frac{1}{T} \sum_{t=1}^T \ell_\text{CRPS} (y_t , F_t) < NB R_T + \frac{2B}{N}.
\end{equation}
\end{lemma}

\begin{proof}
Our proof will rely on the following fact about any online calibration subroutine $\Fcal$. We start by formally establishing this fact.
\begin{fact}\label{fact:external_regret}
Let $\Fcal$ be a binary online calibration subroutine with actions $0, 1/N, \dots, 1$ whose $\ell_2$ calibration error $C^2_T$ is bounded by $R_T = o(T)$. Then the predictions $p_t$ from $\Fcal$ also minimize external regret relative to any single action $i/N$:
\begin{equation}
   \sum_{t=1}^T \left[ (p_t - y_t)^2 - (\frac{i}{N} - y_t)^2 \right] \leq N R_T \text{ for all } i.
\end{equation}
% $$ \sum_{t=1}^T (p_t - y_t)^2 - (\frac{i}{N} - y_t)^2 \leq N R_T \text{ for all } i $$
\end{fact}
We refer the reader to  Lemma 4.4 in \cite{cesabianchi2006prediction} for a proof.
%Our proof will use the fact that an algorithm with resolution $\frac{1}{N}$ whose $\ell_2$ calibration error is bounded by $R_T = o(1)$ also minimizes external regret (relative to the $\ell_2$ loss) at a rate of $NR_T$. See e.g. Lemma 4.4 in \cite{cesabianchi2006prediction} for a proof of this fact.

Next, we prove our main claim. We start with some notation.
Let $\Ic = \{[0,\frac{1}{M}), [\frac{1}{M}, \frac{2}{M}), ..., [\frac{M-1}{M},1]\}$ be a set of intervals that partition $[0,1]$ and let $I_j = [\frac{j-1}{M}, \frac{j}{M})$ be the $j$-th interval.
%Let us use $\Ind_{j,t} = \Ind \{\palg_t \in [\frac{j-1}{M},\frac{j}{M})\}$ to indicate that $\Fcal_j$ was called at time $t$. 
Also, for each $j$, we use $i_j$ to denote the index $i \in [N]$ that is closest to $j$ in the sense of $|\frac{i_j}{N} - \frac{j}{M}| \leq \frac{1}{N}$. By our assumption that $M \geq N$, this index exists.

We begin our proof from the definition of the CRPS regret:
\begin{align*}
&  \frac{1}{T} \sum_{t=1}^T \ell_\text{CRPS} (y_t , G_t) - \frac{1}{T} \sum_{t=1}^T \ell_\text{CRPS} (y_t , F_t) \\
& \;\; = \frac{1}{T} \sum_{t=1}^T \int_{-\infty}^\infty  (G_t(z) - \mathbb{I}_{y_t \leq z})^2 dz - \frac{1}{T} \sum_{t=1}^T \int_{-\infty}^\infty  (F_t(z) - \mathbb{I}_{y_t \leq z})^2 dz \\
& \;\; = \int_{-\infty}^\infty \frac{1}{T} \sum_{t=1}^T  \left[ (G_t(z) - \mathbb{I}_{y_t \leq z})^2 - (F_t(z) - \mathbb{I}_{y_t \leq z})^2 \right] dz  \\
& \;\; = \int_{z \in \mathcal{Y}} \frac{1}{T} \sum_{t=1}^T  \left[ (G_t(z) - \mathbb{I}_{y_t \leq z})^2 - (F_t(z) - \mathbb{I}_{y_t \leq z})^2 \right] dz \\
& \;\; =\int_{z \in \mathcal{Y}} \frac{1}{T} \sum_{t=1}^T  \left[ (G_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 - (F_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 \right] dz
\end{align*}
In the second-to-last line, we have used the fact that the forecasts have bounded support, i.e., the $y_t$ live within a closed bounded set $\mathcal{Y}$.
In the last line, we replaced the event $y_t \leq z$ with $F_t(y_t) \leq F_t(z)$, which is valid because $F_t$ is monotonically increasing.

Let's now analyze the above integrand for one fixed value of $z$:
\begin{equation}
 \frac{1}{T} \sum_{t=1}^T  \left[ (G_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 - (F_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 \right].   
\end{equation}
% $$  \frac{1}{T} \sum_{t=1}^T  \left[ (G_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 - (F_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 \right]. $$
Since $F_t$ outputs a finite number of values in the set $\{0, \frac{1}{M}, ..., 1\}$, let $j/M$ denote the value $F_t(z) = j/M$ taken by $F_t$ at $z$.
Additionally, observe that $\mathbb{I}_{F_t(y_t) \leq \frac{j}{M}} = o_{tj}$, where $o_{tj}$ is the binary target variable given to $\Fcal_j$ at the end of step $t$. 
Finally, recall that when $F_t(z) = \frac{j}{M}$, we have defined $G_t(z)$ to be the output of $\Fcal_j$ at time $t$, which we denote as $G_{tj}$.
This yields the following expression for the above integrand for a fixed $z$:
\begin{equation}
     \frac{1}{T} \sum_{t=1}^T  \left[ (G_{tj} - o_{tj})^2 - (\frac{j}{M} - o_{tj})^2 \right]. 
\end{equation}
% $$  \frac{1}{T} \sum_{t=1}^T  \left[ (G_{tj} - o_{tj})^2 - (\frac{j}{M} - o_{tj})^2 \right]. $$
Next, recall that $i_j$ is the index $i \in [N]$ that is closest to $j$ in the sense of $|\frac{i_j}{N} - \frac{j}{M}| \leq \frac{1}{N}$. Recall also that $M \geq N$. Since $\ell_2(p, o) = (p - o)^2$ is convex in $p$ and $|\frac{\partial \ell_2}{\partial p}(p, o)| = 2|p - o| \leq 2$ for $p, o \in [0,1]$, this implies
\begin{equation}
 \ell_2(\frac{j}{M}, o_{tj}) \geq \ell_2(\frac{i_j}{N}, o_{tj}) + \frac{\partial \ell_2}{\partial p}(\frac{i_j}{N},o_{tj})(\frac{j}{M}-\frac{i_j}{N}) \geq \ell_2(\frac{i_j}{N}, o_{tj}) - \frac{2}{N}.
\end{equation}
% $$\ell_2(\frac{j}{M}, o_{tj}) \geq \ell_2(\frac{i_j}{M}, o_{tj}) + \frac{\partial \ell_2}{\partial p}(p,o_{tj})(\frac{j}{M}-\frac{i_j}{M}) \geq \frac{2}{N}.$$
Using this inequality, we obtain the following bound for our earlier integrand:
\begin{equation}
     \frac{1}{T} \sum_{t=1}^T  \left[ (G_{tj} - o_{tj})^2 - (\frac{i_j}{N} - o_{tj})^2 \right] + \frac{2}{N}. 
\end{equation}
% $$  \frac{1}{T} \sum_{t=1}^T  \left[ (G_{tj} - o_{tj})^2 - (\frac{i_j}{N} - o_{tj})^2 \right] + \frac{2}{N}. $$
Crucially, this expression is precisely the {\em external regret} of recalibration subroutine $\Fcal_j$ relative to the fixed action $\frac{i_j}{N}$ and measured in terms of the L2 loss. By Fact \ref{fact:external_regret}, we know that this external regret is bounded by $N R_T$. Since this bound holds pointwise for any value of $z$, we can plug it into our original integral to obtain a bound on the CRPS regret:
\begin{align*}
& \int_{z \in \mathcal{Y}} \frac{1}{T} \sum_{t=1}^T  \left[ (G_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 - (F_t(z) - \mathbb{I}_{F_t(y_t) \leq F_t(z)})^2 \right] dz \\
& \;\; \leq \int_{z \in \mathcal{Y}} \left[ N R_T + \frac{2}{N} \right] dz \\
& \;\; \leq NB R_T + \frac{2B}{N} 
\end{align*}
In the last line, we used the fact that the integration is over a bounded set $\mathcal{Y}$ whose measure is bounded by $B > 0$. This establishes the main claim of this lemma.
\end{proof}




\subsection{Correctness of \algorithmref{recal}}

We now prove our main result about the correctness of \algorithmref{recal}.


\setcounter{theorem}{0}
\begin{theorem}
Let $\Fcal$ be an $(\ell_1, \epsilon/3B)$-calibrated online subroutine with resolution $N \geq 3B/\epsilon$,
and let $\ell$ be a proper loss satisfying the assumptions of \lemmaref{accuracy}. Then \algorithmref{recal} with parameters $\Fcal$ and $N$ is an $\epsilon$-accurate online recalibration algorithm for the loss $\ell$.
\end{theorem}

\begin{proof}
It is easy to show that \algorithmref{recal} is $(\ell_1, \e/3B)$-calibrated by the same argument as Lemma 1 (see the next section for a formal proof). By Lemma 4, its regret w.r.t. the raw $\palg_t$ tends to $< 3B/N < \e$. Hence, the theorem follows.
\end{proof}



\subsection{Calibration Implies No Internal Regret}

Here, we show that a calibrated forecaster also has small internal regret relative to any bounded proper loss \citep{kuleshov2017estimating}.

\setcounter{lemma}{0}

\begin{lemma}
If $\ell$ is a bounded proper loss, then an $\e$-calibrated $\Fcal$
a.s.~has a small internal regret w.r.t.~$\ell$ and satisfies uniformly over time $T$ the bound
\begin{align}
\intR_{T} = \max_{ij} \sum_{t=1}^T \Ind_{p_t = i/N} \left( \ell(y_t, i/N)  - \ell(y_t, j/N) \right) \leq 2 B (R_T + \e).
\end{align}
\end{lemma}

\begin{proof}

Let $T$ be fixed for the rest of this proof.
Let $\Ind_{ti} = \Ind_{p_t = i/N}$ be the indicator of $\Fcal$ outputting prediction $i/N$ at time $t$, let $T_i = \sum_{t=1}^T \Ind_{ti}$ denote the number of times $i/N$ was predicted, and let
\begin{equation}
     \intR_{T, ij} = \sum_{t=1}^T \Ind_{ti} \left( \ell(y_t, i/N)  - \ell(y_t, j/N) \right) 
\end{equation}
% $$ \intR_{T, ij} = \sum_{t=1}^T \Ind_{ti} \left( \ell(y_t, i/N)  - \ell(y_t, j/N) \right) $$
denote the gain (measured using the proper loss $\ell$) from retrospectively switching all the plays of action $i$ to $j$. This value forms the basis of the definition of internal regret (Section 2).

Let $T(i,y) = \sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\}$ denote the total number of $i/N$ forecasts at times when $y_t = y \in \{0,1\}$. Observe that we have
\begin{align*}
T(i,y) 
& = \sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} 
= \frac{\sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} }{T_i} T_i
= \frac{\sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} }{\sum_{t=1}^T \Ind_{ti}} T_i \\
& = q(i,y) T_i + T_i \left( \frac{\sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} }{\sum_{t=1}^T \Ind_{ti}} - q(i,y) \right) \\
& = q(i,y) T_i + T_i \left( \rho_T(i/N) - i/N \right),
\end{align*}
where $q(i,y) = i/N$ if $y=1$ and $1-i/N$ if $y=0$. The last equality follows using some simple algebra after adding and subtracting one inside the parentheses in the second term.

We now use this expression to bound $\intR_{T, ij}$:
\begin{align*}
\intR_{T, ij}
& = \sum_{t=1}^T \Ind_{ti} \left( \ell(y_t, i/N)  - \ell(y_t, j/N) \right) \\
& = \sum_{y \in \{0,1\}} T(i,y) \left( \ell(y, i/N)  - \ell(y, j/N) \right) \\
& \leq \sum_{y \in \{0,1\}} q(i,y) T_i \left( \ell(y, i/N)  - \ell(y, j/N) \right) + \sum_{y \in \{0,1\}} B T_i \left| \rho_T(i/N) - i/N \right| \\
& \leq 2B T_i \left| \rho_T(i/N) - i/N \right|,
\end{align*}
where in the first inequality, we used $\ell(y, i/N)  - \ell(y, j/N) \leq \ell(y, i/N)  \leq B$, and in the second inequality we used the fact that $\ell$ is a proper loss.

Since internal regret equals $\intR_{T} = \max_{i,j} \intR_{T, ij}$, we have
\begin{align*}
\intR_{T}
& \leq \sum_{i=0}^N \max_{j} \intR_{T, ij} 
 \leq 2B \sum_{i=0}^N T_i \left| \rho_T(i/N) - i/N \right| 
 \leq 2 B ( R_T + \e ).
\end{align*}

\end{proof}

\subsection{Impossibility of Recalibrating Non-proper Losses}

We conclude the appendix by explaining why non-proper losses cannot be calibrated \citep{kuleshov2017estimating}.

\begin{theorem}
If $\ell$ is not proper, then there is no recalibration algorithm w.r.t.~$\ell$.
\end{theorem}

\begin{proof}
If $\ell$ is not proper, there exist a $p'$ and $q$ such that $\Exp_{y \sim \text{Ber}(p')} \ell(y, q)  < \Exp_{y \sim \text{Ber}(p')} \ell(y, p') $.% Let $\delta = \Exp_{y \sim \text{Ber}(p')} \ell(y, p') - \Exp_{y \sim \text{Ber}(p')} \ell(y, q) > 0$

Consider a sequence $y_t$ for which $y_t \sim \text{Ber}(p')$ for all $t$. Clearly the prediction of a calibrated forecaster $p_t$ must converge to $p'$ and the average loss will approach $\ell(y, p')$. This means that we cannot recalibrate the constant predictor $p_t = q$ without making its loss $\ell(y, q)$ higher. We thus have a forecaster that cannot be recalibrated with respect to $\ell$.
\end{proof}

\section{Low Regret Relative to Baseline Classifiers}\label{app:regret}



Here, we show that a calibrated forecaster also has small regret relative to any bounded proper loss if we use a certain construction that combines our algorithm with a baseline forecaster. This extends our previous construction to more general settings.


\subsection{Recalibration Construction}

\paragraph{Setup}

We start with an online forecaster $F$ that outputs uncalibrated forecasts $\palg_t$ at each step; these forecasts are fed into a {\em recalibrator} such that the resulting forecasts $p_t$ are calibrated and have low regret relative to the baseline forecasts $\palg_t$. 

Formally, at every step $t=1,2,...$ we have:
  \begin{algorithmic}[1]
    \STATE Forecaster $F$ predicts $\palg_t$.
    \STATE A recalibration algorithm produces a calibrated forecast $p_t$ based on $\palg_t$.
    \STATE Nature reveals label $y_t$
    \STATE Based on $x_t, y_t$, we update the recalibration algorithm and optionally update $F$.
  \end{algorithmic}

\paragraph{Notation}  

We define a discretization $V$ of the space of forecasts. We assume that the forecasts live in a compact set $\Delta$ and we define a triangulation of $\Delta$, i.e., a partition into a set of simplices
such that any two simplices intersect in either a common face, common vertex,
or not at all. Let $V$ be the vertex set of this triangulation, and let
$V (p)$ be the set of corners of the simplex containing $p$. 

Note that each distribution $p$ can be uniquely written as a weighted average of its neighboring vertices, $V (p)$. For $v \in V (p)$, we define the test functions $w_v(p)$ to be
these linear weights, so they are uniquely defined by the linear equation $p=\sum_{v \in V(p)} w_v(p) v$.
We also define the discretization to be sufficiently small: given a target precision $\epsilon > 0$ we define the discretization such that for all $f_1, f_2$ in the same simplex we have $|| f_1 - f_2 || < \epsilon$.

\subsection{Recalibration Algorithm}

We are going to define a general meta-algorithm that follows a construction in which we run multiple instances of our calibrated forecasting algorithms over the inputs of $F$.

More formally, we take the aforementioned partition of the space of forecasts $\Delta$ of $F$ and we associate each simplex with an instance of 
our calibration algorithm $\Fcal$ (using the same $\Delta$ and discretization $V$). In order to compute $p_t$, we invoke the subroutine $\Fcal_j$ associated with the simplex $I_j$ containing $\palg_t$ (with ties broken arbitrarily).
After observing $y_t$, we pass it to $\Fcal_j$.

The resulting procedure produces valid calibrated estimates because each $\Fcal_j$ is a calibrated subroutine. More importantly the new forecasts do not decrease the predictive performance of $F$, as measured by a proper loss $\ell$.
In the remainder of this section, we establish these facts formally.


\subsection{Theoretical Analysis}

\paragraph{Notation}

Our task is to produce calibrated forecasts. Intuitively, we say that a forecast $F_t$ is calibrated if for every $y' \in \mathcal{Y}$, the probability $F_t(y')$ on average matches the frequency of the event $\{ y = y' \}$.
% We formalize this intuition as follows.
We formalize this by introducing the ratio
\begin{equation}
    \rho_T(p) = \dfrac{\sum_{t=1}^T y_t \cdot \Ind_{p_t = p}}{\sum_{t=1}^T \Ind_{p_t = p}}
\end{equation}
Intuitively, we want $ \rho_T(p) \to p $ a.s.~as $T \to \infty$ for all $p$.
In other words, out of the times when the predicted probability for $y_t$ is $p$, the average $y_t$ looks like $p$.

The quality of probabilistic forecasts is evaluated using {\em proper} losses $\ell$. Formally, 
a loss $\ell(y, p)$ is proper if
$p \in \arg\min_{q \in \mathcal{P}} \Exp_{y \sim (p)} \ell(y, q) \; \forall p \in \mathcal{P}.$ 
An example in binary classification is the log-loss $\ell_\text{log}(y,p) = -y\log(p) - (1-y)\log(1-p)$. We will assume that the loss is bounded by $B > 0$.

We measure calibration using a calibration error $C_T$. Our algorithms will output discretized probabilities; hence we define the error relative to a set of possible predictions $V$ as
\begin{equation}
    C_T  = \sum_{p \in V} \left| \rho_T(p) - p \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{\{p_t = p\}} \right). 
\end{equation}

% \paragraph{Assumptions}

\subsubsection{A Helper Lemma}

In order to establish the correctness of our recalibration procedure, we need to start with a helper lemma. This lemma shows that if forecasts are calibrated, then they have small internal regret.

\begin{lemma}
If $\ell$ is a bounded proper loss, then an $(\e, \ell_1)$-calibrated $\Fcal$
a.s.~has a small internal regret w.r.t.~$\ell$ and satisfies uniformly over time $T$ the bound
\begin{align}
\intR_{T} = \max_{ij} \sum_{t=1}^T \Ind_{p_t = p_i} \left( \ell(y_t, p_i)  - \ell(y_t, p_j) \right) \leq 2 B (R_T + \e).
\end{align}
\end{lemma}

\begin{proof}

Let $T$ be fixed for the rest of this proof.
Let $\Ind_{ti} = \Ind_{p_t = p_i}$ be the indicator of $\Fcal$ outputting prediction $p_i$ at time $t$, let $T_i = \sum_{t=1}^T \Ind_{ti}$ denote the number of times $p_i$ was predicted, and let
\begin{equation}
    \intR_{T, ij} = \sum_{t=1}^T \Ind_{ti} \left( \ell(y_t, p_i)  - \ell(y_t, p_j) \right)
\end{equation}
% $$ \intR_{T, ij} = \sum_{t=1}^T \Ind_{ti} \left( \ell(y_t, p_i)  - \ell(y_t, p_j) \right) $$
denote the gain (measured using the proper loss $\ell$) from retrospectively switching all the plays of action $i$ to $j$. This value forms the basis of the definition of internal regret.

Let $T(i,y) = \sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\}$ denote the total number of $p_i$ forecasts at times when $y_t = y$. Observe that we have
\begin{align*}
T(i,y) 
& = \sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} 
= \frac{\sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} }{T_i} T_i
= \frac{\sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} }{\sum_{t=1}^T \Ind_{ti}} T_i \\
& = q(i,y) T_i + T_i \left( \frac{\sum_{t=1}^T \Ind_{ti} \Ind\{y_t = y\} }{\sum_{t=1}^T \Ind_{ti}} - q(i,y) \right) \\
& = q(i,y) T_i + T_i \left( \rho_T(p_i) - p_i \right),
\end{align*}
where $q(i,y) = p_i(y)$. The last equality follows using some simple algebra after adding and subtracting one inside the parentheses in the second term.

We now use this expression to bound $\intR_{T, ij}$:
\begin{align*}
\intR_{T, ij}
& = \sum_{t=1}^T \Ind_{ti} \left( \ell(y_t, p_i)  - \ell(y_t, p_j) \right) \\
& = \sum_{y} T(i,y) \left( \ell(y, p_i)  - \ell(y, p_j) \right) \\
& \leq \sum_{y} q(i,y) T_i \left( \ell(y, p_i)  - \ell(y, p_j) \right) + B T_i \left| \rho_T(p_i) - p_i \right| \\
& \leq B T_i \left| \rho_T(p_i) - p_i \right|,
\end{align*}
where in the first inequality, we used $\ell(y, p_i)  - \ell(y, p_j) \leq \ell(y, p_i)  \leq B$, and in the second inequality we used the fact that $\ell$ is a proper loss.

Since internal regret equals $\intR_{T} = \max_{i,j} \intR_{T, ij}$, we have
\begin{align*}
\intR_{T}
& \leq \sum_{i=0}^N \max_{j} \intR_{T, ij} 
 \leq 2B \sum_{i=0}^N T_i \left| \rho_T(p_i) - p_i \right| 
 \leq 2 B ( R_T + \e ).
\end{align*}

\end{proof}

\subsection{Recalibrated Forecasts Have Low Regret Relative to Uncalibrated Forecasts}

Next, we use the above result to prove that the forecasts recalibrated using the above construction have low regret relative to the baseline uncalibrated forecasts.

%\begin{lemma}\label{lem:regret}
%Consider an instance of Algorithm 2 with
%parameters $M \geq N$, and $\ell$ be a proper loss that is
%\begin{enumerate}
%\item Bounded in absolute value by $B>0$
%\item $\ell(y_t, p) \leq \ell(y_t, j/M) + B/M$ whenever $p \in [j/M, (j+1)/M)$.
%\item $\ell(y_t, p) \leq \ell(y_t, p_i) + B/N$ whenever $p \in [p_i, (i+1)/N)$.
%\end{enumerate}
%The recalibrated forecasts $p_t$ have vanishing $\ell$-loss regret relative to $\palg_t$:
%$$ \lim_{T\to\infty} \left( \frac{1}{T} \sum_{t=1}^T \ell (y_t , p_t) - \frac{1}{T} \sum_{t=1}^T \ell(y_t , \palg_t) \right) < 3B/N. $$
%\end{lemma}

\begin{lemma}[Recalibration preserves accuracy]
Let $\ell$ be a bounded proper loss such that $\ell(y_t, p) \leq \ell(y_t, p_j) + B\epsilon$ whenever $||p - p_j|| \leq \epsilon$.
Then the recalibrated $p_t$ a.s.~have vanishing $\ell$-loss regret relative to $\palg_t$ and we have uniformly:
\begin{equation}
\frac{1}{T} \sum_{t=1}^T \ell (y_t , p_t) - \frac{1}{T} \sum_{t=1}^T \ell(y_t , \palg_t)  < \frac{B}{\epsilon} \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + 3B\e.
\end{equation}
\end{lemma}


\begin{proof}
By the previous lemma, we know that an algorithm whose calibration error is bounded by $R_T = o(1)$ also minimizes internal regret at a rate of $2BR_T$, and thus external regret at a rate of $2BR_T / \epsilon$.

Next, let us use $\Ind_{j,t}$ to indicate that $\Fcal_j$ was called at time $t$. 
We establish our main claim as follows:
\begin{align*}
& \frac{1}{T} \sum_{t=1}^T  \ell (y_t , p_t) - \frac{1}{T} \sum_{t=1}^T \ell (y_t , \palg_t) \\
& \;\; = \frac{1}{T} \sum_{t=1}^T \left( \sum_{j=1}^M \left( \ell (y_t , p_t) - \ell (y_t , \palg_t) \right) \Ind_{j,t} \right) \\
& \;\; < \frac{1}{T} \sum_{t=1}^T \left( \sum_{j=1}^M \left( \ell (y_t , p_t) - \ell (y_t , p_j) \right) \Ind_{j,t} + B\epsilon \right) \\
% & \;\; < \frac{1}{T} \sum_{t=1}^T \left( \sum_{j=1}^M \left( \ell (y_t , p_t) - \ell (y_t , \frac{i_j}{N}) \right) \Ind_{j,t} + \frac{2B}{N}\right) \\
& \;\; \leq \frac{1}{\epsilon} B \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + 3B\epsilon,
\end{align*}
where $R_{T_j}$ is a bound on the calibration error of $\Fcal_j$ after $T_j$ plays. 

%The first inequality holds because $|\palg_t - \frac{j}{M}| \leq \frac{1}{M} \leq \frac{1}{N}$ when $\Ind_{j,t} = 1$ and because $\ell_2(\palg_t, y_t) \geq \ell_2(\frac{j}{M},y_t) + \frac{\partial \ell_2}{\partial p}(p,y_t)(\frac{j}{M}-\palg_t)$. 
In the first inequality, we use our assumption on the loss $\ell$.
%Note that this bound holds for other convex loss functions.
%We repeat the same argument in the second inequality using the fact that $|\frac{i_j}{N} - \frac{j}{M}| \leq \frac{1}{N}$ for some $i_j$.
The last inequality follows because $\Fcal_j$ minimizes external regret w.r.t.~the constant action $p_j$ at a rate of $BR_{T_j}/\epsilon$.
\end{proof}


\subsection{Proving That Calibration Holds}

We want to also give a proof that the recalibration construction described above yields calibrated forecasts.

\begin{lemma}
If each $\Fcal_j$ is $(\e, \ell_p)$-calibrated,
then the combined algorithm is also $(\e, \ell_p)$-calibrated and the following bound holds uniformly over $T$:
\begin{align}
C_T \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e. \label{eqn:rate}
\end{align}\vspace{-4mm}
\end{lemma}

\begin{proof}
Let $M = |V|$.
Let $\wsupj_i = \sum_{t=1}^T \wsupj_{t,i}$ where $\wsupj_{t,i} = \Ind \{p_t = p_i \cap \palg_t = p_j\}$ and note that $\sum_{t=1}^T \Ind_{t,i} = \sum_{j=1}^M \wsupj_i$. Let also $\rjt(p_i) = \frac{\sum_{t=1}^T \wsupj_{t,i} y_t}{\sum_{t=1}^T \wsupj_{t,i}}$. We may write
\begin{align*}
  C_{T,i} 
& = \frac{\sum_{t=1}^T \Ind_{t,i}}{T} \left|  \rho_T(p_i) - p_i \right|
 = \frac{\sum_{j=1}^M \wsupj_i }{T} \left| \sum_{j=1}^M \frac{ \sum_{t=1}^T \wsupj_{t,i} y_t}{\sum_{j=1}^M \wsupj_i }  - p_i \right| \\
& = \frac{\sum_{j=1}^M \wsupj_i}{T} \left| \sum_{j=1}^M \frac{ \wsupj_i \rjt(p_i) }{\sum_{j=1}^M \wsupj_i }  - p_i \right|
 \leq \sum_{j=1}^M \frac{\wsupj_i}{T} \left| \rjt(p_i) - p_i \right| = \sum_{j=1}^M  \frac{T_j}{T} C^{(j)}_{T, i},
\end{align*}
where $C^{(j)}_{T,i} = \left| \rjt(p_i) - p_i \right| \left( \frac{1}{T_j} \sum_{t=1}^T \wsupj_{t,i} \right)$ and in the last line we used Jensen's inequality. 
Plugging in this bound in the definition of $C_T$, we find that 
%$ C_T = \sum_{i=1}^N  C_{T,i} \leq \sum_{j=1}^M \sum_{i=1}^N \frac{T_j}{T} C^{(j)}_{T,i} $ which in turn can be bounded as
%\begin{align}
% C_T \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e. \label{eqn:rate}
%\end{align}
\begin{align}
 C_T 
& = \sum_{i=1}^N  C_{T,i}
\leq \sum_{j=1}^M \sum_{i=1}^N \frac{T_j}{T} C^{(j)}_{T,i} 
 \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e. \nonumber
\end{align}
Since each $R_{T_j} \to 0$, the full procedure will be $\e$-calibrated.
\end{proof}


Recall that $R_T$ denotes the rate of convergence of the calibration error $C_T$.
For most online calibration subroutines $\Fcal$,
$R_{T} \leq f(\e)/\sqrt{T}$ for some $f(\e)$.
In such cases, we can further bound the calibration error in the above lemma as
\begin{equation}
    \sum_{j=1}^M \frac{T_j}{T} R_{T_j} \leq \sum_{j=1}^M \frac{\sqrt{T_j}f(\e)}{T} \leq \frac{f(\e)}{\sqrt{ \e T}}. 
\end{equation}
% $$
% \sum_{j=1}^M \frac{T_j}{T} R_{T_j} \leq \sum_{j=1}^M \frac{\sqrt{T_j}f(\e)}{T} \leq \frac{f(\e)}{\sqrt{ \e T}}. 
% $$
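The second inequality holds because, by concavity of the square root, $\sum_{j=1}^M \sqrt{T_j}$ is maximized when all the $T_j$ are equal to $T/M$, where $M = 1/\e$ is the number of subroutines $\Fcal_j$.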
Thus, our recalibration procedure introduces an overhead of
$ \frac{1}{\sqrt{\e}} $
in the convergence rate of the calibration error $C_T$ and of the regret relative to a baseline forecaster in the earlier lemma.
% In addition, \algorithmref{recal} requires $ \frac{1}{{\e}} $ times more memory and computation time (we run $1/\e$ instances of $\Fcal_j$). Overall, our runtime in linear in $M$, and the cost of \algorithmref{recal} is negligible relative to fitting the base model.


\section{Applications: Decision-making}\label{app:applications}

Next, we complement our results with a formal characterization of some benefits of calibration. We are interested in decision-making settings where we wish to estimate the value of a function $v : \mathcal{Y} \times \mathcal{A} \times \mathcal{X} \to \mathbb{R}$ over a set of outcomes $\mathcal{Y}$, actions $\mathcal{A}$, and features $\mathcal{X}$. Note that the function $v$ could be a loss $\ell(y,a,x)$ that quantifies the error of an action $a \in \mathcal{A}$ in a state $x \in \mathcal{X}$ given outcome $y \in \mathcal{Y}$.

We assume that given $x$, the agent chooses an action $a(x)$ according to a decision-making process. This could be an action $a(x) = \arg \min_a \mathbb{E}_{y \sim H(x)} [ \ell(y, a,x) ]$ that minimizes a loss that we are trying to estimate, but any outcome is possible.
The agent then relies on a predictive model $H$ of $y$ to estimate the future values $v (y, a,x)$ for the decision $a(x)$ :
\begin{align}
% a(x) & = \arg \min_a \mathbb{E}_{y \sim H(x)} [ \ell(y, a,x) ] \\
v(x) & = \mathbb{E}_{y \sim H(x)} [ v(y, a(x),x) ].
\end{align}
% Here, $a(x)$ is the action that minimizes the expected loss under $H$.
We study $v(y,a,x)$ that are monotonically non-increasing or non-decreasing in $y$. Examples include linear utilities $u(a,x) \cdot y + c(a,x)$ or their monotone transformations. 

\paragraph{Expectations under calibrated models}

If $H$ were a perfect predictive model, we could estimate expected values of outcomes perfectly. In practice, inaccurate models can yield imperfect decisions. Surprisingly, our analysis shows that in many cases, calibration (a much weaker condition than having a perfectly specified model $H$) is sufficient to correctly estimate the value of various outcomes.

In fact, our guarantees can be obtained with a weak condition---quantile calibration.
Additional requirements are the non-negativity and monotonicity of $v$.
Our result is a concentration inequality that shows that the true value of $v$ is unlikely to exceed its estimate by a large factor on average.
% \vk{cite individual calibration}

\begin{theorem}
\label{apdx:thm:dist_calib_bound_app}
Let $M$ be a quantile calibrated model and
let $v(y, a, x)$ be a non-negative monotonic value function.
Then for any sequence $(x_t, y_t)_{t=1}^T$ and $r > 0$, we have:
\begin{equation}
    \label{apdx:eqn:dist_calib_bound1}
    \lim_{T \to \infty} \frac{1}{T} \sum_{t=1}^T \mathbb{I} \left[ v(y_t, a(x_t), x_t) \geq r v(x_t) \right] \leq 1 / r.
\end{equation}
\end{theorem}

\begin{proof}

Recall that $M(x)$ is a distribution over $\mathcal{Y}$, with a density $q_x$, a quantile function $Q_x$, and a CDF $F_x$.
Note that for any $x$ and $s \in (0,1)$ and $y' \leq F_x^{-1}(1-s)$ we have:
\begin{align*}
v(x)  
& = \int v(x, y, a(x)) q_x(y) dy \\
& \geq \int_{y \geq y'} v(x, y, a(x)) q_x(y) dy \\
& \geq v(x, y', a(x)) \int_{y \geq y'} q_x(y) dy \\
& \geq s v(x, y', a(x))
\end{align*}

The above logic implies that whenever $v(x)   \leq s v(x, y, a)$, we have $y \geq F_x^{-1}(1-s)$ or $F_x(y) \geq (1-s)$. Thus, we have for all $t$,
\begin{align*}
\mathbb{I}\{  v(x_t)   \leq s v(x_t, y_t, a_t) \} \leq \mathbb{I}\{  F_{x_t}(y_t) \geq (1-s) \}.
\end{align*}
Therefore, we can write
\begin{align*}
\frac{1}{T} \sum_{t=1}^T \mathbb{I}\{  v(x_t)   \leq s v(x_t, y_t, a_t) \} \leq \frac{1}{T} \sum_{t=1}^T \mathbb{I}\{  F_{x_t}(y_t) \geq (1-s) \} = s + o(1),
\end{align*}
where the last equality follows because $M$ is calibrated. Therefore, the claim holds in the limit as $T \to \infty$ for $r = 1/s$. 
The argument is similar if $v$ is monotonically non-increasing. In that case, we can show that whenever $y' > F_x^{-1}(s)$, we have $v(x)  \geq s v(x, y', a(x))$. Thus, whenever $v(x)   \leq s v(x, y, a)$, we have $y \leq F_x^{-1}(s)$ or $F_x(y) \leq s$. Because $F_x$ is calibrated, we again have that
\begin{align*}
\frac{1}{T} \sum_{t=1}^T \mathbb{I} \{ v(x_t)   \leq s v(x_t, y_t, a_t) \} \leq \frac{1}{T} \sum_{t=1}^T \mathbb{I} \{ F_{x_t}(y_t) \leq s \} = s + o(1),
\end{align*}
and the claim holds with $r = 1/s$. 
\end{proof}

Note that this statement represents an extension of Markov's inequality.
Note also that this implies the same result for a distribution calibrated model, since distribution calibration implies quantile calibration. 


\section{Experiments on UCI Benchmarks}
\label{apdx:uci_expt}
The existing UCI datasets~\citep{Dua2019UCI} used in our experiments hold a Creative Commons Attribution 4.0 International (CC BY 4.0) license. 
\paragraph{Computational resources.} Our experiments were conducted on a laptop with 2.3 GHz 8-Core Intel Core i9 processor and 32 GB 2667 MHz DDR4 RAM. The code and datasets take 16MB memory. 

\paragraph{Detailed setup.}
Our dataset consists of input and output pairs $\{x_t, y_t\}_{t=1}^{T}$ where $T$ is the size of the dataset. 
We simulate a stream of data by sending batches of data-points $\{x_t, y_t\}_{t=nt'+1}^{n(t'+1)}$ to our model, where $t'$ is the time-step and $n$ is the batch-size. This simulation is run for $\left \lceil{T/n}\right \rceil $ time-steps. For each batch, Bayesian ridge regression is fit to the data and the recalibrator is trained. 
We set $N=20$ in the recalibrator and use a batch size of $n=10$ for all experiments except for the Aquatic Toxicity dataset (Figure~\ref{fig:daphnia-aquatic-toxicity}), where we used $n=5$. The calibration is evaluated at levels $[0.2, 0.4, 0.5, 0.6, 0.8]$.

\section{Experiments on Bayesian Optimization}
\label{apdx:bayes_opt}
Bayesian optimization attempts to find the global minimum $x^\star = \arg \min_{x \in \mathcal{X}} f(x)$ of an unknown function $f:\mathcal{X} \to \mathbb{R} $ over an input space $\mathcal{X} \subseteq \mathbb{R}^D$. 
We are given an initial labeled dataset $x_t, y_t \in \mathcal{X} \times \mathbb{R}$ for $t = 1, 2, ..., N$ of i.i.d.\ realizations of random variables $X,Y \sim P$. At every time-step $t$, we use uncertainties from the probabilistic model $\mathcal{M}:\mathcal{X} \to (\mathbb{R} \to [0, 1])$ of $f$ to select the next data-point $x_{\textrm{next}}$ and iteratively update the model $\mathcal{M}$.  Algorithm~\ref{algo:plain-bo} outlines this procedure. Since the black-box function evaluation can be expensive, the objective of Bayesian optimization in this context is to find the minimum (or maximum) of this function while using a small number of function evaluations.


\paragraph{Computational resources.} Our experiments were conducted on a laptop with 2.3 GHz 8-Core Intel Core i9 processor and 32 GB 2667 MHz DDR4 RAM. The code and datasets take 16MB memory.


\paragraph{Detailed setup. } We use online calibration to improve the uncertainties estimated by the model $\mathcal{M}$. Following~\citet{Deshpande2021Calibrated}, we use Algorithm~\ref{algo:calibrate} to recalibrate the model $\mathcal{M}$. Since the dataset size is small, we use the \textsc{CREATESPLITS} function to generate leave-one-out cross-validation splits of our dataset $\mathcal{D}.$  We train the base model on train-split and use this to obtain probabilistic forecast for data in the test-split. We collect these predictions on all test-splits to form our recalibration dataset and use Algorithm~\ref{algo:recal} to perform calibration.

Following~\citet{Deshpande2021Calibrated}, we perform calibrated Bayesian optimization as detailed in Algorithm~\ref{algo:calibrated-bo}. Specifically, we recalibrate the base model $\mathcal{M}$ after every step in Bayesian optimization. We build on the GpyOpt library~\citep{gpyopt2016} for Bayesian optimization that holds the BSD 3-clause license. 

We use some popular benchmark functions to evaluate the performance of Bayesian optimization. We initialize the Bayesian optimization with 3 randomly chosen data-points. We use the Lower Confidence Bound (LCB) acquisition function to select the data-point $x_t$ and evaluate a potentially expensive function $f$ at $x_t$ to obtain $y_t$.  At any given time-step $T$, we have the dataset $\mathcal{D}_T = \{x_t, y_t\}_{t=1}^{T}$ collected iteratively.

In Figure~\ref{fig:bayes-opt}, we see that using online calibration of uncertainties from $\mathcal{M}$ allows us to reach a lower minimum or find the same minimum with a smaller number of steps with Bayesian optimization. 




\begin{figure}[h]
\centering     %%% not \center
\subfigure[SixHumpCamel]{\label{ewa-recalibrator-sixhumpcamel2}\includegraphics[width=0.32\linewidth]{ figures/sixhumpcamel_new_aggregate_convergence_comparison.png}}
\subfigure[Beale]{\label{beale2}\includegraphics[width=0.32\linewidth]{ figures/beale_new_aggregate_convergence_comparison.png}}
\subfigure[Mccormick]{\label{Mccormick2}\includegraphics[width=0.32\linewidth]{ figures/mccormick_new_aggregate_convergence_comparison.png}}
\caption{Online Calibration Improves Bayesian optimization}
\label{fig:bayes-opt}
\end{figure}

\begin{algorithm}[H]
  \caption{Bayesian Optimization}
  \label{algo:plain-bo}
  \begin{algorithmic}[1]
    \STATE Initialize base model $\mathcal{M}$ with data $\mathcal{D}=\{x_t, y_t\}_{t=0}^{M}$.
    % \STATE $\mathcal{R} \gets \textsc{Calibrate}(\mathcal{M,\mathcal{D}})$.
    \FOR {$n=1,2,...,T$:}
    \STATE $x_{\textrm{next}}$ = $\arg \max_{x \in \mathcal{X}}(\textrm{Acquisition}(x, \mathcal{M}))$.
    \STATE $y_{\textrm{next}}$ = $f(x_{\textrm{next}})$.
    \STATE $\mathcal{D}$ = $\mathcal{D} \bigcup \{(x_{\textrm{next}}, y_{\textrm{next}})\}$
    \STATE Update model $\mathcal{M}$ with data $\mathcal{D}$
    % \STATE $\mathcal{R} \gets \textsc{Calibrate}(\mathcal{M,\mathcal{D}})$
    \ENDFOR
  \end{algorithmic}
\end{algorithm}


\begin{algorithm}[H]
  \caption{Calibrated Bayesian Optimization~\citep{Deshpande2021Calibrated}}
  \label{algo:calibrated-bo}
  \begin{algorithmic}[1]
    \STATE Initialize base model $\mathcal{M}$ with data $\mathcal{D}=\{x_t, y_t\}_{t=0}^{M}$.
    \STATE $\mathcal{R} \gets \textsc{Calibrate}(\mathcal{M}, \mathcal{D})$.
    \FOR {$n=1,2,...,T$:}
    \STATE $x_{\textrm{next}}$ = $\arg \max_{x \in \mathcal{X}}(\textrm{Acquisition}(x, \mathcal{R} \circ \mathcal{M}))$.
    \STATE $y_{\textrm{next}}$ = $f(x_{\textrm{next}})$.
    \STATE $\mathcal{D}$ = $\mathcal{D} \bigcup \{(x_{\textrm{next}}, y_{\textrm{next}})\}$
    \STATE Update model $\mathcal{M}$ with data $\mathcal{D}$
    \STATE $\mathcal{R} \gets \textsc{Calibrate}(\mathcal{M}, \mathcal{D})$
    \ENDFOR
  \end{algorithmic}
\end{algorithm}

\begin{algorithm}[H]
  \caption{$\textsc{Calibrate}$~\citep{Deshpande2021Calibrated}}
  \label{algo:calibrate}
  \begin{algorithmic}[1]
    \REQUIRE Base model $\mathcal{M}$, Dataset $\mathcal{D}=\{x_t, y_t\}_{t=0}^{N}$
    \STATE Train a base model $\mathcal{M}$ on training dataset $\{x_t, y_t\}_{t=0}^{N}$.
    \STATE Initialize recalibration dataset $\mathcal{D_{\textrm{recal}}} = \emptyset$
  %     \item $S = \textsc{CreateSplits({D})}$.
     \STATE $S = \textsc{CreateSplits}(\mathcal{D})$
    \FOR {each split $s$ in $S$:}
    \STATE $\mathcal{D_{\textrm{train}}}=$ Train Dataset $\{x_t, y_t\}_{t=0}^{M}$ in split $s$.
    \STATE $\mathcal{D_{\textrm{test}}}=$ Test Dataset $\{x_t, y_t\}_{t=0}^{L}$ in split $s$.
    \STATE $\mathcal{D_{\textrm{train}}}=\textsc{TrainSplit}(s), \mathcal{D_{\textrm{test}}}=\textsc{TestSplit}(s)$
    \STATE Train base model $\mathcal{M'}$ on dataset $\mathcal{D_{\textrm{train}}}$
    \STATE Compute CDF dataset $\{[\mathcal{M'}(x_t)](y_t)\}_{t=0}^{L}$ from dataset $\mathcal{D_{\textrm{test}}}$
    \STATE $\mathcal{D_{\textrm{recal}}} = \mathcal{D_{\textrm{recal}}} \bigcup \{[\mathcal{M'}(x_t)](y_t), y_t\}_{t=0}^{L}$
    \ENDFOR
    \STATE Train recalibrator model $\mathcal{R}$ on the recalibration dataset $\mathcal{D_{\textrm{recal}}}$ using Algorithm~\ref{algo:recal}
    \STATE Return ($\mathcal{R}$)
  \end{algorithmic}
\end{algorithm}


% \bibliography{all}


\section{Comparison To Prior Work}
\label{sec:prior_work}

 
Table~\ref{tab:iid_data} and Table~\ref{tab:non_iid_data} summarize how our work fits in the broader literature. In brief, we provide calibration with regret guarantees in the setting of quantile regression on adversarial data. By regret guarantees we mean that performance relative to a user-specified baseline classifier is guaranteed not to drop.

% Note: I have replaced the bracketed numbers with the appropriate \cite and \citet commands.
% \begin{table}[h!]
% \centering
% \caption{A summary of how our work fits into the broader literature.}
% \label{tab:literature_summary}
% \begin{tabular}{@{}lllll@{}}
% \toprule
% \textbf{Data Assumptions} & \textbf{Setting} & \textbf{Outputs} & \textbf{No Regret Guarantees} & \textbf{Regret Guarantees} \\ \midrule
% \multirow{6}{*}{\makecell[l]{\textbf{IID} or \\ exchangeable}} & \multirow{3}{*}{Classification} & Predicting Sets & \citet{vovk2005algorithmic} & \citet{Kuleshov2022Calibrated} \\
%  & & \makecell[l]{Predicting \\ Probabilities} & \makecell[l]{\citet{platt1999probabilistic}, \\ \citet{niculescu2005predicting}} & - \\ \cmidrule(l){2-5}
%  & \multirow{3}{*}{Regression} & Quantiles & \makecell[l]{\citet{kuleshov2018accurate}, \\ \cite{dheur2023large,dheur2024probabilistic}} & - \\
%  & & Distributions & \citet{song2019distribution} & \citet{Kuleshov2022Calibrated} \\ \midrule
% \multirow{5}{*}{\makecell[l]{\textbf{non-IID} \\ (“adversarial”)}} & \multirow{3}{*}{Classification} & Predicting Sets & \citet{vovk2005defensive} & - \\
%  & & \makecell[l]{Predicting \\ Probabilities} & \makecell[l]{\citet{foster98asymptoticcalibration}, \\ \cite{cesabianchi2006prediction,abernethy11blackwell,okoroafor2024faster,noarov2023high}} & \makecell[l]{\citet{kuleshov2017estimating}, \\ \citet{foster2022calibeating}} \\ \cmidrule(l){2-5}
%  & \multirow{3}{*}{Regression} & Marginal & & \citet{lee2022online}\\
%  & & Quantiles & \citet{gibbs2022conformal,ramalingam2025relationship} & \textbf{This work} \\
%  & & Distributions & \citet{marx2025calibrated} & - \\ \bottomrule
% \end{tabular}
% \end{table}
% In your preamble, you should have these packages:
\begin{table}[ht!]
\centering
\caption{Summary of literature for IID or Exchangeable Data.}
\label{tab:iid_data}
\begin{tabular*}{\textwidth}{@{\extracolsep{\fill}} l l l @{}}
\toprule
\textbf{Output Type} & \textbf{No Regret Guarantees} & \textbf{Regret Guarantees} \\
\midrule
\multicolumn{3}{l}{\textbf{Classification}} \\
\addlinespace[0.2em]
\hspace{1em}Predicting Sets & \citet{vovk2005algorithmic} & \citet{Kuleshov2022Calibrated} \\
\cmidrule(lr){1-3}
% --- FINAL CHANGE: Made all dashes consistently left-aligned ---
\makecell[l]{\hspace{1em}Predicting Probabilities} & \makecell[l]{\citet{platt1999probabilistic}, \\ \citet{niculescu2005predicting}} & \makecell[l]{---} \\
\midrule
\multicolumn{3}{l}{\textbf{Regression}} \\
\addlinespace[0.2em]
\makecell[l]{\hspace{1em}Quantiles} & \makecell[l]{\citet{kuleshov2018accurate}, \\ \cite{dheur2023large,dheur2024probabilistic}} & \makecell[l]{---} \\
\cmidrule(lr){1-3}
\hspace{1em}Distributions & \citet{song2019distribution} & \citet{Kuleshov2022Calibrated} \\
\bottomrule
\end{tabular*}
\end{table}


\begin{table}[ht!]
\centering
\caption{Summary of literature for Non-IID (``Adversarial'') Data.}
\label{tab:non_iid_data}
\begin{tabular*}{\textwidth}{@{\extracolsep{\fill}} l l l @{}}
\toprule
\textbf{Output Type} & \textbf{No Regret Guarantees} & \textbf{Regret Guarantees} \\
\midrule
\multicolumn{3}{l}{\textbf{Classification}} \\
\addlinespace[0.2em]
\hspace{1em}Predicting Sets & \citet{vovk2005defensive} & --- \\
\cmidrule(lr){1-3}
\makecell[l]{\hspace{1em}Predicting Probabilities} & \makecell[l]{\citet{foster98asymptoticcalibration}, \\ \cite{cesabianchi2006prediction,abernethy11blackwell}, \\ \cite{okoroafor2024faster,noarov2023high}} & \makecell[l]{\citet{kuleshov2017estimating}, \\ \citet{foster2022calibeating}} \\
\midrule
\multicolumn{3}{l}{\textbf{Regression}} \\
\addlinespace[0.2em]
\hspace{1em}Marginal & --- & \citet{lee2022online} \\
\cmidrule(lr){1-3}
\makecell[l]{\hspace{1em}Quantiles} & \makecell[l]{\citet{gibbs2022conformal}, \\ \citet{ramalingam2025relationship}} & \makecell[l]{\textbf{This work}} \\
\cmidrule(lr){1-3}
\hspace{1em}Distributions & \citet{marx2025calibrated} & --- \\
\bottomrule
\end{tabular*}
\end{table}
We now summarize the existing literature. We cite a representative paper in each class.

\subsection*{IID Data}

\textbf{Classification.} Many papers on calibration or conformal prediction assume that data is IID or exchangeable. In calibration for classification, representative works include Platt scaling~\citep{platt1999probabilistic} and isotonic regression~\citep{niculescu2005predicting}. Both methods output a calibrated probability $p$ of a binary outcome in $\{0,1\}$, and admit multi-class extensions. On the other hand, conformal prediction outputs confidence sets that contain the outcome with a specified probability. Conformal prediction methods by \citet{vovk2005algorithmic} and other authors often assume that data are exchangeable. \citet{Kuleshov2022Calibrated} proves that these methods admit regret guarantees.

\textbf{Regression.} In regression, the most standard definition is quantile calibration. \citet{kuleshov2018accurate} extends Platt scaling to this setting. Conformal prediction for continuous outcomes \citep[e.g.,][]{vovk2005algorithmic} is similar to quantile calibration, but targets one pair of quantiles, while \citet{kuleshov2018accurate} outputs a full quantile function. Recently, a stronger form of calibration for regression, called distribution calibration, was studied; it directly extends calibrated classification: of the times one forecasts predictive distribution $p$, the data looks like it is distributed as $p$. \citet{song2019distribution} describes this notion and \citet{Kuleshov2022Calibrated} shows it has regret guarantees.

\subsection*{Non-IID Data}

Another line of work seeks to extend the above results to settings where data is non-IID and can be even chosen by an adversary. This is the setting that we study.

\textbf{Classification.} The earliest work is by \citet{foster98asymptoticcalibration}, who frame calibration as internal regret minimization. \citet{cesabianchi2006prediction} provide a modern view on this algorithm based on online learning. \citet{abernethy11blackwell} presents yet another view based on Blackwell approachability. Most algorithms fall in one of these three approaches (internal regret, online learning, approachability)---ours is a form of internal regret minimization. Similarly, work on conformal prediction establishes comparable results for constructing confidence sets without IID or exchangeability assumptions. There exist many extensions of this work, including extensions for multi-class \cite{ramalingam2025relationship}.

The drawback of these early works is that they only provide calibration results, but not regret. Thus a classifier can predict 50\% chance of rain every day and still be calibrated (but not useful). \citet{kuleshov2017estimating} first introduce regret into adversarial online binary recalibration; \citet{foster2022calibeating} later re-derive the same algorithm. Our work extends these regret guarantees from classification to regression.

\textbf{Regression.} \citet{gibbs2022conformal} provides analogous results to \citet{foster98asymptoticcalibration} in online quantile regression using an approach based on online learning. \citet{ramalingam2025relationship} further explains the online learning connection. The distribution calibration extension is much more challenging---\citet{marx2025calibrated} provides the first extension. Besides this fully adversarial literature, there exists extensive work on prediction under covariate shift (where a data distribution exists, but it is shifting in an unknown way)---\citet{tibshirani2019conformal} is an example of this long line of literature.

The challenge with methods such as those of \citet{gibbs2022conformal} and \citet{marx2025calibrated} is the same as in classification: there is no guarantee that calibrated predictions will have useful predictive value. Our work provides this no-regret guarantee for a setting that resembles quantile calibration.

\subsection{Comparison with Lee et al.\ [2022]}
\label{subsec:lee}
Below we discuss how the work by \citet{lee2022online} compares with and differs from our work. While the framework by \citet{lee2022online} admits a general compact, convex action set~$A$, the calibration definition achieved by their algorithm is different.
\begin{itemize}
  
 \item Their definition says: for each time step, draw a sample from the predicted distribution and the true label distribution over~$\mathcal{Y}$. Averaged over~$T$, the empirical pdf/CDFs of the samples from both distributions should match. (This definition can be applied to each population group when extending to multi-calibration).

 \item In contrast, our definition asks for quantile calibration: for any value~$p \in [0, 1]$, look at whether the observed outcome $y_t \le F_t^{-1}(p)$, i.e., whether $y_t$ is below the $p$-th quantile. The frequency with which $y_t \le F_t^{-1}(p)$ should approach~$p$ as $T$ increases.
 
\end{itemize}
Note that these two definitions are not the same. See ``Probabilistic forecasts, calibration and sharpness'' by \citet{gneiting2007probabilistic} for counter-examples: the above definitions correspond respectively to \emph{marginal} and \emph{probabilistic calibration} (i.e., (c) and (a) in Defn. 1 of that work).

For continuous~$\mathcal{Y}$, our work provides a more appropriate calibration guarantee for probability distributions over continuous outcomes: our calibration guarantee is closer to the notion of quantile calibration guarantee for regression as defined for the IID case.

Our paper also defines guarantees on regret relative to a baseline forecaster in a different way.
\begin{itemize}
   
\item  \citet{lee2022online} define regret relative to a baseline using the average Brier score $(f_t-b_t)^2$, where  $b_t$ is a sample drawn from a true (unknown) distribution over the label $y$ chosen by an adversary, and $f_t$ is a forecast coming either from the model or a baseline.

\item  In our paper, $F_t$ is a CDF over continuous outcomes and we measure its performance relative to a sequence of baseline functions using the Continuous Ranked Probability Score (CRPS), defined as $\int_{y \in \mathcal{Y}} (F_t(y) - G_t(y))^2 \,dy$, which is an integral over losses between the outputs of two CDFs (typically a forecast and an empirical/step-function observed CDF).
 
\end{itemize}
These definitions are clearly different: one takes the $L_2$ loss in the space of outcomes, and the other in the space of probabilities.

\subsection{Comparing to Kuleshov and Ermon [2017]}

While \citet{kuleshov2017estimating} and the Calibeating technique focus on binary classification, we study regression. We want to emphasize that moving from classification to regression is non-trivial and significantly more involved than generalizing the scoring rule from point forecasts to CDFs. The regression setting is significantly harder than classification, and requires (1) non-trivial thinking about how to define calibration and (2) algorithms and analyses that are substantially different than in classification.

% Defining calibration in regression is non-trivial

The classical definition of calibration (of the times when I predict $p$, the binary event holds $p$ \% of the time) does not easily carry over to regression. In fact, an ``easier'' version of regression is multi-class calibration (imagine the continuous label $y$ is discretized), and even that is PPAD-hard (Hazan and Kakade, 2012).

Thus, most work on regression studies marginal notions of calibration: a $p$-\% confidence interval contains the label $p$-\% of the time (note how we omit the ``when I predict $p$'' part). Still, maintaining this in a non-IID setting is non-trivial. One well-known method is ACI (Gibbs and Candes, 2021), but it does not admit regret guarantees. We define a novel and slightly stronger notion of marginal calibration (which has elements of conditional calibration; see Eqn 2), and we provide regret guarantees.

Also, quantifying and minimizing regret is itself non-trivial. This requires defining a suitable notion of regret that is compatible with our definition of calibration. We use the CRPS and CDF recalibration as measures of regret and calibration, respectively.

% Algorithms and analyses for calibration.

While our method superficially resembles that of \citet{kuleshov2017estimating} (and Calibeating, which is the same algorithm) in that we partition an interval and run simple subroutines in each sub-interval, the analysis is significantly different, especially the part about minimizing regret. As a rough indication, while that proof takes half a page in \citet{kuleshov2017estimating}, ours is about two pages long and is substantially different.

Note also that we provide a significant number of additional results that strengthen our core work: a generalized Markov inequality that guarantees our method is able to accurately estimate losses, an analysis of confidence intervals, and an application to online decision-making and Bayesian optimization.

\subsection{Comparing to Deshpande et al. [2024]}

Note that the focus and the methods of both papers are different. Our work makes more theoretical contributions around the feasibility of defining and maintaining good calibration and regret in an online non-IID regression setting. The work by \citet{Deshpande2021Calibrated} is mainly empirical: it applies methods from IID regression (e.g., Kuleshov and Ermon, ICML2018) and additional heuristics to obtain the best possible empirical results on classification. %Please note also that their paper is an unpublished manuscript.

We adopt a similar setup to \citet{Deshpande2021Calibrated} in our experiments because the setting is useful and inherently non-IID. However, because our work is more theoretical, our experiments are not as extensive as those of \citet{Deshpande2021Calibrated} (whose entire paper is mostly experimental). That said, our non-randomized baseline (orange line) is effectively equivalent to the IID algorithm used in \citet{Deshpande2021Calibrated} (it simply maintains marginal calibration by counting frequencies in bins), and we outperform that baseline in our experiments by virtue of designing specialized non-IID algorithms.

Lastly, while the paper by \citet{Deshpande2021Calibrated} has a lemma on online decision-making, ours holds in the online non-IID setting, while theirs is only IID.

\subsection{Applications}
\label{subsec:applications}
Consider the example of predicting the demand for electricity so that the power grid operator can make decisions. The electricity demand may fluctuate in unpredictable ways depending on changes in variables like weather, special events producing sudden large industrial demands, time of the day, etc. The adversarial setting allows us to accommodate the worst-case deviations from i.i.d.\ data. Having poorly calibrated forecasts in this setting can result in poor decisions (e.g., inadequate electricity supply). For example, an operator might want to provide enough electricity supply to keep the probability of a black-out below a target level: if the demand forecast is miscalibrated, then the true probability could be far different from the one inferred from the forecasted demand.

Some other examples include: 1) When assessing patient risk (e.g., sepsis probability) based on streaming vital signs, we require a calibrated forecast to correctly determine the probability of a bad outcome. 2) When market conditions constantly shift, predicting whether a loan is defaulted requires a calibrated probability. These are all examples of temporal data that may become non-IID since the state of the system evolves over time.

% \section{Implementation}
% \label{apdx:codebase}
% Please find our code at this anonymous link \href{https://anonymous.4open.science/r/OnlineQuantileCalibrationDemo-BD10}{https://anonymous.4open.science/r/OnlineQuantileCalibrationDemo-BD10}