\documentclass[accepted]{uai2025}

% if you need to pass options to natbib, use, e.g.:
     % \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2024


% ready for submission
%\usepackage{neurips_2024}


% if you need to pass options to natbib, use, e.g.:
%     \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2023


% ready for submission


% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
%     \usepackage[preprint]{neurips_2023}


% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2023}


% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2023}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
% \usepackage{titling}
\usepackage{wrapfig}
\usepackage[american]{babel}
%%% Some suggested packages, as needed:
% \usepackage{natbib} % has a nice set of citation styles and commands
%     \bibliographystyle{plainnat}
%     \renewcommand{\bibsection}{\subsubsection*{References}}



%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}


% many macros in here
\usepackage{scabby}

% For algorithms
\usepackage{algorithm}
%\usepackage{algorithmic}
\usepackage[noend]{algcompatible}
% \usepackage[ruled,vlined]{algorithm2e} 
%
\usepackage{subfigure}
\usepackage{float}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{subfigure}



% Notation shorthands.
% \providecommand only defines a macro whose name is still free, so any
% definition already in scope takes precedence over the ones given here.
% NOTE(review): scabby.sty is loaded earlier ("many macros in here") and may
% already define some of these names -- confirm which definition is in effect.
\providecommand{\tT}{\tilde T}

%\newcommand{\sgn}{\textrm{sign}}
% Short aliases for Greek letters.
\providecommand{\x}{\chi}
\providecommand{\e}{\epsilon}
\providecommand{\La}{\Lambda}
\providecommand{\la}{\lambda}
% Tilde-accented symbols.
\providecommand{\vt}{\tilde{v}}
\providecommand{\ut}{\tilde{u}}
\providecommand{\pit}{\tilde{\pi}}
\providecommand{\Vt}{\tilde{V}}
\providecommand{\Ut}{\tilde{U}}
% Parenthesised superscript: x\supr{k} expands to x^{(k)}.
\newcommand{\supr}[1]{^{(#1)}}

% Wide-hat symbols.
% NOTE(review): \sMh relies on \sM, which is not defined in this file --
% presumably it comes from scabby.sty; verify.
\providecommand{\sMh}{\widehat{\sM}}
\providecommand{\Mh}{\widehat{M}}
\providecommand{\Th}{\widehat{T}}
\providecommand{\vh}{\widehat{v}}

% Bar-accented symbols.
\providecommand{\pb}{\bar{p}}
\providecommand{\pib}{\bar{\pi}}
\providecommand{\Ub}{\bar{U}}
\providecommand{\Vb}{\bar{V}}
\providecommand{\Lb}{\bar{\Lambda}}

% Upright "off" operator; \operatorname needs amsmath (pulled in by mathtools).
\providecommand{\off}{\operatorname{off}}

%\newcommand{\x}{\chi}
%\newcommand{\e}{\epsilon}
% Paper-specific notation.  Most bodies use math-only commands (\hat,
% \mathcal, \mathbb, super-/subscripts), so these are meant for math mode.
\newcommand{\yh}{{\hat y}}
\newcommand{\ellavg}{{\bar \ell}}
% Calligraphic letters.
\newcommand{\Yc}{{\mathcal Y}}
\newcommand{\Ic}{{\mathcal I}}
\newcommand{\Bcal}{{\mathcal B}}
% NOTE(review): despite the name, \Fc typesets a calligraphic S, not an F --
% confirm this is intended before relying on it.
\newcommand{\Fc}{{\mathcal S}}
\newcommand{\Xc}{{\mathcal X}}
\newcommand{\Pc}{{\mathcal P}}
% Blackboard-bold R.
\newcommand{\Rb}{{\mathbb R}}
% External / internal regret.
\newcommand{\extR}{R^\mathrm{ext}}
\newcommand{\intR}{R^\mathrm{int}}
% NOTE(review): like \Fc, \Fcal typesets an S (with superscript "cal").
\newcommand{\Fcal}{S^\mathrm{cal}}
% Indicator, expectation and probability symbols.
\newcommand{\Ind}{{\mathbb{I}}}
\newcommand{\Exp}{{\mathbb{E}}}
\newcommand{\Pb}{{\mathbb{P}}}
\newcommand{\palg}{F^H}
% Indicator symbol with superscript (j); built on \Ind above.
\newcommand{\wsupj}{\Ind^{(j)}}
% \ren uses \e (the \epsilon alias defined earlier in the preamble).
\newcommand{\ren}{{\rho^\e_n}}
\newcommand{\rjt}{{\rho^{(j)}_T}}
\newcommand{\wi}{w^{(i)}}
% Bold upright vectors.  \mathbf replaces the obsolete two-letter {\bf ...}
% switch, which is only a compatibility shim in the standard classes and is
% undefined in some others; in math mode the output is the same.
\newcommand{\bfw}{{\mathbf{w}}}
\newcommand{\bfd}{{\mathbf{d}}}
\newcommand{\bfp}{{\mathbf{p}}}
\newcommand{\pk}{p^{(k)}}
% Upright "Regret" label.
\newcommand{\Regret}{\textrm{Regret}}
%\newcommand{\dist}{\mathrm{dist}}

% Theorem-like environments.  No shared-counter argument is given, so each
% environment keeps its own independent numbering.
\newtheorem{claim}{Claim}
\newtheorem{defn}{Definition}
\newtheorem{thm}{Theorem}
\newtheorem{fact}{Fact}

% Draft-time flags: place the word FIX / NEW in the margin at the point of use.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% \setcitestyle{numbers}
% \setcitestyle{square}

% Bold, bracketed inline note tagged "VK"; the argument is the note text.
% NOTE(review): presumably an author's draft annotation -- make sure no use
% of \vk remains in the camera-ready text.
\newcommand\vk[1]{\textbf{[VKVKVKVKVKVK: #1]}}
% The name "RegMin" in small caps.  \textsc replaces the obsolete {\sc ...}
% font switch, which is only a compatibility shim in the standard classes.
\newcommand\regmin{{\textsc{RegMin}}}

% \usepackage{natbib} % has a nice set of citation styles and commands
%     \bibliographystyle{plainnat}
%     \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{multirow}
\usepackage{makecell}
\usepackage{array} 
\usepackage{xcolor}
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)


\title{Calibrated Regression Against An Adversary \\Without Regret}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<ssd86@cornell.edu>?Subject=Your UAI 2025 paper}{Shachi Deshpande}{}}
\author[2]{Charles Marx}
\author[1]{Volodymyr Kuleshov}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cornell University and Cornell Tech\\
    New York, NY, USA
}
\affil[2]{%
    Computer Science Dept.\\
    Stanford University\\
    Stanford, CA, USA
}  
  \begin{document}
\maketitle
% \twocolumn[

% \aistatstitle{Adversarial Calibrated Regression
% for Online Decision Making}

% \aistatsauthor{ Volodymyr Kuleshov \And  Shachi Deshpande  }

% \aistatsaddress{ Cornell University } ]



\begin{abstract}
We are interested in probabilistic prediction in online settings in which data does not follow a probability distribution. Our work seeks to achieve two goals: (1) producing valid probabilities that accurately reflect model confidence; (2) ensuring that traditional notions of performance (e.g., high accuracy) still hold. 
%Without a data distribution, these goals may not be straightforward to define.
We introduce online algorithms guaranteed to achieve these goals on arbitrary streams of datapoints, including data chosen by an adversary.
Specifically, our algorithms produce forecasts that are (1) calibrated---i.e., an 80\% confidence interval contains the true outcome 80\% of the time---and (2) have low regret relative to a user-specified baseline model.
We implement a post-hoc recalibration strategy that provably achieves these goals in regression; previous algorithms applied to classification or achieved (1) but not (2).
In the context of Bayesian optimization, an online model-based decision-making task in which the data distribution shifts over time, our method yields accelerated convergence to improved optima. 
% Our results highlight the potential of online uncertainty estimation to improve decision-making.


%We are interested in probabilistic prediction in online settings in which data does not follow a fixed distribution.
%Standard forecasters 
%may fail in such settings, as even the concept of data probability is not well-defined.
%Here, we introduce online algorithms guaranteed to output accurate predictions on arbitrary streams of datapoints, including data chosen by an adversary.
%We define accurate forecasts as being calibrated---i.e., an 80\% confidence interval will contain the true outcome 80\% of the time---and having low regret relative to a user-specified baseline model.
%We ensure this via a general post-hoc recalibration strategy that guarantees vanishing regret across a range of settings.
%  We apply our method in the context of Bayesian optimization, an online model-based decision-making task in which the data distribution shifts over time, and observe accelerated convergence to improved optima. Our results highlight the potential of online uncertainty estimation to improve decision-making.
  
  
  
  % Accurately estimating uncertainty is an essential component of decision-making and forecasting in machine learning. However, existing uncertainty estimation methods 
  % % developed for IID data 
  % may fail when data no longer follows the distribution seen during training. Here, we introduce 
  % online uncertainty estimation algorithms that are guaranteed to be reliable on arbitrary streams of datapoints, including data chosen by an adversary.
  % Specifically, our algorithms perform post-hoc recalibration of a black-box regression model and produce outputs that are provably calibrated---i.e., an 80\% confidence interval will contain the true outcome 80\% of the time---and  that have low regret relative to the learning objective of the base model. 
  % We apply our algorithms in the context of Bayesian optimization, an online model-based decision-making task in which the data distribution shifts over time, and observe accelerated convergence to improved optima. Our results suggest that online uncertainty estimation has the potential to improve decision-making.
\end{abstract}


\section{Introduction}\label{sec:introduction}

In applications of machine learning (ML), data can change over time. Online learning algorithms can guarantee good predictive accuracy (e.g., as measured by squared error) on arbitrary data streams, even ones chosen adversarially \citep{cesabianchi2006prediction,shalev2007phd}. 
However, we are often interested not only in minimizing predictive error, but also in outputting valid probabilities representative of future outcomes
% these guarantees are typically defined in terms of regret relative to a set of benchmark policies. 
% In many applications, quantifying the confidence of predictions can be as important as accuracy or performance 
\citep{vovk2005defensive, kuleshov2018accurate, angelopoulos2021gentle}.
For example, a doctor might wish to estimate the probability of a patient being sick; similarly, a power grid operator might want to know the likelihood that demand for electricity will increase.
% Expressing such confidence, naturally requires algorithms that make probabilistic predictions over time.

In this paper, we are interested in probabilistic predictions in online settings where data does not follow a probability distribution \citep{shalev2007phd}. This setting is challenging because we need to achieve two goals on data that shifts over time: (1) producing valid probabilities that accurately reflect model confidence; (2) ensuring that traditional notions of performance (e.g., achieving a low squared error) still hold. Additionally, without a data distribution, these goals may not be straightforward to define.

Our approach towards the first goal uses calibration to define valid probabilistic forecasts \citep{foster98asymptoticcalibration,kuleshov2017estimating,gibbs2022conformal}. Intuitively, an algorithm outputs calibrated predictions if the predicted and the empirical probabilities of a predicted outcome match---i.e., an 80\% confidence interval contains the true outcome 80\% of the time. 
% We combine calibration with traditional performance metrics 
We formalize the second goal by requiring that calibrated predictions have low regret relative to a baseline uncalibrated forecaster, as measured by a proper score \citep{gneiting2007probabilistic}.
We focus on real-valued outcomes, and define online calibrated regression, a task that seeks to achieve the above two goals.

We propose algorithms for online calibrated regression that output accurate probabilistic predictions 
% that meet the above goals; these algorithms 
via the post-hoc recalibration of a black-box baseline model.
Unlike classical recalibration methods \citep{platt1999probabilistic,kuleshov2018accurate}, ours work on online non-IID data (even data chosen by an adversary). In contrast to classical online learning \citep{shalev2007phd}, we provide guarantees not only on regret, but also on the validity of probabilistic forecasts.
Crucially, unlike many online calibrated and conformal prediction algorithms for classification \citep{foster98asymptoticcalibration} or regression \citep{gibbs2022conformal}, we ensure low regret relative to a baseline forecaster. % We do this via a general construction that extends across calibration algorithms.


% Specifically, this paper introduces online uncertainty estimation algorithms that are guaranteed to be reliable on arbitrary streams of datapoints, including data chosen by an adversary. 
% Our algorithms perform post-hoc recalibration of a black-box regression model and produce outputs that are calibrated---i.e., an 80\% confidence interval will contain the true outcome 80\% of the time---and  that have low regret relative to the learning objective of the base model. 
% Unlike existing work on recalibration \citep{platt1999probabilistic,kuleshov2018accurate} ours admits provable guarantees without IID assumptions; unlike classical online learning \citep{cesabianchi2006prediction} we provide guarantees on predictive uncertainty, not only regret. 

Accurate predictive uncertainties can be especially useful in decision-making settings, where an agent uses a model of future outcomes to estimate the results of its actions (e.g., the likelihood of successfully treating a patient) \citep{Malik2019Calibrated}.
We complement our algorithms with formal guarantees on expected utility estimation in decision-making applications. % the ability of a decision-making agent to estimate its expected utility. % with a calibrated predictive model.
We apply our algorithms to several regression tasks, as well as in the context of Bayesian optimization, an online model-based decision-making task in which the data distribution shifts over time. We find that improved uncertainties in the Bayesian optimization model yield faster convergence to optimal solutions, which are also often of higher quality.

%\vk{TODO: This still needs to be edited.}
\paragraph{Contributions.} 
% Our contributions are twofold.
%
% \begin{itemize}
% \item 
First, we formulate a new problem called online calibrated regression, which requires producing calibrated probabilities on potentially adversarial input while retaining the predictive power of a given baseline uncalibrated forecaster.
%
Second, we propose an algorithm for this task that generalizes recalibration in regression to non-IID data. % and online calibrated classification to regression settings.
% We also give a construction that enables maintaining vanishing regret relative to a baseline model.
Third, we show that the algorithm can improve the performance of Bayesian optimization, highlighting its potential to improve decision-making.
% Our technique recalibrates any existing model and relies on a black-box classical calibration algorithm.
%
%Specifically, we assume we are given probability estimates $\palg_t$ from an uncalibrated forecaster $F$ (e.g., an existing online medical diagnosis algorithm) that target binary outcomes $y_t$. 
%The $(\palg_t, y_t)$ are observed one at a time and may be chosen by an adversary. 
%The goal of online recalibration is to transform the $\palg_t$ into new forecasts $p_t$ that are calibrated, even in an adversarial setting, and whose accuracy is within $\e$ of that of the $\palg_t$.

%In this work, we formulate a new problem called {\em online recalibration} which bridges together work on online calibration and batch recalibration and addresses practical problems like the medical diagnosis example above. In online recalibration, we are given forecasts $\palg_t$ from an uncalibrated forecaster $F$ (e.g. the online medical diagnosis algorithm) that target binary outcomes $y_t$. The $y_t$ are observed one at a time and may be chosen by an adversary. The goal of online recalibration is to transform the $\palg_t$ into new forecasts $p_t$ that are calibrated, and that can be made almost as accurate as the $\palg_t$.

% \item Propose an algorithm for this task that generalizes calibrated regression for i.i.d.~settings and online calibrated classification in the online learning setting.
% Our technique recalibrates any existing model and relies on a black-box classical calibration algorithm.

% \item Improvement on practical problems, including Bayesian optimization
% \end{itemize}
% motivation of doing recalibration
%Recalibration of existing classifiers is particularly appealing because it works with any real-world system, including ones that cannot be modified. In online learning, it preserves all the favorable convergence of the uncalibrated forecaster $F$ (e.g. exploiting sparsity, adaptive rates as in Adagrad), while in addition ensuring calibration.
%The result is a recalibrator that works in an adversarial setting while preserving accuracy guarantees comparable to the batch setting. 

%We evaluate the accuracy of our method on two synthetic tasks as well as two real-world problems: question-answering and predicting diabetes from genomic data. In each case, 
%We also validate our algorithm empirically on two real-world problems
%We also demonstrate empirically on two real-world problems that our method converges quickly and retains an accuracy comparable to that of the input baseline uncalibrated forecaster.

%A more interesting setting is to assume we have an online algorithm that makes (uncalibrated) forecasts $q_t$ and we want pass them through a recalibration algorithm to obtain new forecasts $p_t$ that are calibrated in the above sense and that don't have more regret than $q_t$. This setup would be interesting for several reasons:
%\begin{itemize}
%\item There exist many online learning algorithms that exploit various assumptions about covariates $x_t$ (e.g. Online sub gradient descent with different regularizers, Adagrad, online algorithms that exploit sparsity ). These algorithms have different regret. It would be great to have methods that keep the regret of an existing algorithm *and* that are also calibrated. 
%\item It is possible to do the equivalent of this in the batch setting (improve calibration without affecting sharpness), and would be nice to show that this carries over to the online setting.
%\end{itemize}

\section{Background}\label{sec:background}

% \vk{keep?} 


%\subsection{Online learning}

% \paragraph{Notation}
% We use $\Ind_E$ denote the indicator function of $E$, $[N]$ and $[N]_0$ to (respectively) denote the sets $\{1,2,...,N\}$ and $\{0,1,2,...,N\}$, and $\Delta_d$ to denote a $d$-dimensional simplex.

We place our work in the framework of online learning 
\citep{shalev2007phd}.
%
At each time step $t = 1,2,\dots$, we are given features $x_t \in \mathcal{X}$. We use a forecaster $H : \mathcal{X} \to \mathcal{F}$ to produce a forecast $f_t = H(x_t)$ from a set $\mathcal{F}$ of forecasts over a target $y \in \mathcal{Y}$.
Nature then reveals the true target $y_t \in \mathcal{Y}$ and we incur a loss of $\ell(y_t, f_t)$, where $\ell : \mathcal{Y} \times \mathcal{F}  \to \Rb^+$ is a loss function.
%The forecaster $H$ updates itself based on $x_t, y_t$, and we proceed to time $t+1$.
%
Unlike in classical machine learning, we do not assume that the $x_t, y_t$ are i.i.d.: they can be random, deterministic, or even chosen by an adversary. In this regime, online learning algorithms admit strong performance guarantees measured in terms of regret $R_T(g)$ relative to a constant prediction $g$, $R_T(g) = \sum_{t=1}^T \bigl[ \ell(y_t, f_t) - \ell(y_t, g) \bigr].$
The worst-case regret at time $T$ equals $R_T = \max_{g \in \mathcal{F}} R_T(g)$.

%The forecaster makes a prediction $\palg_t = \sigma (w_{t-1} \cdot x_t) \in [0,1]$ where $w_{t-1} \in S$ is a parameter vector, $S \subseteq \Rb^d $ is a convex set, and $\sigma : \mathbb R \to [0,1]$ is a {\em transfer function}. Nature then reveals a binary outcome $y_t \in \{0,1\}$ and $F$ incurs a loss of $\ell(\palg_t , y_t)$, where $\ell : [0,1] \times \{0,1\} \to \Rb^+$ is a convex function in $\palg_t$ for all $y_t$.
%The forecaster sets new weights $w_t$, and the cycle repeats.

%in which a forecaster $F$ tries to minimize its loss in  a game against a potentially adversarial opponent, Nature.
%%At each time step $t$, a forecaster $F$ chooses weights $w_t$ in a is a convex set $S$, while Nature  chooses (potentially adversarially) a convex loss function $f_t$; $F$ then incurs loss $f_t(w_t)$.
%In the context of binary classification, online optimization is defined as follows \citep{cesabianchi2006prediction}.  At each time $t=1,...,T$:
%\begin{enumerate}
%\item Nature chooses features $x_t \in \Rb^d$ and reveals them to the forecaster $F$.
%\item $F$ predicts $p_t = \sigma (w_{t-1} \cdot x_t) \in [0,1]$ where $w_{t-1} \in S$ is a parameter vector, $S \subseteq \Rb^d $ is a convex set, and $\sigma : \mathbb R \to [0,1]$ is a transfer function.
%\item Nature reveals a binary outcome $y_t \in \{0,1\}$. $F$ incurs a loss of $\ell(p_t , y_t)$, where $\ell : [0,1] \times \{0,1\} \to \Rb^+$ is a convex function in $p_t$ for all $y_t$.
%\item The forecaster chooses a new $w_t$.
%\end{enumerate}
%%Note that the sequence of $y_t$ can be adversarial.\vk{TODO: Comment on the properties of $\sigma$}

%The accuracy of online optimization algorithms is measured in terms of regret.
%\begin{defn}
%The regret of an online binary classification algorithm at $u$ is defined as
%$ \Regret_T(u) = \sum_{t=1}^T l(p_t, y_t)  - l(\sigma(u \cdot x_t), y_t). $
%The worst-case regret after $T$ time steps is defined as $R_T = \max_{u \in S} \Regret_T(u)$.
%\end{defn}

%The following setup is taken from Section 4 in Cesa-Bianchi and Lugosi.
%We assume that we have access to $N$ {\em actions}. 
%At each time $t=1,...,n$,
%\begin{enumerate}
%\item The environment chooses an outcome $y_t \in \Yc$.
%\item The forecaster $F$ chooses a probability distribution $p_t \in \Delta_N$ over its $N$ possible actions
%\item Nature reveals the outcome $y_t$. The forecaster samples action $I_t \sim p_t$.
%\item The forecaster incurs loss $\ell(I_t, y_t)$.
%\end{enumerate}
%
%In this setting, we make no assumptions on the loss $\ell(i, y)$. The sequence of $y_t$ can be either adversarial or oblivious (chosen in advance), but this has no effect on the performance of algorithms.
%Note that this setup does not allow for covariates. It is mostly inspired by game theory, where it used to to show that repeated no-regret play converges to an equilibrium.

% \paragraph{Learning with Expert Advice}\label{sec:advice}

% A special case of this framework arises when each $x_t$ represents advice from $N$ {\em experts}, and $H$ outputs $p_t \in \Delta_{N-1}$, a distribution over experts. Nature reveals an outcome $y_t$, resulting in an expected loss of $\sum_{i=1}^N p_{ti} \ell(y_t, a_{ti})$, where $\ell(y_t, a_{ti})$ is the loss under expert $i$'s advice $a_{ti}$. Performance in this setting is measured using two notions of regret.
% \begin{defn}
% The external regret $\extR_T$ and the internal regret $\intR_T$ are defined as
% \begin{align*}
% & \extR_T = \sum_{t=1}^T \ellavg(y_t, p_t)  - \min_{i \in [N]} \sum_{t=1}^T \ell(y_t, a_{it}) \\
% % \end{align*}
% % \begin{align*}
% & \intR_T  = \max_{i,j \in [N]} \sum_{t=1}^T p_{ti} \left( \ell(y_t, a_{it})  - \ell(y_t, a_{jt}) \right),
% \end{align*}
% where $ \ellavg(y, p) = \sum_{i=1}^N p_i \ell(y, a_{it}) $ is the expected loss.
% \end{defn}

% External regret measures loss with respect to the best fixed expert, while internal regret is a stronger notion
% that measures the gain from retrospectively switching all the plays of action $i$ to $j$.

% \begin{figure}
% \begin{center}
% \includegraphics[width=8.5cm]{figures/hist2.pdf}
% \end{center}
% \caption{Our method bins uncalibrated inputs and runs online calibration subroutines in each bin. This emulates density estimation in a non-IID setting.
% }
% \end{figure}

\paragraph{Online forecasting}
\label{sec:forecasting}

Our work extends the online learning setting to probabilistic predictions.
% at each step $t$, the forecaster outputs a forecast $f_t$ over the outcome $y_t$. 
 We focus on regression, where $y_t \in \mathbb{R}$ and the prediction $f_t$ can be represented by a cumulative distribution function (CDF), which we denote by $F_t : \mathbb{R} \to [0,1]$; $F_t(z)$ denotes the predicted probability that $y_t$ is at most $z$.
%
The quality of probabilistic forecasts is evaluated using {\em proper} losses $\ell$. Formally, 
a loss $\ell(y, f)$ is proper if
$f \in \arg\min_{g \in \mathcal{F}} \Exp_{y \sim (f)} \ell(y, g) \; \forall f \in \mathcal{F}$; i.e., the true data probability minimizes the loss.
%And example is the log-loss $\ell_\text{log}(y,p) = y\log(p) + (1-y)\log(1-p)$. 
An important proper loss for CDF predictions is the continuous ranked probability score, defined as
$\ell_\text{CRPS}(y, F) = \int_{-\infty}^\infty (F(z) - \mathbb{I}_{y \leq z})^2 dz.$

%\paragraph{Calibrated forecasting}



% \begin{defn}
% We say that $H$ is $(\e, \ell_p)$-quantile calibrated with resolution $1/N$ if
% $ \lim \sup_{T \to \infty} C_{T}^p \leq \e $ a.s.
% \end{defn}

%More formally, let $\Fcal$ be a forecaster making predictions in the set $\{\frac{i}{N} \mid i=0,...,N\}$,  where $1/N$ is called the {\em resolution} of $\Fcal$;
%consider the quantities $\rho_T(p) = \dfrac{\sum_{t=1}^T y_t \Ind_{p_t = p}}{\sum_{t=1}^T \Ind_{p_t = p}}$ and
%\begin{align}
%C_T^p & = \sum_{i=0}^N \left| \rho_T(i/N) - \frac{i}{N} \right|^p \left( \frac{1}{T} \sum_{t=1}^T \Ind_{\{p_t = \frac{i}{N}\}} \right). \label{eqn:cal_loss}
%\end{align}
%The term $\rho_T(p)$ denotes the frequency at which event $y = 1$ occurred over the times when we predicted $p$. Our intuition was that $\rho_T(p)$ and $p$ should be close to each other; we capture this using the notion of calibration error $C_T^p$
%for $p \geq 1$; this corresponds to the weighted $\ell_p$ distance between the $\rho_T(i/N)$ and the predicted probabilities $\frac{i}{N}$; typically one assumes that $p=1$ or $p=2$. To simplify notation, we will use the term $C_T$ when the exact $p$ is unambiguous.

%\begin{defn}
%We say that $\Fcal$ is an $(\e, \ell_p)$-calibrated algorithm with resolution $1/N$ if
%$ \lim \sup_{T \to \infty} C_{T}^p \leq \e $ a.s.
%\end{defn}

\paragraph{Online calibration}

Proper losses decompose into a calibration and a sharpness component: these quantities precisely define an ideal forecast.
%
Intuitively, calibration means that a 60\% prediction should be valid 60\%  of the time; sharpness means that confidence intervals should be tight.

In the online setting, there exist algorithms guaranteed to produce calibrated forecasts of binary outcomes $y_t \in \{0,1\}$ even when the $y_t$ are adversarial \citep{foster98asymptoticcalibration,cesabianchi2006prediction,abernethy11blackwell}.
These algorithms
% reduce calibrated forecasting to internal regret minimization and 
are oftentimes randomized; hence their guarantees hold almost surely (a.s.). Here, and in all other usages going forward, ``almost surely'' refers to the simulated randomness in the randomized algorithm, and not the data. 
%More formally, for any $p \in [0,1]$, we define $\rho_T(p) = {(\sum_{t=1}^T y_t \Ind_{p_t = p})}/{(\sum_{t=1}^T \Ind_{p_t = p})}$ to be the empirical frequency of $y_t$ when we predict $p$. Online calibration methods minimize the calibration error
%% \begin{align}
%$
%C_T  = \sum_{i=0}^N \left| \rho_T(i/N) - \frac{i}{N} \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{\{p_t = \frac{i}{N}\}} \right). 
%% \label{eqn:cal_loss}
%% \end{align}
%$
%This compares the observed and the predicted frequencies of $y_t$.
%The model is calibrated if $\rho_T(p) \approx p$.
However, most calibration methods do not account for covariates $x_t$ \citep{foster98asymptoticcalibration} or assume simple binary $y_t$ \citep{kuleshov2017estimating,foster2022calibeating}. We extend this work to regression and add guarantees on regret.
We provide a detailed comparison of our work with the broader literature along with some motivating examples in Appendix~\ref{sec:prior_work}. % hence are not directly applicable on standard supervised machine learning tasks.

% Recently, ~\citet{kuleshov2017estimating} introduced algorithms for online recalibration, an task in which we are given probabilistic predictions from an algorithm and seek to transform them into calibrated ones while maintaining low regret in terms of a proper loss.
% This approach yields forecasters that leverage covariates $x_t$ and possess calibration guarantees on non-i.i.d. data. However, this method only works for classification; our work extends it to regression.

%There exists a vast literature on calibration in the online setting \cite{cesabianchi2006prediction} which is primarily concerned with constructing calibrated predictions $p_t \in [0,1]$ of a binary outcome $y_t \in \{0,1\}$ based solely on the past sequence $y_1,...,y_{t-1}$.
%Surprisingly, this is possible even when the $y_t$ are chosen adversarially by reducing the problem to
%%\algorithmref{online_cal} is an example of an $\frac{N+1}{N^2}$-calibrated algorithm with resolution $1/N$.
%internal regret minimization relative to $N+1$ experts with losses $(y_t - i/N)^2$ and proposed predictions $i/N$ for $i \in [N]_0$. 
%All such algorithms are randomized, hence our results will hold almost surely (a.s.).
%%One can show using simple algebra that the internal regret $\sum_{t=1}^T w_{ti} \left( (i/N- y_t)^2  - (j/N - y_t)^2 \right)$ from switching from $i \to j$ equals $\sum_{t=1}^T \left( \frac{\sum_{t=1}^T w_{ti}y_t}{\sum_{t=1}^T w_{ti}} - i/N \right)^2 - \left( \frac{\sum_{t=1}^T w_{tj}y_t}{\sum_{t=1}^T w_{tj}} - i/N \right)^2$
%%If we cannot improve the loss $\sum_{t : I_t = i} (y_t - i/N)^2$ from playing $i/N$ by predicting $j/N$ instead of $i/N$, then the empirical frequency $\rho_T(i/N)$ must be close to $i/N$. 
%See Chapter 4 in \citep{cesabianchi2006prediction} for details.
%%\algorithmref{online_cal} is an implementation of an internal regret minimization strategy for this setup; 
%%In fact, it is possible to show that the above procedure is ($\frac{N+1}{N^2}, \ell_2)$-calibrated;
%%see Chapter 4 of \citet{cesabianchi2006prediction} for a proof as well as bounds on $C_T$ that hold uniformly over $T$. 
%%In brief, at line 5 we form a set of experts (one for each $(i,j) \in [N]\times[N]$, $i \neq j$) that shifts all the mass from action $i \to j$; in line 6, we run a step of the exponentially weighed averaged forecaster for minimizing external regret; finally, we recover new weights $w_{t+1}$ in line 7.\vk{TODO: Make less obscure}

%\begin{lemma}[\citep{cesabianchi2006prediction}]\label{lem:subroutine}
%Choose $N$ such that $\frac{N+1}{N^2} < \e$ and let $\eta = \sqrt{2\frac{\log(N(N-1))}{T}}$. Then \algorithmref{online_cal} with this choice of $N$ and $\eta$ is $\e$-calibrated and $C_T \leq O\left( \frac{1}{\e}\sqrt{\frac{\log(1/\e)}{T}} \right)~+~\e$.
%\end{lemma}

% \section{ONLINE CALIBRATED REGRESSION}\label{sec:calibration}

% % \subsection{Definitions}

% % We seek to generalize the definition of online calibration to continuous variables. 
% Our strategy is to first extend to regression the simple covariate-free online binary calibration setting proposed by \citet{foster98asymptoticcalibration}. We will later use these results to develop recalibration algorithms for online regression analogous to those of \citet{kuleshov2017estimating}.
% % We take inspiration from calibrated regression~\citep{kuleshov2018accurate}, where the forecaster outputs CDFs $F_t$. We seek to produce CDF-like forecasts such that confidence intervals with probability $p$ contain the true $y_t$ a fraction $p$ of the time.

% \paragraph{Online Calibrated Regression}

% We first define an online regression task that is analogous to the task of online binary calibration \citep{foster98asymptoticcalibration}---there are no covariates and our task is to produce  calibrated forecasts on a sequence of $y_t$ that is potentially chosen by an adversary. 
% Formally, we define {\em online calibrated regression} as a task in which at every step $t=1,2,...$ we have:
%   \begin{algorithmic}[1]
%     \STATE The agent outputs a forecast $F_t$ where $F_t(y) \in [0,1]$ estimates the probability that $y_t \leq y$. % for all $y$
%     \STATE Nature reveals label $y_t$
%     \STATE The agent updates its internal state based on $y_t$.
%   \end{algorithmic}

% Unlike ~\citet{kuleshov2017estimating}, we focus on the setting of regression. We formalize this as follows.
% \begin{assumption}
% The labels $y_t \in \mathcal{Y} \subseteq \mathcal{R}$ are continuous and bounded $|y_t| < B/2$, where $B>0$.
% \end{assumption}

% % \paragraph{Online Quantile Calibration}

% Our task is to produce calibrated forecasts. Intuitively, we say that a forecast $F_t$ is calibrated if for every $y' \in \mathcal{Y}$, the probability $F_t(y')$ on average matches the frequency of the event $\{ y \leq y' \}$---in other words the $F_t$ behave like calibrated CDFs. We formalize this intuition as follows.
% \begin{definition}
% A sequence of forecasts $F_t$ achieves online calibration for all $y \in \mathcal{Y}$ and all $p \in \mathcal{P}$,
% $ \rho_T(y, p) \to p, $ a.s.~as $T \to \infty$, where
% $ \rho_T(y, p) = \dfrac{\sum_{t=1}^T \Ind_{y_t \leq y, F_t(y) = p}}{\sum_{t=1}^T \Ind_{F_t(y) = p}}$
% \end{definition}
% In other words, out of the times when the predicted probability $F_t(y')$ for $\{y_t \leq y'\}$ to be $p$, the event $\{y_t \leq y'\}$ holds a fraction $p$ of the time.

% % \subsection{Algorithms}

% \subsection{Algorithms for Online Calibrated Forecasting}

% Next, we define an algorithm for online calibrated forecasting. Our algorithm leverages classical online binary calibration \citep{foster98asymptoticcalibration} as a subroutine.
% Formally, \algorithmref{cal} 
% partitions $[-\frac{B}{2},\frac{B}{2}]$ into $M$ intervals $\Ic = \{[\frac{-B}{2},\frac{-B}{2} + \frac{B}{M}), ..., [\frac{B}{2} - \frac{B}{M},\frac{B}{2}]\}$; each interval is associated with an instance of an online binary recalibration subroutine $\Fcal$ \citep{foster98asymptoticcalibration,cesabianchi2006prediction}. In order to compute $G_t(y \leq z)$, we invoke the subroutine $\Fcal_j$ associated with interval $I_j$ containing $z$.
% % on the data $\{\palg_t, y_t \mid \palg_t \in I_j \}$ belonging to each bucket $I_j \in \Ic$; at prediction time, it calls the instance of $\Fcal$ associated with the bucket of the uncalibrated forecast $\palg_t$.
% After observing $y_t$, each $\Fcal_j$ observes whether $y_t$ falls in its interval and updates its state.


% \begin{figure}
% \vspace{-3mm}
% \begin{algorithm}[H]
%   \caption{Online Calibration}
%   \label{algo:cal}
%   \begin{algorithmic}[1]
%     \REQUIRE Online binary calibration subroutine $\Fcal$ and number of intervals $M$
%     \STATE Initialize $\Ic = \{[\frac{-B}{2},\frac{-B}{2} + \frac{B}{M}), ..., [\frac{B}{2} - \frac{B}{M},\frac{B}{2}]\}$, a set of $M$ intervals that partition $[\frac{-B}{2},\frac{B}{2}]$.
%     \STATE Initialize $\Fc = \{ \Fcal_j \mid j = 0,...,M-1 \}$, a set of $M$ instances of $\Fcal$, one per $I_j \in \mathcal{I}$.
%     \FOR {$t=1,2,...$:}
%     \STATE Define $F_t(y \leq z)$ as the output of $\Fcal_{ j(z) }$, where $ j(z)$ is the index of the subroutine associated with the interval containing $z$.
%     \STATE Output $F_t$. Observe $y_t$ and update forecaster:
%         \FOR {$j=1,2,...,M$:}
%         		\STATE Let $o_{tj} = 1 \text{ if } y_t \in I_k \text{ for some } k \leq j \text{ else } 0$. Pass $o_{tj}$ to $\Fcal_j$.
% 		% \STATE Pass $o_{tj}$ to $\Fcal_j$
% 	\ENDFOR
%     \ENDFOR
%   \end{algorithmic}
% \end{algorithm}
% \vspace{-7mm}
% \end{figure}

% \begin{theorem}
%     Let $\mathcal{Y}_\mathcal{I}$ be the set of upper bounds of the intervals $\mathcal{I}$ and let $\mathcal{P}_S$ be the output space of $\Fcal$. Algorithm \ref{algo:cal} achieves online calibration and for all $y \in \mathcal{Y}_\mathcal{I}, p \in \mathcal{P}_S$ we have $\rho_T(y,p) \to p$ a.s. as $T \to \infty$.
% \end{theorem}

% The above theorem follows directly from the construction of Algorithm \ref{algo:cal}: for each $y \in \mathcal{Y}$, we run an online binary calibration algorithm to target the event $y_t \leq y$. See Appendix \ref{app:proofs} for a proof.

% \paragraph{Are Deterministic Algorithms Possible?}
% Algorithms $\Fcal$ for online binary calibration are randomized; thus our procedure is randomized as well. This is a key property of our task.

% \begin{theorem}
%     There does not exist a deterministic online calibrated regression algorithm that achieves online calibration.
% \end{theorem}

% This claim follows because we can encode a standard online binary calibration problem as calibrated regression. If the adversary chooses a binary $y_t \in \{0,1\} \subseteq [0,1]$ that defines one of two classes, the ratio $\rho_T(0,p)$ yields the definition of calibration in binary classification, for which no deterministic algorithms exist
% \citep{cesabianchi2006prediction}.
% %
% See Appendix \ref{app:proofs} for a proof.
% Note, however, that alternative definitions of online calibration in regression may admit deterministic algorithms~\citep{gibbs2022conformal}. 
% % \vk{cite gibbs \& candes}.



\section{Online Calibrated Regression}\label{sec:recalibration}

Next, we define a task in which our goal is to produce calibrated forecasts in a regression setting while maintaining the predictive accuracy of a baseline uncalibrated forecaster.

%We define here a new problem called
% Next, we look at the more interesting setting in which predictions for $y_t$ also involve covariates $x_t$. We extend online calibrated regression (which is analogous to the setting of \citet{foster98asymptoticcalibration}) to a setting that generalizes online recalibration by \citet{kuleshov2017estimating} to regression.
% of {\em online quantile calibrated forecasting}, in which we seek to fit a regression model $H$ to output accurate forecasts $F$ over a continuous target variable $y$ in an online regime.
% Formally, 
% We seek to obtain models that

%We define here a new problem called {\em online quantile recalibration}, in which we seek to fit a regression model $H$ to output accurate forecasts $F$ over a continuous target variable $y$ in an online regime.

%\paragraph{Online recalibration}
% We introduce an approach that is based on  the framework of recalibration.
We start with a forecaster $H$ (e.g., an online learning algorithm) that outputs uncalibrated forecasts $F_t$ at each step; these forecasts are fed into a {\em recalibrator} such that the resulting forecasts $G_t$ are calibrated and have low regret relative to the baseline forecasts $F_t$. 
%
Formally, we introduce the setup of {\em online recalibration}, in which at every step $t=1,2,...$ we have:
  \begin{algorithmic}[1]
    \STATE Nature reveals features $x_t \in \mathbb R^d$. Forecaster $H$ predicts $F_t = H(x_t)$
%    \STATE Forecaster $H$ predicts $F_t = H(x_t)$
    \STATE A recalibration algorithm produces a calibrated forecast $G_t$ based on $F_t$.
    \STATE Nature reveals continuous label $y_t \in \mathcal{Y} \subseteq \mathbb{R}$ bounded by $|y_t| < B/2$, where $B>0$.
    \STATE Based on $x_t, y_t$, we update the recalibration algorithm and optionally update $H$.
  \end{algorithmic}
  
%We focus on the setting of regression, and make the following assumption.
%\begin{assumption}
%The labels $y_t \in \mathcal{Y} \subseteq \mathcal{R}$ are 
%% continuous and 
%bounded $|y_t| < B/2$, where $B>0$.
%\end{assumption}

% \paragraph{Online Quantile Calibration}

Our task is to produce calibrated forecasts. Intuitively, we say that a forecast $F_t$ is calibrated if for every $y' \in \mathcal{Y}$, the probability $F_t(y')$ on average matches the frequency of the event $\{ y_t \leq y' \}$---in other words the $F_t$ behave like calibrated CDFs.
% We formalize this intuition as follows.
%
We formalize this intuition by introducing the ratio 
\begin{equation}
\label{eq:empirical_cal}
    \rho_T(y, p) = \dfrac{\sum_{t=1}^T \Ind_{y_t \leq y, F_t(y) = p}}{\sum_{t=1}^T \Ind_{F_t(y) = p}}.
\end{equation}
 Intuitively, we want $ \rho_T(y, p) \to p, $ as $T \to \infty$ for all $y$.
In other words, out of the times when the predicted probability $F_t(y')$ for $\{y_t \leq y'\}$ is $p$, the event $\{y_t \leq y'\}$ holds a fraction $p$ of the time. We define $ \rho_T(y, p)$ to be zero when the denominator in Equation \eqref{eq:empirical_cal} is zero. Below, we enforce that $\rho_T(y, p) \to p$ for forecasts $p$ that are played infinitely often, in that $\sum_{t=1}^T \Ind_{F_t(y) = p} \to \infty$; if a forecast ceases to be played, there is no need (or opportunity) to improve calibration for that forecast.

We measure calibration using an extension of the aforementioned calibration error $C_T$. 
%Our algorithms will output discretized probabilities; hence 
We define the calibration error of forecasts $\{F_t\}$ as %relative to a set of possible predictions $P$,
\begin{equation}
\label{eq:miscal_error}
    C_T(y)  = \sum_{p \in P_T(y)} \left| \rho_T(y,p) - p \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{\{F_t(y) = p\}} \right), 
\end{equation}
where $P_T(y) = \{F_1(y), F_2(y), ..., F_T(y)\}$ is the set of previous predictions for $\{y_t \leq y\}$.
To measure (mis)calibration for the recalibrated forecasts $G_t$, we replace $F_t$ with $G_t$ in Equation \eqref{eq:miscal_error}.
\begin{defn}
A sequence of forecasts $G_t$ is $\epsilon$-calibrated for $y \in \mathcal{Y}$ if $C_T(y) \leq R_T + \epsilon$ for $R_T = o(1)$, where $R_T$ represents the convergence rate.
\end{defn}

The interpretation of $\epsilon$-calibration is simple: for example, if $\epsilon = 0.01$, then of the times when we predict a 90\% chance of rain, the observed occurrence of rain will be between 89\% and 91\%. For most applications, an error tolerance of a few \% is acceptable. Note that the use of an error tolerance $\epsilon$ mirrors previous works \citep{foster98asymptoticcalibration,abernethy11blackwell,kuleshov2017estimating}.

The goal of recalibration is also to produce forecasts that have high predictive value \citep{gneiting2007probforecast}. 
We enforce this by requiring that the $G_t$ have low regret relative to the baseline $F_t$ in terms of the CRPS proper loss. 
% While we will not guarantee improved sharpness (and it's easy to see that sharpness might worsen), we guarantee to produce a generally sound forecast in terms of the CRPS.
Since the expected CRPS is a sum of calibration and sharpness terms, by maintaining a good CRPS while being calibrated, we effectively implement Gneiting's principle of maximizing sharpness subject to calibration \citep{gneiting2007probabilistic}.
Formally, this yields the following definition.
% \vk{separate into two?}
\begin{defn}
A sequence of forecasts $G_t$ is $\e$-recalibrated relative to forecasts $F_t$ if (a) the forecasts  $G_t$ are $\epsilon$-calibrated for all $y \in \mathcal{Y}$ and (b) the regret of $G_t$ with respect to $F_t$ is a.s.~small w.r.t.~$\ell_\textrm{CRPS}$:  
\begin{align*}
\limsup_{T \to \infty} \frac{1}{T} \sum_{t=1}^T \left( \ell_\textrm{CRPS}(y_t , G_t) - \ell_\textrm{CRPS}(y_t, F_t)\right) \leq \epsilon.
\end{align*}%\vspace{-5mm}
%\begin{itemize}
%\item The forecasts $p_t = A(\palg_t)$ are $\e$-calibrated.
%\item The regret of $p_t$ with respect to $\palg_t$ is small:  
%$$\lim_{T \to \infty} \frac{1}{T} \sum_{t=1}^T \left( \ell(p_t , y_t) - \ell(\palg_t , y_t)\right) \leq \epsilon.$$
%\end{itemize}
\end{defn}

%\paragraph{Are deterministic algorithms possible?}
%All algorithms for online binary calibration are necessarily randomized \citep{foster98asymptoticcalibration}; this is true for our task as well.
%
%\begin{theorem}
%    There does not exist a deterministic online calibrated regression algorithm that achieves online calibration.
%\end{theorem}
%
%This claim follows because we can encode a standard online binary calibration problem as calibrated regression. If the adversary chooses a binary $y_t \in \{0,1\} \subseteq [0,1]$ that defines one of two classes, the ratio $\rho_T(0,p)$ yields the definition of calibration in binary classification, for which no deterministic algorithms exist
%\citep{cesabianchi2006prediction}.
%%
%See Appendix \ref{app:proofs} for a proof.
%Note, however, that alternative definitions of online calibration in regression may admit deterministic algorithms~\citep{gibbs2022conformal}. 
%% \vk{cite gibbs \& candes}.

\section{Algorithms for Online Regression}\label{sec:framework}

\begin{figure}
\vspace{-3mm}
\begin{algorithm}[H]
  \caption{Online Recalibration}
  \label{algo:recal}
  \begin{algorithmic}[1]
    \REQUIRE Online binary calibration subroutine $\Fcal$ with resolution $N$; number of intervals $M$
    \STATE Initialize $\Ic = \{[0,\frac{1}{M}), [\frac{1}{M}, \frac{2}{M}), ..., [\frac{M-1}{M},1]\}$, a set of intervals that partition $[0,1]$.
    \STATE Initialize $\Fc = \{ \Fcal_j \mid j = 0,...,M-1 \}$, a set of $M$ instances of $\Fcal$, one per $I_j \in \mathcal{I}$.
    \FOR {$t=1,2,...$:}
    \STATE Observe uncalibrated forecast $F_t$.
    \STATE Define $G_t(z)$ as the output of $\Fcal_{\lfloor F_t(z) \rfloor}$, where $\lfloor F_t(z) \rfloor$ is the index of the subroutine associated with the interval containing $F_t(z)$.
    \STATE Output $G_t$. Observe $y_t$ and update recalibrator:
        \FOR {$j=1,2,...,M$:}
        		\STATE $o_{tj} = 1 \text{ if } F_t(y_t) \leq \frac{j}{M} \text{ else } 0$. Pass $o_{tj}$ to $\Fcal_j$.
		% \STATE 
	\ENDFOR
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\vspace{-7mm}
\end{figure}

Next, we propose an algorithm for performing online recalibration (\algorithmref{recal}). This algorithm sequentially observes uncalibrated CDF forecasts $F_t$ and returns forecasts $G_t$ such that $G_t(z)$ is a calibrated estimate for the outcome $y_t \leq z$. This algorithm relies on a classical calibration subroutine (e.g., \citet{foster98asymptoticcalibration}), which it uses in a black-box manner to construct $G_t$.

%At a high level, \algorithmref{recal} can be seen as defining a $[0,1] \to [0,1]$ mapping. 
% An uncalibrated forecast $F_t(z)$ assigns a probability to $y_t \leq z$ for each $z$; however, these may not correspond to correct empirical frequencies. 
\algorithmref{recal} can be seen as producing a $[0,1] \to [0,1]$ mapping that remaps the probability of each $z$ into its correct value.
More formally, \algorithmref{recal} 
partitions $[0,1]$ into $M$ intervals $\Ic = \{[0,\frac{1}{M}), [\frac{1}{M}, \frac{2}{M}), ..., [\frac{M-1}{M},1]\}$; each interval is associated with an instance $\Fcal$ of a binary calibration algorithm (e.g., \citet{foster98asymptoticcalibration}; see below). In order to compute $G_t(z)$, we compute $p_{tz} = F_t(z)$ and invoke the subroutine $\Fcal_j$ associated with interval $I_j$ containing $p_{tz}$.
% on the data $\{\palg_t, y_t \mid \palg_t \in I_j \}$ belonging to each bucket $I_j \in \Ic$; at prediction time, it calls the instance of $\Fcal$ associated with the bucket of the uncalibrated forecast $\palg_t$.
After observing $y_t$, each $\Fcal_j$ observes the binary outcome $o_{tj} = \mathbb{I}_{F_t(y_t) \leq \frac{j}{M}}$ and updates itself.

%The resulting procedure produces valid calibrated estimates $G_t(z)$ for each $z$ because each $\Fcal_j$ is a calibrated subroutine. More importantly the $G_t$ do not decrease the predictive performance of the $F_t$, as measured by $\ell_\text{CRPS}$. Intuitively, this is true because the $\ell_\text{CRPS}$ is the sum of calibration and sharpness, the former of which improves in $G_t$. 
%In the remainder of this section, we establish these facts formally.

%\algorithmref{recal} works because a calibrated predictor is at least as accurate as any constant predictor; in particular, each subroutine $\Fcal_j$ is at least as accurate as the prediction $\frac{j}{M}$, which also happens to be approximately $\palg_t$ when $\Fcal_j$ was called. Thus, each $\Fcal_j$ is as accurate as its input sequence of $\palg_t$. One can then show that if each each $\Fcal_j$ is accurate and calibrated, then so it their aggregate, \algorithmref{recal}. The rest of this section provides a formal version of this argument; due to space limitations, we defer most of our full proofs to the appendix.

%Note that the above construction is directly inspired by the histogram method for recalibrating probabilities in the batch setting. The histogram method splits the $(y_t, \palg_t)$ into $M$ disjoint bins $B_i = \{ (y_t, \palg_t) \mid \palg_t \in [\frac{j}{M}, \frac{j+1}{M}) \}$ for $j=0,1,...,M-1$ and predicts the average $y_t$ in each bin. If the $(y_t, \palg_t)$ are sampled i.i.d. from some $\Pb$, then the bin averages are guaranteed by the law of large numbers to converge to $ \Pb(y=1 \mid \palg_t \in [\frac{j}{M}, \frac{j+1}{M})). $ 
%%This construction assumes that  $\Pb(y=1 \mid \palg_t)$ is lower for smaller $\palg_t$ and also gives us the freedom to predict $i/M$ on each interval, which corresponds to predicting the original $\palg_t$.
%In the online setting, the $y_t$ may be adversarial and hence we cannot simply output the average of each interval. However, we can achieve the same effect by instead using an online calibrated predictor.


\subsection{Online Binary Calibration Subroutines}

A key component of \algorithmref{recal} is the binary calibration subroutine $\Fcal$.
This subroutine is treated as a black box, hence can implement a range of known algorithms including regret minimization \citep{foster98asymptoticcalibration, cesabianchi2006prediction}, Blackwell approachability \citep{abernethy11blackwell} or defensive forecasting \citep{vovk2005defensive}.
More formally, let $p_{tj}$ denote the output of the $j$-th calibration subroutine $\Fcal_j$ at time $t$. For any $p \in [0,1]$, we define $\rho_T^{(j)}(p) = {(\sum_{t=1}^T o_{tj} \Ind_{p_{tj} = p})}/{(\sum_{t=1}^T \Ind_{p_{tj} = p})}$ to be the empirical frequency of the event $\{ o_{tj} = 1\}$.
Online calibration subroutines ensure that $\rho_T^{(j)}(p) \approx p$. 


\paragraph{Assumptions.}

Specifically, 
a subroutine $\Fcal_j$ normally
outputs a set of discretized probabilities $i/N$ for $i \in \{0,1,...,N\}$. We refer to $N$ as their resolution.
We define the calibration error of $\Fcal_j$ at $i/N$ as 
$
C^{(j)}_{T,i}  = \left| \rjt(i/N) - \frac{i}{N} \right| \left( \frac{1}{T} \sum_{t=1}^T \wsupj_{t,i} \right)
% C_{T,i} = \left|  \rho_T(i/N) - \frac{i}{N} \right|^p \left( \frac{1}{T} \sum_{t=1}^T \Ind_{t,i} \right),
$
where $\wsupj_{t,i} = \Ind\{p_{tj} = i/N\}$. 
We may write the calibration loss of $\Fcal_j$ 
as $C^{(j)}_{T} = \sum_{i=0}^N C^{(j)}_{T,i}$.

We will assume that the subroutine $\Fcal$ used in \algorithmref{recal} is $\e$-calibrated in that $C^{(j)}_{T} \leq R_{T} + \e$ uniformly ($R_{T} = o(1)$ as $T \to \infty$). %; $T_j$ is the number of calls to instance $\Fcal_j$). 
% This also implies $\ell_p$-calibration (by continuity of $\ell_p$), albeit with different rates $R_{T_j}$ and a different $\e$. \citep{abernethy11blackwell} introduce $(\e, \ell_1)$-calibrated $F_j$. We also provide proofs for the $\ell_2$ loss in the appendix.
%
Recall also that the target $y_t$ is bounded as $|y_t| < B/2$.
% Finally, we assume that the input CDF forecasts $F_t$ are discretized like the other elements of our setup and are step functions over a set of values of $y$ denoted by $\mathcal{Y}$.
% \end{assumption}



\subsection{Online Recalibration Produces Calibrated Forecasts}

Intuitively, Algorithm \ref{algo:recal} produces valid calibrated estimates $G_t(z)$ for each $z$ because each $\Fcal_j$ is a calibrated subroutine. More formally, we seek to quantify the calibration of Algorithm \ref{algo:recal}. Since the $\Fcal$ output discretized probabilities, we may define the calibration loss of Algorithm \ref{algo:recal} at $y$ as
\begin{equation}
 C_{T}(y) = \sum_{i=0}^N \left|  \rho_T(y, i/N) - \frac{i}{N} \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{t,i} \right),   
\end{equation}
% $$
% C_{T}(y) = \sum_{i=0}^N \left|  \rho_T(y, i/N) - \frac{i}{N} \right| \left( \frac{1}{T} \sum_{t=1}^T \Ind_{t,i} \right),
% $$
where $\Ind_{t,i} = \Ind\{G_t(y) = i/N\}$. 
% and we may write the calibration loss of Algorithm \ref{algo:recal} at $y$ as $C_{T}(y) = \sum_{i=0}^N C_{T,i}(y)$. 
The following lemma establishes
that combining the predictions of each $\Fcal_j$ preserves their calibration. 
% First, one can see that by construction, Algorithm \ref{algo:recal} ensures a form of quantile calibration, where $\rho(\alpha, p) \to p$ for all $p, \alpha \in [0,1]$ and
% $
% \rho(\alpha, p) = \dfrac{\sum_{t=1}^T \Ind_{y_t \leq F_t^{-1}(\alpha), G_t(y) = p}}{\sum_{t=1}^T \Ind_{G_t(y) = p}},
% $
% where $F, G$ denote the baseline and recalibrated forecasts, respectively. We can further refine this construction to get a bound on our initial definition of online calibration.
Specifically, the calibration error of Algorithm \ref{algo:recal} is bounded by a weighted average of $R_{T_j}$ terms, each of which is $o(1)$; hence the bound is also $o(1)$ (see next section).

\begin{lemma}[Preserving calibration]\label{lem:calibration}
Given $y \in \mathcal{Y}$, let $T_j = |\{ 1 \leq t \leq T : \lfloor F_t(y) \rfloor = j/M \}|$ denote the number of calls to $\Fcal_j$ by \algorithmref{recal}.
If each $\Fcal_j$ is $\e$-calibrated,
then \algorithmref{recal} is also $\e$-calibrated and the following bound holds uniformly a.s. over $T$:
\begin{equation}
  C_T(y) \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e  
\end{equation}
% $$C_T(y) \leq \sum_{j=1}^M \frac{T_j}{T} R_{T_j} + \e$$ 
%\vspace{-4mm}
\end{lemma}
%\begin{proof}[Proof (sketch)]
%One can verify that $ \rho_T(i/N) = \sum_{j=1}^M \frac{ T_{ij} \rjt(i/N) }{\sum_{j=1}^M T_{ij} }$ is an average over the sub-algorithms (where $T_{ij}$ is the number of predictions $i/N$ by $F_j$). We may thus bound $\left|  \rho_T(i/N) - \frac{i}{N} \right|^p$ using this observation and Jensen's inequality.
%\end{proof}





\subsection{Online Recalibration Produces Forecasts with Vanishing Regret}

%Let us now formally explain why \algorithmref{recal} is $\e$-accurate.
%We assume below that the $\Fcal$ have resolution $1/N$.



%\paragraph{Recalibration with proper losses.}
%
%Surprisingly, not every loss $\ell$ admits a recalibration procedure. Consider, for example, the following continuously repeating sequence $001001001...$ of $y_t$'s. A calibrated forecaster must converge to predicting $1/3$ (a constant prediction) with an $\ell_1$ loss of $\approx$0.44; however predicting $0$ for all $t$ has an $\ell_1$ loss of $1/3 < 0.44$. Thus we cannot recalibrate this sequence and also remain equally accurate under the $\ell_1$ loss. The same argument also applies to batch recalibration (e.g. Platt scaling): we only need to assume that $y_t \sim \mathrm{Ber}(1/3)$ i.i.d.

Next, we want to show that the $G_t$ do not decrease the predictive performance of the $F_t$, as measured by $\ell_\text{CRPS}$. Intuitively, this is true because the $\ell_\text{CRPS}$ is a proper loss that is the sum of calibration and sharpness, the former of which improves in $G_t$. 
%In the remainder of this section, we establish these facts formally.

Establishing this result will rely on the following key technical lemma~\citep{kuleshov2017estimating} (see Appendix).

\begin{lemma}\label{lem:noregret}
Each $\e$-calibrated $\Fcal_j$
a.s.~has a small regret w.r.t.~the $\ell_2$ norm and satisfies uniformly over time $T_j$ the bound
$
%{\footnotesize
%\begin{align*}
% \intR_{T} & = 
\max_{i,k} \sum_{t=1}^{T_j} \Ind_{p_{tj} = i/N} \left( \ell_2(o_{tj}, i/N)  - \ell_2(o_{tj}, k/N) \right) \leq 2 (R_{T_j} + \e).
%\end{align*}
%}
$
%\vspace{-5mm}
\end{lemma}

%According to \lemmaref{noregret}, if a set of predictions is calibrated, then we never want to retrospectively switch to predicting $p_2$ at times when we predicted $p_1$. Intuitively, this makes sense: if predictions are calibrated, then $p_1$ should minimize the total (or average) loss $\sum_{t : p_t=p_1 } \ell(y_t, p)$ over the times $t$ when $p_1$ was predicted (at least better so than $p_2$). However, our $\ell_1$ counter-example above shows that this intuition does not hold for every loss; we need to explicitly enforce our intuition, which amounts to assuming that $\ell$ is proper, i.e. that
%$p \in \arg\min_q \Exp_{y \sim \text{Ber}(p)} \ell(y, q)$.

% \paragraph{Accuracy and calibration.}

An important consequence of \lemmaref{noregret} is that a calibrated algorithm has vanishing regret relative to any fixed prediction (since minimizing internal regret also minimizes external regret). Using this fact, it becomes possible to establish that \algorithmref{recal} is at least as accurate as the baseline forecaster. % $F$.
%\lemmaref{noregret} is important because a internal regret-minimizing algorithm also minimizes external regret w.r.t.~any fixed prediction. We use this observation to show that \algorithmref{recal} preserves the accuracy of its baseline uncalibrated forecaster.

\begin{lemma}[Recalibration with low regret accuracy]\label{lem:accuracy}
Consider \algorithmref{recal} with
parameters $M \geq N > 1/\e$ and let $\ell$ be the CRPS proper loss. % for which
Then the recalibrated $G_t$ a.s.~have vanishing $\ell$-loss regret relative to $F_t$ and we have a.s.:
\begin{equation*}
\frac{1}{T} \sum_{t=1}^T \ell (y_t , G_t) - \frac{1}{T} \sum_{t=1}^T \ell(y_t , F_t) <NB R_T + \frac{2B}{N} 
\end{equation*}
%\vspace{-5mm}
\end{lemma}

\begin{proof}[Proof (sketch)]
When $p_{tj} = G_t(y)$ is the output of a given binary calibration subroutine $\Fcal_j$ at some $y$, we know that $\lfloor F_t(y) \rfloor = j/M$ (by construction). Additionally, we know from \lemmaref{noregret} that $\Fcal_j$ minimizes regret. Thus, it has vanishing regret in terms of $\ell_2$ loss relative to the fixed prediction $j/M$: $\sum_{t=1}^{T_j} (o_{tj}-p_{tj})^2 \leq \sum_{t=1}^{T_j} (o_{tj}-j/M)^2 + o(T_j)$. But $o_{tj} = \Ind_{F_t(y_t) \leq j/M}$, and during the times $t$ when $\Fcal_j$ was invoked, $p_{tj} = G_t(y)$ and $j/M = F_t(y)$. Aggregating over $j$ and integrating over $y$ yields our result.
\end{proof}

These two lemmas lead to our main claim: that \algorithmref{recal} solves the online recalibration problem.

\begin{theorem}\label{thm:main}
Let $\Fcal$ be an $(\epsilon/2B)$-calibrated online subroutine with resolution $N \geq 2B/\epsilon$. 
Then \algorithmref{recal} with parameters $\Fcal$ and $M=N$ outputs $\epsilon$-recalibrated forecasts.
\end{theorem}

\begin{proof}
By \lemmaref{calibration}, \algorithmref{recal} is $(\e/2B)$-calibrated and by \lemmaref{accuracy}, its regret w.r.t. the $F_t$ tends to $< 2B/N < \e$. Hence, \theoremref{main} follows.
\end{proof}

% In the appendix, we provide a detailed argument for how $\ell$ can be chosen to be the misclassificaiton loss.

\paragraph{General proper losses}

Throughout our analysis, we have used the CRPS loss to measure the regret of our algorithm. This raises the question: is the CRPS loss necessary? One answer to this question is that if the loss $\ell$ used to measure regret is not a proper loss, then recalibration is not possible. % for some $\e > 0$.
%Note also that our results so far crucially relied on measuring accuracy using proper losses. This raises a natural question: can we use other losses for calibration? Our second theorem provides a negative answer.

\begin{theorem}
If $\ell$ is not proper, then no algorithm achieves recalibration w.r.t.~$\ell$ for all $\e > 0$.
\end{theorem}

% The proof of this algorithm is a slight generalization of the counter-example provided for the $\ell_1$ loss. Interestingly, it holds equally for online and batch settings. 
% It builds on the result of \citet{kuleshov2017estimating} and adds to our knowledge of the limitations of recalibration algorithms.

%\vspace{10mm}
%\subsection{Extensions}\label{sec:extensions}

% The assumptions of \lemmaref{accuracy} essentially require that $\ell$ be Lipschitz with constant $B$, which holds e.g.~for convex bounded losses that are studied in online learning. Our assumption is slightly more general since $\ell$ may also be discontinuous (like the misclassification loss).
% When $\ell$ is unbounded (like the log-loss), its values at the baseline algorithm's predictions must be bounded away from infinity.

On the other hand, in Appendix \ref{app:regret}, we provide a more general analysis that shows that: (1) a calibrated $\Fcal$ must have vanishing regret relative to a fixed prediction as measured using any proper score; (2) \algorithmref{recal} achieves vanishing regret relative to any proper score.
See Appendix \ref{app:regret} for a formal statement and proof.

\section{Applications}

\subsection{Choice of Recalibration Subroutine}

%   \begin{figure}
%   \vspace{-6mm}
% \begin{table}[H]
% \begin{center}
%   \def\arraystretch{1.5}
%   \begin{tabular}{c | c | c}
% Subroutine & {\footnotesize Regret Minimization} & {\footnotesize Blackwell Approchability} \\
% \hline
% Time {\scriptsize / step} & $O({1}/{\e})$ & $O(\log({1}/{\e}))$ \\
% Space {\scriptsize / step} & $O({1}/{\e^2})$ & $O({1}/{\e^2})$ \\
% Calibration & $O({1}/{\e \sqrt{\e T}})$ & $O({1}/{\e \sqrt{T}})$ \\
% Advantage & Simplicity & Efficiency
% \end{tabular}
% \end{center}
% \vspace{-2mm}
% \caption{\footnotesize\label{tbl:rates}Time and space complexity and convergence rate of \algorithmref{recal} using different subroutines.}
% \end{table}
%     \vspace{-8mm}
%   \end{figure}
%The above argument shows that as $T \to \infty$ the calibration loss $C_T$ of \algorithmref{recal} and its regret with respect to $\palg_t$ are bounded by $\e$. Here, we examine the rate of this convergence.

\algorithmref{recal} is compatible with any binary recalibration subroutine $\Fcal$. Two choices of $\Fcal$ include methods based on {\bf internal regret minimization} \citep{mannor2010calibration} and ones based on {\bf Blackwell approachability} \citep{abernethy11blackwell}. These yield different computational costs and convergence rates for \algorithmref{recal}.

Specifically, recall that $R_T$ denotes the rate of convergence of the calibration error $C_T$ of \algorithmref{recal}.
For most online calibration subroutines $\Fcal$,
$R_{T} \leq f(\e)/\sqrt{T}$ for some $f(\e)$.
In such cases, we can further bound the calibration error in \lemmaref{calibration} as
\begin{equation}
 \sum_{j=1}^M \frac{T_j}{T} R_{T_j} \leq \sum_{j=1}^M \frac{\sqrt{T_j}f(\e)}{T} \leq \frac{f(\e)}{\sqrt{ \e T}}.    
\end{equation}
% $$
% \sum_{j=1}^M \frac{T_j}{T} R_{T_j} \leq \sum_{j=1}^M \frac{\sqrt{T_j}f(\e)}{T} \leq \frac{f(\e)}{\sqrt{ \e T}}. 
% $$
In the second inequality, we set the $T_j$ to be equal. 
%
Thus, our recalibration procedure introduces an overhead of
$ \frac{1}{\sqrt{\e}} $
in the convergence rate of the calibration error $C_T$ and of the regret in \lemmaref{accuracy}.
In addition, we require $ \frac{1}{{\e}} $ times more memory and computation time (we run $1/\e$ instances of $\Fcal_j$). 
% Overall, our runtime in linear in $M$, and the cost of \algorithmref{recal} is negligible relative to fitting the base model.
% \tableref{rates} summarizes the convergence rates of \algorithmref{recal} when the subroutine is either the method of \citep{abernethy11blackwell} based on Blackwell approachability or the simpler but slower approach based on internal regret minimization \cite{mannor2010calibration}.
%Note also that by the same argument, an extra $1/\sqrt{\e}$ is added to the convergence rate of the regret $ \frac{1}{T} \sum_{t=1}^T (y_t - p_t)^2 - \frac{1}{T} \sum_{t=1}^T (y_t - \palg_t)^2 $ w.r.t. $\palg$.

When using an internal regret minimization subroutine, 
% \cite{mannor2010calibration}, 
the overall calibration error of \algorithmref{recal} is bounded as $O({1}/{\e \sqrt{\e T}})$ with $O(1/\e)$ time and $O(1/\e^2)$ space complexity. These numbers improve to $O(\log(1/\e))$ time complexity for a $O({1}/{\e \sqrt{T}})$ calibration bound when using the method of \citet{abernethy11blackwell} based on Blackwell approachability. The latter choice is what we recommend.

\subsection{Uncertainty Estimation}

We complement our results with ways in which \algorithmref{recal} can yield predictions for various confidence intervals. % and other useful information.
\begin{theorem}
    Let $G_t$ for $t=1,2,...,T$ denote a sequence of $(\epsilon/2)$-calibrated forecasts. For any interval $[y_1, y_2]$, we have $\frac{1}{T} \sum_{t=1}^T ( G_t(y_2) - G_t(y_1) ) \to \frac{1}{T} \sum_{t=1}^T \mathbb{I}\{y_t \in [y_1, y_2]\}$ as $T \to \infty$ a.s.
\end{theorem}

This theorem justifies the use of $G_t(y_2) - G_t(y_1)$ to estimate the probability of the event that $y_t$ falls in the interval $[y_1, y_2]$: on average, predicted probabilities will match true outcomes. The proof follows directly from the definition of 
$\epsilon$-calibration. This result directly mirrors the construction for calibrated confidence intervals in \citet{kuleshov2018accurate}.

\subsection{Online Decision-making}

Consider a doctor seeing a stream of patients. For each patient $x_t$, they use a model $M$ of an outcome $y_t$ to estimate a loss $\ell(x_t) = \mathbb{E}_{y \sim M(x_t)} \ell(y, a(x_t), x_t)$ for a decision $a(x_t)$ (which could be $a(x_t) = \arg \min_a \mathbb{E}_{y \sim M(x_t)} [ \ell(y, a, x_t) ]$, e.g., a treatment that optimizes an expected outcome).
We want to guarantee that the doctor's predictions will be correct: over time, the realized loss will not substantially exceed the estimated expected loss. Crucially, we want this to hold in non-IID settings.

Our framework enables us to achieve this result with only a weak condition---calibration.
The following concentration inequality shows that the realized loss is unlikely to exceed the estimated loss $\ell(x_t)$ by a large factor on average (proof in Appendix \ref{app:applications}). If the data were IID, this would be Markov's inequality; surprisingly, a similar statement holds in non-IID settings.
% \vk{cite individual calibration}

\begin{theorem}
\label{thm:dist_calib_bound_app}
Let $M$ be a calibrated model and
let $\ell(y, a, x)$ be a monotonically non-increasing or non-decreasing loss in $y$.
Then for any sequence $(x_t, y_t)_{t=1}^T$ and $r > 1$, we have:
\begin{equation}
    \label{eqn:dist_calib_bound1}
    \lim_{T \to \infty} \frac{1}{T} \sum_{t=1}^T \mathbb{I} \left[ \ell(y_t, a(x_t), x_t) \geq r \ell(x_t) \right] \leq 1 / r.
\end{equation}
\end{theorem}

%\paragraph{Multiple experts.}
%
%%Imagine now that we are faced with $K$ uncalibrated systems each making predictions $\pk_t$. We can adapt 
%\algorithmref{recal} can also be adapted to simultaneously recalibrate the predictions $\pk_t$ of $k=1,2,...,K$ uncalibrated systems while also preserving these systems' accuracy.
%
%We may achieve this via a regret-minimizing forecaster $F$ that treats the $\pk_t$ as advice from $K$ experts (see \sectionref{advice}). Let $\palg_t$ denote the forecasts of $F$ and recall that by definition of regret,
%$ \frac{1}{T} \sum_{t=1}^T (y_t - \palg_t)^2 - \frac{1}{T} \sum_{t=1}^T (y_t - \pk_t)^2 \leq \extR_T$
%for all $k$, where $\extR_T$ is the external regret of $F$. Running \algorithmref{recal} on $\palg_t$ yields predictions $p_t$ with vanishing regret relative to every $\pk_t$:
%\begin{align*}
%\frac{1}{T} \sum_{t=1}^T & (y_t - p_t)^2 
%\leq \frac{1}{T} \sum_{t=1}^T (y_t - \palg_t)^2 + \frac{R^\textrm{recal}_T}{T} + \e 
%\leq \frac{1}{T} \sum_{t=1}^T (y_t - \pk_t)^2 + \frac{\extR_T}{T} + \frac{R^\textrm{recal}_T}{T} + \e.
%\end{align*}
%Above, we used $R^\textrm{recal}_T$ to denote the regret of \algorithmref{recal} relative to $\palg_t$.
%%
%Since typically $\extR_T =O(\sqrt{(\log K) / T})$ \cite{cesabianchi2006prediction}, our rates do not significantly degrade relative to the $K=1$ case.
%%Furthermore, the dependence of $\extR_T$ on $K$ is determined only by the choice of $F$, which is independent from our framework. %This allows us to use the best known regret minimization algorithm for a particular problem.

%\paragraph{Multiclass prediction.}
%%Classical online recalibration algorithms naturally extend beyond the binary setting
%%There exists a classical online recalibration algorithm that
%In the multiclass setting, we seek a recalibrator $A : \Delta_{K-1} \to \Delta_{K-1}$ producing calibrated probabilities $p_t \in \Delta_{K-1}$ that target class labels $y_t \in \{1,2,...,K\}$.
%In analogy to binary recalibration, we may discretize the input space $\Delta_{K-1}$ into a $K$-dimensional grid
%and train a classical multi-class calibration algorithm $\Fcal$ \cite{cesabianchi2006prediction} on each subset of $\palg_t$ associated with a cell. 
%Just like in the binary setting, a classical calibration method $\Fcal_j$ predicts calibrated $p_t \in \Delta_{K-1}$ based solely on past multiclass labels $y_1,y_2,...,y_{t-1}$; it can serve as a subroutine within \algorithmref{recal}.
%
%
%However, in the multi-class setting, this construction will require $O(1/\e^K)$ running time per iteration, $O(1/\e^{2K})$ memory, and will have a convergence rate of $O(1/(\e^{2K} \sqrt{T}))$. The exponential dependence on $K$ cannot be avoided, since the calibration problem is fundamentally PPAD-hard \cite{hazan2012calibration}. 
%%The first level of discretization introduced by \algorithmref{recal} is also hard to improve on: notice that if the $y_t$ are sampled from some distribution, in the worst case calibration requires us to estimate an arbitrary density $\BP(y | F(x) = \palg)$.
%However, there may exist practical workarounds inspired by popular heuristics for the batch setting, such as one-vs-all classification \cite{zadrozny2002transforming}.


%  \begin{wrapfigure}{R}{0.5\textwidth}
%%  \vspace{-8mm}
%\begin{center}
%\includegraphics[width=7cm]{figures/synth.pdf}
%\end{center}
%\caption{\footnotesize \label{fig:synth}We compare predictions from an uncalibrated expert $F$ (blue), \algorithmref{recal} (green), and \regmin~(red) on sequences $y_t \sim \textrm{Ber}(0.5)$ (plots a, b) and on adversarially chosen $y_t$ (plots c, d). 
%%Plots (a), (c) display the $\ell_2$ loss over first $t$ iterations; plots (b), (d) display the calibration error.
%}
%%\vspace{-5mm}
%  \end{wrapfigure}
  
% \begin{figure}
% \vspace{-2mm}
% \hspace{-5mm}
% \includegraphics[width=9.2cm]{figures/synth.pdf}\vspace{-1mm}
% \caption{\footnotesize \label{fig:synth}We compare predictions from an uncalibrated expert $F$ (blue), \algorithmref{recal} (green), and \regmin~(red) on sequences $y_t \sim \textrm{Ber}(0.5)$ (plots a, b) and on adversarially chosen $y_t$ (plots c, d). 
% %Plots (a), (c) display the $\ell_2$ loss over first $t$ iterations; plots (b), (d) display the calibration error.
% }
% \vspace{-3mm}
% \end{figure}
  
% \section{Experiments}\label{sec:experiments}

% We now proceed to study \algorithmref{recal} empirically. \algorithmref{recal}'s subroutine is the standard internal regret minimization approach of \citep{cesabianchi2006prediction} ("\regmin"). We measure calibration and accuracy in the $\ell_2$ norm.
% % subroutine ; we feel that this simpler algorithm will be preferred by most users.


% \paragraph{Predicting a Bernoulli sequence.}

% We start with a simple setting where we observe an i.i.d. sequence of $y_t \sim \textrm{Ber}(p)$ as well as uncalibrated predictions $(\palg_t)_{t=1}^T$ that equal $0.3$ whenever $y_t=0$ and $0.7$ when $y_t=1$. The forecaster $F$ is essentially a perfect predictor, but is not calibrated.

% In \figureref{synth}, we compare the performance of \regmin~(which does not observe $\palg_t$) to \algorithmref{recal} and to the uncalibrated predictor $F$. Both methods achieve low calibration error after about 300 observations, while the expert is clearly uncalibrated (\figureref{synth}b); however, \regmin~ is a terrible predictor: it always forecasts $p_t = 0.5$ and therefore has high $\ell_2$ loss (\figureref{synth}a). \algorithmref{recal}, on the other hand, makes perfect predictions by recalibrating the input $\palg_t$.
% %This example demonstrates that unlike earlier methods, our effectively algorithm accounts for both calibration and sharpness. Note, however, that we pay the price of slower convergence of the calibration error compared to \regmin~.

% \paragraph{Prediction against an adversary.}

% Next, we test the ability of our method to achieve calibration on adversarial input. At each step $t$, we choose $y_t = 0$ if $p_t > 0.5$ and $y_t = 1$ otherwise; we sample $\palg_t \sim \textrm{Ber}(0.5)$, which is essentially a form of noise. In \figureref{synth} (c, d), we see that \algorithmref{recal} successfully ignores the noisy forecaster $F$ and instead quickly converges to making calibrated (albeit not very accurate) predictions (it reduces to \regmin).

% \paragraph{Natural language understanding.}

% %Calibrated confidence scores enable natural language interfaces to determine whether they have understood a command, or whether they need clarifications. Such systems may be trained based on user feedback that arrives in an online fashion and is well-suited to our adversarial assumptions.

% We used \algorithmref{recal} to recalibrate a state-of-the-art question answering system \cite{berant2014semantic} on the popular Free917 dataset (641 training, 276 testing examples). We trained the system on the training set as described in \cite{berant2013semantic} and then calibrated probabilities using \algorithmref{recal} in one pass over first the training, and then the testing examples. This setup emulates a pre-trained system that further improves itself from user feedback.

% \begin{figure*}[t]
% \vspace{-2mm}
%     \centering
%     \begin{subfigure}
% {        \centering
% 	\includegraphics[width=8.5cm]{figures/semparse.pdf}}
% %        \caption{\footnotesize \label{fig:semparse}\algorithmref{recal} (green) and Platt scaling (red) are used to recalibrate raw probabilities from a state-of-the-art question answering system (blue). We track prediction (a) and calibration error (b) over time; plot (c) represents the level of calibration after seeing all the data; the size of each circle is proportional to the number of predictions in the corresponding bucket.}
%     \end{subfigure}\hspace{3mm}
%     \begin{subfigure}
% {        \centering
% \includegraphics[width=8.5cm]{figures/wtccc.pdf}}
% %\caption{\footnotesize \label{fig:wtccc}\algorithmref{recal} (green) is used to recalibrate probabilities from an online SVM trained to predict diabetes outcome from genomic data (blue). We track prediction (a) and calibration error (b) over time; plot (c) represents the level of calibration after seeing all the data; the size of each circle is proportional to the number of predictions in the corresponding bucket.}
%     \end{subfigure}\vspace{-1mm}
%         \caption{\footnotesize \label{fig:semparse}\algorithmref{recal} (green) is used to recalibrate probabilities from a question answering system (left) and a medical diagnosis system (right; both in blue). We track prediction (a) and calibration error (b) over time; plot (c) displays calibration curves after seeing all the data; circle sizes are proportional to the number of predictions in the corresponding bucket.}
% \vspace{-2mm}
% \end{figure*}

% %\begin{wrapfigure}{L}{0.5\textwidth}
% %\vspace{-9mm}
% %\begin{center}
% %\includegraphics[width=7cm]{figures/semparse.pdf}
% %\end{center}
% %\caption{\footnotesize \label{fig:semparse}\algorithmref{recal} (green) and Platt scaling (red) are used to recalibrate raw probabilities from a state-of-the-art question answering system (blue). We track prediction (a) and calibration error (b) over time; plot (c) represents the level of calibration after seeing all the data; the size of each circle is proportional to the number of predictions in the corresponding bucket.}
% %\end{wrapfigure}

% \figureref{semparse} (left) compares our predicted $p_t$ to the raw system probabilities $\palg_t$ via {\em calibration curves}. Given pairs of predictions and outcomes $p_t, y_t$, we compute for each of $N$ buckets $B \in \{[\frac{i}{N},\frac{i+1}{N}) \mid 0 \leq i \leq 1\}$, averages $\bar p_B = \sum_{t:p_t \in B} p_t /N_B$ and $\bar y_B = \sum_{t:p_t \in B} y_t /N_B$, where $N_B = |\{p_t \in B\}|$. A calibration curve plots the $\bar y_B$ as a function of $\bar p_B$; perfect calibration corresponds to a straight line.

% Calibration curves indicate that the $\palg_t$ are poorly calibrated in buckets below 0.9, while \algorithmref{recal} fares better. \figureref{semparse}a confirms that our accuracy (measured by the $\ell_2$ loss) tracks the baseline forecaster.
% %\figureref{semparse} also compares against Platt scaling (re-trained at each step), which has trouble making predictions on the test data (see red line). The default settings of \citep{berant2014semantic} lead to a slight overfitting on the training set, which affects Platt scaling but not our method. We may expect similar behavior under other forms of distributional shift. Overfitting by Platt scaling is a known phenomenon \cite{niculescu2005predicting}, and it is common to use a separate calibration set to avoid it.

% \paragraph{Medical diagnosis.}

% Our last task is predicting the risk of type 1 diabetes from genomic data.
% %This setup is an example of a medical diagnosis problem in which we need calibrated confidence scores are important and where patients arrive one at a time in a way that is not easily modeled by a i.i.d samples from a distribution.
% We use genotypes of 3,443 subjects (1,963 cases, 1,480 controls) over 447,221 SNPs \cite{wellcome2007genome}, with alleles encoded as $0,1,2$ (major, heterozygous and minor homozygous resp.). We use an online $\ell_1$-regularized linear support vector machine (SVM) to predict outcomes one patient at a time, and report performance for each $t \in [T]$. Uncalibrated probabilities are normalized raw SVM scores $s_t$, i.e. $\palg_t = (s_t + m_t)/2m_t$, where $m_t = \max_{1 \leq r \leq t} |s_r|$.

% \figureref{semparse} (right) measures calibration after observing all the data. Raw scores are not well-calibrated outside of the interval $[0.4, 0.6]$; recalibration makes them almost perfectly calibrated. \figureref{semparse} further shows that the calibration error of \algorithmref{recal} is consistently lower throughout the entire learning process, while accuracy approaches to within $0.01$ of that of $\palg_t$. %We found that Platt scaling performed very close to method, suggesting that the i.i.d.~assumption was valid in this setting.

% %\begin{wrapfigure}{R}{0.5\textwidth}
% %\begin{center}
% %\includegraphics[width=7cm]{figures/wtccc.pdf}
% %\end{center}
% %\caption{\footnotesize \label{fig:wtccc}\algorithmref{recal} (green) is used to recalibrate probabilities from an online SVM trained to predict diabetes outcome from genomic data (blue). We track prediction (a) and calibration error (b) over time; plot (c) represents the level of calibration after seeing all the data; the size of each circle is proportional to the number of predictions in the corresponding bucket.}
% %\end{wrapfigure}

\section{Experiments}\label{sec:experiments}

Next, we evaluate \algorithmref{recal} on regression tasks as well as on Bayesian optimization, a sequential decision-making process that induces a non-i.i.d.~data distribution. We performed all  experiments on a laptop, indicating the low overhead of our method.

\paragraph{Baselines}

We compare our randomized online calibration with two baselines. Calibrated regression is a popular algorithm for the IID setting~\citep{kuleshov2018accurate} and can be seen as estimating the same mapping as Algorithm \ref{algo:recal} using kernel density estimation with a tophat kernel.  
Non-randomized online calibration uses the same subroutine as \algorithmref{recal}, but outputs the expected probability as opposed to a random sample; we found this to be a strong baseline that outperforms simple density estimation and reveals the value of randomization.

\textbf{Analysis of calibration.} We assess the calibration of the base model and the recalibrated model with calibration scores defined using the probability integral transform~\citep{gneiting2007probforecast}. 
% For each input $X$, we can use the base model to predict the probability distribution over outcome $y$. Hence, it is possible to compute the probability density $P$ at the observed outcome $y$ using the output of the base model. Thus, 
We define the calibration score as 
$\text{cal}(p_1, y_1, \dots, p_n, y_n) = \sum_{j=1}^{m} ((q_j-q_{j-1}) - \hat{p}_j)^2,$
where $q_0=0 < q_1 < q_2 < \dots < q_m = 1$ are $m$ confidence levels. Each $\hat{p}_j$ is estimated as $\hat{p}_j = |\{ y_t \mid q_{j-1} \leq p_t \leq q_j,\ t=1,\dots,n\}|/n.$ 
% The calibration scores are computed on each batch of data $\{P_t, y_t\}_{t=nt'+1}^{n(t'+1)}$, before being observed , where $n$ is the batch size and $t'$ is the time-step.

\subsection{UCI Datasets}

We experiment with four multivariate UCI datasets~\citep{Dua2019UCI} to evaluate our online calibration algorithm.  

\textbf{Setup.}
Our dataset consists of input and output pairs $\{x_t, y_t\}_{t=1}^{T}$ where $T$ is the size of the dataset. 
We simulate a stream of data by sending batches of data-points $\{x_t, y_t\}_{t=nt'+1}^{n(t'+1)}$ to our model, where $t'$ is the time-step and $n$ is the batch-size. This simulation is run for $\left \lceil{T/n}\right \rceil $ time-steps. For each batch, Bayesian ridge regression is fit to the data and the recalibrator is trained. 
We set $N=20$ in the recalibrator and use a batch size of $n=10$ unless stated otherwise.

\begin{table*}[t]
  \caption{Evaluation of Online Calibration on UCI Datasets. We compare the performance of online calibration against non-randomized online calibration, kernel density estimation, and uncalibrated (i.e., raw) baselines. Our method produces the lowest calibration errors in the last time step. Standard errors are reported in parentheses (10 experimental runs, fixed dataset).}
  \label{table:uci}
  \centering
  % \resizebox{\textwidth}{!}
  \begin{small}
  % {
  % \begin{tabular}{lcccc}
  %   \toprule
  %   % Dataset & \multicolumn{4}{c}{Calibration Error (Last Time-step) }\\
  %   % \midrule
  %   Dataset & Uncalibrated  & Kernel Density & Online Calibration & Online Calibration \\
    
  %   & (Raw) & Estimation	& (Non-randomized) & \\
  %   \midrule
  %   Aq. Toxicity (Daphnia Magna) & {0.0087} & {0.0037} & {0.0040} & {\textbf{0.0017}} \\
  %   % (Daphnia Magna)& & & &  \\
  %   Aq. Toxicity (Fathead Minnow) & {0.0108} & {0.0137} & {0.0124} &  {\textbf{0.0072}} \\
  %   % (Fathead Minnow)& & & &  \\
     
  %     Energy Efficiency & 0.3336 & 0.0708 & 0.0822 & \textbf{0.0528} \\
  %     Facebook Comment Volume & 0.2510 & 0.1060 & 0.0628 & \textbf{0.0524} \\
    
  %   \bottomrule
  % \end{tabular}
  % }
  {
  \begin{tabular}{lcccc}
    \toprule
    % Dataset & \multicolumn{4}{c}{Calibration Error (Last Time-step) }\\
    % \midrule
    Dataset & Uncalibrated  & Kernel Density & Online Calibration & Online Calibration \\
    
    & (Raw) & Estimation	& (Non-randomized) & \\
    \midrule
    Aq. Toxicity (Daphnia Magna) & {0.0081 (0.0001)} & {0.0055 (0.0002)} & {0.0058 (0.0003)} & {\textbf{0.0027 (0.0001)}} \\
    % (Daphnia Magna)& & & &  \\
    Aq. Toxicity (Fathead Minnow) & {0.0111 (0.0000)} & {0.0097 (0.0005)} & {0.0084 (0.0005)} &  {\textbf{0.0031 (0.0003)}} \\
    % (Fathead Minnow)& & & &  \\
     
      Energy Efficiency & 0.3322 (0.0001) & 0.2857 (0.0356) & 0.1702 (0.0094) & \textbf{0.1156 (0.0061)} \\
      Facebook Comment Volume & 0.2510 (0.0000) & 0.0589 (0.0050) & 0.0623 (0.0000) & \textbf{0.0518 (0.0002)} \\
    
    \bottomrule
  \end{tabular}
  }
   \end{small}
\end{table*}


\begin{figure*}[tb]
\centering     %%% not \center
\vspace{-3mm}
\subfigure[Aquatic Toxicity (Daphnia Magna)]{\label{fig:daphnia-aquatic-toxicity}\includegraphics[width=0.49\linewidth]{figures/recalibration-aquatic-toxicity-total-gaussian_recalib.png}}
\subfigure[Aquatic Toxicity (Fathead Minnow)]{\label{fig:fathead-aquatic-toxicity}\includegraphics[width=0.49\linewidth]{figures/recalibration-fish-toxicity-total-gaussian_recalib.png}}

\caption{Performance of Online Calibration on the Aquatic Toxicity Datasets. Aquatic toxicity towards two different aquatic species (Daphnia Magna, Figure~\ref{fig:daphnia-aquatic-toxicity}, and Fathead Minnow, Figure~\ref{fig:fathead-aquatic-toxicity}) is predicted by the base model. In both datasets, online calibration outperforms the baseline methods.}
\label{fig:aquatic-toxicity}
\end{figure*}

\paragraph{Aquatic toxicity datasets}
We evaluate our algorithm on the QSAR (Quantitative Structure-Activity Relationship) Aquatic Toxicity Dataset (Figure~\ref{fig:daphnia-aquatic-toxicity}; batch size $n=5$)
%quantitative structure-activity relationship
and Fish Toxicity Dataset (Figure~\ref{fig:fathead-aquatic-toxicity}; batch size $n=10$), where aquatic toxicity towards two different aquatic species is predicted using 8 and 6 molecular descriptors as features, respectively. %A batch-size of 5 was used for the Aquatic Toxicity dataset and a batch-size of 10 was used for the Fish Toxicity dataset. 
In Figure~\ref{fig:aquatic-toxicity}, we can see that the randomized online calibration algorithm produces a lower calibration error than the non-randomized baseline. We also compare the performance of our algorithm against uniform kernel density estimation by maintaining a running average of probabilities in each incoming batch of data-points. For the Fish Toxicity Dataset, we can see that only online calibration improves calibration errors relative to the baseline model. We report all final calibration errors in Table~\ref{table:uci}. 
% \begin{wrapfigure}{r}{7cm}
% \begin{figure}
% % \vspace{-0.5cm}
% \centering     %%% not \center
% \subfigure{\label{ewa-recalibrator-energy-efficiency}\includegraphics[width=1.01\linewidth]{figures/recalibration-energy-efficiency-gaussian_recalib.png}}
% % \subfigure[Running Average Recalibrator]{\label{mean-recalibrator-energy-efficiency}\includegraphics[width=0.4\linewidth]{figures/recalibration-energy-efficiency-gaussian_mean_recalib.png}}
% \caption{Online recalibration (blue, bottom) attains a lower calibration error at a faster rate than baselines (red and top, middle) on the Energy dataset.}
% \label{fig:energy-efficiency}
% % \vspace{-1cm}
% \end{figure}
% \end{wrapfigure}




\begin{figure*}[!htb]
\centering     %%% not \center
\subfigure[Energy Efficiency]{\label{ewa-recalibrator-energy-efficiency}\includegraphics[width=0.49\linewidth]{figures/recalibration-energy-efficiency-gaussian_recalib.png}}
\subfigure[Facebook Comment Volume]{\label{ewa-recalibrator-facebook-comments}\includegraphics[width=0.49\linewidth]{figures/recalibration-facebook-comments-total-gaussian_recalib.png}}

\caption{Performance of Online Calibration on the Energy Efficiency and Facebook Comment Volume Datasets. In both datasets, online recalibration (blue, bottom) attains a lower calibration error at a faster rate than baselines (red and top, middle).}
\label{fig:other-datasets}
\vspace{-2mm}
\end{figure*}


\paragraph{Energy efficiency dataset}
The heating load and cooling load of a building are predicted using 8 building parameters as features. 
In Figure~\ref{ewa-recalibrator-energy-efficiency}, we see that the calibration errors produced by the online calibration algorithm drop sharply within the initial 10 time-steps. The baselines also produce a drop in calibration scores, but it happens more gradually. 


% 
\paragraph{Facebook comment volume dataset}
In Figure~\ref{ewa-recalibrator-facebook-comments}, the Facebook Comment Volume Dataset is used where the number of comments is to be predicted using 53 attributes associated with a post. We use the initial 10000 data-points from the dataset for this experiment. Here, the non-randomized and randomized online calibration algorithms produce a similar drop in calibration errors, but the  randomized online calibration algorithm still dominates both baselines (Table~\ref{table:uci}).



\subsection{Bayesian Optimization}

We also apply online recalibration in the context of Bayesian optimization, an online model-based decision-making task in which \textbf{the data distribution shifts over time} (it is the result of our actions). We find that improved uncertainties yield faster convergence to higher quality optima.

 % \begin{wraptable}{l}{8.5cm}
\begin{table}
% \vspace{-0.6cm}
  \caption{Recalibrated Bayesian Optimization}
  \label{table:bayes-opt-uai}
  \centering
  % \resizebox{\textwidth}{!}
  % \begin{small}
{
  \begin{tabular}{lcc}
    \toprule
    % Benchmark & \multicolumn{4}{c}{Minima Obtained (Last Time-step) }\\
    % \midrule
    Benchmark & Uncalibrated & Recalibrated  \\
    
    %& (Raw) & & (Non-randomized) & \\
    \midrule
    % Sixhumpcamel (2D) & -0.378 (0.146) & & -1.029 (0.002) &  \\ 
    Ackley (2D) & 9.925 (3.502)  & \textbf{8.313 (3.403)}   \\    
      % Alpine (10D) & 18.554 (0.886)  & \textbf{14.025 (2.204)}  \\
    SixHump (2D) & -0.378 (0.146) & \textbf{-1.029 (0.002)}  \\ 
    Ackley (10D) & 14.638 (0.591) &  \textbf{10.867 (2.343)}   \\    
      Alpine (10D) & 13.911 (1.846) &  \textbf{12.163 (1.555)}   \\      
    \bottomrule
  \end{tabular}
  }
  % \end{small}
  % \vspace{-0.5cm}
\end{table}
% \end{wraptable}
% \vspace{-2mm}

\begin{figure*}[h]
\centering     %%% not \center
\subfigure[SixHumpCamel]{\label{ewa-recalibrator-sixhumpcamel}\includegraphics[width=0.32\linewidth]{figures/sixhumpcamel_new_aggregate_convergence_comparison.png}}
\subfigure[Beale]{\label{beale}\includegraphics[width=0.32\linewidth]{figures/beale_new_aggregate_convergence_comparison.png}}
\subfigure[McCormick]{\label{Mccormick}\includegraphics[width=0.32\linewidth]{figures/mccormick_new_aggregate_convergence_comparison.png}}
% \subfigure[Cosines]{\label{Cosines}\includegraphics[width=0.4\linewidth]{ figures/cosines_new_aggregate_convergence_comparison.png}}
% \caption{Online Calibration Improves Bayesian optimization}
% \subfigure[Ackley]{\label{Ackley}\includegraphics[width=0.4\linewidth]{ figures/ackley_new_aggregate_convergence_comparison.png}}
% \subfigure[Alpine]{\label{Alpine}\includegraphics[width=0.4\linewidth]{ figures/alpine1_new_aggregate_convergence_comparison.png}}
% \subfigure[Rosenbrock]{\label{Rosenbrock}\includegraphics[width=0.4\linewidth]{ figures/rosenbrock_new_aggregate_convergence_comparison.png}}
\caption{Performance of Recalibration Methods on Bayesian Optimization Benchmarks}
\vspace{-2mm}
\label{fig:bayes-opt-main}
\end{figure*}

% \begin{wrapfigure}{r}{7cm}
% \begin{figure}
% % \vspace{-1.5cm}
% \centering     %%% not \center
% \subfigure{\label{ewa-recalibrator-facebook-comments}\includegraphics[width=0.97\linewidth]{figures/recalibration-facebook-comments-total-gaussian_recalib.png}}
% % \subfigure[Running Average Recalibrator]{\label{mean-recalibrator-facebook-comments}\includegraphics[width=0.4\linewidth]{figures/recalibration-facebook-comments-total-gaussian_mean_recalib.png}}
% \caption{
% % Online Calibration on the Facebook Comments Volume Dataset. 
% % The randomized and non-randomized online calibration algorithms both outperform the simple kernel density estimation baseline by producing s sharper drop in calibration errors in the initial 100 time-steps. 
% Online recalibration yields the best calibration on Facebook Comments Volume Data}
% \label{fig:facebook-comments}
% % \vspace{-0.5cm}
% \end{figure}


\paragraph{Setup}
Bayesian optimization attempts to find the global minimum $x^\star = \arg \min_{x \in \mathcal{X}} f(x)$ of an unknown function $f:\mathcal{X} \to \mathbb{R} $ over an input space $\mathcal{X} \subseteq \mathbb{R}^D$. 
We are given an initial labeled dataset $\{(x_t, y_t)\}_{t=1}^{n} \subset \mathcal{X} \times \mathbb{R}$ of size $n=3$. At every time-step $t$, we use normal and recalibrated uncertainties from the probabilistic model $\mathcal{M}:\mathcal{X} \to (\mathbb{R} \to [0, 1])$ of $f$ (here, a Gaussian Process) to select the next data-point $x_{\mathrm{next}}$ and iteratively update the model $\mathcal{M}$. 
% \paragraph{Setup.} 
We use popular benchmark functions to evaluate the performance of Bayesian optimization. 
% We initialize the Bayesian optimization with 3 randomly chosen data-points. 
We use the Lower Confidence Bound (LCB) acquisition function to select the data-point $x_t$.
See Appendix \ref{apdx:bayes_opt} for details.
% At any given time-step $T$, we have the dataset $\mathcal{D}_T = \{x_t, y_t\}_{t=1}^{T}$ collected iteratively. We use leave-one-out cross-validation splits to obtain a calibration dataset. 



Table \ref{table:bayes-opt-uai} shows that 
the online recalibration of uncertainties in a Bayesian optimization (BO) model
% online recalibrated Bayesian optimization can 
achieves lower minima than an uncalibrated model (results averaged over 5 overall BO runs with fixed initialization).
Figure~\ref{fig:bayes-opt-main} shows that online recalibrated Bayesian optimization can also reach optima in fewer steps. The error bars for the Beale and McCormick functions are too small to be visible in the plots. All error bars denote standard errors. % We also show improvements over calibration without randomization. %using online calibration of uncertainties provided by $\mathcal{M}$ allows us to reach a lower minima with Bayesian optimziation. 

\begin{table*}[!h]
  \caption{Comparison to Existing Methods in the Literature}
  \label{table:comparison}
  \centering
  \begin{small}
  {
  % \begin{tabular}{llllllll}
  %   \toprule
  %   Method & Setting  & Output & Calibration & Recalibrator & Regret & Proof Technique \\
  %   \midrule
  %   Foster \& Vohra \cite{foster98asymptoticcalibration} &	Class.	& $p_t \in [0,1]$ & Conditional & n/a &	n/a & Int.~regret min. \\
  %   Kuleshov \& Ermon \cite{kuleshov2017estimating} &	Class.	& $p_t \in [0,1]$ & Conditional & $p$-to-$p$ & L2 loss & Int.~regret min. \\
  %   Gibbs \& Candes \cite{gibbs2022conformal} &	Regr.	& $q_t \in [0,1]$ & One quantile & $q$-to-$q$ & n/a & Quantile regr. \\
  %   Ours &	Regr.	& CDF $F_t$ & CDF $\forall y$ & $F(y)$-to-$F(y)$ & CRPS & CDF regr. \\
  %   \bottomrule
  % \end{tabular}
  \begin{tabular}{llllllll}
    \toprule
    Method & Setting  & Output & Calibration & Recalibrator & Regret & Proof Technique \\
    \midrule
    \cite{foster98asymptoticcalibration} &  Class.  & $p_t \in [0,1]$ & Conditional & n/a & n/a & Int.~regret min. \\
    \cite{kuleshov2017estimating} & Class.  & $p_t \in [0,1]$ & Conditional & $p$-to-$p$ & L2 loss & Int.~regret min. \\
    \cite{gibbs2022conformal} & Regr. & $q_t \in [0,1]$ & One quantile & $q$-to-$q$ & n/a & Quantile regr. \\
    Ours &  Regr. & CDF $F_t$ & CDF $\forall y$ & $F(y)$-to-$F(y)$ & CRPS & CDF regr. \\
    \bottomrule
  \end{tabular}
  }
  \end{small}
\end{table*}

\section{Discussion}\label{sec:discussion}

\paragraph{Adversarial calibration methods}

Table~\ref{table:comparison} compares our method against its closest alternatives. Unlike previous algorithms aimed at classification that output a probability forecast $p_t \in [0,1]$ for a binary outcome \citep{foster98asymptoticcalibration,kuleshov2017estimating}, we study marginal quantile calibration in regression.
Our work resembles adaptive conformal inference \citep{gibbs2022conformal}, but provides a CDF-like object $F_t$ instead of one confidence interval $q_t \in [0,1]$ and yields a different notion of calibration. Crucially, we provide regret guarantees relative to a baseline model. 

Specifically, our technical goal is marginal CDF calibration: estimating the probability of the event 
$y_t \leq y$ for all $y$. Note that these probabilities are marginal over the $y_t$; this is in contrast to conditional calibration for $y_t = 1| p_t = p$ as in \citet{kuleshov2017estimating}.
We call our technical strategy online CDF regression (by analogy to quantile regression): we remap the predicted probabilities $F_t(y)$ (for any $y$) to a calibrated probability $R(F_t(y))$.
Our proof technique establishes calibration by relating final calibration to the calibration of each subroutine using Jensen’s inequality.
We establish low regret by aggregating the regret of all the subroutines within one CRPS loss.
% This step is particularly involved compared to the analogous argument in \citet{kuleshov2017estimating}.

% Our technical strategy allows us to achieve three key goals: (1) identifying a notion of calibration in online regression; (2) identifying a suitable notion of regret; (3) deriving the first algorithm that achieves this property.

Most existing methods in online calibrated classification \citep{foster98asymptoticcalibration,vovk2005defensive,abernethy11blackwell,okoroafor2024faster} or regression \citep{gibbs2022conformal} do not provide guarantees for regret, except online recalibrated classification \citep{kuleshov2017estimating} and calibeating \citep{foster2022calibeating,lee2022online}. However, these methods are only for binary classification, whereas ours is for regression. When compared with \citet{lee2022online}, our work achieves a different calibration definition that is more appropriate for continuous outcomes together with a different notion of regret (see Appendix~\ref{subsec:lee} for a detailed comparison). 

\paragraph{Marginal calibration}
Our definition of calibration in regression is  marginal across all $x_t, y_t$; this is in contrast to classification \citep{foster98asymptoticcalibration}, where calibration is conditional (also known as distributional) on each $p$. 
Marginal calibration implies that the true outcome falls below the 90\% quantile 90\% of the time (averaged over all $t$). 
% This definition is useful even though the full CDF is not calibrated (each quantile is): it is commonly used to form confidence intervals \citep{kuleshov2018accurate, gibbs2022conformal}, and we provide one simple way to do that using our method.
%
Distribution calibration in regression \cite{Kuleshov2022Calibrated} would be PPAD-hard by reduction from multi-class \citep{hazan2012calibration}. Marginal calibration is also currently a common definition of calibration for regression. For example, \citet{kuleshov2018accurate} in the IID setting or \citet{gibbs2022conformal} in the online setting adopt this definition.



\paragraph{Batch vs online calibration}

%If the $x_t, y_t$ have a distribution $\BP$, the calibration problem reduces to estimating the density $\BP(y|F(x)=\palg)$. 

\algorithmref{recal} can be seen as a direct counterpart to the histogram technique, a simple method for density estimation. With the histogram approach, the $F_t$ is split into $N$ bins, and the average $y$ value is estimated for each bin. Because of the i.i.d.\ assumption, the output probabilities are calibrated, and the bin width determines the sharpness. Note that by Hoeffding's inequality, the average for a specific bin converges at a faster rate of $O({1}/{\sqrt{T_j}})$~\citep{devroye1996probabilistic}, as opposed to the $O({1}/{\sqrt{\e T_j}})$ rate given by \citet{abernethy11blackwell}; hence online calibration is harder than batch. 
% highlighting the challenge of calibration in an online setting.




% \paragraph{Conformal Prediction} Conformal prediction \citep{vovk2005defensive} is a technique for constructing calibrated predictive sets. It has been extended to handle distribution shifts \citep{hendrycks2018using, tibshirani2019conformal, Barber2022Conformal}, as well as to online adversarial data \citep{gibbs2022conformal}. Calibrated prediction \citep{platt1999probabilistic,kuleshov2018accurate} is closely related to conformal prediction, but focuses on predicting distributions rather than sets.


% \paragraph{Deterministic Forecasting} 

% Interestingly, the method of \citet{gibbs2022conformal} is deterministic, while ours is randomized, and we proved that this is unavoidable for our type of calibration. However, \citet{vovk2005defensive} defined deterministic algorithms for {\em weak} calibration, which asks for a small difference between average predicted $p_t$ and true $y_t$ at times $t$ when $p_t \approx p^*$ for any $p^*$. Crucially, the relation $\approx$ is ``soft": it can be obtained by setting the indicator functions in the definition of $\rho_T(y,p)$ to continuous approximations. On the other hand, we take ``$\approx$" as an exact equality.
% We anticipate that our construction can admit a similar deterministic extension to weak calibration by using subroutines from \citet{vovk2005defensive}.




\section{Previous Work \& Conclusion}

Calibrated probabilities are widely used as confidence measures in the context of binary classification.
Such probabilities are obtained via recalibration methods, of which Platt scaling \cite{platt1999probabilistic} and isotonic regression \cite{niculescu2005predicting} are by far the most popular. Recalibration methods also possess multiclass extensions, which typically involve training multiple one-vs-all predictors \cite{zadrozny2002transforming}, as well as extensions to ranking losses \cite{menon2012ranking}, combinations of estimators \cite{zhong2013accurate}, and structured prediction \cite{kuleshov2015calibrated}. Recalibration algorithms have been applied to improve reinforcement learning~\citep{Malik2019Calibrated}, Bayesian optimization~\citep{Deshpande2021Calibrated, stanton2023bayesian} and deep learning~\citep{Kuleshov2022Calibrated}. Crucially, all of these methods implicitly rely on the assumption that data is sampled i.i.d.~from an underlying distribution; they can be interpreted as density estimation techniques.

% In the online setting, the calibration problem was formalized by \citep{dawid1982well}; 
Online calibration was first proposed by \citet{foster98asymptoticcalibration}. Existing algorithms are based on internal regret minimization \cite{cesabianchi2006prediction} or on Blackwell approachability \cite{foster1997proof}; recently, these approaches were shown to be closely related \cite{abernethy11blackwell,mannor2010calibration}. 
% Unfortunately, online calibration is PPAD-hard \cite{hazan2012calibration}.
%
Conformal prediction \citep{vovk2005defensive} is a technique for constructing calibrated predictive sets; it has been extended to handle distribution shifts \citep{hendrycks2018using, tibshirani2019conformal, Barber2022Conformal}, and online adversarial data \citep{gibbs2022conformal}. 
% Calibrated prediction \citep{platt1999probabilistic,kuleshov2018accurate} is closely related to conformal prediction, but focuses on predicting distributions rather than sets.

% The concepts of calibration and sharpness were first formalized in the statistics literature \cite{murphy1973vector,gneiting2007probabilistic}. These metrics are captured by a class of {\em proper} losses and can be used both for evaluating \cite{buja05lossfunctions,brocker2009decomposition} and constructing \cite{kuleshov2015calibrated} calibrated forecasts.



% In this paper, 
% \section{CONCLUSION}
\paragraph{Conclusion}

% \section{Conclusion}
% \paragraph{Conclusion}

%The need for calibrated probability estimates naturally arises in many applications, including medical diagnosis and natural language processing. Current recalibration techniques implicitly require that the data is distributed i.i.d., which potentially makes them unreliable when this assumption does not hold. 
%Although the online learning literature introduces several techniques that forgo of this assumption, these algorithms are not suitable for machine learning applications, since they do not admit covariates, and therefore cannot produce useful predictions. 

% We define online calibrated forecasting to generalize calibrated regression to settings where data may not be distributed as IID. Current recalibration techniques implicitly require that the data is distributed i.i.d., which potentially makes them unreliable when this assumption does not hold. 
% In this work, we introduced the first recalibration technique that provably recalibrates any existing forecaster with a vanishingly small degradation in accuracy. This method does not make i.i.d.~assumptions, and is provably calibrated even on adversarial input. We analyzed our method's theoretical properties and showed excellent empirical performance on several real-world benchmarks, where the method converges quickly and retains good accuracy.


We presented a novel approach to uncertainty estimation that leverages online learning. Our approach extends existing online learning methods to handle predictive uncertainty while ensuring high accuracy, providing formal guarantees on calibration and regret on adversarial input. 

We introduced a new problem called online calibrated forecasting, and proposed algorithms that generalize calibrated regression to non-IID settings. Our methods are effective on several predictive tasks and hold potential to improve performance in sequential model-based decision-making settings where we are likely to observe non-stationary data. 

% \paragraph{Acknowledgements.} This work is supported by the NSF (grant \#1649208) and by the Future of Life Institute (grant 2016-158687).

\newpage


% \bibliographystyle{plainnat}
\bibliography{all}
% \newpage
% \input{checklist}
%\newpage

% \title{Online Calibrated Regression for Adversarially Robust Forecasting (Supplementary Material)}
%\appendix
% \end{document}
% \begin{document}
\input{appendix}
%\input{nuerips_checklist}

\end{document}
