% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[dvipsnames]{xcolor}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{amsthm}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{chung_631}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% BEGIN: OUR DEFINED COMMANDS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{bbm}
\usepackage{amsfonts, amsmath, amssymb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{caption}
\usepackage{subcaption}
\hypersetup{
    colorlinks=true,
    allcolors=NavyBlue
}

% Bold x; used in this file for the inputs x_t of the test set and of the
% OPS algorithm. Same expansion as \bx below.
\newcommand{\x}{\mathbf{x}}

% --- Symbol shorthands (blackboard-bold / calligraphic / bold / hatted) ---
% NOTE(review): several pairs expand to the same symbol
% (\Xcal and \X, \Ycal and \Y, \Dcal and \D, \x and \bx).
\newcommand{\Prob}{\mathbb{P}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ncal}{\mathcal{N}}
\newcommand{\Xcal}{\mathcal{X}}
% Norm with auto-sized double bars around the argument.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
% \mathbbm comes from the bbm package loaded above.
\newcommand{\one}{\mathbbm{1}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\loglike}{\mathcal{L}\mathcal{L}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\A}{\mathcal{A}}
\newcommand{\X}{\mathcal{X}}
\newcommand{\Y}{\mathcal{Y}}
\newcommand{\D}{\mathcal{D}}
\newcommand{\F}{\mathcal{F}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bh}{\mathbf{h}}
\newcommand{\bp}{\mathbf{p}}
\newcommand{\bxtilde}{\Tilde{\bx}}
% NOTE(review): the name suggests a bold X, but this expands to a bold S -- confirm intended.
\newcommand{\bX}{\mathbf{S}}
% Hatted F and p; the body uses them for the distributional forecast \hF_t
% and the parity probability \hp_t.
\newcommand{\hF}{\hat{F}}
\newcommand{\hp}{\hat{p}}
% Indicator written as a blackboard 1 followed by the argument in literal braces.
\newcommand{\indicator}[1]{\mathbbm{1}\{#1\}}
% Open box pushed to the right margin by \hfill.
\newcommand{\qedwhite}{\hfill \ensuremath{\Box}}
% Starred form: sub/superscripts are set as limits (underneath in display style).
\DeclareMathOperator*{\argmin}{arg\,min}

% --- Upright text labels for method and function names ---
% \ops is used as a superscript on the OPS parameters in the algorithm listing.
\newcommand{\ops}{\text{OPS}}
\newcommand{\mw}{\text{MW}}
\newcommand{\iw}{\text{IW}}
\newcommand{\Ycal}{\mathcal{Y}}
\newcommand{\sigmoid}{\text{sigmoid}}
\newcommand{\logit}{\text{logit}}
\newcommand{\Dcal}{\mathcal{D}}

% --- Theorem-like environments (amsthm) ---
% NOTE(review): amsthm applies the style that is active when \newtheorem is
% called, so `definition' below is created before \theoremstyle{definition}
% takes effect and gets the default style; only `example' gets the
% definition style. Confirm this ordering is intended.
\newtheorem{definition}{Definition}
\theoremstyle{definition}
\newtheorem{example}{Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% END: OUR DEFINED COMMANDS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{Parity Calibration\\(Supplementary Material)}

\author[1]{\href{mailto:<youngsec@cs.cmu.edu>?Subject=Parity Calibration}{Youngseog Chung}{}}
% \author[1]{Youngseog Chung}
\author[1]{Aaron Rumack}
\author[1]{Chirag Gupta}
% Add affiliations after the authors
\affil[1]{%
    Machine Learning Department,
    Carnegie Mellon University,
    Pittsburgh, Pennsylvania, USA
}

\setcounter{figure}{8}
\setcounter{table}{5}
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\hypersetup{urlcolor=black}
\maketitle
\hypersetup{urlcolor=NavyBlue}

\appendix
\section{Details on Evaluation: reliability diagrams and metrics}
We provide details on how we assess a sequence of distributional forecasts $\{\hF_t\}_{t=1}^{T}$ and parity probabilities $\{\hp_t\}_{t=1}^{T}$, given a
test dataset $\mathcal{D}_{\text{test}} = \{\x_t, y_t\}_{t=1}^{T}$. We assess distributional forecasts via Quantile Calibration, and the parity probabilities via Parity Calibration, Sharpness, and Accuracy metrics.
\label{app:metrics}
\begin{itemize}
    \item \textbf{Quantile Calibration: reliability diagram and calibration error}
    
    To assess the quantile calibration of the distributional forecast $\hF_t$, we produce the reliability diagram using the \textit{Uncertainty Toolbox}~\citep{chung2021uncertainty}. This process works as follows. We take $100$ equi-spaced quantile levels in $[0, 1]$: $p_i \in$ \texttt{np.linspace(0, 1, 100)},
    and for each $p_i$, we compute the empirical coverage of the predictive quantile $\hF^{-1}_t(p_i)$ 
    with $\frac{1}{T}\sum_{t=1}^{T}\indicator{y_t \leq \hF^{-1}_t(p_i)}$, and we denote this quantity as $p_{i, \text{obs}}$.
    Note that $p_{i, \text{obs}}$ is an empirical estimate of the term $\frac{1}{T}\sum_{t=1}^{T}F_t(\hF^{-1}_t(p_i))$, from Eq.~\eqref{eq:probabilistic-calibration}.
    The reliability diagram is produced by plotting $\{p_i\}$ on the $x$-axis against $\{p_{i, \text{obs}}\}$ on the $y$-axis. Quantile Calibration Error (QCE) is then computed as the average of the absolute difference between $p_i$ and $p_{i, \text{obs}}$ over the $100$ values of $p_i$: $\frac{1}{100}\sum_{i=1}^{100}\lvert p_{i, \text{obs}} - p_i\rvert$.
    
    \item \textbf{Parity Calibration: reliability diagram and calibration error} 

    For parity calibration, we produce the reliability diagram following the standard method in binary classification~\citep{degroot1981assessing, niculescu2005predicting}. Note that the parity probability $\hp_t$ is a prediction for the parity outcome $\widetilde{y}_t := \indicator{y_t \leq y_{t-1}}$ (Eq.~\eqref{eq:parity-outcome}). Specifically, we first take $30$ fixed-width bins of the predicted parity probabilities: $\{B_{m}\}_{m=1}^{30}$, where $B_{m} = [\frac{m-1}{30}, \frac{m}{30})$ for $m < 30$ and $B_{30} = [\frac{29}{30}, 1]$.
    The average outcome in bin $B_m$ is computed as $\text{obs}(B_m) = \frac{1}{\lvert B_m \rvert} \sum_{t: \hp_t \in B_m} \indicator{\widetilde{y}_t = 1}$, and the average prediction of bin $B_m$ is computed as $\text{pred}(B_m) = \frac{1}{\lvert B_m \rvert} \sum_{t: \hp_t \in B_m} \hp_t$, where $\lvert B_m \rvert$ denotes the number of predictions $\hp_t$ that fall in bin $B_m$.
    The reliability diagram is then produced by plotting $\text{pred}(B_m)$ on the $x$-axis against $\text{obs}(B_m)$ on the $y$-axis.
    The blue bars in the background of each parity calibration reliability diagram represent the size of each bin: $\lvert B_m \rvert$. Parity Calibration Error (PCE) is then computed with this reliability diagram following the standard definition of ($\ell_1$-)expected calibration error (ECE): $\sum_{m=1}^{30}\frac{\lvert B_m \rvert}{T}\lvert \text{obs}(B_m) - \text{pred}(B_m) \rvert$.
    
    \item \textbf{Sharpness}
    
    Assuming the same notation as above, sharpness is computed as: $\sum_{m=1}^{M}\frac{\lvert B_m \rvert}{T}\cdot \text{obs}(B_m)^{2}$, where $M$ is the total number of bins. As indicated above, we use $M=30$ in all of our experiments. 
    We provide some additional intuition on this metric. A perfectly knowledgeable forecaster which outputs $\hat{p}_t = \widetilde{y}_t$ will place all predictions in either $B_1$ or $B_M$ and achieve sharpness $ = \frac{\lvert B_1 \rvert}{T}\cdot \text{obs}(B_1)^{2} + \frac{\lvert B_M \rvert}{T}\cdot \text{obs}(B_M)^{2} = \frac{\lvert B_1 \rvert}{T}\cdot 0^{2} + \frac{\lvert B_M \rvert}{T}\cdot 1^{2} = \frac{\lvert B_M \rvert}{T} = \frac{\sum_{t=1}^{T} \widetilde{y}_t}{T}$. On the other hand, if the forecaster places all predictions into a single bin $B_k$, then its sharpness will be $\text{obs}(B_k)^{2} = \left(\frac{\sum_{t=1}^{T} \widetilde{y}_t}{T}\right)^{2}$.
    It can be shown that sharpness is always within the closed interval $\left[ \left(\frac{\sum_{t=1}^{T} \widetilde{y}_t}{T}\right)^{2}, \frac{\sum_{t=1}^{T} \widetilde{y}_t}{T} \right]$~\citep{brocker2009reliability}. 
    Intuitively, sharpness measures the degree to which the forecaster attributes different valued predictions to events with different outcomes (i.e. labels). Hence, a sharper, or more precise, forecaster has more discriminative power, and this is reflected in a higher sharpness metric.
     
    \item \textbf{Accuracy metrics (Acc and AUROC)} 
    
    Accuracy is measured in the binary classification sense, where the true labels are the observed parity outcomes: $\indicator{y_t \leq y_{t-1}}$ (Eq.~\eqref{eq:parity-outcome}).
    \begin{itemize}
        \item \textbf{Binary accuracy (Acc)} is computed by regarding $\hp_t \geq 0.5$ as the positive class prediction, and the opposite case as the negative class prediction. 
        \item \textbf{Area under the ROC curve (AUROC)} is computed using the \texttt{scikit-learn} Python package, which implements the standard definition of the score. 
        Specifically, we called the function \texttt{sklearn.metrics.roc\_auc\_score} with the predictions $\{\hp_t\}$ and labels $\indicator{y_t \leq y_{t-1}}$.
    \end{itemize}
\end{itemize}

\section{Additional Details on Case Studies}
\subsection{Additional Details on COVID-19 Case Study}\label{app:covid-appendix}
\subsubsection{Details on Interpolating Expert Forecasts for COVID-19 Case Study}
The expert forecast provided by the COVID-19 Forecast Hub is represented as a set of quantiles.
To derive the parity probabilities $\hp_{s,t}$, we need to interpolate the expert forecast, as the forecast contains predicted quantiles at only 7 quantile levels: $\{0.025, 0.1, 0.25, 0.5, 0.75, 0.9, 0.975\}$. 
We interpolate under the assumption that the density between two adjacent quantiles (at levels $\tau_k$ and $\tau_{k+1}$) is defined by the normal distribution specified by those two quantiles. Specifically, for two quantile levels $\tau_k$ and $\tau_{k+1}$ and forecast values $x^{(s,t)}_k$ and $x^{(s,t)}_{k+1}$, we compute
\begin{gather*}
\sigma^{(s,t)}_k = \frac{x^{(s,t)}_{k+1} - x^{(s,t)}_k}{\Phi^{-1}(\tau_{k+1}) - \Phi^{-1}(\tau_k)},\\
\mu^{(s,t)}_k = x^{(s,t)}_k - \sigma^{(s,t)}_k \Phi^{-1}(\tau_k),
\end{gather*}
where $\Phi$ is the standard normal cdf. For each forecast, if $x^{(s,t)}_k \leq y_{s,t-1} < x^{(s,t)}_{k+1}$, then the parity probability is
\[\hp_{s,t}= \Phi\left(\frac{y_{s,t-1} - \mu^{(s,t)}_k}{\sigma^{(s,t)}_k}\right).\]
If $y_{s,t-1} < x^{(s,t)}_1$, we can extrapolate using $\mu^{(s,t)}_1$ and $\sigma^{(s,t)}_1$, and if $y_{s,t-1} \geq x^{(s,t)}_7$, we can extrapolate using $\mu^{(s,t)}_6$ and $\sigma^{(s,t)}_6$. However, this never occurs with the forecasts and observations in this dataset.
Figure~\ref{fig:covid-interp-parity-prob} provides a visualization of this interpolation scheme.

\begin{figure}[h]
\begin{center}
\includegraphics[width=0.4\textwidth]{Figures/calc_parity_prob.png}
\caption{We use a piece-wise Gaussian interpolation of the expert forecast quantiles to estimate the predictive cdf, from which we then calculate the parity probabilities.}
\label{fig:covid-interp-parity-prob}
\end{center}
\vskip -0.2in
\end{figure}

\subsubsection{Details on Experiment Setup for COVID-19 Case Study}
Section~\ref{sec:covid-single-timeseries} compares the expert forecaster, its parity probabilities and posthoc calibration by OPS. 
We did not tune OPS hyperparameters in this experiment, so the full 119 weeks' worth of data was used for testing and reporting the results.

For Section~\ref{sec:covid-methods-comparison}, the first 20 weeks' worth of data was used for tuning hyperparameters, and the reported results are based on the remaining 99 weeks' worth of data as the test set.

For the decision-making experiment in Section~\ref{sec:covid-decision-making}, we used the parity probabilities produced from Section~\ref{sec:covid-methods-comparison}.\\
Although the chosen loss function is just one example, we observe that similar results hold with any loss function that satisfies: $l_{2, 3}\leq l_{2, 2} \leq l_{1, 1} \leq l_{2, 1} \leq l_{1, 2} \leq l_{1, 3}$.

\subsection{Additional Details on Weather Forecasting Case Study}\label{app:weather-appendix}
\subsubsection{Details on Experiment Setup for Weather Forecasting Case Study}\label{app:weather-experiment-details}
We used the modeling and training infrastructure 
provided by the Keras tutorial on \textit{Timeseries Forecasting for Weather Prediction}\footnote{\url{https://keras.io/examples/timeseries/timeseries_weather_forecasting/}} which models this same dataset with an LSTM network~\citep{hochreiter1997long}. 
We made one change to the model provided by the tutorial: since we are interested in probabilistic forecasts instead of point forecasts, 
we changed the head of the model and the loss function from a point output trained with mean squared error loss 
to a mean and variance output that parameterizes a Gaussian distribution 
and trained it with the Gaussian likelihood loss.
Such a model is also referred to as a mean-variance network or a 
probabilistic neural network  \citep{lakshminarayanan2017simple, nix1994estimating}, 
and it is one of the most popular methods currently used in probabilistic regression.

While the tutorial's setup takes as input the past 120 hours' window of 7 features to predict the value of one feature (Temperature) 12 hours into the future, 
we expand the setting to predict all 7 features: Pressure, Temperature, Saturation vapor pressure, Vapor pressure deficit, Specific humidity, Airtight, and Wind speed.
We thus train 7 separate base regression models, one for each prediction target.

For the in-text experiment \textbf{Binary classifiers as expert forecasts}, we trained binary classification base models with parity outcomes (Eq.~\eqref{eq:parity-outcome}) as the labels and took this model as the expert forecaster. 
We adopted the same model architecture as the base regression model and changed the last layer to output a logit. We then trained the model with the cross entropy loss.

The full Jena dataset spans from the beginning of January 2009 to the end of December 2016, with $420{,}551$ datapoints in total. In chronological order, we set $272{,}638$ datapoints to train the base models (both the regression and classification model) and the subsequent $83{,}390$ datapoints for validation. Following the same model training procedure as the tutorial, training was stopped early if the validation loss did not decrease for 20 training epochs.

Afterwards, in running the posthoc calibration methods (MW, IW, and OPS), we used the last $8{,}640$ datapoints of the validation set to tune the hyperparameters of each calibration method, and used subsequent windows of $8{,}640 \times 3 = 25{,}920$ datapoints for testing.

We run 50 test trials with a moving test timeframe to produce the mean and standard errors reported in Tables~\ref{tab:pressure_numerical} and \ref{tab:pressure-binary-prehoc-ops}. Denoting the first test window as $[t+1, t+H]$ (i.e. $H$ is set to $25{,}920$), we move this frame by a multiple of a fixed offset $c$ into the future, and repeat this 50 times, to create a new set of 50 test sets. The resulting new test timeframes are $[t+1+(ck), t+H+(ck)]$, where $k = 0, 1, 2, \dots, 49$, and $c$ was set to $336$.

\subsubsection{Additional Results on Weather Forecasting Case Study}\label{app:weather-additional-results}

We show additional plots and tables from the experimental results in Section~\ref{sec:weather-case-study} of the main paper.

Figure~\ref{fig:pressure-binary-base-full-comparison} displays the full set of reliability diagrams for Figure~\ref{fig:pressure-binary-prehoc-ops}, which corresponds to the in-text experiment \textbf{Binary classifiers as expert forecasts} in Section~\ref{sec:weather-case-study}.

Table~\ref{tab:weather-average} displays the numerical results from the weather forecasting case study when averaged across all 7 prediction target settings. This corresponds to the in-text experiment \textbf{Results across all 7 timeseries} in Section~\ref{sec:weather-case-study}. To produce these results, we fixed the test timeframe to be the first test timeframe $[t+1, t+H]$ for all prediction target settings, then computed the mean and standard errors across the 7 sets of metrics produced (one set for each prediction target).

\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{Figures/temperature_diagram_binary.pdf}
    \caption{Reliability diagrams with a binary classification base model predicting Pressure. This is the full set of reliability diagrams for Figure~\ref{fig:pressure-binary-prehoc-ops} from Section~\ref{sec:weather-case-study}. The left-most plot shows parity calibration of the base classification model (Prehoc), and the next three plots show the effects of MW, IW and OPS in calibrating the Prehoc parity probabilities. OPS produces the most calibrated and sharp parity probabilities.}
    \label{fig:pressure-binary-base-full-comparison}
\end{figure}


\begin{table*}[ht!]
\centering
\vspace{2mm}
\begin{subcaptionblock}{\textwidth}
    \centering
    \begin{tabular}{lccccc}
    \toprule
        & QCE $\downarrow$ & PCE $\downarrow$ & Sharp $\uparrow$ & Acc $\uparrow$ & AUROC $\uparrow$ \\ \midrule
    Prehoc  & $\mathbf{0.0266 \pm 0.0052}$ & $0.2794 \pm 0.0161$ & $0.2915 \pm 0.0117$ & $0.4902 \pm 0.0159$ & $0.4806 \pm 0.0249$\\ 
    MW  & N/A & $0.0233 \pm 0.0048$ & $0.2913 \pm 0.0117$ & $0.5610 \pm 0.0106$ & $0.5419 \pm 0.0195$\\
    IW  & N/A & $0.0188 \pm 0.0047$ & $0.2913 \pm 0.0118$ & $0.5630 \pm 0.0099$ & $0.5403 \pm 0.0209$\\ 
    OPS  & N/A & $\mathbf{0.0159 \pm 0.0009}$ & $\mathbf{0.2961 \pm 0.0122}$ & $\mathbf{0.5790 \pm 0.0122}$ & $\mathbf{0.5830 \pm 0.0217}$\\
    \bottomrule
    \end{tabular}
    \caption{Numerical results averaged across all 7 prediction settings where the base model is a Gaussian regression model. The base regression model (Prehoc) tends to be well quantile calibrated (QCE) but terribly parity calibrated (PCE). All methods (MW, IW, OPS) improve parity calibration, but OPS is the only method which improves all metrics simultaneously. Best value for each metric is in bold.}
    \label{tab:weather-average-gaussian-base}
\end{subcaptionblock}

\vspace{3mm}
\begin{subcaptionblock}{\textwidth}
    \centering
    \begin{tabular}{lcccc}
    \toprule
             & PCE $\downarrow$ & Sharp $\uparrow$ & Acc $\uparrow$ & AUROC $\uparrow$\\ \midrule
    Prehoc  & $0.0247 \pm 0.0016$ & $0.3049 \pm 0.0074$ & $0.6078 \pm 0.0099$ & $0.6348 \pm 0.0136$\\
    MW  & $0.0170 \pm 0.0018$ & $0.3049 \pm 0.0075$ & $0.6061 \pm 0.0102$ & $0.6340 \pm 0.0143$\\ 
    IW  & $0.0156 \pm 0.0012$ & $0.3047 \pm 0.0074$ & $0.6075 \pm 0.0098$ & $0.6340 \pm 0.0136$\\ 
    OPS  & $\mathbf{0.0135 \pm 0.0013}$ & $\mathbf{0.3134 \pm 0.0075}$ & $\mathbf{0.6278 \pm 0.0121}$ & $\mathbf{0.6643 \pm 0.0183}$\\ 
    \bottomrule
    \end{tabular}
    \caption{Numerical results averaged across all 7 prediction settings where the base model is a binary classification model trained with parity outcome labels. The base classification model (Prehoc) tends to be much better parity calibrated than when a regression base model is used (Table~\ref{tab:weather-average-gaussian-base} above). All methods (MW, IW, OPS) improve parity calibration further, but OPS is the only method which improves all metrics simultaneously. Notably, MW and IW tend to decrease the accuracy of the parity probabilities. Best value for each metric is in bold.}
    \label{tab:weather-average-binary-base}
\end{subcaptionblock}
% \vspace{3mm}
\caption{Numerical results from the weather forecasting case study (Section~\ref{sec:weather-case-study}), averaged across all 7 forecasting targets. Table~\ref{tab:weather-average-gaussian-base} displays results with the Gaussian regression base model, and Table~\ref{tab:weather-average-binary-base} displays results with the binary classification base model. $\pm$ indicates mean $\pm$ 1 standard error, across the 7 prediction target settings.}
\label{tab:weather-average}
\vspace{-4mm}
\end{table*}


\subsection{Additional Details on Control in Nuclear Fusion Case Study}\label{app:fusion-appendix}
\subsubsection{Details on Experiment Setup for Control in Nuclear Fusion Case Study}\label{app:fusion-experiment-details}
The expert forecaster for the nuclear fusion experiment in Section~\ref{sec:fusion-case-study} is provided by a pretrained dynamics model that was used to optimize control policies for deployment 
on the DIII-D tokamak~\citep{luxon2002design}, a nuclear fusion device in San Diego that is operated by General Atomics.
The dynamics model was trained with logged data from past experiments (referred to as ``shots'') on this device. Each shot consists of a trajectory of (state, action, next state) transitions, and one trajectory consists of $\sim20$ transitions (i.e. $20$ timesteps).

As input, the model takes the current state of the plasma and the actuator settings (i.e. actions). The model outputs a multi-dimensional predictive distribution over the state variables in the next timestep. The state is represented by three signals: $\beta_N$ (the ratio of plasma pressure over magnetic pressure), \textit{density} (the line-averaged electron density), and \textit{li} (internal inductance).
For the actuators, the model takes in the amount of power and torque injected from the neutral beams, the current, the magnetic field, and four shape variables (\textit{elongation}, $a_{\text{minor}}$, \textit{triangularity-top}, and \textit{triangularity-bottom}). This, along with the states, makes for an input dimension of 11 and output dimension of 3 for the states.

The model was implemented with a recurrent probabilistic neural network (RPNN), which features an encoding layer by an RNN with 64 hidden units followed by a fully connected layer with 256 units, and a decoding layer of fully connected layers with [128, 512, 128] units, which finally outputs a 3-dimensional isotropic Gaussian parameterized by the mean and a log-variance prediction.

The training dataset consisted of trajectories from 10294 shots, and the model was trained with the Gaussian likelihood loss, with a learning rate of 0.0003 and weight decay of 0.0001.
In using dynamics models to sample trajectories and train policies, the key metric 
practitioners are concerned with is explained variance, hence explained variance on a held out validation set of 1000 shots was monitored during training. Training was stopped early if there was no improvement in explained variance over the validation set for more than 250 epochs.
The test dataset consisted of another held-out set of 900 shots, with which we report all results presented in Section~\ref{sec:fusion-case-study}.

In all of our experiments, since $\beta_N$ is the key signal of interest in our problem setting, we just examine the predictive distribution for $\beta_N$ in the model outputs and ignore the other dimensions of the outputs. 

In running the posthoc calibration methods (MW, IW, and OPS), we used the same validation set to tune the hyperparameters of each calibration method, and used windows of $15{,}000$ datapoints from the concatenated test shot data for testing.

We run 50 test trials with a moving test timeframe to produce the mean and standard errors reported in Table~\ref{tab:fusion_numerical}. Denoting the first test window as $[t+1, t+H]$ (i.e. $H$ is set to $15{,}000$), we move this frame by a multiple of a fixed offset $c$ into the future, and repeat this 50 times, to create a set of 50 test datasets. The resulting test timeframes are $[t+1+(ck), t+H+(ck)]$, where $k = 0, 1, 2, \dots, 49$, and $c$ was set to $100$.

\section{Details on Hyperparameters}
\label{app:hyperparameters}

Each of the three calibration methods we consider in Section~\ref{sec:main-parity-calibration-methodology}, which we use in our experiments in Section~\ref{sec:experiments}, requires a set of hyperparameters.
\begin{itemize}
    \item \textbf{MW} requires \texttt{uf} and \texttt{ws}.
    \begin{itemize}
        \item \texttt{uf} determines how often the PS parameters $(a^{\text{MW}}, b^{\text{MW}})$ are updated.
        \item \texttt{ws} determines the size of the calibration set that is used to update the PS parameters.
    \end{itemize}
    \item \textbf{IW} requires \texttt{uf}.
    \begin{itemize}
        \item \texttt{uf} determines how often the PS parameters $(a^{\text{IW}}, b^{\text{IW}})$ are updated.\\
        Note that  IW always uses all of the data seen so far to update the PS parameters.
    \end{itemize}
    \item \textbf{OPS} requires \texttt{$\gamma$} and \texttt{D}.
    \begin{itemize}
        \item \texttt{$\gamma$} can be understood as step size for the OPS updates.
        \item \texttt{D} can be understood as regularization for the OPS updates.
    \end{itemize}
\end{itemize}
We provide details on how these hyperparameters were tuned for each of the three case studies.
\subsection{Hyperparameters for COVID-19 Case Study}
We observed that OPS performed well with the default hyperparameters, 
so we did not tune hyperparameters for OPS for the COVID-19 case study.
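% NOTE(review): the OPS algorithm listing in this supplement gives default values $\gamma = 0.1$, $D = 1$; confirm which defaults were actually used here.
The default hyperparameter values used for OPS were $\gamma = 0.001$ and $\texttt{D} = 10$.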

For MW and IW,
we tuned hyperparameters by optimizing parity calibration error (PCE, Section~\ref{sec:experiments}) on the first 20 weeks' worth of data as the validation set, over the following grids:
\begin{itemize}
    \item \texttt{uf} $\in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]$, separately for MW and IW
    \item \texttt{ws} $\in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]$, for MW.
\end{itemize}
The COVID-19 dataset records data for each week, so a grid value of 1 represents 1 week.

The tuned hyperparameters we used for MW and IW are as follows:
\begin{itemize}
    \item MW: $\texttt{uf}=1, \texttt{ws}=10$
    \item IW: $\texttt{uf}=5$
\end{itemize}


\subsection{Hyperparameters for Weather Forecasting Case Study}
For each calibration method, the hyperparameters were tuned by optimizing parity calibration error (PCE, Section~\ref{sec:experiments}) on the validation dataset over the following grids:
\begin{itemize}
    \item \texttt{uf} $\in [1, 24, 168, 336, 720, 2160]$, separately for MW and IW
    \item \texttt{ws} $\in [24, 168, 336, 720, 2160, 4320, 8640]$, for MW
    \item $\gamma$ $\in$ [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2], for OPS
    \item \texttt{D} $\in [1, 10, 30, 50, 70, 100, 150, 200]$, for OPS.
\end{itemize}
The hyperparameters were tuned separately for each base model setting (regression and classification), for each method (MW, IW, and OPS), and for each base model predicting one of 7 targets (Pressure, Temperature, Saturation vapor pressure, Vapor pressure deficit, Specific humidity, Airtight, and Wind speed).

The tuned hyperparameters we used are as follows:
\begin{itemize}
    \item \textbf{Base Regression Model}
    \begin{itemize}
        \item Pressure Model
        \begin{itemize}
            \item MW: $\texttt{uf}=2160 , \texttt{ws}=8640 $
            \item IW: $\texttt{uf}=2160$
            \item OPS: $\gamma=1\text{e-5}, \texttt{D}=50$
        \end{itemize}
        \item Temperature Model
        \begin{itemize}
            \item MW: $\texttt{uf}=336 , \texttt{ws}=8640 $
            \item IW: $\texttt{uf}=168$
            \item OPS: $\gamma=1\text{e-5} , \texttt{D}=30 $
        \end{itemize}
        \item Saturation Vapor Pressure Model
        \begin{itemize}
            \item MW: $\texttt{uf}=2160 , \texttt{ws}=2160 $
            \item IW: $\texttt{uf}=336$
            \item OPS: $\gamma=1\text{e-4} , \texttt{D}=10 $
        \end{itemize}
        \item Vapor Pressure Deficit Model
        \begin{itemize}
            \item MW: $\texttt{uf}=1 , \texttt{ws}=4320 $
            \item IW: $\texttt{uf}=1$
            \item OPS: $\gamma=1\text{e-3} , \texttt{D}=1$
        \end{itemize}
        \item Specific Humidity Model
        \begin{itemize}
            \item MW: $\texttt{uf}=1 , \texttt{ws}=4320 $
            \item IW: $\texttt{uf}=168$
            \item OPS: $\gamma=1\text{e-5} , \texttt{D}=30 $
        \end{itemize}
        \item Airtight Model
        \begin{itemize}
            \item MW: $\texttt{uf}=2160 , \texttt{ws}=2160 $
            \item IW: $\texttt{uf}=720$
            \item OPS: $\gamma=5\text{e-5} , \texttt{D}=10 $
        \end{itemize}
        \item Wind Speed Model
        \begin{itemize}
            \item MW: $\texttt{uf}=1 , \texttt{ws}=168 $
            \item IW: $\texttt{uf}=24$
            \item OPS: $\gamma=1\text{e-4} , \texttt{D}=10 $
        \end{itemize}
    \end{itemize}
    \item \textbf{Base Classification Model}
    \begin{itemize}
        \item Pressure Model
        \begin{itemize}
            \item MW: $\texttt{uf}=2160 , \texttt{ws}=8640 $
            \item IW: $\texttt{uf}=720$
            \item OPS: $\gamma=5\text{e-5} , \texttt{D}=30 $
        \end{itemize}
        \item Temperature Model
        \begin{itemize}
            \item MW: $\texttt{uf}=1 , \texttt{ws}=4320 $
            \item IW: $\texttt{uf}=168$
            \item OPS: $\gamma=1\text{e-5} , \texttt{D}=150 $
        \end{itemize}
        \item Saturation Vapor Pressure Model
        \begin{itemize}
            \item MW: $\texttt{uf}=336 , \texttt{ws}=4320 $
            \item IW: $\texttt{uf}=720$
            \item OPS: $\gamma=1\text{e-4} , \texttt{D}=30 $
        \end{itemize}
        \item Vapor Pressure Deficit Model
        \begin{itemize}
            \item MW: $\texttt{uf}=1 , \texttt{ws}=168 $
            \item IW: $\texttt{uf}=1$
            \item OPS: $\gamma=1\text{e-5} , \texttt{D}=70 $
        \end{itemize}
        \item Specific Humidity Model
        \begin{itemize}
            \item MW: $\texttt{uf}=1 , \texttt{ws}=2160 $
            \item IW: $\texttt{uf}=2160$
            \item OPS: $\gamma=1\text{e-5} , \texttt{D}=50 $
        \end{itemize}
        \item Airtight Model
        \begin{itemize}
            \item MW: $\texttt{uf}=24 , \texttt{ws}=4320 $
            \item IW: $\texttt{uf}=336$
            \item OPS: $\gamma=1\text{e-3} , \texttt{D}=10 $
        \end{itemize}
        \item Wind Speed Model
        \begin{itemize}
            \item MW: $\texttt{uf}=24 , \texttt{ws}=2160 $
            \item IW: $\texttt{uf}=1$
            \item OPS: $\gamma=1\text{e-5} , \texttt{D}=10 $.
        \end{itemize}
    \end{itemize}
    
\end{itemize}

\subsection{Hyperparameters for Control in Nuclear Fusion Case Study}
The nuclear fusion dataset records measurements in 25 millisecond intervals.
Therefore, in tuning hyperparameters, we design the search grid to represent lengths of time during which evolution of various plasma states is expected to be observable.

For each calibration method, the hyperparameters were tuned by optimizing parity calibration error (PCE, Section~\ref{sec:experiments}) on a validation dataset consisting of 1000 shots' worth of data, over the following grids:
\begin{itemize}
    \item \texttt{uf} $\in [1, 2, 4, 8, 24]$, separately for MW and IW
    \item \texttt{ws} $\in [2, 8, 16, 24, 48, 60, 80, 100, 200]$, for MW
    \item $\gamma$ $\in$ [1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2], for OPS
    \item \texttt{D} $\in [1, 10, 30, 50, 70, 100, 150, 200]$, for OPS
\end{itemize}

The tuned hyperparameters we used are as follows:
\begin{itemize}
    \item MW: $\texttt{uf}=1, \texttt{ws}=60$
    \item IW: $\texttt{uf}=8$
    \item OPS: $\gamma=5\text{e-3}, \texttt{D}=150$.
\end{itemize}

\newpage
\section{Online Platt Scaling Algorithm}\label{app:ops-algorithm}
\begin{algorithm}[h]
\begin{algorithmic}
	% \STATE {\bfseries Input:} Interval partition of $[0,1]$, such as \eqref{eq:B-bins}%$\Bcal : [0,1] \to [1, 2, \ldots, B]$, the binning function
	\STATE {\bfseries Input: } $\mathcal{K} = \{(x, y): \norm{(x, y)}_2 \leq 100\}$, time horizon $H$, and initialization parameter $(a_1^\ops, b_1^\ops) = (1, 0) =: \theta_1 \in \mathcal{K}$\;
        \STATE {\bfseries Hyperparameters and default values:} $\gamma = 0.1$, $D = 1$, $A_0 = (1/\gamma D)^2\  \mathbf{I}_2$
    \FOR{$t=1$ {\bfseries to} $H$}
    \STATE Play $\theta_t$, observe log-loss $l(m^{\theta_t}(f(\x_t)), y_t)$ and its gradient $\nabla_t := \nabla_{\theta_t}l(m^{\theta_t}(f(\x_t)), y_t)$
    \STATE $A_t = A_{t-1} + \nabla_t \nabla_t^\intercal$
    \STATE Newton step: $\widetilde{\theta}_{t+1} = \theta_t - \frac{1}{\gamma} A_t^{-1} \nabla_t$
    \STATE Projection: $(a_{t+1}^\ops, b_{t+1}^\ops) = \theta_{t+1} = \argmin_{\theta \in \mathcal{K}} (\widetilde{\theta}_{t+1}-\theta)^{\intercal}A_t(\widetilde{\theta}_{t+1}-\theta)$
    \ENDFOR
    \end{algorithmic}
 	\caption{Online Platt Scaling (based on \citet{gupta2023online})} 
  \label{alg:ops-ons}
\end{algorithm}

\newpage
\bibliography{chung_631}

\end{document}
