\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{float}
\usepackage{algorithm}
\usepackage{algpseudocode}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{lipsum}
\usepackage{float}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{amsthm}
\usepackage{listings}
\usepackage{amsmath,bm}
\usepackage[noabbrev,capitalize,nameinlink]{cleveref}
\usepackage{multirow}
\usepackage{comment}
\usepackage{notoccite}
\newcommand{\citeall}[1]{\citet{#1} (\citeyear{#1})}
%\bibliographystyle{unsrtnat}
%\usepackage[shortlabels]{enumitem}

\newtheorem{theorem}{Theorem}[]
\newtheorem{proposition}{Proposition}[]
\newtheorem{claim}{Claim}[]
\newtheorem{lemma}[theorem]{Lemma}

\newcommand{\gb}{\bm{\gamma}}
\newcommand{\Tb}{\bm{\Theta}}
\newcommand{\tb}{\bm{\theta}}
\newcommand{\Jb}{\textbf{J}}
\newcommand{\Ib}{\textbf{I}}
\newcommand{\intd}{\text{d}}
\newcommand{\bb}{\textbf{b}}
\newcommand{\zb}{\textbf{z}}
\newcommand{\xb}{\mathbf{x}}
\newcommand{\Xb}{\textbf{X}}
\newcommand{\yb}{\textbf{y}}
\newcommand{\fb}{\textbf{f}}

\DeclareMathOperator*{\argmin}{\arg\!\min}
\DeclareMathOperator*{\argmax}{\arg\!\max}
\newcommand{\N}{\mathcal{N}}
\newcommand{\Ha}{\langle H \rangle}
\newcommand{\Hh}{\hat{H}}
\newcommand{\Uh}{\hat{U}}
\newcommand{\HM}{\langle H \rangle_{M}}
\newcommand{\LE}{\mathcal{L}_E}
\newcommand{\LT}{\mathcal{L}_T}
\newcommand{\K}{\mathcal{K}}
\newcommand{\Sc}{\mathcal{S}}
\newcommand{\Ub}{\mathbf{U}}
\newcommand{\rhoh}{\hat{\rho}}
\newcommand{\psib}{\boldsymbol{\psi}}
\newcommand{\rhob}{\boldsymbol{\rho}}
\newcommand{\thetab}{\boldsymbol{\theta}}
\newcommand{\bigO}{\mathcal{O}}

\newcommand{\STAB}[1]{\begin{tabular}{@{}c@{}}#1\end{tabular}}

\title{On the Role of Model Uncertainties in Bayesian Optimization\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{\href{mailto:<jonf@dtu.dk>?Subject=Your UAI 2023 paper}{Jonathan Foldager}{}}
\author[1,*]{Mikkel Jordahn}
\author[1]{Lars Kai Hansen}
\author[1]{Michael Riis Andersen}
% Add affiliations after the authors
\affil[1]{%
Department of Applied Mathematics and Computer Science, Technical University of Denmark
}\affil[*]{%
Shared first authorship.
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle



\section{Hyperparameter Tuning Datasets}
\label{app:hyperparam_tuning}

When collecting our hyperparameter tuning datasets, the combinations of models and datasets are as follows:

\begin{table}[H]
\caption{Model and Data Combinations for Hyperparameter Tuning}
\begin{center}
\begin{tabular}{|c|c|c|c|c|}
\hline
\multicolumn{1}{|l|}{} & MNIST                     & FashionMNIST              & AG News Classification    & Wine Classification       \\ \hline
FFNN                   & \checkmark & \checkmark & \checkmark &                           \\ \hline
CNN                    & \checkmark & \checkmark &                           &                           \\ \hline
SVM                    &                           &                           &                           & \checkmark \\ \hline
\end{tabular}
\end{center}
\end{table}

For each of the models we then select a number of hyperparameters which we want to tune, create a grid for these hyperparameters and train a model for each of these hyperparameter sets (the BO input is thus hyperparameters and the output is validation performance). The FFNN simply has a single hidden layer with a ReLU activation function and a single dropout layer, except in the case of the AG News Classification where the ``hidden layer'' is an embedding layer using the \lstinline{nn.EmbeddingBag} from \lstinline{torch} \citep{pytorch}. The CNN is a network with two convolution layers with kernel size $(5,5)$ of output channels 16 and 32 respectively, and a single hidden and dropout layer. Max pooling is also used with a kernel size of $(2,2)$ at every convolution layer. The SVM used is the \lstinline{SVC} from \lstinline{sklearn} \citep{scikit-learn}. The hyperparameters and their grid specification can be seen here:

\begin{table}[H]
\caption{Grid Specifications for Hyperparameter Tuning}
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|c|c|}
\hline
 & \textbf{Training Epochs} & \textbf{Dropout Rate} & \textbf{Learning Rate (log space)} & \textbf{Batch Size Train} & \textbf{Hidden Size} & \textbf{C (log space)} & \textbf{$\gamma$ (log space)} \\ \hline
\textbf{FFNN} & np.linspace(1, 10, 10) & np.linspace(0, 0.8, 10) & np.linspace(-11.51, -2.23, 10) & np.arange(8, 256, 32) & np.linspace(1, 271, 10) &  &  \\ \hline
\textbf{CNN} & np.linspace(1, 10, 10) & np.linspace(0, 0.8, 10) & np.linspace(-11.51, -2.23, 10) & np.arange(8, 256, 32) & np.linspace(1, 271, 10) &  &  \\ \hline
\textbf{SVM} &  &  &  &  &  & np.linspace(-6.9, 4.6, 100) & np.linspace(-11.51, -2.23, 100) \\ \hline
\end{tabular}%
}
\end{table}


\section{Experimental Details}

Model details are as follows. The GPs are built using GPyTorch \citep{gardner2018gpytorch} and BoTorch \citep{balandat2020botorch} (BoTorch's SingleTaskGP class) and use a scale kernel and RBF kernel. The priors can be seen in \cref{tab:GP_priors}. The hyperparameters of the kernel are tuned at every BO iteration by maximizing the marginal likelihood with the scipy L-BFGS-B optimizer (default settings in BoTorch). The Deep Ensembles consist of 10 neural networks with two hidden layers of size 30 and 10 respectively and use the ReLU activation function. We use an Adam optimiser with learning rate $4 \times 10^{-3}$. We train them for 200 epochs. They are implemented using torch. The BNN Small has a single hidden layer of size 10 whilst the larger BNN model has two hidden layers of size 30 and 10 respectively and are implemented using the BayesLinear layers from torchbnn \citep{BNNs}. We use a KL weight of 1, and use an Adam optimiser with a learning rate of 0.1. They are trained for 500 epochs. The priors of these layers can be seen in \cref{tab:BNN_priors}. The RFs are implemented using sklearn and the hyperparameter grids we tune over are: n\_estimators $=[4, 10, 20]$, max\_depth $=[5, 10, 20]$ and max\_features $=[1.0, \text{``sqrt''}]$.

\begin{table}[H]
\centering
\caption{GP Priors}
\label{tab:GP_priors}
\begin{tabular}{|c|l|l|}
\hline
\textbf{} & \multicolumn{1}{c|}{\textbf{Lengthscale Prior}} & \multicolumn{1}{c|}{\textbf{Outputscale Prior}} \\ \hline
\textbf{Synthetic Problems} & LogNormalPrior(0.1, 1.0) & NormalPrior(1.0, 2.0) \\ \hline
\textbf{Real Data Problems} & LogNormalPrior(0.1, 5.0) & NormalPrior(1.0, 5.0) \\ \hline
\end{tabular}
\end{table}

\begin{table}[H]
\centering
\caption{BNN Priors}
\label{tab:BNN_priors}
\begin{tabular}{|c|c|c|}
\hline
\textbf{} & \textbf{Prior Mean} & \textbf{Prior Sigma} \\ \hline
\textbf{Input Layer} & 0.0 & 1.0 \\ \hline
\textbf{Hidden Layer} & 0.0 & 1/900 \\ \hline
\textbf{Output Layer} & 0.0 & 1/100 \\ \hline
\end{tabular}
\end{table}

We use BoTorch to perform our Bayesian Optimisation. We use their UpperConfidenceBound and ExpectedImprovement classes for the UCB and EI AFs respectively, and have adapted their MaxPosteriorSampling class for use as a TS AF. For UCB we use $\beta=1$; we experimented briefly with other values of $\beta$, and these results can be seen in \cref{tab:beta}. Due to computational reasons, we perform BO by sampling a candidate pool set of size $N_{pool}=5000$ at the beginning of each BO iteration that the surrogate and acquisition function can choose to sample from, rather than allowing the surrogates to sample from anywhere in the input space. Please note that $D_{test}\cap D_{pool} = \emptyset$ in the real data experiment setting where the input space is not continuous, whilst this is not necessarily the case in the synthetic data experiments as the input space is continuous here, and thus we allow random sampling for both $D_{test}$ and $D_{pool}$. The full experimental procedure is written in pseudocode in \cref{alg:experiment}. $F_S$ here denotes the acquisition function $F$ evaluated based on surrogate model $S$. Please note that we invert the $y$s for the real data problems to make it a minimization problem (since the original objective is to maximize model accuracy).

\begin{table}[H]
\centering
\caption{Experimental results when tuning beta of UCB}
\label{tab:beta}
\begin{tabular}{|l|c|c|c|}
\hline
\textbf{Surrogate} & \textbf{Beta} & \textbf{Inst. Regret} & \textbf{Total Regret} \\ \hline
BNN Small & 0.2 & 0.006 & 1.95 \\ \hline
BNN Small & 0.5 & 0.017 & 3.59 \\ \hline
BNN Small & 1 & 0.018 & 4.03 \\ \hline
BNN Small & 2 & 0.040 & 4.76 \\ \hline
DE & 0.2 & 0.003 & 1.05 \\ \hline
DE & 0.5 & 0.001 & 0.91 \\ \hline
DE & 1 & 0.000 & 0.84 \\ \hline
DE & 2 & 0.001 & 1.01 \\ \hline
GP & 0.2 & 0.003 & 1.75 \\ \hline
GP & 0.5 & 0.002 & 1.47 \\ \hline
GP & 1 & 0.000 & 1.33 \\ \hline
GP & 2 & 0.001 & 1.38 \\ \hline
RF & 0.2 & 0.003 & 0.97 \\ \hline
RF & 0.5 & 0.003 & 0.94 \\ \hline
RF & 1 & 0.002 & 0.78 \\ \hline
RF & 2 & 0.004 & 0.97 \\ \hline
\end{tabular}
\end{table}

\begin{algorithm}[H]
\centering
\caption{Bayesian Optimisation Experiments}
\begin{algorithmic}
\Require Surrogate Model: $S$, Acquisition Function: $F$, BO Problem: $P$, $N_{test}=5000$, $N_{pool}=5000$, $N_{init}=10$, $i=90$
\If{$P$ is synthetic}
    \State $D_{test} \gets N_{test}$ random points from $P$
    \State $D_{pool} \gets N_{pool}$ random points from $P$
    \State Standardize data s.t. mean=0, var=1 using $D_{pool}$ metrics.
    \State $x_{opt}, y_{opt} \gets \min D_{pool}$
    \State $D_{train} \gets N_{init}$ random points from $D_{pool}$
    \State $D_{pool} \gets D_{pool} - D_{train}$
\ElsIf{$P$ is real data}
    \State $D_{problem} \gets $ all points from $P$
    \State $D_{test} \gets N_{test}$ random points from $D_{problem}$
    \State $D_{problem} \gets D_{problem} - D_{test}$
    \State $D_{pool} \gets N_{pool}$ random points from $D_{problem}$
    \State Standardize data s.t. mean=0, var=1 using $D_{pool}$ metrics.
    \State $x_{opt}, y_{opt} \gets \min D_{pool}$
    \State $D_{train} \gets N_{init}$ random points from $D_{pool}$
    \State $D_{pool} \gets D_{pool} - D_{train}$
\EndIf
\While{$i > 0$}
    \State Fit $S$ to $D_{train}$
    \State $D_{next} \gets \max F_S(D_{pool})$
    \State $D_{train} \gets D_{train} + D_{next}$
    \State $D_{pool} \gets D_{pool} - D_{next}$
    \State Fit $S$ to $D_{train}$
    \State $y_{best} = \min D_{train}$
    \State Calculate regret: $y_{opt}-y_{best}$
    \State Calculate ECE of model based on $D_{test}$
    \State $i \gets i - 1$
\EndWhile
\end{algorithmic}
\label{alg:experiment}
\end{algorithm}




\section{Multiple Regression Analysis}

In \cref{tab:reg_real} and \cref{tab:reg_synth} the multiple regression analysis can be seen for the real and synthetic data respectively. The regression was done using Statsmodels for Python \citep{seabold2010statsmodels}.

\begin{table}[H]
\caption{Multiple regression analysis for hyperparameter tuning experiments. GP is the baseline model and MNIST is the baseline dataset. The other slopes and intercepts are contrasts to these two baselines.}\label{tab:reg_real}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lcccccc}
                               & \textbf{coef} & \textbf{std err} & \textbf{t} & \textbf{P$> |$t$|$} & \textbf{[0.025} & \textbf{0.975]}  \\
\midrule
\textbf{calibration\_mse}      &       8.9899  &       88.549     &     0.102  &         0.919        &     -167.409    &      185.389     \\
\textbf{BNN}                   &     -56.6270  &       89.992     &    -0.629  &         0.531        &     -235.899    &      122.646     \\
\textbf{DE}                    &     -48.5141  &       92.738     &    -0.523  &         0.602        &     -233.257    &      136.229     \\
\textbf{RF}                    &     -95.7785  &      113.183     &    -0.846  &         0.400        &     -321.251    &      129.694     \\
\textbf{BNN Small}             &     -59.3964  &       89.109     &    -0.667  &         0.507        &     -236.911    &      118.118     \\
\textbf{intercept}             &       1.7675  &        1.282     &     1.379  &         0.172        &       -0.787    &        4.322     \\
\textbf{DE\_intercept}         &       0.9198  &        1.676     &     0.549  &         0.585        &       -2.419    &        4.259     \\
\textbf{BNN\_intercept}        &       6.6811  &        2.856     &     2.339  &         0.022        &        0.992    &       12.370     \\
\textbf{RF\_intercept}         &       0.8637  &        1.615     &     0.535  &         0.594        &       -2.353    &        4.081     \\
\textbf{BNN\_Small\_Intercept} &       7.2043  &        2.353     &     3.062  &         0.003        &        2.518    &       11.891     \\
\textbf{fashionmnist}          &       1.4489  &        0.364     &     3.981  &         0.000        &        0.724    &        2.174     \\
\textbf{mnist\_cnn}            &      -1.2256  &        0.460     &    -2.665  &         0.009        &       -2.142    &       -0.310     \\
\textbf{fashionmnist\_cnn}     &       0.9344  &        0.413     &     2.261  &         0.027        &        0.111    &        1.758     \\
\textbf{news}                  &      -0.2718  &        0.406     &    -0.669  &         0.505        &       -1.081    &        0.537     \\
\textbf{svm\_wine}             &      -2.6434  &        0.390     &    -6.775  &         0.000        &       -3.421    &       -1.866     \\
\bottomrule
\end{tabular}
}
\end{table}

\begin{table}[H]
\caption{Multiple regression analysis for synthetic optimisation experiments. GP is the baseline model and Problem18 is the baseline dataset. The other slopes and intercepts are contrasts to these two baselines.}\label{tab:reg_synth}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{lcccccc}
                              & \textbf{coef} & \textbf{std err} & \textbf{t} & \textbf{P$> |$t$|$} & \textbf{[0.025} & \textbf{0.975]}  \\
\midrule
\textbf{calibration\_mse}     &   -2477.1182  &      596.466     &    -4.153  &         0.000        &    -3649.745    &    -1304.492     \\
\textbf{BNN}                  &    2049.7326  &      576.865     &     3.553  &         0.000        &      915.640    &     3183.825     \\
\textbf{DE}                   &    2343.7352  &      582.393     &     4.024  &         0.000        &     1198.776    &     3488.695     \\
\textbf{RF}                   &    1124.1523  &      726.204     &     1.548  &         0.122        &     -303.534    &     2551.839     \\
\textbf{BNN Small}            &    2155.4162  &      578.996     &     3.723  &         0.000        &     1017.134    &     3293.698     \\
\textbf{intercept}            &      24.4450  &       11.213     &     2.180  &         0.030        &        2.400    &       46.490     \\
\textbf{DE\_intercept}        &     -22.5397  &        8.992     &    -2.507  &         0.013        &      -40.217    &       -4.862     \\
\textbf{BNN\_intercept}       &      59.5739  &       17.698     &     3.366  &         0.001        &       24.781    &       94.367     \\
\textbf{RF\_intercept}        &      11.6801  &       11.736     &     0.995  &         0.320        &      -11.393    &       34.754     \\
\textbf{BNN Small\_intercept} &      50.0913  &       13.164     &     3.805  &         0.000        &       24.211    &       75.972     \\
\textbf{MegaDomain02}         &      -8.3801  &       11.598     &    -0.723  &         0.470        &      -31.181    &       14.420     \\
\textbf{Ackley}               &     238.4477  &       11.751     &    20.292  &         0.000        &      215.346    &      261.549     \\
\textbf{Schwefel22}           &      15.1786  &       12.368     &     1.227  &         0.220        &       -9.137    &       39.494     \\
\textbf{Problem15}            &      -9.0384  &       11.668     &    -0.775  &         0.439        &      -31.978    &       13.901     \\
\textbf{Sargan}               &       5.3031  &       11.621     &     0.456  &         0.648        &      -17.544    &       28.150     \\
\textbf{Quadratic}            &       0.2745  &       11.537     &     0.024  &         0.981        &      -22.407    &       22.956     \\
\textbf{BartelsConn}          &      -3.1293  &       11.573     &    -0.270  &         0.787        &      -25.881    &       19.622     \\
\textbf{McCourt27}            &      68.7446  &       11.672     &     5.890  &         0.000        &       45.799    &       91.691     \\
\textbf{Sphere}               &      11.3725  &       11.622     &     0.978  &         0.328        &      -11.477    &       34.222     \\
\textbf{Ursem04}              &      51.9448  &       11.613     &     4.473  &         0.000        &       29.115    &       74.775     \\
\textbf{Plateau}              &      36.7332  &       11.642     &     3.155  &         0.002        &       13.846    &       59.620     \\
\textbf{MegaDomain04}         &       3.3478  &       11.668     &     0.287  &         0.774        &      -19.591    &       26.286     \\
\textbf{Problem13}            &      -3.7408  &       11.551     &    -0.324  &         0.746        &      -26.449    &       18.967     \\
\textbf{SumPowers}            &       4.3195  &       11.583     &     0.373  &         0.709        &      -18.452    &       27.091     \\
\textbf{MegaDomain03}         &      -6.3915  &       11.623     &    -0.550  &         0.583        &      -29.242    &       16.459     \\
\textbf{Brown}                &      24.5035  &       13.673     &     1.792  &         0.074        &       -2.378    &       51.385     \\
\textbf{Cigar}                &      26.1229  &       11.576     &     2.257  &         0.025        &        3.366    &       48.880     \\
\textbf{Schwefel06}           &       4.3806  &       11.667     &     0.375  &         0.708        &      -18.556    &       27.318     \\
\textbf{McCourt28}            &      -0.2202  &       11.688     &    -0.019  &         0.985        &      -23.197    &       22.757     \\
\textbf{Step}                 &      38.1989  &       11.571     &     3.301  &         0.001        &       15.450    &       60.948     \\
\textbf{HimmelBlau}           &       2.2361  &       11.524     &     0.194  &         0.846        &      -20.419    &       24.892     \\
\textbf{Problem18}            &      -3.1796  &       11.658     &    -0.273  &         0.785        &      -26.098    &       19.739     \\
\textbf{Giunta}               &      13.5342  &       11.684     &     1.158  &         0.247        &       -9.436    &       36.504     \\
\textbf{Csendes}              &       1.7708  &       11.689     &     0.151  &         0.880        &      -21.209    &       24.750     \\
\textbf{Exponential}          &      14.8954  &       11.656     &     1.278  &         0.202        &       -8.020    &       37.811     \\
\textbf{Problem04}            &      -5.2547  &       11.595     &    -0.453  &         0.651        &      -28.050    &       17.541     \\
\textbf{Schwefel20}           &      52.3473  &       11.663     &     4.488  &         0.000        &       29.419    &       75.276     \\
\textbf{Schwefel01}           &      -1.1381  &       11.568     &    -0.098  &         0.922        &      -23.880    &       21.604     \\
\bottomrule
\end{tabular}
}
\end{table}





\newpage

\section{Mathematical Proofs}
\textbf{Proposition 1}:
Let $F_i$ be the CDF of the predictive distribution for the $i$'th observation and let $\{ y_i \}_{i=1}^n$ be i.i.d.\ samples $y_i \sim p_y$. For $\mathcal{C}_y(p) = \frac{1}{n} \sum_{i=1}^n \mathbb{I}\left[y_i \leq F_i^{-1}(p)\right]$, then the variance of $\mathcal{C}_y(p)$ is bounded by $1/n$, i.e., $\mathbb{V}\left[\mathcal{C}_y(p)\right] = \mathcal{O}(n^{-1})$.

\textbf{Proof:}
First, we show that the variance is bounded by $\mathcal{O}(n^{-1})$. We have
\begin{align}
\mathcal{C}_y(p) = \frac{1}{n} \sum_{i=1}^n \mathbb{I}\left[y_i \leq F_i^{-1}(p)\right]= \frac{1}{n} \sum_{i=1}^n z_i,
\end{align}

where $z_i \equiv \mathbb{I}\left[y_i \leq F_i^{-1}(p)\right]$. The variance of $\mathcal{C}_y(p)$ is then given by

\begin{equation}
    \begin{split}
       \mathbb{V}\left[\mathcal{C}_y(p) \right] &= \mathbb{V}\left[\frac{1}{n} \sum_{i=1}^n z_i\right]\\
&= \frac{1}{n^2}\mathbb{V}\left[ \sum_{i=1}^n z_i\right]\\
%
&= \frac{1}{n^2} \sum_{i=1}^n \mathbb{V}\left[z_i\right]\\
%
&\leq \frac{1}{n^2} \sum_{i=1}^n \sup\limits_i \mathbb{V}\left[z_i\right]\\
%
&\leq \frac{1}{n^2} \sum_{i=1}^n \frac{1}{2^2}\\
%
&= \frac{1}{n} \frac{1}{2^2} 
    \end{split}
\end{equation}

Hence, it also follows that the standard deviation of $\mathcal{C}_y(p)$ is bounded by
\begin{align}
\sqrt{\mathbb{V}\left[\mathcal{C}_y(p)\right]} \leq \sqrt{\frac{1}{n} \frac{1}{2^2}} = \frac{1}{2\sqrt{n}}  = \mathcal{O}\left(\frac{1}{\sqrt{n}}\right).
\end{align}
This completes the proof of the first statement. 

\textbf{Lemma 1}:
Given a perfectly calibrated model, it holds that $\mathbb{V}\left[\mathcal{C}_y(p)\right] = \frac{p(1-p)}{n}$ for all $p$.

\textbf{Proof:}
In this setting, we have
\begin{align}
z_i = \mathbb{I}\left[y_i \leq F_i^{-1}(p)\right] =\mathbb{I}\left[F_i(y_i) \leq p\right] = \mathbb{I}\left[u_i \leq p\right],
\end{align}
where $u_i = F_i(y_i) \sim \mathcal{U}\left[0, 1\right]$ are uniformly distributed on the unit interval due to the probability integral transform. Since $\{ u_i \}_{i=1}^n$ are also independent, it follows that
\begin{align}
S_n = \sum_{i=1}^n z_i \sim \text{Binomial}(n, p).
\end{align}

Therefore, it follows that
\begin{equation}
    \begin{split}
        \mathbb{V}\left[\mathcal{C}_y(p)\right] &= \mathbb{V}\left[\frac{1}{n}S_n\right] = \frac{1}{n^2} \mathbb{V}\left[S_n\right] \\
        & = \frac{1}{n^2}np(1-p) = \frac{p(1-p)}{n}.
    \end{split}
\end{equation}

This completes the proof.
%\pagebreak


\textbf{Proposition 2}: Let $E_c = \sum_{j=1}^m w_j (p_j - \mathcal{C}_y(p_j))^2$ be the weighted mean square calibration error. Assume $w_j \in \left[0, 1\right]$ and $0 < p_1 < p_2 < \dots < p_m < 1$ are fixed, and assume the CDF of the predictive distribution is equal to the true data distribution (almost everywhere), then it holds that $\mathbb{E}\left[E_c\right] = \frac{1}{n}\sum_{j=1}^m w_jp_j(1-p_j) = \mathcal{O}(n^{-1})$ if $y_i \sim p_y$ are i.i.d.\ samples.

\textbf{Proof:}
The calibration error $E_c$ is defined as follows
\begin{align}
E_c &= \sum_{j=1}^m w_j (p_j - \mathcal{C}_y(p_j))^2,
\end{align}
where each $w_j \in \left[0, 1\right]$ is a weight and $0 < p_1 < p_2 < \dots < p_m < 1$ is a predefined set of points.

In order to compute the expectation of $E_c$, we first expand:
\begin{align}
E_c &= \sum_{j=1}^m w_j \left(p_j^2 + \mathcal{C}_y(p_j)^2 - 2 p_j \mathcal{C}_y(p_j)\right) \\
&= \sum_{j=1}^m w_j p_j^2 + \sum_{j=1}^m w_j\mathcal{C}_y(p_j)^2 - 2  \sum_{j=1}^m w_j p_j \mathcal{C}_y(p_j).
\end{align}

Then it follows that
\begin{align} \label{eq:prop2}
\mathbb{E}\left[E_c\right] &= \mathbb{E}\left[ \sum_{j=1}^m w_j p_j^2 +  \sum_{j=1}^m w_j\mathcal{C}_y(p_j)^2 - 2  \sum_{j=1}^m w_j p_j \mathcal{C}_y(p_j)\right]\\
%
&= \sum_{j=1}^m w_j p_j^2 +  \sum_{j=1}^m w_j \mathbb{E}\left[ \mathcal{C}_y(p_j)^2\right] - 2  \sum_{j=1}^m w_j p_j \mathbb{E}\left[ \mathcal{C}_y(p_j)\right].
\end{align}

The first moment evaluates to
\begin{align}
    \mathbb{E}[\mathcal{C}_y(p)] &= \int_{-\infty}^{\infty}  \mathbb{I}[y \leq F_t^{-1}(p)] \, p_y(y) \, \text{d} y \\
    &= \int_{-\infty}^{F_t^{-1}(p)} p_y(y) \, \text{d} y\\
    &= F_y (F_t^{-1}(p)) \\
    &= p.
\end{align}

Similarly, the second moment evaluates to
\begin{align}
\mathbb{E}\left[\mathcal{C}_y(p)^2\right] &= \mathbb{E}\left[\left(\frac{1}{n} \sum_{i=1}^n z_i\right)^2\right]\\
%
&= \frac{1}{n^2} \mathbb{E}\left[\sum_{i=1}^n \sum_{j=1}^n z_i z_j\right]\\
%
&= \frac{1}{n^2} \sum_{i=1}^n \mathbb{E}\left[z_i^2\right] + \frac{1}{n^2}\sum_{j\neq i} \mathbb{E}\left[z_i z_j \right]\\
%
&= \frac{1}{n^2} \sum_{i=1}^n p + \frac{1}{n^2}\sum_{j\neq i} \mathbb{E}\left[z_i\right] \mathbb{E}\left[z_j \right]\\
%
&= \frac{n}{n^2} p + \frac{1}{n^2}\sum_{j\neq i} p^2\\
%
&= \frac{1}{n} p + \frac{1}{n^2}\left(n^2 - n\right) p^2
\end{align}

Rearranging the terms yields
\begin{equation}
    \begin{split}
        \mathbb{E}\left[\mathcal{C}_y(p)^2\right] %
%
&= \frac{1}{n} p + \frac{n^2 - n}{n^2} p^2\\
%
&= \frac{1}{n} p - \frac{1}{n} p^2 + p^2\\
%
&= \frac{p(1-p)}{n} + p^2
    \end{split}
\end{equation}



Substituting the moments into eq. \eqref{eq:prop2} yields
\begin{equation}
    \begin{split}
\mathbb{E}\left[E_c\right] &= \sum_{j=1}^m w_j p_j^2 +  \sum_{j=1}^m w_j \left[\frac{p_j(1-p_j)}{n} + p_j^2\right]  \\
&- 2  \sum_{j=1}^m w_j p_j^2\\
%
&= \sum_{j=1}^m w_j p_j^2 +  \sum_{j=1}^m w_j \frac{p_j(1-p_j)}{n}  \\
&+ \sum_{j=1}^m w_j p_j^2  - 2  \sum_{j=1}^m w_j p_j^2\\
%
&= \frac{1}{n}\sum_{j=1}^m w_jp_j(1-p_j)\\
%
&= \mathcal{O}(n^{-1}).
    \end{split}
\end{equation}

This completes the proof.


\subsection*{If $p_y$ and $p_t$ are normal distributions}\label{sec:p_yt_normals}
For non-perfect models we have that $F_y (F_t^{-1}(p)) = g(p)$ where in general $g(p) \neq p$. If both $p_y$ and $p_t$ are normal distributions, the CDF and inverse CDF of a normal are, respectively, given by
\begin{align*}
    F(x) &= \frac{1}{2}\left[1 + \text{erf}\left(\frac{x-\mu}{\sigma \sqrt{2}}\right)\right] \\
    F^{-1}(p) &= \mu + \sigma \sqrt{2} \text{erf}^{-1}\left(2p-1\right).
\end{align*}
When data comes from $y_t \sim \mathcal{N}(\mu_y,\sigma_y^2)$ and the model is $\mathcal{N}(\mu_t,\sigma_t^2)$, we can write the expectation of the calibration curve as follows

\begin{align*}
    g(p) &= F_y(F_t^{-1}(p)) \\
    &=  \frac{1}{2}\left[1 + \text{erf}\left(\frac{F_t^{-1}(p)-\mu_y}{\sigma_y \sqrt{2}}\right)\right]  \\
    &=  \frac{1}{2}\left[1 + \text{erf}\left(\frac{\mu_t + \sigma _t\sqrt{2} \text{erf}^{-1}\left(2p-1\right)-\mu_y}{\sigma_y \sqrt{2}}\right)\right]  \\
    &=  \frac{1}{2}\left[1 + \text{erf}\left(\frac{\mu_t -\mu_y}{\sigma_y \sqrt{2}} + \frac{\sigma _t}{\sigma_y }\text{erf}^{-1}\left(2p-1\right)\right)\right],
\end{align*}

which also evaluates to $p$ for a perfect model, i.e., when $\mu_t = \mu_y$ and $\sigma_t = \sigma_y$:
\begin{align*}
    g(p) &= \frac{1}{2}\left[1 + \text{erf}\left(\frac{0}{\sigma_y \sqrt{2}} + 1 \cdot \text{erf}^{-1}\left(2p-1\right)\right)\right]  \\
    &= \frac{1}{2}\left[1 + 2p-1\right]  \\
    &= p.
\end{align*}


\newpage
\section{Variance of Calibration Curves as function of Validation Set Size}

We first illustrate empirically how accurately we can assess the calibration curve as a function of the size of the validation set $N$. This can be seen in \cref{fig:n-vs-variance}, where the true data generating distribution $p_y(y|x) = \mathcal{N}(y|0,1)$ is approximated by six different model distributions $p_t(y|x)$ (one for each row). The first column shows the PDF and CDF of the true distribution and the model distribution in blue and black, respectively. Each of the subsequent columns shows the estimated calibration curves as a function of the number of validation samples $N$. We repeat this experiment one hundred times and display the mean and confidence intervals corresponding to $\pm 2$ standard deviations.

\begin{figure}[H]
    \includegraphics[width=\textwidth]{figs/calibration_nsamples.pdf}
    \caption{Examples of calibration curves computed on various numbers of test examples $N$, when the true data comes from a standard Gaussian and the model (left plots) varies (each row). Even in the best case scenario when samples are i.i.d., a large sample-to-sample variance can be expected in the ranges of $N$ for which BO normally operates. Calibration curve distributions are made from 100 random seeds, and the intervals correspond to two times the standard deviation. \label{fig:n-vs-variance}}
\end{figure}

\newpage

\section{Empirical Confirmation of Proposition 1}

To validate Proposition 1, we now expand the experiment from \cref{fig:n-vs-variance}.
In \cref{fig:std_calibration_n}, we have conducted a numerical experiment, where we sample 100 models of the form $p_t(y|x) = \mathcal{N}(y|\mu, \sigma^2)$, where $\mu \sim \mathcal{N}(0, 1)$ and $\sigma \sim \text{LogNormal}(1,1)$.
For each model, we compute 100 calibration curves for each sample size $N \in [5,1000]$ and subsequently estimate the variance of those curves. \cref{fig:std_calibration_n} shows the maximum standard deviation as a function of the sample size $N$.

\begin{figure}[H]
    \includegraphics[width=\columnwidth]{figs/sup_std_calibration.pdf}
    \caption{Maximum uncertainty across $p$ for the calibration distribution $\mathcal{C}_y(p)$ when $N$ samples of $y$ are given for computing the individual calibration curves. We sample 100 models (normal distributions), each with parameters $\mu_i \sim \text{Normal}(0,1)$ and $\sigma_i \sim \text{LogNormal}(1,1)$, each modelling data coming from a standard normal. For each experiment, 100 calibration curves, that is, 100 independent samples of size $N$ from the true model, are used to compute the mean and standard deviation. We also plot the function $f(N) = a/\sqrt{N}$ for $a\approx 1.05$. \label{fig:std_calibration_n}}
\end{figure}

\bibliography{foldager_370} 
\end{document}
