% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amssymb}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{ott_114}



% Text-style variants of amsmath's align environments: inside each of them
% \displaystyle is aliased to \textstyle (the \let is local to the
% environment's group) before the underlying amsmath environment is started.
% NOTE(review): presumably intended to typeset sums/fractions more compactly
% in displayed equations -- confirm.
%
% talign*: wraps align*; \csname ... \endcsname is needed because the
% environment name contains a `*'.
\newenvironment{talign*}
 {\let\displaystyle\textstyle\csname align*\endcsname}
 {\endalign}
% talign: wraps align, started with the same \displaystyle aliasing.
\newenvironment{talign}
{\let\displaystyle\textstyle\align}
{\endalign}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Hyperlink appearance: colored links are enabled, with the citation and
% internal-link colors both set to black.
\hypersetup{colorlinks=true, citecolor=black, linkcolor=black} % This is from Philipp not in original file
% \update{<text>}: typesets <text> in olive via \textcolor.
% NOTE(review): presumably used to highlight revised passages -- confirm.
\newcommand{\update}[1]{\textcolor{olive}{#1}}

\title{Bayesian Numerical Integration with Neural Networks \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1, 2]{\href{mailto:katharina.ott3@de.bosch.com?Subject=Your UAI 2023 paper}{Katharina~Ott}{}}
\author[1]{Michael~Tiemann}
\author[2, 3]{Philipp~Hennig}
\author[4, 5]{Fran\c{c}ois-Xavier~Briol}
% Add affiliations after the authors
\affil[1]{%
Bosch Center for Artificial Intelligence,
Renningen, Germany
}
\affil[2]{%
University of Tübingen, Tübingen, Germany
}
\affil[3]{%
MPI for Intelligent Systems,
Tübingen, Germany
}
\affil[4]{%
Department of Statistical Science, University College
London, London, United Kingdom
}
\affil[5]{%
The Alan Turing Institute, London, United Kingdom
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{EXPERIMENTAL DETAILS}

Below we give implementation details for all the datasets used and provide additional experimental results.

For our implementation of the BSN and the experiments described in the main text we make use of the following packages: \texttt{PyTorch} \citep{pytorch}, \texttt{emukit} \citep{paleyes2019emukit}, \texttt{GPyTorch} \citep{gpytorch}, \texttt{laplace-torch} \citep{daxberger2021laplace}, \texttt{PyWake} \citep{pywake}, and \texttt{Matplotlib} \citep{matplotlib}.

\subsection{Impact of Architecture Design}
\label{sec:sup_architecture}

We provide additional discussion concerning the choice of activation function, choice of sampling strategy and choice of optimizer.


\subsubsection{Choice of Optimizer}
\label{sec:optimizer}

We compare different optimizers for the BSN and a standard neural network, where for the standard neural network we use the same architecture as for $u_{\theta_u}$.
We use the 1-dimensional wind farm dataset with $n=320$ data points.
We choose this dataset, due to the complicated structure of the score function of a mixture of Gaussians.
For the experiment we consider three optimizers, i.e., Adam \citep{kingma2015}, L-BFGS \citep{liu1989limited} and the Hessian-free optimizer \citep{martens2010deep}.
For Adam, we use mini-batching with a batch size of $32$ and full-batch training.
For the Hessian-free optimizer and L-BFGS we only consider full-batch training.
For Adam, we use $10000$ iterations, for the Hessian-free optimizer $1000$ iterations, and for L-BFGS we use automatic stopping based on the strong Wolfe conditions. 
We compare the loss for all training methods.
We also use CELU and RELU activation functions, where RELU is included as it is the standard activation function for neural networks.

Training a standard neural network with RELUs works significantly better than using CELUs, both in terms of the loss reached at the end of training and in terms of runtime (see Figure~\ref{fig:regression_performance} and Figure~\ref{fig:regression_performance_relu}).
Using RELUs does not work for the BSN, as the gradients of $u_{\theta_u}$ lead to discontinuities.

Training of the BSN using Adam is considerably slower than the training progress of the standard neural network.
We find that for the CELU activation function, using (approximate) second-order methods leads to a large improvement both in terms of speed and loss.
The success of the second order methods might be due to a narrow loss landscape, i.e., a larger spread in the eigenvalue spectrum of the curvature.
Therefore, we also examine the condition number of the Hessian, and we find that the BSN has a slightly higher condition number than the standard neural network (we do not report the condition number for RELUs as it cannot be computed numerically).
Given its short runtime and good optimization results, we choose L-BFGS for all our experiments.

\begin{figure}
\includegraphics{fig/regression_act_celu.pdf}
\caption{Regression performance of a plain neural network (red) and a BSN (blue) using CELU activations. 
Loss (\emph{left}), and condition number (\emph{right}) as a function of the iteration. 
\emph{Centre:} loss as a function of the runtime.
Thin dark lines correspond to training with full-batch Adam.
Runtime of the Hessian-free optimizer not plotted, due to its long runtime.
}
\label{fig:regression_performance}
\end{figure}

\begin{figure}
\includegraphics{fig/regression_act_relu.pdf}
\caption{Regression performance of a plain neural network (red) and a BSN (blue) using RELU activations. 
Loss (\emph{left}) as a function of the iteration. 
\emph{Centre:} loss as a function of the runtime.
Thin dark lines correspond to training with full-batch Adam.
Runtime of the Hessian-free optimizer not plotted, due to its long runtime.
}
\label{fig:regression_performance_relu}
\end{figure}

\subsubsection{Sampling Strategies}
\label{sec:sup_sampling}

For our experiments in the main text, we choose the data points by sampling from $\pi$, i.e., $x_i \sim \pi$.
Here we consider two additional sampling strategies:
\begin{itemize}
  \item Using a quasi-Monte Carlo (QMC) sequence. We use \texttt{SciPy}'s \citep{scipy} implementation of QMC based on the Sobol sequence \citep{sobol1967on}.
  \item Linearly spaced points in a hypercube (called \emph{grid} in Figure~\ref{fig:genz_1_sampling}). Here we consider the hypercube $[-5 \sigma_{\pi}, 5 \sigma_{\pi}]^d$, where $\pi(x) = \mathcal{N}(x | 0, \sigma_{\pi})$.
\end{itemize}

Figure~\ref{fig:genz_1_sampling} shows the result of the different sampling strategies in $d=1$.
The BSN performs better using MC samples than using QMC samples and grid points.
The low performance of the latter is expected, since too few points are placed in regions with a high probability mass.

\begin{figure}
\includegraphics{fig/genz_1_sampling.pdf}
\caption{Comparing different sampling schemes on the continuous Genz dataset in $d=1$. The BSN is trained on MC-sampled points, QMC-sampled points and on a regular grid.
  Mean relative integration error (\emph{left}), and runtime (\emph{centre}), (based on 5 repetitions) as a function of $n$. 
  \emph{Right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:genz_1_sampling}
\end{figure}

\subsubsection{Choice of Architecture}
\label{sec:architecture_search}

We consider a basic architecture of the following form:
\begin{talign*}
    u_{\theta_u} = \text{Linear}(d, h) \circ \text{CELU} (\circ \text{Linear}(h, h) \circ \text{CELU})^l \circ \text{Linear}(h, d), 
\end{talign*}
where $h$ is the number of hidden units and $l$ is the number of hidden layers.
Figure~\ref{fig:architecture_search} shows the performance of different architectures on the 1-dimensional continuous Genz dataset.
All architectures perform similarly, but the architecture with $l=2$ and $h=32$ reaches the lowest error the fastest for large $n$.
Hence, we use this architecture for our experiments.

\begin{figure}
\includegraphics{fig/genz_1_network.pdf}
\caption{
  Testing different architectures on the 1-dimensional continuous Genz dataset.
  Mean relative integration error (\emph{left}), and run time (\emph{centre}), (based on 5 repetitions) as a function of $n$. 
  \emph{Right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:architecture_search}
\end{figure}

\begin{figure}
\includegraphics{fig/genz_1_m.pdf}
\caption{\emph{Continuous Genz dataset in $d=1$ with different $m(x)$.}
Mean relative integration error (\emph{left}), run time (\emph{center}) (based on 5 repetitions) as a function of $n$. 
\emph{Right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:genz_m}
\end{figure}
\subsubsection{Choice of $m(x)$}
For most of our experiments we set $m(x) = I_{d}$ in Equation~\eqref{eq:stein_operator}.
This might not necessarily be the best choice for a given task, but finding a function $m$ that works well is hard.
We test different $m$ on the 1-dimensional Continuous Genz function:
\begin{itemize}
  \item $m(x) = \frac{I_d}{||x||_2^2 +1}$ - $m(x)$ goes to zero for $x \rightarrow \pm\infty$
  \item $m(x) = \frac{I_d}{\sqrt{||x||_2^2 +1}}$ - $m(x)$ goes to zero for $x \rightarrow \pm\infty$ and cancels the $\nabla_x \log \pi(x)$ term for large $x$.
  \item $m(x) = I_d \pi(x)$ - in cases where $\pi$ is a normal distribution, this function also goes to zero for $x \rightarrow \pm\infty$.
  \item $m(x) = \mathrm{diag}(x)$ - an example of a function that has a negative effect.
\end{itemize} 
The results of comparing these different $m$ are shown in Figure~\ref{fig:genz_m}.
On this test problem, none of the proposed $m$ significantly outperforms the choice $m(x)=I_d$, with some performing significantly worse.

\subsubsection{Choice of GP Kernel}

  As a benchmark we use BQ with an RBF kernel for all our experiments.
  The reason for this choice of kernel is the closed form availability of posterior mean and covariance when $\pi$ is a normal distribution.
  Here we add an experiment using a Matern 1/2 kernel.
  For this choice of kernel the posterior mean is only available in $d=1$, hence we conduct the experiment on the 1 dimensional Genz dataset (see Section~\ref{sec:matern} for the expression of the kernel mean embedding).
  The corresponding results are found in Figure~\ref{fig:genz_matern}. Once again, we do not observe a significant difference in performance, except for the continuous Genz dataset. 

\begin{figure}
\includegraphics{fig/genz_1_matern.pdf}
\caption{\emph{BQ with Matern 1/2 kernel on the Genz family in $d=1$.}
Mean relative integration error (based on 5 repetitions) as a function of $n$. 
}
\label{fig:genz_matern}
\end{figure}

\subsection{Genz Benchmark}
\label{sec:genz_family}
\label{sec:exp_details_genz}
In our experiments we use the Genz integrand family dataset.
Here we include a short description of each dataset, plus additional experiments on the 2-dimensional version of each dataset.
In our experiments we integrate the Genz function against a standard normal $\pi(x) = \mathcal{N}(x | 0, 1)$.
This requires the transformation of the inputs to the original Genz functions $f$, which are to be integrated over $[0, 1]^d$.
Therefore, we compute $\Pi_{\pi}[f \circ c]$ where $c(x) = \frac{1}{2} \left( 1 + \text{erf} \left( \frac{x}{\sqrt{2}} \right) \right)$ is the cumulative distribution function of the standard normal.
We give the form of $f$ below.

\paragraph{Continuous Genz dataset}
The integrand is given by
\begin{talign*}
  f(x) = \exp \left( - \sum_{k=1}^d a_k | x_k - u_k| \right)
\end{talign*}
with parameters $a_k = 1.3$ and $u_k = 0.55$.
See Figure~\ref{fig:genz_continuous} for results on a 2 dimensional version of this dataset.

\begin{figure}
\includegraphics{fig/genz_2_continuous.pdf}
\caption{\emph{Continuous Genz dataset in $d=2$.}
Mean relative integration error (\emph{left}), run time (\emph{centre-left}), and calibration (\emph{right}) (based on 5 repetitions) as a function of $n$. 
\emph{Center-right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:genz_continuous}
\end{figure}

\paragraph{Corner Peak dataset}
The integrand is given by
\begin{talign*}
  f(x) = \left(1 +  \sum_{k=1}^d a_k x_k \right)^{-(d+1)}
\end{talign*}
with parameters $a_k = 5$.
See Figure~\ref{fig:genz_corner} for results on a 2 dimensional version of this dataset.

\begin{figure}
\includegraphics{fig/genz_2_corner.pdf}
\caption{\emph{Corner Peak dataset in $d=2$.}
Mean relative integration error (\emph{left}), run time (\emph{centre-left}), and calibration (\emph{right}) (based on 5 repetitions) as a function of $n$. 
\emph{Center-right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:genz_corner}
\end{figure}

\paragraph{Discontinuous Genz dataset}
The integrand is given by
\begin{talign*}
  f(x) = \begin{cases}
    0, & \text{if } x_k > u_k \text{ for any } k,\\
    \exp\left( \sum_{k=1}^d a_k x_k \right), & \text{otherwise,}
  \end{cases}
\end{talign*}
with parameters $a_k = 5$ and $u_k=0.5$.
See Figure~\ref{fig:genz_discontinuous} for results on a 2 dimensional version of this dataset.

\begin{figure}
\includegraphics{fig/genz_2_discontinuous.pdf}
\caption{\emph{Discontinuous Genz dataset in $d=2$.}
Mean relative integration error (\emph{left}), run time (\emph{centre-left}), and calibration (\emph{right}) (based on 5 repetitions) as a function of $n$. 
\emph{Center-right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:genz_discontinuous}
\end{figure}

\paragraph{Gaussian peak dataset}
The integrand is given by
\begin{talign*}
  f(x) = \exp \left( - \sum_{k=1}^d a_k^2 (x_k - u_k)^2 \right)
\end{talign*}
with parameters $a_k = 5$ and $u_k = 0.5$.
See Figure~\ref{fig:genz_gaussian} for results on a 2 dimensional version of this dataset.

\begin{figure}
\includegraphics{fig/genz_2_gaussian.pdf}
\caption{\emph{Gaussian peak dataset in $d=2$.}
Mean relative integration error (\emph{left}), run time (\emph{centre-left}), and calibration (\emph{right}) (based on 5 repetitions) as a function of $n$. 
\emph{Center-right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:genz_gaussian}
\end{figure}

\paragraph{Product peak dataset}
The integrand is given by
\begin{talign*}
  f(x) = \prod_{k=1}^d \frac{1}{\left( a_k^{-2} + (x_k - u_k)^2\right)}
\end{talign*}
with parameters $a_k = 5$ and $u_k = 0.5$.
See Figure~\ref{fig:genz_product} for results on a 2 dimensional version of this dataset.

\begin{figure}
\includegraphics{fig/genz_2_product.pdf}
\caption{\emph{Product peak dataset in $d=2$.}
Mean relative integration error (\emph{left}), run time (\emph{centre-left}), and calibration (\emph{right}) (based on 5 repetitions) as a function of $n$. 
\emph{Center-right:} Run time in seconds as a function of mean relative integration error.
}
\label{fig:genz_product}
\end{figure}

\paragraph{Oscillatory Genz dataset}
The integrand is given by
\begin{talign*}
  f(x) = \cos \left(2 \pi u + \sum_{k=1}^d a_k x_k \right)
\end{talign*}
with parameters $a_k = 5$ and $u = 0.5$.
See Figure~\ref{fig:genz_oscillatory} for results on a 2 dimensional version of this dataset.
\begin{figure}
\includegraphics{fig/genz_2_oscillatory.pdf}
\caption{
\emph{Oscillatory Genz dataset in $d=2$.}
Mean relative integration error (\emph{left}), run time (\emph{centre-left}), and calibration (\emph{right}) (based on 5 repetitions) as a function of $n$. 
\emph{Center-right:} Run time in seconds as a function of mean relative integration error.
  }
\label{fig:genz_oscillatory}
\end{figure}


\subsection{Goodwin Oscillator}
\label{sec:goodwin_oscillator}

The Goodwin oscillator \citep{goodwin1965oscillatory} describes how the feedback loop between mRNA transcription and protein expression can lead to oscillatory dynamics in a cell. 
We here consider the case with no intermediate protein species.
The experimental setup is based on earlier work by \citet{riabiz2020, chen2019stein, Calderhead2009, Oates2016thermo}.

The Goodwin oscillator with no intermediate protein species is given by:
\begin{talign*}
  \frac{du_1}{dt} &= \frac{a_1}{1 + a_2 u_2^\rho} - \alpha u_1\\
  \frac{du_2}{dt} &= k_1 u_1 - \alpha u_2,
\end{talign*}
where $u_1$ corresponds to the concentration of mRNA and $u_2$ to the concentration of the corresponding protein product.
We set $\rho = 10$.

As initial conditions we set $u_0 = (0, 0)$.
To generate the ground truth dataset, we set $a_1 = 1$, $a_2 = 3$, $k_1=1$ and $\alpha=0.5$.
We use a measurement noise of $\sigma = (0.1, 0.05)$.
Data was collected for $2400$ time points in $t\in[1, 25]$, leading to the following expression for the likelihood:
\begin{talign*}
 p(y | x) \propto \exp \left(- \frac{1}{2 \sigma_1^2} \sum_{k=1}^{2400} ||y_{1,k} - u_1(t_k) ||_2^2- \frac{1}{2 \sigma_2^2} \sum_{k=1}^{2400} ||y_{2,k} - u_2(t_k) ||_2^2 \right)
\end{talign*}
We use \texttt{JAX}'s implementation of Dopri5(4) to solve the ODE. 
We use automatic differentiation implemented in \texttt{JAX} to compute derivatives of the likelihood with respect to the parameters.
To avoid parameters becoming negative, we use log-transformed parameters $w = \log(x)$ for the parameter inference via MCMC.
We place a standard normal prior on the log-transformed parameters $w$.
For this dataset we choose MALA, based on the successful application and extensive study of this MCMC algorithm in previous work \citep{riabiz2020, chen2019stein, Calderhead2009, Oates2016thermo}.
For each dataset we run five chains, where the initial conditions for each chain are sampled from the prior.
Each chain is run with a step size of $h=0.0033$ for $500000$ steps.
We use thinning with a step size of $20$.
This results in datasets of $n=125000$.
We did not use any warm-up on this dataset to keep the choice consistent for all dataset sizes. 

Figure~\ref{fig:goodwin_oscillator2} shows the results for the remaining two parameters not shown in the main text.
\begin{figure*}
	\includegraphics{fig/ode_4_1.pdf}
	\caption{
  \emph{Posterior expectations for the parameters of a Goodwin ODE}.
  Mean relative integration error (\emph{left} and \emph{centre-right}), and uncertainty estimates (\emph{centre-left} and \emph{right}) (based on 5 repetitions) as a function of $n$. 
  }
	\label{fig:goodwin_oscillator2}
\end{figure*}

\subsection{Wind Farm Modelling}
\label{sec:exp_details_wind_farm}
For the wind farm model in our experiments, we assume we have a large-scale wind farm with equally spaced turbines on a two-dimensional grid and an ambient turbulence intensity. 
For each turbine, we use a wake deficit model by \citet{Niayifar2016}.
We put the following distributions on parameters for the wind farm simulation
\begin{itemize}
  \item \textbf{Turbine resistance coefficient:} Gaussian distribution with mean $\mu=1.33$ and variance $\sigma^2=0.1$.
  \item \textbf{Coefficient describing the wake expansion:} Gaussian distribution left-truncated at $0$ with mean $\mu=0.38$ and variance $\sigma^2 = 0.001$.
  \item \textbf{Second coefficient describing the wake expansion:} Gaussian distribution left-truncated at $0$ with mean $\mu=4 \times 10^{-3}$ and variance $\sigma^2=10^{-8}$.
  \item \textbf{Turbulence intensity:} Gaussian distribution left-truncated at $0$ with mean $\mu=0.1$ and variance $\sigma^2 = 0.003$.
  \item \textbf{Wind direction:} Mixture of Gaussian distributions truncated so as to have support on $[0, 45]$ with means $\mu_1=0, \mu_2=22.5, \mu_3=33.75$ and variances $\sigma_1^2=50, \sigma_2^2=40, \sigma_3^2=8$.
  \item \textbf{Hub heights:} Gaussian distribution left-truncated at $0$ with mean $\mu=100$ and variance $\sigma^2=0.5$.
  \item \textbf{Hub diameter:} Gaussian distribution left-truncated at $0$ with mean $\mu=100$ and variance $\sigma^2=0.1$.
\end{itemize}

These distributions were chosen to have scales which might realistically represent uncertainty for their input, but if applying our method in practice these would have to be elicited from wind-farm experts. Note that the BSNs could be applied to much more complex distributions so-long as the density of $\Pi$ can be evaluated pointwise up to some normalization constant.

Our code is based on the code estimating the local turbine thrust coefficient by \citet{Kirby2022} using a low-order wake model provided here: \url{https://github.com/AndrewKirby2/ctstar_statistical_model}. This code is based on the \texttt{PyWake} package \citep{pywake}.

\section{BAYESIAN QUADRATURE}
We now provide a short introduction to BQ and the derivation of the kernel mean embedding for truncated Gaussians.
\subsection{Introduction to Bayesian quadrature}
\label{sec:bq_details}

Recall that we are interested in approximating the integral $\Pi[f] = \int_{\mathcal{X}} f(x) \pi(x) dx$. BQ works by placing a $\mathcal{GP}(m, k)$ prior on $f$, i.e., a GP with mean function $m:\mathcal{X} \rightarrow \mathbb{R}$ and covariance function (or kernel) $k : \mathcal{X}  \times \mathcal{X} \rightarrow \mathbb{R}$.
Then, given observations $\{x_i,f(x_i)\}_{i=1}^n$, we can compute the posterior mean and variance on the value of $\Pi[f]$ as
\begin{talign*}
	\mathbb{E}\left[ \Pi[f] \right] &= \int_{\mathcal{X}} \left(m(x) + k(x, x_{1:n}) k(x_{1:n},x_{1:n})^{-1} (f(x_{1:n})-m(x_{1:n}))\right) \pi(x) dx\\
	&= \Pi[m] + \Pi[k(\cdot, x_{1:n})] k(x_{1:n},x_{1:n})^{-1} (f(x_{1:n})-m(x_{1:n})),\\
	\mathbb{V}\left[ \Pi[f] \right] &=
	\int_{\mathcal{X}} \int_{\mathcal{X}} \left(k(x, x') - k(x, x_{1:n}) k(x_{1:n},x_{1:n})^{-1}  k(x_{1:n}, x')\right) \pi(x) \pi(x') dx dx'\\
	&= \Pi\bar{\Pi}[k] - \Pi[k(\cdot, x_{1:n})] k(x_{1:n},x_{1:n})^{-1} \Pi[k(x_{1:n}, \cdot)],
\end{talign*}
where $f(x_{1:n}) \in \mathbb{R}^n$ with $[f(x_{1:n})]_{i} = f(x_i)$, $m(x_{1:n}) \in \mathbb{R}^n$ with $[m(x_{1:n})]_{i} = m(x_i)$,  $k(x_{1:n},x)^\top = k(x,x_{1:n})\in \mathbb{R}^n$ with $[k(x,x_{1:n})]_i = k(x,x_{i})$,  $k(x_{1:n}, x_{1:n}) \in \mathbb{R}^{n \times n}$ with $[k(x_{1:n}, x_{1:n})]_{ij} = k(x_i,x_j)$ for all $i,j$ in $\{1,\ldots,n\}$. Finally, $\Pi\bar{\Pi}[k] = \int_{\mathcal{X}} \int_{\mathcal{X}} k(x, x')  \pi(x) \pi(x') dx dx'$. 


Clearly, the expressions above can only be used if $\Pi[k(\cdot, x)]$, called the kernel mean embedding, and $\Pi\bar{\Pi}[k]$, called the initial error, are known in closed-form. This is only possible for some combinations of distribution $\pi$ and covariance function $k$. For example, if $\mathcal{X}=\mathbb{R}^d$, $\pi$ is a Gaussian and $k$ is the RBF-kernel, then the expressions above can be computed analytically. A more challenging case is that of truncated Gaussian distributions. In the next section, we show that the kernel mean embedding can be derived in that case.

\subsection{Kernel Mean Embedding for Truncated Gaussians}
\label{sec:bq_truncated_gaussians}
For truncated Gaussian distributions and the RBF kernel, we can compute the posterior mean but not the posterior variance.
Here we consider the 1-dimensional case with $\mathcal{X} = [a, b]$ which can be extended to the $d$-dimensional case for isotropic Gaussians. 
We provide the expression for the kernel mean embedding: $\Pi[k(\cdot, x)] = \int_{\mathcal{X}} k(x', x) \pi(x')dx'$. We consider the case when $\pi$ is a truncated Gaussian and introduce the following notation:
\begin{talign*}
	\pi(x) = \frac{\phi(x, \mu, \sigma)}{\Phi(\frac{b-\mu}{\sigma}) - \Phi(\frac{a -\mu}{\sigma})}
\end{talign*}
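where $\phi(x, \mu, \sigma) = (\sqrt{2 \pi} \sigma)^{-1} \exp (-(x-\mu)^2/(2\sigma^2))$ is the Gaussian density with mean $\mu$ and standard deviation $\sigma$, and $\Phi(x) = \frac{1}{2}(1 + \text{erf}(x/\sqrt{2}))$ is the cumulative distribution function of the standard normal. 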

We use $Z$ to denote the normalization constant
\begin{talign*}
	Z(a,b,\mu, \sigma) = \Phi\left(\frac{b-\mu}{\sigma}\right) - \Phi\left(\frac{a -\mu}{\sigma}\right)
\end{talign*}
We rewrite the RBF kernel using the above identities $k(x, x') = \exp\left(-(x-x')^2/2l^2\right) = l \sqrt{2 \pi} \phi(x, x', l)$. We can now express the kernel mean embedding as:
\begin{talign*}
	\Pi[k(\cdot, x)] = \int_a^b  l \sqrt{2 \pi} \phi(x, x', l) \frac{\phi(x', \mu, \sigma)}{Z(a, b, \mu, \sigma)} dx' = C l \sqrt{2 \pi}\int_a^b \frac{\phi(x', \tilde{\mu}, \tilde{\sigma})}{Z(a, b, \mu, \sigma)} dx' = l \sqrt{2 \pi} C \frac{Z(a, b, \tilde{\mu}, \tilde{\sigma})}{Z(a, b, \mu, \sigma)},
\end{talign*}
where 
\begin{talign*}
	\tilde{\mu} = \frac{\mu l^2 + x \sigma^2}{\sigma^2 + l^2}, \qquad 
	\tilde{\sigma} = \sqrt{\frac{\sigma^2 l^2}{\sigma^2 + l^2}}, \qquad
	C = \frac{1}{\sqrt{2 \pi (\sigma^2 + l^2)}}\exp\left(-\frac{(\mu - x)^2}{2 (\sigma^2 + l^2)}\right).
\end{talign*}
\subsection{Kernel Mean Embedding for Matern 1/2 Kernel}
\label{sec:matern}
For Gaussian distributions and the Matern 1/2 kernel, we can compute the posterior mean but only in $d=1$.
We provide the expression for the kernel mean embedding: $\Pi[k(\cdot, x)] = \int_{\mathbb{R}} k(x', x) \pi(x')dx'$, where $\pi(x) = \mathcal{N}(x | 0, 1)$ is a standard normal and $k(x', x) = \exp(-|x - x'|/l)$ is the Matern 1/2 kernel.
\begin{talign*}
	\Pi[k(\cdot, x)]=\frac{1}{2} \exp \left(\frac{2 x l + 1}{2l^2}\right) \mathrm{erfc} \left(\frac{x + \frac{1}{l}}{\sqrt{2}}\right) + \frac{1}{2} \exp \left(\frac{1 - 2xl}{2l^2}\right) \left(\mathrm{erf}\left(\frac{x - \frac{1}{l}}{\sqrt{2}}\right) + 1 \right)
\end{talign*}

\section{ADDITIONAL BACKGROUND: LAPLACE APPROXIMATION}
\label{sec:ggn_approx}
The Laplace approximation constructs a second-order Taylor approximation around the maximum of the posterior, i.e., the mode of the posterior, which amounts to a Gaussian approximation of the posterior around the MAP (maximum a-posteriori) estimate.
Here we provide a detailed introduction. 

We want to compute a posterior for the parameters of our model
\begin{talign}
  \begin{split}
  p(\theta| \mathcal{D}) &= \frac{p(\mathcal{D}| \theta)p(\theta)}{Z},\quad \text{where}\\
  Z &= \int p(\mathcal{D}| \theta)p(\theta) d\theta.
  \end{split}
  \label{eq:posterior}
\end{talign}
Here, the integral for the normalization constant $Z$ is usually not tractable, and we will have to resort to some approximation to compute it.
We provide the expressions for negative log prior
\begin{talign}
  - \log p(\theta) = \frac{1}{2 \sigma_0^2} \|\theta\|_2^2 + \frac{p+1}{2}\log \left(2 \pi \sigma_0^2\right)
  \label{eq:prior}
\end{talign}
and the negative log likelihood
\begin{talign}
  - \log p(\mathcal{D}|\theta) = \frac{1}{2 \sigma^2} \sum_{i=1}^n \|f(x_i) - g_\theta(x_i)\|_2^2 + \frac{n}{2}\log \left(2 \pi \sigma^2\right),
  \label{eq:likelihood}
\end{talign}
where $\sigma$ is the dataset noise.
By comparing \eqref{eq:likelihood} and \eqref{eq:prior} to the mean square loss with weight decay
\begin{talign*}
    \begin{split}
        l_\text{tot}(\theta) & = l(\theta) + \lambda \|\theta\|_2^2 \\
        l(\theta) & = \frac{1}{n} \sum_{i=1}^n \|f(x_i) - g_\theta(x_i)\|_2^2
    \end{split}
\end{talign*}
we note $l \propto - \log p(\mathcal{D}| \theta)$ and $\lambda \|\theta\|_2^2 \propto - \log p(\theta)$.
Hence, the minimum of the loss corresponds to the maximum of the posterior, i.e., $\theta_{\text{MAP}} = \text{argmin}_{\theta} l_{\text{tot}}(\theta) = \text{argmin}_{\theta} -\log p(\mathcal{D}| \theta) - \log p(\theta) = \text{argmin}_{\theta} -\log p(\theta| \mathcal{D})$.  
We denote $L(\theta)  = \log p(\mathcal{D}| \theta) + \log p(\theta)$, and rewrite Equation \eqref{eq:posterior} 
\begin{talign*}
  p(\theta| \mathcal{D}) = \frac{e^{L(\theta)}}{Z}.  
\end{talign*}
To find a suitable approximation for the posterior, we use a second-order Taylor series expansion of $L$ around $\theta_{\text{MAP}}$
\begin{talign*}
  L(\theta) \approx L(\theta_\text{MAP}) + (\theta - \theta_\text{MAP})^\top \nabla_{\theta}L(\theta_{\text{MAP}})  + \frac{1}{2}(\theta - \theta_\text{MAP})^\top \nabla_{\theta}^2 L(\theta_{\text{MAP}}) (\theta - \theta_\text{MAP})
\end{talign*}
The second term is equal to zero by definition of $\theta_\text{MAP}$.
Hence, we arrive at a Gaussian approximation of the posterior $q(\theta)$ of the form
\begin{talign*}
 q_{\text{Laplace}}(\theta) = \mathcal{N}\left(\theta \mid \theta_\text{MAP}, \Sigma \right),
\end{talign*}
where $\Sigma$ is proportional to the inverse Hessian of the loss $l_{\text{tot}}$ evaluated at $\theta_\text{MAP}$: 
\begin{talign*}
 \Sigma^{-1} &= -\nabla_{\theta}^2 L|_{\theta=\theta_{\text{MAP}}} 
 =  \left(- \nabla_\theta^2 \log p(\mathcal{D}|\theta) - \nabla_\theta^2 \log p(\theta)\right)|_{\theta=\theta_{\text{MAP}}}\\
 &= H  +  \sigma_0^{-2} I_{p+1}.
\end{talign*}
Since computing $H$ is computationally expensive, we use the GGN Approximation to compute it.

\subsection{GGN Approximation}
For the Laplace approximation we are interested in computing the Hessian of the negative log likelihood $-\log p(\mathcal{D}| \theta) \propto \frac{1}{2 \sigma^2} \sum_{i=1}^n \|f(x_i) - g_\theta(x_i)\|_2^2$.
The GGN approximation is a positive-semi-definite approximation of the full Hessian $H$, i.e.,
\begin{talign*}
  H &=  \nabla_{\theta}^2 \left(-\log p(\mathcal{D}| \theta)\right)\\
    &=  \nabla_{\theta}^2 \frac{1}{2 \sigma^2} \sum_{i=1}^n \|f(x_i) - g_\theta(x_i)\|_2^2 \\
    &= \frac{1}{2 \sigma^2} \sum_{i=1}^n \left[ \left(\nabla_{\theta} g_{\theta}(x_i)|_{\theta=\theta_{\text{MAP}}}\right)^\top 
    2 
    \nabla_{\theta} g_{\theta}(x_i)|_{\theta=\theta_{\text{MAP}}}
  + \nabla_{\theta}^2 g_{\theta}(x_i)|_{\theta=\theta_{\text{MAP}}} 
  \left(\nabla_g \|f(x_i) - g\|_2^2|_{g=g_{\theta_{\text{MAP}}}(x_i)}\right) \right]\\
  &= H_{\text{GGN}} + R,
\end{talign*}
where $\sigma$ is the dataset noise as in Eq. \eqref{eq:likelihood}.
The GGN approximation is given by
\begin{talign*}
  H_{\text{GGN}} = \frac{1}{\sigma^2}\sum_{i=1}^n J(x_i)^\top J(x_i)
\end{talign*}
where $J(x_i) = \nabla_{\theta} g_{\theta}(x_i)|_{\theta=\theta_{\text{MAP}}}$.

\bibliography{sample}

\end{document}
