\section{Divergences between Gaussian measures}
%
\subsection{The KL divergence is infinite}
\label{sec:infinite_kl}
%
In this section, we show that the Kullback-Leibler (KL) divergence between the Gaussian measures $\measureQ_\phi^F \sim \gaussian{m_Q}{C_Q}$ and $\measureP^F\sim \gaussian{m_P}{C_P}$, respectively induced by the linearized BNN in Eq.~\ref{eq:linearized-q} and by a non-degenerate Gaussian process satisfying conditions given in \cref{sec:reg_kl}, is infinite.
While this has already been shown by \citet{burt2020understanding}, the proof is easier for Gaussian measures. 
We first need the Feldman-Hájek theorem, which tells us when the KL divergence between two Gaussian measures is well-defined.
%
\begin{theorem}[Feldman-Hájek, \citet{quang2022gpkl} Theorem 2, \citet{simpson2022} Theorem 7]\label{th:feldman_hajek}
Consider two Gaussian measures $\nu_1 = \gaussian{m_1}{C_1}$ and $\nu_2 = \gaussian{m_2}{C_2}$ on $\Ltwo{\mathcal{X}}{\rho}$. Then $\nu_1$ and $\nu_2$ are equivalent if and only if the following two conditions hold:
\begin{enumerate}[noitemsep,topsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=*]
    \item $m_1 - m_2 \in \text{Im}(C_2^{1/2})$
    \item The operator $T$ such that $C_1 = C_2^{1/2}(I-T)C_2^{1/2}$ is Hilbert-Schmidt, that is $T$ has a countable set of eigenvalues $\lambda_i$ that satisfy $\lambda_i < 1$ and $\sum_{i=1}^{\infty} \lambda_i^2 < \infty$.
\end{enumerate}
otherwise $\nu_1$ and $\nu_2$ are singular.
If $\nu_1$ and~$\nu_2$ are equivalent, then the Radon-Nikodym derivative exists and $\kl{\nu_1}{\nu_2}$ admits an explicit formula. Otherwise, $\kl{\nu_1}{\nu_2} = \infty$.
\end{theorem}
%
Let us now show that the KL divergence between $\measureQ_\phi^F$ and $\measureP^F$ is indeed infinite.
%
\begin{prop}\label{prop:infinit-kl}
The Gaussian measures $\measureQ_\phi^F$ and $\measureP^F$ are mutually singular and $D_\text{KL}(\measureQ_\phi^F || \measureP^F) = \infty$.
\end{prop}
\begin{proof}
The proof follows from the Feldman-Hájek theorem (\cref{th:feldman_hajek}).
In our case, $C_Q$ has at most $p$ non-zero eigenvalues as the covariance function of the GP induced by the BNN is degenerate, while $C_P$ has a (countably) infinite set of non-zero eigenvalues (the prior is non-degenerate by assumption).
Hence, for the equality in condition (2) to hold, $T$ must have eigenvalue $1$ which violates the requirement that $T$ is Hilbert-Schmidt \ie that its eigenvalues $\set{\lambda_i}_{i=1}^{\infty}$ satisfy $\lambda_i < 1$ and $\sum_{i=1}^{\infty} \lambda_i^2 < \infty$.
Therefore, $\measureQ_\phi^F$ and $\measureP^F$ are mutually singular and $D_\text{KL}(\measureQ_\phi^F || \measureP^F) = \infty$.
\end{proof}
%
\subsection{The regularized KL divergence}
\label{sec:app_reg_kl}
%
We provide the bound describing the asymptotic convergence of the regularized KL divergence estimator in \cref{eq:reg_kl_bound}. 
The error arises because taking a finite number~$M$ of context points effectively truncates the spectra of the covariance operators; with high probability, the estimator $\hat{D}_{\text{KL}}^\gamma$ converges to $D_{\text{KL}}^\gamma$ as $M\to\infty$. \\\\
%
\begin{theorem}[Convergence of estimator, \citet{quang2022gpkl} Theorem 45]
Assume the following:
\begin{enumerate}[noitemsep,topsep=0pt,parsep=0pt,partopsep=0pt,leftmargin=*]
    \item Let $T$ be a $\sigma$-compact metric space, that is $T = \bigcup_{i=1}^{\infty} T_i$, where $T_1 \subset T_2 \subset \cdots$ with each $T_i$ being compact.
    \item $\rho$ is a non-degenerate Borel probability measure on $T$, that is $\rho\br{B} > 0$ for each open set $B \subset T$.
    \item $K_1, K_2 : T \times T \to \real$ are continuous, symmetric, positive definite kernels and there exist $\kappa_1 > 0, \kappa_2 > 0$ such that $\int_T K_i(x, x) d\rho\br{x} \leq \kappa_i^2 \;$ for $i=1,2$.
    \item $\operatorname{sup}_{x \in T} K_i(x, x) \leq \kappa_i^2 \;$ for $i=1,2$.
    \item $f_i \sim \mathcal{GP}\br{\mu_i, K_i}$, where $\mu_i \in L^2(T, \rho)$ for $i=1,2$.
    \item  $\exists B_i > 0$ such that $\norm{\mu_i}_{\infty} \leq B_i$ for $i=1,2$.
\end{enumerate}
Let $\vx = \{\vx^{(i)}\}_{i=1}^M$, $\vx^{(1)}, \dots, \vx^{(M)} \oset[.40ex]{\textup{\tiny i.i.d}}{\sim} \rho(\vx)$. If Gaussian measures $\gaussian{m_i}{C_i}$ are induced by GPs $f_i \sim \mathcal{GP}(\mu_i, K_i)$ for $i=1,2$, then for any $0 < \delta < 1$, with probability at least $1 - \delta$,
\begin{multline}\label{eq:reg_kl_bound}
    |\kl{\gaussian{\mu_1(\mathbf{x})}{K_1(\mathbf{x},\mathbf{x})+M \gamma \eye_M}}{{\gaussian{\mu_2(\mathbf{x})}{K_2(\mathbf{x}, \mathbf{x})+M \gamma \eye_M}}} 
    - \regkl{\gaussian{m_1}{C_1}}{\gaussian{m_2}{C_2}}| \\
\leq \frac{1}{2\gamma}(B_1 + B_2)^2 [1 + \kappa_2^2 / \gamma]^2 \br{\frac{2 \log \frac{48}{\delta}}{M}+\sqrt{\frac{2 \log \frac{48}{\delta}}{M}}} \\
+ \frac{1}{2\gamma^2} [\kappa_1^4 + \kappa_2^4 + \kappa_1^2 \kappa_2^2 (2 + \kappa_2^2 / \gamma)] \br{\frac{2 \log \frac{12}{\delta}}{M}+\sqrt{\frac{2 \log \frac{12}{\delta}}{M}}} 
\end{multline}
\end{theorem}
%
Note that \cref{eq:reg_kl_bound} provides a very general bound on the error that does not make assumptions on the spectral decay, and it may therefore dramatically overestimate the error. Indeed, we analyze convergence empirically in \cref{fig:regkl_vs_gamma} and observe that the estimator converges quickly except for very rough priors (e.g., Matérn-1/2) with very small $\gamma$.
%
\section{Additional details on the GFSVI objective estimator}
\label{app:sec_gfsvi_estimator}

In this section, we present details on the estimation of the generalized function-space variational inference (GFSVI) objective.
Let $f_L({\,\cdot\,; \vw})$ be the linearized BNN (Eq.~\ref{eq:linearized_nn}) with weights $\vw \in \real^p$, and $\mathcal{D} = \{(\vx_i, y_i)\}_{i=1}^N$ a data set with features~$\vx_i \in \mathcal{X} \subset \real^d$ and associated values~$y_i \in \mathcal{Y}$.
Assuming a likelihood $\prob{\mathcal{D} \g \vw} = \prod_{i=1}^N \prob{y_i \g f(\vx_i; \vw)}$ and a Gaussian variational distribution on model weights $q_\phi(\vw)=\gaussianx{\vw}{\vm}{\mS}$, the GFSVI objective function is 
\begin{equation}\label{app:objective}
    \mathcal{L}(\phi) = \sum_{i=1}^N \expect{\log \prob{y_i \g f_L(\vx_i; \vw)}}{q_\phi(\vw)} - \regkl{\measureQ_\phi^F}{\measureP^F}
\end{equation}
where $\measureQ_\phi^F$ and $\measureP^F$ are the Gaussian measures induced by the linearized BNN and a Gaussian process prior respectively.

\paragraph{Expected log-likelihood}

When considering a Gaussian likelihood, we use the closed form expression available due to the Gaussian variational measure over functions induced by the linearized BNN
\begin{equation} \label{eq:objective_expected_ll_gaussian}
    \expect{\log \gaussianx{y_i}{f_L(\vx_i;\vw)}{\sigma_y^2}}{q_\phi(\vw)} = -\frac{1}{2}\log \br{2\pi \sigma_y^2} - \frac{\br{y_i - f\br{\vx_i; \vm}}^2 + J\br{\vx_i;\vm} \mS J\br{\vx_i;\vm}\tp}{2\sigma_y^2}.
\end{equation}
When considering a Categorical likelihood with $C$ different classes, we estimate the expected log-likelihood term using Monte-Carlo integration as
\begin{equation}\label{eq:objective_expected_ll_categorical}
    \expect{\log \text{Cat}(y_i \g \sigma(f_L(\vx_i;\vw)))}{q_\phi(\vw)} \approx \frac{1}{K} \sum_{k=1}^K \sum_{c=1}^C \eye[y_i = c] \left[ f^c_L(\vx_i;\vw^{(k)}) - \log \left[ \sum_{c'=1}^C \exp \left( f^{c'}_L(\vx_i;\vw^{(k)})\right) \right] \right]
\end{equation}
where $\vw^{(k)} \sim q_\phi(\vw)$ for $k=1, \dots, K$, $\eye[\cdot]$ is the indicator function, $\sigma(\cdot)$ is the softmax function and $f^c_L(\,\cdot\,;\vw)$ is the logit for class $c$ obtained from $f_L$. 

\paragraph{Regularized KL divergence}
We estimate the regularized KL divergence using its consistent estimator (Eq.~\ref{eq:reg-kl-estimator})
\begin{multline}\label{eq:gfsvi-reg-kl-estimator}
    \regklhat{\measureQ_\phi^F}{\measureP^F} = \frac{1}{2} \br{f(\vx; \vm) - \mu(\vx)}\tp (K(\vx, \vx)+\gamma M \eye_M)^{-1} \br{f(\vx; \vm) - \mu(\vx)} \\
    + \frac{1}{2} \operatorname{Tr}\big[(K(\vx, \vx)+\gamma M \eye_M)^{-1}(J(\vx; \vm)\mS J(\vx; \vm)\tp+\gamma M \eye_M) - \eye_M\big] \\
     - \frac{1}{2} \log \det \big[(K(\vx, \vx)+\gamma M \eye_M)^{-1}(J(\vx; \vm)\mS J(\vx; \vm)\tp+\gamma M \eye_M)\big]
\end{multline}
with measurement points $\vx = \{\vx^{(i)}\}_{i=1}^M$, $\vx^{(1)}, \dots, \vx^{(M)} \oset[.40ex]{\textup{\tiny i.i.d}}{\sim} \rho(\vx)$ sampled from a probability measure on $\mathcal{X}$.

\paragraph{Computational complexity}

Evaluating the objective in Eq.~\ref{app:objective} has complexity $O(BKC + M^3)$ for Categorical likelihoods and $O(B + M^3)$ for Gaussian likelihoods, where $B$ is the batch size, $K$ the number of variational posterior samples, $C$ the number of classes, and $M$ the number of context points. The first term corresponds to the expected log-likelihood in our objective and the second term to the regularized KL divergence estimator. We note that evaluating the linearized neural network can be efficiently done in about 3x the cost of one forward pass using the Jacobian-vector product computational primitive.

\section{Additional details on the experimental setup}
\label{app:sec_exp_details}

\subsection{Experiments on synthetic data}
\label{app:sec_synthetic_data}

\paragraph{Regression}
We consider the following generative model for the toy data
\begin{equation}
    y_i = \operatorname{sin}(2\pi x_i) + \epsilon \quad \text{with} \quad \epsilon \sim \gaussian{0}{\sigma_n^2}
\end{equation}
and draw $x_i \sim \mathcal{U}([-1, -0.5] \cup [0.5, 1])$.
When not otherwise specified, we use $\sigma_n = 0.1$. 
On the plots, the data points are shown as gray circles, inferred mean functions as red lines, their 2-standard-deviations interval around the mean in light green, and functions sampled from the approximate posterior as green lines.
In general, we consider two hidden-layer BNNs with 30 neurons per layer and hyperbolic tangent activation (Tanh) functions. 
Specifically in \cref{fig:fsvi_prior_elicitation}, the small BNN has the same architecture as above while the large BNN has 100 neurons per layer. 
All the BNN baselines have the same architecture and fully-factorized Gaussian approximate posterior.
The prior scale is set to $\sigma_p = 0.2$ for TFSVI \citep{rudner2022fsvi} and to $\sigma_p = 0.75$ for MFVI \citep{blundell2015weight} and Laplace \citep{immer2021linlaplace}.
For the Gaussian process posterior baseline, we fit the prior parameters by maximizing the log-marginal likelihood \citep{williams2006gaussian}.
Apart from the cases where the parameters of the GP prior used for GFSVI (our method) and FVI \citep{sun2018functional} are explicitly stated, we consider a constant zero-mean function and find the parameters of the covariance function by maximizing the log-marginal likelihood from mini-batches \citep{chen2021gaussian}.
Except where otherwise stated, we estimate the functional KL divergences with 500 measurement points and use the regularized KL divergence with $\gamma = 10^{-10}$. 

\paragraph{Classification}
We sample 100 data points perturbed by Gaussian noise with $\sigma_n = 0.1$ from the two moons data \citep{scikitlearn}.
On the plots, the data points are shown as red (class 0) and blue (class 1) dots. 
We plot the mean and 2-standard-deviations of the probability that $\vx$ belongs to class 1 with respect to the posterior  (\ie $p(y=1 \g \vw^{(k)}, \vx)$) which we estimate from $K=100$ samples $\vw^{(k)} \sim q_\phi(\vw)$ for $k= 1, \dots, K$. 
We consider two hidden-layer BNNs with 100 neurons per layer and hyperbolic tangent activation (Tanh) functions. 
All the BNN baselines have the same architecture and fully-factorized Gaussian approximate posterior.
The prior scale is set to $\sigma_p = 0.8$ for MFVI \citep{blundell2015weight} and to $\sigma_p = 1.0$ for TFSVI \citep{rudner2022fsvi} and Laplace \citep{immer2021linlaplace}.
For the Gaussian process posterior baseline, we approximate the intractable posterior using the Laplace approximation and find the prior parameters by maximizing the log-marginal likelihood \citep{williams2006gaussian}.
The GP prior for GFSVI (our method) and FVI \citep{sun2018functional} has a constant zero-mean function and we find the parameters of the covariance function by maximizing the log-marginal likelihood from mini-batches \citep{chen2021gaussian} using the method to transform classification labels into regression targets from \citet{milios2018DirGP}.
We estimate the functional KL divergences with 500 measurement points and use the regularized KL divergence with $\gamma = 10^{-10}$. 

\subsection{Ocean current modeling experiment}
\label{app:sec_ocean_current_details}

Following \citet{cinquin2024fsplaplace}, we apply the Helmholtz decomposition to the neural network $f$ as
\begin{equation}
    f(\cdot, \vw) = \operatorname{grad} \Phi(\cdot, \vw_1) + \operatorname{rot} \Psi(\cdot, \vw_2)
\end{equation}
where $\vw = \set{\vw_1, \vw_2}$ and, $\Phi(\cdot, \vw_1)$ and $\Psi(\cdot, \vw_2)$ are 2-layer fully-connected neural networks with $50$ hidden units per layer and hyperbolic tangent activation functions. 
GFSVI and TFSVI both use $160$ fixed context points.
The prior scale of TFSVI is set to $\sigma_p=0.5$.
We fit the neural networks on the entire dataset and average the scores with respect to five different random seeds.

\subsection{Regression experiments with tabular data}
\label{app:sec_regression_details}

\paragraph{Datasets and pre-processing}
We evaluate the predictive performance of our model on regression datasets from the UCI repository \citep{Dua2019UCI} described in \cref{tab:regression_dataset_description}.
These datasets are also considered in \citet{sun2018functional, wild2022gvi} but we include two additional larger ones (Wave and Denmark).
We perform 5-fold cross validation, leave out one fold for testing, consider 10\% of the remaining 4 folds as validation data and the rest as training data.
We report mean and standard-deviation of the average expected log-likelihood and average mean square error on the test fold. 
We also report the mean rank of the methods across all datasets by assigning rank 1 to the best scoring method as well as any method whose error bars overlap with the highest score's error bars, and recursively apply this procedure to the methods that have not yet been assigned a rank.
The expected log-likelihood is estimated by Monte Carlo integration when it is not available in closed form (MFVI, TFSVI and FVI) with 100 posterior samples.
We preprocess the dataset by encoding categorical features as one-hot vectors and standardizing the features and labels. 

\paragraph{Baseline specification}
We compare our GFSVI method to two weight-space inference methods (mean-field variational inference \citep{blundell2015weight} and linearized Laplace \citep{immer2021linlaplace}) and two function-space inference methods (FVI \citep{sun2018functional} and TFSVI \citep{rudner2022fsvi}).
While FVI uses GP priors, TFSVI performs inference in function space but with the pushforward to function space of the variational distribution and prior on the weights.
We compute the function-space (regularized) KL divergence using a set of 500 measurement points sampled from a uniform distribution for GFSVI and TFSVI, and 50 points drawn from a uniform distribution along with 450 samples from the training batch for FVI as specified in \citet{sun2018functional}.
All the BNN baselines have the same architecture and fully-factorized Gaussian approximate posterior.
We also provide results with a GP \citep{williams2006gaussian} when the size of the dataset allows it, and a sparse GP \citep{hensman2013gaussian}.
As we restrict our comparison to BNNs, we do not consider the GP and sparse GP as baselines but rather as gold-standards. 
All models have a Gaussian homoskedastic noise model with a learned scale parameter.
All the BNNs are fit using the Adam optimizer \citep{kingma2017adam} using a mini-batch size of 2000 samples. 
We also perform early stopping when the validation loss stops decreasing.

\paragraph{Model selection}
Hyper-parameter optimization is conducted using the Bayesian optimization tool provided by Wandb \citep{wandb}.
BNN parameters are selected to maximize the average validation expected log-likelihood across the 5 cross-validation folds. 
We optimize over prior parameters (kernel and prior scale), learning-rate and activation function. 
We select priors for GFSVI, FVI, sparse GP and GP among the RBF, Matérn-1/2, Matérn-3/2, Matérn-5/2, Linear and Rational Quadratic covariance functions.
The GP prior parameters used with GFSVI and FVI are selected by maximizing the log-marginal likelihood from batches as proposed by \citet{chen2021gaussian} and done in \citet{sun2018functional}.
Hyper-parameters for GPs and sparse GPs (kernel parameters and learning-rate) are selected to maximize the mean log-marginal likelihood of the validation data across the 5 cross-validation folds. 

\begin{table}[t]
\scshape
\caption{UCI regression dataset description}
\label{tab:regression_dataset_description}
\resizebox{\linewidth}{!}{
\begin{tabular}{@{}llllllllllll@{}}
\toprule
Dataset & Boston & Naval  & Power & Protein & Yacht & Concrete & Energy & Kin8nm & Wine & Wave & Denmark\\ \midrule
Number samples & 506 & 11\,934 & 9\,568 & 45\,730 & 308 & 1\,030 & 768 & 8\,192 & 1\,599 & 288\,000 & 434\,874 \\
Number features & 13 & 16 & 4 & 9 & 6 & 8 & 8 & 8 & 11 & 49 & 2 \\ \bottomrule
\end{tabular}
}
\end{table}


\subsection{Classification experiments with image data}
\label{app:sec_classification_details}

\paragraph{Datasets and pre-processing}
We further evaluate the predictive performance of our model on classification tasks with the MNIST \citep{lecun2010mnist} and Fashion MNIST \citep{xiao2017FMNIST} image data sets.
We fit the models on a random subset of 90\% of the provided training split, consider the remaining 10\% as validation data and evaluate on the provided test split. 
We repeat this procedure 5 times with different random seeds and report the mean and standard-deviation of the average expected log-likelihood, accuracy and expected calibration error (ECE) of the mean of the predictive distribution on the test set.
The expected log-likelihood is estimated by Monte Carlo integration with 100 posterior samples when it is not available in closed form (MFVI, TFSVI and FVI). 
We estimate the mean of the predictive distribution to compute the accuracy and the ECE with 100 posterior samples. 
We preprocess the dataset by standardizing the images. 

\paragraph{Baseline specification}
We compare our GFSVI method to the same baselines as for the regression experiments (see \cref{app:sec_regression_details}).
% Architecture
All the BNN baselines have the same architecture and fully-factorized Gaussian approximate posterior.
More specifically, we consider a CNN with three convolutional layers (with output channels 16, 32 and 64) before two fully connected layers (with output size 128 and 10).
The convolutional layers use $3\times3$ shaped kernels. 
Each pair of convolutional layers is interleaved with a max-pooling layer. 
We consider three different measurement point distributions $\rho$ to estimate the (regularized) KL divergence in GFSVI, FVI and TFSVI: \textsc{random}, \textsc{random pixel} and \textsc{kmnist}.
The \textsc{random} measurement point distribution is sampled from by drawing 50\% of the samples from the training data batch and 50\% of the samples from a uniform distribution over $[p_{min}, p_{max}]^{H \times W \times C}$, where $H$, $W$ and $C$ are respectively the height, width and number of channels of the images, and $p_{min}=v_{min}-0.5 \times \Delta$ and $p_{max}=v_{max}+0.5 \times \Delta$ where $\Delta=v_{max}-v_{min}$ is the difference between the minimal ($v_{min}$) and maximal ($v_{max}$) pixel values of the data set.
The \textsc{random pixel} measurement point distribution is taken from \citet{rudner2022fsvi} and is sampled from by randomly choosing each pixel value among the ones available from the training data batch at the same position in the $28 \times 28$ pixel grid.
Finally, the \textsc{kmnist} measurement point distribution is also taken from \citet{rudner2022fsvi} and is drawn from by randomly sampling data points from the Kuzushiji-MNIST (KMNIST) dataset \citep{clanuwat2018kmnist}. 
The KMNIST dataset is a collection of 70\,000 gray-scale images of size $28 \times 28$ which we preprocess by standardizing the images.
We sample 25 measurement points when using \textsc{random}, 25 measurement points when using \textsc{random pixel} and 20 when using \textsc{kmnist}.
All the BNNs are trained using the Adam optimizer \citep{kingma2017adam} using a mini-batch size of 100. 
We also perform early stopping when the validation loss stops decreasing.

\paragraph{Model selection}
Hyper-parameter optimization is conducted just like for the regression tasks (see \cref{app:sec_regression_details}).
The Gaussian process prior parameters used with GFSVI and FVI are selected by maximizing the log-marginal likelihood from batches \citep{chen2021gaussian} using the method to transform classification labels into regression targets from \citet{milios2018DirGP}.
We optimize the same hyper-parameters as for the regression experiments with the exception of the additional $\alpha_\epsilon$ parameter introduced by \citet{milios2018DirGP} for the function-space VI methods with GP priors (FVI and GFSVI).

\subsection{OOD detection}
\label{app:sec_ood_details}

\paragraph{Tabular data with a Gaussian likelihood}

Following the setup from \citet{malinin2021uncertGBM} we take epistemic uncertainty to be the variance of the mean prediction with respect to samples from the posterior.
We consider the test data to be in-distribution (ID) data and a subset of the song dataset \citep{Bertin-Mahieux2011} of equal length and with an equal number of features as out-of-distribution (OOD) data.
We use the same preprocessing as for regression as well as the same baselines with the same hyper-parameters (see \cref{app:sec_regression_details}).
We first fit a model, then evaluate the extent to which the epistemic uncertainty under the model is predictive of the ID and OOD data using a single threshold obtained by a depth-1 decision tree fit to minimize the classification loss.
We report the mean and standard error of the accuracy of the threshold to classify OOD from ID data based on epistemic uncertainty across the 5 folds of cross-validation.
We also provide results obtained using a GP and sparse GP as gold standard.

\paragraph{Image data with a Categorical likelihood}

Following the setup by \citet{osawa2019practical}, we take the epistemic uncertainty to be the entropy of the mean of the predictive distribution with respect to samples from the posterior. 
We evaluate models trained on MNIST using MNIST's test split as ID data and a subset of the training set of Fashion MNIST as OOD data. 
Likewise, we evaluate models trained on Fashion MNIST using Fashion MNIST's test split as ID data and a subset of the training set of MNIST as OOD data. 
We use the same preprocessing as for classification, as well as the same baselines with the same hyper-parameters (see \cref{app:sec_classification_details}).
We first fit a model, then evaluate the extent to which the epistemic uncertainty under the model is predictive of the ID and OOD data using a single threshold obtained by a depth-1 decision tree fit to minimize the classification loss.
We estimate the mean of the predictive distribution by Monte-Carlo integration using 100 posterior samples.
We report the mean and standard error of the accuracy of the threshold to classify OOD from ID data based on epistemic uncertainty for the 5 models trained on different random seeds (see \cref{app:sec_classification_details}).


\subsection{Variational measure evaluation}
\label{app:sec_measure_eval_details}

We evaluate our inference method by comparing the samples drawn from the exact posterior over functions with the approximate posterior obtained with our method (GFSVI).
We follow the setup by \citet{wilson2022evalBNN} and we compute the average Wasserstein-2 metric between 1000 samples drawn from a GP posterior with an RBF kernel evaluated at the test points, and samples from the approximate posterior of GFSVI, sparse GP and FVI evaluated at the same points and with the same prior.
We consider the Boston, Concrete, Energy, Wine and Yacht datasets for which the exact GP posterior can be computed and use the same preprocessing as for regression (see \cref{app:sec_regression_details}).
We report the mean and standard error of the average Wasserstein-2 metric across the 5 folds of cross-validation.
The Wasserstein-2 metric is computed using the Python Optimal Transport library \citep{flamary2021pot}.

\paragraph{Baseline specification}
FVI and GFSVI have the same two hidden layer neural network architecture with 100 neurons each and hyperbolic tangent activation.
These models are fit with the same learning rate and set of 500 measurement points jointly sampled from a uniform distribution over the feature-space and mini-batch of training samples.
We use $\gamma=10^{-15}$ for the regularized KL divergence. 
We further consider a sparse GP with 100 inducing points.


\subsection{Software}
We use the JAX \citep{jax2018github} and DM-Haiku \citep{haiku2020github} Python libraries to implement our Bayesian neural networks.
MFVI, linearized Laplace and TFSVI were implemented based on the information in the papers, and code for FVI was adapted to the JAX library from the implementation provided by the authors. 
We further use the GPJAX Python library for experiments involving Gaussian processes \citep{Pinder2022gpjax}. 

\subsection{Hardware}
All models were fit using a single NVIDIA RTX 2080Ti GPU with 11GB of memory. 

\section{Additional experimental results}

In this section, we present additional figures for our qualitative uncertainty evaluation as well as further experimental results on regression, out-of-distribution detection and robustness under input distribution shift tasks.
We also provide plots illustrating the eigenvalue decay of different kernels, and figures showing the influence of $\gamma$ in the regularized KL divergence.

\subsection{Qualitative uncertainty evaluation}

\begin{figure*}[t]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/fsvi_varying_label_noise.pdf}
    }
    \caption{Our method (GFSVI) effectively regularizes functions generated by the Bayesian neural network (BNN) both in settings where the generative process is very noisy ($\sigma_n=1$) or not ($\sigma_n=0.1$).}
    \label{fig:fsvi_varying_label_noise}
\end{figure*}

\paragraph{Regression}

We further find that our method (GFSVI) provides strong regularization when the data generative process is noisy (see \cref{fig:fsvi_varying_label_noise}) and is more robust than FVI to situations where one's computational budget constrains the number of measurement points $M$ to be small (\cref{fig:fsvi_vs_fvi_robert}).
In contrast to FVI, GFSVI accurately approximates the exact GP posterior under rough (Matérn-1/2) GP priors, effectively incorporating prior knowledge defined by the GP prior into the inference process (see \cref{fig:fsvi_matern_vs_baselines}).
Likewise, GFSVI adapts to the variability of the functions specified by the kernel (see \cref{fig:fsvi_rbf_varying_lengscale}).
We also find that GFSVI requires a larger number of measurement points to capture the behavior of a rougher prior (see \cref{fig:fsvi_varying_n_context_points}).

\begin{figure*}[t]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/fsvi_RBF_vs_baselines.pdf}
    }
    \caption{Our method (GFSVI) with an RBF Gaussian process (GP) prior accurately approximates the exact GP posterior unlike the function-space prior baseline (FVI). Weight-space prior baselines do not provide a straight-forward mechanism to incorporate prior assumptions regarding the functions generated by BNNs and underestimate the epistemic uncertainty (MFVI, Laplace). The lower row is identical to the one in \cref{fig:fsvi_matern_vs_baselines} in the main text and is reproduced here to make comparison easier.}
    \label{fig:fsvi_RBF_vs_baselines}
\end{figure*}

\begin{figure*}[!h]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/fsvi_rbf_varying_lengscale.pdf}
    }
    \caption{Our method (GFSVI) allows to incorporate prior beliefs in terms of function variability using the characteristic length-scale parameter of the Gaussian process (GP) prior. GFSVI was fit using a GP prior with RBF covariance function.}
    \label{fig:fsvi_rbf_varying_lengscale}
\end{figure*}


\begin{figure*}[!h]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/fsvi_varying_n_context_points.pdf}
    }
    \caption{Our method (GFSVI) captures the smooth behavior of a Gaussian process (GP) prior with RBF covariance function even if the number of measurement points is small (M=10). However, in that setting GFSVI fails to reproduce the rough effect of a GP prior with a Matérn-1/2 covariance function, and requires a larger amount of measurement points to do so (M=100).}
    \label{fig:fsvi_varying_n_context_points}
\end{figure*}

\begin{figure*}[!h]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/fsvi_vs_fvi_robert.pdf}
    }
    \caption{Our method (GFSVI) already provides a reasonable approximation to the exact posterior with small numbers of measurement points (M=10) while function-space baseline FVI requires many more (M=100).}
    \label{fig:fsvi_vs_fvi_robert}
\end{figure*}


\paragraph{Classification}

We find that GFSVI better captures the beliefs induced by the smooth RBF and rough Matérn-1/2 Gaussian process priors compared to FVI (see Figures~\ref{fig:classification_gfsvi_RBF_vs_baselines} and \ref{fig:classification_gfsvi_Matern12_vs_baselines}).
Moreover, GFSVI both accurately fits the training data and shows greater uncertainty outside of its support relative to BNN baselines with weight-space and function-space priors.
Unlike for the toy data regression experiments where the GP posterior was the ground truth, the Laplace (approximate) GP posterior in Figures~\ref{fig:classification_gfsvi_RBF_vs_baselines} and \ref{fig:classification_gfsvi_Matern12_vs_baselines} only represents a possible approximation to the now intractable posterior (due to the softmax inverse link function).
Thus the GP should not be considered as the ground truth nor as the optimal approximation in the classification setting, but is nevertheless useful to give an idea of the level of uncertainty a BNN with a GP prior should provide outside of the support of the data.

\begin{figure*}[!h]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/classification_gfsvi_RBF_vs_baselines.pdf}
    }
    \caption{Our method (GFSVI) with an RBF Gaussian process (GP) prior accurately captures the smooth decision boundary induced by the prior and shows high uncertainty outside of the data support. 
    Weight-space baselines do not provide a straight-forward mechanism to incorporate prior assumptions regarding the functions generated by BNNs and underestimate the epistemic uncertainty (TFSVI, Laplace) or underfit the data (MFVI).}
    \label{fig:classification_gfsvi_RBF_vs_baselines}
\end{figure*}

\begin{figure*}[!h]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/classification_gfsvi_Matern12_vs_baselines.pdf}
    }
    \caption{Our method (GFSVI) with a Matérn-1/2 Gaussian process (GP) prior accurately captures the rough decision boundary unlike the function-space baseline (FVI).
    Weight-space baselines do not provide a straight-forward mechanism to incorporate prior assumptions regarding the functions generated by BNNs and underestimate the epistemic uncertainty (TFSVI, Laplace) or underfit the data (MFVI).}
    \label{fig:classification_gfsvi_Matern12_vs_baselines}
\end{figure*}


\paragraph{Inductive biases}
\cref{fig:fsvi_prior_elicitation} compares GFSVI to the exact posterior across two different priors and three model architectures (details in~\ref{app:sec_synthetic_data}).
We find that the BNN's ability to incorporate the beliefs introduced by the GP prior depends on its size and activation function.
When using piece-wise linear activations (ReLU), small models are prone to underfitting for smooth priors (RBF), and to collapsing uncertainty for rough priors (Matérn-1/2).
By contrast, when using smooth activations (Tanh), smaller models suffice, and they are compatible with most standard GP priors (the results shown in \cref{fig:fsvi_prior_elicitation} extend to RBF, Matérn family, and Rational Quadratic in our experiments).
We also analyzed how the number~$M$ of measurement points affects performance.
\cref{fig:fsvi_varying_n_context_points,fig:kernel_gram_eigendecay} show that capturing the properties of rough GP priors and estimating $D_\text{KL}^\gamma$ with these priors requires larger~$M$.
\begin{figure*}%[t]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/fsvi_prior_elicitation.pdf}
    }
    \caption{
    Our method requires that the Bayesian neural network (BNN) and Gaussian process (GP) prior share similar inductive biases to provide an accurate approximation to the exact posterior. 
    }
    \label{fig:fsvi_prior_elicitation}
\end{figure*}

\subsection{Regression on tabular data}

\begin{table*}[!h]
\scshape
\caption{Test mean square error (MSE) of evaluated methods on regression datasets. We find that GFSVI (ours) also performs competitively in terms of MSE compared to baselines and obtains the best mean rank, matching the best-performing methods on nearly all datasets.}
\label{tab:mse}
\resizebox{\linewidth}{!}{
\renewcommand{\arraystretch}{1}
\begin{tabular}{@{}lcccccccccc@{}}
\toprule
\multicolumn{1}{@{}l}{Dataset} & \multicolumn{2}{c}{Function-space priors} & \multicolumn{4}{c}{Weight-space priors} & \multicolumn{3}{c}{Gaussian Processes (Gold Standards)} \\ 
\cmidrule(rl){2-3} \cmidrule(rl){4-7} \cmidrule(rl){8-10}
\multicolumn{1}{@{}l}{} & GFSVI (ours) & \multicolumn{1}{c}{FVI} & TFSVI & MFVI & VIP & Laplace & GWI & Sparse GP & GP  \\ 
\midrule
Boston & \textbf{0.123 $\pm$ 0.021} & \textbf{0.136 $\pm$ 0.022} & 0.995 $\pm$ 0.092 & 0.532 $\pm$ 0.072 & 0.201 $\pm$ 0.056 & 0.203 $\pm$ 0.047 & 0.273 $\pm$ 0.069 & 0.122 $\pm$ 0.014 & 0.115 $\pm$ 0.020 \\
Concrete & \textbf{0.114 $\pm$ 0.008} & \textbf{0.116 $\pm$ 0.004} & 0.389 $\pm$ 0.015 & 0.698 $\pm$ 0.046 & \textbf{0.109 $\pm$ 0.008} & \textbf{0.116 $\pm$ 0.007} & 0.145 $\pm$ 0.017 & 0.399 $\pm$ 0.020 & 0.116 $\pm$ 0.007 \\
Energy & 0.003 $\pm$ 0.000 & 0.003 $\pm$ 0.000 & 0.003 $\pm$ 0.000 & 0.152 $\pm$ 0.024 & 0.043 $\pm$ 0.036 & \textbf{0.002 $\pm$ 0.000} & 0.003 $\pm$ 0.001 & 0.087 $\pm$ 0.005 & 0.087 $\pm$ 0.004 \\
Kin8nm & 0.071 $\pm$ 0.001 & 0.075 $\pm$ 0.003 & 0.073 $\pm$ 0.001 & 0.290 $\pm$ 0.111 & \textbf{0.068 $\pm$ 0.002} & 0.083 $\pm$ 0.001 & 0.071 $\pm$ 0.001 & 0.088 $\pm$ 0.002 & \textit{(infeasible)} \\
Naval & \textbf{0.000 $\pm$ 0.000} & \textbf{0.001 $\pm$ 0.001} & \textbf{0.000 $\pm$ 0.000} & 0.007 $\pm$ 0.003 & 0.002 $\pm$ 0.000 & \textbf{0.000 $\pm$ 0.000} & 0.197 $\pm$ 0.174 & 0.000 $\pm$ 0.000 & \textit{(infeasible)} \\
Power & \textbf{0.052 $\pm$ 0.001} & \textbf{0.054 $\pm$ 0.002} & 0.054 $\pm$ 0.001 & 0.058 $\pm$ 0.002 & \textbf{0.054 $\pm$ 0.002} & \textbf{0.054 $\pm$ 0.002} & 0.052 $\pm$ 0.001 & 0.071 $\pm$ 0.001 & \textit{(infeasible)} \\
Protein & 0.459 $\pm$ 0.005 & 0.466 $\pm$ 0.004 & \textbf{0.429 $\pm$ 0.004} & 0.537 $\pm$ 0.008 & \textbf{0.421 $\pm$ 0.005} & 0.446 $\pm$ 0.006 & 0.425 $\pm$ 0.003 & 0.408 $\pm$ 0.002 & \textit{(infeasible)} \\
Wine & \textbf{0.652 $\pm$ 0.022} & 0.663 $\pm$ 0.009 & 1.297 $\pm$ 0.093 & \textbf{0.655 $\pm$ 0.023} & \textbf{0.627 $\pm$ 0.013} & 0.637 $\pm$ 0.031 & 0.682 $\pm$ 0.048 & 0.607 $\pm$ 0.033 & 0.585 $\pm$ 0.032 \\
Yacht & \textbf{0.003 $\pm$ 0.001} & 0.004 $\pm$ 0.001 & 0.221 $\pm$ 0.037 & 0.682 $\pm$ 0.140 & 0.004 $\pm$ 0.001 & \textbf{0.002 $\pm$ 0.001} & 0.008 $\pm$ 0.003 & 0.399 $\pm$ 0.064 & 0.355 $\pm$ 0.030 \\
Wave & \textbf{0.000 $\pm$ 0.000} & \textbf{0.000 $\pm$ 0.000} & \textbf{0.000 $\pm$ 0.000} & \textbf{0.000 $\pm$ 0.000} & \textbf{0.000 $\pm$ 0.000} & \textbf{0.000 $\pm$ 0.000} & 0.001 $\pm$ 0.001 & 0.000 $\pm$ 0.000 & \textit{(infeasible)} \\
Denmark & \textbf{0.155 $\pm$ 0.004} & 0.287 $\pm$ 0.003 & 0.163 $\pm$ 0.004 & 0.225 $\pm$ 0.003 & 0.189 $\pm$ 0.008 & 0.194 $\pm$ 0.003 & 0.197 $\pm$ 0.004 & 0.260 $\pm$ 0.001 & \textit{(infeasible)} \\
\midrule
Mean rank & 1.364 & 2.000 & 2.182 & 3.182 & 1.636 & 1.727 & - & - & - \\
\bottomrule
\end{tabular}
}
\end{table*}

We present additional regression results reporting the mean square error (MSE) of evaluated methods across the considered baselines, see \cref{tab:mse}.
We find that GFSVI also performs competitively in terms of MSE compared to baselines and obtains the best mean rank, matching the best-performing methods on nearly all datasets. 
In particular, we find that using GP priors in the linearized BNN setup with GFSVI yields improvements over the weight-space priors used in TFSVI and that GFSVI performs slightly better than FVI.
Function-space VI methods (TFSVI, GFSVI, FVI) significantly improve over weight-space VI, mostly performing similarly to the linearized Laplace approximation. 
Further improvement over baselines are obtained when considering GP priors with GFSVI and FVI.
Finally, GFSVI compares favorably to the GP and sparse GP. 


\subsection{Variational measure evaluation}
\label{app:var_measure_eval}

\cref{tab:var_measure_eval} evaluates our inference method by comparing samples drawn from the exact posterior (where computationally feasible) with the approximate posterior obtained with our method (GFSVI).
We follow the setup by \citet{wilson2022evalBNN} and we compute the average per-sample Wasserstein-2 metric between samples drawn from a GP posterior with RBF kernel evaluated at the test points, and samples from the approximate posterior of GFSVI, sparse GP and FVI evaluated at the same points and with the same prior. 
More details are provided in \cref{app:sec_measure_eval_details}.
We find that GFSVI approximates the exact posterior more accurately than FVI, obtaining a better mean rank, but worse than the gold standard sparse GP, which proves to be the most accurate.

\begin{table*}[h!]
\scshape
\caption{
Average point-wise Wasserstein-2 distance (lower is better) between exact and approximate posterior of reported methods. GFSVI (ours) provides a more accurate approximation than FVI.
}
\label{tab:var_measure_eval}
\centering
\resizebox{0.8\linewidth}{!}{
\renewcommand{\arraystretch}{1}
\begin{tabular}{@{}lcccccc@{}}
\toprule
Dataset & Boston & Concrete & Energy & Wine & Yacht & Mean rank \\
\midrule
GFSVI (ours) & \textbf{0.0259 $\pm$ 0.0040} & \textbf{0.0499 $\pm$ 0.0029} & \textbf{0.0035 $\pm$ 0.0004} & \textbf{0.0571 $\pm$ 0.0097} & \textbf{0.0036 $\pm$ 0.0006} & 1.0 \\
FVI & 0.0469 $\pm$ 0.0044 & 0.0652 $\pm$ 0.0037 & \textbf{0.0037 $\pm$ 0.0004} & 0.1224 $\pm$ 0.0167 & \textbf{0.0052 $\pm$ 0.0013} & 1.6 \\
\midrule
GP sparse & 0.0074 $\pm$ 0.0022 & 0.0125 $\pm$ 0.0016 & 0.0042 $\pm$ 0.0003 & 0.0170 $\pm$ 0.0035 & 0.0035 $\pm$ 0.0008 & - \\ 
\bottomrule
\end{tabular}
}
\end{table*}

\subsection{Out-of-distribution detection with image data}

We here show an additional plot from our out-of-distribution detection experiment with image data (details in \ref{app:sec_ood_details}).
\cref{fig:ood_detection_plot} shows the (normalized) histograms of the entropy of the mean prediction produced by each model on the in-distribution (blue) and out-of-distribution (red) data sets considered in our OOD detection experiment.
Methods which estimate the (regularized) KL-divergence in function-space (GFSVI, FVI and TFSVI) use the \textsc{kmnist} measurement distribution. 
We find that the entropy produced by GFSVI on in-distribution data highly peaks around 0 while the entropy produced from out-of-distribution data strongly concentrates around its maximum $\ln (10)$.
GFSVI best partitions ID and OOD data based on predictive entropy improving over the function-space prior (FVI) and weight-space prior (TFSVI, MFVI, Laplace) BNN baselines (see \cref{tab:classification}).

\begin{figure*}[!h]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/ood_detection_plot.pdf}
    }
    \caption{Histograms of the entropy of the mean predictive distribution produced by evaluated methods in the out-of-distribution detection with image data experiment. GFSVI (ours) best partitions in-distribution and out-of-distribution data based on the entropy of its mean predictive distribution.}
    \label{fig:ood_detection_plot}
\end{figure*}


\subsection{Influence of measurement point distribution for image data}
\label{app:influence_rho_image_data}

We present additional results evaluating the influence of the measurement point distribution $\rho$ on the performance of function-space inference methods when using high-dimensional image data. 
The measurement point distributions are described in \cref{app:sec_classification_details}.
Just like in \citet{rudner2022fsvi}, we find that the choice of measurement point distribution may highly influence the OOD detection accuracy. 
While the expected log-likelihood, accuracy and expected calibration error (ECE) of a model generally remain comparable across measurement point distributions, the OOD accuracy of GFSVI is greatly improved by using samples from \textsc{kmnist} to evaluate the (regularized) KL divergence.
The measurement point distribution determines where the BNN is regularized and thus should be carefully selected especially for high dimensional data.

\begin{table*}[!h]
\scshape
\caption{Influence of the measurement point distribution $\rho$ on expected log-likelihood (log-like.), accuracy (acc.), expected calibration error (ECE) and out-of-distribution detection accuracy (OOD acc.). $\rho$ determines where the BNN will be regularized and strongly influences the out-of-distribution performance of the BNN.}
\label{tab:influence_rho_ood_detection}
\resizebox{\linewidth}{!}{
\renewcommand{\arraystretch}{1}
\begin{tabular}{@{}llcccccccccc@{}}
\toprule
\multirow{1}{*}{\parbox[t]{0pt}{\multirow{2}{*}{\rotatebox[origin=c]{90}{Data\hspace{-0em}}}}}
& Metric & \multicolumn{3}{c}{GFSVI} & \multicolumn{3}{c}{FVI} & \multicolumn{3}{c}{TFSVI}\\
\cmidrule(rl){3-5} \cmidrule(rl){6-8} \cmidrule(rl){9-11}
& & Random & Random Pixel & KMNIST & Random & Random Pixel & KMNIST & Random & Random Pixel & KMNIST \\
\midrule
\multirow{4}{*}{\parbox[t]{0pt}{\multirow{2}{*}{\rotatebox[origin=c]{90}{MNIST\hspace{-1.3em}}}}}
& Log-like.\ ($\uparrow$) & \textbf{-0.033 $\pm$ 0.000} & -0.034 $\pm$ 0.000 & -0.041 $\pm$ 0.000 & -0.145 $\pm$ 0.005 & -0.038 $\pm$ 0.000 & -0.238 $\pm$ 0.006 & -0.047 $\pm$ 0.003 & \textbf{-0.032 $\pm$ 0.001} & -0.041 $\pm$ 0.001 \\
& Acc.\ ($\uparrow$) & \textbf{\hphantom{-}0.992 $\pm$ 0.000} & \hphantom{-}0.989 $\pm$ 0.000 & \hphantom{-}0.991 $\pm$ 0.000 & \hphantom{-}0.976 $\pm$ 0.001 & \hphantom{-}0.988 $\pm$ 0.000 & \hphantom{-}0.943 $\pm$ 0.001 & \hphantom{-}0.989 $\pm$ 0.000 & \hphantom{-}0.989 $\pm$ 0.000 & \hphantom{-}0.989 $\pm$ 0.000 \\
& ECE ($\downarrow$) & \textbf{\hphantom{-}0.002 $\pm$ 0.000} & \hphantom{-}0.004 $\pm$ 0.000 & \hphantom{-}0.006 $\pm$ 0.000 & \hphantom{-}0.064 $\pm$ 0.001 & \hphantom{-}0.003 $\pm$ 0.000 & \hphantom{-}0.073 $\pm$ 0.003 & \hphantom{-}0.007 $\pm$ 0.000 & \hphantom{-}0.003 $\pm$ 0.000 & \hphantom{-}0.006 $\pm$ 0.000 \\
& OOD acc.\ ($\uparrow$) & \hphantom{-}0.921 $\pm$ 0.008 & \hphantom{-}0.868 $\pm$ 0.010 & \textbf{\hphantom{-}0.980 $\pm$ 0.004} & \hphantom{-}0.894 $\pm$ 0.010 & \hphantom{-}0.863 $\pm$ 0.003 & \hphantom{-}0.891 $\pm$ 0.006 & \hphantom{-}0.887 $\pm$ 0.011 & \hphantom{-}0.861 $\pm$ 0.008 & \hphantom{-}0.893 $\pm$ 0.005 \\
\midrule
\multirow{4}{*}{\parbox[t]{0pt}{\multirow{2}{*}{\rotatebox[origin=c]{90}{FMNIST\hspace{-1.5em}}}}}
& Log-like.\ ($\uparrow$) & \textbf{-0.260 $\pm$ 0.003} & \textbf{-0.258 $\pm$ 0.002} & -0.294 $\pm$ 0.006 & -0.300 $\pm$ 0.002 & -0.293 $\pm$ 0.003 & -0.311 $\pm$ 0.005 & -0.261 $\pm$ 0.001 & \textbf{-0.258 $\pm$ 0.001} & -0.261 $\pm$ 0.002 \\
& Acc.\ ($\uparrow$) & \textbf{\hphantom{-}0.910 $\pm$ 0.001} & \hphantom{-}0.908 $\pm$ 0.001 & \textbf{\hphantom{-}0.909 $\pm$ 0.001} & \textbf{\hphantom{-}0.910 $\pm$ 0.002} & \hphantom{-}0.900 $\pm$ 0.001 & \hphantom{-}0.906 $\pm$ 0.002 & \textbf{\hphantom{-}0.909 $\pm$ 0.001} & \hphantom{-}0.908 $\pm$ 0.001 & \hphantom{-}0.907 $\pm$ 0.001 \\
& ECE ($\downarrow$) & \textbf{\hphantom{-}0.020 $\pm$ 0.003} & \hphantom{-}0.022 $\pm$ 0.001 & \hphantom{-}0.042 $\pm$ 0.002 & \hphantom{-}0.027 $\pm$ 0.005 & \textbf{\hphantom{-}0.018 $\pm$ 0.002} & \hphantom{-}0.024 $\pm$ 0.002 & \hphantom{-}0.022 $\pm$ 0.002 & \textbf{\hphantom{-}0.018 $\pm$ 0.001} & \textbf{\hphantom{-}0.021 $\pm$ 0.002} \\
& OOD acc.\ ($\uparrow$) & \hphantom{-}0.853 $\pm$ 0.005 & \hphantom{-}0.867 $\pm$ 0.005 & \textbf{\hphantom{-}0.997 $\pm$ 0.001} & \hphantom{-}0.925 $\pm$ 0.005 & \hphantom{-}0.842 $\pm$ 0.006 & \hphantom{-}0.975 $\pm$ 0.002 & \hphantom{-}0.802 $\pm$ 0.006 & \hphantom{-}0.800 $\pm$ 0.007 & \hphantom{-}0.779 $\pm$ 0.010 \\
\bottomrule
\end{tabular}
}
\end{table*}


\subsection{Input distribution shift with rotated image data}
\label{app:rotated_image_data}

We here provide an experiment evaluating our method's (GFSVI) robustness in detecting input distribution shift. 
We expect the predictive uncertainty of a well-calibrated Bayesian model to be low for in-distribution data and to gradually increase as the input distribution shifts further away from the training data distribution.
To test this property, we follow the setup by \citet{sensoy2018evidential,rudner2022fsvi} and assume like the related work that increasing the rotation angle of images gradually increases the level of input ``distribution shift''. 
We report the mean and standard-deviation of the average mean predictive entropy of models fit on MNIST \citep{lecun2010mnist} and Fashion MNIST \citep{xiao2017FMNIST} for increasingly large angles of rotation of their respective test data partition.  
We find that GFSVI is confident (low predictive entropy) for images with small rotation angles, and that its predictive entropy increases with the angle.
GFSVI therefore exhibits the expected behavior of a well-calibrated Bayesian model. 
We note that FVI, Laplace and MFVI tend to be under-confident (high predictive entropy) for small rotation angles, which might be a symptom of underfitting further supported by the results in \cref{tab:classification}. 
Also, with the exception of TFSVI, the predictive entropy of baselines across different rotation angles is generally higher than the one produced by GFSVI.
\begin{figure*}[h!]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/input_dist_shift.pdf}
    }
    \caption{Average predictive entropy of models trained on MNIST and Fashion MNIST and evaluated for different rotation angles of their respective test data partitions. We see that our method (GFSVI) exhibits the behavior of a well-calibrated Bayesian model.}
    \label{fig:input_dist_shift}
\end{figure*}

\subsection{Example of model misspecification with \cite{wild2022gvi}}

\begin{figure*}[h!]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth,height=1in]{plots/gfsvi_vs_gwi_periodic.pdf}
    }
    \caption{Example of model misspecification when using a periodic GP prior with baseline GWI \citep{wild2022gvi} that does not occur with our method (GFSVI). In GWI, only the posterior covariance is periodic, while the neural network parameterizing the posterior mean results in a function that does not capture the beliefs carried by the (periodic) prior. In contrast, our method accurately captures the GP prior's beliefs and yields a (locally) periodic function.
    }
    \label{fig:gfsvi_vs_gwi_periodic}
\end{figure*}
\cref{fig:gfsvi_vs_gwi_periodic} shows an example of model misspecification when using a periodic GP prior with the baseline GWI \citep{wild2022gvi}. As can be seen in the left panel of the figure, this problem does not occur with our method (GFSVI). While the posterior covariance in GWI reflects the periodicity of the prior, the neural network parametrizing the posterior mean does not result in a periodic function, i.e., the mean does not capture the beliefs specified by the periodic GP prior. In contrast, our method accurately captures the GP prior's beliefs and yields a (locally) periodic function. 

\subsection{Convergence speed on UCI data}
\label{app:convergence_speed}

\begin{wraptable}{r}{7cm}
    \centering
    \scshape
    \caption{Training time of our method GFSVI and baselines MFVI \citep{blundell2015weight} and TFSVI \citep{rudner2022fsvi} on the boston UCI dataset.}
    \resizebox{\linewidth}{!}{
    \begin{tabular}{@{}lccc@{}}
    \toprule
     & GFSVI (ours) & TFSVI & MFVI \\
    \midrule
    Time (s) & 44.15 $\pm$ 1.56 & 36.36 $\pm$ 0.90 & 38.38 $\pm$ 10.80 \\
    \bottomrule
    \end{tabular}
    }
    \label{tab:boston_exp_ll_convergence}
\end{wraptable}
\cref{tab:boston_exp_ll_convergence} shows the training time of our method and baselines MFVI \citep{blundell2015weight} and TFSVI \citep{rudner2022fsvi} on the boston dataset using $M=100$ context points averaged over $5$ cross-validation splits, and \cref{fig:boston_exp_ll_convergence} shows the convergence of the validation expected log-likelihood on the boston dataset. Our method converges in more steps than TFSVI. GFSVI typically takes more time/steps to train than TFSVI as it additionally needs to adapt its features to the beliefs specified by the Gaussian process prior. 

\begin{figure*}[h]
    \centering
    \resizebox{0.6\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/convergence_plot_boston.pdf}
    }
    \caption{Validation expected log-likelihood of our method (GFSVI) and baselines TFSVI and MFVI. GFSVI (ours) converges on the boston dataset in slightly more steps than TFSVI but in fewer than MFVI.}
    \label{fig:boston_exp_ll_convergence}
\end{figure*}

\begin{figure*}[t]
    \centering
    \resizebox{0.6\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/convergence_plot_mnist.pdf}
    }
    \caption{
     Validation expected log-likelihood of our method (GFSVI) and baselines TFSVI and MFVI on MNIST. GFSVI (ours) converges in slightly more steps than TFSVI but in fewer than MFVI.
    }
    \label{fig:mnist_exp_ll_convergence}
\end{figure*}


\subsection{Additional plots for kernel eigenvalue decay}

\cref{fig:kernel_gram_eigendecay} shows a plot demonstrating the decay rate of the eigenvalues of RBF and Matérn-1/2 kernels evaluated at points sampled uniformly over $\mathcal{X}$. 
The rate of decay of the covariance operator's eigenvalues gives important information about the smoothness of stationary kernels \citep{williams2006gaussian}, and increased smoothness of the kernel leads to faster decay of the eigenvalues \citep{Santin2016approxEigenfunc}.
For instance, RBF covariance operator eigenvalues decay at a near-exponential rate independent of the underlying measure \citep{belkin2018approximation} and Matérn kernel eigenvalues decay polynomially \citep{chen2021gaussian}.
We find that the kernels evaluated at points sampled from a uniform distribution over $\mathcal{X}$ share this same behavior (see \cref{fig:kernel_gram_eigendecay}).

\begin{figure}[!h]
    \centering
    \begin{tabular}{cc}
    \subfigure[$D=1$]{\includegraphics[width=0.5\linewidth]{plots/kernel_eigvals_1d.pdf}} &
    \subfigure[$D=100$]{\includegraphics[width=0.5\linewidth]{plots/kernel_eigvals_100d.pdf}} 
    \end{tabular}
    \caption{Mean eigenvalues of the Gram matrix obtained for different kernels and for varying length-scales over $10$ draws from a uniform distribution on $[-2, 2 ]^D$. The mean eigenvalues are arranged in increasing order. The eigenvalues of the Gram matrix associated with the smooth RBF kernel decay much faster than those of the Matérn-1/2. Furthermore, the eigenvalues decay at a slower rate in high dimensions ($D=100$).}
    \label{fig:kernel_gram_eigendecay}
\end{figure}


\subsection{Additional plots for choosing $\gamma$ in $D_\text{\normalfont KL}^\gamma$}
\label{seq:choosing_gamma}

The $\gamma$ parameter controls the magnitude of the regularized KL divergence (see \cref{fig:regkl_vs_gamma}) and adjusts the relative weight of the regularized KL divergence and expected log-likelihood term in the training objective (see \cref{fig:fsvi_influence_gamma}).  
Furthermore, $\gamma$ also acts as ``jitter'' preventing numerical errors. 
We recommend choosing $\gamma$ large enough to avoid numerical errors while remaining small enough to provide strong regularization. 

\begin{figure*}[h!]
    \centering
    \resizebox{\linewidth}{!}{
    \includegraphics[width=\linewidth]{plots/fsvi_influence_gamma.pdf}
    }
    \caption{The $\gamma$ parameter of the regularized KL divergence controls the magnitude of the regularizer in the objective and should be small enough to provide strong regularization.}
    \label{fig:fsvi_influence_gamma}
\end{figure*}

\begin{figure}[h!]
\centering
\noindent
\begin{minipage}[t]{0.45\linewidth}
\includegraphics[width=\linewidth]{plots/rank_vs_activation.pdf}
\captionof{figure}{The BNN's covariance adaptation to the prior's covariance rank depends on its activation function. BNNs fit with an RBF prior (full) show lower rank than with a Matérn-1/2 (dotted).}
\label{fig:approx_posterior_rank}
\end{minipage}%
\hfill
\begin{minipage}[t]{0.53\linewidth}
\includegraphics[width=\linewidth]{plots/reg_kl_vs_measurement_pts.pdf}
\captionof{figure}{$\gamma$ explicitly controls the magnitude of the regularized KL-divergence $D_\text{KL}^\gamma$. Rougher priors (Matérn-1/2) require more measurement points to accurately estimate $D_\text{KL}^\gamma$ than smooth priors (RBF).}
\label{fig:regkl_vs_gamma}
\end{minipage}%
\end{figure}
