% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{url}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{graphicx}
\usepackage{subfig}

% \usepackage{hyperref}
\usepackage{hyperref}
\hypersetup{colorlinks=true, linkcolor=black, urlcolor=black, citecolor=black}
\usepackage{cleveref} % cleveref must be loaded after hyperref

\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\renewcommand{\algorithmiccomment}[1]{\bgroup\hfill\scriptsize\it//~#1\egroup}
\newcommand{\algorithmicbreak}{\textbf{break}}
\newcommand{\BREAK}{\STATE \algorithmicbreak}

\newlength{\figwidth}
\newlength{\figheight}
\setlength{\figwidth}{0.8\textwidth}
\setlength{\figheight}{0.3\textheight}

\input{math_commands.tex}

\title{Fast Predictive Uncertainty for \\Classification with Bayesian Deep Networks (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Add authors
\author[1]{\href{mailto:<marius.hobbhahn@gmail.com>?Subject=Your UAI 2022 paper}{Marius~Hobbhahn}{}}
\author[1]{Agustinus~Kristiadi}
\author[1,2]{Philipp~Hennig}
% Add affiliations after the authors
\affil[1]{%
    University of Tübingen
}
\affil[2]{%
    Max-Planck Institute for Intelligent Systems
}
  
\begin{document}
\maketitle

\section{Appendix A}
\label{appendix_A}
\subsection*{Figures}

The parameters of Figure~\ref{fig:1D_Laplace_bridge} are, from left to right, $(\alpha, \beta) = (0.8, 0.9), (4, 2), (2, 7)$.

\subsection*{Change of Variable for pdf} 
%
Let $\rvx$ be an $n$-dimensional continuous random variable with joint density function $p_\rvx$. If $\rvy = G(\rvx)$, where $G$ is a differentiable and invertible function, then $\rvy$ has density $p_\rvy$:
\begin{equation}
p_\rvy(\mathbf{y}) = p_\rvx\Big(G^{-1}(\mathbf{y})\Big)\left\vert \det\left[\frac{dG^{-1}(\mathbf{z})}{d\mathbf{z}}\Bigg \vert_{\mathbf{z}=\mathbf{y}}\right]\right \vert
\end{equation}
where the differential is the Jacobian of the inverse of $G$ evaluated at $\rvy$. This procedure, also known as `change of basis', is at the core of the Laplace bridge since it is used to transform the Dirichlet into the softmax basis.

\subsection*{Correction for sum(y)=0} 
%
We know that the product rule of Gaussians yields
\begin{align}
    p(x \vert Ax = y) &= \frac{p(x,y)}{p(y)} \\
    = \mathcal{N}(x; &\mu  + \Sigma A^\top (A \Sigma A^\top)^{-1} (y - A\mu),\\ \nonumber
    & \Sigma - \Sigma A^\top (A \Sigma A^\top)^{-1} A \Sigma )
\end{align}
In our particular setup we have
\begin{equation}
    p(x) = \mathcal{N}(x; \mu, \Sigma)
\end{equation}
with constraint
\begin{equation}
    p(I \vert x) = \delta(1^\top x - 0) = \lim_{\epsilon \rightarrow \infty} \mathcal{N}(0; 1^\top x, \frac{1}{\epsilon})
\end{equation}
Therefore we get
\begin{align}
    p(x \vert I) &= \mathcal{N}(x; \mu + \Sigma 1(1^\top \Sigma 1 + \frac{1}{\epsilon})^{-1}(0 - 1^\top \mu),\\ \nonumber
    &\Sigma - \Sigma 1(1^\top \Sigma 1 + \frac{1}{\epsilon})^{-1} 1^\top \Sigma) \\
    &= \mathcal{N}\left(x; \mu - \frac{\Sigma \mathbf{1} \mathbf{1}^\top \mu}{\mathbf{1}^\top \Sigma \mathbf{1}}, \Sigma - \frac{\Sigma \mathbf{1} \mathbf{1}^\top \Sigma}{\mathbf{1}^\top \Sigma \mathbf{1}} \right)
\end{align}

\subsection*{Variance correction} 
As described in the main text, the original Laplace Bridge scales worse with $\Sigma$ than sampling and applying the softmax. In Figure~\ref{fig:correction_contour} you can see a contour plot that shows the scaling of mean and variance with and without correction. As suggested, the variance has nearly no influence on the result before the correction, but our correction fixes that.
\begin{figure}
    \centering
    \includegraphics[width=0.48\textwidth]{figures/correction_contour.pdf}
    \caption{Contour plot showing the scaling behavior of $\mu$ and $\Sigma$. In the left figure, we see that $\Sigma$ has nearly no influence on the scaling. Our correction in the right figure fixes that. Contour levels show the first entry of $\alpha$ on a log-scale.}
    \label{fig:correction_contour}
\end{figure}

Some reviewers wanted to understand how we derived the equations for our correction, so here is a short informal explanation. During the experimentation with the LB, we found that it doesn't approximate the sample distribution well when $\Sigma$ gets large. We then understood why (as detailed in the limitations section) and proposed a fix for these scenarios without damaging its behavior in all other scenarios. We experimented with multiple fixes and the result you see in the paper is the one that fulfilled most of our criteria. Therefore, the correction doesn't come from a principled theoretical derivation but is motivated by the theoretical findings.

\section{Appendix (Derivation of LB)}
\label{appendix_B_LB}

%Notation
% vector p: axis in the standard basis == z
% vector a: axis in the transformed basis == \pi
% vector q: == \xi
% vector b: == \tau


Assume we have a Dirichlet in the standard basis with parameter vector $\valpha$ and probability density function:

\begin{equation}\label{eq:dirichlet_appendix}
    \mathrm{Dir}(\vpi | \valpha) := \frac{\Gamma \left( \sum_{k=1}^K \alpha_k \right)}{\prod_{k=1}^K \Gamma(\alpha_k)} \prod_{k=1}^K \pi_k^{\alpha_k-1} \, .
\end{equation}

We aim to transform the basis of this distribution via the softmax transform to be in the new basis $\vz$:

\begin{equation}\label{eq:softmax_appendix}
    \pi_k(\vz) := \frac{\exp(z_k)}{\sum_{l=1}^K \exp(z_l)} \, .
\end{equation}

Usually, to transform the basis we would need the inverse transformation $H^{-1}(\vz)$ as described in the main paper. However, the softmax does not have an analytic inverse. Therefore David JC MacKay uses the following trick. Assume we know that the distribution in the transformed basis is:

\begin{equation}\label{eq:dirichlet_softmax_appendix}
    \mathrm{Dir}_{\vz}(\vpi(\vz) | \valpha) := \frac{\Gamma \left( \sum_{k=1}^K \alpha_k \right)}{\prod_{k=1}^K \Gamma(\alpha_k)} \prod_{k=1}^K \pi_k(\vz)^{\alpha_k} \, ,
\end{equation}

then we can show that the original distribution is the result of the basis transform by the softmax. 

\textbf{The Dirichlet in the softmax basis:} 
We show that the density over $\vz$ shown in Equation~\ref{eq:dirichlet_softmax_appendix} transforms into the Dirichlet over $\vpi$. First, we consider the special case where $\vz$ is confined to an $I-1$ dimensional subspace satisfying $\sum_i z_i = c$. Since the entries of $\vpi$ sum to one, we can represent $\vpi$ by an $I - 1$ dimensional vector $\varphi$ such that 

\begin{align}
    \pi_i &= \varphi_i \quad i = 1, \dots, I-1 \\
    \pi_I &= 1 - \sum_{i=1}^{I-1} \varphi_i
\end{align}

and similarly we can represent $\vz$ by an $I-1$ dimensional vector $\va$:

\begin{align}
    z_i &= \va_i \quad i = 1, \dots, I-1 \\
    z_I &= c - \sum_{i=1}^{I-1}\va_i
\end{align}

then we can find the density over $\varphi$ (which is proportional to the required density over $\vpi$)
from the density over $\va$ (which is proportional to the given density over $\vz$) by finding the
determinant of the $(I - 1) \times (I - 1)$ Jacobian $\mJ$ given by

\begin{align}
    J_{ik} &= \frac{\partial \varphi_i}{\partial \va_k} = \sum_{j=1}^{I} \frac{\partial \vpi_i}{\partial \rvz_j}\frac{\partial \rvz_j}{\partial \va_k} \nonumber\\
    &= \delta_{ik}\vpi_i - \vpi_i\vpi_k + \vpi_i\vpi_I =  \vpi_i(\delta_{ik} - (\vpi_k - \vpi_I))
\end{align}

We define two additional $I-1$ dimensional helper vectors $\vpi_k^+ := \vpi_k - \vpi_I$ and $n_k := 1$, and use $\det(I - xy^\top) = 1 - x \cdot y$ from linear algebra. It follows that
\begin{align}
    \det J &= \prod_{i=1}^{I-1} \vpi_i \times \det[I-n\vpi^{+^T}] \nonumber \\
    &= \prod_{i=1}^{I-1} \vpi_i \times (1 - n \cdot \vpi^{+})  \\
    &= \prod_{i=1}^{I-1} \vpi_i \times \left(1 - \sum_k \vpi_k^{+} \right) = I \prod_{i=1}^I \vpi_i \nonumber
\end{align}

Therefore, using Equation \ref{eq:dirichlet_softmax_appendix} we find that
\begin{equation}
    P(\vpi) = \frac{P(\rvz)}{|\det \mJ|} \propto \prod_{i=1}^{I} \vpi_i^{\alpha_i - 1} 
\end{equation}
This result is true for any constant $c$ since it can be put into the normalizing constant. Thereby we make sure that the integral of the distribution is 1 and we have a valid probability distribution.

\section{Appendix (Derivation of Inversion)}
\label{appendix_C_inversion}
Through the figures of the 1D Dirichlet approximation in the main paper we have already established that the mode of the Dirichlet lies at the mean of the Gaussian distribution and therefore $\vpi(\vy) = \frac{\valpha}{\sum_i \alpha_i}$. Additionally, the elements of $\vy$ must sum to zero. These two constraints combined yield only one possible solution for $\vmu$.

\begin{equation}
	\mu_k = \log \alpha_k  - \frac{1}{K} \sum_{l=1}^{K} \log \alpha_l
	\label{eq:mu_k}
\end{equation}

Calculating the covariance matrix $\vSigma$ is more complicated but laid out in the following. The logarithm of the Dirichlet is, up to additive constants,

\begin{equation}
    \log p_\rvz(\rvz|\alpha) = \sum_k \alpha_k \log \pi_k 
\end{equation}

Using $\pi_k$ as the softmax of $\vy$ as shown in Equation \ref{eq:softmax_appendix} we can find the elements of the Hessian $\vL$

\begin{equation}
    L_{kl} = \hat{\alpha}(\delta_{kl}\hat{\pi_k} - \hat{\pi_k} \hat{\pi_l})
\end{equation}

where $\hat{\alpha} := \sum_k \alpha_k$ and $\hat{\pi}_k := \frac{\alpha_k}{\hat{\alpha}}$ denotes the value
of $\pi_k$ at the mode. Analytically inverting $\vL$ is done via a lengthy derivation using the fact that we can write $\vL = \mA + \mX\mB\mX^\top$ and inverting it with the Schur complement. You can find the derivation in \citet{Hennig2010}. This process results in the inverse of the Hessian

\begin{equation}
    L_{kl}^{-1} = \delta_{kl} \frac{1}{\alpha_k} - \frac{1}{K} \left[\frac{1}{\alpha_k} + \frac{1}{\alpha_l} - \frac{1}{K}\left(\sum_u^K \frac{1}{\alpha_u}\right) \right]
\end{equation}

We are mostly interested in the diagonal elements, since we desire a sparse encoding for computational reasons and we would otherwise need to map a $K \times K$ covariance matrix to a $K\times 1$ Dirichlet parameter vector, which would be a heavily overdetermined mapping. Note that $K$ is a scalar, not a matrix. The diagonal elements of $\vSigma = \vL^{-1}$ can be calculated as

\begin{equation}
    \label{eq:Hessian_diag}
    \Sigma_{kk} = \frac{1}{\alpha_k} \left(1 - \frac{2}{K}\right)  + \frac{1}{K^2} \sum_{l=1}^{K} \frac{1}{\alpha_l}.
\end{equation}

To invert this mapping we transform Equation \ref{eq:mu_k} to 

\begin{equation}
    \label{eq:reform_mu_k}
    \alpha_k = e^{\mu_k} \prod_l^{K} \alpha_l^{1/K}
\end{equation}

by applying the exponential function and re-ordering some parts. Inserting this into Equation~\ref{eq:Hessian_diag} and re-arranging yields

\begin{equation}
    \prod_l^K \alpha_l^{1/K} = \frac{1}{\Sigma_{kk}} \left[e^{-\mu_k}\left(1 - \frac{2}{K}\right)  + \frac{1}{K^2} \sum_u^K e^{-\mu_u} \right]
\end{equation}

which can be re-inserted into Equation \ref{eq:reform_mu_k} to give

\begin{equation}
    \label{eq:mapping_alpha_appendix}
    \alpha_k = \frac{1}{\Sigma_{kk}} \left(1 - \frac{2}{K} + \frac{e^{\mu_k}}{K^2} \sum_l^K e^{-\mu_l} \right)
\end{equation}

which is the final mapping. With Equations \ref{eq:mu_k} and \ref{eq:Hessian_diag} we are able to map from Dirichlet to Gaussian and with Equation \ref{eq:mapping_alpha_appendix} we are able to map the inverse direction. 

\section{Appendix (Experimental Details)}
\label{appendix_D_experiments}
%%%%%%%%%%%%%%%%%%%%%
\begin{table*}[htb!]
    \scriptsize
    \fontsize{9}{10}\selectfont
    \setlength{\tabcolsep}{4pt}
    \centering
    \caption{Comparing the extended probit approximation with the normalized version of the LB norm in the KFAC setting. The probit approximation seems to break down in the MNIST scenarios. 
    }
    %#######################################
    %\resizebox{\textwidth}{!}{% use resizebox with textwidth
    \begin{tabular}{l  l || c c c c | c  c  c  c }
         \toprule
         & & \multicolumn{4}{c}{\textbf{KFAC Probit}} & \multicolumn{4}{c}{\textbf{KFAC LB norm}} \\
         \textbf{Train} & \textbf{Test} & \textbf{MMC} $\downarrow$ & \textbf{AUROC} $\uparrow$& \textbf{ECE} $\downarrow$ & \textbf{NLL} $\downarrow$& \textbf{MMC} $\downarrow$ & \textbf{AUROC} $\uparrow$& \textbf{ECE} $\downarrow$ & \textbf{NLL} $\downarrow$\\
         \midrule
          MNIST &    MNIST &            0.105 &              0.000 &            2.258 &            0.883 &             0.975 &               0.000 &             0.043 &             0.018 \\
  MNIST &   FMNIST &            0.102 &              0.955 &            2.302 &            0.032 &             0.444 &               0.990 &             2.871 &             0.364 \\
  MNIST & notMNIST &            0.103 &              0.922 &            2.300 &            0.043 &             0.409 &               0.986 &             2.854 &             0.294 \\
  MNIST &   KMNIST &            0.102 &              0.962 &            2.304 &            0.012 &             0.414 &               0.991 &             3.162 &             0.328 \\
  \midrule
CIFAR10 &  CIFAR10 &            0.548 &              0.000 &            0.661 &            0.404 &             0.941 &               0.000 &             0.195 &             0.017 \\
CIFAR10 & CIFAR100 &            0.358 &              0.896 &            2.652 &            0.253 &             0.662 &               0.866 &             3.871 &             0.558 \\
CIFAR10 &     SVHN &            0.307 &              0.956 &            2.567 &            0.195 &             0.441 &               0.965 &             2.837 &             0.327 \\ 
         \bottomrule
    \end{tabular}
    %}
    \label{tab:probit_table_kfac}
    %\vspace{-0.75em}
\end{table*}
%%%%%%%%%%%%%%%%%%%%%%%%

The exact experimental setups, i.e., network architectures, learning rates, random seeds, etc., can be found in the accompanying GitHub repository.\footnote{\url{https://github.com/mariushobbhahn/LB_for_BNNs_official}}
This section is used to justify some of the decisions we made during the process in more detail, highlight some miscellaneous interesting things and showcase the additional experiments promised in the main paper.

\subsection*{Mathematical description of the setup}

In principle, the Gaussian over the weights required by the Laplace Bridge for BNNs can be constructed by any Gaussian approximate Bayesian method such as variational Bayes \citep{Graves2011VB,Blundell2015WeightUI} and Laplace approximations for NNs \citep{MacKay1992,ritter2018a}. We will focus on the Laplace approximation, which uses the same principle as the Laplace Bridge. However, in the Laplace approximation for neural networks, the posterior distribution over the weights of a network is the one that is approximated as a Gaussian, instead of a Dirichlet distribution over the outputs as in the Laplace Bridge.

Given a dataset $\D := \{ (\vx_i, t_i) \}_{i=1}^D$ and a prior $p(\vtheta)$, let
%
\begin{equation}
    p(\vtheta | \D) \propto p(\vtheta) p(\D | \vtheta) = p(\vtheta) \prod_{(\vx, t) \in \D} p(y = t | \vtheta, \vx) \, ,
\end{equation}
%
be the posterior over the parameter $\vtheta$ of an $L$-layer network $f_\vtheta$. Then we can get an approximation of the posterior $p(\vtheta | \D)$ by fitting a Gaussian $\N(\vtheta | \vmu_\vtheta, \mSigma_\vtheta)$ where
%
\begin{align*}
    \vmu_\vtheta &= \vtheta_\text{MAP} \, , \\
    \mSigma_\vtheta &= (-\nabla^2 \vert_{\vtheta_\text{MAP}} \log p(\vtheta | \D))^\inv =: \mH_\vtheta^\inv \, .
\end{align*}
%
That is, we fit a Gaussian centered at the mode $\vtheta_\text{MAP}$ of $p(\vtheta | \D)$ with the covariance determined by the curvature at that point. We assume that the prior $p(\vtheta)$ is a zero-mean isotropic Gaussian $\N(\vtheta | \mathbf{0}, \sigma^2 \mI)$ and the likelihood function is the Categorical density
%
\begin{equation*}
    p(\D | \vtheta) = \prod_{(\vx, t) \in \D} \mathrm{Cat}(y = t | \mathrm{softmax}(f_\vtheta(\vx))) \, .
\end{equation*}
%
For various applications in Deep Learning, an approximation with full Hessian is often computationally too expensive. Indeed, for each input $\vx \in \R^N$, one has to do $K$ backward passes to compute the Jacobian $\mJ(\vx)$. Moreover, it requires $\mathcal{O}(PK)$ storage, which is also expensive since $P$ is often in the order of millions. A cheaper alternative is to fix all but the last layer of $f_\vtheta$ and only apply the Laplace approximation on $\mW^L$, the last layer's weight matrix. This scheme has been used successfully by, e.g., \citet{ScalableBayesianOptimizationDNNs2015,2016DeepKernelLearning,brosse2020last}, and it has been shown theoretically that it can mitigate overconfidence problems in ReLU networks \citep{kristiadi2020being}. In this case, given the approximate last-layer posterior
%
\begin{equation}
    p(\mW^L | \D) \approx \N(\vec(\mW^L) | \vec(\mW^L_\text{MAP}), \mH_{\mW^L}^\inv) \, ,
\end{equation}
%
one can efficiently compute the distribution over the logits. That is, let $\vphi: \R^N \to \R^{Q}$ be the first $L-1$ layers of $f_\vtheta$, seen as a feature map. Then, for each $\vx \in \R^N$, the induced distribution over the logit $\mW^L \vphi(\vx) =: \vz$ is given by
%
\begin{equation}
    p(\vz | \vx) = \N(\vz | \mW^L_\text{MAP} \vphi(\vx), (\vphi(\vx)^\top \otimes \mI) \mH_{\mW^L}^\inv (\vphi(\vx) \otimes \mI)) \, ,
\end{equation}
%
where $\otimes$ denotes the Kronecker product.

An even more efficient last-layer approximation can be obtained using a Kronecker-factored matrix normal distribution \citep{louizos_structured_2016,sun_learning_2017,ritter2018a}. That is, we assume the posterior distribution to be
%
\begin{equation}
    p(\mW^L | \D) \approx \MN(\mW^L | \mW^L_\text{MAP}, \mU, \mV) \, ,
\end{equation}
%
where $\mU \in \R^{K \times K}$ and $\mV \in \R^{Q \times Q}$ are the Kronecker factorization of the inverse Hessian matrix $\mH_{\mW^L}^\inv$ \citep{martens2015optimizing} and $\MN$ denotes the Matrix Normal distribution. In this case, for any $\vx \in \R^N$, one can easily show that the distribution over logits is given by
%
\begin{equation}
    p(\vz | \vx) = \N(\vz | \mW^L_\text{MAP} \vphi(\vx), (\vphi(\vx)^\top \mV \vphi(\vx))\mU) \, ,
\end{equation}
%
which is easy to implement and computationally cheap. Finally, and even more efficient, is a last-layer approximation scheme with a diagonal Gaussian approximate posterior, i.e. the so-called mean-field approximation. In this case, we assume the posterior distribution to be
%
\begin{equation}
    p(\mW^L | \D) \approx \N(\vec(\mW^L) | \vec(\mW^L_\text{MAP}), \diag{\vsigma^2}) \, ,
\end{equation}
%
where $\vsigma^2$ is obtained via the diagonal of the Hessian of the log-posterior w.r.t. $\vec(\mW^L)$ at $\vec(\mW^L_\text{MAP})$.

\subsection*{OOD Detection}

The test scenarios are: A two-layer convolutional network trained on the MNIST dataset \citep{MNISTLeCun}. The OOD datasets for this case are FMNIST \citep{FMNIST2017}, notMNIST \citep{notMNIST2011}, and KMNIST \citep{KMNIST2018}. For larger datasets, i.e.~CIFAR-10 \citep{krizhevsky2014cifar}, SVHN \citep{SVHN2011}, and CIFAR-100 \citep{krizhevsky2014cifar}, we use a ResNet-18 network \citep{2015_ResNet}. In all scenarios, the networks are well-trained with $99\%$ test accuracy on MNIST, $95.4\%$ on CIFAR-10, $76.6\%$ on CIFAR-100, and $100\%$ on SVHN. For the sampling baseline, we use $100$ posterior samples.

All networks have been trained with conventional setups, i.e., we use ADAM with learning rate $10^{-3}$ and weight decay $5 \times 10^{-4}$ for the MNIST experiments and SGD with a cosine annealing scheduler starting at learning rate $0.1$ and momentum $0.9$ for the CIFAR and SVHN experiments.

\subsection*{Probit vs LB}

The KFAC setting of the probit comparison can be found in Table \ref{tab:probit_table_kfac}. Especially in the MNIST scenario the probit approximation seems to break down since even in-dist detection is at chance level. The LB, on the other hand, yields reasonable results. 

\bibliography{hobbhahn_180-supp}

\end{document}
