% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to see how the non-anonymous paper would look
% like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer Modern (has noticable issues) \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon ptmx; less tested, no support) NOTE: Only keep *one* line above as appropriate, as it will be replaced automatically for papers to be published. Do not make any other change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text PLEASE ONLY USE xr IN THE SUPPLEMENTARY
% MATERIAL. In the main paper, hard code any cross-reference to the
% supplementary material.
\usepackage{xr}
% \makeatletter

\externaldocument{goan_277}



% packages I added
\usepackage{multirow, booktabs}
% packages added by me
\usepackage{subcaption} \usepackage[ruled,vlined]{algorithm2e}
\usepackage{hyperref} \usepackage[acronym]{glossaries}
\newacronym{elbo}{ELBO}{Evidence Lower Bound}
\newacronym{pdmp}{PDMP}{Piecewise Deterministic Markov Process}
\newacronym{bps}{BPS}{Bouncy Particle Sampler}
\newacronym{sbps}{SBPS}{Stochastic Bouncy Particle Sampler}
\newacronym{esbps}{eSBPS}{Efficient Stochastic Bouncy Particle Sampler}
\newacronym{atsbps}{atSBPS}{Adaptive Thinning Stochastic Bouncy Particle Sampler}
\newacronym{ipp}{IPP}{Inhomogeneous Poisson Process}
\newacronym{vi}{VI}{Variational Inference}
\newacronym{hmc}{HMC}{Hamiltonian Monte Carlo}
\newacronym{sghmc}{SGHMC}{Stochastic Gradient Hamiltonian Monte Carlo}
\newacronym{bnn}{BNN}{Bayesian Neural Network}
\newacronym{cnn}{CNN}{Convolutional Neural Network}
\newacronym{sgld}{SGLD}{Stochastic Gradient Langevin Dynamics}
\newacronym{ood}{OOD}{Out of Distribution}
%
\usepackage{amsfonts}
% my new commands
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\taub}[0]{\tau_{\text{bounce}}}
\newcommand{\taur}[0]{\tau_{\text{ref}}}
\newcommand{\tauz}[0]{\mathbf{\tau}_{Z}}
\newcommand{\myomega}[0]{\mathbf{\omega}} \newcommand{\myv}[0]{\mathbf{v}}
\newcommand{\myw}[0]{\mathbf{\omega}} \newcommand{\sigbps}[0]{$\sigma$BPS}
\usepackage{tabularx}
\newcolumntype{L}[1]{>{\raggedright\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}
\newcolumntype{C}[1]{>{\centering\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}
\newcolumntype{R}[1]{>{\raggedleft\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}

\newlength{\Oldarrayrulewidth} \newcommand{\Cline}[2]{%
  \noalign{\global\setlength{\Oldarrayrulewidth}{\arrayrulewidth}}%
  \noalign{\global\setlength{\arrayrulewidth}{#1}}\cline{#2}%
  \noalign{\global\setlength{\arrayrulewidth}{\Oldarrayrulewidth}}}

\DeclareMathOperator*{\argmax}{argmax} % thin space, limits underneath in displays
% \DeclareMathOperator*{\argmin}{argmin} % no space, limits underneath in displays \DeclareMathOperator{\argmin}{arg\,min} % thin space, limits on side in displays \DeclareMathOperator{\argmin}{argmin} % no space, limits on side in displays

\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
% redefining \footnotesize, we provide the original \footnotesize using this
% macro. (Use only sparingly, e.g., in drawings, as it is quite small.)
\title{Piecewise Deterministic Markov Processes for Bayesian Neural
  Networks\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide more space for
% long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors Add authors
\author[1]{\href{mailto:<ej.goan@qut.edu.au>?Subject=UAI2023 BNN Paper}{Ethan~Goan}{}}
\author[1]{Dimitri~Perrin} \author[1]{Kerrie Mengersen} \author[1]{Clinton
  Fookes}
% Add affiliations after the authors
\affil[1]{%
  Queensland University of Technology }

\begin{document}

\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix
%
\section{Tightness of Proposed Approximate Upper Bound}
%
%
The key contribution within this work is the proposal of a new generic and
data-dependent thinning method to approximately sample event times from within
PDMP samplers. The quality of this thinning method relies on two key components:
the tightness of the envelope and its ability to provide a strict upper bound.
We want the envelope used to be as close to the true event rate as
possible without falling below it. This enables maximum efficiency of thinning
methods by reducing the likelihood that a proposed event time will be rejected.
%
%
\par
%
%
Previous works have relied on performing experimentation on simple well-defined
models where derivation of a strict and exact upper bound is
feasible~\cite{bouchard2018bouncy,bierkens2019zig,bierkens2020boomerang,wu2017generalized}.
Derivation of a strict upper bound is infeasible for neural networks, though we
can assess the quality of our event thinning method by analysing the acceptance
ratio in Equation~\ref{eq:thinning} from the body of the paper. We want this
ratio to be as close to one as possible without exceeding it; otherwise, the
envelope section used to propose the time is below the true event rate. We
assess the distribution of these acceptance ratios for varying values
of $\alpha$ from Equation \ref{eq:rate_adjust} in the body of the paper in
Figure \ref{fig:acceptance} and we illustrate the result on predictive
performance and computational load in Table \ref{tab:acceptance}.
%
%
\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.3\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_acceptance/acceptance_box.pdf}
    \caption{$\alpha=1.0$}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.3\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_acceptance/acceptance_box_1_2.pdf}
    \caption{\(\alpha=1.2\)}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.3\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_acceptance/acceptance_box_1_5.pdf}
    \caption{\(\alpha=1.5\)}
  \end{subfigure}%
  \caption{Distribution of acceptance ratios for event thinning across the
    different PDMP samplers used within this work for varying levels
    of $\alpha$. All models are fit on the MNIST data set as described in
    Section 5.2.}
  \label{fig:acceptance}
\end{figure}
%
%
\begin{table}[!h]
  \caption{Summary of predictive performance and timings as the scaling value of $\alpha$ is increased for the PDMP samplers demonstrated within this work. All models are fit to the MNIST dataset using the LeNet5 architecture.}
  \label{tab:acceptance}
  \begin{center}
    \begin{small}
      % \begin{sc}
      \scalebox{0.85}{
        %
        \begin{tabular}{c l l l l l l }
          \toprule
          $\mathbf{\alpha}$ & {\bfseries Inference} & {\bfseries ACC}
                            & {\bfseries NLL}
                            & {\bfseries ECE}
                            & {\bfseries Time}
          \\
          \midrule\midrule[.1em]
          \multirow{3}{2cm}{$\alpha=1.0$}
          % type ipp p lam warm iter acc NLL ece tim
                            & BPS                   & 0.9914          & 1.4227 & 74.752  & 71  \\
                            & \sigbps               & 0.9908          & 0.0375 & 1.0445  & 121 \\
                            & Boomerang             & 0.9919          & 0.0230 & 0.139   & 77  \\
          \midrule
          \multirow{3}{2cm}{$\alpha=1.2$}
          % type ipp p lam warm iter acc NLL ece tim
                            & BPS                   & 0.9906          & 1.0778 & 64.4556 & 75  \\
                            & \sigbps               & 0.9900          & 0.2141 & 16.4637 & 130 \\
                            & Boomerang             & 0.9925          & 0.0234 & 0.1736  & 82  \\
          \midrule
          \multirow{3}{2cm}{$\alpha=1.5$}
          % type ipp p lam warm iter acc NLL ece tim
                            & BPS                   & 0.9909          & 1.0907 & 64.8491 & 80  \\
                            & \sigbps               & 0.991           & 0.8527 & 56.0289 & 143 \\
                            & Boomerang             & 0.9922          & 0.0232 & 0.1651  & 86  \\
          \bottomrule
        \end{tabular}
      }
      % \end{sc}
    \end{small}
  \end{center}
\end{table}
%
%
We can see that with $\alpha=1.0$, there are frequent occurrences of the proposed
envelope being below the true event rate, though as we increase the
value of $\alpha$, the likelihood of the approximate envelope being a strict upper
bound increases. In practice, setting this scaling parameter can be achieved
through the use of a small warm-up phase at the start of sampling to find a
ratio that satisfies a user's tolerance for the bias that may be induced by violating the upper bound assumption. To mitigate potential bias, the value of \(\alpha\) may be increased at the expense of a small increase in the computational demands of the thinning method, as seen in Table~\ref{tab:acceptance}.
%
%
\section{Additional Regression and Binary Classification Examples}
%
%
To further validate the predictive performance of PDMP samplers using the proposed event thinning method, we provide additional examples on easy-to-visualise regression and binary classification tasks in Figure~\ref{fig:regression} and Figure~\ref{fig:supp_logistic}, respectively. These are compared with \Gls{sgld} with a decreasing learning rate as required, and with a constant learning rate with no decay as is typically done in practice (SGLD-ND). For both regression and binary classification models, \Gls{sgld} experiments are run with a learning rate starting at $1e^{-5}$ and decaying to zero linearly. For SGLD-ND, the learning rate is set to the largest value found that would avoid divergences, resulting in $1e^{-5}$ for regression models and $1e^{3}$ for binary classification.
%
%
\par
%
%
From these results, we further validate the predictive performance of these
samplers and their ability to yield informative uncertainty information for
out-of-distribution data when compared to SGLD with a decaying learning rate and
a constant learning rate. We find that even with the larger learning rate
used for SGLD-ND, the sampler is unable to explore the posterior
sufficiently to provide meaningful uncertainty estimates. This phenomenon has
been reported in \cite{brosse2018promises}, where the authors identify that even with a
larger and constant learning rate, \Gls{sgld} dynamics converge to those of regular SGD.
%
%
\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_a/bps.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_a/cov.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_a/boom.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_a/sgld.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_a/sgld_nd.pdf}
  \end{subfigure}%
  \\
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_c/bps.pdf}
    \caption{BPS}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_c/cov.pdf}
    \caption{\sigbps}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_c/boom.pdf}
    \caption{Boomerang}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_c/sgld.pdf}
    \caption{SGLD}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression/toy_c/sgld_nd.pdf}
    \caption{SGLD-ND}
  \end{subfigure}%

  \caption{Examples of predictive posteriors for BNN regression models across synthetic data sets. Training samples are shown as blue dots, and draws from the predictive distribution are shown with black lines.}
  \label{fig:regression}
\end{figure}
%
%
\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_mean_logistic_bps.pdf}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_mean_logistic_cov_pbps.pdf}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_mean_logistic_boomerang.pdf}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_mean_logistic_sgld.pdf}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_mean_logistic_sgld_no_decay.pdf}
  \end{subfigure}
  %
  \\
  %
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_var_logistic_bps.pdf}
    \caption{BPS}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_var_logistic_cov_pbps.pdf}
    \caption{\sigbps}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_var_logistic_boomerang.pdf}
    \caption{Boomerang}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_var_logistic_sgld.pdf}
    \caption{SGLD}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.18\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_moons/grid_var_logistic_sgld_no_decay.pdf}
    \caption{SGLD-ND}
  \end{subfigure}
  \caption{Examples of predictive distributions for synthetic binary classification task. Top row indicates predictive mean and bottom row illustrates variance in predictions. Best viewed on a computer screen in colour.}
  \label{fig:supp_logistic}
\end{figure}
%
%
%
\section{Mixing performance}
In Section 5.2, experiments to investigate the mixing capabilities of the PDMP
samplers were conducted using PCA to reduce the dimensionality of the samples
generated from the different samplers for a single network. We extend this
analysis here for all models in Figures \ref{fig:supp_pca_most}, \ref{fig:supp_pca_second} and
\ref{fig:supp_least} for the first, second, and last principal components respectively.
From these figures we can verify that the Boomerang sampler provided the greatest
mixing across the different models and datasets, whilst SGLD consistently converges to a single solution. We
further investigate this here by comparing just these two samplers on raw
parameter traces within different parts of the networks used for the MNIST and SVHN datasets. These results are shown
in Figure \ref{fig:param_trace}, and confirm the pathology of SGLD quickly converging to a single steady-state solution, whilst the Boomerang sampler is
able to explore the posterior at all stages in the networks.
%
%
\begin{figure*}[!hbt]
  % \centering
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/mnist/autocorr_most.pdf}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/fmnist/autocorr_most.pdf}
  \end{subfigure}
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/svhn/autocorr_most.pdf}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/cifar10/autocorr_most.pdf}
  \end{subfigure}
  %
  \\
  %
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/mnist/trace_most.pdf}
    \caption{MNIST}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/fmnist/trace_most.pdf}
    \caption{Fashion-MNIST}
  \end{subfigure}
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/svhn/trace_most.pdf}
    \caption{SVHN}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/cifar10/trace_most.pdf}
    \caption{CIFAR-10}
  \end{subfigure}
  \caption{Plots summarising samples from the tested samplers projected onto the first principal component. Top row represents the ACF plot, and the bottom shows the coordinate trace plot for the first principal component. Best viewed on a computer screen.}
  \label{fig:supp_pca_most}
\end{figure*}
%
\begin{figure*}[!hbt]
  % \centering
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/mnist/autocorr_second.pdf}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/fmnist/autocorr_second.pdf}
  \end{subfigure}
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/svhn/autocorr_second.pdf}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/cifar10/autocorr_second.pdf}
  \end{subfigure}
  %
  \\
  %
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/mnist/trace_second.pdf}
    \caption{MNIST}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/fmnist/trace_second.pdf}
    \caption{Fashion-MNIST}
  \end{subfigure}
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/svhn/trace_second.pdf}
    \caption{SVHN}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/cifar10/trace_second.pdf}
    \caption{CIFAR-10}
  \end{subfigure}
  \caption{Plots summarising samples from the tested samplers projected onto the second principal component. Top row represents the ACF plot, and the bottom shows the coordinate trace plot for the second principal component. Best viewed on a computer screen.}
  \label{fig:supp_pca_second}
\end{figure*}
%
\begin{figure*}[!hbt]
  % \centering
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/mnist/autocorr_min.pdf}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/fmnist/autocorr_min.pdf}
  \end{subfigure}
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/svhn/autocorr_min.pdf}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/cifar10/autocorr_min.pdf}
  \end{subfigure}
  %
  \\
  %
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/mnist/trace_min.pdf}
    \caption{MNIST}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/fmnist/trace_min.pdf}
    \caption{Fashion-MNIST}
  \end{subfigure}
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/svhn/trace_min.pdf}
    \caption{SVHN}
  \end{subfigure}%
  % ~
  \begin{subfigure}[t]{0.25\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/autocorrelation/cifar10/trace_min.pdf}
    \caption{CIFAR-10}
  \end{subfigure}
  \caption{Plots summarising samples from the tested samplers projected onto the last principal component. Top row represents the ACF plot, and the bottom shows the coordinate trace plot for the last principal component. Best viewed on a computer screen.}
  \label{fig:supp_least}
\end{figure*}
%
%
%
\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.45\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_trace/mnist_first.pdf}
    \caption{Parameter first layer MNIST}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.45\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_trace/mnist_last.pdf}
    \caption{Parameter last layer MNIST}
  \end{subfigure}%
  \\
  \begin{subfigure}[t]{0.45\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_trace/svhn_first.pdf}
    \caption{Parameter first layer SVHN}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.45\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_trace/svhn_last.pdf}
    \caption{Parameter last layer SVHN}
  \end{subfigure}%
  \caption{Trace plots comparing mixing of SGLD and the Boomerang sampler for individual weight parameters within different networks at different locations.}
  \label{fig:param_trace}
\end{figure}
%
%
From this, we can verify that SGLD is converging to a steady-state solution,
whilst the Boomerang sampler consistently explores the posterior space and
provides improved mixing. Given the requirement for SGLD to maintain a small
learning rate that approaches 0 to target the posterior
\cite{nagapetyan2017true, brosse2018promises,welling2011bayesian}, these results
are expected. The theoretical ability of SGLD to maintain the posterior as its invariant distribution comes at the expense of mixing efficiency.
%
%
%
\section{Sensitivity to Hyper-Parameters}
\label{sec:sensitivity}

\subsection{Sensitivity to Velocity Distribution}
Following on from Section~\ref{sec:discussion}, we discuss here the sensitivity of these
PDMP samplers for BNNs with respect to the distribution assigned to the
auxiliary velocity variable. Given that the aim of this velocity variable is to guide
the dynamics of the system to efficiently explore the parameter space, it needs
to be set appropriately. We demonstrate this here through experimentation to
highlight how a poorly specified velocity distribution can corrupt inference.
%
%
\par
%
%
Figure \ref{fig:velocity} illustrates the predictive distribution for poorly
specified velocity distributions for the Boomerang sampler, though similar
effects are seen amongst the other PDMP samplers when the variance for the
velocity distribution is incorrectly specified. We see that the scale of the
velocity component proportionately controls the mixing capabilities of the model.
When the variance is too low, the sampler is unable to explore beyond the MAP
solution, and when too large the predictive performance can suffer. With better
approximations to the diagonal of the Hessian of the negative log-likelihood, the
effects of this may be mitigated for the Boomerang Sampler. We highlight these
behaviours of PDMP samplers applied to BNNs to show the limitations and to
provide insight into the importance of setting these parameters correctly, and
areas of future research.
%
%
\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.45\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_velocity/small_boom.pdf}
    \caption{Small velocity distribution $\gamma=1$}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.45\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_velocity/large_boom.pdf}
    \caption{Large velocity distribution $\gamma=1000$}
  \end{subfigure}%
  \caption{Effect of scale in velocity reference measure for PDMP samplers
    applied to BNNs.}
  \label{fig:velocity}
\end{figure}
%
%
\subsection{Sensitivity to Refresh Event Rate}
%
%
MCMC samplers such as HMC \cite{neal2011mcmc} and NUTS \cite{hoffman2014no} have
step size parameters that can be adjusted and tuned for individual models. With
a small step size, exploration of the posterior can be limited, and if too large
then divergences in the posterior trajectory can be encountered, corrupting
inference \cite{betancourt2017conceptual}. The step size parameter is typically
tuned during a warm-up phase before sampling is commenced to find an optimal
value to maximise exploration and minimise the risk of encountering these
divergences.
%
%
\par
The PDMP samplers used here do not have an equivalent parameter that can be
tuned to guide simulation. The trajectory of these samplers is defined solely by
the transition kernel to update velocity parameters and the event rate that
determines when these events occur. We can, however, yield a similar effect to
adjusting the step size of a traditional MCMC model through our choice of event
rate for our refreshment process $\text{PP}(\lambda_{ref})$.
%
%
\par
Recall from Section \ref{sec:refresh} that the final event time is given by,
\begin{equation}
  \label{eq:1}
  \tau_{event} = \min(\tau, \tau_{ref})
\end{equation}
%
%
where \(\tau \sim \text{PP}(\lambda(\myw(t), \myv(t)))\),
and $\tau_{ref} \sim \text{PP}(\lambda_{ref})$. Setting the value
for $\lambda_{ref}$ can implicitly control the level of exploration within our
samplers. For large $\lambda_{ref}$, we will encounter smaller proposed refresh
times and thus will refresh more frequently. Conversely, for
smaller $\lambda_{ref}$, our sampled refresh times will be larger,
and $\tau_{event}$ will equal $\tau$ more frequently, and further exploration of
the posterior space with these dynamics will be possible. We illustrate this in
Figure \ref{fig:refresh}, where we show the effects for large and small values
of $\lambda_{ref}$.

\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.45\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression_event/small_bps.pdf}
    \caption{$\lambda_{ref}=0.01$}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.45\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_regression_event/large_bps.pdf}
    \caption{$\lambda_{ref}=10.0$}
  \end{subfigure}%
  \caption{Effect of $\lambda_{ref}$ on PDMP models applied to BNNs. Shown here is the predictive distribution found with the BPS using the proposed event rate thinning method.}
  \label{fig:refresh}
\end{figure}
%
%
We can see that the refresh rate can have a considerable impact on the inference quality of our model. With $\lambda_{ref}$ too large, our exploration is limited and we perform excessive refreshments instead of accepting the events provided by the PDMP kernel. When $\lambda_{ref}$ is too small, we can accept larger event times as specified by the PDMP sampler and can diverge away from meaningful inferences.

\section{Summary of models used}
\subsection{Regression and Binary Classification Models for Synthetic Data}
\label{sec:synth}
Regression models used within this work consist of fully-connected networks with two hidden layers, with 25 and 10 units respectively. Tanh non-linear activations are applied after each hidden layer, and a Normal likelihood with a variance of $\sigma^{2} = 0.01$ is used. MAP estimates for these networks are found with 10,000 iterations using the Adam optimiser \cite{kingma2015adam}, with each sampler initialised from the same MAP estimate.
%
\par
For binary classification models, the networks consist of a fully-connected network with three hidden layers, each with 100 units. ReLU non-linear activations are applied within the network, and a Bernoulli likelihood is used. Similar to the regression tasks, the MAP estimate is found with Adam.
%
%
\subsection{Additional UCI-Dataset Results}
We provide here additional results on datasets from the UCI
repository~\cite{ucidatasets}. For each dataset, a simple MLP network with a
single hidden layer with 50 hidden units is used, along with a $\text{Tanh}$
activation. MAP estimates are found similarly to Section~\ref{sec:synth}, followed by
2,000 samples generated by each method. Each experiment is run 5 times
with mean results and standard deviations reported. We further include ESS
as measured from the first and last (smallest) principal components of the samples. Results from these
experiments reflect those seen in Section~\ref{sec:uci}, where SGLD is able to provide an almost
negligible improvement in MSE and NLL, though is unable to provide efficient
posterior exploration. The Boomerang sampler is able to consistently outperform
other sampling methods, with other samplers only able to match sample efficiency
for the smallest principal components where exploration is smallest.
\begin{table}[h!]
  \caption{Results on UCI Naval Dataset}
  \label{tab:uci-naval}
  \begin{center}
    \begin{small}
      % \begin{sc}
      \scalebox{0.85}{
        %
        \begin{tabular}{l l l l l}
          \toprule
          {\bfseries Inference} & {\bfseries NLL}
                                & {\bfseries MSE}
                                & {\bfseries ESS-First}
                                & {\bfseries ESS-Last}
          \\
          \midrule\midrule[.1em]
          BPS                   & 1681.66 $\pm$ 0.64          & \textbf{0.01 $\pm$ 0.00} & 2.73 $\pm$ 0.02               & \textbf{2000.00 $\pm$ 0.00} \\
          \sigbps               & 1685.43 $\pm$ 4.97          & \textbf{0.01 $\pm$ 0.00} & 2.71 $\pm$ 0.02               & \textbf{2000.00 $\pm$ 0.00} \\
          Boomerang             & 1680.55 $\pm$ 0.08          & \textbf{0.01 $\pm$ 0.00} & \textbf{1777.57 $\pm$ 156.57} & 1804.77 $\pm$ 219.12        \\
          SGLD                  & \textbf{1680.47 $\pm$ 0.00} & \textbf{0.01 $\pm$ 0.00} & 2.88 $\pm$ 0.03               & 156.24 $\pm$ 72.36          \\
          SGHMC                 & 1689.88 $\pm$ 7.88          & \textbf{0.01 $\pm$ 0.00} & 2.72 $\pm$ 0.01               & \textbf{2000.00 $\pm$ 0.00} \\
          \bottomrule
        \end{tabular}
      }
    \end{small}
  \end{center}
\end{table}


\begin{table}[h!]
  \caption{Results on UCI Energy Dataset}
  \label{tab:uci-energy}
  \begin{center}
    \begin{small}
      % \begin{sc}
      \scalebox{0.85}{
        %
        \begin{tabular}{l l l l l}
          \toprule
          {\bfseries Inference} & {\bfseries NLL}
                                & {\bfseries MSE}
                                & {\bfseries ESS-First}
                                & {\bfseries ESS-Last}
          \\
          \midrule\midrule[.1em]
          BPS                   & 74.09 $\pm$ 0.10          & \textbf{2.96 $\pm$ 0.04} & 2.70 $\pm$ 0.01              & 1795.50 $\pm$ 49.44          \\
          \sigbps               & 74.09 $\pm$ 0.08          & 2.97 $\pm$ 0.04          & 2.69 $\pm$ 0.03              & 1527.35 $\pm$ 115.94         \\
          Boomerang             & 74.20 $\pm$ 0.03          & 3.01 $\pm$ 0.01          & \textbf{1981.29 $\pm$ 25.87} & 1880.46 $\pm$ 178.71         \\
          SGLD                  & \textbf{74.06 $\pm$ 0.00} & \textbf{2.96 $\pm$ 0.00} & 2.87 $\pm$ 0.00              & 272.34 $\pm$ 247.11          \\
          SGHMC                 & 74.36 $\pm$ 0.18          & 3.08 $\pm$ 0.08          & 2.72 $\pm$ 0.02              & \textbf{1992.02 $\pm$ 17.84} \\
          \bottomrule
        \end{tabular}
      }
    \end{small}
  \end{center}
\end{table}


\begin{table}[h!]
  \caption{Results on UCI Yacht Dataset}
  \label{tab:uci-yacht}
  \begin{center}
    \begin{small}
      % \begin{sc}
      \scalebox{0.85}{
        %
        \begin{tabular}{l l l l l}
          \toprule
          {\bfseries Inference} & {\bfseries NLL}
                                & {\bfseries MSE}
                                & {\bfseries ESS-First}
                                & {\bfseries ESS-Last}
          \\
          \midrule\midrule[.1em]
          BPS                   & 32.34 $\pm$ 0.03          & 7.55 $\pm$ 0.03          & 2.72 $\pm$ 0.02             & 1467.26 $\pm$ 11.96          \\
          \sigbps               & 32.33 $\pm$ 0.08          & 7.54 $\pm$ 0.07          & 2.72 $\pm$ 0.03             & 1291.29 $\pm$ 52.81          \\
          Boomerang             & 32.41 $\pm$ 0.11          & 7.62 $\pm$ 0.11          & \textbf{2000.00 $\pm$ 0.00} & \textbf{1945.22 $\pm$ 61.12} \\
          SGLD                  & 32.32 $\pm$ 0.00          & 7.53 $\pm$ 0.00          & 2.87 $\pm$ 0.00             & 3.29 $\pm$ 0.58              \\
          SGHMC                 & \textbf{32.27 $\pm$ 0.17} & \textbf{7.48 $\pm$ 0.17} & 2.73 $\pm$ 0.02             & 1652.04 $\pm$
          30.10                                                                                                                                     \\
          \bottomrule
        \end{tabular}
      }
    \end{small}
  \end{center}
\end{table}


\begin{table}[h!]
  \caption{Results on UCI Concrete Dataset}
  \label{tab:uci-concrete}
  \begin{center}
    \begin{small}
      % \begin{sc}
      \scalebox{0.85}{
        %
        \begin{tabular}{l l l l l}
          \toprule
          {\bfseries Inference} & {\bfseries NLL}
                                & {\bfseries MSE}
                                & {\bfseries ESS-First}
                                & {\bfseries ESS-Last}
          \\
          \midrule\midrule[.1em]
          BPS                   & 111.51 $\pm$ 0.12          & 9.55 $\pm$ 0.03          & 2.72 $\pm$ 0.03              & 1777.00 $\pm$ 112.52         \\
          \sigbps               & 111.60 $\pm$ 0.27          & 9.58 $\pm$ 0.08          & 2.72 $\pm$ 0.02              & 1503.21 $\pm$ 36.08          \\
          Boomerang             & 111.95 $\pm$ 0.35          & 9.68 $\pm$ 0.10          & \textbf{1975.76 $\pm$ 54.21} & \textbf{1982.03 $\pm$ 24.85} \\
          SGLD                  & \textbf{111.47 $\pm$ 0.00} & \textbf{9.54 $\pm$ 0.00} & 2.87 $\pm$ 0.00              & 88.94 $\pm$ 118.56           \\
          SGHMC                 & 112.18 $\pm$ 0.61          & 9.74 $\pm$ 0.17          & 2.72 $\pm$ 0.02              & 1906.15 $\pm$ 128.18         \\
          \bottomrule
        \end{tabular}
      }
    \end{small}
  \end{center}
\end{table}




\subsection{Convolutional Models}
%
%
%  We describe in this section
For the \sigbps, an initial warm-up stage is again required, which
is identical to that in Section \ref{sec:reg_bin}. For MNIST and Fashion-MNIST,
a batch size of 1024 is used, whilst a batch size of 512 is used for the
remaining models. MAP estimates for MNIST and Fashion-MNIST datasets were found
with the Adam optimiser \cite{kingma2015adam} for 10,000 iterations. SVHN and
CIFAR-10 used SGD with momentum of 0.1 and 0.9, respectively, for 25,000
iterations, whereas CIFAR-100 required 128,000 iterations and a momentum of 0.2.
%
%
\par
%
%
With the potential sensitivities to both refreshment rates and choice of velocity distribution $\Phi(\myv)$ identified in Section~\ref{sec:sensitivity}, we deem it important to report the values used for fitting each model. We report these values in Table \ref{tab:conv_results} alongside full predictive performance measurements and sample efficiency metrics. Within Table \ref{tab:conv_results}, we represent the choice of velocity distribution with the $\gamma$ parameter. For \Gls{bps} and \sigbps, $\gamma$ describes the standard deviation of the velocity distribution chosen such that,
\begin{equation}
  \label{eq:2}
  \Phi(\myv) = \mathcal{N}(0, \gamma^{2}).
\end{equation}
For the Boomerang sampler, $\gamma$ represents the scaling factor as found in Equation \ref{eq:boomerang_sigma} from the body of the paper. Included in these results is the Effective Sample Size (ESS) when projecting the samples onto the first, second, and last principal components. We see from these results that the Boomerang Sampler can generate independent samples across all components, whilst other samplers are only able to offer this independence as the amount of information (or variance) in the projection of these samples decreases.
%
%
%
\begin{table}[t!]
  \caption{Summary of hyperparameters used for samplers within this work.}
  \label{tab:conv_results}
  \begin{center}
    % \begin{small}
    % \begin{sc}
    \scalebox{0.85}{
      %
      \begin{tabular}{l l l l l}
        \toprule
        {\bfseries Dataset } & {\bfseries Inference} & {\bfseries $\lambda_{ref}$} & {\bfseries $\gamma$} & {\bfseries Time}
        \\
        \midrule\midrule[.1em]
        \multirow{6}{1.5cm}{MNIST}
        % type ipp p lam warm iter acc NLL ece tim
                             & SGD                   & -                           & -                    & 74               \\
                             & SGLD                  & -                           & -                    & 87               \\
                             & SGLD-ND               & -                           & -                    & 87               \\
                             & BPS                   & 1.0                         & 0.001                & 145              \\
                             & \sigbps               & 1.0                         & 0.25                 & 197              \\
                             & Boomerang             & 0.1                         & 0.01                 & 151              \\
        \midrule
        \multirow{6}{1.5cm}{Fashion-MNIST}
        % type ipp p lam warm iter acc NLL ece tim
                             & SGD                   & -                           & -                    & 74               \\
                             & SGLD                  & -                           & -                    & 87               \\
                             & SGLD-ND               & -                           & -                    & 87               \\
                             & BPS                   & 1.0                         & 0.001                & 144              \\
                             & \sigbps               & 0.1                         & 0.001                & 192              \\
                             & Boomerang             & 0.1                         & 0.01                 & 156              \\
        \midrule
        \multirow{6}{1.5cm}{SVHN}
        % type ipp p lam warm iter acc NLL ece tim
                             & SGD                   & -                           & -                    & 3465             \\
                             & SGLD                  & -                           & -                    & 3653             \\
                             & SGLD-ND               & -                           & -                    & 3653             \\
                             & BPS                   & 1.0                         & 0.001                & 4125             \\
                             & \sigbps               & 0.1                         & 0.0001               & 4535             \\
                             & Boomerang             & 1.0                         & 0.0001               & 4375             \\
        \midrule
        \multirow{6}{1.5cm}{CIFAR 10}
        % type ipp p lam warm iter acc NLL ece tim
                             & SGD                   & -                           & -                    & 4905             \\
                             & SGLD                  & -                           & -                    & 5075             \\
                             & SGLD-ND               & -                           & -                    & 5074             \\
                             & BPS                   & 1.0                         & 0.001                & 5614             \\
                             & \sigbps               & 0.1                         & 0.0001               & 6053             \\
                             & Boomerang             & 0.1                         & 0.001                & 5868             \\
        \midrule
        \multirow{6}{1.5cm}{CIFAR 100}
        % type ipp p lam warm iter acc NLL ece tim
                             & SGD                   & -                           & -                    & 9811             \\
                             & SGLD                  & -                           & -                    & 9985             \\
                             & SGLD-ND               & -                           & -                    & 9985             \\
                             & BPS                   & 1.0                         & 0.001                & 10478            \\
                             & \sigbps               & 0.1                         & 0.001                & 10808            \\
                             & Boomerang             & 0.1                         & 0.001                & 10783            \\
        \bottomrule
      \end{tabular}
    }
    % \end{sc}
    % \end{small}
  \end{center}
\end{table}
%
%
\section{How well are we really exploring the posterior?}
In Radford Neal's influential thesis \cite{neal2012bayesian}, he states that
``Bayesian neural network users may have difficulty claiming with a straight
face that their models and priors are selected because they are just what is
needed to capture their prior beliefs about the problem''.\footnote{Although
  much important work has been conducted to establish suitable priors and model
  design \cite{hafner2018reliable, sun2019functional,
    vladimirova2019understanding}, this statement largely remains true today.}
In a similar vein, we would state that any Bayesian neural network user would
have a difficult time honestly saying their inference strategy has sufficiently
explored the posterior, including the work proposed here. Previous research has
investigated gold-standard MCMC methods for larger networks
\cite{izmailov2021bayesian}, though it was unable to obtain a sufficient number of
samples to maintain confidence in the levels of exploration. Although the
metrics in the previous section may show sufficient results for a machine
learning application, from a statistical perspective we need to further
investigate the quality of our inference to justify whether we have satisfied
our goal of sampling from the posterior distribution.
%
%
\par
%
%
Previous papers have shown that PDMP methods for MCMC perform favourably in
terms of mixing and sampling efficiency \cite{bouchard2018bouncy,
  bierkens2019zig, wu2017generalized, bierkens2020boomerang} and similarly
outperform methods such as \Gls{sgld}. Most studies have been restricted to
well-defined models, where prior information can be suitably provided and
sufficient prior studies with gold standard methods such as HMC and NUTS have
confirmed the general geometry of the posteriors in question and the existence
of a central limit theorem. Inference in BNNs is challenged by a posterior with
strong multi-modality, making exploration by any sampler more difficult. This is
further complicated by the comparatively high-dimensional space that we
need to explore. The favourable Gaussian Process and functional properties
exhibited by networks with infinite width \cite{neal2012bayesian} encourage the
use of large models, whilst also narrowing the typical set that we wish to explore
\cite{betancourt2017conceptual}.
%
%
\par
%
%
%
Another limitation is the computational complexity added with sampling-based
schemes. This complexity not only includes the cost of sampling, but the
increase in memory consumption. The popular ResNet-50 model contains more than
23 million parameters. To perform inference on ImageNet with a ResNet-50 model
using a mini-batch size of 100 samples, more than 10 thousand samples would be
needed to iterate over the entire data set of over one million
images.\footnote{The commonly used variant of ImageNet is from the 2012 Large
  Scale Visual Recognition Challenge, which contains 1,281,167 samples
  \cite{russakovsky2015imagenet}.} With single-precision storage, these samples
for a single complete iteration of the dataset would require more than 9.2GB of
memory. These constraints currently limit the applicability of such methods, as
evaluating predictive posteriors will require a large number of samples and many
operations to read sampled values from non-volatile storage.
%
%
%
\par
%
%
These limitations offer insights into areas of future research relating to
sampling schemes for BNNs. The geometry of the joint posterior distribution
could be improved by investigating non-local methods for preconditioning the
gradients, similar to that done in Riemannian HMC \cite{girolami2011riemann}. As
seen in this work through the efficacy of the Boomerang sampler, exploitation of
this geometry can considerably improve exploration of the posterior space.
Finally and perhaps most importantly, bespoke design of model
architecture that respects the data and includes priors that appropriately
reflect domain expertise could yield posteriors that are more easily traversed
and explored.
%


\section{In and Out of Distribution Data}
We investigate here the performance of the different sampling methods for
in-distribution and out-of-distribution (OOD) data in terms of predictive
classification entropy. We
have demonstrated that PDMP sampling methods present meaningful
epistemic uncertainty in predictions. It is important to identify uncertainty in
the final predictions that are made. Within this work, predictions are made by
taking the $\argmax$ of the mean for the predictive posterior,
\begin{equation}
  \label{eq:pred_mean}
  \mathbf{t} = \argmax_{y^{*} \in \mathcal{Y}} \  \mathop{\mathbb{E}_{y^{*}}} \Big[ p(y^* | x^*, \mathcal{D}) \Big],
\end{equation}
%
%
where $p(y^* | x^*, \mathcal{D})$ is our predictive posterior. Entropy within
this categorical probability vector given by this expectation can be viewed as
an approximate measure for aleatoric uncertainty within our model
\cite{smith2018understanding} to accompany the epistemic uncertainty given by
our Bayesian models. To assess this, we compute the entropy of the expectation
within Equation \ref{eq:pred_mean} for in-distribution data and OOD data. It is
desirable to have a lower entropy for in-distribution data indicating lower
predictive aleatoric uncertainty, and a larger entropy for OOD data to represent
an increase in uncertainty. Figure \ref{fig:supp_entropy} illustrates this for
the models used within this work.
%
%
\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/mnist_im_fashion_mnist_im_bps_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/mnist_im_fashion_mnist_im_cov_pbps_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/mnist_im_fashion_mnist_im_boomerang_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/mnist_im_fashion_mnist_im_sgld_id_ood_entropy.pdf}
  \end{subfigure}%
  \\
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/fashion_mnist_im_mnist_im_bps_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/fashion_mnist_im_mnist_im_cov_pbps_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/fashion_mnist_im_mnist_im_boomerang_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/fashion_mnist_im_mnist_im_sgld_id_ood_entropy.pdf}
  \end{subfigure}%
  \\
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/svhn_cifar_10_bps_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/svhn_cifar_10_cov_pbps_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/svhn_cifar_10_boomerang_id_ood_entropy.pdf}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/svhn_cifar_10_sgld_id_ood_entropy.pdf}
  \end{subfigure}%
  \\
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/cifar_10_svhn_bps_id_ood_entropy.pdf}
    \caption{BPS}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/cifar_10_svhn_cov_pbps_id_ood_entropy.pdf}
    \caption{\sigbps}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/cifar_10_svhn_boomerang_id_ood_entropy.pdf}
    \caption{Boomerang}
  \end{subfigure}
  ~
  \begin{subfigure}[t]{0.2\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_entropy/cifar_10_svhn_sgld_id_ood_entropy.pdf}
    \caption{SGLD}
  \end{subfigure}
  \caption{Entropy within the final predictive categorical vector obtained from
    the tested sampling methods for the different datasets used. Blue histograms
    indicate entropy for in-distribution data and red histograms indicate
    entropy for OOD data. Each column
    represents the predictive entropy for the corresponding labelled sampler and
    each row for a different dataset. From top to bottom, each row is for models
    fit on the MNIST, Fashion-MNIST, SVHN, and CIFAR-10 data respectively. MNIST
    and Fashion-MNIST serve as the in-distribution and OOD datasets for the
    applicable models, and similarly SVHN and CIFAR-10 serve as the
    in-distribution and OOD datasets for their respective models.}
  \label{fig:supp_entropy}
\end{figure}
%
%
From Figure \ref{fig:supp_entropy}, we see that the \Gls{bps} provides increased
entropy for OOD data, though it is unable to provide a small entropy for
in-distribution data. Results from the \sigbps, Boomerang, and SGLD samplers all
show similar trends and provide increased uncertainty for OOD data; however,
for in-distribution data, the Boomerang sampler provides a reduction in epistemic
uncertainty. A low predictive entropy for in-distribution data could indicate
overconfidence and should not be used in isolation to evaluate calibration.
Combining these results with the improved ECE calibration scores obtained with
predictions from the Boomerang sampler indicates favourable predictive
performance; these predictions are well-calibrated for
in-distribution data and show appropriately reduced aleatoric uncertainty, whilst
providing comparable or improved predictive uncertainty for OOD data.
%
%
\section{Examples of Difficult to Classify Samples}
%
%
Given the increasing desire to apply deep learning models in practice, their
ability to reliably communicate uncertainty information is crucial. We
expect our models to encounter difficult-to-understand scenarios. We need to
be able to identify when these challenging scenarios occur and to incorporate the
uncertainty encountered into final decisions. Figure \ref{fig:miss} illustrates
examples of misclassified samples from the datasets evaluated within this work,
and illustrates the predictive probabilities of these models. We see that all
PDMP samplers provide meaningful uncertainty information for
difficult-to-classify instances within each data set.

\begin{figure}[!h]
  \centering
  \begin{subfigure}[t]{0.4\textwidth}
    \centering \includegraphics[width=1\linewidth]{./figs/supp_classification/mnist/2_0.eps}
    \caption{`Two' from MNIST misclassified as `Seven'}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.4\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_classification/mnist/4_5.eps}
    \caption{`Four' from MNIST misclassified as `Six'}
  \end{subfigure}
  \\
  \begin{subfigure}[t]{0.4\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_classification/cifar10/3_4.eps}
    \caption{`Cat' from CIFAR-10 misclassified as `Deer'}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.4\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_classification/cifar10/5_6.eps}
    \caption{`Dog' from CIFAR-10 misclassified as `Horse'}
  \end{subfigure}
  \\
  \begin{subfigure}[t]{0.4\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_classification/svhn/3_4.eps}
    \caption{`Three' from SVHN misclassified as `Nine'}
  \end{subfigure}%
  ~
  \begin{subfigure}[t]{0.4\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{./figs/supp_classification/svhn/9_10.eps}
    \caption{`Nine' from SVHN misclassified as `Five'}
  \end{subfigure}
  \caption{Examples of difficult-to-classify images from the different image
    data sets used. Below each image is the predictive mean for each class
    represented by the dot, and error bars to represent the 95\% credible
    intervals. MNIST results fit with BPS, CIFAR-10 with \sigbps, and SVHN with
    Boomerang sampler using the proposed event thinning method. Best viewed on a
    computer screen.}
  \label{fig:miss}
\end{figure}
%
%
%
%
\clearpage
%
%
\bibliography{ref.bib}
%
%
\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
