% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{subcaption}
\captionsetup{compatibility=false}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title
% {Evaluating approximate Bayesian inference for deep learning classification of radio galaxies}
{Evaluating Bayesian deep learning for radio galaxy classification}
% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:devina.mohan@postgrad.manchester.ac.uk?Subject=Your UAI 2024 paper}{Devina Mohan}{}}
\author[1, 2]{Anna M. M. Scaife}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
Jodrell Bank Centre for Astrophysics\\
Department of Physics \& Astronomy\\
University of Manchester, UK 
}
\affil[2]{%
    The Alan Turing Institute\\
    London, UK
}
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
  \begin{document}
\maketitle

\begin{abstract}
The radio astronomy community is rapidly adopting deep learning techniques to deal with the huge data volumes expected from the next generation of radio observatories. Bayesian neural networks (BNNs) provide a principled way to model uncertainty in the predictions made by such deep learning models and will play an important role in extracting well-calibrated uncertainty estimates on their outputs. In this work, we evaluate the performance of different BNNs against the following criteria: predictive performance, uncertainty calibration and distribution-shift detection for the radio galaxy classification problem.



% In this work we use MCMC sampling to recover posterior estimates for classifying radio galaxies with convolutional neural networks. Using samples from MCMC as a benchmark, 

%We compare the posterior samples ... %MMD results

%However ...%problems with BNNs

%CPE results - 
% We show that the "cold posterior effect" previously observed in morphological classification of radio galaxies is due to model misspecification arising from poor variational posterior approximations.


\end{abstract}

\section{Introduction}
\label{sec:intro}

% BNNs and their potential in scientific application





% The next-generation of radio astronomy facilities such as the Square Kilometre Array \cite{sk} will produce huge volumes of data and the use of deep learning (DL) methods is inevitable given the expected data volumes \cite{scaife2020big, an2019science}. 
% Instead of imaging the sky directly, radio astronomy requires specialised image processing algorithms to solve the inverse problem of converting measured visibilities into an image. 
% Science data products from the first phase of construction of the SKA are expected to produce 600+ PB per year for image processing alone \cite{scaife2020big, an2019science}. Over its operational lifetime the observatory will produce exabytes of data and transform our understanding of the Universe. 

% Next generation radio astronomy facilities 
Bayesian neural networks (BNNs) have tremendous potential in scientific applications of machine learning. However, most large scale evaluations of BNNs focus on well-curated terrestrial datasets with lots of labelled examples \citep{wilson2022evaluating, vadera2022ursabench}. In contrast, in radio astronomy, the largest labelled datasets are of the order $10^3$ \citep{porter2023mirabest}. In this work we present an evaluation of Bayesian deep learning for radio astronomy, using the morphological classification of radio galaxies as a benchmark. Supervised CNNs have been the most widely used solution to this problem since their introduction to the field by \cite{Aniyan2017ClassifyingNetwork}. That work adopted the canonical morphological division of radio galaxies into Fanaroff-Riley Type~I (FRI) and Type~II (FRII), which has persisted as the most common classification scheme for radio galaxies in the literature for more than 40 years \citep{fanaroff1974morphology}. More recently the FR classification scheme has been used to demonstrate improvements in efficiency and accuracy for a variety of deep-learning models within both the supervised \citep{lukic2019, becker1995, bowles2021attention, scaife2021fanaroff} and unsupervised \citep{slijepcevic2022radio} learning regimes.

With recent improvements in the sensitivity and resolution of modern radio astronomy observatories, the morphological detail recovered in images of radio galaxies has indicated that more complex relationships exist beyond the original FR dichotomy \citep{mingo2019revisiting}. Whilst a more nuanced analysis will certainly be enabled by the development of increasingly fine-grained automated classification, the underlying continuum of physical processes that are represented by this diversity of morphology is perhaps better captured by understanding the confidence with which certain galaxies are assigned to different labels by these models. However, the confidence of individual predictions is not necessarily reflected in standard metrics for deep learning, but instead requires models to focus on uncertainty quantification of model predictions rather than raw performance \citep{mohan2022quantifying}.

% 

BNNs provide a principled way to model uncertainty \citep{mackay1992a,mackay1992b} by specifying priors, $P(\theta)$, over the neural network parameters, $\theta$, and learning the posterior distribution, $P(\theta|D)$, over those parameters, where $D$ is the data. Recovering this posterior distribution directly is intractable for neural networks. Several techniques have been developed to approximate Bayesian inference for neural networks. We consider Hamiltonian Monte Carlo \citep[HMC;][]{neal1998view, neal2011mcmc}, Variational Inference \citep[VI;][]{vireview, blundell, practicalvi}, last-layer Laplace approximation \citep[LLA;][]{daxberger2021laplace}, MC Dropout \citep{gal2015bayesian} and Deep Ensembles \citep{lakshminarayanan2017simple} for our application. We focus our evaluation on the following criteria: predictive accuracy, uncertainty calibration and the ability to detect different types of distribution shifts.


% We leverage the asymptotic convergence of MCMC to the true posterior distribution to evaluate other approximate Bayesian inference methods including


% We show that .. %


In Section~\ref{sec:bayesDL} we give a brief overview of the BNNs considered in this work; in Section~\ref{sec:data} we describe the datasets used to train and evaluate our BNNs; in Section~\ref{sec:exps} we describe our experimental setup; and finally in Section~\ref{sec:eval} we present our evaluation, followed by a discussion in Section~\ref{sec:discuss}.

% In general, it has been suggested that deep learning models produce overconfident predictions \citep{guo2017calibration} and provide no uncertainty estimates, which are essential for scientific application of these models. 


%Instead of imaging the sky directly, radio astronomy requires specialised image processing algorithms to solve the inverse problem of converting measured visibilities into an image. Science data products from the first phase of construction of the SKA are expected to produce 600+ PB per year for image processing alone \cite{scaife2020big, an2019science}. Over its operational lifetime the observatory will produce exabytes of data and transform our understanding of the Universe. 
% Modern astrophysics is driven by population analyses and any automated classification pipeline should produce well-calibrated uncertainty estimates that quantify the model uncertainty introduced in the results. In this work we consider the morphological classification of radio galaxies and evaluate different Bayesian neural networks. 
%We also discuss the challenges faced while implementing Bayesian Convolutional Neural Networks (CNNs) for their classification. 

%Challenges for radio astronomy - how is RA data different from terrestrial datasets
% very few labelled data sets of the order $10^3$. Analyse the effect of training data on the final predictions using sensitivity analysis. Imp for identifying biases when our labelled datasets are so small
%discuss results from inigo's paper:
% pre-training using unlabelled data 

%What properties should our BNNs have for performing the science required
%/robustness to domain shift and ability to detect out-of-distribution data.
% These tests are done based on the posterior predictive distribution and form part of our function-space analysis. We also examine posterior samples from the different approximations in the weight space. \citep{qiu2023should}

% \textbf{Example}
% Next-generation radio astronomy facilities such as the Square Kilometre Array (SKA) are expected to produce exabyte scale data \citep{scaife2020big, an2019science}. Detecting distribution shift will be essential in identifying new classes of objects in surveys with improved resolution and sensitivity. 


%example of Mingo LOTSS to show why uncertainty calibration is imp
% \textbf{Example}
% How uncertanities will affect the science \citep{mingo2019revisiting}

%Challenges for implementing BNNs - different misspecifications
% More expressive approximations to the posterior 

% Evaluation of Bayesian DL for real-world applications

\section{Approximate Bayesian Inference for Deep Learning}
\label{sec:bayesDL}
%why these particular BNNs were chosen
% The BNN methods were chosen to encompass a whole range of posterior approximations - from the asymptotically exact HMC samples, to Gaussian approximations to the posterior of all the weights of the network (VI), to Gaussian approximations only to the last layer weights (LLA) and other cheaper approximations which are commonly implemented such as MC Dropout and Deep Ensembles. The methods have a trade-off between computational cost and performance. 

The Bayesian neural networks considered in this work were chosen to encompass a broad range of posterior approximations. While HMC provides asymptotically exact samples from the posterior, VI makes local approximations to the posterior for all the weights of the network and LLA makes local approximations only for the last layer weights. We also consider other cheaper approximations which are commonly implemented such as MC Dropout and Deep Ensembles.

\subsection{Hamiltonian Monte Carlo}
\label{sec:hmc_theory}

% MCMC methods are a class of algorithms used to obtain samples from probability distributions which are otherwise intractable or do not have a full analytical description. 
%They are most commonly used for probabilistic inference and fitting a model to data \citep{hogg2018data}. 
%MCMC algorithms allow us to sample from the posterior distribution once the samples have converged to the target distribution, but they do not scale well to large models such as deep neural networks. 
The first application of MCMC to neural networks was proposed by \citet{neal1998view}, who introduced Hamiltonian Monte Carlo (HMC) from quantum chromodynamics to the general statistics literature. 
%The more widely used No-U-Turn Sampler (NUTS) is an extension of HMC which allows the hyperparameters of the network to be set automatically \citep{hoffman2014no}. HMC requires the entire dataset to be stored in memory during inference which makes it difficult to scale it to large datasets.  
However, it was not until \citet{welling2011bayesian} introduced Stochastic Gradient Langevin Dynamics (SGLD) that MCMC for neural networks became feasible for large datasets. 
%SGLD can be used with mini-batches of data. Other gradient based MCMC algorithms include stochastic gradient Hamiltonian Monte Carlo (SG-HMC) \citep{chen2014stochastic} and  Cyclical Stochastic Gradient MCMC (SG-MCMC) which introduces a cyclical step-size schedule \citep{zhang2019cyclical}. The smaller steps explore one mode and larger steps sizes are used to discover and jump between multiple modes of the posterior. 
More recently, \citet{cobb2021scaling} have revisited HMC and proposed novel data splitting techniques to make it work with large datasets. We use the HMC algorithm in our work. 
%We use the HMC algorithm in this work since our dataset is small.


% \subsection{Hamiltonian Monte Carlo}
HMC simulates the path of a particle traversing the negative posterior density space using Hamiltonian dynamics \citep{neal2011mcmc, betancourt2017conceptual,hogg2018data}. 
%A Hamiltonian function describes the total energy of a system of particles in terms of their kinetic and potential energies. 
To apply HMC to deep learning, the neural network parameter space is augmented by specifying an additional momentum variable, $m$, for each parameter, $\theta$. Therefore, for a $d$-dimensional parameter space, the augmented parameter space contains $2d$ dimensions. 
We can then  define a log joint density as follows:
%
\begin{equation}
    \log[p(\theta, m)] = \log[p(\theta|D) \, p(m)] \, .
    \label{eq:joint_density}
\end{equation}
%
%that is proportional to the Hamiltonian. 

Hamiltonian dynamics allows us to travel on the contours defined by the joint density of the position and momentum variables.
% , also known as the phase space. 
The Hamiltonian function is given by: 
%
\begin{equation}
    H(\theta, m) = U(\theta) + K(m) = \text{constant} \, ,
    \label{eq:ham}
\end{equation}
%
where $U(\theta)$ is the potential energy and $K(m)$ is the kinetic energy. The potential energy is defined to be the negative log posterior probability and the kinetic energy is usually assumed to be quadratic in nature and of the form $ K(m) = (1/2) \, m^{T} M^{-1} m$,
%
% \begin{equation}
%     K(m) = (1/2) \, m^{T} M^{-1} m \, ,
%     \label{eq:ke}
% \end{equation}
%
where $M$ is a positive-definite mass matrix. This corresponds, up to an additive constant, to the negative log probability density of a zero-mean Gaussian, $p(m) = \mathcal{N}(m|0, M)$, with covariance matrix, $M$, which is usually assumed to be the identity matrix. 
% The Hamiltonian can thus be written as:

% %
% \begin{equation}
%     H(\theta, m) = - \mathrm{log} [p(D|\theta) p(\theta)] + (1/2) \, m^{T} M^{-1} m \, .
%     \label{eq:ham_expanded}
% \end{equation}
%

The partial derivatives of the Hamiltonian describe how the system evolves with time.
% :
% \begin{eqnarray}
%     \frac{\partial \theta_{i}}{\partial t}  = \frac{\partial H}{\partial m_{i}} = \frac{\partial K(m)}{\partial m_{i}}, 
%     \label{eq:ham_pdes1}
%     \\ 
%     \frac{\partial m_{i}}{\partial t} = \frac{- \partial H}{\partial \theta_{i}} = \frac{- \partial U(\theta)}{\partial \theta_{i}} . 
%     \label{eq:ham_pdes2}
% \end{eqnarray}
%theoretical acceptance rate should be 1 because the Hamiltonian is conserved.
In order to solve the partial differential equations using computers, we need to discretise the time, $t$, of the dynamical simulation using a step-size, $\epsilon$. The state of the system can then be computed iteratively at times $\epsilon$, $2\epsilon$, $3\epsilon$, \dots{} and so on, starting at time zero up to a specified number of steps, $L$.
% Several existing numerical integrators can be used to solve the system of partial differential equations described by Equations \ref{eq:ham_pdes1}- \ref{eq:ham_pdes2}, including Euler's method and modified Euler's method \citep{neal2011mcmc}. These methods produce trajectories that diverge to infinity unless the step size is made smaller, but this makes the algorithm slow. %write about why symplectic integrators are used
The leapfrog integrator is used to solve the system of partial differential equations.
%described by Equations \ref{eq:ham_pdes1}- \ref{eq:ham_pdes2}.
Two hyperparameters, the step-size, $\epsilon$, and the number of leapfrog steps, $L$, together determine the trajectory length of the simulation. 
% The leapfrog integrator begins by user-specified initial position and momentum values at time $t=0$. The momentum values are then updated by half a step-size 
% %as follows:
% % \begin{equation}
% %     m_{i}(t + \epsilon/2)  =  m_{i}(t) - \frac{\epsilon}{2} \frac{\partial U}{\partial \theta_{i}} \, \theta(t) \, , 
% %     \label{eq:leapfrog1}
% % \end{equation}
% for the $i^{\mathrm{th}}$ leapfrog step. This is followed by a full step-size for the position values using the updated momentum values.
% % :
% % %
% % \begin{equation}
% %     \theta_{i}(t + \epsilon)  =  \theta_{i}(t) + \epsilon \, m_{i} (t + \epsilon / 2) \, .
% %     \\
% %     \label{eq:eq:leapfrog2}
% % \end{equation}
% %
% The momentum values are then updated by another half step-size using the updated position values.
% % %
% % \begin{equation}
% %     m_{i}(t + \epsilon/2)  =  m_{i}(t + \epsilon/2) - \frac{\epsilon}{2} \frac{\partial U}{\partial \theta_{i}} \, \theta(t + \epsilon) \, .
% %     \label{eq:eq:leapfrog3}
% % \end{equation}
% %
% %In order to make sure that the dynamics of the system are conserved, volume in the phase space must be preserved. The momentum variables should transform in an opposite way to the parameters in order to preserve the volume. 
The partial derivative of the potential energy with respect to the position, $\partial U/\partial \theta$, can be calculated using the automatic differentiation capabilities of most standard neural network libraries. %The use of gradient information reduces random walk behaviour in HMC.

In each iteration of the HMC algorithm, new momentum values are sampled from Gaussian distributions, followed by simulating the trajectory of the particles according to Hamiltonian dynamics for $L$ steps using the leapfrog integrator with step-size $\epsilon$. At the end of the trajectory, the final position and momentum variables, $(\theta^{*}, m^{*})$,  are accepted based on a Metropolis-Hastings accept/reject criterion that evaluates the Hamiltonian for the proposed parameters and the previous parameters. %The proposed parameters are accepted with probability given by:
% \begin{equation}
%     \mathrm{min} [1,\mathrm{exp}(-H(\theta^{*}, m^{*}) + H (\theta, m)] \, . 
% \end{equation}
% \subsection{Other inference methods}


\subsection{Variational Inference}
\label{sec:vi_theory}

Variational inference (VI) assumes an approximate posterior from a family of tractable distributions, and converts the inference problem into an optimisation problem \citep{practicalvi, blundell, vireview}.  The model learns the parameters of the distributions by minimising the negative Evidence Lower Bound (ELBO) objective function, which is composed of a data likelihood cost and a complexity cost that quantifies the difference between the prior and the variational approximation using the KL divergence.  %We refer the reader to \citep{mohan2022quantifying} for a detailed explanation of VI.


% ELBO is a non-convex optimisation function
% lower bound on the evidence, connection to Jensen's inquality

\subsection{Last-layer Laplace approximation }
\label{sec:lla_theory}
% considers the region around the MAP solution (mode) where the posterior density might be high but the volume low

% LLA depends completely on the MAP estimate found by standard training. How does the MAP estimate compare to the [mode?] of the HMC posterior??


% Adv: post-hoc approximation following standard deep learning training. Effecient implementation

% https://machinelearningmastery.com/a-gentle-introduction-to-partial-derivatives-and-gradient-vectors/

% The LLA is a post-hoc method which allows us to fit Gaussian distributions over the network paarameters after finding MAP estimates

Last-layer Laplace approximation (LLA) constructs Gaussian approximations around the maximum a posteriori (MAP) values learned by standard NN training using the second order partial derivatives of the loss function, $\mathcal{L}$ \citep{daxberger2021laplace}. 
%which contain information about the local curvature of the loss function.
This method allows one to learn posteriors for the last layer weights of the network, $\theta^{(L)}$, % \epsilon \mathbb{R}^K $, 
% around the MAP estimates obtained by standard neural network training
while keeping the rest of the values fixed at their MAP estimates.
% :
%
% \begin{equation}
%     p(\theta^{(L)} |D) \approx \mathcal{N}(\theta^{(L)} | \theta^{(L)}_{\mathrm{MAP}}, \Sigma^{(L)})  , 
%     \label{eq:lla_eqn}
% \end{equation}
% %
% where 
The covariance matrix for the last layer 
% , $\Sigma^{(L)}$, 
is calculated using the empirical Fisher approximation to the Hessian, which contains information about the local curvature of the loss function for each parameter. 
% We use a diagonal approximation to the Hessian. 
The method assumes a zero mean Gaussian prior, $p(\theta) = \mathcal{N}( \theta; 0, \gamma^2 I)$. The prior variance, $\gamma^2$, is estimated using marginal likelihood maximisation \citep{immer2021scalable, daxberger2021laplace}.
% The marginal likelihood or evidence, Z, given by :
% \begin{equation}
%     Z = \int p(D|\theta) p(D) d \theta \, , 
%     \label{eq:marginal_likelihood}
% \end{equation}
% %normalising constant for the posterior
% can be approximated after a Laplace approximation has been fit as follows: 
% %
% \begin{equation}
%     Z \approx \mathrm{exp} ( - \mathcal{L} (D; \theta_{\mathrm{MAP}} )) \, (2 \pi) ^{K/2} (\mathrm{det} \, \Sigma)^{1/2} ,
%     \label{eq:la_marginal_likelihood}
% \end{equation}
% %
% where K is the number of weights in the last layer of the network.

% The Adam optimiser used to minimise the negative log marginal likelihood 
% https://arxiv.org/pdf/2104.04975.pdf -  Marginal Likelihood Estimation
% incode: https://github.com/aleximmer/Laplace/blob/main/laplace/baselaplace.py#L498 

%one can use information about the local curvature of the loss function to inform the 
% We apply Laplace approximation to the last linear layer of the neural network

\subsection{Monte Carlo Dropout}
\label{sec:dropout_theory}

% \citet{gal2015bayesian} showed how dropout can be used as a Bayesian approximation to the posterior. 
Another easily implemented Bayesian approximation is MC Dropout, which learns a distribution over the network outputs by setting randomly selected weights of the network to zero with probability, $p$ \citep{gal2015bayesian}. MC Dropout can be considered an approximation to VI, where the variational approximation is a Bernoulli distribution. Although a convenient technique, this method lacks flexibility and does not fully capture the uncertainty in model predictions, especially under covariate shift, where the training and test data are not identically distributed \citep{chan2020unlabelled}.


% Regularisers used with such as weight decay 

\subsection{Deep Ensembles}
One can use the output of multiple randomly initialised models to form a uniformly-weighted mixture model whose predictions can be combined to form an ensemble \citep{lakshminarayanan2017simple}. % More recently, \citet{seligmann2023beyond} evaluate BNNs under distribution shift using the WILDS dataset.

% Deep ensembles capture different modes of the posterior.

%Discuss the pros and cons of each method

% \subsection{Multi-SWAG} if reviewers ask

\section{Data}
\label{sec:data}

\begin{figure}[t] %{\textwidth}
    \centering
    \includegraphics[width=0.45\textwidth]{figures/galaxies.png}
    \caption{Images from the datasets used in this work: top two rows contain images of Fanaroff-Riley Type I (FRI) and Type II (FRII) radio galaxies from the MiraBest Confident dataset on which our BNNs are trained. The third row contains FRI/FRII galaxies from the MIGHTEE dataset. The fourth row contains optical galaxies from the GalaxyMNIST dataset. We use the MIGHTEE and GalaxyMNIST datasets to evaluate our models' ability to detect different types of distribution shifts. See Section~\ref{sec:data} for details about the datasets.}
    \label{fig:data}
\end{figure}



Radio galaxies are characterised by large scale jets and lobes which can extend up to mega-parsec distances from the central black hole and are observed in the radio spectrum. The original binary classification scheme proposed to classify such extended radio sources was based on the ratio of the extent of the highest surface brightness regions to the total extent of the galaxy \citep{fr1974}. FRI galaxies are edge-darkened whereas FRII galaxies are edge-brightened. Over the years, several other morphologies such as bent-tail \citep{rudnick1976, odea1985owen}, hybrid \citep{gopalkrishna2000}, and double-double \citep{schoenmakers2000} sources have also been observed and there is still a continuing debate about the exact interplay between extrinsic effects, such as the interaction between the jet and the environment, and intrinsic effects, such as differences in central engines and accretion modes, that give rise to the different morphologies. 

We train our BNNs on the MiraBest Confident dataset [Section \ref{sec:mirabest}] and use the MIGHTEE [Section \ref{sec:mightee}] and GalaxyMNIST [Section \ref{sec:galaxy_mnist}] datasets to test the ability of our BNNs to detect different types of distribution shifts.
% To test the ability of our BNNs to detect distribution shift when faced with data from other telescopes, we use the MIGHTEE dataset. 

% \citet{mingo2019revisiting} recently showed that 



 % proposed a classification of such extended radio sources based on the ratio of the distance between the highest surface brightness regions on either side of the galaxy to the total extent of the radio source, $R_{{\rm FR}}$. Based on a threshold ratio of 0.5, the galaxies were classified into two classes as follows: if $R_{{\rm FR}} < 0.5$, the source was classified into Class I (FRI; edge-darkened), and if $R_{{\rm FR}} > 0.5$ it was classified into Class II (FRII; edge-brightened). 
 
% \begin{figure}[htb]
%     \centering % <-- added
% \begin{subfigure}{0.25\textwidth}
%   \includegraphics[width=\linewidth]{example-image-a}
%   \caption{image1}
%   \label{fig:1}
% \end{subfigure}\hfil % <-- added
% \begin{subfigure}{0.25\textwidth}
%   \includegraphics[width=\linewidth]{example-image-b}
%   \caption{image2}
%   \label{fig:2}
% \end{subfigure}\hfil % <-- added
% \begin{subfigure}{0.25\textwidth}
%   \includegraphics[width=\linewidth]{example-image-c}
%   \caption{image3}
%   \label{fig:3}
% \end{subfigure}

% \medskip
% \begin{subfigure}{0.25\textwidth}
%   \includegraphics[width=\linewidth]{example-image-a}
%   \caption{image4}
%   \label{fig:4}
% \end{subfigure}\hfil % <-- added
% \begin{subfigure}{0.25\textwidth}
%   \includegraphics[width=\linewidth]{example-image-b}
%   \caption{image5}
%   \label{fig:5}
% \end{subfigure}\hfil % <-- added
% \begin{subfigure}{0.25\textwidth}
%   \includegraphics[width=\linewidth]{example-image-c}
%   \caption{image6}
%   \label{fig:6}
% \end{subfigure}
% \caption{Fasi del processo di impregnazione}
% \label{fig:galaxies}
% \end{figure}


% Figure \ref{fig:galaxies}

\subsection{MiraBest}
\label{sec:mirabest}
The MiraBest dataset used in this work consists of 1256 images of radio galaxies of $150\times 150$ pixels pre-processed to be used specifically for deep learning tasks \citep{porter2023mirabest}. The galaxies are labelled using the FRI and FRII morphological types based on the definition of \citet{fanaroff1974morphology} and further divided into their subtypes. In addition to labelling the sources as FRI, FRII and their subtypes, each source is also flagged as `Confident' or `Uncertain' to indicate the human classifiers' confidence while labelling the dataset. In this work we use the MiraBest Confident subset and consider only the binary FRI/FRII classification during training, see Figure~\ref{fig:data} (top two rows) for some examples. The training and validation sets are created by splitting the predefined training data in a ratio of 80:20. The final split consists of 584 training samples, 145 validation samples, and 104 withheld test samples. %No data augmentation is used.

%rephrase
The MiraBest dataset was constructed using the sample selection and classification described in \cite{Miraghaei2017TheEnvironment}, who made use of the parent galaxy sample from \cite{Best2012OnProperties}. Optical data from data release 7 of the Sloan Digital Sky Survey \citep[SDSS DR7;][]{abazajian2009seventh} was cross-matched with the NRAO VLA Sky Survey \citep[NVSS;][]{condon1998} and the Faint Images of the Radio Sky at Twenty-Centimeters \citep[FIRST;][]{becker1995} radio surveys. 


%MBUncert

%MBHybrid

% Generalisation beyond training data - domain shift, 

\subsection{MIGHTEE}
\label{sec:mightee}
%domain adaptation
The MIGHTEE dataset is constructed using the Early Science data products from the MeerKAT International GHz Tiered Extragalactic Exploration survey \citep[MIGHTEE;][]{heywood2022mightee}. MIGHTEE is an ongoing radio continuum survey being conducted using the MeerKAT telescope, which is one of the precursors to the Square Kilometre Array (SKA). The survey provides radio continuum, spectral line and polarisation data, of which we use the radio continuum data and extract images for the COSMOS and XMMLSS fields. While there are thousands of objects in these fields, expert labels are only available for $117$ objects. We use the data pre-processing and expert labels made available by \citet{slijepcevic2024radio}. The dataset contains classifications based on the consensus of five expert radio astronomers. The final sample contains $45$ FRI and $72$ FRII galaxies, see Figure~\ref{fig:data} (third row). We note that the MIGHTEE dataset contains significant observational differences from the MiraBest dataset.




%difference between MiraBest and MIGHTEE

% \subsection{Radio Galaxy Zoo}
% % -how good are the BNNs at identifying rare classes (e.g. hybrids in RGZ)
% %https://arxiv.org/pdf/2204.08816.pdf
% % https://arxiv.org/pdf/2305.16127.pdf

% Radio Galaxy Zoo is a source identification citizen science project. Data is from the FIRST and ATLAS surveys. 

% We use the Radio Galaxy Zoo DR 1 dataset (Wong et al. 2024 in prep). The dataset contains $107,893$ images of radio galaxies which may contain  contain out-of-distribution data-points, class imbalance or irregular sub-population distributions.

% Some RGZ-DR1 images overlap with MiraBest, which we have removed.


% has not been curated
% and may contain out-of-distribution data-points, class imbalance or
% irregular sub-population distributions. 

\subsection{Galaxy MNIST}
\label{sec:galaxy_mnist}
% Optical images - completely out-of-distribution. 

In addition to considering different datasets of radio galaxies which have been curated using data from radio telescopes, we also evaluate our models on data collected from optical telescopes. 
%diference in radio vs optical images
Optical images of galaxies contain different features and in a sense represent completely out-of-distribution galaxies which well-calibrated models should classify with a very high degree of uncertainty so that they can be flagged for inspection by an expert.

We use the GalaxyMNIST\footnote{\url{https://github.com/mwalmsley/galaxy_mnist}} dataset, which contains images of $10{,}000$ optical galaxies classified into four morphological types using labels collected by the Galaxy Zoo citizen science project, see Figure \ref{fig:data} (last row) for examples. The galaxies are drawn from the Galaxy Zoo Decals catalogue \citep{walmsley2022galaxy}. We resize the high-resolution images from $224 \times 224$ to $150 \times 150$ pixels to match the input dimensions of our model. We construct a small test set of $104$ galaxies from the dataset to evaluate the out-of-distribution detection ability of our BNNs.

\section{Experiments} %/Implementation Details
\label{sec:exps}
Code for the experiments conducted in this work is available at: \url{https://github.com/devinamhn/RadioGalaxies-BNNs}.

\subsection{Model architecture}
We use an expanded LeNet-5 architecture with two additional convolutional layers with 26 and 32 channels, respectively, to be consistent with the literature on using BNNs for classifying the MiraBest dataset \citep{mohan2022quantifying}. The model has $232{,}444$ parameters in total.

\subsection{HMC Inference}
We use the \textsc{hamiltorch} package\footnote{\url{https://github.com/AdamCobb/hamiltorch}} developed by \citet{cobb2021scaling} for scaling HMC to large datasets. %The package provides implementations of HMC and NUTS samplers and has also been used to benchmark HMC inference for neural networks in \citet{vadera2022ursabench}. In this work I use their HMC sampler without deploying the data splitting technique because the MiraBest data set contains only $\sim$ 1200 images.
Using their HMC sampler, we set up two HMC chains of $200{,}000$ steps using different random seeds and run them on the MiraBest Confident dataset. We use a step size of $\epsilon = 10^{-4}$ and set the number of leapfrog steps to $L = 50$. We specify a Gaussian prior over the network parameters and evaluate different prior widths, $\sigma \in \{1, 10^{-1}, 10^{-2}, 10^{-3}\}$, using the validation data set. We find that $\sigma = 10^{-1}$ results in the best predictive performance and consequently use it to define the prior width for all weights and biases of the neural network in our experiments. To compute the final posteriors we thin the chains by a factor of $1000$ to reduce the autocorrelation in the samples and obtain $200$ samples. A compute time of $170$ hrs is required to run the inference on two Nvidia A100 GPUs. The acceptance rate of the proposed samples is $97.62\%$. We repeat the inference with data augmentation in the form of random rotations.

\textbf{Assessing Convergence}: The Gelman-Rubin diagnostic, $\hat{R}$, is used to assess the convergence of our HMC chains \citep{gelman1992inference}. %The variance between chains is compared to the variance within a chain. 
If $\hat{R} \approx 1$ we consider the HMC chains for that particular parameter to have converged. We examine the convergence of the last layer weights and find that using data augmentation leads to a higher proportion of weights with $\hat{R} \geq 1$. We also monitor the negative log-likelihood and accuracy, which converge by the $100{,}000^{\text{th}}$ inference step.


% While $\hat{R}$ values for some parameters in the network are greater than $1$, the final two neurons in the last layer of our network have $\hat{R} \leq 1$. 
% Mention acceptance rate
\subsection{Other inference methods}
We conduct $10$ experimental runs for each inference method presented in this section using different random seeds and random shuffling of data points between the training and validation datasets. 

\subsubsection{Deep Ensembles}
We train $10$ non-Bayesian CNN models with different random seeds and randomly shuffled training:validation splits to construct the posterior predictive distribution by combining the softmax values obtained for each galaxy in our test set. The models are trained for 600 epochs using the Adam optimiser with a learning rate of $10^{-4}$ and weight decay $10^{-6}$. We use a learning rate scheduler which reduces the learning rate by 10\% if the validation loss does not improve for two consecutive epochs and use an early stopping criterion based on the validation loss.

\subsubsection{MC Dropout} A dropout rate of 50\%  is implemented before the last two fully-connected layers of our neural network. This dropout configuration performed better compared to implementing dropout only before the last layer of the network.
%, which is standard for CNNs \citep{scaife2021fanaroff, gal2015bayesian}.
The network is trained for 600 epochs using the Adam optimiser with a learning rate of $10^{-3}$ and a weight decay of $10^{-4}$. We use a learning rate scheduler which reduces the learning rate by 10\% if the validation loss does not improve for two consecutive epochs and use an early stopping criterion based on the validation loss.

\subsubsection{LLA} We use the MAP values learned by our non-Bayesian CNNs to construct our last-layer Laplace approximation using the \textsc{Laplace} package\footnote{\url{https://github.com/AlexImmer/Laplace}} developed by \citet{daxberger2021laplace}. We use a diagonal factorisation of the Hessian. The optimised prior standard deviation found using marginal likelihood maximisation for 10 experimental runs lies in the range $\sigma \in [0.03, 0.04]$.

\subsubsection{VI} We make a Gaussian variational approximation to the posterior and find that our model is optimised with a Gaussian prior width $\sigma = 0.01$. We also test a Laplace prior following \citet{mohan2022quantifying}, but find that it does not lead to a significant performance improvement. Results are reported for a tempered VI posterior, with $T = 0.01$ (see note below). The network is trained for 1500 epochs using the Adam optimiser with a learning rate of $5 \times 10^{-5}$. A compute time of $40$ mins is required to train the VI model on a single Nvidia A100 GPU. 
% A compute time of $12$ mins is required to train the VI model on a single Nvidia A100 GPU. 


% The network is trained for 1500 epochs using the Adam optimser with a learning rate of $5 10^{-5}$. A compute time of $40$ mins is required to train the VI model on a single Nvidia A100 GPU. 

% The Laplace prior provides optimal predictive performance and lowest uncertainty calibration error, however for direct comparison to our HMC baseline we also consider a Gaussian prior with $\sigma = 0.01$. 



% We consider different approximations to the Hessian based on diagonal factorisation and Kronecker-factored approximate curvature (KFAC).
% //compare diagonal/Kronecker/full. 
% What is the value of the posterior precision ?

%todo: optimise prior precision with gridsearch feature of the library




% We obtain 200 samples from VI and MC Dropout posterior predictive distributions by passing each sample in the test set through the test loop 200 times. 
% We use the same optimiser hyperparameters as the MC Dropout training for our non-Bayesian CNN model. 





\textbf{Note: Data augmentation and the cold posterior effect}
Several published works have reported that their BNNs experience a ``cold posterior effect'' (CPE), according to which the posterior needs to be down-weighted or tempered with a temperature term, $T \leq 1$, in order to get good predictive performance \citep{wenzel2020good}:
%add equation
\begin{equation}
    P(\theta|D) \propto (P(D|\theta) P(\theta))^{1/T}.
    \label{eq:cpe}
\end{equation}

Previous work on using VI for radio galaxy classification has shown that the ``cold posterior effect'' (CPE) persists even when the learning strategy is modified to compensate for model misspecification with a second order PAC-Bayes bound to improve the generalisation performance of the network \citep{mohan2022quantifying, masegosa2019learning}. We do not observe a CPE when we use samples from our HMC inference to construct the posterior predictive distribution for classifying the MiraBest dataset. However, the effect still persists in our VI models. In the general Bayesian DL literature, some authors argue that CPE is mainly an artifact of data augmentation \citep{izmailov2021bayesian}, while others have shown that data augmentation is a sufficient but not necessary condition for CPE to be present \citep{noci2021disentangling}. We find that data augmentation does not have a significant effect on the cold posterior effect observed in our VI models. However, it does lead to a different degree of trade-off between test error and uncertainty calibration error for our HMC model. The effect of augmentation on performance is further discussed in Section \ref{sec:eval}.

% , perhaps because the data is augmented using random rotations which

% We find that data augmentation does not have a significant effect on our VI models. 


\section{Evaluation}
\label{sec:eval}
To construct the posterior predictive distributions for a single experimental run of VI, LLA and MC Dropout, we obtain $N = 200$ samples from their posterior distributions and calculate $N$ Softmax probabilities for each class, for each galaxy in our test set. For Deep Ensembles we use $N = 10$ samples. In case of HMC we use the $200$ samples obtained after thinning the chains for evaluation. 
%See Section \ref{sec:eval}

\begin{table*}[!ht]
    \centering
    \caption{Test error and uncertainty calibration error (UCE) of the predictive entropy for all the Bayesian neural networks considered in this work. We also provide a baseline MAP error percentage. Inference methods with a ($^*$) indicate that no data augmentation was used during inference for those experiments. See Sections \ref{sec:predictive performance} and \ref{sec:Uncertainty Calibration}.} 
    \label{tab:eval}
    \begin{tabular}{rlll}
        \toprule % from booktabs package
        \bfseries Inference & \bfseries Error (\%) $\downarrow$  & \bfseries UCE $\downarrow$ \\
        \midrule % from booktabs package
        HMC & $4.16 \pm 0.45 $   & $14.76 \pm 0.95$ \\
        $\mathrm{HMC}^{*}$ & $6.24 \pm 0.45$ & $12.65 \pm 0.01$  \\
        VI  & $3.94 \pm 0.01 $  & $12.77 \pm 6.11$\\
        $\mathrm{VI}^{*}$  & $3.84 \pm 0.01 $  & $12.32 \pm 6.36$\\
        % VI (linear conditioner) &  & &\\
        % LLA (diag) & $6.86 \pm 0.17$ & & 22.39\\
        LLA & $8.85 \pm 2.09$  & $23.84 \pm 3.54$\\
        Dropout &  $7.88 \pm  2.81$ &  $25.75 \pm 4.44$ \\     
        Ensembles & $7.69 \pm 0.27 $  & 24.41 \\
        MAP & $5.76$  & \\
        \bottomrule 
    \end{tabular}
\end{table*}

%Table with MI and AE results %\\internal
% Table \ref{tab:eval}
% \begin{table}[!h]
%     \centering
%     \caption{Evaluation} \label{tab:eval}
%     \begin{tabular}{rlllll}
%         \toprule % from booktabs package
%         \bfseries Inference & \bfseries Error (\%) $\downarrow$ & \bfseries NLL $\downarrow$ & \bfseries UCE (PE) $\downarrow$ & \bfseries UCE (MI) $\downarrow$ & \bfseries UCE (AE) $\downarrow$ \\
%         \midrule % from booktabs package
%         HMC & $3.86 \pm 0.04$  & & \\
%         VI &  & &\\
%         LLA &  & &\\
%         Dropout &  & &\\
%         Ensembles & $6.41 \pm 0.20 $ & $0.186 \pm 0.32$  & $27.08$ & $11.24$ & $27.2$\\
%         % Non-Bayesian CNN &  & & \\
%         \bottomrule 
%     \end{tabular}
% \end{table}

\subsection{Predictive Performance}
\label{sec:predictive performance}
%are the different BNNs misclassifying the same galaxies?

% We construct the posterior predictive distribution by taking $N=200$ samples for HMC, VI, LLA, MC Dropout and $N=10$ samples 
We use the expected value of the posterior predictive distribution to obtain the classification of each galaxy in the MiraBest Confident test set and calculate the test error for a single experimental run by taking an average of the classification error over the entire test set. We report the mean and standard deviation of the test error for $10$ experimental runs, see Table \ref{tab:eval}. %For HMC we take an average of 5 runs

VI has the best predictive performance, irrespective of whether data augmentation is used or not. The low standard deviation values for VI indicate that the mean of the posterior predictive distribution found by VI optimisation is robust to random seeds and shuffling. The same does not hold true for LLA and Dropout, which are the two worst performing models. Deep ensembles lie somewhere in between. The MAP value reported in Table \ref{tab:eval} is chosen on the basis of the lowest validation loss from the ensemble of CNNs that we trained. 
% The difference between the test errors for the ensemble and a single MAP value clearly shows how non-Bayesian CNNs make over-confident predictions.



% We note that the difference between the test error for MAP and LLA (which fits distributions around the MAP value) is because we calculate the error by taking the 64\% credible interval of the posterior predicitve.

\subsection{Uncertainty Calibration}
\label{sec:Uncertainty Calibration}

We report the expected uncertainty calibration error \citep[UCE;][]{gal2015bayesian, laves2019well, mohan2022quantifying} of the predictive entropy for our posterior distributions in Table \ref{tab:eval}. For HMC, VI, LLA and MC Dropout, we use the $64\%$ credible intervals of the posterior predictive distributions to calculate UCE. For Deep Ensembles, we use the entire posterior predictive distribution constructed using the 10 ensemble members.

We find that HMC without data augmentation is the most well-calibrated BNN for the radio galaxy classification problem. HMC with data augmentation has a higher UCE. VI models with and without data augmentation are similarly calibrated. The high standard deviation values show how sensitive VI is to initialisation, and this is a well documented issue in the literature \citep{altosaar2018proximity,rossi2019good}. LLA, MC Dropout and Deep Ensembles are very poorly calibrated compared to HMC and VI.

We refrain from reporting the mutual information and conditional entropy as measures of epistemic and aleatoric uncertainty since they are known to be dependent on model specification and class separability \citep{hullermeier2021aleatoric}, making them difficult to interpret given our small statistical sample of radio galaxies.
% without designing specific tests. 
More recently, \citet{wimmer2023quantifying} have also shown that the additive decomposition of total predictive uncertainty 
% as measured by Shannon entropy
into mutual information and conditional entropy breaks down in machine learning settings where we have access to a limited number of data samples. They suggest that the difference between predictive entropy and mutual information can at most be interpreted as a lower bound on the aleatoric uncertainty, which converges to the true value when the model learns the true data generating distribution.


% This is because, we expect the total predictive uncertainty and the mutual information to be maximum at the beginning of training, which implies that aleatoric uncertainty should be zero if the additive assumption holds. This violates the assumption that aleatoric uncertainty is a constant. They suggest that the difference between predictive entropy and mutual information can at most be interpreted as a lower bound on the aleatoric uncertainty, which converges to the true value when the model learns the true data generating distribution, which is practically not possible. %which never really happens - and we wouldn't need UQ if we knew what the true data generating process is

% Predictive entropy behaves fairly sensibly.




% how well curated datasets affect uncertainty measures

\subsection{Detecting Distribution Shift}
\label{sec:ood}

%Discuss MB Uncertain - more noisy
% MBHybrid - semantic shift ?
%Discuss MIGHTEE results - domain shift
% RGZ - Some level of domain shift and semantic shift (galaxies belong to different classes)

When neural networks are deployed in real-world applications, the independent and identically distributed (i.i.d.) assumption often breaks down and leads to different types of distribution shifts. According to the i.i.d. assumption, the training and test sets are drawn from the same joint distribution defined by the input data and their labels $(x, y) \sim P(X, Y)$. 
Covariate shift occurs when there is a change in the input data distribution, $P(X)$, but no shift in the distribution of labels, $P(Y)$, at test time. This can be due to domain shift, for example when the model is faced with galaxies from a new telescope facility. Another type of shift occurs when the distribution of labels, $P(Y)$, changes at test time due to the presence of new classes. This is known as semantic shift. Some degree of semantic shift is expected when telescopes with improved resolution reveal new morphologies of galaxies.
% (give GRGs example).

In order to evaluate the sensitivity of our BNNs to different types of distribution shifts, we need a scoring function which can distinguish between in-distribution (iD) and distribution-shifted test galaxies without adding significant computational overhead. \citet{liu2020energy} showed that energy scores provide an easy to implement post-hoc scoring mechanism for discriminative classification models.

We calculate energy scores for different test samples, $x$, for all the datasets described in Section \ref{sec:data} using the logit values, $f_i(x)$, for each class, $i$, following \cite{liu2020energy}. For a non-Bayesian model, an input sample, $x$, is mapped to a scalar energy value as follows:
%
\begin{equation}
    \mathrm{E}(x; f) = -T \cdot \log \sum_i^K e^{f_{i} (x) / T }, 
    \label{eq:energy_scores}
\end{equation}
%
where the temperature term, $T$, is set to $1$. For our Bayesian models we calculate the average energy value per input sample using $N$ posterior samples, where $f_{i}^{(j)}(x)$ denotes the logit for class $i$ obtained with the $j$-th posterior sample:
%
\begin{equation}
    \tilde{\mathrm{E}}(x; f) = \frac{1}{N}  \sum_j^{N} \left( -T \cdot \log \sum_i^K e^{f_{i}^{(j)} (x) / T } \right) .
\end{equation}

In this framework, out-of-distribution (OoD) samples are expected to have higher energy. 
% \citet{liu2020energy} also show that one can train models to pull up the energy of OoD samples by including an additional regularisation term to the training objective derived from the energy scores of distribution-shifted data. This makes iD and OoD samples more distinguishable by widening the energy gap between them. However, our application presents a unique conundrum where defining what OoD is difficult because the galaxies exists on a spectrum of physical processes that gives rise to different astrophysical phenomena and different morphologies. 

% For example, 'Hybrids' within the population of radio galaxies exhibit FRI-like morphology on one side and FRII-like on the other side, we could consider them as both iD or OoD.

Histograms of energy values for the different inference methods considered in this work are shown in Figure \ref{fig:energy}. We use the models with the lowest validation error from the experiments to calculate the energy scores. We see that the iD MiraBest Confident samples get mapped to a larger interval of energy values by our HMC and VI models. In comparison, the energy scores for iD samples lie in a very narrow interval for Deep Ensembles, MC Dropout and LLA, which suggests that fewer iD samples have been pushed to lower energy values. 

We find that HMC and VI models are good at separating the OoD optical galaxies from the GalaxyMNIST dataset, see Figure \ref{subfig: hmc} and Figure \ref{subfig: vi}. For all other models, there is a significant degree of overlap between the iD and OoD samples, see Figure \ref{fig:energy}. 

The FRI/FRII galaxies from the MIGHTEE dataset present a significant dataset shift due to differences in observational properties. MIGHTEE galaxies get mapped to a large interval of energy values, in some cases extending up to $E = -90$. However, HMC is the only model for which there exists a clear distinction between iD FRI/FRII galaxies from MiraBest Confident and distribution-shifted FRI/FRII galaxies from MIGHTEE. We also note that LLA maps some of the MIGHTEE galaxies to energies higher than OoD GalaxyMNIST data, see Figure \ref{subfig: lla}.


% This could also explain the poorer performance of all of these methods, so when we construct the 


% We find that galaxies from the MIGHTEE dataset get mapped to very low energy values by all of our models, in some cases upto $-90$, see Figure\ref{eq:energy_scores}.

% \citep{yang2021generalized}


% Figure~\ref{fig:energy}

% Figure~\ref{subfig: hmc}.




\begin{figure*}
    \centering
    \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/energy_hist_hmc.png}
        \caption{Hamiltonian Monte Carlo}
        \label{subfig: hmc}
    \end{subfigure}%
    ~ 
    \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/energy_hist_vi.png}
        \caption{Variational Inference}
        \label{subfig: vi}
    \end{subfigure}
    
     \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/energy_hist_lla.png}
        \caption{Last Layer Laplace Approximation}
        \label{subfig: lla}
    \end{subfigure}
    ~
    \centering
    \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/energy_hist_dropout.png}
        \caption{Dropout}
        \label{subfig: dropout}
    \end{subfigure}
    
    \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/energy_hist_ensemble.png}
        \caption{Deep Ensembles}
        \label{subfig: deep ensemble}
    \end{subfigure}
    ~ 
    \begin{subfigure}[t]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figures/energy_hist_mlp.png}
        \caption{MAP}
        \label{subfig: mlp}
    \end{subfigure}

    
  \caption{Detecting distribution shift with energy scores: Histograms of energy scores calculated for the MiraBest Confident (MBConf; blue), GalaxyMNIST (orange) and MIGHTEE (green) test datasets for the different models considered in this work, see Section \ref{sec:ood} for details. The histograms are plotted with a bin width of $0.1$. Axes are truncated so that we can examine where samples from each dataset lie. We find that HMC is the only inference method for which all the datasets can be easily distinguished. %based on energy scores
  }
  \label{fig:energy}
\end{figure*}

% \begin{figure*}[t!] % "[t!]" placement specifier just for this example

% \begin{subfigure}{0.30\linewidth}
% \includegraphics[width=\linewidth]
% {figures/energy_hist_hmc.png}
% \caption{HMC} \label{fig:a}
% \end{subfigure}
% %\hspace*{\fill}

% \begin{subfigure}{0.30\linewidth}
% \includegraphics[width=\linewidth]{figures/energy_hist_vi.png}
% \caption{VI} \label{fig:b}
% \end{subfigure} %\hspace*{\fill}

% \begin{subfigure}{0.30\linewidth}
% \includegraphics[width=\linewidth]{figures/energy_hist_lla.png}
% \caption{LLA} \label{fig:c}
% \end{subfigure} \hspace*{\fill}

% % \medskip
% % \begin{subfigure}{0.24\textwidth}
% % \includegraphics[width=\linewidth]{figures/energy_hist_lla.png}
% % \caption{LLA} \label{fig:c}
% % \end{subfigure}\hspace*{\fill}
% \begin{subfigure}{0.50\linewidth}
% \includegraphics[width=\linewidth]{figures/energy_hist_dropout.png}
% \caption{MC Dropout} \label{fig:d}
% \end{subfigure}
% % \hspace*{\fill}

% % \medskip
% \begin{subfigure}{0.50\linewidth}
% \includegraphics[width=\linewidth]{figures/energy_hist_ensemble.png}
% \caption{Deep Ensemble} \label{fig:e}
% \end{subfigure}\hspace*{\fill}
% \begin{subfigure}{0.50\linewidth}
% \includegraphics[width=\linewidth]{figures/energy_hist_mlp.png}
% \caption{MAP} \label{fig:f}
% \end{subfigure}
% \caption{Detecting distribution shift with energy scores: Histograms of energy scores calculated for for the MiraBest Confident (MBConf; blue), GalaxyMNIST (orange) and MIGHTEE (green) test datasets for the different models considered in this work, see Section\ref{sec:ood} for details. The histograms are plotted with a bin width of $0.1$. We truncate the axes 
% % to $[-30, 0.1]$
% so that we can easily examine where samples from each dataset lie. We find that HMC is the only inference method for which all the datasets can be easily distinguished. }
% \label{fig:energy}
% \end{figure*}




% \subsection{Sensitivity Analysis} 

% Quantifying the sensitivity of a model to a subset of examples in the training data can help us identify data points that are most influential in making predictions and identify potential biases our models/predictions are susceptible to.
% Sensitivity of a model is also closely related to robustness to model misspecification, which comes from the loss and how sensitive it is to perturbations \citep{10.5555/3586589.3586721}.

% Sensitivity analysis using influence functions \citep{koh2017understanding}. Sensitivity analysis in variational bayes.
% \citep{giordano2018covariances}
% Memory perturbation equation \citep{nickl2023memory}.
% Resolving training biases via influence-based data relabeling\citep{kong2021resolving}

% \subsection{Weight Posteriors}
% %weight space analysis of posteriors; how similar different BNNs weights are to HMC weight posteriors; how is the difference correlated with performance metrics; 

% % which BNNs have flatter weight distributions?; Do flatter distributions allow one to find flatter minima in the loss function? do BNNs that have flatter weight distributions generalise well? (see performance on MBUncert, Hybrids, MIGHTEE) 

% Using samples from HMC as a benchmark, we analyse weight posteriors from different approximations for the last layer of our model. 

% For VI and LLA - can use z-statistic
% Compare the mean values of the weights for Dropout/MAP??

% \citep{qiu2023should} 

% \section{Practical Considerations}
%will change sections/subsections later
%could discuss pros and cons for each method here in addition to the challenges in implementation
% Each inference method comes with its set of trade-offs. 

% \subsection{Spurious Correlations in VI} 

% In practice, the scale parameter is often parameterised as $\sigma = \log (1 + \exp(\rho))$ or $\sigma = \exp(\rho)$. \citet{kim2023convergence} show that using a non-linear diagonal conditioner in the scale matrix prevents the ELBO from being strongly-convex and leads to sub-optimal convergence rates. We find that using a non-linear conditioner also leads to spurious correlations in the joint distributions of the weights of our VI models and could explain the sub-optimal performance of previous work on using VI for radio galaxy classification in \citet{mohan2022quantifying}. Weight posteriors for VI with different scale parameterisations are presented in Figure \ref{fig:vi_conditioner}.



% \begin{figure}
% % \begin{center}
%     \begin{subfigure}[u]{0.5\textwidth}
%         \includegraphics[width=\textwidth]{figures/temp/vi posteriors.png}
%         \caption[]{}
%         \label{fig:vi_linear}
%     \end{subfigure} 
 
%     \begin{subfigure}[u]{0.5\textwidth}
%         \includegraphics[width=\textwidth]{figures/temp/vi posteriors (1).png}
%         \caption[]{}
%         \label{fig:vi_nonlinear}
%     \end{subfigure}
    
%     \caption{Posterior distributions for five randomly selected weights from the last layer of our model. Samples from HMC (shown in blue) are overlaid with samples from VI (shown in orange) for (a) non-linear and (b) linear parameterisation of the scale factor.}
    
%     \label{fig:vi_conditioner} 
% % \end{center}
% \end{figure}

% %Appendix \ref{sec:appendix}

% \subsection{Model misspecification in VI}

% \citep{wang2019variational} show that in the posterior predictive distribution, the error due to model misspecification dominates over the error from the variational approximation. 

% The variational approximation error is the total variation distance (TVD) between the variational predictive distribution and the exact predictive distribution. This tends to vanish as the number of data points increases. The error due to model misspecification as measured by the TVD between the exact posterior predictive and the true density dominates over the error from the variational approximation.


% \citep{mohan2023mcmc}



\section{Discussion}
\label{sec:discuss}

% compare to results from other benchmarks


A certain degree of trade-off exists between a model's predictive performance and calibration. While VI has the best predictive performance, HMC without data augmentation is the most well-calibrated model and only $2.5 \%$ less accurate. HMC with data augmentation has a better predictive performance, but is less calibrated than HMC without data augmentation. A similar trend has also been reported by \citet{krishnan2020improving}, who propose a loss function which optimises for both accuracy and calibration. %pareto optmisation

% It is also promising to note that HMC and VI perform compariti


The differences in dataset separability via energy scores for different BNNs can be better understood if we examine the way in which each of these models is being optimised. \citet{lecun2006tutorial} show that many modern learning algorithms can be interpreted as energy-based models. In the energy-based framework, different loss objectives cause certain inputs' energies to be pulled up/down. LLA, Deep Ensembles and MC Dropout are all trained by minimising the negative log likelihood (NLL) loss plus some regularisation term due to weight decay. Our evaluation suggests that NLL training is not able to shape the energy functional well enough to distinguish between the datasets we have considered. While HMC is directly sampling from an energy surface that is proportional to the log of the posterior distribution, in the case of VI the ELBO provides a well optimised surrogate energy function. Our HMC and VI models seem to have learned good energy surfaces. \citet{lecun2006tutorial} also note that softmax probabilities can be considered good if the energy function is estimated well enough from the data. Perhaps this is also why HMC and VI are the better calibrated models among all those we have considered in this work.
% are also more sensitive to different types of distribution shifts. 



Our observations on the cold posterior effect (CPE) contradict the results presented in \citet{izmailov2021bayesian}. They suggest that the CPE is largely due to data augmentation. While our HMC model does not require any tempering, the VI models require temperatures below $T = 0.01$ to produce good predictive performance. We also found that data augmentation does not have a significant impact on the CPE observed in our models. Finding the cause of the cold posterior effect observed in VI for radio galaxy classification is still an open research question. Thus we find that results from the CS literature where models are trained on terrestrial datasets often do not translate to domain-specific applications. While Deep Ensembles are generally considered a good approximation to the Bayesian posterior, \citet{seligmann2023beyond} recently showed that single-mode BDL algorithms approximate the posterior better than Deep Ensembles. We also find that Deep Ensembles do not work as well as VI and HMC for our application.
% same to be true it . 

%Future work / extensions
% We will include detailed descriptions of future work in the Discussion including the following points: (i) We plan to develop and improve VI further by using alternate optimisation strategies based on natural gradient descent and proximal gradient descent; (ii) Examination of the cold posterior effect with respect to data curation (as stated above); (iii) Additionally, to examine robustness to prior misspecification, we plan to develop different divergence metrics for the ELBO cost function; (iv) Developing the methods for self-supervised learning to exploit larger unlabelled datasets in astronomy (which is still unexplored in the literature)
% The methods that perform well are computationally heavy (e.g. HMC), so the results will not scale well with increased dataset size. Cheaper approximations to the posterior do not perform well. Therefore, there is a need to develop methods like VI to balance the tradeoff between performance and computational cost.

Through this work, we have identified VI as the most promising method for our application given the computational cost of HMC. In future work, we plan to develop and improve our VI implementation further by using alternate optimisation strategies based on natural gradient descent \citep{shen2024variational, khan2021bayesian} and proximal gradient descent \citep{kim2023convergence}. We also plan to investigate the cold posterior effect further, both experimentally and theoretically. To do this we will examine the effect of data curation, which requires the creation of new datasets. Additionally, to examine robustness to prior misspecification, we plan to develop different divergence metrics for the ELBO cost function. Future work could also develop BNNs for self-supervised learning to exploit larger unlabelled datasets in astronomy.
% and plan to develop it further in future work.

\section{Conclusions}
In this work we have evaluated different Bayesian neural networks for the classification of radio galaxies. We found that Hamiltonian Monte Carlo and variational inference perform well at our model and dataset scales for the three criteria we considered: predictive performance, uncertainty calibration and ability to detect distribution shift. Commonly used Bayesian NNs such as MC Dropout and Deep Ensembles are poorly calibrated for our application. Since HMC is very computationally heavy, optimising VI for future radio surveys might be the way forward. 

% \textbf{Function vs weight space eval}:  
% Is it similar/different to the CS literature

% \textbf{Predictive performance vs uncertainty calibration trade-off}: 
%verify with different splits in the dataset


%Resolved with data shuffling
%Different design choices lead to different trade-offs between predictive accuracy and uncertainty calibration error. For example, in case of MC Dropout, if we apply dropout before the last two linear layers, we get better predictive performance compared to applying dropout only before the last layer, but this comes at the cost of increased UCE, see Table \ref{tab:eval}. %MC Dropout is one of the most widely used BDL posterior approximations, thus practitioners must be cautious while evaluating their models. 

% \section{Limitations/Future Work} %internal

% scale



% Optimisation strategies
% Natural gradient descent VI
%Proximal gradient descent VI


\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
                
AMS gratefully acknowledges support from an Alan Turing Institute AI Fellowship EP/V030302/1.
\end{acknowledgements}

% References
\bibliography{uai2024-template}

% \newpage
% \appendix
% \section{Additional results/plots}
% \label{sec:appendix}

% R_hat histogram for chains with and without augmentation 
% \onecolumn


% \paragraph{What is title case?}
% \href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
% \begin{quote}
%     Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
%     When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
% \end{quote}

% \subsubsection{Citations}

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.


% \section{Math}\label{sec:math}

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works



% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}



% \begin{figure}[!htb]
%   \centering
%   \includegraphics[width=0.7\linewidth]{barcelona.jpg}
%   \caption{A View of a Nice City.}\label{fig:city}
% \end{figure}

% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \title{Title in Title Case\\(Supplementary Material)}
% \maketitle



% This Supplementary Material should be submitted together with the main paper.




% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%         \toprule % from booktabs package
%         \bfseries Dataset & \bfseries Result\\
%         \midrule % from booktabs package
%         Data1 & 0.12345\\
%         Data2 & 0.67890\\
%         Data3 & 0.54321\\
%         Data4 & 0.09876\\
%         \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%     F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.


\end{document}
