\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands

    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage[nameinlink,capitalise]{cleveref}
\usepackage{amsfonts}
\usepackage[nice]{nicefrac}
\usepackage{graphicx}                 % Including graphics and using colours
\usepackage{xcolor}                   % Defined more color names
\usepackage{eso-pic}                  % Watermark and other bag
\usepackage{float}
\usepackage{caption}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\bo}[1]{\boldsymbol #1}
\newcommand{\red}[1]{\textcolor{red}{ #1}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\title{Towards Scalable Bayesian Transformers: Investigating stochastic subset selection for NLP}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Peter J.T. Kampen}{}}
\author[1,*]{Gustav R.S. Als}
\author[1]{Michael Riis Andersen}

% Add affiliations after the authors
\affil[1]{%
    DTU Compute\\
    Technical University of Denmark\\
    Kongens Lyngby, Denmark
}
\affil[*]{%
    These authors contributed equally\\
}
  
\begin{document}
\maketitle

\begin{abstract}
Bayesian deep learning provides a framework for quantifying uncertainty. However, the scale of modern neural networks applied in Natural Language Processing (NLP) limits the usability of Bayesian methods. Subnetwork inference aims to approximate the posterior by selecting a stochastic parameter subset for inference, thereby allowing scalable posterior approximations. Determining the optimal parameter space for subnetwork inference is far from trivial. In this paper, we study partially stochastic Bayesian neural networks in the context of transformer models for NLP tasks for the Laplace approximation (LA) and Stochastic weight averaging - Gaussian (SWAG). We propose heuristics for selecting which layers to include in the stochastic subset. We show that norm-based selection is promising for small subsets, and random selection is superior for larger subsets. Moreover, we propose Sparse-KFAC (S-KFAC), an extension of KFAC LA, which selects dense stochastic substructures of linear layers based on parameter magnitudes. S-KFAC retains performance while requiring substantially fewer stochastic parameters and, therefore, drastically limits memory footprint.
\end{abstract}

\section{Introduction}\label{sec:intro}
The field of Natural Language Processing (NLP) has seen extraordinary advances recently with the introduction of the transformer architecture \citep{vaswani2017attention,devlin2019bert,openai2023gpt4}. Increased size and variety of data have improved calibration. However, many models still suffer from overconfident predictions \citep{askForCalibration,calibrationLanguageModel,finetuneCalibration}. Bayesian deep learning tackles uncertainty estimation by modeling neural network parameters as distributions, thereby gaining an inherent interpretation of model confidence. High-dimensional parameter spaces nevertheless limit application on transformer models. The advent of subnetwork inference can potentially improve Bayesian inference in transformer models by estimating the posterior distribution over a subset of the model weights while treating the majority of the weights as deterministic. Hence, the dimensionality of the parameter space of the stochastic subnetwork will be significantly reduced, thus increasing the feasibility of applying Bayesian methodology \citep{daxberger2022bayesian,sharma2023bayesian}.

Bayesian methods for transformers have been explored with varying results \citep{tran2019bayesian,chen2023calibrating,cinquin2021pathologies}. \citet{sharma2023bayesian} finds that partial stochastic neural networks (NN) can outperform their fully stochastic versions using feed-forward neural networks for regression and convolutional neural networks for image classification. In this paper, we investigate whether the hypothesis of partial stochasticity improving predictive performance compared to both point estimates and fully stochastic solutions can be extended to the NLP domain. %We conduct experiments on three GLUE tasks \citep{GLUE_wang2019glue}.
The concept of partial stochasticity raises the question of how to select the stochastic subset. We propose heuristics for selecting which affine transformations to include in the stochastic subset for a transformer encoder model (DistilBERT \citep{sanh2020distilbert}). The study is performed using two methods for approximating the posterior distribution: Stochastic weight averaging - Gaussian (SWAG) \citep{swag} and Laplace approximation (LA) \citep{mackayPracticalFramework,daxberger2022laplace}. Both methods rely on a maximum a posteriori (MAP) parameter estimate, where the stochasticity is induced post hoc to the designated parameters. To study the trade-off between performance and memory requirements, we construct \textit{ramping} experiments, where the fraction of stochastic vs. total number of parameters varies, such that the performance between different subsets can be evaluated.

We propose a selection strategy for the Kronecker-Factored Approximate (KFAC) LA, dubbed Sparse-KFAC (S-KFAC). Here, fully connected substructures of all linear layers are included in the stochastic subset based on parameter magnitudes. Thus, it aims to capture uncertainty from all components of the model while limiting the number of stochastic parameters. Below, we summarize our contributions:

\begin{enumerate}
    \item We design and implement a set of numerical experiments to investigate the hypothesis of partial stochasticity improving predictive performance for transformer models in the NLP domain.
    %We survey the effects of introducing partial stochasticity in transformers, and seek to validate partial stochastic modeling outperforming fully stochastic modeling presented in \citet{sharma2023bayesian}.
    \item We propose and evaluate novel heuristics for efficient identification of optimal subsets of stochastic parameters.
    \item We demonstrate that the proposed S-KFAC Laplace approximation yields competitive predictive performance for substantially fewer stochastic parameters, leading to a reduced memory footprint.
\end{enumerate}

\section{Related Works}

\textbf{Bayesian Deep Learning Methods.} Bayesian deep learning is a promising approach for uncertainty quantification \citep{review_uncertainty_quantification}. Regrettably, the dimensionality of the parameter space in large neural networks makes exact posterior inference intractable. Several algorithmic approaches for approximating the posterior $p(\bo{w}|\mathcal{D})$ have been proposed \citep{magris2023bayesian}. Markov Chain Monte Carlo (MCMC) methods are prominent due to their ability to sample from the true posterior and Hamiltonian Monte Carlo (HMC) \citep{nealHMC} is the gold-standard posterior inference method by leveraging Hamiltonian dynamics in its sampling strategy, but HMC is prohibitively slow for even moderately sized models \citep{izmailov2021bayesian}. Variational inference (VI) has been tested extensively \citep{towardsBayesianDL,swiatkowski2020ktied,tran2019bayesian}. It aims to approximate $p(\bo{w}|\mathcal{D})$ using a fixed distributional family, often chosen as a Mean-Field (MF) Gaussian. However, it is known to exhibit pathological behavior \citep{foong2020expressiveness}. Estimating $p(\bo{w}|\mathcal{D})$ using the Gaussian functional form is also explored in the LA, where a Gaussian is fitted at the mode of the posterior distribution, using the inverse curvature as a covariance approximation \citep{mackayClassification,mackayPracticalFramework}. The post hoc and predictive performance-preserving nature of the LA makes it readily applicable \citep{daxberger2022laplace}. Methods utilizing Stochastic Gradient Descent (SGD) iterates as samples from the approximate posterior such as Stochastic weight averaging (SWA) \citep{stochastic_weight_averaging} and SWA-Gaussian (SWAG) \citep{swag} have also been examined. Much less computationally demanding methods like Monte Carlo (MC) dropout exist, where the dropout mechanism normally used at training time for regularization is extended to inference time \citep{uncertaintyQuantification}. 
This, too, has been shown to exhibit pathological behavior \citep{foong2020expressiveness}. 

\textbf{Subnetwork Inference In Neural Networks}. The concept of treating only a subset of the model parameters as stochastic has increased in popularity recently \citep{daxberger2022bayesian,sharma2023bayesian}. In \citet{daxberger2022laplace}, the linearized LA \citep{immer2021improving} is adapted for subnetworks and made readily applicable for neural networks using the generalized Gauss-Newton (GGN) Hessian approximation. The efficacy of the approximate posterior inference over a stochastic subset of neural network parameters has been investigated in \citet{sharma2023bayesian}. They argue that partially stochastic models are no less theoretically founded than fully Bayesian models. Additionally, they show that inference over a stochastic subset can, at times, yield better performance than full-model stochasticity. Finally, injecting auxiliary stochastic variables through node-based methods has been explored in \citet{nodebased_rank1,nodebased_covariate_shift}. This achieves partial stochasticity without altering the existing model parameters.

\textbf{Bayesian Methods in NLP.} Transformer-based models for NLP have improved rapidly in recent years \citep{vaswani2017attention, devlin2019bert, openai2023gpt4}. However, after the fine-tuning process, problems such as overconfidence and sub-optimal calibration persist \citep{askForCalibration,calibrationLanguageModel,finetuneCalibration}. In \citet{xiao2022uncertainty}, calibration of pre-trained language models is investigated for methods that imitate uncertainty modeling, such as temperature scaling \citep{temperature_scaling} and MC Dropout \citep{gal2016dropout}. Subnetwork inference on transformer models has been tested using Gaussian Mean-Field VI in \citet{xue2021bayesian, cinquin2021pathologies}, and through Sparse Gaussian Processes in \citet{chen2023calibrating}. In \citet{bayesian_lora}, the Laplace approximation is applied to Low-Rank Adaptation (LoRA) for fine-tuning large language models. Additionally, Last Layer Laplace showed similar performance to temperature scaling when applied to common NLP tasks in \citet{daxberger2022laplace}. In \citet{uncertainty_aware_swag_nlp}, Stochastic Weight Averaging - Gaussian \citep{swag} was used to induce uncertainty awareness in Natural Language Inference tasks.
\begin{figure*}
    \centering
    \includegraphics[width = 0.95\linewidth]{Graphics/attention_block_v2.pdf}
    \caption{Depiction of a transformer block with stochastic parameters as induced by the Sparse-KFAC LA (\cref{sec: skfac_theory}). Note that each linear layer is drawn in neural-network style, where small dense substructures have been made stochastic, indicated by \textcolor[rgb]{0,0.7,0}{green} lines rather than \textcolor{gray}{gray} for the deterministic weights. `Soft' denotes the softmax function.}
    \label{fig: skfac_figure}
\end{figure*}

\section{Background}\label{sec: background}
In this paper, we study partially stochastic Bayesian neural networks with a primary focus on supervised classification tasks. A neural network $f_{\bo{w}}:  \mathbb{R}^D \rightarrow \mathbb{R}^C$, where $D$ is the input dimension and $C$ is the number of classes, parameterized by $\bo{w}$ is introduced. The likelihood of the data $\mathcal{D} = \{(\bo{x}_i, t_i)\}_{i=1}^{N}$ given the parameters $\bo{w}$ is then written as $p(\mathcal{D}|\bo{w}) = \prod_{i=1}^N p(t_i | f_{\bo{w}}(\bo{x}_i))$. In Bayesian deep learning, Bayes' theorem relates a prior distribution over the neural network parameters $p(\bo{w})$ to a posterior $p(\bo{w}|\mathcal{D})$ through a likelihood $p(\mathcal{D}|\bo{w})$ and a marginal distribution $p(\mathcal{D}) = \int p(\mathcal{D}|\bo{w})p(\bo{w}) d\bo{w}$ over the data, by $p(\bo{w} | \mathcal{D}) = \frac{p(\mathcal{D}| \bo{w}) p(\bo{w})}{p(\mathcal{D})}$. In this paper, we generally assume a Gaussian prior distribution $p(\bo{w}) = \mathcal{N}(\bo{w}|\bo{\mu}, \bo{V})$. Predictions are made using the posterior predictive distribution such that for a new data point $\bo{x}^*$ the predictive distribution becomes $p(t^*|\bo{x}^*,\mathcal{D}) = \mathbb{E}_{p(\bo{w}|\mathcal{D})}\left[p(t^* | f_{\bo{w}}(\bo{x}^*))\right]$. As the integral of the marginal $p(\mathcal{D})$ is often intractable for neural networks, we have to rely on approximate posterior inference techniques such as SWAG and the LA to perform Bayesian inference. In both methods, we approximate the posterior distribution $p(\bo{w}|\mathcal{D})$ with a Gaussian distribution $q(\bo{w}) = \mathcal{N}(\bo{w} | \bo{\mu}, \bo{\Sigma})$ such that $q(\bo{w}) \approx p(\bo{w}|\mathcal{D})$.

We define a partially stochastic Bayesian neural network $f_{\bo{w}}$ parameterized by $\bo{w} = \bo{w}_D \cup \bo{w}_S$. Here $\bo{w}_D$ and $\bo{w}_S$ are disjoint sets encompassing all the model parameters, where $\bo{w}_D$ are the deterministic parameters (point estimates) and $\bo{w}_S$ are the stochastic parameters (distributional). %For stochastic subsets that are not vanishingly small, relative to the deterministic parameter set, exact posterior inference remains intractable.
Hence, the partially stochastic posterior is approximated as $p(\bo{w}|\mathcal{D})  \approx p(\bo{w}_S|\mathcal{D}) \prod_{d} \delta(\hat{\bo{w}}_d - \bo{w}_d)
 \approx q(\bo{w}_S)  \prod_{d} \delta(\hat{\bo{w}}_d - \bo{w}_d),$ where $\delta$ is the Dirac delta distribution and $\hat{\bo{w}}_d$ is the point estimate of the $d$-th deterministic parameter. The following sections deal with approximating the posterior distribution of the stochastic weights, $p(\bo{w}_S|\mathcal{D}) \approx q(\bo{w}_S)$. For readability, we will not use the subset notation, and hence, $p(\bo{w})$ will refer to the distribution of the parameters for the stochastic subnetwork unless otherwise stated.
% \begin{equation}
%     \begin{aligned}
%     p(\bo{w}|\mathcal{D}) & \approx p(\bo{w}_S|\mathcal{D}) \prod_{d} \delta(\bo{w} - \bo{w}_d) \\
%     & \approx q(\bo{w}_S)  \prod_{d} \delta(\bo{w} - \bo{w}_d)
%     \end{aligned}
% \end{equation}



\subsection{Stochastic Weight Averaging - Gaussian (SWAG)}
\citet{swag} introduces SWAG as a method for Bayesian inference using SGD to obtain a Gaussian approximation of the posterior distribution. The first two moments are estimated using running averages of the first and second uncentered moments using Welford's algorithm \citep{welfords_algorithm} seen in \cref{eq: running_moments_welfords}.
\begin{equation}\label{eq: running_moments_welfords}
\overline{\bo{w}}_{n+1} = \frac{n \overline{\bo{w}}_n  + \bo{w}_n}{n + 1}, \quad  \quad \overline{\bo{w}^2}_{n+1} = \frac{n \overline{\bo{w}^2}_n  + \bo{w}_n^2}{n + 1}
\end{equation}
where $\overline{\bo{w}}_n$ and $\overline{\bo{w}^2}_n$ are the first and second running moments, respectively, and the square is taken elementwise. The updates are performed $N$ times over the course of $M$ SGD iterations for $N < M$. For every update, we also save the next column in the deviation matrix given by $\bo{w}_i - \overline{\bo{w}}_i$, where the $k$ most recent columns are kept. The moment estimates are denoted as $\bo{w}_{\text{swa}} = \overline{\bo{w}}_N$, and a diagonal covariance matrix is defined from the moments as $\bo{\Sigma}_{\text{diag}} = \operatorname{diag}\left(\overline{\bo{w}^2}_N - \overline{\bo{w}}_N^2\right)$. In \cref{eq: swag_covariance} a low-rank approximation is done with the $k$ columns of the deviation matrix $\bo{D}$
%\begin{equation}
\begin{align}\label{eq: swag_covariance}
    &\operatorname{Cov}(\bo{w}) = \frac{1}{N-1} \sum_{i = 1}^N (\bo{w}_i - \bo{w}_{\text{swa}}) (\bo{w}_i - \bo{w}_{\text{swa}})^T  \\ & \approx \frac{1}{k-1} \sum_{i = 1}^k (\bo{w}_i - \overline{\bo{w}}_i) (\bo{w}_i - \overline{\bo{w}}_i)^T = \frac{1}{k-1} \bo{D} \bo{D}^T.
\end{align}
%\end{equation}
The covariance matrix is then estimated using a combination of the low-rank approximation and $\bo{\Sigma}_{\text{diag}}$, yielding the following  posterior approximation for the weights $\bo{w}$
\begin{equation}\label{eq: swag_posterior_approx}
    p(\bo{w} \mid \mathcal{D}) \approx \mathcal{N}\left(\bo{w} | \bo{w}_{\text{swa}}, \frac{1}{2} \bo{\Sigma}_{\text{diag}} + \frac{1}{2 (k-1)} \bo{D} \bo{D}^T \right).
\end{equation}
Finally, to sample from the approximate posterior, the following identity is used \citep{swag}
%\begin{equation}
    $\Tilde{\bo{w}} = \bo{w}_{\text{swa}} + \frac{1}{\sqrt{2}} \bo{\Sigma}_{\text{diag}}^{\frac{1}{2}} \bo{z}_1 + \frac{1}{\sqrt{2 (k-1)}} \bo{D} \bo{z}_2$
%\end{equation}
where $\bo{z}_1 \sim \mathcal{N} \left(0, \bo{I}_d\right)$, $\bo{z}_2 \sim \mathcal{N}(0, \bo{I}_k)$, and
$d$ denotes the number of stochastic parameters in the model.
\subsection{The Laplace approximation}\label{sec: laplace_theory}
The Laplace approximation (LA) \citep{mackayClassification,mackayPracticalFramework} approximates the posterior $p(\bo{w} | \mathcal{D})$ using a Gaussian distribution.
\begin{equation}\label{eq: laplace_normaldist}
p(\bo{w}|\mathcal{D}) = \frac{ p(\mathcal{D}|\bo{w}) p(\bo{w})}{p(\mathcal{D})} = \frac{1}{Z} f(\bo{w}) \approx \mathcal{N}(\bo{w}| \bo{\mu}, \bo{\Sigma}),
\end{equation}
where $f(\bo{w})$ is the unnormalized posterior, $\bo{\mu}$ is the mean and $\bo{\Sigma}$ is the covariance, and $Z = \int p(\mathcal{D}|\bo{w}) p(\bo{w}) d \bo{w} $ defines the normalizing constant. We assume a Gaussian prior $p(\bo{w}) = \mathcal{N}(\bo{0}, \sigma^2 \bo{I})$ and apply a second-order Taylor expansion around the MAP solution in log-space
\begin{equation}
    \ln f(\bo{w}) \approx \ln f(\bo{w}_{\text{MAP}}) - \frac{1}{2} (\bo{w} - \bo{w}_{\text{MAP}})^T \mathcal{H}_{\bo{w}} (\bo{w} - \bo{w}_{\text{MAP}}),
\end{equation}
where $\mathcal{H}_{\bo{w}}= - \nabla^2_{\bo{w}} \ln f(\bo{w})|_{\bo{w}_{\text{MAP}}}$ is the Hessian of the negative log-posterior evaluated at the mode. It can be described in terms of the contributions from the likelihood and prior by
\begin{equation}
    \mathcal{H}_{\bo{w}} = \frac{1}{\sigma^2} \bo{I} - \sum_{i = 1 }^N \nabla_{\bo{w}}^2 \ln p(t_i | f_{\bo{w}} (\bo{x}_i)) |_{\bo{w}_{\text{MAP}}}
\end{equation}
where $\sigma^2$ is the prior variance. Finally, the Gaussian posterior approximation can be written as
\begin{equation}
    p(\bo{w}|\mathcal{D}) \approx q(\bo{w}) = \mathcal{N}(\bo{w}|\bo{w}_{\text{MAP}}, \mathcal{H}_{\bo{w}}^{-1}).
\end{equation}

\textbf{The Linearized LA} The Hessian $\mathcal{H}_{\bo{w}}$ is computationally demanding to calculate for large models and datasets. The generalized Gauss-Newton (GGN) \citep{schraudolph2002a,martens2020new} approximates the Hessian using first-order derivatives on the parameters and second-order derivatives on the outputs of the model
\begin{equation}\label{eq: ggn}
\mathcal{G} \equiv  \sum_{n=1}^N \mathcal{J}_{\bo{w}}(\bo{x}_n)^T 
    \nabla_{f_{\bo{w}}}^2 \ln p(t_n|f_{\bo{w_M}}(\bo{x}_n))
    \mathcal{J}_{\bo{w}}(\bo{x}_n).
\end{equation}
Here $f_{\bo{w}_M} = f_{\bo{w}_{\text{MAP}}}$. The covariance of the LA becomes $\bo{\Sigma} = \mathcal{H}_{\bo{w}}^{-1} \approx \left(-\mathcal{G} + \frac{1}{\sigma^2} \bo{I}\right)^{-1}$. The GGN also ensures positive semi-definiteness.
%($\mathcal{H}_{\bo{w}}$ is potentially indefinite)
Estimating the posterior predictive distribution using MC sampling can be sub-optimal \citep{foong2019inbetween}. 
%
In \citet{immer2021improving}, a local linearization of the LA predictive is recommended. A first-order Taylor expansion is done around the model outputs
\begin{equation}
    f_{\bo{w}}(\bo{x}) \approx f_{\bo{w}_{\text{MAP}}}(\bo{x}) + \mathcal{J}_{\bo{w}_{\text{MAP}}}(\bo{x}) (\bo{w} - \bo{w}_{\text{MAP}}),
\end{equation}
where $\mathcal{J}_{\bo{w}_{\text{MAP}}}(\bo{x}^*) = \nabla_{\bo{w}} f_{\bo{w}}(\bo{x}^*) |_{\bo{w}_{\text{MAP}}}$ is the Jacobian of the outputs with respect to the parameters at the MAP solution. The predictive covariance is then defined by $\bo{\Sigma}^* = \mathcal{J}_{\bo{w}_{\text{MAP}}}(\bo{x}^*) \bo{\Sigma} \mathcal{J}_{\bo{w}_{\text{MAP}}}(\bo{x}^*)^T$ for an input $\bo{x}^*$. The posterior predictive for classification can be expressed using the $\operatorname{Softmax}$ as the inverse link function
\begin{equation}
    \begin{aligned}
        p(t^*| \bo{x}^*, \mathcal{D}) = \int_{\mathbb{R}^C} & \operatorname{Softmax}\left(f(\bo{x}^*)\right) \\
        & \mathcal{N} \left(f(\bo{x}^*) | f_{\bo{w}_{\text{MAP}}}(\bo{x}^*), \bo{\Sigma}^* \right) d f(\bo{x}^*),    
    \end{aligned}
\end{equation}
where $C$ is the number of classes and the integral marginalizes over the neural network's outputs $f(\bo{x}^*)$. Following \citet{daxberger2022laplace}, the extended probit approximation is applied such that the predictive distribution becomes
\begin{equation}
    \begin{aligned}
        p(t^* = c| \bo{x}^*, \mathcal{D}) & = \frac{\exp\left(f_{\bo{w}_{M}}(\bo{x}^*)_c T_c\right)}{\sum_{i = 1}^C \exp \left(f_{\bo{w}_{M}}(\bo{x}^*)_i T_i\right)}, \\
        & T_i = \left(1+\frac{\pi}{8} \bo{\Sigma}_{i,i}^*\right)^{-\frac{1}{2}}.
    \end{aligned}
\end{equation}
Note here that only the diagonal covariance structure is considered in the distribution of the outputs.

\textbf{Kronecker Factored Approximate Curvature (KFAC)}. Although the GGN limits the computational requirements through linearization, both it and the Hessian still scale quadratically in the number of parameters, $\mathcal{O}(|\bo{w}|^2)$. To combat this issue, the block-diagonal factorization KFAC \citep{martens2020optimizing} is used to approximate the GGN. This factorizes the empirical Fisher information matrix (EFIM) $\mathcal{F}$ of each layer independently as the product of two smaller matrices. Note that the EFIM is equivalent to the GGN under common likelihoods \citep{martens2020new}. The full EFIM over all parameters is given by
\begin{equation}
    \mathcal{F} = \mathbb{E}_{p(\mathcal{D})}\left[ \nabla_{\boldsymbol{w}}\ln p(\mathcal{D}|\boldsymbol{w}) \nabla_{\boldsymbol{w}}\ln p(\mathcal{D}|\boldsymbol{w})^T \right].
\end{equation}
In the KFAC approximation, layers are assumed to be independent, giving rise to the block-diagonal structure. The Fisher block for the $i$-th layer can be written as
\begin{equation}
    \mathcal{F}_i = \mathbb{E}_{p(\mathcal{D})}\left[ \nabla_{\boldsymbol{w}^{(i)}}\ln p(\mathcal{D}|\boldsymbol{w}) \nabla_{\boldsymbol{w}^{(i)}}\ln p(\mathcal{D}|\boldsymbol{w})^T \right].
\end{equation}
Let $l_i$ be the $i$-th layer and define $\hat{\bo{x}}$ as the input to that layer, then $l_i(\hat{\bo{x}}) = \bo{W} \hat{\bo{x}} = \bo{a}$, where $\operatorname{vec}(\bo{W}) = \bo{w}^{(i)}$ and the bias is omitted for clarity. The $\operatorname{vec}$ operator denotes the vectorization of a matrix, independent of whether it is column- or row-major, as long as there is consistency throughout. $\bo{W} \in \mathbb{R}^{n \times m}$ is the weight matrix, and $\bo{a}$ is the pre-activation. The gradient with respect to the pre-activation is then defined as $\bo{\delta} = \frac{\partial \ln p(\mathcal{D} \mid \bo{w}) }{\partial \bo{a}}$, and the chain rule gives for an input $\bo{x}$ with target $t$
\begin{equation}\label{eq: fisher_derivative}
\frac{\partial \ln p(\mathcal{D} | \bo{w})}{\partial \bo{W}} =  \nabla_{\bo{w}^{(i)}} \ln p(t | \bo{x},  \bo{w}) = \operatorname{vec}(\hat{\bo{x}} \bo{\delta}^T) = \hat{\bo{x}} \otimes \bo{\delta},
\end{equation}
where $\otimes$ is the Kronecker product and $\hat{\bo{x}}$ denotes the input to the specific layer for an input $\bo{x}$ to the model. The KFAC approximation of the Fisher block can then be expressed as
\begin{equation}\label{eq: fisher_block_approx}
    \begin{aligned}
        \mathcal{F}_i &= \mathbb{E}_{p(\mathcal{D})}\left[ \nabla_{\boldsymbol{w}^{(i)}}\ln p(\mathcal{D}|\boldsymbol{w}) \nabla_{\boldsymbol{w}^{(i)}}\ln p(\mathcal{D}|\boldsymbol{w})^T \right] \\
    &= \mathbb{E}[\hat{\bo{x}} \hat{\bo{x}}^T \otimes \bo{\delta} \bo{\delta}^T] \approx   \mathbb{E}[\hat{\bo{x}} \hat{\bo{x}}^T] \otimes \mathbb{E}[\bo{\delta} \bo{\delta}^T] \\
    &= \bo{A} \otimes \bo{G}.
    \end{aligned}
\end{equation}
Here, the expectation of the Kronecker product is approximated by the Kronecker product of the expectations of its parts \citep{martens2020optimizing}. The Kronecker product has the property that its inverse is equal to the Kronecker product of its inverse parts, $(\bo{A} \otimes \bo{G})^{-1} = \bo{A}^{-1} \otimes \bo{G}^{-1}$. This is crucial for the LA, as the inverse GGN can be computed without storing the full matrix.

\section{Sparse-KFAC Laplace Approximations}\label{sec: skfac_theory}

In this paper, we propose a selection strategy for efficiently defining stochastic subsets in the KFAC LA, an illustration of which can be seen in \cref{fig: skfac_figure}. For the LA, we observed that in small-scale MLPs, selecting the parameters with the highest $\ell_1$ norm performed better than random selection and selection based on variances as proposed by \citet{daxberger2022bayesian} (\cref{sec: uci_results}). Only a small percentage of stochastic parameters was required to perform on par with full stochasticity if these were chosen based on their norm. Given the better performance of the largest $\ell_1$-norm selection strategy for MLPs, we extend this concept into the space of Transformer models. The size of the transformer model applied to NLP problems restricts the LA to using KFAC. The KFAC LA methodology (\cref{sec: background}) approximates the derivatives of a layer $i$ using the inputs $\hat{\bo{x}}$ and output derivatives $\bo{\delta}$ (\cref{eq: fisher_block_approx}). Selecting a stochastic subset based on the $\ell_1$-norm of the weights in the neural network is therefore not trivial, as the GGN is in a factorized form. Hence, selecting the stochastic parameters requires consideration of the factorization procedure. The gradients of a selection of parameters from a weight matrix $\bo{W}$ can be factorized via the KFAC if they form a submatrix of $\bo{W}$, where the submatrix is formed by deleting rows or columns in $\bo{W}$. Freely selecting the parameters in $\bo{W}$ with the highest $\ell_{1}$-norm will most likely not form a submatrix. 

In the S-KFAC method, we relax the requirement of only selecting the weights with the largest $\ell_1$ norms to a requirement where the largest $\ell_{\infty}$-norm for the rows $\bo{r}$ and columns $\bo{c}$ is selected. From the duality of the $\ell_{\infty}$ and $\ell_{1}$ norms \citep{ole_christensen_function_spaces}, we know that the largest $\ell_1$ norm parameters are placed in the rows and columns of $\bo{W}$ with the largest $\ell_{\infty}$ norm. We then construct the stochastic subset using the intersection between the rows and columns with the largest $\ell_{\infty}$-norm. As such, a proxy for $\ell_1$-norm selection is done through the $\ell_{\infty}$-norm, where we cannot guarantee all largest $\ell_1$-norm weights to be selected, but a considerable portion is included.

We recall the linear layer formulation $l(\hat{\bo{x}}) = \bo{W}\hat{\bo{x}} = \bo{a}$ used in deriving the Fisher block approximation in \cref{eq: fisher_derivative,eq: fisher_block_approx}. We consider the sets of $\ell_{\infty}$ norms of the rows $\bo{r} = \{\lVert\bo{W}_{i,:}\rVert_{\infty}\}_{i = 1}^n$ and columns $\bo{c} = \{\lVert\bo{W}_{:, j}\rVert_{\infty}\}_{j=1}^m$ in the weight matrix $\bo{W} \in \mathbb{R}^{n \times m}$. Importantly, since $n$ is the number of outputs and $m$ the number of inputs, $\bo{r}$ and $\bo{c}$ correspond to entries in $\bo{\delta}$ and $\hat{\bo{x}}$ respectively.

A predefined percentage $p$ is defined as a hyperparameter for controlling the subset size, e.g. $p=10$ would result in selecting $10\%$ of the rows and columns with the largest $\ell_{\infty}$-norm values. We then select subsets of $\hat{\bo{x}}$ and $\bo{\delta}$ by
\begin{equation}\label{eq: row_column_skfac}
\begin{aligned}
     \Tilde{\bo{x}} &= \{\hat{\bo{x}}_j | c_j > P_p(\bo{c}), j \leq m\}, \\
     \Tilde{\bo{\delta}} &= \{\bo{\delta}_i  | r_i > P_p(\bo{r}), i \leq n\}.
\end{aligned}
\end{equation}
Here $P_p$ maps a set of norms to the threshold exceeded by its $p\%$ largest elements, i.e.\ the $(100-p)$-th percentile of the set. The Fisher block representing the selected stochastic subset is then constructed by

\begin{equation}\label{eq: fisher_block_sparse}
    \mathcal{F}_i \approx  \mathbb{E}[\Tilde{\bo{x}} \Tilde{\bo{x}}^T] \otimes \mathbb{E}[\Tilde{\bo{\delta}} \Tilde{\bo{\delta}}^T] =  \Tilde{\bo{A}} \otimes \Tilde{\bo{G}}.
\end{equation}

To relate this back to the specific weight matrix, the stochastic parameters become $\{\bo{W}_{i,j} | r_i > P_{p}(\bo{r}) \land c_j > P_{p}(\bo{c}), i \leq n, j \leq m\}$, i.e.\ the parameters lying in the intersection between the selected rows and columns. A visualization of a selected stochastic subset for a single transformer block is shown in \cref{fig: skfac_figure}. The method is then extended to all layers, such that the stochastic subset is distributed over the entire network while controlling the subset size using the hyperparameter $p$.

In most cases, the linear mapping will be affine, i.e.\ including a bias term. For a bias $\bo{b} \in \mathbb{R}^n$ its derivative is given by $\nabla_{\bo{b}} \ln p(\mathcal{D} | \bo{w}) = \bo{\delta}$. As we are already storing $\Tilde{\bo{\delta}}$, we make the corresponding elements in the bias stochastic too. 

The selection made by the S-KFAC method ensures that the $\min(n,m) \times \frac{p}{100}$ largest $\ell_1$ norm parameters are included in the stochastic subset. This corresponds to selecting $\frac{p m}{100}$ input neurons and $\frac{p n}{100}$ output neurons and making the dense linear mapping between those stochastic.

Each input and output is chosen based on a single weight connected to it. Of course, multiple of the largest weights may come from the same input or go to the same output. Yet all weights between the chosen inputs and outputs are made stochastic. This causes the previously mentioned relaxation on choosing only the largest parameter weights, as we have no guarantee on the magnitude of the remaining weights.

We denote this strategy as Sparse-KFAC (S-KFAC) from the ensuing sparsity in the full covariance matrix (see \cref{sec: skfac_appendix}). Since the two `small' matrices in the KFAC approximation are $(n \times n)$ and $(m \times m)$, S-KFAC restricts the memory usage as the combined size of the matrices becomes $(\frac{n p}{100})^2 + (\frac{m p}{100})^2 \ll n^2 + m^2$ for small percentages $p$. Thus, S-KFAC is lightweight in memory while capturing the important uncertainty from the parameter selection. Further, a large portion of the computational load in the KFAC LA method is contained in matrix-matrix multiplications for calculating the posterior covariance. The computational demand of matrix-matrix multiplications has cubic scaling and S-KFAC drastically lowers the matrix dimensionality. Thus, given the same number of stochastic parameters in a partially stochastic model for the KFAC and the S-KFAC methods, S-KFAC will have a significantly lower computational demand at inference time.
\begin{figure*}
    \centering
    \includegraphics[width=0.95\linewidth]{Graphics/SKFAC-all_datasets.pdf}
    \caption{Ramping experiments using dense stochastic subparts of each weight matrix in the entire network over different percentages of stochasticity. We display the results over the three GLUE datasets SST-2, RTE, and MRPC. Points are median over 5 train/validation splits, and error bars are interquartile ranges. For each dataset, we compare with the median of the LLLA evaluated using the full GGN, the temperature-scaled MAP solution, and the MLP module with the largest $\|\bo{W}\|$.}
    \label{fig:ramping_experiments_kfac}
\end{figure*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Experiments
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Experiments \& Methodology}\label{sec: experiments_and_methodology}

As the basis for this study is NLP tasks, we employ a pre-trained DistilBERT transformer encoder architecture~\citep{sanh2020distilbert}. It is beyond the scope of the paper to provide a complete description of the Transformer architectures, and we refer to \citet{vaswani2017attention} for the details. For this study, however, it is important to recall that the multi-head attention mechanism \citep{vaswani2017attention} contains 4 affine mappings: $(\bo{W}_Q, \bo{b}_Q), (\bo{W}_K, \bo{b}_K), (\bo{W}_V, \bo{b}_V), (\bo{W}_O, \bo{b}_O) \in (\mathbb{R}^{n \times n}, \mathbb{R}^n)$, where for the DistilBERT model $n = 768$. To complete the transformer block a one-layer feed-forward model or MLP is used containing affine mappings $(\bo{W}_{\text{in}}, \bo{b}_{\text{in}}) \in (\mathbb{R}^{n \times m}, \mathbb{R}^m)$ and $(\bo{W}_{\text{out}}, \bo{b}_{\text{out}}) \in (\mathbb{R}^{m \times n}, \mathbb{R}^n)$, where $m = 3072$. We use the terminology of `modules' to refer to an affine mapping in either the attention or MLP block. This designation is motivated by the block structure of the KFAC approximation. We will therefore present results comparing performance between modules selected from either the MLP or attention blocks despite the varying number of parameters in them. A significant proportion of the learnable model parameters lie in the token embeddings (tokens are generated through the word-piece algorithm for DistilBERT \citep{wordpiece}). Which of these parameters are used is therefore dependent on the textual input and they are therefore never included in the stochastic subset. Additionally, when we calculate the percentage of stochastic parameters in the model, we do not count the token embeddings in the total number of parameters.
%
We perform our experiments on three GLUE tasks: SST-2 \citep{sst2}, RTE \citep{rte}, and MRPC \citep{MRPC}, all binary classification tasks. Information relating to the datasets is shown in \cref{tab: nlp_datasets}. For each, we fine-tune 5 pre-trained DistilBERT models \citep{distilbert} on 5 train/validation splits of the respective training sets, where we use the validation set for early stopping and hyperparameter tuning for the respective methods for posterior inference. We use accuracy as the early stopping criterion. We keep the pre-allocated development sets as the test sets \citep{GLUE_wang2019glue}. Details can be found in \cref{sec: fine_tuning}. Hence, all experiments are performed 5 times on all three datasets.
We will refer to the resulting point estimates from the fine-tuning as the MAP solution $\bo{w}_{\text{MAP}}$. 
\begin{table}[h]
\centering
\caption{The median accuracy, negative log-likelihood (NLL), and expected calibration error (ECE) for models using $\bo{w}_{\text{MAP}}$ as point estimates evaluated on the test sets.}
\begin{tabular}{@{}llll@{}}
\toprule
Metric   & SST-2 & MRPC & RTE \\ \midrule
NLL  &  0.75 & 0.82  &  1.70  \\
Accuracy & 0.89 &  0.84 &  0.61 \\
ECE   &  0.10 &  0.15 & 0.33  \\ \bottomrule
\end{tabular}
\label{tab: map_results}
\end{table}
%
%
We mainly consider two hyperparameters of interest in the LA and SWAG approximations. For the LA, it is the choice of prior variance, and for SWAG, it is the choice of learning rate. Both are estimated through cross-validation, with NLL on the validation set as the criterion. Additional details for SWAG and KFAC LA are shown in \cref{sec: la_swag_configurations}.


\subsection{Investigating the hypothesis of partial stochasticity in NLP}

\cref{fig:module_ramping_random_swag_la} shows the results for the effects of partial stochasticity vs full stochasticity for the SWAG and KFAC LA posterior approximations on the MRPC and RTE datasets. For both approximations, we iteratively increase the percentage of stochastic parameters in the model, where the stochastic parameters are chosen uniformly at random. We observe that partial stochasticity performs better than the MAP solution for even small percentages of stochastic parameters. Furthermore, we see that partial stochasticity tends to either outperform or perform equally to its fully stochastic variant. The results for the SST-2 dataset can be seen in \cref{sec: ramping_exp}. 
\begin{figure}[h]
    \centering
    \includegraphics[width=0.49\linewidth]{Graphics/combined_la_swa_random_mrpc.pdf}
    \includegraphics[width=0.49\linewidth]{Graphics/combined_la_swa_random_RTE.pdf}
    \caption{Experiment varying the percentages of stochastic parameters in the KFAC LA and SWAG approximations for posterior inference on the MRPC and RTE datasets.}
    \label{fig:module_ramping_random_swag_la}
\end{figure}

\subsection{Stochastic module selection}\label{subsec: module_ramping}
To compare the stochastic subset selection frameworks, we consider 5 ramping schemes. Of these, 3 are modular, i.e. entire modules or linear mappings, including bias, on which to induce stochasticity are selected. Experiments conducted with small-scale feed-forward models indicated that a norm-based selection strategy on individual parameters improved results (see \cref{sec: uci_results}). We extend this to a modular basis by using the operator norm. More specifically, we set $\|\bo{W}\| = \sigma_1(\bo{W})$, where $\bo{W}$ is a linear operator, e.g. one of the weight matrices in the model, and $\sigma_1$ is its largest singular value \citep{ole_christensen_function_spaces}.

In the DistilBERT models, we observed that the operator norms of the weight matrices in the MLP modules were significantly larger than the ones in the self-attention mechanism (see \cref{sec: experiment_background,fig: operator_norm_boxplots}). Thus, we consider three main module ramping experiments: 1) iteratively adding the next module with the largest operator norm from the MLP mappings to the stochastic subset, 2) the same procedure as in 1) but from the attention mechanism, and 3) random selection over the entire model. Note here that we include the classifier in the MLP grouping. In \cref{fig:module_ramping_laplace_sst2}, we present the results from these modular ramping schemes on the SST-2 dataset using the KFAC LA.
%We do not include the MAP solution, as the change over modules would otherwise be indiscernible. 

The MAP results can be seen in \cref{tab: map_results}. We observe that the most immediate drop in NLL happens for the max $\|\bo{W}\|$ MLP module strategy. However, note that those linear layers contain four times as many parameters as a linear layer in the attention mechanism (see \cref{sec: experiments_and_methodology}). Additionally, random selection significantly outperforms norm-based selection from the inclusion of 8 layers. Similar tendencies were observed for the other two tasks, see \cref{sec: ramping_exp}.

\begin{figure}[h]
    \centering
\includegraphics[width=0.49\linewidth]{Graphics/laplace_plot_one_full_nll.pdf}
\includegraphics[width=0.49\linewidth]{Graphics/ranking_plot_all.pdf}
%
\caption{\textbf{Left}: Max $\|\bo{W}\|$ ramping experiments for the attention and feed-forward modules, along with random selection on the SST-2 dataset, using LA as the posterior approximation. \textbf{Right}: Rankings of each modular stochastic subset selection scheme across the 3 datasets/tasks for each number of stochastic modules, where 0 is optimal.}
   
    \label{fig:module_ramping_laplace_sst2}
\end{figure}
%
\begin{figure}[h]
    \centering
    \includegraphics[width=0.9\linewidth]{Graphics/swag_la_comparison_plot_all_datasets.pdf}
    \caption{Comparison of negative log-likelihood (NLL) and expected calibration error (ECE) as a function of memory footprint for each ramping scheme for each dataset. For each ramping scheme, we select the best-performing level of stochasticity in terms of NLL. Error bars are interquartile ranges across 5 train/validation splits. The best-performing SWAG ramping scheme is included for comparison.}
    \label{fig: comparison_all}
\end{figure}

To decorrelate the performance increase for the max operator norm selection strategy from the additional stochastic parameters, we perform similar experiments by ramping from the minimum rather than the maximum (see \cref{sec: ramping_exp}). As the focus is to ascertain optimal selection strategies, we display the rankings of these 5 operator norm-related experiments---including random---in \cref{fig:module_ramping_laplace_sst2}. For each dataset, we compute the ranking of the 5 selection schemes. For each module, we present the median ranking of the respective method. Note that to have a 0 ranking, the method must have the highest performance on at least 2 of the datasets. We observe that selecting MLP modules with the largest operator norm is the highest-performing strategy across the 3 tasks for 1, 2, and 3 modules. Comparatively, selecting the minimum is the worst-performing strategy in that range.

Finally, we observe that random selection, after the inclusion of 5 layers, outperforms the other methods. The observation that random selection outperforms the other selection heuristics for a sufficiently large number of stochastic modules is not necessarily intuitive. It could indicate that each mechanism (MLP \& attention) contributes differently to the uncertainty of the model, and that it is beneficial to capture both `types' rather than thinking only in terms of the number of stochastic parameters.
 

\begin{table*}[h]
\centering
\caption{Results for ramping experiments conducted on the three GLUE tasks. For these experiments, we restrict the number of stochastic parameters in the model to be approx. 10 pct. of all model parameters. The NLL and ECE are reported on all tasks for the MAP solution, temperature scaling, fully stochastic KFAC LA, and SWAG. We show $\pm$ standard errors, where $\pm 0.00$ indicates the uncertainty is below $0.005$. The LLLA is calculated using the full GGN approximation. The best overall performance for each dataset is highlighted in \textbf{bold}, and the best partially stochastic method is highlighted with $\underline{\text{underline}}$.}
\begin{tabular}{lllllll}
\toprule
& \multicolumn{2}{c}{\textbf{SST-2}} &  \multicolumn{2}{c}{\textbf{MRPC}}  & \multicolumn{2}{c}{\textbf{RTE}} \\ \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
\textbf{Methods} & \multicolumn{1}{c}{\textbf{NLL}$\downarrow$}  & \multicolumn{1}{c}{\textbf{ECE}$\downarrow$} &   \multicolumn{1}{c}{\textbf{NLL}$\downarrow$} &  \multicolumn{1}{c}{\textbf{ECE}$\downarrow$}  &\multicolumn{1}{c}{\textbf{NLL}$\downarrow$}  & \multicolumn{1}{c}{\textbf{ECE}$\downarrow$}  \\
\midrule
 MAP  & $0.74\pm0.10$ & $0.11\pm0.01$ &   $0.57\pm0.34$ & $0.14\pm0.04$ &    $1.71\pm0.23$ & $0.33\pm0.05$    \\
 \midrule
\textbf{LA} \\
Temp. Scaled & $0.36\pm0.02$ & $0.06\pm0.00$ &    $0.44\pm0.01$ & $0.05\pm0.02$ &    $0.66\pm0.01$ & $0.13\pm0.04$    \\
 Min $||\bo{W}||$ MLP & $0.33\pm0.02$ & $0.06\pm0.01$ &    $0.42\pm0.01$ & $0.07\pm0.01$ &    $0.67\pm0.01$ & $0.08\pm0.03$    \\
 Max $||\bo{W}||$ attn. & $0.29\pm0.00$ & $0.04\pm0.00$ &    $0.39\pm0.02$ & $\underline{\boldsymbol{0.04}}\pm0.01$ &    $0.81\pm0.11$ & $0.13\pm0.03$    \\
Min $||\bo{W}||$ attn. & $\underline{0.28}\pm0.00$ & $0.04\pm0.01$ &    $0.39\pm0.02$ & $0.05\pm0.00$ &    $0.72\pm0.05$ & $0.11\pm0.02$    \\
Random & $\underline{0.28}\pm0.01$ & $0.04\pm0.01$ &    $0.38\pm0.02$ & $0.05\pm0.01$ &    $0.71\pm0.05$ & $0.14\pm0.03$    \\
S-KFAC & $\underline{0.28}\pm0.00$ & $0.04\pm0.01$ &    $\underline{\boldsymbol{0.37}}\pm0.02$ & $\underline{\boldsymbol{0.04}}\pm0.00$ &    $\underline{\boldsymbol{0.65}}\pm0.01$ & $0.05\pm0.01$    \\
Last Layer & $0.33\pm0.02    $ & $0.06\pm0.00$ &    $0.42\pm0.01$ & $0.07\pm0.01$ &    $0.67\pm0.01$ & $0.08\pm0.03$    \\
\midrule
\textbf{SWAG} \\
 Max $||\bo{W}||$ MLP  & $0.52\pm0.06$ & $0.09\pm0.01$ &    $0.51\pm0.12$ & $0.09\pm0.03$ &    $1.09\pm0.20$ & $0.28\pm0.03$    \\
 Max $||\bo{W}||$ attn. & $0.39\pm0.08$ & $\underline{\boldsymbol{0.03}}\pm0.04$ &    $0.42\pm0.18$ & $0.07\pm0.05$ &    $0.74\pm0.02$ & $0.11\pm0.03$    \\
 Random  & $0.32\pm0.10$ & $0.04\pm0.02$ &    $0.39\pm0.06$ & $0.05\pm0.01$ &    $0.69\pm0.01$ & $\underline{\boldsymbol{0.02}}\pm0.02$    \\
 Sublayer $\ell_1$ & $0.39\pm0.13$ & $0.07\pm0.03$ &    $0.44\pm0.04$ & $\underline{\boldsymbol{0.04}}\pm0.02$ &    $0.69\pm0.00$ & $\underline{\boldsymbol{0.02}}\pm0.01$    \\\midrule 
Fully Stoch. (\textbf{LA}) & $\boldsymbol{0.27}\pm0.01$ & $\boldsymbol{0.03}\pm0.00$ &    $0.39\pm0.01$ & $0.06\pm0.00$ &    $0.66\pm0.01$ & $0.06\pm0.01$    \\
Fully Stoch. (\textbf{SWAG}) & $\boldsymbol{0.27}\pm0.06$ & $\boldsymbol{0.03}\pm0.03$ &    $0.62\pm0.11$ & $0.07\pm0.02$ &   $0.69\pm0.00$ & $0.05\pm0.02$    \\
\bottomrule
\end{tabular}
\label{tab: all_ramping_scheme_comparison}
\end{table*}

\subsection{Sparse-KFAC Laplace}\label{subsec: skfac_results}

We now assess the performance of the S-KFAC selection scheme. We conduct three ramping experiments, where we vary the percentage of stochastic parameters in the model according to the methodology described in \cref{sec: skfac_theory}. The results can be seen in \cref{fig:ramping_experiments_kfac}. We compare the results with the commonly used Last Layer LA (LLLA) \citep{daxberger2022laplace} with full GGN approximation, Temperature scaling \citep{temperature_scaling}, max $||\bo{W}||$ MLP with one module and full stochasticity. Note that the number of floats corresponds to percentages as seen in \cref{sec: skfac_appendix,tab: sublayer_percentiles}.

On the SST-2 and MRPC, there exist several percentages of stochasticity for which S-KFAC requires less memory than the LLLA yet yields a significantly lower NLL. Additionally, it repeatedly outperforms the single module with max $||\bo{W}||$, again while requiring less memory. The exception is the RTE dataset, in which temperature scaling and the LLLA yield comparable performance to S-KFAC. The single MLP module outperforms LLLA on 2 out of 3 datasets. However, even a single module requires more memory than S-KFAC, with approx. 10 pct of the model parameters. 

Moreover, the percentage of stochastic parameters (Num. Floats) at which the best NLL is achieved for the S-KFAC method varies across the three datasets. Hence, applications of the methodology will require a sweep through cross-validation to estimate the percentage for optimal performance. Recall that given percentages $p_1 < p_2$ the following will hold for the stochastic subsets in the S-KFAC method: $\bo{w}_{S, p_1} \subset \bo{w}_{S, p_2}$. Therefore, the amount of uncertainty captured by the S-KFAC using $p_2$ is unlikely to be significantly lower than by using $p_1$. We hypothesize that this near-convex behavior may assist the estimation of $p$.

\subsection{Selection strategies for SWAG \& LA}
\cref{fig: comparison_all} compares the S-KFAC, the three LA modular selection schemes, and random module selection for the SWAG approximation. For each selection strategy, we select the level of stochasticity yielding the lowest validation NLL. The random selection scheme was found to perform best for SWAG (see \cref{tab: all_ramping_scheme_comparison}). We observe that the best performance is often found using random selection on a modular level for the Laplace approximation. However, it is rarely significantly better than the S-KFAC selection strategy, while demanding approximately 7 times more memory on average. Additionally, we observe that SWAG displays similar performance to LA but with a significantly higher computational cost.

\begin{figure}[H]
    \centering
    \includegraphics[width = \linewidth]{Graphics/SST2_calibration_ECE.pdf}
    \caption{Calibration curves on SST-2. The median expected calibration error (ECE) across 5 train/validation splits is displayed for each method. 20 bins are placed based on the prediction probability distribution rather than uniformly. The LA and SWAG are shown for 8 randomly selected stochastic linear layers. S-KFAC is shown for $1.588\%$ of parameters being stochastic.}
    \label{fig: calibration_sst2}
\end{figure}

In \cref{tab: all_ramping_scheme_comparison}, we present results for all subset selection strategies. However, we limit the percentage of stochastic parameters in the model to approximately 10 pct. of the full parameter set. We additionally compare with full stochasticity for both LA and SWAG. 
In \cref{fig: comparison_all}, we observed that SWAG was quite competitive with the LA when no restrictions were placed on the number of stochastic parameters. However, when limited to approximately 10 pct., it yields significantly higher NLL than all LA selection schemes. Furthermore, we also observe that S-KFAC shows the largest improvement over the MAP solution of all methods in terms of NLL. While SWAG often attains the lowest ECE, it does not manage to retain performance on NLL. Finally, on two out of three tasks, full model stochasticity yields worse performance than the partial schemes for SWAG and LA.

\cref{fig: calibration_sst2} shows the calibration curves for a selection of the subset selection methods. The curves for the other datasets can be seen in \cref{sec: calibration}. The three high-performing methods, random selection LA and SWAG, along with S-KFAC, are all observed to alter the underlying structure of the predictive distribution. The LLLA and temperature scaling appear to shift inwards the extreme probabilities, approximately down to the model accuracy (see \cref{tab: map_results}). Squeezing the probabilities inwards lowers the ECE. However, it does facilitate improvement in post hoc applications such as thresholding.

\section{Conclusion}

In this paper, we perform a detailed study of the effects of Bayesian inference for a partially stochastic subset of parameters in transformer models for NLP tasks. We validate the efficacy of subnetwork inference presented in \citet{daxberger2022bayesian, sharma2023bayesian} for transformers across three GLUE tasks. The Subnetwork inference methodology is evaluated on the posterior approximation methods Stochastic weight averaging - Gaussian (SWAG) and the Laplace approximation (LA). We find that stochastic subset inference unequivocally outperforms the MAP solutions and generally displays similar or improved performance compared to fully stochastic variants. 

We propose and evaluate heuristics for selecting the size of the stochastic subset on a modular level. We selected linear mappings to include in the stochastic subset from the MLP and attention blocks based on their operator norms. We found that norm-based selection yielded the best performance for small stochastic subsets. For larger numbers of modules, a random selection scheme is dominant, indicating that the MLP and attention components contribute differently to the uncertainty of the model. Given this finding, we conclude that a homogeneous distribution of stochastic parameters is preferred.

We proposed a novel method, Sparse-KFAC, for selecting stochastic subsets by creating dense stochastic substructures in all linear mappings in the model. We found that Sparse-KFAC invariably yielded competitive or higher performance than all other selection strategies while requiring orders of magnitude fewer parameters. Additionally, we found that when the stochastic subset was limited to 10 pct. of the parameters in the model, Sparse-KFAC outperformed all other methods, including full stochasticity, on two out of three tasks. However, the method introduces an additional hyperparameter defining the percentage of stochastic parameters in each affine mapping. We observed different `speeds of convergence' towards the optimal across the three datasets. Hence, employment of the method requires estimating the parameter through cross-validation. However, the fact that Sparse-KFAC is fully defined through the choice of a percentage of stochastic weights also greatly simplifies the selection process. For example, given memory limitations that allow for a certain percentage $P$ of stochastic parameters, one can achieve this through approximately $n$ MLP modules, $4n$ attention modules, or a combination. The performance of each of these module combinations can vary greatly without clear prior indicators. Hence, the selection process requires an exhaustive search. In Sparse-KFAC, $P$ pct. of the parameters can simply be selected, optionally testing if lower percentages yield similar performance. Sparse-KFAC partially decouples the size of the stochastic subset from the width of the model. This is highly relevant as modern transformer architectures are increasingly wide. As such, the Sparse-KFAC method is readily extendable to larger models, for which stochastic subset selection is an interesting avenue for further research. \let\thefootnote\relax\footnotetext{\href{https://github.com/GustavAls/PartialNLP}{Source code: https://github.com/GustavAls/PartialNLP}}



\newpage
% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Towards Scalable Bayesian Transformers: Investigating stochastic subset selection for NLP\\(Supplementary Material)}
\maketitle

\appendix


\section{Sparse-KFAC}\label{sec: skfac_appendix}

The KFAC approximation in \cref{eq: fisher_block_approx} is derived as follows:
\begin{align*}
    \mathcal{F}_i &= \mathbb{E}_{p(\mathcal{D})}\left[ \nabla_{\boldsymbol{w}^{(i)}}\ln p(\mathcal{D}|\boldsymbol{w}) \nabla_{\boldsymbol{w}^{(i)}}\ln p(\mathcal{D}|\boldsymbol{w})^T \right] \\
    &=  \mathbb{E}_{p(\mathcal{D})}\left[(\hat{\bo{x}} \otimes \bo{\delta}) (\hat{\bo{x}} \otimes \bo{\delta})^T\right] \\
    &=  \mathbb{E}_{p(\mathcal{D})}\left[(\hat{\bo{x}} \otimes \bo{\delta}) (\hat{\bo{x}}^T \otimes \bo{\delta}^T) \right] \\
    &=  \mathbb{E}_{p(\mathcal{D})}\left[\hat{\bo{x}} \hat{\bo{x}}^T \otimes \bo{\delta} \bo{\delta}^T \right] \\
    &\approx  \mathbb{E}[ \hat{\bo{x}} \hat{\bo{x}}^T] \otimes \mathbb{E}[\bo{\delta} \bo{\delta}^T] \\
    &= \bo{A} \otimes \bo{G}.
\end{align*}

\subsection{Covariance structure of Sparse-KFAC}

The Sparse-KFAC methodology introduced in this paper aims to select substructures of the KFAC Hessian approximation to sparsely distribute stochastic parameters across the neural network. The selection is done by choosing specific inputs and doing a fully connected mapping to specific outputs. To visualize the covariance structure of this mechanism, a series of stochastic percentiles are chosen and used to generate an S-KFAC Hessian approximation. The stochastic weights are shown in white, and the deterministic parameters are shown in black (see \cref{fig: covariance_structure_percentile_kfac}). The fraction of stochastic parameters compared with the total number of parameters in the selected linear layers (modules) can be described by $g_P = \nicefrac{p^2}{100^2}, 0 < p \leq 100$. For example, $g_P = 0.5$ indicates that half of the total number of parameters are selected for the stochastic subset.

\begin{figure}[H]
     \centering
\includegraphics[width=0.23\textwidth]{graphics-thesis/NLP/Explanations/kfac_cov_perc_0.4_cropped.pdf}
\includegraphics[width=0.23\textwidth]{graphics-thesis/NLP/Explanations/kfac_cov_perc_0.5_cropped.pdf}
\includegraphics[width=0.23\textwidth]{graphics-thesis/NLP/Explanations/kfac_cov_perc_0.8_cropped.pdf}
\includegraphics[width=0.23\textwidth]{graphics-thesis/NLP/Explanations/kfac_cov_perc_0.9_cropped.pdf}
\caption{Sparse covariance structures visualized for varying percentiles, and selection according to the Sparse-KFAC methodology described in \cref{eq: row_column_skfac,eq: fisher_block_sparse}. A two-layer neural network with a 100-layer width is shown. The selected percentiles from left to right are $p = 40$, $p = 50$, $p = 80$, and $p = 90$.}
\label{fig: covariance_structure_percentile_kfac}
\end{figure}

\subsection{Percentile ramping}

\begin{table}[H]
\centering
\caption{The percentile ramping schemes $\bo{w}_{S,p}$ used in this study for the Sublayer experiments. This includes S-KFAC on the full model, S-KFAC on the partial modules, and the SWAG $\ell_1$ ramping scheme. Percentages for S-KFAC are then used as explained in \cref{sec: laplace_theory} and correspond to the percentage of rows and columns in the weight matrices, not the percentage of parameters.}
\begin{tabular}{ll}
\toprule
Ramping Scheme   & $\bo{w}_{S,p}$ \\ \midrule
SWAG & $p \in \{0, 1, 12, 24, 36, 48, 60, 100\} $\\
S-KFAC & $p \in \{0, 1.0, 1.8, 2.5, 3.2, 3.9, 4.6,  5.4, 6.1, 6.8, 12.6, 18.4, 24.2, 30, 33, 100 \}$ \\

\bottomrule
\end{tabular}
\label{tab: sublayer_percentiles}
\end{table}

\subsection{Specific module ramping}

Here are the results where we specify the modules on which to apply the Sparse-KFAC selection. For each dataset/task, we specify a selection of modules, e.g. the 2 MLP modules with the largest operator norm. We then apply the S-KFAC selection strategy to only those modules, leaving the remaining modules deterministic. This is done to show that S-KFAC can be applied post hoc even if one has first found a selection of modules that one wants to induce stochasticity on. The results show that S-KFAC captures the uncertainty in a model around the inclusion of 25 pct of the stochastic parameters in all three cases.

\begin{figure}[H]
     \centering
\includegraphics[width=0.4\textwidth]{graphics-thesis/NLP/RTE/Laplace/laplace_plot_three_predefined.pdf}
\includegraphics[width=0.4\textwidth]{graphics-thesis/NLP/MRPC/Laplace/laplace_plot_three_predefined.pdf}
\includegraphics[width=0.4\textwidth]{graphics-thesis/NLP/SST2/Laplace/laplace_plot_three_predefined.pdf}
\caption{Results for iteratively increasing stochasticity through S-KFAC on specified modules across the three tasks. The 100 pct are full model stochasticity and conform to points seen for two or three modules in \cref{sec: ramping_exp}. The results show that S-KFAC captures the uncertainty in a model around the inclusion of 25 pct of the stochastic parameters in all three cases.}
\end{figure}

\subsection{Ramping a subset of linear mappings}\label{subsec: ramping_linear_mapping}



\section{UCI Results}\label{sec: uci_results}

SWAG and LA results are presented on the three UCI regression datasets: Boston, Energy, and Yacht.

\subsection{Laplace and SWAG on UCI}
To verify the efficacy of partially stochastic Bayesian neural networks, experimentation is done for small-scale regression problems. SWAG and the LA are tested across three UCI regression datasets. In \cref{fig: nll_torch_la_swag}, the median NLL is displayed for all three datasets where the pct. of stochastic parameters is varied. The MAP and fully stochastic performance are included for comparison. The Yacht dataset is shown with and without the MAP solution, so variation between subsets can be interpreted.

\begin{figure}[H]
     \centering
    \includegraphics[width=0.4\textwidth]{graphics-thesis/UCI/Torch/la_swa_NLL_Boston NLL__median.pdf}
    \includegraphics[width=0.4\textwidth]{graphics-thesis/UCI/Torch/la_swa_NLL_Energy NLL__median.pdf}
    \includegraphics[width=0.4\textwidth]{graphics-thesis/UCI/Torch/la_swa_NLL_Yacht NLL__median.pdf}
    \includegraphics[width=0.4\textwidth]{graphics-thesis/UCI/Torch/la_swa_NLL_Yacht NLL_no map_median.pdf}
     \caption{The median Negative log-likelihood (NLL) and its interquartile range of the posterior approximation strategies SWAG and Laplace across 15 train-test splits, compared on the UCI datasets Boston, Energy, and Yacht. Each method is fitted with a percentage of stochastic parameters shown on the x-axis and evaluated on the test set. The MAP solution and 100\% stochastic solution are both shown with a red and green line, respectively. The Yacht results are shown with and without the MAP solution such that the inter-percentile NLL variation can be seen clearly.}
     \label{fig: nll_torch_la_swag}
\end{figure}

\subsection{Norm vs variance based selection}
In \citet{daxberger2022bayesian}, a parameter selection strategy based on choosing the parameters with the largest marginal variances is proposed, arguing that this is favorable for closely approximating the posterior predictive distribution. In \citet{sharma2023bayesian}, the $\ell_1$-norm is instead used as a proxy for selecting the optimal stochastic subset. We compare the two methodologies for the three UCI regression datasets, Boston, Energy, and Yacht, to ascertain the best-performing subset selection strategy (see \cref{fig: mean_vs_var}).

\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{graphics-thesis/UCI/MeanVsVar/la_la_var_NLL_Boston NLL__median.pdf}
\includegraphics[width=0.4\linewidth]{graphics-thesis/UCI/MeanVsVar/la_la_var_NLL_Energy NLL__median.pdf}
\includegraphics[width=0.4\linewidth]{graphics-thesis/UCI/MeanVsVar/la_la_var_NLL_Yacht NLL__median.pdf}
\includegraphics[width=0.4\linewidth]{graphics-thesis/UCI/MeanVsVar/la_la_var_NLL_Yacht NLL_no map_median.pdf}
\caption{A comparison between two stochastic subset selection heuristics. Laplace $\mu$ chooses the highest parameter magnitudes based on $\bo{w}_{MAP}$. Laplace $\sigma^2$ follows \citet{daxberger2022bayesian} and selects a stochastic subset based on the largest marginal variances, $\text{diag } \mathcal{G}^{-1}$. The median NLL and its interquartile range are shown over 15 runs for the UCI datasets Boston, Energy, and Yacht. Yacht is shown with and without MAP. The percentage of parameters modeled as stochastic is shown on the x-axis.}
\label{fig: mean_vs_var}
\end{figure}

\cref{fig: mean_vs_var} indicates that the norm-based subset selection strategy is superior for low percentiles. For Boston and Energy, the median of the two methods is not significantly different. However, on the Yacht dataset, the percentiles $p \in \{1,2,5,8\}$ all show significant improvement by using the $\ell_1$-norm as compared with the marginal variance approach.


\section{Experiments background}\label{sec: experiment_background}
We extend the parameter-magnitude-based stochastic subset selection of \citet{sharma2023bayesian} to transformer models by computing the operator norm over the network's modules, characterized by their singular values. To gain an understanding of the magnitude of the singular values over the entire DistilBERT architecture \citep{sanh2020distilbert}, a boxplot of the singular values for the self-attention and MLP modules is shown in \cref{fig: operator_norm_boxplots}.

\begin{figure}[H]
 \centering
 {\includegraphics[width=0.6\textwidth]{graphics-thesis/NLP/Explanations/operator_norm_boxplot.pdf}}
     
 \caption{Boxplots showing the distribution of the operator norms of the weight matrices in the two different types of modules, namely self-attention, denoted attention, and MLP modules. We use the operator norm defined by the largest singular value. Bias terms are not included.}
\label{fig: operator_norm_boxplots}
\end{figure}

Interestingly, \cref{fig: operator_norm_boxplots} shows that the singular values of the MLP modules are distributed substantially higher than those of the self-attention modules. Therefore, if the modules with the largest operator norm are selected indiscriminately, the MLP modules will be included in the stochastic subset first. Additionally, the variance of the MLP singular values is greater than that of the self-attention singular values.

\subsection{Fine-tuning DistilBERT}\label{sec: fine_tuning}

In \cref{tab: nlp_datasets}, information on the GLUE datasets investigated in this paper is presented.

\begin{table}[H]
\caption{Information on the Natural Language Processing datasets selected for this study. The datasets and tasks selected are Sentiment Classification (SST-2 \citep{sst2}), Paraphrase Identification (MRPC \citep{MRPC}), and Natural Language Inference (RTE \citep{rte}).}
\centering
\begin{tabular}{@{}llll@{}}
\toprule
Dataset & SST-2  & MRPC & RTE \\ \midrule
Train size &    $60{,}600$   &   $3{,}300$  & $2{,}240$ \\
Val size  &  $6{,}700$   &  $370$   & $250$    \\
Test (dev) size  &  $872$  & $408$ & $277$ \\
Train/val class dist. ($\nicefrac{true}{false}$) & $\nicefrac{55.8\%}{44.2\%}$  & $\nicefrac{67.4\%}{32.6\%}$ & $\nicefrac{50.2\%}{49.8\%}$   \\
Test/dev class dist. ($\nicefrac{true}{false}$) &  $\nicefrac{55.8\%}{44.2\%}$  & $\nicefrac{68.4\%}{31.6\%}$ & $\nicefrac{52.7\%}{47.3\%}$  \\
Max sequence length & $268$ & $226$ & $1{,}400$ \\ \bottomrule
\end{tabular}
\label{tab: nlp_datasets}
\end{table}

In \cref{tab: fine_tune_hyperparams}, we present the hyperparameters chosen for fine-tuning the DistilBERT model.
\begin{table}[H]
\centering
\caption{The hyperparameter configuration used for training DistilBERT (\texttt{distilbert-base-uncased}) from the Hugging Face \texttt{transformers} library.}
\begin{tabular}{ll}
\toprule
Hyperparameter   & Value \\ \midrule
Number of epochs  &  10 \\
Batch size  & 16  \\
Optimizer  & AdamW  \\
Learning rate & $5 \cdot 10^{-5}$ \\
$\beta_1$  & 0.9  \\
$\beta_2$  & 0.999  \\
FF dropout & 0.1 \\
Attention dropout & 0.1 \\
Sequence classifier dropout & 0.2 \\
\bottomrule
\end{tabular}
\label{tab: fine_tune_hyperparams}
\end{table}

\section{Implementation details: Laplace approximation and SWAG}\label{sec: la_swag_configurations}

The configurations used when fitting the KFAC LA and SWAG are listed in \cref{tab:training_config_la_swa_nlp}.

\begin{table}[H]
\centering
\caption{Training configurations for the Laplace and SWAG approximations. Common hyperparameters are listed at the top, SWAG-specific ones in the middle, and Laplace-specific ones at the bottom.}
\begin{tabular}{ll}
\toprule
Hyperparameter   & Description or Value \\ \midrule
Likelihood & Categorical (Cross-Entropy) \\ 
$\bo{w}_{S}$ Num. Modules &  $\left[0,1, 2, 3, 4, 5, 8, 11, 17, 38\right]$  \\
Batch Size & 16\\
\midrule
\textbf{SWAG} & \\
Learning rate sweep   &  $\left[10^{-3}, 10^{-2}, 5 \cdot 10^{-2}, 10^{-1}\right]$    \\
Optimizer & SGD \\
Momentum & 0.9 \\
Num. optim. steps sweep & 400 \\
Num. optim. steps final & 2000 \\
Num. Columns in D & 20 \\
Iterations between snapshots & 5 \\
Num. MC Samples & 50 \\
Sublayer $\bo{w}_{S, p}$ & $p \in [1,12,24,36,48,60,100]$ \\
\midrule
\textbf{Laplace} \\ 
Prior precision sweep   &  Equidistant in logspace in $[10^{-1}, 10^3]$    \\
\bottomrule
\end{tabular}
\label{tab:training_config_la_swa_nlp}
\end{table}

A validation set is used for tuning hyperparameters in both methods. For the LA, the prior precision is tuned; for SWAG, the learning rate is tuned.

\section{Ramping experiments}\label{sec: ramping_exp}

In \cref{fig:mod_ramp_sst_extra}, we present the comparison of partial stochasticity against full stochasticity and MAP for the SST-2 dataset, corresponding to the results shown in \cref{fig:module_ramping_laplace_sst2} for the MRPC and RTE tasks.

\begin{figure}
    \centering
    \includegraphics[width=0.5\textwidth]{Graphics/combined_la_swa_random_sst2.pdf}
    \caption{Results for the SST-2 dataset corresponding to the results presented in \cref{fig:module_ramping_laplace_sst2}.}
    \label{fig:mod_ramp_sst_extra}
\end{figure}



In \cref{subsec: module_ramping}, ramping experiments are shown for random selection and for the maximum operator norm on MLP and attention modules as heuristics for selecting modules for a stochastic subset. In \cref{fig: rte_mrpc_laplace_plot_one}, the same ramping experiments are shown for the RTE and MRPC datasets. \textbf{Left}: the MAP performance is included. \textbf{Right}: it is excluded so that the variation in performance between subset sizes can be interpreted. The performance of a full GGN LLLA is highlighted by a green dotted line.

\begin{figure}[H]
     \centering
     \vfill
\includegraphics[width=0.4\textwidth,keepaspectratio=true,height=0.42\textheight]{graphics-thesis/NLP/RTE/Laplace/laplace_plot_one_w_map.pdf}
\includegraphics[width=0.4\textwidth,keepaspectratio=true,height=0.42\textheight]{graphics-thesis/NLP/RTE/Laplace/laplace_plot_one.pdf}
\includegraphics[width=0.4\textwidth,keepaspectratio=true,height=0.42\textheight]{graphics-thesis/NLP/MRPC/Laplace/laplace_plot_one_w_map.pdf}
\includegraphics[width=0.4\textwidth,keepaspectratio=true,height=0.42\textheight]{graphics-thesis/NLP/MRPC/Laplace/laplace_plot_one.pdf}
\caption{Median NLL of the Laplace approximation as a function of the number of modules selected under the three ramping schemes: random module selection, max operator norm/max operator norm MLP selection, and max operator norm of the attention modules. The median is taken over 5 train/val splits and evaluated on the RTE (top) and MRPC (bottom) test sets. The lines/ranges are interquartile. The left panels include the MAP evaluation; the right panels omit it. A line showing the performance of `last-layer Laplace' with full GGN approximation is included.}
\label{fig: rte_mrpc_laplace_plot_one}
\end{figure}

In \cref{fig: rte_mrpc_laplace_plot_two}, ramping experiments for the minimum and maximum operator norm $\|\bo{w}\|$ are shown for the three GLUE datasets SST-2, RTE, and MRPC.
%
\begin{figure}[H]
\centering
\includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/SST2/Laplace/laplace_plot_two_mlp.pdf}
\includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/RTE/Laplace/laplace_plot_two_mlp.pdf}
\includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/MRPC/Laplace/laplace_plot_two_mlp.pdf}
\includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/SST2/Laplace/laplace_plot_two_attn.pdf}
\includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/RTE/Laplace/laplace_plot_two_attn.pdf}
\includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/MRPC/Laplace/laplace_plot_two_attn.pdf}

\caption{Median NLL results for the Laplace approximation for maximum and minimum operator norm ramping for MLP modules (top row) and attention modules (bottom row) across 5 train/val splits and evaluated on the test set for the SST-2 (left), RTE (middle), and MRPC (right) datasets. The shown ranges are interquartile.}
\label{fig: rte_mrpc_laplace_plot_two}
\end{figure}



\section{SWAG Ramping Experiments}

In this section, we briefly present the SWAG results for the same ramping experiments as conducted for the Laplace approximation, namely max operator norm MLP, max operator norm attention, and random ramping (\cref{fig: swag_ramping_experiments}).
\begin{figure}[H]
    \centering
    \includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/RTE/Swag/swag_plot_one_nll_w_map.pdf}
    \includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/RTE/Swag/swag_plot_one_acc_w_map.pdf}
    \includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/MRPC/Swag/swag_plot_one_nll_w_map.pdf}
    \includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/MRPC/Swag/swag_plot_one_acc_w_map.pdf}
    \includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/SST2/Swag/swag_plot_one_nll_w_map.pdf}
    \includegraphics[width=0.3\linewidth]{graphics-thesis/NLP/SST2/Swag/swag_plot_one_acc_w_map.pdf}
    \caption{SWAG ramping experiments across the three GLUE tasks for the ramping schemes: max operator norm MLP, max operator norm attention, and random ramping. All models are optimized to minimize the NLL, disregarding the accuracy, which explains the perhaps odd drops in performance at certain ranges of stochastic modules.}
    \label{fig: swag_ramping_experiments}
\end{figure}

\subsection{Sublayer ramping in SWAG}
In \cref{fig: swag_ramping_experiments}, we presented the results for ramping stochastic subsets on a modular level for the SWAG method. Here, an investigation into parameter-specific ramping is done. Using the $\ell_1$-norm as a heuristic, the ramping scheme is conducted such that the $p$ percent of parameters with the largest norm are added to the stochastic subset. For comparison, the subsets with the lowest NLL are shown for the random and operator norm ramping schemes. The operator norm is shown for both MLP and attention module ramping.

The last SWAG results for the $\ell_1$-norm ramping scheme can be seen in \cref{fig: swag_sublayer_all}.

\begin{figure}[H]
\centering
\includegraphics[width=0.3\textwidth]{graphics-thesis/NLP/SST2/Swag/swag_plot_two_nll_w_map.pdf}
\includegraphics[width=0.3\textwidth]{graphics-thesis/NLP/RTE/Swag/swag_plot_two_nll_w_map.pdf}
\includegraphics[width=0.3\textwidth]{graphics-thesis/NLP/MRPC/Swag/swag_plot_two_nll_w_map.pdf}
\includegraphics[width=0.3\textwidth]{graphics-thesis/NLP/SST2/Swag/swag_plot_two_acc_w_map.pdf}
\includegraphics[width=0.3\textwidth]{graphics-thesis/NLP/RTE/Swag/swag_plot_two_acc_w_map.pdf}
\includegraphics[width=0.3\textwidth]{graphics-thesis/NLP/MRPC/Swag/swag_plot_two_acc_w_map.pdf}

\caption{Median NLL and accuracy as a function of the model's number/percentage of stochastic parameters, displayed for SST-2, RTE, and MRPC from left to right. Parameters are included in the subset according to the largest $\ell_1$ parameter norm. The random and operator norm (MLP/Attention) subsets with the lowest NLL are included for comparison.}
\label{fig: swag_sublayer_all}
\end{figure}


\section{Calibration}\label{sec: calibration}
In \cref{fig: calibration_sst2}, the calibration curves of the MAP, temperature-scaled MAP, LLLA, S-KFAC, and LA and SWAG with random module selection are shown on the SST-2 dataset. In \cref{fig: calibraiton_rte_mrpc}, the same curves are displayed for RTE and MRPC.

\begin{figure}[H]
     \centering
\includegraphics[width=0.49\textwidth]{Graphics/RTE_calibration_ECE_1.pdf}
\includegraphics[width=0.49\textwidth]{Graphics/MRPC_calibration_ECE_1.pdf}
\includegraphics[width=0.49\textwidth]{Graphics/RTE_calibration_ECE_2.pdf}
\includegraphics[width=0.49\textwidth]{Graphics/MRPC_calibration_ECE_2.pdf}
\caption{Calibration curves for the RTE (\textbf{left}) and MRPC (\textbf{right}) datasets, evaluated on the test set. The median expected calibration error (ECE) across 5 runs is displayed for each method. 20 bins are placed based on the prediction probability distribution rather than uniformly. The Laplace approximation (LA) and Stochastic Weight Averaging--Gaussian (SWAG) are shown with \textbf{8} (RTE) and \textbf{4} (MRPC) randomly selected stochastic neural network modules. S-KFAC is shown for $1.588\%$ of parameters being stochastic.}
\label{fig: calibraiton_rte_mrpc}
\end{figure}

\Cref{fig: calibraiton_rte_mrpc} shows S-KFAC, LA, and SWAG changing the structure of the posterior predictive distribution towards a more uniform distribution of mean predicted probabilities. Temperature scaling and LLLA rely on reducing the certainty of the model by ``shifting'' the probabilities towards 0.5.

\end{document}
