% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

\usepackage{hyperref}

\input{math_commands.tex}
\usepackage{microtype}
\usepackage{float}
\usepackage{subfigure}
\usepackage{adjustbox}
\usepackage{booktabs} % for professional tables
\usepackage{amsmath, nccmath}
\usepackage{wasysym}
\usepackage{graphicx,subfigure}

\usepackage[section]{placeins}

\newcommand{\xoverbrace}[2][\vphantom{\dfrac{A}{A}}]{\overbrace{#1#2}}
\newcommand{\xunderbrace}[2][\vphantom{\dfrac{A}{A}}]{\underbrace{#1#2}}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage[capitalize,noabbrev]{cleveref}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning Invariant Weights in Neural Networks}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Tycho~F.A.~van der Ouderaa}
\author[1]{Mark~van der Wilk}
% Add affiliations after the authors
\affil[1]{%
    Imperial College London, UK
}
 
\begin{document}
\maketitle

\begin{abstract}
Assumptions about invariances or symmetries in data can significantly increase the predictive power of statistical models. Many commonly used machine learning models are constrained to respect certain symmetries, such as translation equivariance in convolutional neural networks, and incorporating other symmetry types is actively being studied. Yet, \textit{learning} invariances from the data itself remains an open research problem. It has been shown that the marginal likelihood offers a principled way to learn invariances in Gaussian Processes. We propose a weight-space equivalent to this approach, by maximizing a lower bound on the marginal likelihood to learn invariances in neural networks, resulting in naturally higher performing models.
\end{abstract}


\section{Introduction}
\label{sec:introduction}
Intuitively, invariances allow models to extrapolate, or rather `generalise', beyond training data (see Figure~\ref{fig:toy-problem} for an extreme example). An invariant model does not change in output when the input is changed by transformations to which it is deemed invariant. The most straightforward way to achieve this is perhaps by enlarging the dataset with transformed examples: a process known as data augmentation. A link between invariance and data augmentation in kernel space was made by \citet{dao2019kernel}. We show that this invariance can equivalently be described as transformations on the weights, similar to \citet{cohen2016group} where a neural network is constrained to respect rotational symmetry through rotated weight copies. We do what is common in Bayesian model selection and find the correct invariance using the marginal likelihood. Optimizing the marginal likelihood has proven an effective way to learn invariances in Gaussian Processes (GPs) \citep{van2018learning}, but is not tractable for commonly used neural networks. To overcome this, we propose a lower bound of the marginal likelihood capable of learning invariances in neural networks. By learning distributions on affine groups, we can select the correct invariance for a particular task, without having to perform cross-validation or even requiring a separate validation set. We successfully learn the correct invariance on different MNIST and CIFAR-10 image classification tasks, leading to better performing models.

\section{Related Work}
\label{sec:related-work}

Convolutional neural networks (CNNs) have been successful in a wide range of problems and played a key role in the success of Deep Learning \citep{lecun2015deep}. It is commonly understood that the translational symmetries that arise from effective weight-sharing in CNNs are an important driver of their outstanding performance on many tasks.

In \citet{cohen2016group}, a group-theoretical framework was proposed extending CNNs beyond translational symmetries, and demonstrated this for discrete group actions. Many studies since have proposed ways to incorporate other symmetries, such as continuous rotation, scale and translation, into the weights of neural networks \citep{worrall2017harmonic,weiler20183d,marcos2017rotation,esteves2017polar,weiler2019general,bekkers2019b} and recent efforts allow practical equivariance in neural networks for arbitrary symmetry groups \citep{finzi2021practical}. Nevertheless, weight symmetries are typically fixed, must be known in advance, and cannot be adjusted.

\begin{figure*}[t]
    \centering
    %\resizebox{0.9\linewidth}{!}{
    \includegraphics[width=1.0\linewidth]{images/toy_plot/toy_plot_5s_5a_learn_1024rff_5.63s_15.42a_LS_LA__100000I_256o_2000N_Truebias.pdf}
    %}
    \caption{Illustration of extrapolating behaviour further away from toy data for models with no invariance (left), some invariance (middle) up to strict invariance (right). Model prediction plotted as contour and datapoints as $\times$'s and $\Circle$'s.}
    \label{fig:toy-problem}
\end{figure*}

Some studies have proposed invariance learning with data augmentations \citep{cubuk2018autoaugment, lorraine2020optimizing}, but these do not embed symmetry in the weights and often require a validation loss. \citet{zhou2020meta} do learn invariances as weight-sharing, but require a meta-learning procedure with an additional validation loss. \citet{benton2020learning} circumvent the need for validation data by learning a distribution of input transformations directly on the training loss. However, in doing so they rely on an additional explicit regularization term that depends on how invariances are parameterised. Similar to this work, \citet{schwobel2020probabilistic} propose to use a lower bound, but again only consider a distribution in the input space rather than on weights.

We learn invariant weights by optimizing the marginal likelihood: the common method in Bayesian statistics to perform model selection, which is parameterization independent with the aim of being generally applicable to any chosen parameterization of invariance. Interestingly, it has been shown that the marginal likelihood objective coincides with an exhaustive leave-p-out cross-validation averaged over all values of p and held-out test sets \citep{fong2020marginal}.


% Group theory introduces a rigid mathematical framework to describe and incorporate symmetries in statistical and machine learning models. The framework is somewhat restricted, however, in the sense that the set of transformations governed by group actions to which a model is deemed invariant (or equivariant) needs to be closed, by definition of a group. Opposed to 'strict invariance' which applies to all group actions in a certain group, we might be more interested in `insensitivity' \citep{van2018learning}, a relaxed notion of invariance. To illustrate, imagine the classic MNIST image recognition problem \citep{lecun1998gradient}: here invariance to rotations up to a certain angle allows for better extrapolation to tilded versions of fitted digits and thus more robust predictions and increased sample efficiency. However, invariance to full 360 degree rotations (all SO(2) group actions) prohibits us from differentiating between a `6' and a `9'.
% 
% Besides the practical desire to relax the notion of invariance, `insensitivity' offers a natural way to interpolate between the space of non-invariant functions and the subspaces of functions that are `strictly invariant' to all actions of a particular group. Moreover, it has been shown that marginal likelihood offers a principled and effective way to infer invariances from data, thereby making it possible to learn the extent to which model should be invariant to certain transformations. This is done by integrating out the parameters, a technique that is sometimes referred to as Type II Maximum Likelihood or Empirical Bayes. The idea to learn invariances with marginal likelihood was first proposed in \citet{van2018learning} and demonstrated for Gaussian Process classification (GPC) models.


%In that regard, most similar to this work is \citet{benton2020learning}, but require an explicit regularization term that is dependent on the parameterization of the invariance that is being learned.
%We consider the marginal likelihood, which is independent in parameterization of the invariance with the aim of being generally applicable to any chosen parameterization of invariance. 

Lastly, in Topographic VAEs \citep{keller2021topographic} capsules with `rolling' feature activations show similarities to the deterministically sampled features obtained from our method, but differ in the reliance on `temporal coherence'.

\section{On Invariant Modelling} 

A model $f(\cdot)$ is deemed `strictly invariant' if its output is unaffected by a set of transformations:
$f(T_g \circ \vx) = f(\vx),\forall g \in G, \vx \in \gX$
where each transformation $T_g$ is governed by a group action $g \in G$ forming a group $G$. We can obtain an invariant model by averaging model outputs over all transformations $T_g$. Although group theory introduces a rigid mathematical framework that is often used to describe and incorporate symmetries in statistical and machine learning models, it is restricted in the sense that the set of transformations that generate a group is always closed, by the definition of a group. To illustrate, imagine the classic MNIST image recognition problem \citep{lecun1998gradient}: here invariance to rotations up to a certain angle allows for better extrapolation to tilted versions of fitted digits and thus more robust predictions and increased sample efficiency. However, invariance to full 360 degree rotations (all SO(2) group actions) may prohibit us from differentiating between a `6' and a `9'. In an effort to overcome this issue, we follow \citet{dao2019kernel,raj2017local,van2018learning,benton2020learning} and construct our invariant function $f_\vtheta(\vx; \boldsymbol{\eta})$ from a non-invariant function $g_\vtheta(\vx)$ by summing over the orbit:
\begin{align}
\label{eq:invariance}
f_\vtheta(\vx; \boldsymbol{\eta}) = \int g_\vtheta(T(\vx)) p_{\boldsymbol{\eta}}(T) \mathrm d T,
\end{align}
where $p_{\boldsymbol{\eta}}(T)$ denotes a density over the group action transformations parameterised by a vector $\boldsymbol{\eta}$. Through this construction, we hope to induce a relaxed notion of invariance upon the model, sometimes referred to as `insensitivity' \citep{van2018learning}, `soft-invariance' \citep{benton2020learning}, or `deformation stability' \citep{bronstein2021geometric}. The special case in which the density $p_{\boldsymbol{\eta}}(T)$ is uniformly distributed over the orbit results in the `Reynolds operator' from Group Theory, which averages functions and thereby induces a `strict invariance' over the entire group.
\begin{figure*}[t]
    \centering
    %\resizebox{0.9\linewidth}{!}{
    \includegraphics[width=1.0\linewidth]{images/recovering-invariances/recovering-invariances.pdf}
    %}
    \caption{Predicted invariance over training iterations for models initialised with different amounts of invariance when trained on fully rotated MNIST (left), partially rotated MNIST (middle) and regular MNIST (right).}
    \label{fig:recovering-invariance}
\end{figure*}

\subsection{Invariant Shallow Neural Network}
\label{sec:invariant-shallow-neural-network}

We construct our invariant function from a single-layer non-invariant neural network:
\begin{align}
\label{eq:network}
g_\vtheta(T(\vx)) &= \sigma \left( \mW_{2} \circ \phi\left( \mW_{1} \circ T \circ \vx \right) \right),
\end{align}
where $\sigma(\cdot)$ is the soft-argmax function, $\vx$ is the input, and $\mW_1$ and $\mW_2$ are the respective first and second layer weights and biases. We omitted the bias terms for notational clarity.

In this study, we consider two flavours for our neural network $g_{\vtheta}$, namely an RFF-network and a ReLU-network. In the RFF-network, first layer weights $\mW_1$ are initialised as Random Fourier Features (RFF) \citep{rahimi2007random} and a cosine activation function $\phi(\cdot) = \cos(\cdot)$ is used. For the ReLU-network, both first and second layer weights $\mW_1$ and $\mW_2$ are learned and we consider a ReLU non-linearity $\phi(x) = \max(0, x)$ for the activation function.

The RFF-network is interesting because we obtain a weight-space equivalent that is as close as possible to a GP with a radial basis function kernel (RBF), with exact correspondence in the infinite-width limit. From \cite{van2018learning}, we know that in this case the marginal likelihood is tight and can be used to learn invariance. The ReLU-network, on the other hand, is interesting as it more closely resembles the commonly used architectures in the Deep Learning (DL) community: basis weights are typically not fixed and the ReLU is one of the most commonly used activation functions in DL. In our experiments, we find that we can learn invariances with both the RFF-network and ReLU-network, indicating that for our purposes the bound on the marginal likelihood remains sufficiently tight for more general shallow architectures.

Section~\ref{sec:variational-inference} will discuss how variational inference is used to learn a variational distribution $q$ over the parameters in the second layer: $\vtheta=\text{vec}(\mW_2)$.

\begin{figure*}[ht]
\centering     %%% not \center
\subfigure[Feature bank \#1 over training iterations.]{\label{fig:visualized-weights-a}\includegraphics[width=0.45\linewidth]{images/visualized_weights/W0.pdf}}
\hspace{0.06\linewidth}
\subfigure[Feature bank \#2 over training iterations.]{\label{fig:visualized-weights-b}\includegraphics[width=0.45\linewidth]{images/visualized_weights/W1.pdf}}
\caption{Illustration of converging filter banks of two features. Features are initialised randomly with almost no invariance and converge to particular filters with practically full ($\pm 179^\circ$) rotational invariance after training on the fully-rotated MNIST.}
\label{fig:visualized-weights}
\end{figure*}

\subsection{Invariance in the weights}
\label{sec:invariance-in-the-weights}

In Equations~\ref{eq:invariance} and~\ref{eq:network}, we showed how we construct an invariant function by integrating or summing over transformed input samples $T(\vx)$. Yet, instead of explicitly performing these transformations on the input, we can obtain a mathematically equivalent invariant function by considering transformations on the weights. Note that for the inner term of our neural network definition in Equation~\ref{eq:network}, we have that $(\mW_1 \circ T) \circ \vx = \mW_1 \circ (T \circ \vx)$, by associativity of matrix transformations. In other words, first applying transformation $T$ on the weights, similar to the typical construction of equivariant layers, is equivalent to first applying it to the input, which could be interpreted as built-in data augmentation. In practice, however, differences between the two could still arise if applying $T$ requires approximations (e.g.\ interpolation between discrete pixels). In our experiments we will consider transforming the weights, thus demonstrating that invariance can be `built into' the model.

\paragraph{Coordinate data and imaging data}
We consider simple affine transformations, which can be represented as $T \in \R^{3 \times 3}$ matrices. For 2d vector data, applying the transformations amounts to regular matrix multiplications, which only requires appending a single $1$-entry to the data vectors. For 2d images, where data points $\vx \in \R^{WH}$ correspond to $W \times H$ pixel grids, applying $T$ in image space requires interpolation. Here, we could use bilinear interpolation, which can also be written in matrix form. We use the grid sample operation (as used by \citet{jaderberg2015spatial})
which acts on the weight matrix values $\mW_1$ and outputs an equally shaped matrix. The operation treats the $HW$-dimensional row vectors of $\mW_1$ as a grid of $H \times W$ points where the coordinates of the point are transformed according to the affine transformation matrix $T$. The resulting values are obtained by interpolating the values of transformed pixels at the original grid coordinates using bilinear interpolation.

\subsection{Affine Lie Group Reparameterization}
\label{sec:affine-reparameterization}

The transformations that are applied on the weights and will define the invariances of the network are sampled from a probability distribution. To allow learnable invariances, we define a learnable probability distribution over the transformations $p_{\boldsymbol{\eta}}(T)$ parameterised by $\boldsymbol{\eta}$. We will refer to $\boldsymbol{\eta}$ as the `invariance parameters', as they parameterise the transformations to which our network becomes invariant. To learn this distribution with back-propagation, we must make sure that samples taken from the distribution are differentiable with respect to the invariance parameter $\boldsymbol{\eta}$. For affine transformed weights, we consider a procedure similar to what \citet{benton2020learning} used to augment inputs, utilising the re-parameterization trick \citep{kingma2013auto} to remain differentiable. The distribution defines independent uniform probabilities over infinitesimal generators around their origin, obtained by sampling noise from a $k$-dimensional uniform distribution $\boldsymbol{\epsilon} \sim U[-1, 1]^k$. With $k = 6$ generator matrices $\mG_1, \dots, \mG_6$ and learnable parameters $\boldsymbol{\eta} = [\eta_1, \dots, \eta_6]^T$ we can separately parameterise translations in $x$, translations in $y$, rotations, scaling in $x$, scaling in $y$, and shearing (see Appendix D). A sample $T \sim p_{\boldsymbol{\eta}}(T)$ can be obtained by transforming noise $\boldsymbol{\epsilon}$:

\begin{equation}
\begin{split}
T = \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right)
\end{split}, \hspace{1cm}
\begin{split}
    \boldsymbol{\epsilon} \sim U[-1, 1]^k
\end{split}
\end{equation}

with matrix exponential $\exp(M) = \sum_{n=0}^\infty \frac{1}{n!}M^n$. A distribution over the subgroup of 2d rotations SO(2) can be achieved by only learning the parameter for rotational invariance $\eta_\text{rot}=\eta_{3}$ and fixing $\eta_i=0$ for all $i\neq3$. Then,

\begin{equation}
T^{\text{(rot)}} =
\begin{bmatrix}
\cos(\epsilon_3 \eta_\text{rot}) & -\sin(\epsilon_3 \eta_\text{rot}) & 0 \\
\sin(\epsilon_3\eta_\text{rot}) &  \cos(\epsilon_3 \eta_\text{rot}) & 0 \\
0 & 0 & 1
\end{bmatrix}
\end{equation}

By learning $\eta_\text{rot}$, we can effectively interpolate from no invariance at $\eta_{\text{rot}} = 0$ to full rotational invariance at $\eta_{\text{rot}} = \pi$.

Similarly, we can define a distribution over the subgroup of 2d translations $\mathbb{T}(2)$ by fixing $\eta_i=0$ for all $i>2$ and learning the translational invariance parameters $\eta_1$ and $\eta_2$:


\begin{equation}
T^{\text{(trans)}} =
\begin{bmatrix}
1 & 0 & \epsilon_1 \eta_1 \\
0 & 1 & \epsilon_2 \eta_2 \\
0 & 0 & 1
\end{bmatrix}
\end{equation}

We include full derivations including scaling in Appendix~E. In general, $\boldsymbol{\eta}=\vzero$ corresponds to no invariance and increasing individual elements of $\boldsymbol{\eta}$ also increases insensitivity to corresponding transformations towards full invariance.

\subsection{Stochastic or Deterministic Sampling}

To estimate $f_\vtheta(\vx; \boldsymbol{\eta})$ from Equation~\ref{eq:invariance}, we approximate the integral with a Monte Carlo (MC) estimate:

\begin{align}
\label{eq:f-estimate}
    \hat{f}_\vtheta (\vx; \boldsymbol{\eta}) = \frac{1}{S} \sum^{S}_{i=1} g_\vtheta (T_{i}(\vx))
\end{align}

\begin{figure*}[t]
\centering     %%% not \center
\begin{adjustbox}{max width=1.0\linewidth}
\subfigure[Sampled filters of affine model trained on regular MNIST.]{\label{fig:sampled-filter-banks-a}\includegraphics[width=0.52\linewidth]{images/w-samples/mnist_w.pdf}}
\subfigure[Sampled filters of affine model trained on rotated MNIST.]{\label{fig:sampled-filter-banks-b}\includegraphics[width=0.52\linewidth]{images/w-samples/mnist180_w.pdf}}
\end{adjustbox}
\hspace{0.02\linewidth}
\begin{adjustbox}{max width=1.0\linewidth}
\subfigure[Sampled filters of affine model trained on scaled MNIST.]{\label{fig:sampled-filter-banks-c}\includegraphics[width=0.52\linewidth]{images/w-samples/scaledmnist_w.pdf}}
\subfigure[Sampled filters of affine model trained on translated MNIST.]{\label{fig:sampled-filter-banks-d}\includegraphics[width=0.52\linewidth]{images/w-samples/mnist_8t.pdf}}
\end{adjustbox}
\caption{Stochastic samples of learned filter banks of a model capable of learning affine invariances. The same model learns features that are insensitive to different kinds of transformations dependent on the data it was trained on.}
\label{fig:sampled-filter-banks}
\end{figure*}

where $S$ transformations are stochastically sampled from the distribution $T_{i} \sim p_{\boldsymbol{\eta}}(T)$. Samples can be differentiated with respect to invariance parameter $\boldsymbol{\eta}$ using the `re-parameterization trick' (see Section~\ref{sec:affine-reparameterization}). We know that MC is an unbiased estimator, and thus

\begin{align}
\label{eq:f-expectation}
    f_\vtheta (\vx; \boldsymbol{\eta}) = \E_{T} \left[ \hat{f}_\vtheta(\vx; \boldsymbol{\eta}) \right]
\end{align}

with $\E_T := \E_{\prod_{i=1}^S p_{\boldsymbol{\eta}}(T_{i})}$. Unlike stochastic MC sampling, we can obtain a deterministic surrogate of the procedure by replacing the stochastic samples from the noise source $U[-1, 1]^k$ with linearly spaced points along its $k$-cubed domain. This procedure is similar to quadrature in classical numerical integration or, from a programming perspective, to applying the re-parameterization trick on a fixed `linspace' instead of uniform noise. A visualization of a discretely sampled filter bank of a model learning rotational invariance over training iterations is shown in Figure~\ref{fig:visualized-weights}. By ensuring sufficient and equally spaced samples, deterministic sampling can be used to ensure reliable and robust inference at test time. Similar to the stochastic sampling, this deterministic procedure is also differentiable and can thus be used during training. We find, however, that deterministic sampling is only suitable when the number of invariances $\dim(\boldsymbol{\eta})$ is very small (see Section~\ref{sec:practical-sampling}). Nevertheless, deterministic sampling can be theoretically interesting and allows our model to be interpreted as a generalization of other architectures. For instance, a single convolutional layer where the kernel is discretely and deterministically convolved over an image followed by spatial pooling, can be interpreted as an instance of our invariant MLP with a specific affine invariance transformation in which weights are `zoomed-in' and deterministically sampled and reapplied over the image plane.

\subsection{Practical Transformation Sampling}
\label{sec:practical-sampling}
If $\boldsymbol{\eta}$ comprises multiple elements, the sampling suffers from the curse of dimensionality as the number of required samples grows exponentially with larger $k$. To illustrate, a sparse grid of 3 quadrature points per dimension in $k=6$ dimensions would already require $3^6=729$ samples with deterministic sampling. In general, we found that stochastic MC sampling resulted in the most stable training behaviour and therefore used this when training the models in the experimental section, except for Figure~\ref{fig:visualized-weights} where deterministic sampling was used for both training and visualization of the rotationally invariant filter bank.
%Also, with deterministic sampling we obtain filter bank transitions, similar to the `rolling' feature activations in the capsules of a Topological VAE \citep{keller2021topographic}.

\subsection{Lower Bounding the Marginal Likelihood}

We have a (typically large) vector $\vtheta$ containing the model parameters and a (typically small) vector for the invariance parameters $\boldsymbol{\eta}$. The approach we take in this paper is to perform Bayesian Model Selection and integrate out $\vtheta$ but find a point-estimate over $\boldsymbol{\eta}$:

\begin{align}
    \hat{\boldsymbol{\eta}}
    = \argmax_{\boldsymbol{\eta}} p(\train|\boldsymbol{\eta})
    = \argmax_{\boldsymbol{\eta}} \left[ \int p(\train|\vtheta) p(\vtheta|\boldsymbol{\eta}) \mathrm d \vtheta \right]
\end{align}

where $p(\train|\boldsymbol{\eta})$ is the marginal likelihood \citep{murphy2012machine} or model evidence, sometimes called empirical Bayes or type-II ML. The technique has been shown to be effective in GPs to learn hyper-parameters \citep{williams2006gaussian} and invariances \citep{van2018learning}, but is typically intractable for neural networks. We derive a lower bound that allows for optimization of the marginal likelihood in neural networks using stochastic variational inference:
\begin{align}
\label{eq:bound}
\log p(\train)
\geq \E_{\vtheta} \left[ \log p(\train|\vtheta) \right] - \text{KL}(q(\vtheta|\vmu, \mSigma) || p(\vtheta)) \nonumber \\
= \E_{\vtheta} \left[ \log p(\vy | f_\vtheta(\vx; \boldsymbol{\eta})) \right] - \text{KL}(q(\vtheta|\vmu, \mSigma) || p(\vtheta)) \nonumber \\
= \E_{\vtheta} \left[ \log p
\left(\vy \Big| \E_T \big[ \hat{f}_\vtheta(\vx; \boldsymbol{\eta}) \big] 
\right) \right] - \text{KL}(q(\vtheta|\vmu, \mSigma) || p(\vtheta)) \nonumber \\
\geq \E_{\vtheta} \left[ \E_T \left[ \log p(\vy | \hat{f}_\vtheta(\vx; \boldsymbol{\eta})) \right] \right] - \text{KL}(q(\vtheta|\vmu, \mSigma) || p(\vtheta))
\end{align}
with expectations $\E_\vtheta := \E_{q(\vtheta)}$ and $\E_T := \E_{\prod_{i=1}^S p_{\boldsymbol{\eta}}(T_{i})}$. We begin Equation~\ref{eq:bound} with the standard evidence lower bound (ELBO) derived from variational inference. In the second and third line, we expand the likelihood and plug in Equation~\ref{eq:f-expectation}. In the last line, we use Jensen's inequality together with the fact that our log-likelihood is a concave function of the model output. The resulting lower bound comprises an expected log-likelihood term that can be estimated by taking the average cross-entropy on mini-batches of data (see Section~\ref{sec:variational-inference}) and a KL-divergence between two multivariate Gaussians which can efficiently be computed in closed-form. Note, we integrate out model parameter vector $\vtheta$, which is part of the KL-term, whereas the vector parameterizing the invariances $\boldsymbol{\eta}$ is only part of the first term. We optimise the derived lower bound w.r.t.\ both $\boldsymbol{\eta}$ and $\vtheta$ every iteration with stochastic gradient descent.

\begin{table*}[t]
    \centering
    \begin{adjustbox}{max width=0.9\linewidth}
    \begin{tabular}{l|c c c|c c c}
       & \multicolumn{3}{ c }{\underline{Test Accuracy}}
       & \multicolumn{3}{ c }{\underline{ELBO}} \\
      Model
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
    \hline
      MLP + fixed 5$^\circ$ rotation
      & 79.29 & 86.71 & \textbf{96.00}
      & -1.07 & -0.80 & -0.36
      \\
      MLP + fixed 45$^\circ$ rotation
      & 87.35 & 91.13 & 95.93
      & -0.63 & -0.49 & \textbf{-0.26}
      \\
      MLP + fixed 90$^\circ$ rotation
      & 90.33 & \textbf{91.69} & 94.69
      & -0.52 & \textbf{-0.44} & -0.30
      \\
      MLP + fixed 135$^\circ$ rotation
      & 91.19 & 91.04 & 92.13
      & -0.45 & -0.45 & -0.36
      \\
      MLP + fixed 175$^\circ$ rotation
      & \textbf{91.57} & 90.47 & 90.97
      & \textbf{-0.43} & -0.47 & -0.45
      \\
    \hline
      MLP + learned rotation
      & \textbf{91.72} & \textbf{92.34} & \textbf{96.40}
      & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26} \\
    \hline
    \end{tabular}
    \end{adjustbox}
    \caption{Test Accuracy and ELBO scores on MNIST using RFF neural network\protect\footnotemark. For each dataset, we observe that the correct level of invariance for that dataset corresponds with highest ELBO and also correlates with best test accuracy. In addition, we find that automatically learned invariance converges to ELBO and test accuracies similar or beyond the found optimal values from the models with fixed invariance.}
    \label{tab:results-elbo}
\end{table*}

\subsection{Variational Inference}
\label{sec:variational-inference}
To summarise, we propose to learn invariances using stochastic variational inference \citep{hoffman2013stochastic} and derived a lower bound of the marginal likelihood, or \textit{evidence lower bound} (ELBO), that can be optimised using gradient descent methods, such as Adam \citep{kingma2014adam}. Variational inference minimises the $\text{KL}$-divergence between a variational posterior and the true posterior on our free model parameters $p(\vtheta | \train)$, where $\vtheta = \text{vec}(\mW_2)$. For the approximate posterior, we choose a multivariate Gaussian distribution $q(\vtheta|\vmu, \mSigma) := \mathcal{N}(\vtheta|\vmu, \mSigma)$ parameterised by variational parameters $\vmu$ and block-diagonal covariance $\mSigma$ with a separate block for each output class. The covariance is parameterised as a Cholesky decomposition $\mSigma=\mL^T \mL$, which is a common trick to maintain computational stability and ensure a positive semi-definite $\mSigma$, and does not influence the model. We obtain a differentiable Monte Carlo estimate of $q(\vtheta)$ by sampling $L$ times from the variational distribution, using the reparameterization trick \citep{kingma2013auto}, and maximise the ELBO:
\begin{align}
\mathcal{L} &=
\E_{\vtheta} \left[ \E_T \left[ \log p(\vy | \hat{f}_\vtheta(\vx; \boldsymbol{\eta})) \right] \right] - \text{KL}(q(\vtheta|\vmu, \mSigma) || p(\vtheta)) \nonumber \\
% \E_{\vtheta} \Big[ \E_{T} \Big[ \log p(\train|\vtheta) \Big] \Big]
% -\text{KL}(q(\vtheta) || p(\vtheta)) \\
% \E_{\vtheta} \Big[ \E_{T} \Big[ \xoverbrace{\log p(\train|\vtheta)}^{\text{Cross-entropy}} \Big] \Big]
% -\xoverbrace{\text{KL}(q(\vtheta) || p(\vtheta))}^{\text{Closed-form KL}} \\
&\approx \frac{1}{L} \sum^L_{l=1} \Big[ \xunderbrace{\log p(\vy | \frac{1}{S} \sum^{S}_{i=1} g_{\vtheta_l}(T_{i}(\vx)))}_{\text{Cross-entropy}} \Big]
 -\xunderbrace{\text{KL}(q(\vtheta) || p(\vtheta))}_{\text{Closed-form KL}},
\end{align}
where we can choose $L=1$ given a sufficiently large batch size. We obtain a Stochastic Gradient Variational Bayes (SGVB) estimate of the lower bound $\frac{N}{M}\sum^M_{i=1}\mathcal{\tilde{L}}(\vtheta, \{\vx_i\}, \{y_i\})$ \citep{kingma2013auto} to allow efficient training on mini-batches of data. Full derivations can be found in Appendix A.

% \subsection{discretised approximate diffeomorphic fields}
% \label{sec:field-reparameterization}
% 
% to extend the flexiblity of the transformations beyond that of affine transformations, we also consider a distribution over the group of diffeomorphisms. in this case, samples from the distribution correspond to vector fields that are invertible, and thereby constrained to preserve topology (e.g. `folding' of space is not allowed).
% 
% in this case we let $\boldsymbol{\eta}$ parameterise a stationary velocity vector field $\phi_{\boldsymbol{\eta}}$, which is a member of the lie algebra and is exponentiated to produce the diffeomorphic map $t_\epsilon$: 
% \tyc{this formula is not correct yet:}
% \begin{equation}
% \begin{split}
% t_{\boldsymbol{\epsilon}} = \exp \left[ \boldsymbol{\epsilon} \phi_{\boldsymbol{\eta}} \right]
% \end{split},
% \hspace{1cm}
% \begin{split}
%     \boldsymbol{\epsilon} \sim u[0, 1]^{|\boldsymbol{\eta}|}
% \end{split}
% \end{equation}
% 
% this construction can be interpret as the solution of an ordinary differential equation (ode) $\frac{\partial f^{(t)}}{\partial t} = \phi(f^{(t)})$ where $f^{(0)}=\text{id}$ is the identity transform and we have integrated over time $t=[0, 1]$.
% 
% note that the size of our invariance parameter vector $\boldsymbol{\eta}$ is not 6 as was the case for affine transformations, but $2 \times h \times w$ to cover the $h$ by $w$ grid of independent 2-dimensional vectors equally spaced out to cover the full height and width of the image.
% 



\section{Experiments and Results}
\label{results}


We implemented our method in PyTorch \citep{paszke2017automatic} and show results on a toy problem with different degrees of rotational invariance in Figure~\ref{fig:toy-problem} with 1024 RFF features, $\sigma=5$, and $T$ applied on the weights.

The following sections will describe experiments on different MNIST and CIFAR-10 image classification tasks where $T$ is applied on the weights by using the bilinear grid resampling as described in Section~\ref{sec:invariance-in-the-weights} and \citet{jaderberg2015spatial} in combination with a small Gaussian blur (standard deviation of 0.1) to bandlimit high frequencies. We used Adam \citep{kingma2014adam} for optimization in combination with a learning rate of 0.001 ($\beta_1=0.9$, $\beta_2=0.999$), cosine annealed \citep{loshchilov2016sgdr} to zero. Parameters were initialised as $\vmu_c=\vzero$, $\mL_c=\mI$ for all classes $c$, $\sigma=0.3$, and $\alpha=1.0$. We use $S=32$ samples from $p_{\boldsymbol{\eta}}(T)$, $L=1$ and a batch size of 128. %We intend to make the implementation public on Github.  

\subsection{On the Necessity of a Bayesian Approach}
\label{sec:necessity-of-elbo}
To investigate to what extent the variational inference is required to learn invariances, we compare our approach with regular maximum likelihood using Adam. We train one model that uses our objective (Variational Inference), and another model where we replaced the variational distribution $q(\vtheta)$ with a point-estimate and omitted the $\text{KL}$-term to get a regular cross-entropy loss. Interestingly, when trained on fully-rotated MNIST in Figure~\ref{fig:direct-vs-vi}, we find that the model trained with cross-entropy was completely incapable of learning the correct invariance, whereas our VI-based approach does learn the invariance. We hypothesise that maximum likelihood alone is not enough to learn invariance, as invariance is a constraint on the weights and thus does not help to fit the data better, whereas marginal likelihood also favours simpler models. This result substantiates the use of marginal likelihood (or a lower bound thereof) for hyper-parameter selection for neural networks, and invariance learning in particular. More broadly speaking, it provides a convincing case for probabilistic machine learning models, such as Bayesian neural networks, beyond their oft-cited use for uncertainty estimation.
\footnotetext{As explained in Section~\ref{sec:invariant-shallow-neural-network}, we use the RFF neural network to ensure a tight lower bound and for comparison purposes. Higher accuracies on MNIST and CIFAR-10 were achieved with a ReLU neural network as reported in Table~\ref{tab:mnist-results} and Table~\ref{tab:cifar-results}.}

\begin{figure}[h]
    \centering
    \begin{adjustbox}{max width=0.7\linewidth}
    \includegraphics{images/direct-vs-vi/direct-vs-vi.pdf}
    \end{adjustbox}
    \caption{Predicted invariance over training iterations with non-Bayesian point-estimate optimised with cross-entropy and approximate Bayesian inference. A regular point estimate cannot learn invariances, whereas our VI-based approach does learn the invariance.}
    \label{fig:direct-vs-vi}
\end{figure}


\subsection{Identifying Invariance with ELBO}

To evaluate whether the ELBO is capable of identifying the apt level of invariance, we consider models with different fixed values of rotational invariance $\eta_\text{rot}$ and one model where $\eta_\text{rot}$ is learned. We then evaluate the models on three different versions of MNIST on which we artificially imposed different amounts of rotational invariance by randomly transforming the dataset beforehand. In `fully rotated MNIST', we rotate every image by an angle sampled uniformly at random from the range $[-180^{\circ}, 180^{\circ}]$. In `partially rotated MNIST', images are rotated by a random angle within $[-90^{\circ},90^{\circ}]$. Lastly, we also consider the `regular MNIST' dataset without any alterations.

From Table~\ref{tab:results-elbo}, we observe that for each dataset the model with the best ELBO corresponds to the model with the right amount of invariance, also correlating with best test accuracy. This finding indicates that the ELBO can correctly identify the required level of invariance, and confirms that choosing the right invariance leads to better generalization on the test set. On regular MNIST, we observe that a small amount of invariance yields better ELBO than no invariance. This could be explained by some intrinsic rotational variation within the dataset. Furthermore, we find that the ELBO of the model with learned invariance $\eta_{\text{rot}}$ corresponds to the optimal ELBO in the set of models with fixed invariance. Therefore, we find that in this case, we can use the ELBO to learn invariances in a differentiable manner. Additional results can be found in Appendix C.



\subsection{Recovering Invariance from Initial Conditions}
\label{sec:recovering-invariance}

\begin{figure*}[t]
\centering     %%% not \center
\begin{adjustbox}{max width=1.0\linewidth}
\subfigure[Deterministic samples from learned rotationally invariant filter bank.]{\label{fig:sampled-filter-banks-cifar-a}
\includegraphics[width=0.58\linewidth]{images/w-samples/cifar_180r_W_17_deterministic.pdf}}
\subfigure[Stochastic samples from learned rotationally invariant filter bank.]{\label{fig:sampled-filter-banks-cifar-b}
\includegraphics[width=0.58\linewidth]{images/w-samples/cifar_180r_W_17_stochastic.pdf}}
\end{adjustbox}
\begin{adjustbox}{max width=1.0\linewidth}
\subfigure[Deterministic samples from learned rotationally invariant filter bank.]{\label{fig:sampled-filter-banks-cifar-c}
\includegraphics[width=0.58\linewidth]{images/w-samples/cifar_180r_W_0_deterministic.pdf}}
\subfigure[Stochastic samples from learned rotationally invariant filter bank.]{\label{fig:sampled-filter-banks-cifar-d}
\includegraphics[width=0.58\linewidth]{images/w-samples/cifar_180r_W_0_stochastic.pdf}}
\end{adjustbox}
\caption{Visualization of samples from learned filter banks using discrete sampling learned on different versions of \mbox{CIFAR-10}. The invariances are learned starting from no invariance and depend on the data the model was trained on.}
\label{fig:sampled-filter-cifar-banks}
\end{figure*}
To investigate robustness to different initial conditions, we repeat the experiment where we learn invariance parameters $\boldsymbol{\eta}$ during training on fully-rotated, partially rotated and regular MNIST data but with different initial values, corresponding to rotational invariance of [$\pm$5$^\circ$, $\pm$45$^\circ$, $\pm$90$^\circ$, $\pm$135$^\circ$, $\pm$175$^\circ$]. Results of this experiment for the RFF neural network are shown in Figure~\ref{fig:recovering-invariance}, and a similar figure for the ReLU neural network is attached in Appendix C.2. For most initial conditions, we observe that we can successfully learn and recover the `correct' amount of invariance for each dataset. One exception is the initialisation at $\pm$175$^\circ$ on the partially rotated dataset, which suggests that training with low initial invariance could be advantageous in practice for this method. Nevertheless, we conclude that our model can recover invariance relatively robustly independent of initial conditions.


\subsection{Learning Invariance in ReLU Network}
\label{sec:learning-invariant-features-in-MLPs}

So far, we have only considered the set-up where we learn the output layer $\mW_2$ and keep the first layer $\mW_1$ initialised as fixed RFF-features in combination with a cosine $\cos(\cdot)$ activation function. We chose this fixed basis function model to ensure a sufficiently tight bound on marginal likelihood where the only source of looseness is the non-Gaussian likelihood. Now, we relax these constraints and consider a general single hidden layer neural network with ReLU non-linearity $\phi(x) = \max(x, 0)$ with Xavier \citep{kumar2017weight} initialised weights and 1024 hidden units, where we learn both the input layer $\mW_1$ and the output layer $\mW_2$. We optimise the model using the same variational inference procedure.

We find that we are still able to learn invariances in the setting where parameters of both input and output layer are learned (full comparison in Appendix C). In Figure~\ref{fig:visualized-weights}, we plot an illustration of a feature bank (row vector in $\mW_1$ with 7 samples equally spaced between $-\eta_\text{rot}$ and $\eta_\text{rot}$ and plotted over training iterations). The top of the figure shows the randomly initialised features without any rotational invariance at the beginning of training. After training on fully rotated MNIST, the features converge to a particular filter with practically full $\pm$179$^\circ$ rotational invariance, as shown at the bottom of the same figure.

\subsection{Other Transformations}
\label{sec:experiment-transformations}

To explore invariance to transformations other than rotation, we allow for different kinds of affine invariance transformations, namely rotation, translation, scale and full affine transformations (see Section~\ref{sec:affine-reparameterization}). Again, we use the ReLU-network where both layers are learned.

In Table~\ref{tab:mnist-results}, we evaluate and compare models that can learn affine invariances with two non-invariant baselines, namely a regular Gaussian Process regression with RBF kernel baseline (SGPR) and a regular shallow neural network baseline (MLP). We use SGPR as a reference, because we know the training procedure is reliable and to ensure enough capacity is given to the single layer MLP. We separately trained the models on fully-rotated, translated, scaled and original versions of MNIST (see Appendix D for details). We find that models with learned invariances (bottom four rows) outperform the models with no invariance (top two rows) in all cases. As expected, a translationally invariant model performs better on a dataset that contains randomly translated examples, and similarly, the rotationally and scale invariant models perform best on the respective rotated and scaled versions of MNIST. In line with our expectations, the model capable of learning affine invariances performs best overall. Moreover, by inspecting the learned coefficients of $\boldsymbol{\eta}$ after training we verified that the learned transformations correspond to the dataset the model was trained on. This can also be observed in Figure~\ref{fig:sampled-filter-banks} by inspecting the resulting learned filter bank samples after training on different datasets.

\begin{table}[!ht]
    \centering
    \begin{adjustbox}{max width=1.0\linewidth}
    \begin{tabular}{l|c c c c}
       & \multicolumn{4}{ c }{\underline{Test Accuracy}}\\
      Model
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{Translated \\ MNIST}
      & \shortstack{Scaled \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
    \hline
    SGPR 
      & 91.19 & 89.22 & 72.10 & 97.52
      \\
    MLP
      & 90.35 & 89.34 & 96.61 & 98.10
      \\
    \hline
    MLP + Rotation (\textbf{ours})
      & \textbf{98.05} & 94.08 & 97.62 & 98.64
      \\
    MLP + Translation (\textbf{ours})
      & 93.59 & \textbf{97.87} & 97.98 & 98.76
      \\
    MLP + Scale (\textbf{ours})
      & 93.80 & 94.30 & \textbf{98.06} & 98.35
      \\
    MLP + Affine (\textbf{ours})
      & \textbf{98.14} & \textbf{97.66} & \textbf{98.31} & \textbf{98.93}
      \\
    %+ Diffeomorphism
    %  & & &
    %  & & &
    %  \\
    %\hline
    \end{tabular}
    \end{adjustbox}
    \caption{Test Accuracy scores for learned invariance using different transformations in a shallow ReLU neural network on the MNIST dataset.}
    \label{tab:mnist-results}
\end{table}

We repeated the same experiment on the \mbox{CIFAR-10} dataset \citep{krizhevsky2009learning}, training on fully-rotated, translated and scaled versions as well as the original version of the \mbox{CIFAR-10} dataset, and report test accuracies in Table~\ref{tab:cifar-results}. We consistently find that the best performing models are those that are parameterised such that they can learn the invariance that corresponds to the dataset, typically resulting in several percentage points of improved accuracy compared to the MLP baseline. Furthermore, if we parameterise the MLP with the more general affine invariance, capable of expressing rotation, translation and scale invariances, we always achieve similar or improved results compared to the models parameterised with a single invariance. Similar to the MNIST experiments, we find that the MLP with general affine invariances can select the correct invariance based on the used training data. Here we also verified this by inspecting the learned $\boldsymbol{\eta}$, and found that the learned invariance always matches the invariances that we expect for the corresponding dataset. For example, the model capable of learning affine invariances correctly learned to be rotationally invariant ($\eta_3 \approx \pi$ and $\eta_i\approx 0$ for $i\neq 3$) after training on the fully-rotated \mbox{CIFAR-10} dataset.


\begin{table}[!ht]
    \centering
    \begin{adjustbox}{max width=1.0\linewidth}
    \begin{tabular}{l|c c c c}
       & \multicolumn{4}{ c }{\underline{Test Accuracy}}\\
      Model
      & \shortstack{{\footnotesize Fully rotated} \\ CIFAR-10}
      & \shortstack{Translated \\ CIFAR-10}
      & \shortstack{Scaled \\ CIFAR-10}
      & \shortstack{Regular \\ CIFAR-10}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
    \hline
    MLP
      & 41.24 & 40.75 & 46.56 & 54.49
      \\
    \hline
    MLP + Rotation (\textbf{ours})
      & \textbf{46.04} & 40.71 & 46.77 & 54.72
      \\
    MLP + Translation (\textbf{ours})
      & 40.99 & \textbf{45.20} & 47.44 & \textbf{55.79}
      \\
    MLP + Scale (\textbf{ours})
      & 40.92 & 41.22 & \textbf{49.28} & 54.72
      \\
    MLP + Affine (\textbf{ours})
      & \textbf{46.12} & \textbf{45.77} & \textbf{48.81} & \textbf{55.44}
      \\
    %+ Diffeomorphism
    %  & & &
    %  & & &
    %  \\
    %\hline
    \end{tabular}
    \end{adjustbox}
    \caption{Test Accuracy scores for learned invariance using different transformations in a shallow ReLU neural network on the CIFAR-10 dataset.}
    \label{tab:cifar-results}
\end{table}

\section{Discussion and Conclusion}
\label{discussion-and-conclusion}
In this paper, we propose a single training procedure that can \textit{learn} invariant weights in neural networks automatically from data. We follow what is common in Bayesian statistics and optimise the marginal likelihood to perform Bayesian model selection: a method that has been proven capable of learning invariances in GPs \citep{van2018learning}. We propose a lower bound to allow optimization of the \textit{marginal likelihood} in shallow neural networks. On MNIST and \mbox{CIFAR-10} image classification tasks, we demonstrate that we can automatically learn weights that are invariant to the correct affine transformations, solely using training data. Furthermore, we show that this leads to better generalization and higher predictive test accuracies.

The marginal likelihood is a general model selection method and is parameterization independent. Therefore, we can expect it to work on other invariances and other model architectures. In this work, we focussed on affine transformations, but it would be interesting to consider more complex parameterizable transformations over the image space, such as diffeomorphic vector fields \citep{schwobel2020probabilistic}. We showed that we can learn invariance by sampling a learned compactly supported continuous probability distribution over group actions in common Lie groups. Allowing discrete groups would either require differentiating through a discrete probability distribution, for instance utilizing the Gumbel-Softmax trick \citep{jang2016categorical}, or treating the discrete group as a subgroup of some Lie group and learning to approximately distribute all of the continuous density $p_{\boldsymbol{\eta}}(T)$ to the group actions of the subgroup. Furthermore, it would be interesting to consider more flexible and complex probability densities over group actions, such as mixture distributions or normalizing flows \citep{rezende2015variational, tabak2013family}, capable of expressing multiple modes as in \citet{falorsi2019reparameterizing}. We found that we could successfully learn invariance using marginal likelihood, also referred to as Empirical Bayes or Type-II ML, which is not possible with regular maximum likelihood (Type-I ML). To do so, we relied on $\boldsymbol{\eta}$ being small and learning higher dimensional invariances might therefore require more sophisticated methods or additional priors on $\boldsymbol{\eta}$. Lastly, this work focuses on single layer neural networks, and we will consider deeper architectures in future work. For deeper models, we should ask the question whether the bound on the marginal likelihood will stay sufficiently tight \citep{dutordoir2021deep, ober2020global, immer2021scalable}.

To conclude, we hope our findings inspire further work on neural networks that automatically learn symmetries from data.

\nocite{moler2003nineteen} % used in supplementary material


% In the unusual situation where you want a paper to appear in the
% references without citing it in the main text, use \nocite
\nocite{langley00}

\bibliography{paper}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \appendix
% \onecolumn
% 
% \label{appendix:vi-derivations}
% \section*{Appendix A: Detailed Derivation of Variational Inference}
% 
% Applying Variational Inference (VI) \citep{hoffman2013stochastic}, we maximise the marginal likelihood w.r.t. parameters $\vtheta = \text{vec}(\mW_2)$ by minimizing the $\KL(\cdot||\cdot)$-divergence between approximate posterior $q(\mW_2|\vmu, \mSigma)$ and true posterior distribution of weights $p(\mW_2|\train)$, equivalent to maximizing the evidence lower bound (ELBO) denoted by $\mathcal{L}$:
% \begin{align*}
% &\argmin_{\vmu, \mSigma} \KL(q(\mW_2|\vmu, \mSigma) || p(\mW_2|\train)) \\
% &=\argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log \frac{q(\mW_2|\vmu,\mSigma)}{p(\mW_2|\train)} \right] \\
% &= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log \frac{q(\mW_2|\vmu,\mSigma)}{p(\mW_2)p(\train|\mW_2)} \right] + \log p(\train) \\
% &= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log \frac{q(\mW_2|\vmu,\mSigma)}{p(\mW_2)p(\train|\mW_2)} \right] \\
% &= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log q(\mW_2 | \vmu, \mSigma) - \log p(\mW_2) -\log p(\train|\mW_2) \right] \\
% &= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)} \left[ \log q(\mW_2 | \vmu, \mSigma) - \log p(\mW_2) \right] - \E_{q(\mW_2|\vmu, \mSigma)} \left[ \log p(\train|\mW_2) \right] \\
% &= \argmin_{\vmu, \mSigma} \KL( q(\mW_2 | \vmu, \mSigma) || p(\mW_2)) + \E_{q(\mW_2|\vmu, \mSigma)} [ -\log p(\train|\mW_2) ] \\
% &= \argmax_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] - \KL( q(\mW_2 | \vmu, \mSigma) || p(\mW_2)) \\
% &= \argmax_{\vmu, \mSigma} \mathcal{L}
% \end{align*}
% 
% We independently model the weight $\vw_2^c$ for each class $c$ with a full co-variance multivariate Gaussian distribution $\mathcal{N}(\vw^c_2|\vmu^c, \mSigma^c)$, parameterised by mean vector $\vmu^c$ and lower-triangular (Cholesky) decomposition of the co-variance $(\mL^c)^T\mL^c = \mSigma^c$ to avoid computational issues, following \citet{kingma2017variational}. We can view the variational posterior $q(\mW_2|\vmu, \mSigma)$ as multi-variate Gaussian over all classes with concatenated mean and block-diagonally stacked covariances from which we sample flattened matrix $\mW_2$ in one go, or -equivalently- sample row vectors $\vw_2^c$ for each class and concatenate them to obtain matrix $\mW_2$. By sampling $L$ times from variational approximation $\mW_2^{(1)}, \mW_2^{(2)} \hdots \mW_2^{(L)} \sim q(\mW_2|\vmu, \mSigma)$ we obtain a Monte Carlo estimate of $\E_\mW:=\E_{\mW_2 \sim q(\mW_2|\vmu, \mSigma)}$ required to compute the final ELBO or negative loss $\mathcal{L}(\vtheta, \train)$:
% 
% \begin{align*}
% \mathcal{L}(\vtheta, \train)
% &= \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] - 
% \KL( q(\mW_2 | \vmu, \mSigma) || p(\mW_2)) \\
% &= \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] - \sum_c \KL( \mathcal{N}(\vw^c_2 | \vmu^c, \mSigma^c) || p(\vw^c_2)) \\
% &= \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] -
% \sum_c \KL( \mathcal{N}(\vw^c_2 | \vmu^c, \mSigma^c) || \mathcal{N}(\vzero; \mSigma_p)) \\
% &= -\xoverbrace{\sum_l^L \sum_i^N
% -\log \sigma_{y^{(i)}_c} \Big( \E_{T\sim p_{\boldsymbol{\eta}}(T)} \left[ \mW_{2} \circ \phi\left( \mW_{1} \circ T \circ \vx^{(i)} \right) \right] \Big)
% }^{\text{Regular Average Cross-entropy}} -
% \xoverbrace{\sum_c \frac{1}{2} \left[ \log \frac{|\mSigma_p|}{|\mSigma^c|} - D + \text{tr}\left\{\mSigma_p^{-1} \mSigma^c\right\} + (\vmu^c)^T \mSigma_p^{-1} \vmu^c \right] }^{\text{Closed-form KL Regularizer}}
% \end{align*}
% 
% for every input $\vx^{(i)}$, log soft-argmax output $\sigma_{y_c}$ for class of corresponding label $y^{(i)}_c$, fixed first layer weights $\mW_1$, prior weights $\mSigma_p = \mI \alpha$, input dimensionality $D$, and trace $\text{tr}(\cdot)$. To allow for mini-batching, we use the Stochastic Gradient Variational Bayes (SGVB) estimate from \citet{kingma2013auto} of the ELBO or negative loss $\mathcal{\tilde{L}}(\vtheta, \train)$:
% \begin{align*}
% \mathcal{\tilde{L}}(\vtheta, \train)
% &=
% -N
% \xoverbrace{
% \frac{1}{M}
% \sum_l^L \sum_i^M
% -\log \sigma_{y^{(i)}_c} \Big( \E_{T\sim p_{\boldsymbol{\eta}}(T)} \left[ \mW_{2} \circ \phi\left( \mW_{1} \circ T \circ \vx^{(i)} \right) \right] \Big)
% }^{\text{Regular Batch Averaged Cross-entropy}} -
% \xoverbrace{\sum_c \frac{1}{2} \left[ \log \frac{|\mSigma_p|}{|\mSigma^c|} - D + \text{tr}\left\{\mSigma_p^{-1} \mSigma^c\right\} + (\vmu^c)^T \mSigma_p^{-1} \vmu^c \right] }^{\text{Closed-form KL Regularizer}}
% \end{align*}
% where we can choose $L=1$ if we use a sufficiently large batch size.
% 
% 
% \clearpage
% \label{appendix:weights-visualizations}
% \section*{Appendix B: Weight Visualizations of Learned Rotational Invariance}
% 
%  \begin{figure*}[!ht]
%  \centering 
%  \subfigure[Feature bank \#1 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W0.pdf}}
%  \hspace{0.05\linewidth}
%  \subfigure[Feature bank \#2 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W1.pdf}}
%  
%  \subfigure[Feature bank \#3 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W2.pdf}}
%  \hspace{0.05\linewidth}
%  \subfigure[Feature bank \#4 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W3.pdf}}
%  
%  \subfigure[Feature bank \#5 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W4.pdf}}
%  \hspace{0.05\linewidth}
%  \subfigure[Feature bank \#6 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W5.pdf}}
%  \caption{Illustration of the features banks over training iterations. Features are randomly initialised with almost no rotational invariance and converge to particular filters with full rotational invariance when trained on fully rotated MNIST data.}
%  \label{fig:visualized-weights-extra}
%  \end{figure*}
% 
% \clearpage
% \section*{Appendix C.1: Rotational Invariance in RFF Neural Network}
% 
% \begin{figure*}[!ht]
%     \centering
%     \begin{adjustbox}{max width=0.9\linewidth}
%     \includegraphics[width=0.9\linewidth]{images/recovering-invariances/recovering-invariances.pdf}
%     %}
%     \end{adjustbox}
%     \caption{Predicted invariance over training iterations for different initial invariances for RFF neural network.}
%     \label{fig:extra-recovering-invariance-rff}
% \end{figure*}
% 
% \begin{table*}[!ht]
%     \centering
%     %\begin{tabular}{r|c c c|c c c|c c c}
%     \begin{adjustbox}{max width=0.7\linewidth}
%     \begin{tabular}{r|c c c|c c c}
%        & \multicolumn{3}{ c }{Test Accuracy}
%        & \multicolumn{3}{ c }{ELBO} \\
%       Model
%       & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
%       %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
%       \\
%     \hline
%       Fixed 5$^\circ$
%       & 79.29 & 86.71 & \textbf{96.00}
%       & -1.07 & -0.80 & -0.36
%       \\
%       Fixed 45$^\circ$
%       & 87.35 & 91.13 & 95.93
%       & -0.63 & -0.49 & \textbf{-0.26}
%       \\
%       Fixed 90$^\circ$
%       & 90.33 & \textbf{91.69} & 94.69
%       & -0.52 & \textbf{-0.44} & -0.30
%       \\
%       Fixed 135$^\circ$
%       & 91.19 & 91.04 & 92.13
%       & -0.45 & -0.45 & -0.36
%       \\
%       Fixed 175$^\circ$
%       & \textbf{91.57} & 90.47 & 90.97
%       & \textbf{-0.43} & -0.47 & -0.45
%       \\
%     \hline
%       Learned (5$^\circ$ Init)
%       & \textbf{91.72} & \textbf{92.34} & \textbf{96.40}
%       & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
%       \\
%       Learned (45$^\circ$ Init)
%       & \textbf{91.65} & \textbf{92.31} & \textbf{96.42}
%       & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
%       \\
%       Learned (90$^\circ$ Init)
%       & \textbf{91.65} & \textbf{92.37} & \textbf{96.40}
%       & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
%       \\
%       Learned (135$^\circ$ Init)
%       & \textbf{91.66} & \textbf{92.37} & \textbf{96.10}
%       & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
%       \\
%       Learned (175$^\circ$ Init)
%       & \textbf{91.68} & \textbf{91.69} & 95.64
%       & \textbf{-0.43} & \textbf{-0.43} & \textbf{-0.26}
%       \\
%     \hline
%     \end{tabular}
%     \end{adjustbox}
%     \caption{Table containing Test Accuracy and ELBO scores after training for experiments with RFF network. In bold: the best scores for fixed invariance and, for learned invariances, all scores that surpass the best score using fixed invariance.}
%     \label{tab:additional-rff}
% \end{table*}
% 
% \newpage
% \section*{Appendix C.2: Rotational Invariance in ReLU Neural Network}
% 
% \begin{figure*}[!ht]
%     \centering
%     %\resizebox{0.9\linewidth}{!}{
%     \begin{adjustbox}{max width=0.9\linewidth}
%     \includegraphics[width=0.9\linewidth]{images/recovering-invariances/learning-invariances-nn.pdf}
%     %}
%     \end{adjustbox}
%     \caption{Predicted invariance over training iterations for different initial invariances of ReLU neural network with both input and output layer weights trained.}
%     \label{fig:extra-recovering-invariance-nn}
% \end{figure*}
% 
% 
% \begin{table*}[!ht]
%     \centering
%     \begin{adjustbox}{max width=0.7\linewidth}
%     \begin{tabular}{r|c c c|c c c}
%        %& \multicolumn{3}{ c }{Test NLL}
%        & \multicolumn{3}{ c }{Test Accuracy}
%        & \multicolumn{3}{ c }{ELBO} \\
%       Model
%       & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
%       %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
%       \\
%       \hline
%       Fixed 5$^\circ$
%       & 87.21 & 90.68 & 96.76 
%       & -0.28 & -0.20 & \textbf{-0.02}
%       \\
%       Fixed 45$^\circ$
%       & 95.24 & 96.46 & 98.13
%       & -0.09 & \textbf{-0.06} & -0.02
%       \\
%       Fixed 90$^\circ$
%       & 96.50 & 97.11 & \textbf{98.14}
%       & -0.07 & -0.06 & -0.03 
%       \\
%       Fixed 135$^\circ$
%       & 97.15 & \textbf{97.31} & 97.79
%       & \textbf{-0.06} & -0.06 & -0.04 
%       \\
%       Fixed 175$^\circ$
%       & \textbf{97.53} & 97.30 & 97.15
%       & -0.07 & -0.06 & -0.06
%       \\
%     \hline
%       Learned (0$^\circ$ Init)
%       & 97.34 & 97.13 & \textbf{98.40} 
%       & -0.07 & \textbf{-0.06} & \textbf{-0.02}
%       \\
%       Learned (45$^\circ$ Init)
%       & 97.23 & \textbf{97.36} & \textbf{98.27} 
%       & -0.07 & \textbf{-0.05} & \textbf{-0.02}
%       \\
%       Learned (90$^\circ$ Init)
%       & 97.28 & \textbf{97.22} & \textbf{98.19}
%       & -0.07 & \textbf{-0.06} & \textbf{-0.02}
%       \\
%       Learned (135$^\circ$ Init)
%       & 97.45 & \textbf{97.29} & \textbf{98.33} 
%       & \textbf{-0.06} & \textbf{-0.05} & \textbf{-0.02}
%       \\
%       Learned (175$^\circ$ Init)
%       & 97.23 & \textbf{97.23} & \textbf{98.03}
%       & \textbf{-0.06} & \textbf{-0.06} & \textbf{-0.03}
%       \\
%     \hline
%     \end{tabular}
%     \end{adjustbox}
%     \caption{Table containing Test Accuracy and ELBO scores after training for experiments of ReLU neural network with both input and output layer weights trained. In bold: the best scores for fixed invariance and, for learned invariances, all scores that surpass the best score using fixed invariance.}
%     \label{tab:additional-relu}
% \end{table*}
% 
% \clearpage
% \section*{Appendix C.3: Different Transformations in RFF Network}
% 
% \begin{table}[!ht]
%     \centering
%     \begin{adjustbox}{max width=1.0\linewidth}
%     \begin{tabular}{r|c c c c|c c c c}
%        & \multicolumn{4}{ c }{\underline{Test Accuracy}}
%        & \multicolumn{4}{ c }{\underline{ELBO}} \\
%       Model
%       & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{Translated \\ MNIST}
%       & \shortstack{Scaled \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{Translated \\ MNIST}
%       & \shortstack{Scaled \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
%       \\
%     \hline
%     Regular MLP
%       & 79.29 & 66.07 & 89.25 & 95.16
%       & -1.14  & -1.49 & -0.69 & -0.39
%       \\
%     \hline
%     + Rotation
%       & \textbf{92.59} & 75.06 & 88.66 & 96.59
%       & \textbf{-0.43} & -1.08 & -0.62 & -0.26
%       \\
%     + Translation
%       & 83.66 & \textbf{87.81} & 86.15 & 96.78
%       & -0.82 & \textbf{-0.64} & -0.72 & -0.24
%       \\
%     + Scale
%       & 82.77 & 75.48 & \textbf{91.31} & 96.52
%       & -0.84 & -1.08 & \textbf{-0.49} & -0.26
%       \\
%       \hline
%     + Affine
%       & \textbf{92.64} & \textbf{87.77} & \textbf{90.58} & \textbf{97.38}
%       & \textbf{-0.43} & \textbf{-0.64} & \textbf{-0.54} & \textbf{-0.21}
%       \\
%     %\hline
%     %+ Diffeomorphism
%     %  & & &
%     %  & & &
%     %  \\
%     \hline
%     \end{tabular}
%     \end{adjustbox}
%     \caption{Test Accuracy and ELBO for learned invariance using different transformations in a shallow RFF neural network.}
%     \label{tab:additional-transformations-rff}
% \end{table}
% 
% \section*{Appendix C.4: Different Transformations in ReLU Network}
% 
% \begin{table}[h]
%     \centering
%     \begin{adjustbox}{max width=1.0\linewidth}
%     \begin{tabular}{r|c c c c|c c c c}
%        & \multicolumn{4}{ c }{\underline{Test Accuracy}}
%        & \multicolumn{4}{ c }{\underline{ELBO}} \\
%       Model      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{Translated \\ MNIST}
%       & \shortstack{Scaled \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
%       & \shortstack{Translated \\ MNIST}
%       & \shortstack{Scaled \\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
%       \\
%     \hline
%     Regular MLP
%       & 90.35 & 89.34 & 96.61 & 98.10
%       & -0.06 & -0.06 & -0.03 & -0.02
%       \\
%     \hline
%     + Rotation
%       & \textbf{98.05} & 94.08 & 97.62 & 98.64
%       & \textbf{-0.05} & -0.06 & -0.03 & -0.02
%       \\
%     + Translation
%       & 93.59 & \textbf{97.87} & 97.98 & 98.76
%       & -0.09 & -0.06 & -0.03 & -0.02
%       \\
%     + Scale
%       & 93.80 & 94.30 & \textbf{98.06} & 98.35
%       & -0.06 & -0.06 & -0.03 & -0.02
%       \\
%       \hline
%     + Affine
%       & \textbf{98.14} & \textbf{97.66} & \textbf{98.31} & \textbf{98.93}
%       & \textbf{-0.05} & -0.06 & -0.03 & -0.02
%       \\
%     \hline
%     \end{tabular}
%     \end{adjustbox}
%     \caption{Test Accuracy and ELBO for learned invariance using different transformations in a shallow ReLU neural network.}
%     \label{tab:additional-transformations-relu}
% \end{table}
% 
% \section*{Appendix C.5: Different Transformations in ReLU Network on Datasets with Combinations of Two Invariances}
% 
% \begin{table}[h]
%     \centering
%     \begin{adjustbox}{max width=1.0\linewidth}
%     \begin{tabular}{r|c c c c|c c c c}
%        & \multicolumn{4}{ c }{\underline{Test Accuracy}}
%        & \multicolumn{4}{ c }{\underline{ELBO}} \\
%       Model
%       & \shortstack{{\footnotesize Fully rotated} \\ + Translated \\ MNIST}
%       & \shortstack{{\footnotesize Fully rotated} \\ + Scaled \\ MNIST}
%       & \shortstack{Translated \\+ Scaled\\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       & \shortstack{{\footnotesize Fully rotated} \\ + Translated \\ MNIST}
%       & \shortstack{{\footnotesize Fully rotated} \\ + Scaled \\ MNIST}
%       & \shortstack{Translated \\+ Scaled\\ MNIST}
%       & \shortstack{Regular \\ MNIST}
%       %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
%       \\
%     \hline
%     Regular MLP
%       & 53.36 & 80.71 & 75.50 & 98.10
%       & \textbf{-0.26} & \textbf{-0.10} & \textbf{-0.12} & \textbf{-0.02}
%       \\
%     \hline
%     + Rotation
%       & \textbf{85.35} & \textbf{95.66} & 85.42 & 98.64
%       & -0.31 & -0.10 & -0.27 & -0.02
%       \\
%     + Translation
%       & \textbf{83.84} & 83.40 & \textbf{91.77} & 98.76
%       & -0.42 & -0.16 & -0.19 & -0.02
%       \\
%     + Scale
%       & 55.63 & \textbf{89.81} & \textbf{86.04} & 98.35
%       & -0.39 & -0.12 & -0.17 & -0.02
%       \\
%       \hline
%     + Affine
%       & \textbf{89.37} & \textbf{95.88} & \textbf{91.95} & \textbf{98.93}
%       & -0.37 & -0.09 & -0.18 & -0.02
%       \\
%     \hline
%     \end{tabular}
%     \end{adjustbox}
%     \caption{Test Accuracy and ELBO for learned invariance using different transformations in a shallow ReLU neural network on datasets augmented by two successive transformations (rotation+translation, rotation+scaling and translation+scaling). Surprisingly, the regular MLP ends up with the best ELBO in this experiment. We did not consistently observe the best ELBO for the regular MLP throughout optimization, and find that we can still use our method and the ELBO to learn invariances in this case. Again, we observe that models with learned invariances achieve the highest test accuracy.}
%     \label{tab:additional-transformations-relu-combined}
% \end{table}
% 
% \section*{Appendix D: Dataset Details}
% 
% All datasets have 60000 training examples and 10000 test examples and are created by taking regular MNIST or CIFAR-10 and applying random transformations:
% 
% \textbf{Regular MNIST Dataset:} MNIST handwritten digit database \citep{lecun1998gradient}. \\
% \textbf{Regular CIFAR-10 Dataset:} CIFAR-10 dataset with 10 classes \citep{krizhevsky2009learning}. \\
% \textbf{Partially rotated dataset:} Every sample rotated by radian angle $\theta$, sampled from $\theta \sim U[-\frac{\pi}{2}, \frac{\pi}{2}]$. \\
% \textbf{Fully rotated dataset:} Every sample rotated by radian angle $\theta$, sampled from $\theta \sim U[-\pi, \pi]$. \\
% \textbf{Translated dataset:} Every sample translated by $dx$ and $dy$ pixels, sampled from $dx, dy \sim U[-8, 8]$. \\
% \textbf{Scaled dataset:} Every sample scaled around center with $\exp(s)$, sampled from $s \sim U[-\log(2), \log(2)]$. \\
% 
% \clearpage
% \section*{Appendix E: Lie Group Generators}
% 
% We follow \citet{benton2020learning} and, similarly, utilise six matrix generators:
% 
% \begin{align*}
% \begin{split}
% \mG_\text{transx} = \mG_1 &= 
% \begin{bmatrix}
% \ 0 & \ 0 & \ 1 \\
% \ 0 & \ 0 & \ 0 \\
% \ 0 & \ 0 & \ 0
% \end{bmatrix}
% \end{split}
% , \hspace{0.5cm}
% \begin{split}
% \mG_\text{transy} = \mG_2 &= 
% \begin{bmatrix}
% \ 0 & \ 0 & \ 0 \\
% \ 0 & \ 0 & \ 1 \\
% \ 0 & \ 0 & \ 0
% \end{bmatrix}
% \end{split}
% , \hspace{1.5cm}
% \begin{split}
% \mG_\text{rot} = \mG_3 &= 
% \begin{bmatrix}
% \ 0 & -1 &  \ 0 \\
% \ 1 & \ 0 & \ 0 \\
% \ 0 & \ 0 & \ 0
% \end{bmatrix}
% \end{split}
% \\
% \\
% \begin{split}
% \mG_\text{scalex} = \mG_4 &= 
% \begin{bmatrix}
% \ 1 & \ 0 & \ 0 \\
% \ 0 & \ 0 & \ 0 \\
% \ 0 & \ 0 & \ 0
% \end{bmatrix}
% \end{split}
% , \hspace{1.5cm}
% \begin{split}
% \mG_\text{scaley} = \mG_5 &= 
% \begin{bmatrix}
% \ 0 & \ 0 & \ 0 \\
% \ 0 & \ 1 & \ 0 \\
% \ 0 & \ 0 & \ 0
% \end{bmatrix}
% \end{split}
% , \hspace{1.5cm}
% \begin{split}
% \mG_\text{shear} = \mG_6 &= 
% \begin{bmatrix}
% \ 0 & \ 1 & \ 0 \\
% \ 1 & \ 0 & \ 0 \\
% \ 0 & \ 0 & \ 0
% \end{bmatrix}
% \end{split}
% \end{align*}
% 
% To parameterise affine transformations we compute the following matrix exponential \citep{moler2003nineteen}:
% 
% \begin{equation}
% \begin{split}
% T_{\boldsymbol{\epsilon}} = \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right)
% \end{split}, \hspace{1cm}
% \begin{split}
%     \boldsymbol{\epsilon} \sim U[-1, 1]^k
% \end{split}
% \end{equation}
% 
% Optionally, the values of $\boldsymbol{\eta}$ can be constrained to a positive range by passing them through a `softplus' function or, in the case of $\eta_3 = \eta_{\text{rot}}$, to $[-\pi, \pi]$ using a scaled `tanh' function, preventing double coverage of the unit circle. In practice, however, we did not find such constraints necessary as long as $\eta_{\text{rot}}$ is reasonably initialised (e.g.\ $\boldsymbol{\eta} = \boldsymbol{0}$).
% 
% By fixing certain $\eta_i$ at 0, subsets of the generator matrices parameterise rotation, translation and scaling:
% 
% \begin{equation}
% \begin{split}
% &\text{For rotation only:} \\
% &\text{Learn $\eta_3$.}\\&\text{Fix $\eta_i=0$ for all $i\neq 3$.} \\
% T^\text{(rot)}_{\boldsymbol{\epsilon}} &= \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right) \nonumber \\
% &= \exp \left( \epsilon_3 \eta_3 \mG_3 \right)  \nonumber\\
% &= \exp \left(
% \begin{bmatrix}
% 0 & -\epsilon_3\eta_3 & 0 \\
% \epsilon_3\eta_3 & 0 & 0 \\
% 0 & 0 & 0
% \end{bmatrix}
%  \right)  \nonumber\\
% &=
% \begin{bmatrix} 
% \cos(\epsilon_3\eta_3) & -\sin(\epsilon_3\eta_3) & 0 \\
% \sin(\epsilon_3\eta_3) &  \cos(\epsilon_3\eta_3) & 0 \\
% 0 & 0 & 1
% \end{bmatrix} 
% \end{split}
% \hspace{0.2cm}
% \vrule
% \hspace{0.2cm}
% \begin{split}
% &\text{For translation only:} \\
% &\text{Learn $\eta_1$ and $\eta_2$.}\\&\text{Fix $\eta_i=0$ for all $i>2$.} \\
% T^\text{(trans)}_{\boldsymbol{\epsilon}} &= \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right) \nonumber \\
% &= \exp \left( \epsilon_1 \eta_1 \mG_1 + \epsilon_2 \eta_2 \mG_2 \right) \nonumber\\
% &= \exp \left(
% \begin{bmatrix}
% 0 & 0 & \epsilon_1\eta_1 \\
% 0 & 0 & \epsilon_2\eta_2 \\
% 0 & 0 & 0
% \end{bmatrix}
%  \right)  \nonumber\\
% &=
% \begin{bmatrix}
% 1 & 0 & \epsilon_1\eta_1 \\
% 0 & 1 & \epsilon_2\eta_2 \\
% 0 & 0 & 1
% \end{bmatrix} 
% \end{split}
% \hspace{0.2cm}
% \vrule
% \hspace{0.2cm}
% \begin{split}
% &\text{For scaling only:} \\
% &\text{Learn $\eta_4$ and $\eta_5$.}\\
% &\text{Fix $\eta_i=0$ for all $i \not\in \{4, 5\}$.} \\
% T^\text{(scale)}_{\boldsymbol{\epsilon}} &= \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right) \nonumber \\
% &= \exp \left( \epsilon_4 \eta_4 \mG_4 + \epsilon_5 \eta_5 \mG_5 \right) \nonumber\\
% &= \exp \left(
% \begin{bmatrix}
% \epsilon_4\eta_4 & 0 & 0 \\
% 0 & \epsilon_5\eta_5 & 0 \\
% 0 & 0 & 0
% \end{bmatrix}
%  \right)  \nonumber\\
% &=
% \begin{bmatrix}
% \exp(\epsilon_4\eta_4) & 0 & 0 \\
% 0 & \exp(\epsilon_5\eta_5) & 0 \\
% 0 & 0 & 1
% \end{bmatrix}
% \end{split}
% \end{equation}
% 
% %\clearpage
% %\bibliography{sample}
% 
% 
\end{document}
