% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

\usepackage{hyperref}

\input{math_commands.tex}
\usepackage{microtype}
\usepackage{float}
\usepackage{subfigure}
\usepackage{adjustbox}
\usepackage{booktabs} % for professional tables
\usepackage{amsmath, nccmath}
\usepackage{wasysym}
\usepackage{graphicx,subfigure}

\usepackage[section]{placeins}

\newcommand{\xoverbrace}[2][\vphantom{\dfrac{A}{A}}]{\overbrace{#1#2}}
\newcommand{\xunderbrace}[2][\vphantom{\dfrac{A}{A}}]{\underbrace{#1#2}}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage[capitalize,noabbrev]{cleveref}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning Invariant Weights in Neural Networks (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Tycho~F.A.~van der Ouderaa}
\author[1]{Mark~van der Wilk}
% Add affiliations after the authors
\affil[1]{%
    Imperial College London, UK
}
 
\begin{document}

\maketitle

% In the unusual situation where you want a paper to appear in the
% references without citing it in the main text, use \nocite
%\nocite{langley00}
%
%\bibliography{paper}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% APPENDIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\appendix
\onecolumn

\label{appendix:vi-derivations}
\section*{Appendix A: Detailed Derivation of Variational Inference}

Applying Variational Inference (VI) \citep{hoffman2013stochastic}, we maximise the marginal likelihood w.r.t.\ parameters $\vtheta = \text{vec}(\mW_2)$ by minimising the $\KL(\cdot||\cdot)$-divergence between the approximate posterior $q(\mW_2|\vmu, \mSigma)$ and the true posterior distribution of the weights $p(\mW_2|\train)$, which is equivalent to maximising the evidence lower bound (ELBO), denoted by $\mathcal{L}$:
\begin{align*}
&\argmin_{\vmu, \mSigma} \KL(q(\mW_2|\vmu, \mSigma) || p(\mW_2|\train)) \\
&=\argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log \frac{q(\mW_2|\vmu,\mSigma)}{p(\mW_2|\train)} \right] \\
&= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log \frac{q(\mW_2|\vmu,\mSigma)}{p(\mW_2)p(\train|\mW_2)} \right] + \log p(\train) \\
&= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log \frac{q(\mW_2|\vmu,\mSigma)}{p(\mW_2)p(\train|\mW_2)} \right] \\
&= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)}\left[ \log q(\mW_2 | \vmu, \mSigma) - \log p(\mW_2) -\log p(\train|\mW_2) \right] \\
&= \argmin_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)} \left[ \log q(\mW_2 | \vmu, \mSigma) - \log p(\mW_2) \right] - \E_{q(\mW_2|\vmu, \mSigma)} \left[ \log p(\train|\mW_2) \right] \\
&= \argmin_{\vmu, \mSigma} \KL( q(\mW_2 | \vmu, \mSigma) || p(\mW_2)) + \E_{q(\mW_2|\vmu, \mSigma)} [ -\log p(\train|\mW_2) ] \\
&= \argmax_{\vmu, \mSigma} \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] - \KL( q(\mW_2 | \vmu, \mSigma) || p(\mW_2)) \\
&= \argmax_{\vmu, \mSigma} \mathcal{L}
\end{align*}

We independently model the weight vector $\vw_2^c$ for each class $c$ with a full-covariance multivariate Gaussian distribution $\mathcal{N}(\vw^c_2|\vmu^c, \mSigma^c)$, parameterised by the mean vector $\vmu^c$ and the lower-triangular (Cholesky) factor $\mL^c$ of the covariance, $\mL^c(\mL^c)^T = \mSigma^c$, to avoid computational issues, following \citet{kingma2017variational}. We can view the variational posterior $q(\mW_2|\vmu, \mSigma)$ as a multivariate Gaussian over all classes with concatenated mean and block-diagonally stacked covariances, from which we sample the flattened matrix $\mW_2$ in one go, or---equivalently---sample row vectors $\vw_2^c$ for each class and concatenate them to obtain the matrix $\mW_2$. By sampling $L$ times from the variational approximation, $\mW_2^{(1)}, \mW_2^{(2)}, \dots, \mW_2^{(L)} \sim q(\mW_2|\vmu, \mSigma)$, we obtain a Monte Carlo estimate of the expectation $\E_{\mW}:=\E_{\mW_2 \sim q(\mW_2|\vmu, \mSigma)}$ required to compute the final ELBO or negative loss $\mathcal{L}(\vtheta, \train)$:

\begin{align*}
\mathcal{L}(\vtheta, \train)
&= \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] - 
\KL( q(\mW_2 | \vmu, \mSigma) || p(\mW_2)) \\
&= \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] - \sum_c \KL( \mathcal{N}(\vw^c_2 | \vmu^c, \mSigma^c) || p(\vw^c_2)) \\
&= \E_{q(\mW_2|\vmu, \mSigma)} [ \log p(\train|\mW_2) ] -
\sum_c \KL( \mathcal{N}(\vw^c_2 | \vmu^c, \mSigma^c) || \mathcal{N}(\vw^c_2 | \vzero, \mSigma_p)) \\
&\approx -\xoverbrace{\frac{1}{L} \sum_{l=1}^L \sum_{i=1}^N
-\log \sigma_{y^{(i)}_c} \Big( \E_{T\sim p_{\boldsymbol{\eta}}(T)} \left[ \mW_{2}^{(l)} \circ \phi\left( \mW_{1} \circ T \circ \vx^{(i)} \right) \right] \Big)
}^{\text{Regular Average Cross-entropy}} -
\xoverbrace{\sum_c \frac{1}{2} \left[ \log \frac{|\mSigma_p|}{|\mSigma^c|} - D + \text{tr}\left\{\mSigma_p^{-1} \mSigma^c\right\} + (\vmu^c)^T \mSigma_p^{-1} \vmu^c \right] }^{\text{Closed-form KL Regularizer}}
\end{align*}

for every input $\vx^{(i)}$, log soft-argmax output $\sigma_{y^{(i)}_c}$ for the class of the corresponding label $y^{(i)}_c$, fixed first layer weights $\mW_1$, prior covariance $\mSigma_p = \alpha \mI$, input dimensionality $D$, and trace $\text{tr}(\cdot)$. To allow for mini-batching, we use the Stochastic Gradient Variational Bayes (SGVB) estimator from \citet{kingma2013auto} of the ELBO or negative loss $\mathcal{\tilde{L}}(\vtheta, \train)$:
\begin{align*}
\mathcal{\tilde{L}}(\vtheta, \train)
&=
-N
\xoverbrace{
\frac{1}{ML}
\sum_{l=1}^L \sum_{i=1}^M
-\log \sigma_{y^{(i)}_c} \Big( \E_{T\sim p_{\boldsymbol{\eta}}(T)} \left[ \mW_{2}^{(l)} \circ \phi\left( \mW_{1} \circ T \circ \vx^{(i)} \right) \right] \Big)
}^{\text{Regular Batch Averaged Cross-entropy}} -
\xoverbrace{\sum_c \frac{1}{2} \left[ \log \frac{|\mSigma_p|}{|\mSigma^c|} - D + \text{tr}\left\{\mSigma_p^{-1} \mSigma^c\right\} + (\vmu^c)^T \mSigma_p^{-1} \vmu^c \right] }^{\text{Closed-form KL Regularizer}}
\end{align*}
where we can choose $L=1$ if we use a sufficiently large batch size.


\clearpage
\label{appendix:weights-visualizations}
\section*{Appendix B: Weight Visualizations of Learned Rotational Invariance}

 \begin{figure*}[!ht]
 \centering 
 \subfigure[Feature bank \#1 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W0.pdf}}
 \hspace{0.05\linewidth}
 \subfigure[Feature bank \#2 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W1.pdf}}
 
 \subfigure[Feature bank \#3 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W2.pdf}}
 \hspace{0.05\linewidth}
 \subfigure[Feature bank \#4 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W3.pdf}}
 
 \subfigure[Feature bank \#5 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W4.pdf}}
 \hspace{0.05\linewidth}
 \subfigure[Feature bank \#6 over training iterations]{\includegraphics[width=0.43\linewidth]{images/visualized_weights/W5.pdf}}
 \caption{Illustration of the feature banks over training iterations. Features are randomly initialised with almost no rotational invariance and converge to particular filters with full rotational invariance when trained on fully rotated MNIST data.}
 \label{fig:visualized-weights-extra}
 \end{figure*}

\clearpage
\section*{Appendix C.1: Rotational Invariance in RFF Neural Network}

\begin{figure*}[!ht]
    \centering
    \begin{adjustbox}{max width=0.9\linewidth}
    \includegraphics[width=0.9\linewidth]{images/recovering-invariances/recovering-invariances.pdf}
    %}
    \end{adjustbox}
    \caption{Predicted invariance over training iterations for different initial invariances for RFF neural network.}
    \label{fig:extra-recovering-invariance-rff}
\end{figure*}

\begin{table*}[!ht]
    \centering
    %\begin{tabular}{r|c c c|c c c|c c c}
    \begin{adjustbox}{max width=0.7\linewidth}
    \begin{tabular}{r|c c c|c c c}
       & \multicolumn{3}{ c }{Test Accuracy}
       & \multicolumn{3}{ c }{ELBO} \\
      Model
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
    \hline
      Fixed 5$^\circ$
      & 79.29 & 86.71 & \textbf{96.00}
      & -1.07 & -0.80 & -0.36
      \\
      Fixed 45$^\circ$
      & 87.35 & 91.13 & 95.93
      & -0.63 & -0.49 & \textbf{-0.26}
      \\
      Fixed 90$^\circ$
      & 90.33 & \textbf{91.69} & 94.69
      & -0.52 & \textbf{-0.44} & -0.30
      \\
      Fixed 135$^\circ$
      & 91.19 & 91.04 & 92.13
      & -0.45 & -0.45 & -0.36
      \\
      Fixed 175$^\circ$
      & \textbf{91.57} & 90.47 & 90.97
      & \textbf{-0.43} & -0.47 & -0.45
      \\
    \hline
      Learned (5$^\circ$ Init)
      & \textbf{91.72} & \textbf{92.34} & \textbf{96.40}
      & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
      \\
      Learned (45$^\circ$ Init)
      & \textbf{91.65} & \textbf{92.31} & \textbf{96.42}
      & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
      \\
      Learned (90$^\circ$ Init)
      & \textbf{91.65} & \textbf{92.37} & \textbf{96.40}
      & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
      \\
      Learned (135$^\circ$ Init)
      & \textbf{91.66} & \textbf{92.37} & \textbf{96.10}
      & \textbf{-0.43} & \textbf{-0.42} & \textbf{-0.26}
      \\
      Learned (175$^\circ$ Init)
      & \textbf{91.68} & \textbf{91.69} & 95.64
      & \textbf{-0.43} & \textbf{-0.43} & \textbf{-0.26}
      \\
    \hline
    \end{tabular}
    \end{adjustbox}
    \caption{Table containing Test Accuracy and ELBO scores after training for experiments with RFF network. In bold: the best scores for fixed invariance and, for learned invariances, all scores that surpass the best score using fixed invariance.}
    \label{tab:additional-rff}
\end{table*}

\newpage
\section*{Appendix C.2: Rotational Invariance in ReLU Neural Network}

\begin{figure*}[!ht]
    \centering
    %\resizebox{0.9\linewidth}{!}{
    \begin{adjustbox}{max width=0.9\linewidth}
    \includegraphics[width=0.9\linewidth]{images/recovering-invariances/learning-invariances-nn.pdf}
    %}
    \end{adjustbox}
    \caption{Predicted invariance over training iterations for different initial invariances of ReLU neural network with both input and output layer weights trained.}
    \label{fig:extra-recovering-invariance-nn}
\end{figure*}


\begin{table*}[!ht]
    \centering
    \begin{adjustbox}{max width=0.7\linewidth}
    \begin{tabular}{r|c c c|c c c}
       %& \multicolumn{3}{ c }{Test NLL}
       & \multicolumn{3}{ c }{Test Accuracy}
       & \multicolumn{3}{ c }{ELBO} \\
      Model
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{{\footnotesize Partially rotated} \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
      \hline
      Fixed 5$^\circ$
      & 87.21 & 90.68 & 96.76 
      & -0.28 & -0.20 & \textbf{-0.02}
      \\
      Fixed 45$^\circ$
      & 95.24 & 96.46 & 98.13
      & -0.09 & \textbf{-0.06} & -0.02
      \\
      Fixed 90$^\circ$
      & 96.50 & 97.11 & \textbf{98.14}
      & -0.07 & -0.06 & -0.03 
      \\
      Fixed 135$^\circ$
      & 97.15 & \textbf{97.31} & 97.79
      & \textbf{-0.06} & -0.06 & -0.04 
      \\
      Fixed 175$^\circ$
      & \textbf{97.53} & 97.30 & 97.15
      & -0.07 & -0.06 & -0.06
      \\
    \hline
      Learned (0$^\circ$ Init)
      & 97.34 & 97.13 & \textbf{98.40} 
      & -0.07 & \textbf{-0.06} & \textbf{-0.02}
      \\
      Learned (45$^\circ$ Init)
      & 97.23 & \textbf{97.36} & \textbf{98.27} 
      & -0.07 & \textbf{-0.05} & \textbf{-0.02}
      \\
      Learned (90$^\circ$ Init)
      & 97.28 & \textbf{97.22} & \textbf{98.19}
      & -0.07 & \textbf{-0.06} & \textbf{-0.02}
      \\
      Learned (135$^\circ$ Init)
      & 97.45 & \textbf{97.29} & \textbf{98.33} 
      & \textbf{-0.06} & \textbf{-0.05} & \textbf{-0.02}
      \\
      Learned (175$^\circ$ Init)
      & 97.23 & \textbf{97.23} & \textbf{98.03}
      & \textbf{-0.06} & \textbf{-0.06} & \textbf{-0.03}
      \\
    \hline
    \end{tabular}
    \end{adjustbox}
    \caption{Table containing Test Accuracy and ELBO scores after training for experiments of ReLU neural network with both input and output layer weights trained. In bold: the best scores for fixed invariance and, for learned invariances, all scores that surpass the best score using fixed invariance.}
    \label{tab:additional-relu}
\end{table*}

\clearpage
\section*{Appendix C.3: Different Transformations in RFF Network}

\begin{table}[!ht]
    \centering
    \begin{adjustbox}{max width=1.0\linewidth}
    \begin{tabular}{r|c c c c|c c c c}
       & \multicolumn{4}{ c }{\underline{Test Accuracy}}
       & \multicolumn{4}{ c }{\underline{ELBO}} \\
      Model
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{Translated \\ MNIST}
      & \shortstack{Scaled \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{Translated \\ MNIST}
      & \shortstack{Scaled \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
    \hline
    Regular MLP
      & 79.29 & 66.07 & 89.25 & 95.16
      & -1.14  & -1.49 & -0.69 & -0.39
      \\
    \hline
    + Rotation
      & \textbf{92.59} & 75.06 & 88.66 & 96.59
      & \textbf{-0.43} & -1.08 & -0.62 & -0.26
      \\
    + Translation
      & 83.66 & \textbf{87.81} & 86.15 & 96.78
      & -0.82 & \textbf{-0.64} & -0.72 & -0.24
      \\
    + Scale
      & 82.77 & 75.48 & \textbf{91.31} & 96.52
      & -0.84 & -1.08 & \textbf{-0.49} & -0.26
      \\
      \hline
    + Affine
      & \textbf{92.64} & \textbf{87.77} & \textbf{90.58} & \textbf{97.38}
      & \textbf{-0.43} & \textbf{-0.64} & \textbf{-0.54} & \textbf{-0.21}
      \\
    %\hline
    %+ Diffeomorphism
    %  & & &
    %  & & &
    %  \\
    \hline
    \end{tabular}
    \end{adjustbox}
    \caption{Test Accuracy and ELBO for learned invariance using different transformations in a shallow RFF neural network.}
    \label{tab:additional-transformations-rff}
\end{table}

\section*{Appendix C.4: Different Transformations in ReLU Network}

\begin{table}[h]
    \centering
    \begin{adjustbox}{max width=1.0\linewidth}
    \begin{tabular}{r|c c c c|c c c c}
       & \multicolumn{4}{ c }{\underline{Test Accuracy}}
       & \multicolumn{4}{ c }{\underline{ELBO}} \\
      Model      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{Translated \\ MNIST}
      & \shortstack{Scaled \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ MNIST}
      & \shortstack{Translated \\ MNIST}
      & \shortstack{Scaled \\ MNIST}
      & \shortstack{Regular \\ MNIST}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
    \hline
    Regular MLP
      & 90.35 & 89.34 & 96.61 & 98.10
      & -0.06 & -0.06 & -0.03 & -0.02
      \\
    \hline
    + Rotation
      & \textbf{98.05} & 94.08 & 97.62 & 98.64
      & \textbf{-0.05} & -0.06 & -0.03 & -0.02
      \\
    + Translation
      & 93.59 & \textbf{97.87} & 97.98 & 98.76
      & -0.09 & -0.06 & -0.03 & -0.02
      \\
    + Scale
      & 93.80 & 94.30 & \textbf{98.06} & 98.35
      & -0.06 & -0.06 & -0.03 & -0.02
      \\
      \hline
    + Affine
      & \textbf{98.14} & \textbf{97.66} & \textbf{98.31} & \textbf{98.93}
      & \textbf{-0.05} & -0.06 & -0.03 & -0.02
      \\
    \hline
    \end{tabular}
    \end{adjustbox}
    \caption{Test Accuracy and ELBO for learned invariance using different transformations in a shallow ReLU neural network.}
    \label{tab:additional-transformations-relu}
\end{table}

\section*{Appendix C.5: Different Transformations in ReLU Network on Datasets with Combinations of Two Invariances}

\begin{table}[h]
    \centering
    \begin{adjustbox}{max width=1.0\linewidth}
    \begin{tabular}{r|c c c c|c c c c}
       & \multicolumn{4}{ c }{\underline{Test Accuracy}}
       & \multicolumn{4}{ c }{\underline{ELBO}} \\
      Model
      & \shortstack{{\footnotesize Fully rotated} \\ + Translated \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ + Scaled \\ MNIST}
      & \shortstack{Translated \\+ Scaled\\ MNIST}
      & \shortstack{Regular \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ + Translated \\ MNIST}
      & \shortstack{{\footnotesize Fully rotated} \\ + Scaled \\ MNIST}
      & \shortstack{Translated \\+ Scaled\\ MNIST}
      & \shortstack{Regular \\ MNIST}
      %& Full ±180$^\circ$ & Half ±90$^\circ$ & Regular
      \\
    \hline
    Regular MLP
      & 53.36 & 80.71 & 75.50 & 98.10
      & \textbf{-0.26} & \textbf{-0.10} & \textbf{-0.12} & \textbf{-0.02}
      \\
    \hline
    + Rotation
      & \textbf{85.35} & \textbf{95.66} & 85.42 & 98.64
      & -0.31 & -0.10 & -0.27 & -0.02
      \\
    + Translation
      & \textbf{83.84} & 83.40 & \textbf{91.77} & 98.76
      & -0.42 & -0.16 & -0.19 & -0.02
      \\
    + Scale
      & 55.63 & \textbf{89.81} & \textbf{86.04} & 98.35
      & -0.39 & -0.12 & -0.17 & -0.02
      \\
      \hline
    + Affine
      & \textbf{89.37} & \textbf{95.88} & \textbf{91.95} & \textbf{98.93}
      & -0.37 & -0.09 & -0.18 & -0.02
      \\
    \hline
    \end{tabular}
    \end{adjustbox}
    \caption{Test Accuracy and ELBO for learned invariance using different transformations in a shallow ReLU neural network on datasets augmented by two subsequent transformations (rotation+translation, rotation+scaling and translation+scaling). Surprisingly, the regular MLP ends up with the best ELBO in this experiment. We did not consistently observe the best ELBO for the regular MLP throughout optimization, and find that we can still use our method and the ELBO to learn invariances in this case. Again, we observe that models with learned invariances achieve the highest test accuracy.}
    \label{tab:additional-transformations-relu-combined}
\end{table}

\newpage
\section*{Appendix D: Dataset Details}

All datasets have 60000 training examples and 10000 test examples and are created by taking regular MNIST or CIFAR-10 and applying random transformations:

\textbf{Regular MNIST Dataset:} MNIST handwritten digit database \citep{lecun1998gradient}. \\
\textbf{Regular CIFAR-10 Dataset:} CIFAR-10 dataset with 10 classes \citep{krizhevsky2009learning}. \\
\textbf{Partially rotated dataset:} Every sample rotated by radian angle $\theta$, sampled from $\theta \sim U[-\frac{\pi}{2}, \frac{\pi}{2}]$. \\
\textbf{Fully rotated dataset:} Every sample rotated by radian angle $\theta$, sampled from $\theta \sim U[-\pi, \pi]$. \\
\textbf{Translated dataset:} Every sample translated by $dx$ and $dy$ pixels, sampled from $dx, dy \sim U[-8, 8]$. \\
\textbf{Scaled dataset:} Every sample scaled around its centre by $\exp(s)$, sampled from $s \sim U[-\log(2), \log(2)]$.

\clearpage
\section*{Appendix E: Lie Group Generators}

We follow \citet{benton2020learning} and, similarly, utilise six matrix generators:

\begin{align*}
\begin{split}
\mG_\text{transx} = \mG_1 &= 
\begin{bmatrix}
\ 0 & \ 0 & \ 1 \\
\ 0 & \ 0 & \ 0 \\
\ 0 & \ 0 & \ 0
\end{bmatrix}
\end{split}
, \hspace{0.5cm}
\begin{split}
\mG_\text{transy} = \mG_2 &= 
\begin{bmatrix}
\ 0 & \ 0 & \ 0 \\
\ 0 & \ 0 & \ 1 \\
\ 0 & \ 0 & \ 0
\end{bmatrix}
\end{split}
, \hspace{1.5cm}
\begin{split}
\mG_\text{rot} = \mG_3 &= 
\begin{bmatrix}
\ 0 & -1 &  \ 0 \\
\ 1 & \ 0 & \ 0 \\
\ 0 & \ 0 & \ 0
\end{bmatrix}
\end{split}
\\
\\
\begin{split}
\mG_\text{scalex} = \mG_4 &= 
\begin{bmatrix}
\ 1 & \ 0 & \ 0 \\
\ 0 & \ 0 & \ 0 \\
\ 0 & \ 0 & \ 0
\end{bmatrix}
\end{split}
, \hspace{1.5cm}
\begin{split}
\mG_\text{scaley} = \mG_5 &= 
\begin{bmatrix}
\ 0 & \ 0 & \ 0 \\
\ 0 & \ 1 & \ 0 \\
\ 0 & \ 0 & \ 0
\end{bmatrix}
\end{split}
, \hspace{1.5cm}
\begin{split}
\mG_\text{shear} = \mG_6 &= 
\begin{bmatrix}
\ 0 & \ 1 & \ 0 \\
\ 1 & \ 0 & \ 0 \\
\ 0 & \ 0 & \ 0
\end{bmatrix}
\end{split}
\end{align*}

To parameterise affine transformations we compute the following matrix exponential \citep{moler2003nineteen}:

\begin{equation}
\begin{split}
T_{\boldsymbol{\epsilon}} = \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right)
\end{split}, \hspace{1cm}
\begin{split}
    \boldsymbol{\epsilon} \sim U[-1, 1]^k
\end{split}
\end{equation}

Optionally, the values of $\boldsymbol{\eta}$ can be constrained to a positive range by passing them through a `softplus'-function, or in case of $\eta_3 = \eta_{\text{rot}}$ to $[-\pi, \pi]$ using a scaled `tanh' function, preventing double coverage on the unit circle. In practice, however, we did not find such constraints necessary as long as $\eta_{\text{rot}}$ is reasonably initialised (e.g. $\boldsymbol{\eta} = \boldsymbol{0}$).

By fixing certain $\eta_i$ at 0, subsets of the generator matrices parameterise rotation, translation and scaling:

\begin{equation}
\begin{split}
&\text{For rotation only:} \\
&\text{Learn $\eta_3$.}\\&\text{Fix $\eta_i=0$ for all $i\neq 3$.} \\
T^\text{(rot)}_{\boldsymbol{\epsilon}} &= \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right) \nonumber \\
&= \exp \left( \epsilon_3 \eta_3 \mG_3 \right)  \nonumber\\
&= \exp \left(
\begin{bmatrix}
0 & -\epsilon_3\eta_3 & 0 \\
\epsilon_3\eta_3 & 0 & 0 \\
0 & 0 & 0
\end{bmatrix}
 \right)  \nonumber\\
&=
\begin{bmatrix} 
\cos(\epsilon_3\eta_3) & -\sin(\epsilon_3\eta_3) & 0 \\
\sin(\epsilon_3\eta_3) &  \cos(\epsilon_3\eta_3) & 0 \\
0 & 0 & 1
\end{bmatrix} 
\end{split}
\hspace{0.2cm}
\vrule
\hspace{0.2cm}
\begin{split}
&\text{For translation only:} \\
&\text{Learn $\eta_1$ and $\eta_2$.}\\&\text{Fix $\eta_i=0$ for all $i>2$.} \\
T^\text{(trans)}_{\boldsymbol{\epsilon}} &= \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right) \nonumber \\
&= \exp \left( \epsilon_1 \eta_1 \mG_1 + \epsilon_2 \eta_2 \mG_2 \right) \nonumber\\
&= \exp \left(
\begin{bmatrix}
0 & 0 & \epsilon_1\eta_1 \\
0 & 0 & \epsilon_2\eta_2 \\
0 & 0 & 0
\end{bmatrix}
 \right)  \nonumber\\
&=
\begin{bmatrix}
1 & 0 & \epsilon_1\eta_1 \\
0 & 1 & \epsilon_2\eta_2 \\
0 & 0 & 1
\end{bmatrix} 
\end{split}
\hspace{0.2cm}
\vrule
\hspace{0.2cm}
\begin{split}
&\text{For scaling only:} \\
&\text{Learn $\eta_4$ and $\eta_5$.}\\
&\text{Fix $\eta_i=0$ for all $i \not\in \{4, 5\}$.} \\
T^\text{(scale)}_{\boldsymbol{\epsilon}} &= \exp \left( \sum_i \epsilon_i \eta_i \mG_i \right) \nonumber \\
&= \exp \left( \epsilon_4 \eta_4 \mG_4 + \epsilon_5 \eta_5 \mG_5 \right) \nonumber\\
&= \exp \left(
\begin{bmatrix}
\epsilon_4\eta_4 & 0 & 0 \\
0 & \epsilon_5\eta_5 & 0 \\
0 & 0 & 0
\end{bmatrix}
 \right)  \nonumber\\
&=
\begin{bmatrix}
\exp(\epsilon_4\eta_4) & 0 & 0 \\
0 & \exp(\epsilon_5\eta_5) & 0 \\
0 & 0 & 1
\end{bmatrix}
\end{split}
\end{equation}



\end{document}
