
% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.


%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
    
%\usepackage[round]{natbib}
%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)


%\usepackage{cleverref}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{xr-hyper}
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{subcaption}     % subfigs
\usepackage{enumitem}
\usepackage{stfloats}      % positioning floats with two cols
\usepackage{placeins}      % for \FloatBarrier
\usepackage{mathtools,amsmath,amssymb}
\usepackage{refcount}     % to get counter of last eqn in main text
\usepackage{float}

\usepackage{adjustbox}  % for figs in appendix E

% Single-letter shorthands that set the corresponding letter in \mathbf.
% All are parameterless and intended for use in math mode.
\newcommand*{\x}{\mathbf{x}}
%\newcommand{\W}{\mathbf{W}}
\newcommand*{\w}{\mathbf{w}}
\newcommand*{\X}{\mathbf{X}}
\newcommand*{\y}{\mathbf{y}}
\newcommand*{\Y}{\mathbf{Y}}
\newcommand*{\p}{\mathbf{p}}
\newcommand*{\f}{\mathbf{f}}

% \lrbracket{<left>}{<right>}{<body>}
%   Wraps <body> in an auto-sized \left<left> ... \right<right> pair.
\newcommand{\lrbracket}[3]{\left#1 #3 \right#2}
% \lmrbracket{<left>}{<mid>}{<right>}{<body1>}{<body2>}
%   Same, with an auto-sized \middle<mid> between <body1> and <body2>.
\newcommand{\lmrbracket}[5]{\left#1 #4 \middle#2 #5 \right#3}

% Partially applied forms: each fixes the delimiters and leaves the body
% argument(s) to be picked up from the input that follows the macro, so they
% are used as \b{..}, \bc{..}{..}, \sqb{..}, \sqbc{..}{..}.
% \renewcommand is required for \b because that name is already defined.
\renewcommand{\b}{\lrbracket{(}{)}}
\newcommand{\bc}{\lmrbracket{(}{\vert}{)}}
\newcommand{\sqb}{\lrbracket{[}{]}}
\newcommand{\sqbc}{\lmrbracket{[}{\vert}{]}}

% --- Objective / loss symbols -------------------------------------------
% Calligraphic L (overrides the existing \L) and hatted variants.
\renewcommand*{\L}{\mathcal{L}}
\newcommand*{\Lh}{\hat{\mathcal{L}}}
\newcommand*{\Xh}{\hat{X}}

% Calligraphic L with a descriptive subscript.
% Note: \Llogprob carries the subscript "loss".
\newcommand*{\Lnoaug}{\mathcal{L}_{\text{noaug}}}
\newcommand*{\Ladd}{\mathcal{L}_{\text{add}}}
\newcommand*{\Llogprob}{\mathcal{L}_{\text{loss}}}
\newcommand*{\Llogits}{\mathcal{L}_{\text{logits}}}
\newcommand*{\Lprob}{\mathcal{L}_{\text{prob}}}
\newcommand*{\Linv}{\mathcal{L}_{\mathrm{inv}}}

% Hatted versions; the logits/prob ones carry an extra index K.
\newcommand*{\Lhlogprob}{\hat{\mathcal{L}}_{\text{loss}}}
\newcommand*{\Lhlogits}{\hat{\mathcal{L}}_{\text{logits},K}}
\newcommand*{\Lhprob}{\hat{\mathcal{L}}_{\text{prob},K}}

% Un-hatted versions with index K.
\newcommand*{\LlogitsK}{\mathcal{L}_{\text{logits},K}}
\newcommand*{\LprobK}{\mathcal{L}_{\text{prob},K}}

% Hatted versions with index K+1.
\newcommand*{\LhlogitsKpo}{\hat{\mathcal{L}}_{\text{logits},K+1}}
\newcommand*{\LhprobKpo}{\hat{\mathcal{L}}_{\text{prob},K+1}}

% Hatted versions with index 1 (separated by a semicolon, unlike the ones above).
\newcommand*{\LhlogitsKone}{\hat{\mathcal{L}}_{\text{logits};1}}
\newcommand*{\LhprobKone}{\hat{\mathcal{L}}_{\text{prob};1}}

% K with a train/test subscript.
\newcommand*{\Ktrain}{K_{\text{train}}}
\newcommand*{\Ktest}{K_{\text{test}}}


% Upright operator symbols. \P already has a meaning when this line runs, so
% it is reset to \relax first; \DeclareMathOperator refuses to overwrite an
% existing command.
\let\P\relax
\DeclareMathOperator{\P}{P}
\DeclareMathOperator{\Q}{Q}
\DeclareMathOperator{\E}{\mathbb{E}}
% \P with a text subscript: logits, prob, noaug.
\newcommand{\Plogits}{\P_\text{logits}}
\newcommand{\Pprob}{\P_\text{prob}}
\newcommand{\Pnoaug}{\P_\text{noaug}}

% Starred declarations: subscripts are set as limits underneath the operator
% in display style. \const and \softmax are ordinary (unstarred) operators.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\const}{const}
\DeclareMathOperator{\softmax}{softmax}

% Expectation operator \E with a conditioning subscript:
%   \Exdash   : x' given x
%   \Exdashi  : x'_i given x_i
%   \Exdashik : the set {x'_{i;k}} given x_i
\newcommand*{\Exdash}{\E_{\x'|\x}}
\newcommand*{\Exdashi}{\E_{\x'_i|\x_i}}
\newcommand*{\Exdashik}{\E_{\{\x'_{i;k}\}|\x_i}}

% Bold f and p with an "inv" text subscript.
\newcommand*{\finv}{\f_{\text{inv}}}
\newcommand*{\pinv}{\p_{\text{inv}}}
% Transpose superscript, used as $A\transpose$.
% A text size switch such as \tiny is not valid in math mode: LaTeX issues a
% warning and the math font is left unchanged. The superscript position
% already selects script size, so an upright sans-serif T is used instead,
% which also keeps the mark distinct from a variable named T.
\newcommand{\transpose}{^{\mathsf{T}}}


\makeatletter
% \blfootnote{<text>}: footnote text with an empty mark.
% \@thefnmark is globally redefined to expand to nothing, after which
% \@footnotetext reads the <text> argument that follows the macro.
% The footnote counter is not stepped by this definition.
\def\blfootnote{\xdef\@thefnmark{}\@footnotetext}
\makeatother

% Summation sign forced to \textstyle; the extra brace pair keeps the style
% switch local to the symbol.
\newcommand{\tsum}{{\textstyle \sum}}

% \myeqref{<label>}: prints "(Eq.~<number>)" for the referenced equation.
\newcommand{\myeqref}[1]{(Eq.~\ref{#1})}

\makeatletter
% \addFileDependency{<file name with extension>}
%   1. writes "(<file>)" to the terminal/log via \typeout,
%   2. passes <file> to \@addtofilelist,
%   3. writes "No file <file>." when the file cannot be found.
% Presumably the parenthesised \typeout is there so that build tools which
% scan the log for opened files register <file> as a dependency -- confirm.
% Every line inside the body ends in % so that the macro expands to no stray
% space tokens, wherever it is called.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Calls \externaldocument{<basename>} so that labels from that document can
%   be referenced here, then registers <basename>.tex and <basename>.aux
%   through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
% NOTE(review): nabarro_119 is presumably the main paper, whose labels
% (sections, figures, equations) are referenced below -- confirm.
\myexternaldocument{nabarro_119}
% \externalcitedocument{nabarro_119}


% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.

\title{Data augmentation in Bayesian neural networks and the cold posterior effect (supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<sdn09@ic.ac.uk>?Subject=DA in BNNs (UAI 2022)}{Seth Nabarro$^*$}{}}
\author[2]{Stoil Ganev$^*$}
\author[3]{Adrià Garriga-Alonso}
\author[3,4]{\\Vincent Fortuin}
\author[1]{Mark van der Wilk$^\dagger$}
\author[2]{Laurence Aitchison$^\dagger$}

% Add affiliations after the authors
\affil[1]{%
  Department of Computing\\Imperial College London %London, SW7 2BX, UK
}
\affil[2]{%
  Department of Computer Science\\University of Bristol %\\
  %Bristol, BS8 1UB, UK
}
\affil[3]{%
  Department of Engineering\\University of Cambridge %\\
  %Cambridge, CB2 1PZ, UK
  }
\affil[4]{%
  Department of Computer Science\\ETH Zürich %,\\ 
  %Zürich,  Switzerland
  }

\begin{document}

\onecolumn
% Set figure and equation counters to start at their final values from the main text
\setcounter{equation}{\getrefnumber{eq:finite_orbit}}
\setcounter{figure}{\getrefnumber{fig:cold_posterior}}

\maketitle
~   % to force break between affiliations and first appendix

\appendix

\section{Averaging Losses Emerges when using DA in VI and SGLD}
\label{sec:app:vi_sgld}


There are two particularly important algorithms for doing Bayesian inference in neural networks: stochastic gradient Langevin dynamics \citep[SGLD;][]{welling2011bayesian} and variational inference \citep[VI;][]{blundell2015weight}.
In SGLD without DA, we draw samples from the posterior over weights by following the gradient of the log-probability with added noise,
\begin{align}
% \begin{split}
  \b{\Delta \w}_\text{noaug} =\frac{\epsilon}{2} \nabla_{\w} \Bigg[\log \P\b{\w} + \sum_{i=1}^N \log \P_\text{noaug}\bc{y_i}{\x_i, \w} \Bigg]  + \sqrt{\epsilon} \; \boldsymbol{\eta}
  \label{eq:sgld}
  %&= \frac{\epsilon}{2} \nabla_{\w} \sqb{\P\b{\w} + \sum_{i=1}^N \log \softmax_{y_i}\b{f(\x_i; \w)}}  + \sqrt{\epsilon} \; \boldsymbol{\eta}
%  \end{split}
\end{align}
where $\boldsymbol{\eta}$ is standard Gaussian IID noise, and for simplicity we give the expression for full-batch Langevin dynamics rather than minibatched SGLD (they do not differ for the purposes of reasoning about DA). 
Likewise the variational inference objective is,
\begin{align}
  \label{eq:elbo}
  \text{ELBO}_\text{noaug} = \E_{\Q\b{\w}}\Bigg[\log\P\b{\w} +\sum_{i=1}^N \log \P_\text{noaug}\bc{y_i}{\x_i, \w} 
   -\log \Q\b{\w}\Bigg]
\end{align}
where $\Q\b{\w}$ is the variational approximate posterior learned by optimizing this objective.
%The typical approach to incorporating DA in these settings is to replace the unaugmented image, $\x_i$, with an augmented image, $\x'_i$, in Eq.~\eqref{eq:sgld} and \eqref{eq:elbo}.
%\begin{align}
%  \E\sqb{\b{\Delta \w}_\text{aug}} %&= \frac{\epsilon}{2} \nabla_{\w} \sqb{\P\b{\w} + \sum_{i=1}^N \E\sqb{\log \softmax_{y_i}\b{f(\x_i'; \w)}}}  + \sqrt{\epsilon} \; \boldsymbol{\eta}\\
%  &= \frac{\epsilon}{2} \nabla_{\w} \sqb{\P\b{\w} + \sum_{i=1}^N \Llogprob^i\b{y_i; \w} }  + \sqrt{\epsilon} \; \boldsymbol{\eta},\\
%  \text{ELBO}_\text{aug} &= \E_{\Q\b{\w}}\sqb{\P\b{\w} + \sum_{i=1}^N \Llogprob^i\b{y_i; \w} - \log \Q\b{\w}}.
%\end{align}
To understand the effect of the standard approach to DA, we replace $\log \P_\text{noaug}\bc{y_i}{\x_i, \w}$ with $\Llogprob^i(y_i; \w)$.
Then, we consider the expected update to the weights, averaging over the augmented images, $\x_i'$, conditioned on the underlying unaugmented images, $\x_i$,
\begin{align}
  \E\sqb{\b{\Delta \w}_\text{aug}} %&= \frac{\epsilon}{2} \nabla_{\w} \sqb{\P\b{\w} + \sum_{i=1}^N \E\sqb{\log \softmax_{y_i}\b{f(\x_i'; \w)}}}  + \sqrt{\epsilon} \; \boldsymbol{\eta}\\
  &= \frac{\epsilon}{2} \nabla_{\w} \Bigg[\log\P\b{\w} +\sum_{i=1}^N \Llogprob^i\b{y_i; \w} \Bigg]  + \sqrt{\epsilon} \; \boldsymbol{\eta},\\
  \text{ELBO}_\text{aug} &= \E_{\Q\b{\w}}\Bigg[\log\P\b{\w}+ \sum_{i=1}^N \Llogprob^i\b{y_i; \w} - \log \Q\b{\w}\Bigg].
\end{align}
In both cases, this ultimately replaces $\log \P_\text{noaug}\bc{y_i}{\x_i, \w}$ with $\Llogprob^i\b{y_i; \w}$, which as discussed in Sec.~\ref{sec:meth} is not a valid log-likelihood.

%\section{Principles for Incorporating DA in Probabilistic Models}


\section{The Approximate Posterior in VI Reduces Variance}
\label{sec:app:vi_var}

Here, we derive the ELBO using Jensen's inequality; we take $x$ to be the data and $z$ to be a latent variable.
Our goal is to compute the model evidence, $\P\b{x}$, by integrating out $z$,
\begin{align}
  \P\b{x} &= \int dz \P\bc{x}{z} \P\b{z} =\int dz \P\b{x,z}
\end{align}
where $\P\b{z}$ is the prior, $\P\bc{x}{z}$ is the likelihood and $\P\b{x, z}$ is the joint.
We introduce an approximate posterior, $\Q\b{z}$, and rewrite the integral as an expectation over that approximate posterior and apply Jensen's inequality,
\begin{align}
  \log \P\b{x} &= \log \int dz \Q\b{z} \frac{\P\b{x,z}}{\Q\b{z}} \\
  &= \log \E_{\Q\b{z}}\sqb{\frac{\P\b{x,z}}{\Q\b{z}}}\geq\E_{\Q\b{z}}\sqb{\log \frac{\P\b{x,z}}{\Q\b{z}}}.
\end{align}
Now it is evident that the tightness of the bound is controlled by the variance of $\P\b{x, z} / \Q\b{z}$.
Critically, if $\Q\b{z}$ matches the true posterior,
\begin{align}
  \Q\b{z} &= \P\bc{z}{x} \propto \P\b{x, z}
\end{align}
then $\P\b{x, z} / \Q\b{z}$ is constant (zero variance) and the bound is tight.

% \pagebreak


%\section{Results on general multisample bounds}
%Here, we show that:
%\begin{itemize}
%  \item Using more samples results in a tighter bound, $\L_{K+1} \geq \L_{K}$.
%  \item Taking the limit as $K\rightarrow \infty$ gives the right value, $\lim_{K\rightarrow\infty} \L_K = \L$.
%\end{itemize}
%The goal is to bound $\L$, defined by a convex function, $\psi$ (which is a $\log$ for the usual VAE Jensen bound and for averaging logits, but is $\log \softmax$ for averaging probabilities), applied to the expected value of a random variable, $X$,
%\begin{align}
%  \L &= \psi\b{\E\sqb{X}}.
%\end{align}
%Applying Jensen / convexity, we obtain the usual single-sample bound,
%\begin{align}
%  \L &= \psi\b{\E\sqb{X}} \geq \E\sqb{\psi(X)} = \L_1.
%\end{align}
%Now, we consider taking $K$ IID copies of $X$, denoted $X_k$,
%\begin{align}
%  \Xh_K &= \tfrac{1}{K} \sum_{k=1}^K X_k &
%  \Lh_K &= \E\sqb{\psi(\Xh_K)}
%\end{align}
%By the convexity of $\psi$,
%\begin{align}
%  \psi\b{\Xh_2} =\psi\b{\tfrac{1}{2}\b{X_1 + X_2}}\geq \tfrac{1}{2} \b{\psi(X_1) + \psi(X_2)}.
%\end{align}
%Taking the expectation,
%\begin{align}
%  \Lh_2 = \E\sqb{\psi\b{\Xh_2}} \geq \E\sqb{\tfrac{1}{2} \b{\psi(X_1) + \psi(X_2)}} = \E\sqb{\psi(X)} = \L_1.
%\end{align}
%so, $\L_2$ is a tighter bound than $\L_1$.
%Generalising this, by the convexity of $\psi$,
%\begin{align}
%  \psi\b{\Xh_{K+1}} =\psi\b{\tfrac{K}{K+1} \Xh_K + \tfrac{1}{K+1} X_{k+1}}\geq \b{\tfrac{K}{K+1}\psi(\Xh_K) + \tfrac{1}{K+1}\psi(X_{k+1})}.
%\end{align}
%Taking the expectation,
%\begin{align}
%  \Lh_{K+1} = \E\sqb{\psi\b{\Xh_{K+1}}} \geq \tfrac{K}{K+1}\E\sqb{\psi(\Xh_K)} + \tfrac{1}{K+1}\E\sqb{\psi(X_{k+1})} = \tfrac{K}{K+1} \Lh_{K} + \tfrac{1}{K+1} \Lh_1
%\end{align}
%
\section{Kinetic Diagnostic Results}
\label{sec:app:kinetic_temp}
The values of the kinetic temperature during inference are plotted in Fig.~\ref{fig:kinetic_temp}.
\begin{figure}%[h!]
  \centering
  \begin{subfigure}{\textwidth}
    \includegraphics{figs/mnist_kinetic_temperatures_two_rows_rs.pdf}
    \caption{MNIST, FCNN.}
    \label{fig:kinetic_temp_mnist}
  \end{subfigure}
  \begin{subfigure}{\textwidth}
    \includegraphics{figs/cifar10_kinetic_temperatures_two_rows_rs.pdf}
    \caption{CIFAR-10, ResNet20.}
    \label{fig:kinetic_temp_cifar}
  \end{subfigure}
  \caption{The evolution of the kinetic temperature diagnostic~\citep{leimkuhler2015canonical} during inference. Good agreement between the diagnostic temperature and intended temperature (in legend) suggests accurate inference.}
  \label{fig:kinetic_temp}
\end{figure}

\section{Generalization Outside of Classification}
\label{sec:app:generalization}
We may be interested in generalizing the averaging logits and averaging probabilities ideas beyond classification.
For averaging probabilities, we use,
\begin{align}
  \P\b{y_i| \x_i} &= \int d \x'_i \P\b{y_i| \x_i'} \P\b{\x_i'| \x_i} = \E\sqb{\P\b{y_i| \x_i'}}
  \label{eq:avg_p}
\end{align}
Intuitively, each augmentation forms one component of a (potentially infinite) mixture model over the outputs, $y_i$.
Importantly, this expression makes no assumption about the support of distributions over $y_i$, so $y_i$ could belong to a finite set (classification), be real-valued (regression), or be anything else (a string, a graph, etc.).
Note that directly applying a multi-sample estimator to (the logarithm of)~\myeqref{eq:avg_p} gives us a log-likelihood lower bound as in~\myeqref{eq:bound_prob}.

To generalize averaging logits, consider a situation where a distribution over an arbitrary $y_i$ is parameterized by a vector, $\f_i$ output by a neural network,
\begin{align}
  \P\b{y_i| \x_i} &= \pi\b{y_i; \f_i}.
\end{align}
In the standard case with no augmentation, we would take $\f_i=\f(\x_i; \w)$, (where we take $\f_i$ as the specific vector for input $i$, and $\f(\cdot; \cdot)$ as a function represented by a neural network, that takes an image and weights and returns a vector).
In the case with augmentation, we can average neural network outputs across different augmentations,
\begin{align}
  \f_{i} &= \int d \x'_i \P\b{\x_i'| \x_i} \f(\x_i'; \w) = \E\sqb{\f(\x_i'; \w)}.
  \label{eq:avg_f}%\\
  %\log\P\bc{y_i}{\E\sqb{\mathbf{g}(\x_i'; \w)}}\geqslant \E\sqb{\log\P\bc{y_i}{\frac{1}{K}\tsum_k\mathbf{g}(\x_{i,k}'; \w)}}
\end{align}
Note that in this case we need additional conditions for the multi-sample estimator to form a lower bound.
In particular, we need $\log \pi\b{y_i; \f_i}$ to be concave when treated as a function of $\f_i$ for a fixed $y_i$.% We explore the properties and justifications of both~\myeqref{eq:avg_p} and~\myeqref{eq:avg_f} in Appendix~\ref{sec:app:perspectives}.

\section{Perspectives on Probabilistic Data Augmentation}
\label{sec:app:perspectives}
Here we explore in depth the two general approaches to probabilistic data augmentation~\myeqref{eq:avg_p} and~\myeqref{eq:avg_f}. We discuss their justifications in Sec.~\ref{sec:app:invariance} and~\ref{sec:app:noisy_input}, and compare their properties in Sec.~\ref{sec:app:behaviour}.

\subsection{Invariance construction}
\label{sec:app:invariance}
In the main text, we suggest two ways of incorporating data augmentation: 1) by averaging logits output by the neural network, and 2) by averaging the predicted probabilities. In classification, both of these methods can be understood as justified by attempting to create a prediction that is more invariant to the transformations in the data augmentation.

When averaging logits, we aim to make the neural network mapping $\f: \mathbb{R}^D \to \mathbb{R}^C$ more invariant by averaging the outputs as in~\myeqref{eq:avg_f}.
% \begin{equation}
%     \label{eq:invavgfunc}
%     \finv(\x; \w) = \E\sqb{\f(\x'; \w)} = \int \f(\x'; \w) \P(\x'|\x) \mathrm{d}\x' \,.
% \end{equation}
This construction influences only the regression function, and so has a similar effect to changing the neural network architecture or changing the prior on the functions $\f(\cdot)$ in the Bayesian case \citep{van2018learning}. Since only the outputs are affected, this can be directly applied to any likelihood that depends only on an evaluation of the function, i.e.~any likelihood which can be written as $\P(y_i | \f_i)$.

In the case of averaging the probabilities, we can consider the model to be learning a mapping from image inputs to probability vectors $\p: \mathbb{R}^D \to \mathbb{P}^C$. We can make this mapping more invariant in the same way:
\begin{align}
    \label{eq:invavgprob}
    \pinv(\x_i;\w)&= \E\sqb{\softmax \left(\f(\x'_i; \w)\right)} \nonumber\\
    &= \int \softmax \left(\f(\x'_i; \w)\right) \P(\x'_i|\x_i) \mathrm d\x'_i\,.
\end{align}
The straightforward generalization of this construction would be to replace the softmax with the appropriate likelihood (see general case in~\myeqref{eq:avg_p}). When considering likelihoods other than softmax classification (e.g.~Gaussian likelihoods for regression), stronger differences between these constructions emerge in both behaviour and justification. We investigate further in  Appendix~\ref{sec:app:behaviour}.

\subsection{Noisy-input model}
\label{sec:app:noisy_input}
As stated above, we can generalize averaging the classification probabilities by replacing the softmax with the appropriate likelihood as in~\myeqref{eq:avg_p}.
% \begin{align}
%     p_\text{inv}(y_n|\x_n, \w) = \int p(y_n|\w, \x_n') p_\text{aug}(\x_n'|\x_n) \mathrm{d}\x_n' \,.
%     \label{eq:noisy_input}
% \end{align}
This modified likelihood, which incorporates data augmentation, was also discussed by \citet[Appendix~K]{wenzel2020good} and is a (potentially continuous) mixture model on the observation $y_i$, where each augmentation introduces a mixture component. This is known as a \emph{noisy-input} model \citep{girard2003learning,mchutchon2011gaussian,damianou2016variational}, where the input $\x_i$ is corrupted via the augmentation distribution.

\subsection{Model comparison}
\label{sec:app:behaviour}
The forms of the invariance construction~\myeqref{eq:avg_f} and the noisy-input model~\myeqref{eq:avg_p} imply a difference of purpose. In using the invariance construction, we seek a regression function with the specified symmetry, which is consistent with the data according to the likelihood function $\P\lrbracket{(}{)}{y_i | \f_i}$. Conversely, with the noisy-input model~\myeqref{eq:avg_p} we aim to find a function which gives rise to an invariant likelihood, consistent with the observed outputs for inputs randomly perturbed by $\P(\x'|\x)$. The role of $\x'$ is different in each case. In the noisy-input model, $\x'$ is a latent variable on which we could, in principle, do inference (with, e.g., an amortized variational approach). In the invariance construction, by contrast, we integrate over $\x'$ to parameterize $\f(\x;\w)$.

We now compare the behaviours of the invariance and noisy-input constructions. We will see that they result in quite different posteriors.

In the main text, we compared the empirical performance of averaging probabilities and averaging logits for BNN classification (see Figs.~\ref{fig:class} and~\ref{fig:cold_posterior}). However, as the invariance perspective justifies both averaging logits and probabilities, this comparison does not clearly distinguish between the noisy-input and invariance viewpoints. Further, we are interested not only in predictive performance but also in understanding how each construction behaves. With this in mind, we investigate the models with an illustrative example, where we can both integrate over the orbit and do inference in closed form.

\begin{figure*}[t]
    \begin{subfigure}{0.38\textwidth}
        \adjincludegraphics[trim={0 0 {.6\width} 0},clip,width=\textwidth]{figs/app_general_lhoods/posterior_densities.pdf}
        \caption{Invariant GP}
    \end{subfigure}%
    \begin{subfigure}{0.38\textwidth}
        \adjincludegraphics[trim={{.4\width} 0 {.2\width} 0},clip,width=\textwidth]{figs/app_general_lhoods/posterior_densities.pdf}
      \caption{Noisy-input GP}
        \label{subfig:noisy_input_posterior}
    \end{subfigure}%
    \begin{subfigure}{0.19\textwidth}
      \adjincludegraphics[trim={{.8\width} 0 0 0},clip,width=\textwidth]{figs/app_general_lhoods/posterior_densities.pdf}
      \caption{$p(y|x=4,\mathcal{D})$}
      \label{subfig:noisy_input_vs_invariance_marginals}
    \end{subfigure}
    \caption{Posterior densities for the model constructions for a single observation at $x_1=-4,y_1=2.5$.}
    \label{fig:noisy_input_vs_invariance_densities}
\end{figure*}

We consider Gaussian process (GP) regression with a one-dimensional input and data augmentation which enforces symmetry about $x=0$, i.e.\ $\P(x'|x)=\frac{1}{2}\left(\delta(x'-x)+\delta(x'+x)\right)$. Following \citet{van2018learning}, the invariance view can be expressed in the kernel of the GP:
\begin{align}
    g&\sim\mathcal{GP}(\mathbf{0},k_{\text{base}})\\
   f(x)&=g(x)+g(-x)\\
    \implies f&\sim\mathcal{GP}(\mathbf{0},k_\text{inv}),\\
       \text{where} \hspace{15pt} k_\text{inv}(x_i,x_j) &= \sum_{c_i\in\{-1,1\}} \sum_{c_j\in\{-1,1\}}k_{\text{base}}(c_i x_i,c_j x_j).
\intertext{
We then follow standard GP inference to find the posterior over invariant functions. Note that unlike \citet{van2018learning}, we are not concerned with learning invariances here.\endgraf
The noisy-input model for this case is}
    \P(\x,\y,\f)&= \P(\f)\prod_{i=1}^N \int \P(y_i|f(x_i'))\P(x_i'|x_i)dx_i'\label{eq:noisy_input_joint}\\
    \P(y_i|f(x_i'))&= \mathcal{N}\left(y_i;f(x_i'),\sigma^2\right)\\
    f&\sim \mathcal{GP}(0, k).
\intertext{
Given a single observation $(x_1,y_1)$, the noisy-input posterior is} 
    \P(f|x_1,y_1)&= \frac{1}{Z}\P(x_1,y_1,f)\\
    &=\frac{1}{2Z}\P(f)\sqb{\P(y_1|f(x_1))+\P(y_1|f(-x_1))}\\
    &= \frac{1}{2}\sqb{\P(f|x_1,y_1)+\P(f|-x_1,y_1)},
\end{align}
a mixture of GP posteriors, with two components (one for each point in the orbit).

How do these posteriors compare? For an observation at $(x_1=-4,y_1=2.5)$ we plot the posterior predictive densities in Fig.~\ref{fig:noisy_input_vs_invariance_densities}. Both posteriors are symmetric around $x=0$, as we expect; however, the noisy-input model is bimodal in the regions surrounding $x=4$ and $x=-4$, whereas the invariance posterior has unimodal density concentrated around the observed $y$ value of 2.5. The difference is clear in Fig.~\ref{subfig:noisy_input_vs_invariance_marginals}, which shows the marginal predictive densities at $x=4$.

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{figs/app_general_lhoods/posterior_samples.png}
    \caption{Samples from the model posteriors. The red dot marks the location of the observation $(x_1=-4,y_1=2.5)$. The noisy input posterior comprises two components: one conditioned on $(x_1,y_1)$ (dashed lines), the other on $(-x_1,y_1)$ (solid lines).}
    \label{fig:noisy_input_vs_invariance_samples}
\end{figure*}

In the noisy-input case, our observation is $(x_1,y_1)$, but $x$ is uncertain, so the observation could have been generated by $(-x_1,y_1)$ with equal probability. This results in a mixture posterior with two components: one component has ``seen'' $(x_1,y_1)$, while the other ``saw'' $(-x_1,y_1)$. The first component's prediction at $-x_1$ remains uninformed by its ``observation'' and the same is true for the second component's prediction at $x_1$. Thus, the predictions made by these components at these locations revert to the zero-mean prior. 

From the invariance perspective, we condition on the point $(x_1,y_1)$ but the double-sum kernel forces the function to be the same at $(-x_1,y_1)$. As the posterior is a single GP, it has unimodal marginals with high density around both points.

We can gain further intuition by looking at samples from both posteriors (Fig.~\ref{fig:noisy_input_vs_invariance_samples}). We can see that \emph{every} sample from the invariance posterior is symmetric about $x=0$, whereas the functions drawn from the noisy-input posterior are not symmetric in general.

The samples illustrate the key difference between the models. For the noisy input model, we can see the two components  of the mixture posterior arise from conditioning on different locations in the orbit of $x_1$ as described above. The component going through $(x=4,y_1)$ (samples drawn with dashed lines) is close to the prior at $(x=-4,y_1)$, the other (solid lines) goes through $(x=-4,y_1)$ and is close to the prior at $(x=4,y_1)$. However, under the invariance model, inference on the observation concentrates all model density around $y_1$ for both points in the orbit of $x_1$.

We now consider how this comparison changes as we observe more data. The noisy-input model~\myeqref{eq:noisy_input_joint} requires integration over $\P(x'|x)$ to compute the likelihood of each datapoint, all of which are multiplied together to calculate their combined likelihood. Thus, the number of posterior components grows exponentially with the number of observations: $A^N$ (for orbit size $A$). Suppose all observations are at the same location $(x_1,y_1)$. In this case, the posterior density due to prior reversion at $\{x_1,-x_1\}$ decreases exponentially with $N$. This is because the fraction of mixture components conditioned on all input observations being at the same point in the orbit of $x_1$, i.e.\ all at $x_1$ or $-x_1$, is given by $A^{1-N}$. The predictive posteriors for ten observations, each at $(x=-4,y=2.5)$, are shown in Fig.~\ref{fig:noisy_input_vs_invariance_densities_10data}. Contrasting this noisy-input posterior (Fig.~\ref{subfig:noisy_input_posterior_10data}) with that for one observation (Fig.~\ref{subfig:noisy_input_posterior}), we can see the reduction in density around the prior mean for points around the orbit of $x_1$. In summary, the noisy-input and invariance posteriors become more alike as we observe more data in the same orbit.
\begin{figure*}[t]
    \centering
    \begin{subfigure}{0.38\textwidth}
        \adjincludegraphics[trim={0 0 {.6\width} 0},clip,width=\textwidth]{figs/app_general_lhoods/posterior_densities_10data.pdf}
        \subcaption{Invariant GP}
    \end{subfigure}%
    \begin{subfigure}{0.38\textwidth}
        \adjincludegraphics[trim={{.4\width} 0 {.2\width} 0},clip,width=\textwidth]{figs/app_general_lhoods/posterior_densities_10data.pdf}
       \subcaption{Noisy-input GP}
       \label{subfig:noisy_input_posterior_10data}
    \end{subfigure}%
    \begin{subfigure}{0.19\textwidth}
      \adjincludegraphics[trim={{.8\width} 0 0 0},clip,width=\textwidth]{figs/app_general_lhoods/posterior_densities_10data.pdf}
      \subcaption{$p(y|x=4,\mathcal{D})$}
      \label{subfig:noisy_input_vs_invariance_marginals_10data}
    \end{subfigure}%
    \caption{Posterior densities for the model constructions for ten observations at $\{(x_i=-4,y_i=2.5)\}_{i=1}^{10}$.}
    \label{fig:noisy_input_vs_invariance_densities_10data}
\end{figure*}





% TODO:
% \begin{itemize}
%     \item Example of different behaviours for exact orbit in GPs. Avg function gives strong generalisation to orbit, avg likelihood gives invariant predictive probability but with uncertainty along orbit.
%     \item How do marginal likelihoods compare for exact orbit GPs?
%     \item What does noisy input model posterior look like as training data grows?
%     \item Single-sample bound is equivalen to VI with posterior over input equal to the prior.
% \end{itemize}
\end{document}
