
% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.


%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
    
%\usepackage[round]{natbib}
%\renewcommand{\bibname}{References}
%\renewcommand{\bibsection}{\subsubsection*{\bibname}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)


%\usepackage{cleverref}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{xr-hyper}
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{subcaption}     % subfigs
\usepackage{enumitem}
\usepackage{stfloats}      % positioning floats with two cols
\usepackage{placeins}      % for \FloatBarrier
\usepackage{mathtools,amsmath,amssymb}
\usepackage{newfile}       % to write counters to


\usepackage{adjustbox}  % for figs in appendix E

% Bold upright shorthands for use in math mode. In the body text, \w denotes
% the neural network weights, \X all inputs, \y all outputs, and \x a single
% input (with \x' its augmentation).
\newcommand{\x}{\mathbf{x}}
%\newcommand{\W}{\mathbf{W}}
\newcommand{\w}{\mathbf{w}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\p}{\mathbf{p}}
\newcommand{\f}{\mathbf{f}}

% \lrbracket{<left>}{<right>}{<body>}: typesets <body> between auto-sized
% \left<left> ... \right<right> delimiters.
\newcommand{\lrbracket}[3]{\left#1 #3 \right#2}
% \lmrbracket{<left>}{<mid>}{<right>}{<a>}{<b>}: as \lrbracket, but with an
% auto-sized \middle<mid> delimiter separating <a> from <b>.
\newcommand{\lmrbracket}[5]{\left#1 #4 \middle#2 #5 \right#3}

% Partially applied forms of the helpers above: the delimiters are fixed here
% and the remaining body argument(s) are picked up at the use site.
%   \b{a}       -> ( a )       \bc{a}{b}   -> ( a | b )
%   \sqb{a}     -> [ a ]       \sqbc{a}{b} -> [ a | b ]
% \b already has a definition before this point (hence \renewcommand); that
% earlier meaning is no longer available after this line.
\renewcommand{\b}{\lrbracket{(}{)}}
\newcommand{\bc}{\lmrbracket{(}{\vert}{)}}
\newcommand{\sqb}{\lrbracket{[}{]}}
\newcommand{\sqbc}{\lmrbracket{[}{\vert}{]}}

% Log-likelihood symbols. \L already has a definition before this point (hence
% \renewcommand); from here on it typesets a calligraphic L.
\renewcommand{\L}{\mathcal{L}}
% Hatted variants of L and X.
\newcommand{\Lh}{\hat{\mathcal{L}}}
\newcommand{\Xh}{\hat{X}}

% Calligraphic L with a text subscript naming the objective.
% Note: \Llogprob typesets the subscript ``loss'', not ``logprob''.
\newcommand{\Lnoaug}{\mathcal{L}_\text{noaug}}
\newcommand{\Ladd}{\mathcal{L}_\text{add}}
\newcommand{\Llogprob}{\mathcal{L}_\text{loss}}
\newcommand{\Llogits}{\mathcal{L}_{\text{logits}}}
\newcommand{\Lprob}{\mathcal{L}_{\text{prob}}}
\newcommand{\Linv}{\mathcal{L}_\mathrm{inv}}

% Hatted counterparts. \Lhlogits and \Lhprob carry a ``,K'' in the subscript,
% whereas \Lhlogprob (subscript ``loss'') does not.
\newcommand{\Lhlogprob}{\hat{\mathcal{L}}_\text{loss}}
\newcommand{\Lhlogits}{\hat{\mathcal{L}}_{\text{logits},K}}
\newcommand{\Lhprob}{\hat{\mathcal{L}}_{\text{prob},K}}

% Un-hatted symbols with the ``,K'' subscript.
\newcommand{\LlogitsK}{\mathcal{L}_{\text{logits},K}}
\newcommand{\LprobK}{\mathcal{L}_{\text{prob},K}}

% Hatted symbols with ``,K+1'' in the subscript.
\newcommand{\LhlogitsKpo}{\hat{\mathcal{L}}_{\text{logits},K+1}}
\newcommand{\LhprobKpo}{\hat{\mathcal{L}}_{\text{prob},K+1}}

% Hatted symbols with ``;1'' in the subscript.
% NOTE(review): these use ``;'' as the separator while the K and K+1 forms
% above use ``,'' -- confirm that the difference is intentional.
\newcommand{\LhlogitsKone}{\hat{\mathcal{L}}_{\text{logits};1}}
\newcommand{\LhprobKone}{\hat{\mathcal{L}}_{\text{prob};1}}

% K with a ``train'' / ``test'' text subscript.
\newcommand{\Ktrain}{K_\text{train}}
\newcommand{\Ktest}{K_\text{test}}


% Clear the current meaning of \P so that the \DeclareMathOperator on the next
% line can claim the name.
\let\P\relax
% Upright operator-style P and Q, and a blackboard-bold E for expectations.
\DeclareMathOperator{\P}{P}
\DeclareMathOperator{\Q}{Q}
\DeclareMathOperator{\E}{\mathbb{E}}
% \P with a text subscript.
\newcommand{\Plogits}{\P_\text{logits}}
\newcommand{\Pprob}{\P_\text{prob}}
\newcommand{\Pnoaug}{\P_\text{noaug}}

% Starred declarations: sub/superscripts are set as limits in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator{\const}{const}
\DeclareMathOperator{\softmax}{softmax}

% Expectation symbols with a fixed conditioning subscript:
%   \Exdash   : x' | x
%   \Exdashi  : x'_i | x_i
%   \Exdashik : {x'_{i;k}} | x_i
\newcommand{\Exdash}{\E_{\x'|\x}}
\newcommand{\Exdashi}{\E_{\x'_i|\x_i}}
\newcommand{\Exdashik}{\E_{\{\x'_{i;k}\}|\x_i}}

% \f and \p with an ``inv'' text subscript.
\newcommand{\finv}{\f_\text{inv}}
\newcommand{\pinv}{\p_\text{inv}}
% Superscript T, written after the symbol it applies to (e.g. \w\transpose).
% NOTE(review): \tiny is a text-mode font-size command; standard LaTeX warns
% when it is used in math mode -- consider ^{\top} or ^{\mathsf{T}} instead.
\newcommand{\transpose}{^{\tiny T}}


% \blfootnote{<text>}: footnote without a visible mark. It globally sets the
% footnote mark (\@thefnmark) to empty and then passes <text> on to
% \@footnotetext. \makeatletter/\makeatother are needed because the internal
% command names contain ``@''.
\makeatletter
\def\blfootnote{\xdef\@thefnmark{}\@footnotetext}
\makeatother

% \tsum: a summation sign forced into text style; the extra braces keep the
% \textstyle switch local to the symbol.
\newcommand{\tsum}{{\textstyle \sum}}

% \myeqref{<label>}: parenthesised equation reference of the form ``(Eq.~N)'',
% with a non-breaking space before the number.
\newcommand{\myeqref}[1]{(Eq.~\ref{#1})}

% \addFileDependency{<file>}: registers <file> as an input of this document.
% It writes ``(<file>)'' to the terminal/log with \typeout, appends <file> to
% LaTeX's file list via \@addtofilelist, and, if the file cannot be found,
% additionally writes ``No file <file>.''.
% Presumably this is so that build tools which scan the log (e.g. latexmk)
% notice the dependency -- TODO confirm.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<basename>}: calls \externaldocument{<basename>} so that
% labels from that document can be referenced here, and registers both
% <basename>.tex and <basename>.aux as file dependencies of this document.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

% Make the labels of the supplementary material available to \ref.
\myexternaldocument{nabarro_119-supp}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.

\title{Data augmentation in Bayesian neural networks and the cold posterior effect}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<sdn09@ic.ac.uk>?Subject=DA in BNNs (UAI 2022)}{Seth Nabarro$^*$}{}}
\author[2]{Stoil Ganev$^*$}
\author[3]{Adrià Garriga-Alonso}
\author[3,4]{\\Vincent Fortuin}
\author[1]{Mark van der Wilk$^\dagger$}
\author[2]{Laurence Aitchison$^\dagger$}

% Add affiliations after the authors
\affil[1]{%
  Department of Computing\\Imperial College London %London, SW7 2BX, UK
}
\affil[2]{%
  Department of Computer Science\\University of Bristol %\\
  %Bristol, BS8 1UB, UK
}
\affil[3]{%
  Department of Engineering\\University of Cambridge %\\
  %Cambridge, CB2 1PZ, UK
  }
\affil[4]{%
  Department of Computer Science\\ETH Zürich %,\\ 
  %Zürich,  Switzerland
  }

\begin{document}

\maketitle

\begin{abstract}
Bayesian neural networks that incorporate data augmentation implicitly use a ``randomly perturbed log-likelihood [which] does not have a clean interpretation as a valid likelihood function'' (Izmailov et al. 2021).
Here, we provide several approaches to developing principled Bayesian neural networks incorporating data augmentation. 
We introduce a ``finite orbit'' setting which allows valid likelihoods to be computed exactly, and for the more usual ``full orbit'' setting we derive multi-sample bounds tighter than those used previously.
These models cast light on the origin of the cold posterior effect.
In particular, we find that the cold posterior effect persists even in these principled models incorporating data augmentation.
This suggests that the cold posterior effect cannot be dismissed as an artifact of data augmentation using incorrect likelihoods.
%Data augmentation has recently been suggested as the origin of the cold posterior effect in Bayesian neural networks (Izmailov et al. 2021).
%However, this argument is problematic as long as we do not have a principled approach to integrating data augmentation within Bayesian neural networks.
%We give several such approaches, 
%The cold posterior effect persists even in these principled models, suggesting that the cold posterior effect cannot be dismissed as an artifact of data curation.
%Indeed, 
%Data augmentation is a highly effective approach for improving performance in deep neural networks. The standard view is that it creates an enlarged dataset by adding synthetic data, which raises a problem when combining it with Bayesian inference: how much data are we really conditioning on? This question is particularly relevant to recent observations linking data augmentation to the cold posterior effect. We investigate various principled ways of finding a log-likelihood for augmented datasets. Our approach prescribes augmenting the same underlying image multiple times, both at test and train-time, and averaging either the logits or the predictive probabilities. Empirically, we observe the best performance with averaging probabilities. While there are interactions with the cold posterior effect, neither averaging logits or averaging probabilities eliminates it.
\end{abstract}

\blfootnote{$*$ equal contribution}\blfootnote{$\dagger$ equal contribution}

\section{Introduction}
\label{sec:intro}
%It is well known that Bayesian inference in combination with Bayes decision theory should give optimal performance in a well-specified model .
The cold posterior effect \citep[CPE;][]{wenzel2020good} is the surprising  observation that performance in neural networks is not optimal when we use the usual Bayesian posterior \citep{kolmogorov1950foundations,savage1954foundations,jaynes2003probability},
\begin{align}
  \P\bc{\w}{\y, \X} \propto \P\b{\w} \P\bc{\y}{\w, \X},
\end{align}
where $\w$ are the neural network weights, $\X$ is all inputs (typically images), and $\y$ is all outputs (typically class labels).
Instead, we get better performance when using a ``cold'' posterior, i.e.\ the posterior taken to the power of $1/T$ where $T<1$,
\begin{align}
  \Q\b{\w} &\propto \b{\P\b{\w} \P\bc{\y}{\w, \X}}^{1/T}.
\end{align}
The origin of the CPE is by now highly contentious, with three leading potential explanations \citep{noci2021disentangling}.
The first hypothesis is that the process of data curation for popular datasets such as CIFAR-10 and ImageNet \citep{krizhevsky2009learning,deng2009imagenet} involves multiple annotators agreeing upon the label for each image.
In that case, there are in effect multiple labels for each image, which inflates the likelihood (but not the prior) term causing a ``cooler'' posterior \citep{adlam2020cold,aitchison2020cold}.
Second, the prior is always misspecified, and prior misspecification is known to induce cold posterior-like effects in specific (non-neural network) models \citep{grunwald2012safe,grunwald2017inconsistency,adlam2020cold}, which might give an explanation for the CPE in neural networks \citep{wenzel2020good,fortuin2021bayesian}. % This is supported by \citet{noci2021disentangling}, who show that the CPE can be more pronounced when less data is available, and therefore the prior is more influential.
%while a variety of improved priors have been tried and they do improve performance, these improvements are typically small in comparison to the magnitude of the cold-posterior effect for large-scale network of interest. 
However, \citet{fortuin2021bayesian} showed that better priors do not always reduce the size of the CPE, but can actually increase it.  
In particular, they found that incorporating spatial correlations in convolutional filters improved the performance of a ResNet trained on CIFAR-10, but also increased the magnitude of the CPE. %, which shows that better priors do not necessarily reduce the CPE. % see e.g.\ \citet{ober2020global} and \citet{fort2021drawing} (specifically, the ResNet for CIFAR-10 in that paper).
%As such, \citet{aitchison2020cold} argues that there is as of yet no evidence that a prior could be found that closes the cold-posterior effect.
The third possible explanation is that the CPE is an artifact of data augmentation \citep[DA; ][]{wenzel2020good,izmailov2021bayesian}, as DA gives a ``randomly perturbed log-likelihood [which] does not have a clean interpretation as a valid likelihood function'' \citep{izmailov2021bayesian}. This is supported by observations in which the CPE only exists with DA, and disappears without DA \citep{wenzel2020good,fortuin2021bayesian,izmailov2021bayesian}.
Of course it is quite possible that practical CPEs arise from a complex combination of these causes \citep{aitchison2020cold,noci2021disentangling}.

In spite of this controversy, recent work on the CPE agrees that it is important to investigate integrating DA with Bayesian neural networks (BNNs), and to examine the interaction with the CPE.
From \citet{noci2021disentangling}: ``It remains an interesting open problem how to properly account for data augmentation in a Bayesian sense.''
And from \citet{izmailov2021bayesian}: ``Data augmentation cannot be naively incorporated in the Bayesian neural network model.'' and ``We leave incorporating data augmentation ... as an exciting direction of future work.''

Perhaps the most common understanding of the interaction between the CPE and DA in BNNs is that DA increases the effective dataset size.
From \citet{izmailov2021bayesian}: ``intuitively, data augmentation increases the amount of data observed by the model, and should lead to higher posterior contraction''.  
From \citet{osawa2019practical}: ``DA increases the effective sample size''.
%In footnote 3 in \citet{wenzel2020good}, they note ``For \citep{heek2019bayesian} we communicated with the authors, and tempering arises due to overcounting data by a factor of 5, approximately justified by data augmentation, corresponding to T = 1/5.''
From \citet{noci2021disentangling}: ``while data augmentation may increase the amount of data seen by the model, that increase is certainly not equal to the number of times each data point is augmented (after all, augmented data is not independent from the original data).''

In this work, we seek to understand whether the commonly used, but invalid DA likelihood can cause the CPE. Our contributions are as follows.
\begin{enumerate}
\item We give a formal argument that the notion that DA increases the effective dataset size is flawed (Sec.~\ref{sec:previous_da}).
%\item We use the principle of functional invariance \citep{kondor2008group} to derive a set of multi-sample lower bounds on the log-likelihood of a DA-invariant BNN.
\item We motivate the need for multi-sample bounds, by showing that previous single-sample bounds on the likelihood are equivalent to averaging log-likelihoods, which is known to be problematic (Eq.~\ref{eq:Ktrain=1}).
\item We derive a set of multi-sample lower bounds on the log-likelihood of a BNN incorporating DA (Sec.~\ref{sec:tighter}).
% Second, we consider issues with the current training objectives for Bayesian generative models which incorporate DA \citep{van2018learning,wenzel2020good}. %, and which have been investigated empirically in the non-Bayesian setting \citep[e.g.][]{krizhevsky2012imagenet,lyle2020benefits,fort2021drawing}.
% In particular, \citet{wenzel2020good} proposed a single-sample training objective that \citet{izmailov2021bayesian} observed to be problematic, and \Citet{van2018learning} proposed a training objective that only works for quadratic log-likelihoods (i.e.\ Gaussians or P\'{o}lya-Gamma approximations).
%the invariance construction of \Citet{van2018learning} for BNNs. This prescribes a generative model for the labels in which we take a single image, augment it multiple times, apply the neural network to each augmentation, then form an overall prediction by averaging the outputs.
%Similar algorithms have been investigated empirically in a non-Bayesian setting \citep[e.g.][]{krizhevsky2012imagenet,lyle2020benefits,fort2021drawing}.
%We show that multi-sample estimators which average network logits or probabilities form lower bounds on the intractable log-likelihoods of principled Bayesian models.
These bounds are tighter than existing single-sample estimators for BNNs \citep[e.g.][]{wenzel2020good} and can be applied to a broad class of likelihood functions \citep[unlike][]{van2018learning}.
\item We introduce a ``finite orbit''\footnote{We employ the term ``orbit'' from group theory and function invariance \citep{kondor2008group}, even though our augmentations do not always form groups. In this work, it refers to the support of the augmentation distribution $\P(\x' | \x)$.} setting with a small number of admissible augmentations which allows us to compute \textit{exact} log-likelihoods (Sec.~\ref{sec:finite_orbit}).
%Fourth, we give the natural generalizations beyond classification to arbitrary outputs (Appendix~\ref{sec:app:generalization}). %, and compare their justifications and behaviours (Appendix~\ref{sec:app:perspectives}).
\item We empirically evaluate the performance of the multi-sample bounds in both SGD training and BNN inference for image classification tasks (Sec.~\ref{sec:results}). In the latter case we explore the impact of both the bounds and the exact finite orbit likelihood on the CPE.
\item We find that the CPE persists even when using these principled DA likelihood bounds. This falsifies the hypothesis that the CPE is an artifact of loose bounds on the log-likelihood given by previous single-sample estimators. 
%\item We consider the consequences for explanations of the CPE, concluding that the CPE is not an artifact which only results from DA giving an invalid likelihood function \citep{izmailov2021bayesian}. 
% In the finite orbit setting, we can compute the exact, valid likelihood, while in the usual setting there is in principle a deterministic likelihood with a clean interpretation as a valid likelihood function, but as this is difficult to evaluate in practice we use a multi-sample lower bound.
\end{enumerate}
We finish with some discussion summarising our findings and their implications for the CPE (Sec.~\ref{sec:conclusions}). Our conclusion in this work is that the CPE is not an artifact resulting from DA giving a ``randomly perturbed log-likelihood'' \citep{izmailov2021bayesian}.

Note that in the remainder of the paper, we will follow \citet{izmailov2021bayesian} in regarding models with loose, single-sample bounds as ``unprincipled'' (from \citet{izmailov2021bayesian}, the ``randomly perturbed log-likelihood does not have a clean interpretation as a valid likelihood function'').
In contrast, we term models using our exact log-likelihoods or our multi-sample bounds as being ``principled''.

\section{Background}
\label{sec:background}
\subsection{Data augmentation}
\label{sec:background:da}

In supervised learning, we are interested in learning some unknown functional relationship from example input-output pairs $(\x_i, y_i), i=1,\ldots,N$. 
%We can usually expect better performance when more examples are given \citep{loog2019erm}.
Often, we have information about some form of invariance, i.e.\ the knowledge that the function does not change its output for certain transformations of the input.
These might occasionally be true invariances, such as the identity of a molecule being invariant to rotations.
But in most settings, these are so-called ``soft'' invariances or ``insensitivities'' \citep{van2018learning}. 
For instance, the class label for an image should not change due to small translations/crops of that image (but might change if we radically alter the image).
The most basic form of DA takes advantage of this information by transforming, or augmenting, the inputs and copying the output value, to create additional input-output pairs which are then included in training.
Often, the amount of additional ``augmented data'' can be unbounded, for example when allowable transformations are specified in a continuous range, e.g.\ rotations.
This simple procedure has been very successful in improving performance in a wide variety of machine learning methods \citep{loosli2007training,krizhevsky2012imagenet,bishop2006pattern}, and recent work has analysed the effect of data augmentation on invariances in the learned functions \citep{dao2019kernelda,chen2020groupda,lyle2020benefits}.

\subsection{Bayesian Inference}
\label{sec:background:bayes_inf}

Bayesian inference allows us to infer a distribution over neural network weights, which incorporates uncertainty induced by having finite data.
Bayes prescribes a strict procedure for updating beliefs about unknown quantities in light of observed data.
The model is specified by a prior on the weights $\P(\w)$ and a log-likelihood, $\sum_{i=1}^N \L^i(y_i;\w)$. 
Thus, the log-posterior is given by
%
\begin{align}
\log\P\bc{\w}{\X, \y} &= \log \P\b{\w} + \sum_{i=1}^N \L^i(y_i;\w) + \const.
\end{align}
%
We define the no augmentation log-likelihood as
\begin{align} 
  \Lnoaug^i(y_i;\w)&=\log \P\bc{y_i}{g(\x_i; \w)}, %\nonumber \\&= \log \softmax_{y_i} \f(\x_i; \w) ,  \label{eq:def:Lnoaug}
\end{align}
where $g(\cdot;\w)$ is the neural network.
%where $\f(\x_i; \w)$ is the neural network outputs, which are treated as the logits for a categorical distribution.

\section{Methods}
\label{sec:meth}

\subsection{Does DA increase dataset size?}
\label{sec:previous_da}
Many authors have claimed that DA increases the effective dataset size \citep{noci2021disentangling,osawa2019practical,izmailov2021bayesian}. 
Here we argue that this view leads to problems within the framework of probabilistic modelling.
%For example, the infinite number of augmentations given by continuous transformations lead to the prior having no influence on the posterior. 
We can see this in the form of the resulting log-likelihood. 
For $K$ augmented inputs, $\x'_{i; k}$, we can write the log-likelihood for a single underlying image as,
\begin{align}
  \Ladd^i(y_i;\w)=\sum_{k=1}^K\log \P\bc{y_i}{g(\x'_{i;k}; \w)}. %\softmax_{y_i} \f(\x'_{i;a}; \w).
\end{align}
%Here $A$ is the total number of augmentations, and $a$ indexes different augmentations of the same underlying image.
%The first issue with this approach is that the ``correct'' number of augmentations can be very difficult to define.
For continuous transformations such as rotations, there are an infinite number of possible augmentations, $K=\infty$, so $\Ladd$ would result in the prior being ignored during inference.
While this result seems strange, if the outputs for all augmentations $\x'_{i;k}$ were independently labelled (or if all the labels were correct) we would indeed have an infinitely large (conditionally) IID dataset and ignoring the prior would be the right answer.
However, in practice, the unaugmented input $\x_i$ is labelled by an annotator who sometimes makes mistakes \citep{peterson2019human}, and the resulting label $y_i$ is assumed to apply to all augmentations $\x'_{i;k}$.
%However, when doing inference we are free to choose the prior and likelihood, but not the way they are combined to find the posterior.
%Changing the dataset size through augmentation is inconsistent with the Bayesian principle of requiring one likelihood for each ``true'' datapoint.
%%While we might try to interpret $\Ladd^i(y_i;\w)$, but this does not make sense, because the log-likelihood has to form a valid distribution over output labels, and this ``distribution'' does not in general normalise,
%%In this context, $\Ladd^i(y_i;\w)$ is not a valid likelihood for the underlying ``true'' datapoint, because it does not in general form a distribution that normalises,
%Suppose we were to go ahead anyway, treat each augmented image as separate datapoints and thus use $\Ladd$,
%
%\begin{align}
%  \Ladd^i(y_i;\w)=\sum_{a=1}^A\log \softmax_{y_i} f(\x'_{i;k}; \w)
%\end{align}
%
%By doing this, we are assuming that all $(\x'_{i,a},y_i)$ and $(\x_i,y_i)$ are iid, when in fact we have generated $\x'_{i,a}$ directly from $\x_{i}$. 
%This is likely to result in a posterior which is overconfident, as it assumes $A$ times more data points than we really have.
%The infinite augmentation case is particularly problematic, as it results in ignoring the prior, which is clearly unjustified given all our examples are generated from a finite set of underlying observations.
%
% This issue hints at a deeper problem: in treating each augmented image as a separate datapoint, we have implicitly assumed that the labels for each augmented image are independent.
% This could be achieved if we took each augmentation of the same underlying image and presented them to a different human annotator.
% However, that is not what happens in practice.
% Usually it is only the underlying unaugmented image, $\x_i$, that is labelled by a human, and that single label is assumed to apply to all augmented images, $\x'_{i; a}$.
As such, the labels for different augmentations of the same input are not independent, and an approach (such as this one) which assumes they are cannot be valid.

%\subsection{Averaging log-likelihoods}
A method which avoids having to specify the augmented dataset size is to average the log-likelihood over the augmentation distribution $\P(\x_i'|\x_i)$,
\begin{align}
  \Llogprob^i\b{y_i; \w} &=  \E\sqb{\log \P\bc{y_i}{g(\x'_i; \w)}} . \label{eq:avg_losses_bound}
\end{align}
Indeed, most implementations which use DA when training BNNs target this log-likelihood, at least implicitly. They do so by taking a pre-existing inference algorithm and replacing the original input, $\x_i$, with a random augmentation, $\x_i'$.
This approach is convenient, as a single sample from the augmentation distribution can provide an unbiased estimate $\Lhlogprob=\log \P\bc{y_i}{g(\x'_i; \w)}$. %, so it is used explicitly in some settings \citep[e.g.][]{benton2020learning}.
Importantly though, a valid likelihood should arise from a valid distribution over labels, and should therefore normalize if we sum over labels.
For instance, without augmentation,
\begin{align}
  \label{eq:likelihood_normalize}
  1 &= {\textstyle \sum}_{y_i=1}^Y \exp\b{\Lnoaug^i(y_i;\w)}.
\end{align}
However, if we try to interpret $\Llogprob^i\b{y_i; \w}$ as a log-likelihood we find that it does not normalize to $1$,
\begin{align}
  1 \neq \sum_{y_i=1}^Y \exp\b{\Llogprob^i\b{y_i; \w}},
\end{align}
and therefore $\Llogprob^i\b{y_i; \w}$ cannot be the log of a valid probability distribution.
Note that we might try to get a valid likelihood by including a normalizer.
The problem is that this normalizer would need to depend on $\w$, and thus would need to be included in the log-likelihood, and of course no normalizer terms appear in the loss~\myeqref{eq:avg_losses_bound}. While we could renormalize, using $\Llogprob^i - \mathrm{LogSumExp}_{\mathcal{Y}} \b{\Llogprob^i}$, to ensure validity, we will see in the next section that the form of $\Llogprob^i$ constitutes an unnecessarily slack bound on a principled log-likelihood, which we can tighten significantly.

%where $Z$ is a constant normalizer
%Note this sum not equally $1$ might be okay if it were constant across inputs, $\x_i$, as we could then normalize.  But even that 
%(technically, we could renormalize if this sum were constant for all input locations, but even that is not true).
%if we consider $\log \softmax_{y_i} \f(\x'_i; \w)$ to be the log-likelihood, 
%the randomness in the augmented input, $\x'_i$ gives a ``randomly perturbed log-likelihood [which] does not have a clean interpretation as a valid likelihood function'' \citep{izmailov2021bayesian}.
% Perhaps a more fundamental issue is that ultimately, the resulting algorithms target the averaged (negative) loss (Appendix~\ref{sec:app:vi_sgld}),
% \begin{align}
%   \Llogprob^i\b{y_i; \w} &=  \E\sqb{\log \P\bc{y_i}{g(\x'_i; \w)}} . \label{eq:avg_losses_bound}
% \end{align}
% This approach is convenient, as a single sample from the augmentation distribution can provide an unbiased estimate $\Lhlogprob=\log \P\bc{y_i}{g(\x'_i; \w)}$, so it is used explicitly in some settings \citep[e.g.][]{benton2020learning}.
%Further, a valid likelihood should arise from a valid distribution over labels, and should therefore normalize if we sum over labels.
%For instance, without augmentation,
%\begin{align}
%  \label{eq:likelihood_normalize}
%  1 &= {\textstyle \sum}_{y_i=1}^Y \exp \Lnoaug^i(y_i;\w).
%\end{align}
%In practice, a likelihood which normalizes to a constant other than one is sufficient when doing inference with e.g. MCMC. However, the normalization constant for $\Llogprob(\cdot;\w)$ may vary with input location: ${\tsum}_{y_i=1}^Y\exp\Llogprob^i(y_i;\w)=Z(\x_i)$, and thus $\Llogprob^i(y_i;\w)$ is not a valid log-likelihood nor sufficient for approximate inference.

%When doing inference with either, $\Ladd$ or $\Llogprob$,  the true size of the dataset remains unclear: the augmentations in some sense provide additional information, but it is not apparent how much that is worth relative to the raw data.
%This precludes their use for principled Bayesian inference.

%\subsection{Data augmentation}
%\label{sec:data_aug}
%
%In Bayesian inference, we compute the posterior over the neural network weights, $\w$, conditioned on all inputs, $\X$, and outputs, $\y$, by combining the prior, $\P\b{\w}$, and the likelihood, $\P\bc{y_i}{\w, \x_i}$, where $y_i$ is the $i$th class-label and $\x_i$ is the $i$th input image,
%\begin{align}
%  \log\P\bc{\w}{\X, \y} &= \log \P\b{\w} + \sum_{i=1}^N \log \P\bc{y_i}{\w, \x_i} + \const.
%  \intertext{We will also need to consider data-dependent objectives that are not actually valid likelihoods.  We denote these more general objectives as $\L^i\b{y_i; \w}$, and as the resulting distribution over weights is no longer a valid posterior, we denote it as $\Q(\w)$,}
%  \log\Q\b{\w} &= \log \P\b{\w} + \sum_{i=1}^N \L^i\b{y_i; \w} + \const.
%  \intertext{In the simplest case where we do not augment data, we have a valid likelihood,}
%  \label{eq:def:Lnoaug}
%  \Lnoaug^i\b{y_i; \w} &= \log \Pnoaug\bc{y_i}{\x_i, \w} = \log \softmax_{y_i}\b{f(\x_i; \w)}.
%\end{align}
%
%
%
%%%An alternative is to use the average log-probability.
%%To incorporate data augmentation into this framework, the most common approach is to replace the unaugmented image, $\x_i$ with an augmented image, $\x'_i$,
%%\begin{align}
%%\Lhlogprob^i\b{y_i; \w} &= \log \softmax_{y_i}\b{f(\x'_{i}; \w)}.
%%\end{align}
%%This can be viewed as an unbiased estimator of an expected ``log-likelihood'', where the expectation is taken over the distribution over augmentations, conditioned on the underlying image, $\P\bc{\x'_{i}}{\x_i}$, 
%%\begin{align}
%%  %\Llogprob^i\b{y_i; \w} &= \frac{1}{A} \sum_{a=1}^A \log \softmax_{y_i}\b{f(\x'_{i,a}; \w)} = \E\sqb{\log \softmax_{y_i}\b{f(\x'_{i}; \w)}}
%%  \label{eq:def:Llogprob}
%%  \Llogprob^i\b{y_i; \w} &= \E\sqb{\log \softmax_{y_i}\b{f(\x'_{i}; \w)}}
%%\end{align}
%An alternative is to use the average (negative) loss, where the expectation is taken over $\P\bc{\x'_i}{\x_i}$ where $\x_i$ is the underlying image and $\x'_i$ is an augmented image,
%\begin{align}
%  \label{eq:def:Llogprob}
%  \Llogprob^i\b{y_i; \w} &= \E\sqb{\log \softmax_{y_i}\b{f(\x'_{i}; \w)}}
%\end{align}
%%To incorporate data augmentation into this framework, the most common approach is to replace the unaugmented image, $\x_i$ with an augmented image, $\x'_i$,
%This approach is often convenient, because a single augmentation readily provides an unbiased estimate of $\Llogprob^i\b{y_i; \w}$,
%\begin{align}
%\Lhlogprob^i\b{y_i; \w} &= \log \softmax_{y_i}\b{f(\x'_{i}; \w)}.
%\end{align}
%%This can be viewed as an unbiased estimator of an expected ``log-likelihood'', where the expectation is taken over the distribution over augmentations, conditioned on the underlying image, $\P\bc{\x'_{i}}{\x_i}$, 
%%\begin{align}
%%  %\Llogprob^i\b{y_i; \w} &= \frac{1}{A} \sum_{a=1}^A \log \softmax_{y_i}\b{f(\x'_{i,a}; \w)} = \E\sqb{\log \softmax_{y_i}\b{f(\x'_{i}; \w)}}
%%  \label{eq:def:Llogprob}
%%  \Llogprob^i\b{y_i; \w} &= \E\sqb{\log \softmax_{y_i}\b{f(\x'_{i}; \w)}}
%%\end{align}
%It is possible to use this form explicitly \citep{benton2020learning}, but its importance really comes from the fact that using augmented images in pre-existing inference algorithms designed for non-augmented data generally also targets this ``log-likelihood'' (Appendix~\ref{sec:app:vi_sgld}).
%However, $\Llogprob^i$ is not actually a valid log-likelihood, i.e.\ it is not the log of a valid probability distribution over labels, because the implied ``probabilities'' do not, in general, sum to one,
%\begin{align}
%  \sum_{y=1}^Y \exp \Llogprob^i(y; \w) &= \sum_{y=1}^Y \exp \E\sqb{\log \softmax_{y_i}\b{f(\x'_{i}; \w)}} \neq 1.
%  \intertext{In contrast, e.g.\ for no augmentation, the probabilities always sum to one,}
%  \sum_{y=1}^Y \exp \Lnoaug^i(y; \w) &= \sum_{y=1}^Y \softmax_{y}\b{f(\x_{i}; \w)} = 1.
%\end{align}
%This rules out the use of $\Llogprob^i$ in principled statistical approaches.


%\subsection{The cold posterior effect}
%\label{sec:background:cold post}
%The discussion above highlights that the number of datapoints, and hence the uncertainty, is unclear when using data augmentation in combination with Bayesian inference.
%Interestingly, we can often dramatically improve the performance of a Bayesian NN by artificially reducing uncertainty by taking the posterior to the power of $1/T$ where $0<T<1$,
%%A cold posterior is the usual Bayesian posterior taken to the power of $1/T$ where the temperature is small ($0<T<1$, hence ``cold'').
%%If we start with a valid posterior over weights, $\P\bc{\w}{\X, \y}$, the cold posterior is,
%\begin{align}
%  \label{eq:cold post_exact_like}
%  \Q\b{\w} &\propto \P\bc{\w}{\X, \y}^{1/T} \propto \P\b{\w}^{1/T} \prod_{i=1}^N \P\bc{y_i}{\w, \x_i}^{1/T}.
%  %\intertext{or if we are using one of the data augmentation procedures above,}
%  %\log \Q\b{\w} &= \frac{1}{T} \b{\log \P\b{\w} + \sum_{i=1}^N \L^i(y_i; \w)}.
%\end{align}
%This is known as the cold posterior effect \citep{wenzel2020good}.
%Note that the cold posterior effect is typically seen when using $\Llogprob$ to incorporate augmentation \citep[Appendix~\ref{sec:app:vi_sgld},][]{wenzel2020good}, so it might be analogous to artificially reducing uncertainty by treating each augmented image as a separate datapoint.


%Cold posteriors are usually found to markedly improve performance \citep{wenzel2020good}.

%
%Thus, almost all pre-existing applications of data augmentation in the Bayesian framework use $\Llogprob^i$ (Eq.~\ref{eq:def:Llogprob}) in place of the log-likelihood.
%However, $\Llogprob^i$ is not actually a valid log-likelihood i.e.\ it is not the log of a valid probability distribution over labels,
%%\begin{align}
%%  \exp \Llogprob^i\b{y_i; \w} &\neq \P\bc{y_i}{\x_i, \w}
%%\end{align}
%because the implied ``probabilities'' do not sum to one,
%\begin{align}
%  \sum_{y=1}^Y \exp \Llogprob^i(y; \w) &= \sum_{y=1}^Y \exp \E\sqb{\log \softmax_{y_i}\b{f(\x'_{i}; \w)}} \neq 1.
%  \intertext{In contrast, e.g.\ for no augmentation, the probabilities \textit{do} sum to one,}
%  \sum_{y=1}^Y \exp \Lnoaug^i(y; \w) &= \sum_{y=1}^Y \softmax_{y}\b{f(\x_{i}; \w)} = 1.
%\end{align}
%This rules out the use of $\Llogprob^i$ in principled statistical approaches.


%To begin, it is worth noting that a common understanding of the interaction of DA and the CPE is the notion that DA in effect provides datapoints ``intuitively, data augmentation increases the amount of data observed by the model, and should lead to higher posterior contraction'' \citep{izmailov2021bayesian}, and ``On the one hand, data augmentation affects the data points that enter the likelihood function. However, while data augmentation may increase the amount of data seen by the model, that increase is certainly not equal to the number of times each data point is augmented (after all, augmented data is not independent from the original data).''
%We could augment the data $A$ times, and treat each augmented datapoint, $\x'_{i,a}$, as a ``true'' datapoint,
%\begin{align}
%  \Ladd^i(\w) &= \sum_{a=1}^A \log \softmax_{y_i}\b{f(\x'_{i,a}; \w)}
%\end{align}
%%This artificially inflates the effective ``number'' of datapoints, thus highlighting a potential connection to the cold posterior effect (see Sec.~\ref{sec:background:cold post}).
%The first issue with this approach is that the ``correct'' number of augmentations is very difficult to define.
%Indeed, in the case of e.g.\ rotations, there are an uncountably infinite number of possible augmentations even within a narrow range, perhaps implying that we should take $A=\infty$, which is clearly pathological as it results in ignoring the prior.
%This issue is really hinting at a deeper problem: we have assumed that the label for each augmentation of the same underlying image is independent (conditioned on the image).
%This could be achieved if we took each augmentation of the same underlying image and presented them to a different human annotator.
%However, that is not what happens in practice.
%In practice, only the underlying unaugmented image is labelled by a human, and the label is assumed to be the same for all augmentations of that image.
%As such, a mistaken label will propagate to all the augmented images, and thus the labels for different augmentations of the same underlying image are not independent and an approach (such as this one) which assumes they are cannot be valid.

%\subsection{Invariant Models}
%\label{sec:tighter}
%
%%To obtain a principled log-likelihood incorporating DA, we cannot take the standard approach of just using augmented data in a pre-existing algorithm.
%%Instead, we need to build DA into the probabilistic generative model for labels.
%We aim to incorporate DA into BNN inference in a manner which is correct from a probabilistic perspective. Instead of viewing DA as inflating the training dataset, we seek to formulate it as a change to the model, and so look to incorporate DA into either the prior or the likelihood function.
%
%Previous work \citep{van2018learning,kondor2008group,ginsbourger2012argumentwise,ginsbourger2013invariances} has shown we can construct an invariant function $h(\cdot;\w)$ by averaging a non-invariant function $g(\cdot;\w)$ over the distribution of interest, in this case augmentation distribution $\P\bc{\x'}{\x}$
%\begin{align}
%    h(\x_i;\w)=\int g(\x_i';\w)\P\bc{\x_i'}{\x_i}d\x_i'.
%    \label{eq:invariance}
%\end{align}
%We thus consider models in which the output label is given by averaging the (non-invariant) neural network over a distribution of all possible augmentations for each input. %, while ensuring that normalization (Eq.~\ref{eq:likelihood_normalize}) is satisfied. 
%This simultaneously ensures that the likelihood normalizes \myeqref{eq:likelihood_normalize}, the averaged output is invariant to the augmentation transformations and circumvents the need to find the ``true'' number of augmentations. However, the integration over $\P\bc{\x'}{\x}$ is generally intractable, raising the question of how we can do inference in practice.
%
%%We will consider the likelihood of $h(\cdot; \w)$, but we could equivalently view the invariance relation~\myeqref{eq:invariance} as a deterministic prior on the parameterisation of $h(\cdot; \w)$ which translates into an adjusted likelihood after marginalisation. 
%For any log-concave likelihood function, we can lower bound the log-likelihood of the invariant function via Jensen's inequality as is common in variational inference (VI) \citep{jordan1999introduction}
%%
%\begin{align}
%    \Linv^i =\log \P\bc{y_i}{h(\x_i;\w)}&\geq \E\sqb{ \log \P\bc{y_i}{ g(\x_i';\w) }} = \Llogprob^i. 
%    \label{eq:inv_ll_loose}
%\end{align}
%%
%We note that our derivation thus provides a possible justification for the averaging losses bound \myeqref{eq:avg_losses_bound}. However, we will show that this bound is unnecessarily loose, and if we seek to do accurate inference with our defined model, we need a bound which is as tight as possible.
%
%We can improve the tightness of \myeqref{eq:inv_ll_loose} by applying a trick familiar to importance-weighted VI \citep{burda2015importance}.
%Our invariant function is unchanged if we average the integrand in~\myeqref{eq:invariance} over $K\geq1$ identically distributed random variables, $\{\x'_{i;k}\}$
%%
%\begin{align}
%    h(\x_i;\w)&=\int \b{\frac{1}{K} \sum_{k}^K g(\x_{i;k}';\w)} \prod_k\P\bc{\x_{i;k}'}{\x_i}d\{\x_{i;k}'\}.
%\end{align}
%%
%We still cannot evaluate $h(\x_i;\w)$, however we again apply Jensen's inequality to get a multi-sample bound
%%
%\begin{align}
%    \L_K^i &= \E_{\{\x_{i;k}'\}}\sqb{ \log \P\bc{y_i}{ \frac{1}{K} \sum_k^K g(\x_{i;k}';\w) }}\leq \Linv^i, \label{eq:inv_ll_bound_K}
%\end{align}
%%
%which provably becomes tighter and lower variance as $K$ is increased~\citep{burda2015importance}, so we have 
%%
%\begin{align}
%    \Linv^i \geq \L_K^i \geq \Llogprob^i.
%\end{align}
%Note that we use ``multi-sample'' to describe bounds such as $\log \frac{1}{K}\tsum_k L(x_k)$ as in \citet{burda2015importance}. They are not to be confused with evaluating a single-sample bound with more Monte Carlo samples, $\frac{1}{K} \tsum_k \log L(x_k)$.
%
%We observe that the result of our derivation of~\myeqref{eq:inv_ll_bound_K} prescribes a straightforward implementation adjustment from the commonly used $\Lhlogprob$. We need only to average the function output over multiple augmentations of the input before likelihood computation. In fact, such a scheme has been used in previous work, though without the awareness that it constitutes a bound on the log-likelihood of an invariant function \citep[see e.g.][]{krizhevsky2012imagenet,he2015delving,szegedy2015going,simonyan2014very,foster2020improving}. Further, most of these works apply averaging at test time only and are not concerned with a principled model of DA in a Bayesian context (see Sec.~\ref{sec:related_work} for further discussion). Note that~\cite{wenzel2020good} also derive $\Llogprob^i$ in their Appendix K, but do so from a noisy-input model perspective, and do not derive the tighter multi-sample estimator. A multi-sample bound for the noisy input model would take on a different form to~\myeqref{eq:inv_ll_bound_K}. We contrast the noisy-input model and the invariance construction in Appendix~\ref{sec:app:perspectives}.
%%Note that \myeqref{eq:invariance} implies the algorithm used in much non-Bayesian work, which averages the neural network output over multiple augmentation of each input \citep[e.g.][]{krizhevsky2012imagenet,he2015delving,szegedy2015going,simonyan2014very,foster2020improving}.
%
%%In doing so, we circumvent the need to find the ``true'' number of examples in our augmented dataset.
%% Importantly, averaging over an augmentation group (such as horizontal flips) creates an invariance, so our averaging methods are deeply related to a view of data-augmentation as introducing function-space invariance, which follows the method developed for GPs by \citet{van2018learning}.
%
%
%%More specifically, the method constructs a function invariant to augmentations $\finv(\cdot;\w)$, by integrating a non-invariant function $f(\cdot;\w)$ over $\P(\x'|\x)$
%%\begin{equation}
%%    \finv(\x;\w)=\int_{\mathcal{X}'} f(\x';\w)\P(\x'|\x)d\x'
%%    \label{eq:invariant_construct}
%%\end{equation}
%%This allows us to do inference on the weights of $\finv(\cdot;\w)$ with a minor amendment to existing inference algorithms: average the values of $f(\cdot;\w)$ over multiple augmentation samples for each input. See Section~\ref{sec:meth:invariance} for details.
%%
%We note that the approach developed by \cite{van2018learning} is arguably the first formulation of truly Bayesian data augmentation, in that case for GP inference. The work also describes a method of learning function-space invariances using the marginal likelihood, however this is beyond the scope of this paper. 

%\subsubsection{Invariant Classification}
\subsection{Tighter lower bounds on the log-likelihood of principled DA models}
\label{sec:tighter}
%We aim to build a classifier $p:\mathcal{X}\rightarrow \mathbb{P}^{|Y|}$ which is invariant to transformations under augmentation distribution $P(\cdot | \x)$.
To incorporate DA into BNN likelihoods, we define the probabilities for each class as being averages over augmentations.
We can choose to either average logits (equal to the neural network outputs, $\f(\cdot; \w)$) or predictive probabilities ($\softmax \f(\cdot; \w)$),
%, and hence at which level we enforce the invariance: at the level of logits or predictive probabilities, that is,
\begin{align}
  \pinv(\x_i;\w)&= \E\sqb{\softmax \f(\x_i'; \w)}, \label{eq:p_inv}\\
  \finv(\x_i;\w)&= \E\sqb{\f(\x_i'; \w)}, \label{eq:f_inv}
\end{align}
where we take expectations over $\P(\x_i'|\x_i)$.
Recall that $\f(\x_i'; \w)$ is the (vector-valued) neural network output for an augmented input, which is used as the logits in classification, so $\finv(\x_i; \w)$ is the network output averaged over all augmentations of the same underlying image.
Likewise, $\pinv(\x_i; \w)$ is the vector of probabilities given by averaging the predicted probabilities over augmentations.
These are denoted ``inv'' for invariant, because averaging over augmentations can give invariances in $\finv(\x_i; \w)$ and $\pinv(\x_i; \w)$ that are not present in the underlying neural network, $\f(\x_i; \w)$. 
The resulting (usually intractable) log-likelihoods are
\begin{align}
  \Lprob^i\b{y_i; \w} &= \log \Pprob \bc{y_i}{\x_i, \w} \nonumber\\
  &= \log \E\sqb{\softmax_{y_i} \f(\x'_{i}; \w)},\label{eq:avgp_exact}\\
  \Llogits^i\b{y_i; \w} &= \log \Plogits\bc{y_i}{\x_i, \w} \nonumber \\
  &= \log \softmax_{y_i} \E\sqb{\f(\x'_{i}; \w)}.
\end{align}
%For instance, if our augmentations are uniform over full 360 degree rotations, then $\finv(\x; \w)$ and $\pinv(\x; \w)$ will be invariant to rotations of the input [REF!!].
%The usual augmentations distributions used for image classification do take this form, and therefore do not induce full invariances. 
%Instead, they induced so-called ``soft'' invariance \citep{van2018learning}, meaning intuitively that $\finv(\x; \w)$ will vary less to perturbations of the input than $f(\x; \w)$, particularly when those perturbations are similar to the augmentation distribution.
%Finally, note that the averaging probabilities approach can also be understood as resulting from a ``noisy input'' model \citep{wenzel2020good} (Appendix~\ref{todo}).

%The exact but intractable log-likelihoods, for averaging logits and averaging probabilities are
%\begin{align}
%  \Lprob^i\b{y_i; \w} &= \log \Pprob \bc{y_i}{\x_i, \w} \nonumber\\
%  &= \log \E\sqb{\softmax_{y_i} \f(\x'_{i}; \w)},\label{eq:avgp_exact}\\
%  \Llogits^i\b{y_i; \w} &= \log \Plogits\bc{y_i}{\x_i, \w} \nonumber \\
%  &= \log \softmax_{y_i} \E\sqb{\f(\x'_{i}; \w)}.
%\end{align}
%
%\begin{align}
%\Llogits^i&=\log \softmax_{y_i} \finv(\x_i;\w) =\log\softmax_{y_i} \int \P(\x_i'|\x_i) f(\x_i';\w)d\x'_i\\
%\Lprob^i&=\log \pinv(\x_i;\w)[y_i] =\log\int \P(\x_i'|\x_i)\softmax_{y_i} f(\x_i';\w)d\x_i'
%\end{align}
%The concavity of the log and log-softmax functions \citep{boyd2004convex} allow us to apply Jensen's inequality in either case, giving us the following stochastic lower bounds, which can be evaluated with multiple samples $K\geqslant1$
%
%\begin{equation}
%\small
%\begin{aligned}
%\Lhlogits^i&\geqslant\int \left[\prod_k \P(\x'_{i,k}|\x_i)\right] \log \softmax_{y_i} \left[\frac{1}{K}\sum_kf(\x_{i,k}';\w)\right] \prod_k d\x'_{i,k}\triangleq \Exdashik \sqb{\Lhlogits{K}^i} \\
%% \end{aligned}
%% \\
%% \\%\qquad
%% \small
%% \begin{aligned}
%\LhprobK^i&\geq\int \left[\prod_k \P(\x'_{i,k}|\x_i)\right]\log \left[\frac{1}{K}\sum_{k=1}^K \softmax_{y_i} f(\x_{i,k}';\w)\right]\prod_k d\x'_{i,k}\triangleq \Exdashik \sqb{\LhprobK{K}^i}\\
%\end{aligned}
%\end{equation}
%
%

%To obtain a principled log-likelihood incorporating data augmentation, we cannot take the standard approach of just using augmented data in a pre-existing algorithm.
%Instead, we need to build data augmentation into the probabilistic generative model describing the likelihood.
%To do this, we consider priors in which the output class label is given by averaging over a distribution over all possible augmentations of a single underlying image.
%There are two sensible choices as to what quantities to average which both form valid probability distributions over labels and hence valid log-likelihoods: averaging logits and averaging predictive probabilities.
%Averaging logits has been used in the Gaussian process community \citep[e.g.][]{van2018learning},
%\begin{subequations}
%\begin{align}
%  \Plogits\bc{y_i}{\x_i, \w} &= \softmax_{y_i} \E\sqb{f(\x'_{i}; \w)}\\ 
%  \Llogits^i\b{y_i; \w} &= \log \Plogits\bc{y_i}{\x_i, \w} = \log \softmax_{y_i} \E\sqb{f(\x'_{i}; \w)}.
%\end{align}
%\end{subequations}
%The averaging probabilities generative model was initially written down in \citet{wenzel2020good}, but they only gave the training objective for the trivial case of one training sample (see Eq.~\ref{eq:Ktrain=1}),
%\begin{subequations}
%\begin{align}
%  \Pprob \bc{y_i}{\x_i, \w} &= \E\sqb{\softmax_{y_i} f(\x'_{i}; \w)}\\
%  \Lprob^i\b{y_i; \w} &= \log \Pprob \bc{y_i}{\x_i, \w} = \log \E\sqb{\softmax_{y_i} f(\x'_{i}; \w)}.
%\end{align}
%\end{subequations}
These likelihoods were originally proposed by \citet{wenzel2020good} for averaging probabilities and by \citet{van2018learning} for averaging logits.
However, they are intractable, as it is not (usually) possible to evaluate the expectation under all data augmentations.
Instead, we need to choose an estimator or bound on these quantities.
\citet{wenzel2020good} suggested a loose single-sample bound for averaging probabilities, and \citet{van2018learning} suggested an unbiased estimator that is restricted to quadratic log-likelihoods.
In contrast, we show that we can get tight, intuitive and easy-to-evaluate multi-sample bounds analogous to those in \citet{burda2015importance},
\begin{align}
  \nonumber
  \Lhprob^i\b{y_i; \w} &= 
       \log \b{\tfrac{1}{K} \tsum_{k=1}^K \softmax_{y_i} \f(\x'_{i; k}; \w)},\\
  \label{eq:inv_ll_bound_K}
  \Lhlogits^i\b{y_i; \w} &= \log \softmax_{y_i} \b{\tfrac{1}{K} \tsum_{k=1}^K \f(\x'_{i; k}; \w)}.
\end{align}
To prove the lower bound for averaging probabilities, we first rewrite the expectation inside the logarithm of \myeqref{eq:avgp_exact} as the expectation of its average, over $K$ identically distributed random variables, $\x'_{i;k}$. We then take an approach familiar from variational inference \citep{jordan1999introduction} by applying Jensen's inequality to the (concave) logarithm function,
\begin{align}
  \nonumber
  \Lprob^i\b{y_i; \w} &= \log \E\sqb{\tfrac{1}{K} \tsum_{k=1}^K \softmax_{y_i} \f(\x'_{i;k}; \w)}\\
   &\geq \E\sqb{\log \tfrac{1}{K} \tsum_{k=1}^K \softmax_{y_i} \f(\x'_{i;k}; \w)}\nonumber\\
   &= \E\sqb{\Lhprob^i\b{y_i; \w}}. \label{eq:bound_prob}
   \intertext{For averaging logits, we follow a similar method, noting that $\log \softmax_{y_i}$ is a concave function \citep{boyd2004convex} taking a vector of logits and returning a scalar log-probability for class $y_i$.  As such, we can again apply Jensen's inequality,}
  \nonumber
  \Llogits^i\b{y_i; \w} &= \log \softmax_{y_i} \E\sqb{\tfrac{1}{K} \tsum_{k=1}^K \f(\x'_{i;k}; \w)}\\
   &\geq \E\sqb{\log \softmax_{y_i} \tfrac{1}{K} \tsum_{k=1}^K \f(\x'_{i;k}; \w)} \nonumber\\
   &= \E\sqb{\Lhlogits^i\b{y_i; \w}}.\label{eq:bound_logits}
\end{align}
Finally, note that these objectives naturally correspond to the notions of averaging logits or averaging probabilities, which could be motivated using non-probabilistic considerations.
Importantly, we do not claim the notion of averaging probabilities or averaging logits for different augmentations as a contribution in itself. 
We only claim as a contribution the notion that averaging probabilities or logits provides lower bounds on principled log-likelihoods including DA, implying that they can be used in a principled Bayesian setting, and that they are not ruled out despite having some degree of stochasticity.
%These estimators are consistent in the sense that as $K\rightarrow \infty$ they become equal to the expectation.
%However, in practice we use finite and small $K$, raising the question of whether there are any guarantees (such as unbiasedness or lower-bounds) available for small $K$.
%Averaging logits was introduced in the Gaussian process setting, where such guarantees are available \citep{van2018learning}; however for neural networks we know of no method by which to obtain such guarantees and we suspect that this explains the community's reluctance to apply averaging logits to neural networks.
%In contrast, we can show that the finite sample estimator for averaging probabilities, $\Lhprob^i$, forms a lower bound by applying Jensen's inequality.
%First, note that for $\Lhprob^i$, the term inside the log is an unbiased estimator,
%\begin{align}
%  \nonumber
%  \E\sqb{\exp \Lhprob^i\b{y_i; \w}} &= \E\sqb{\tfrac{1}{K} \sum_{k=1}^K \softmax_{y_i} f(\x'_{i; k}; \w)} \\
%  &= \E\sqb{\softmax_{y_i} f(\x'_{i}; \w)} = \exp \Lprob^i\b{y_i; \w}
%\end{align}
%Taking the logarithm and applying Jensen's inequality we obtain,
%\begin{align}
%  \Lprob^i\b{y_i; \w} = \log \E\sqb{\exp \Lhprob^i\b{y_i; \w}} \geq \E\sqb{\Lhprob^i\b{y_i; \w}}.
%\end{align}
%so the expected value of $\Lhprob^i$ is a lower-bound on $\Lprob^i$, and by increasing $\Lhprob^i$, we can increase the true likelihood $\Lprob^i$.
%
%It is possible to eliminate the need for these approximations by considering the ``fixed-orbit'' setting...

%These bounds have a free parameter, $K$, raising the question of which values for $K$ are likely to be sensible.
%We know that the bounds become tighter as $K$ increases \citep[e.g.][]{burda2015importance}, and eventually become exact as $K$ approaches infinity, suggesting that larger values of $K$ will better.
%Remarkably, in variational inference (VI), practitioners frequently use a single-sample bound.
%%Na\"ively, we would expect a single-sample estimate of an expectation to have high variance, and high-variance estimators give loose bounds and biased inferences when used with Jensen's inequality \citep{liao2018sharpening}.
%%We do indeed find benefit in using $\Ktrain>1$ (Fig.~\ref{fig:class}), single-sample estimates are used frequently in variational inference \citep[VI;][]{jordan1999introduction}, which suggests they might be viable here too.
%%That said, high variance and hence biased estimators are a problem in VI for the same reason, which is commonly mitigated by using $K>1$ \citep{burda2015importance,aitchison2018tensor}.
%%That said, exactly the same issue (of high variance and bias from a single-sample bound) is encountered in VI, and a common strategy for reducing variance and tightening the bound is to use multiple samples  \citep[ $K>1$,][]{burda2015importance,aitchison2018tensor}.
%However, VI incorporates a highly effective variance reduction strategy that is absent in our setting: an optimized variational approximate posterior (see Appendix~\ref{sec:app:vi_var}). 
%%only reason they do something sensible is that in VI we optimizes a variational approximate posterior that is absent in our setting. 
%%Optimizing the approximate posterior itself suppresses the variance and tightens the bound (see Appendix~\ref{sec:app:vi_var}).
%In principle, similar variance reduction strategies exist in our setting, but would involve learning a separate variance-reducing augmentation distribution for each image, which is clearly impractical.
%%In the absence of such a strategy, the only viable approach to reducing variance in the Jensen bound is to use multiple samples, though the exact number of samples required at test and train time is an empirical question.
%%
%%However, it is important to note that VI introduces a variational approximate posterior which is optimized, and one effect of this optimized approximate posterior is to minimize variance of the term inside the expectation.
%%
%%That said, it is important to note that VI is very different.
%%To confirm that indeed, a single-sample estimator indeed represents a problematic approximation, note that 
%%While one might think this choice would be immaterial or even beneficial, for our purposes it is very concerning as 
%Indeed, in our setting, $K=1$ represents such a crude approximation that it collapses the differences between averaging probabilities, logits, and losses,
%\begin{align}  
%  \LhlogitsKone^i\b{y_i; \w} &=\LhprobKone^i\b{y_i; \w} = \Lhlogprob^i\b{y_i; \w}\label{eq:Ktrain=1}
%\end{align}
%which are all equal to $\log \softmax_{y_i} \f(\x'_{i; 1}; \w)$.
%In contrast, we show empirically that $\Lhprob$, $\Lhlogits$ and $\Lhlogprob$ have significant performance differences when $K>1$ (Figs.~\ref{fig:class} and~\ref{fig:cold_posterior}). Note that we are free to use different numbers of samples at test and training time, $\Ktest$ and $\Ktrain$ respectively.

%This collapse is intuitive because no averaging actually occurs if we take $K=1$. 
% To get any difference between the methods, we need to actually take an average across two or more augmentations of the same underlying image.
%Thus, if we are interested in principled data augmentation priors, using one training sample is highly undesirable: it has such large approximation errors that it collapses the differences between averaging probabilities, averaging logits, and averaging losses.
%Indeed, the reason that \citet{wenzel2020good} considered only a single-sample estimator of the bound (i.e.\ $\Ktrain=1$) is precisely that it is equivalent to the standard data augmentation setup.% they used in the rest of the paper. %which is what they implemented in the rest of the paper).
%
%In general, we may use different values of $K$ at training and test time, $\Ktrain$ and $\Ktest$ respectively.
%In particular, we apply Jensen's inequality, noting that 
%%Substituting $\softmax \f(\cdot;\w)$ and $\f(\cdot;\w)$ in for $g(\cdot;\w)$ in multi-sample bound~\myeqref{eq:inv_ll_bound_K}, we arrive at the following bounds for the classification log-likelihoods
%\begin{align}
%  \LprobK^i\b{y_i; \w} &= \E\sqb{\log \tfrac{1}{K} \tsum_{k=1}^K \softmax_{y_i} \f(\x'_{i;k}; \w)}\label{eq:bound_prob}\\
%  &= \E\sqb{ \Lhprob^i },\nonumber\\
%  \LlogitsK^i\b{y_i; \w} &= \E\sqb{\log \softmax_{y_i} \tfrac{1}{K} \tsum_{k=1}^K \f(\x'_{i;k}; \w)} \label{eq:bound_logits}\\
%  &= \E\sqb{ \Lhlogits^i }. \nonumber
%\end{align}
%%To prove the lower bound for averaging probabilities, we first rewrite the expectation inside the logarithm of \myeqref{eq:avgp_exact} as the expectation of its average, over $K$ identically distributed random variables, $\x'_{i;k}$. We then take an approach familiar from variational inference \citep{jordan1999introduction} by applying Jensen's inequality to the (concave) logarithm function.   
%%\begin{align}
%%  \nonumber
%%  \Lprob^i\b{y_i; \w} &= \log \E\sqb{\tfrac{1}{K} \tsum_{k=1}^K \softmax_{y_i} \f(\x'_{i;k}; \w)}\\
%%   &\geq \E\sqb{\log \tfrac{1}{K} \tsum_{k=1}^K \softmax_{y_i} \f(\x'_{i;k}; \w)}\nonumber\\
%%   &= \E\sqb{\Lhprob^i\b{y_i; \w}}. \label{eq:bound_prob}
%%   \intertext{For averaging logits, we follow a similar method, noting that $\log \softmax_{y_i}$ is a concave function \citep{boyd2004convex} taking a vector of logits and returning a scalar log-probability for class $y_i$.  As such, we can again apply Jensen's inequality,}
%%  \nonumber
%%  \Llogits^i\b{y_i; \w} &= \log \softmax_{y_i} \E\sqb{\tfrac{1}{K} \tsum_{k=1}^K \f(\x'_{i;k}; \w)}\\
%%   &\geq \E\sqb{\log \softmax_{y_i} \tfrac{1}{K} \tsum_{k=1}^K \f(\x'_{i;k}; \w)} \nonumber\\
%%   &= \E\sqb{\Lhlogits^i\b{y_i; \w}}.\label{eq:bound_logits}
%%\end{align}
%%These estimators are consistent in the sense that as $K\rightarrow \infty$ they become equal to the expectation.
%%However, in practice we use finite and small $K$, raising the question of whether there are any guarantees (such as unbiasedness or lower-bounds) available for small $K$.
%%Averaging logits was introduced in the Gaussian process setting, where such guarantees are available \citep{van2018learning}; however for neural networks we know of no method by which to obtain such guarantees and we suspect that this explains the community's reluctance to apply averaging logits to neural networks.
%%In contrast, we can show that the finite sample estimator for averaging probabilities, $\Lhprob^i$, forms a lower bound by applying Jensen's inequality.
%%First, note that for $\Lhprob^i$, the term inside the log is an unbiased estimator,
%%\begin{align}
%%  \nonumber
%%  \E\sqb{\exp \Lhprob^i\b{y_i; \w}} &= \E\sqb{\tfrac{1}{K} \sum_{k=1}^K \softmax_{y_i} f(\x'_{i; k}; \w)} \\
%%  &= \E\sqb{\softmax_{y_i} f(\x'_{i}; \w)} = \exp \Lprob^i\b{y_i; \w}
%%\end{align}
%%Taking the logarithm and applying Jensen's inequality we obtain,
%%\begin{align}
%%  \Lprob^i\b{y_i; \w} = \log \E\sqb{\exp \Lhprob^i\b{y_i; \w}} \geq \E\sqb{\Lhprob^i\b{y_i; \w}}.
%%\end{align}
%%so the expected value of $\Lhprob^i$ is a lower-bound on $\Lprob^i$, and by increasing $\Lhprob^i$, we can increase the true likelihood $\Lprob^i$.
%%
%%It is possible to eliminate the need for these approximations by considering the ``fixed-orbit'' setting...
%Note that \citet{wenzel2020good} gave the single-sample averaging probabilities bound, but did not generalize it to tighter multi-sample bounds.

Increasing $K$ reduces the variance and tightens the bounds, which eventually become exact as $K\rightarrow\infty$~\citep{burda2015importance},
\begin{align}  
  \E\sqb{\Lhlogits^i\b{y_i; \w}} &\leq \E\sqb{\LhlogitsKpo^i\b{y_i; \w}},\\
  \Llogits^i\b{y_i; \w} &= \lim_{K\rightarrow\infty} \Lhlogits^i\b{y_i; \w},\\
  \E\sqb{\Lhprob^i\b{y_i; \w}} &\leq \E\sqb{\LhprobKpo^i\b{y_i; \w}},\\
  \Lprob^i\b{y_i; \w} &= \lim_{K\rightarrow\infty} \Lhprob^i\b{y_i; \w}.
\end{align}
However, larger $K$ introduces greater computational cost. 
We therefore consider what value of $K$ is likely to be sensible, by plotting the bound against $K$.
Indeed, we found that the bound increases with $K$ up to around $10$, at which point it saturates (Fig.~\ref{fig:loglik_vs_Ktest}).
While these differences might seem small when evaluated purely at test time, they seem to cause much larger differences when integrated into training (Figs.~\ref{fig:class} and~\ref{fig:cold_posterior}).
In contrast, in VI, practitioners frequently use a single-sample bound.
%Na\"ively, we would expect a single-sample estimate of an expectation to have high variance, and high-variance estimators give loose bounds and biased inferences when used with Jensen's inequality \citep{liao2018sharpening}.
%We do indeed find benefit in using $\Ktrain>1$ (Fig.~\ref{fig:class}), single-sample estimates are used frequently in variational inference \citep[VI;][]{jordan1999introduction}, which suggests they might be viable here too.
%That said, high variance and hence biased estimators are a problem in VI for the same reason, which is commonly mitigated by using $K>1$ \citep{burda2015importance,aitchison2018tensor}.
%That said, exactly the same issue (of high variance and bias from a single-sample bound) is encountered in VI, and a common strategy for reducing variance and tightening the bound is to use multiple samples  \citep[ $K>1$,][]{burda2015importance,aitchison2018tensor}.
However, VI incorporates a highly effective variance reduction strategy that is absent in our setting: an optimized variational approximate posterior (see Appendix~\ref{sec:app:vi_var}). 
%only reason they do something sensible is that in VI we optimizes a variational approximate posterior that is absent in our setting. 
%Optimizing the approximate posterior itself suppresses the variance and tightens the bound (see Appendix~\ref{sec:app:vi_var}).
In principle, similar variance reduction strategies exist in our setting, but would involve learning a separate variance-reducing augmentation distribution for each image, which is clearly impractical.
%In the absence of such a strategy, the only viable approach to reducing variance in the Jensen bound is to use multiple samples, though the exact number of samples required at test and train time is an empirical question.
%
%However, it is important to note that VI introduces a variational approximate posterior which is optimized, and one effect of this optimized approximate posterior is to minimize variance of the term inside the expectation.
%
%That said, it is important to note that VI is very different.
%To confirm that indeed, a single-sample estimator indeed represents a problematic approximation, note that 
%While one might think this choice would be immaterial or even beneficial, for our purposes it is very concerning as 
Indeed, in our setting, $K=1$ represents such a crude approximation that it collapses the differences between averaging probabilities, logits, and losses,
\begin{align}  
  \LhlogitsKone^i\b{y_i; \w} &=\LhprobKone^i\b{y_i; \w} = \Lhlogprob^i\b{y_i; \w}\label{eq:Ktrain=1}
\end{align}
which are all equal to $\log \softmax_{y_i} \f(\x'_{i; 1}; \w)$.
%In contrast, we show empirically that $\Lhprob$, $\Lhlogits$ and $\Lhlogprob$ have significant performance differences when $K>1$ (Figs.~\ref{fig:class} and~\ref{fig:cold_posterior}). 
%However, the finite orbit setting uses the same augmentations, and therefore the same number of augmentations, at test and train time: $\Ktrain=\Ktest=K$.

%This collapse is intuitive because no averaging actually occurs if we take $K=1$. 
% To get any difference between the methods, we need to actually take an average across two or more augmentations of the same underlying image.
%Thus, if we are interested in principled data augmentation priors, using one training sample is highly undesirable: it has such large approximation errors that it collapses the differences between averaging probabilities, averaging logits, and averaging losses.
%Indeed, the reason that \citet{wenzel2020good} considered only a single-sample estimator of the bound (i.e.\ $\Ktrain=1$) is precisely that it is equivalent to the standard data augmentation setup.% they used in the rest of the paper. %which is what they implemented in the rest of the paper).
%
%In general, we may use different values of $K$ at training and test time, $\Ktrain$ and $\Ktest$ respectively.


\begin{figure}[t]
  \centering
  \includegraphics[width=0.43\textwidth]{figs/single_batch_test_log_lik_vs_samples_se.pdf}
  \caption{The effect of $\Ktest$ on the log-likelihood bound for a test batch (size 512) of CIFAR-10. Values shown for ResNet20 BNN trained and tested with $\Lhprob$ ($\Ktrain=8$ and $T=0.001$). Error bars cover two standard errors above/below the mean for DA sampling with different seeds. Sixty seeds used for $\Ktest\in\{1,2\}$, thirty for $\Ktest=4$ and five for all other $\Ktest$.}
  \label{fig:loglik_vs_Ktest}
%   \vspace*{-10pt}
\end{figure}

\subsection{Finite Orbit}
\label{sec:finite_orbit}
Finally, all of the above is for the usual ``full orbit'' setting, where there is a distribution over a very large, or even infinite, number of possible augmentations.
The full orbit setting necessitates the use of the bound in \myeqref{eq:inv_ll_bound_K}, and allows us to use different numbers of samples at test and training time, $\Ktest$ and $\Ktrain$ respectively. 
Remarkably, if we consider an alternative ``finite orbit'' by restricting the augmentations to a small subset, we can \textit{exactly} evaluate the log-likelihood.
In the finite orbit setting, the distribution over augmented images, $\x_i'$, conditioned on the underlying unaugmented image, $\x_i$, can be written as,
\begin{align}
  \P\bc{\x_i'}{\x_i} &= \tfrac{1}{K} \tsum_{k=1}^K \delta\b{\x_i' - a_k(\x_i)}, \label{eq:finite_orbit}
\end{align}
where $\delta$ is the Dirac delta, and $a_k$ is a function that applies the $k$th fixed augmentation.
In this setting, it is possible to exactly compute $\Llogits^i(y_i; \w)$ and $\Lprob^i(y_i; \w)$ by summing over the $K$ augmentations. 
This allows us to empirically explore how exact log-likelihood computation influences the CPE, comparing it with the bounds in the full-orbit setting \myeqref{eq:inv_ll_bound_K} in Sec.~\ref{sec:res:cold_post}. 
When implementing finite orbit augmentation in practice, we choose the $K$ fixed augmentations by sampling them before training.
The finite orbit setting uses the same augmentations, and therefore the same number of augmentations, at test and train time: $\Ktrain=\Ktest=K$.

\section{RESULTS}
\label{sec:results}

\subsection{Principled DA in Non-Bayesian Networks}
\label{sec:sgd_results}
\newcommand{\augprob}{red}
\newcommand{\noaugprob}{green}
\newcommand{\auglogits}{orange}
\newcommand{\noauglogits}{blue}


We begin by comparing averaging logits and averaging probabilities in a non-Bayesian setting: SGD.
%While averaging logits (or its closely related cousin ``feature averaging'') is sometimes applied at test time \citep{he2015delving,benton2020learning,foster2020improving}, to our knowledge averaging logits is not used at train time in neural networks \citep[though see][which averages logits at train-time in GPs]{van2018learning}. Similarly, averaging probabilities has been used at test time \citep{krizhevsky2012imagenet}, but not at train time (to our knowledge).
Critically, higher values of $\Ktrain$ imply a larger computational cost per epoch, as each image is replicated and augmented $\Ktrain$ times before going through the network.
When assessing the benefit of averaging probabilities/logits over standard DA for SGD training, we must therefore control for computational cost. We do this by training for $200/\Ktrain$ epochs.
Note that $\Ktrain=1$ with no test-time augmentation (i.e.\ \noaugprob{} and \noauglogits{} in Fig.~\ref{fig:class}) corresponds to the standard DA approach for both averaging logits and averaging probabilities \myeqref{eq:Ktrain=1}.
In this experiment, we consider only full orbit, which unlike finite orbit allows us to decouple $\Ktrain$ and $\Ktest$.

\begin{figure*}
  \centering
  \includegraphics{figs/logit_vs_prob_performance.pdf}
  \caption{Comparison of averaging logits and probabilities for different values of $\Ktrain$, and using $\Ktest=10$ vs.\ using no test-time augmentations.
  Here, we use ResNet18 with SGD (i.e.\ no Bayesian inference).
  We use only full orbit to decouple $\Ktrain$ from $\Ktest$.
    \label{fig:class}
  }
%   \vspace{-10pt}
\end{figure*}
We trained ResNet18\footnote{\href{https://github.com/kuangliu/pytorch-cifar}{\nolinkurl{github.com/kuangliu/pytorch-cifar}}; MIT Licensed} on CIFAR-10, CIFAR-100 \citep{krizhevsky2009learning}\footnote{\href{https://cs.toronto.edu/~kriz/cifar.html}{\nolinkurl{cs.toronto.edu/~kriz/cifar.html}}} and FashionMNIST \citep{xiao2017fashion}\footnote{\href{https://github.com/zalandoresearch/fashion-mnist}{\nolinkurl{github.com/zalandoresearch/fashion-mnist}}; MIT Licensed} with a learning rate of 0.1, decayed to 0.01 three quarters of the way through training. We apply the same two augmentation transformations as \cite{wenzel2020good,fortuin2021bayesian,noci2021disentangling}: 1. a random crop with padding of four pixels on all borders and 2. a random horizontal flip with probability 0.5. The training runs took around 12 GPU-days on Nvidia 2080s.\footnote{Code available: \href{https://anonymous.4open.science/r/Augmentations-1E35/}{\nolinkurl{anonymous.4open.science/r/Augmentations-1E35/}}} %with averaging logits and averaging probabilities (Fig.~\ref{fig:class}).

In agreement with past work~\citep{lyle2020benefits}, we found that averaging over augmentations at test-time (\augprob{} and \auglogits{}) is better than using the test image without augmentation (\noaugprob{} and \noauglogits{}), with $\Ktrain=1$ corresponding to the standard DA procedure.
In addition, we show that improved performance with multiple test-time augmentations continues to hold for larger values of $\Ktrain$.
Thus, if sufficient compute is available at test-time, averaging across augmentations gives an easy method to improve the performance of a pre-trained network.

Importantly, we see some performance gains with higher values of $\Ktrain$ if we focus on the case with test augmentations, though they are somewhat inconsistent across datasets.
We see strong improvements for the hardest dataset (CIFAR-100), and smaller improvements that saturate at $\Ktrain=2$ for CIFAR-10.
For FashionMNIST, the picture is more mixed.
We suspect this is because we used a DA strategy tuned for CIFAR-10 and CIFAR-100, rather than FashionMNIST. %In particular, the augmentation includes a random horizontal flip which is unlikely to be a useful invariance for recognising digits in SVHN.
%These gains are smaller and saturate at $\Ktrain=2$ on easier datasets such as CIFAR-10, Fashion-MNIST and SVHN, but are very evident on the hardest dataset: CIFAR-100.

In addition, averaging probabilities seems to give somewhat better performance than averaging logits: compare averaging probabilities vs.\ logits both with test-time augmentation (\augprob{} vs.\ \auglogits{}) and without test-time augmentation (\noaugprob{} vs.\ \noauglogits{}).
%Further, note that the differences between averaging logits and averaging probabilities are smaller and somewhat consistent when using test-time augmentation (\augprob{} vs.\ \auglogits{}), but are much larger when using no test-time augmentation (\noaugprob{} vs.\ \noauglogits{}).
The performance differences are consistent in both comparisons, though smaller when test-time augmentation is applied.

Indeed, performance falls quite dramatically as $\Ktrain$ increases for averaging logits without test-time augmentation (\noauglogits{}).
This is an indication that averaging probabilities and logits might actually behave quite differently.
%perhaps the underlying neural network actually becomes less invariant as $K_\text{train}$ increases.
%This might occur because averaging over multiple augmentations introduces a degree of invariance in and of itself, so there is less need for the underlying neural network to be invariant.
%We would expect a less-invariant network to have reasonable performance with test-time augmentation (\auglogits{}) but to give worse performance without test-time augmentation (\noauglogits{}). From a statistical point of view, this is not problematic, as the same likelihood should be applied at test time as at training time, which prescribes augmentation at test time. From a practical point of view however, this behaviour is problematic, as we may want to use a small $\Ktest$ for computational efficiency at test time.
%% While this can perhaps be resolved by just using $\Ktrain=1$, it is extremely problematic in our statistical framework because as $\Ktrain$ increases, we get closer to the correct log-likelihood.
%In contrast, this does not occur with averaging probabilities: performance is much more constant as $\Ktrain$ increases, even when evaluating without test-time augmentation (\noaugprob{}).
%
%
To understand how these differences might arise, consider the effect of averaging on the NN function itself. Both schemes can be justified by using averaging to increase invariance to the augmentation transformations (Sec.~\ref{sec:tighter}). Averaging probabilities, however, also forces the NN function itself to become invariant. If different augmentations produce different predictions, then the resulting averaged class probabilities will be more uncertain, which is penalized by the likelihood on the training points. This effect is much weaker when averaging logits.
Consider an extreme example, as illustrated in Fig.~\ref{fig:avg_example}. It is a two-class classification problem with two augmentations, $\x'_1$ and $\x'_2$, of the same image with logits, $\f(\x'_1) = (10, -10)$ and $\f(\x'_2) = (-1, 1)$.
Averaging logits gives us $\E\sqb{\f(\x')} = (4.5, -4.5)$, and applying the softmax, we very confidently predict the first class.
In contrast, if we use averaging probabilities, then the first augmentation almost certainly predicts the first class $p(\x'_1) \approx (1, 0)$ and the second augmentation almost certainly predicts the second class, $p(\x'_2) \approx (0, 1)$, so when we average them we obtain $\E\sqb{p(\x')} \approx (0.5, 0.5)$, which indicates a high degree of uncertainty.


\begin{figure}[t]
  \centering
  \includegraphics{figs/logit_vs_prob_average.pdf}
  \caption{Example effect of averaging logits against averaging probabilities. $\x'_1$ and $\x'_2$ are two augmentations of the same image, $\f(\x'_1)$ and $\f(\x'_2)$ are logits outputted by a NN, and $p(\x'_1)$ and $p(\x'_2)$ are the probabilities corresponding  to these logits. The prediction derived from the averaged logits is much more certain than the average of the individual probabilities.
    \label{fig:avg_example}
  }
%   \vspace{-10pt}
\end{figure}
% %See Discussion for intuition about why averaging logits with no test-time augmentations performs worse with increasing $\Ktrain$ while averaging probabilities is has roughly constant performance.
% To understand this difference, note that the two methods described above have two very different underlying intuitions.
% %Averaging logits has connections to partially invariant Gaussian processes kernels \citep{ginsbourger2012argumentwise,ginsbourger2013invariances,ginsbourger2016degeneracy,van2018learning}.
% In \Citet{van2018learning}, averaging logits was motivated by taking less-invariant Gaussian process priors and making them ``more invariant'' by averaging over augmentations.
% In contrast, we conceptualize averaging probabilities in a very different manner: it forces the NN function itself to become invariant.
% %For a NN's predictions to be trustworthy, we would hope that the prediction would vary little under DA.
% %Indeed, if the NN's predictions changed dramatically for different augmentations, we would be inclined to trust its predictions less.
% If different augmentations produce different predictions, then the resulting averaged prediction will be more uncertain, which is penalised by the likelihood on the training points. This is not necessarily the case with averaging logits.
% Consider an extreme example: a two-class classification problem with two augmentations, $\x'_1$ and $\x'_2$, of the same image with logits, $f(\x'_1) = (100, -100)$ and $f(\x'_2) = (-10, 10)$.
% Averaging logits gives us $\E\sqb{f(\x')} = (45, -45)$, and applying the softmax, we very confidently predict the first class.
% In contrast, if we use averaging probabilities, then the first augmentation almost certainly predicts the first class $p(X'_1) \approx (1, 0)$ and the second augmentation almost certainly predicts the second class, $p(X'_2) \approx (0, 1)$, so when we average them we obtain $\E\sqb{p(X')|X} \approx (0.5, 0.5)$, which indicates a high degree of uncertainty.
%
%This provides a potential explanation for the differential effects of increasing $\Ktrain$ with no test-time augmentations in Fig.~\ref{fig:class}.
%The supervised objective encourages the classifier to be as certain as possible about the true class label.
%To achieve certainty for averaging probabilities, the predictive distributions for almost all augmentations of the same underlying image need to be concentrated on the true class. 
%This requirement encourages the trained neural network to be invariant to the underlying augmentation, just like in the standard approach to DA.
%While this happens to some extent with averaging logits, the effect is smaller: as in the above example, it is possible for the overall system to be highly confident about the correct class-label with an underlying network that is less invariant.

\subsection{Bayesian Neural Networks and the Cold Posterior Effect}
\label{sec:res:cold_post}

\begin{figure*}[t]
  \centering
  \begin{subfigure}{\textwidth}
      \includegraphics{figs/mnist_cold_posterior_summary_reruns_2row_LA_leg_rs.pdf}
    %   \caption{MNIST, FCNN}
      \label{fig:cold_posterior_mnist}
  \end{subfigure}
  \begin{subfigure}{\textwidth}
      \includegraphics{figs/cifar10_cold_posterior_summary_reruns_2row_LA_leg_rs.pdf}
      \label{fig:cold_posterior_cifar10}
  \end{subfigure}
  \caption{The cold posterior effect for different DA setups with GGMC inference \citep{guided-mcmc}. Without DA, there is a minimal CPE. Most other configurations show significant improvement for $T<1$, with the exception of averaging the logits over a finite orbit. Averages computed with $\Ktrain=8$ and $\Ktest=8$.
  }
  \label{fig:cold_posterior}
%   \vspace{-10pt}
\end{figure*}

Next, we ask a very different question: how is the CPE influenced when DA is incorporated into the model in a principled way?
To this end, we use a different experimental setup.
In particular, we take the code\footnote{\href{https://github.com/ratschlab/bnn_priors}{\nolinkurl{github.com/ratschlab/bnn_priors}}; MIT Licensed} and networks from \citet{fortuin2021bayesian,fortuin2021bnnpriors} and mirror their experimental setup for CIFAR-10 and MNIST as closely as possible.
This code combines a cyclical learning rate schedule \citep{zhang2019cyclical}, a gradient-guided Monte Carlo (GGMC) scheme \citep{guided-mcmc}, and the preconditioning and convergence diagnostics from \citet{wenzel2020good}. The CIFAR-10 DA transformations are the same as those described in Sec.~\ref{sec:sgd_results} and for MNIST we apply random cropping with a padding of two pixels, then random rotation by an angle sampled uniformly over $(-\pi / 6, \pi / 6)$. 
Following \citet{fortuin2021bayesian}, we ran 60 cycles with 50 epochs in each cycle. 
We recorded one sample at the end of each of the last five epochs of a cycle, giving 300 samples total. 

Importantly, to allow for running many sampling epochs in these experiments, we follow \citet{fortuin2021bayesian} in using the ResNet20 architecture from \citet{wenzel2020good} for CIFAR-10, which has far fewer channels than the ResNet18 used in Sec.~\ref{sec:sgd_results} (i.e.\ 32 channels for the first block up to 128 in the last block compared to 64 channels up to 512 \citep{he2016deep}).
As such, SGD with this network performs poorly compared with that in Sec.~\ref{sec:sgd_results} (ResNet20, CIFAR-10 accuracy $\sim 92\%$ \citep{wenzel2020good} vs.\ ResNet18, CIFAR-10 accuracy $\sim 95\%$ \citep{he2016identity}). For MNIST, we use the three-layer fully connected network (FCNN) used by \citet{fortuin2021bayesian}. The experiments took around 90 GPU-days on Nvidia RTX6000s\footnote{Code available: \url{https://github.com/sethnabarro/bnn-data-aug/}}.% For all augmentation and averaging configurations, we found the posterior predictive at some non-zero temperature to be more accurate than its SGD equivalent (see Appendix~\ref{app:sgd_cold_posterior}).

The results are presented in Fig.~\ref{fig:cold_posterior}. We replicate the finding that the CPE is largely absent without DA (dashed black line), and is present in the standard setup with DA at training time ($\Ktrain=1$) but without augmentation at test time (solid black).
Further, we show that the CPE persists with principled DA likelihoods: averaging logits with full orbit (purple, first and third rows), and averaging probabilities with finite and full orbits (green).

For CIFAR-10, the best method overall appears to be averaging probabilities with a full orbit (dark green line, third row) at $T=0.001$, though at $T=1$ averaging logits (dark purple lines) outperforms the other methods. For the MNIST experiments, logit averaging over a full orbit (purple line, top row) performs best at all temperatures, though has a similar accuracy to averaging probabilities (green line, top row) at $T=0.001$. Interestingly, the CPE for averaging probabilities (green) is stronger than that for both logit averaging (dark purple) and standard DA (solid black), across all MNIST experiments.   

%The expected calibration error (ECE) scores also benefit from a cooled posterior (Fig.~\ref{fig:cold_posterior}, RHS). The lowest ECEs are achieved by averaging logits over a full orbit, and standard DA. In combination with log-likelihoods, which more heavily penalize a low model density where there are observations, these results suggests the logit-averaging produces more certain predictions. This is consistent with the intuition presented in Fig.~\ref{fig:avg_example}.  
%Averaging probabil clearly find that averaging probabilities (Fig.~\ref{fig:cold_posterior} green lines) gives considerably better performance than averaging logits (Fig.~\ref{fig:cold_posterior} purple lines), at least for $T<1$.

%Surprisingly, in this setting we found that the standard data augmentation setup (training augmentations, but no averaging or test-time augmentation) performs almost as well as as averaging probabilities (with a full-orbit) and considerably better than averaging logits.
%%In this setting, we find that the standard data augmentation setup performs almost as well as averaging probabilities.
%This is surprising because in Fig.~\ref{fig:class}, we found that averaging logits or averaging probabilities at test time (red and orange lines) performed much better than the standard setting (blue and green lines with $\Ktrain=1$).
%There are two things to remember.
%First, the yellow line (training augmentations, but no averaging or test-time augmentation) represents an invalid likelihood: the only view ...
%The first thing to remember is that the purpose of this plot is to assess whether the cold-posterior effect is still present when we use principled approaches to data augmentation.

For both datasets, the CPE is near absent in one particular setting: averaging logits with a finite orbit (purple line, second and fourth rows).
However, the relevance of this is unclear, as for CIFAR-10 it is clearly the worst performing of all DA approaches, and for MNIST it is outperformed by standard DA.
Indeed, remember that the arguments for the optimality of Bayesian inference apply only in the case that the model is well-specified \citep{kolmogorov1950foundations,savage1954foundations,jaynes2003probability}.
However, the comparatively poor performance of averaging logits with a finite orbit indicates that it is likely to be the wrong model, while other settings are likely to be closer to the true model.
In that case, the presence or absence of the CPE in the wrong model (averaging logits with a finite orbit) is immaterial to our understanding of the CPE in the right model.
Note that this argument could not be made if there was a model without the CPE with performance equal to or better than the other models (see Sec.~\ref{sec:conclusions} for further discussion).

The CPE was originally discovered in \citet{wenzel2020good} when assessing test accuracy and log-likelihood --- they did not consider other measures of distribution calibration like expected calibration error (ECE).
Indeed, later work on the CPE found that measures such as ECE are far more complex and usually do not agree with test accuracy and log-likelihood \citep{fortuin2021bnnpriors}.
It is therefore difficult to interpret the differences between test log-likelihood and ECE, especially if we remember that test log-likelihood is itself a proper scoring rule \citep{gneiting2007strictly}, and therefore captures one possible notion of calibration.
In particular, test log-likelihood heavily penalizes an event assessed as low probability actually happening, e.g.\ if our classifier predicts a probability of $0.001\%$, while the event actually happens even $0.1\%$ of the time.
In contrast, ECE considers the absolute difference in probability, so it far more heavily penalizes e.g.\ a predicted probability of $40\%$ while the event actually happens $60\%$ of the time.
Needless to say, the most appropriate measure of calibration will depend heavily on the domain, with log-likelihood being more appropriate for low-probability but high-risk events. In our CIFAR-10 experiments, averaging probabilities (green) achieves the greatest log-likelihood scores, whereas standard DA (solid black) achieves the lowest ECE. This is contrasted with MNIST, for which averaging logits (purple) has highest log-likelihood and no DA (dashed black) has lowest ECE.


%In the finite orbit setting, it is evident that performance is much better with averaging probabilities.
%Surprisingly, the cold posterior effect is much smaller for averaging logits with finite orbit (dashed purple line), which demonstrated best performance at $T=1$ (close to  averaged logits over full orbit; solid purple line).
%However, the relevance of this is unclear, as the reduction in the cold posterior effect arises primarily because of worse low-temperature performance.
%However, the performance at lower temperatures is far worse than other approaches such as those based on averaging probabilities.

%This suggests that the 
%However, the relevance of this finding is debatable: if we believe that $T=1$ is the Bayesian solution then this result is highly relevant.
%However, Bayesian inference in the correct model should would have $T=1$ as the optimal temperature, and give optimal performance, which it clearly does not.

The usefulness of our results is contingent on understanding whether we are indeed accurately approximating the posterior.
To check this, we computed the kinetic temperature~\citep{leimkuhler2015canonical}, which estimates the temperature of a given parameter in the Langevin dynamics simulation from the norm of its momentum. 
In expectation, the kinetic temperature estimator should be equal to the desired temperature, $T$.
The results (Appendix~\ref{sec:app:kinetic_temp}) indicate that all the samplers run at their desired temperature, a result that is consistent with accurate posterior sampling.

As discussed in Sec.~\ref{sec:meth}, increasing $K$ tightens our log-likelihood bounds, but incurs greater computational cost. It is natural to question which value of $K$ is a good trade-off. We explore how the log-likelihood of test data under a trained model varies with $\Ktest$. As expected, the results (Fig.~\ref{fig:loglik_vs_Ktest}) show the log-likelihood increases with $\Ktest$, with even $\Ktest=2$ being a significant improvement over $\Ktest=1$ (standard DA). However, the curve plateaus, suggesting that for CIFAR-10, there is little benefit in using $\Ktest>8$.



\section{Related Work}
\label{sec:related_work}

Past work introduced noisy-input generative models which average probabilities \citep{wenzel2020good}.
However, this work did not consider the tighter multi-sample bounds developed here, or the finite orbit setting which allows us to evaluate the exact likelihood.
This left open the possibility raised by \citet{izmailov2021bayesian} that the CPE was an artifact of standard DA resulting in an invalid likelihood.
In contrast, we considered exact likelihoods in the finite orbit setting, and tighter multi-sample lower bounds in the full orbit setting. 
Further, the invariant function perspective allowed us to derive a log-likelihood bound for averaging logits, not considered by \cite{wenzel2020good}.
As the CPE persists when using our principled DA models, we can exclude the possibility that the CPE is an artifact of DA giving a ``randomly perturbed log-likelihood''.
Other work has introduced a log-likelihood estimator for averaging GP logits using the invariance principle \citep{van2018learning}. However, the method only works for a quadratic log-likelihood and thus necessitates P\'{o}lya-Gamma approximations for classification. Further, the work did not consider BNNs or the connection to the CPE.

%Past work purely in the Gaussian process (GP) context introduced a principled generative model incorporating averaging logits \citep{van2018learning}.
%However, they did not apply the method to (convolutional) neural networks, e.g.\ in modern image classification settings and they did not consider either averaging probabilities or the cold posterior effect.

%Nonetheless, the only possible explanation of the cold posterior effect that \citet{wenzel2020good} were unable to dismiss was that of model misspecification. Our present results, and those of \citet{fortuin2021bayesian} have not been able to remove the cold posterior effect by changing the prior over neural network functions.
%% But the present results and \citet{fortuin2021bayesian} suggest that the misspecification cannot lie in the prior over neural network functions.
%So which part is misspecified? \Citet{aitchison2020cold} suggests that the misspecification can be in the likelihood, not just in the prior over functions.
%%As such, both our results and those from \citet{fortuin2021bayesian} are compatible with an alternative view of model misspecification proposed in \citep{aitchison2020cold}.
%They noted that in heavily curated datasets such as CIFAR-10 and ImageNet, multiple annotators look at each image, and the image is only included in the dataset if they agree.
%If we take $S$ annotators who all agree about the class-label, the resulting true posterior is
%\begin{align}
%  \P\bc{\w}{\X, \y} &= \P\b{\w} \prod_{i=1}^N \P\bc{y_i}{\w, \x_i}^S,
%\end{align}
%where $\P\bc{y_i}{\w, \x_i}$ is taken to be the single-annotator predictive distribution.
%Critically, this strongly resembles the cold posterior in Eq.~\eqref{eq:cold post_exact_like} with $S=1/T$; the main difference being that only the likelihood, and not the prior is modified (see \citealp{aitchison2020cold} for further discussion).
%In this context, the observation that there is no cold posterior effect without augmentation is still puzzling.
%
%One potential resolution is to posit that the right prior must include DA, as evidenced by its undeniable performance benefits.
%Importantly, while the Bayesian framework states that the true model performs best, it does not say anything about the relative performance of misspecified models.
%As such, if we think that the ``true model'' includes both curation and augmentation, then Bayesian theory only predicts that a model with curation and augmentation should perform best.
%In particular, it does not tell us about the relative performance of misspecified models that lack either curation or DA.
%In practice, we suspect that the additional certainty provided by data curation only helps generalisation in the context of an invariant model with DA, whereas a non-invariant model without DA may just overfit to the training data.
%In that context, a model without DA has a strongly misspecified prior, and the Bayesian framework makes no reasonable predictions about the behaviour of such a model.


There is a small but growing body of work that considers averaging over multiple augmentations at training time \citep{hoffer2019augment,berman2019multigrain,choi2019faster,benton2020learning,lyle2020benefits,touvron2021going,fort2021drawing}. 
%The issue is still highly topical, with important contemporaneous work \citep{fort2021drawing}.
However, this work was not done within a Bayesian framework (e.g.\ by using stochastic gradient Langevin dynamics (SGLD) or a similar inference algorithm), did not show that averaging across multiple training augmentations gives a multi-sample bound on the log-likelihood of a principled model, did not consider the finite-orbit setting where the log-likelihood can be computed exactly, and did not consider the interaction with the CPE.
In addition, much of this work uses averaging losses \citep{hoffer2019augment,berman2019multigrain,choi2019faster,benton2020learning,touvron2021going,fort2021drawing} which is equivalent to using a loose single-sample bound on the log-likelihoods. While \cite{lyle2020benefits} show that feature averaging during training can improve generalization, our work is, to the best of our knowledge, the first to average predicted probabilities at training time.
%Past \citep{lyle2020benefits} and contemporaneous work \citep{fort2021drawing}, suggests that in agreement with our results in Fig.~\ref{fig:class}, using multiple training augmentations improves generalization in large neural networks.
%However, \citep{lyle2020benefits}   In addition, \citep{fort2021drawing} considered only averaging losses. % within a principled Bayesian framework as they used averaging losses, and they did not assess any connection to the cold posterior effect. 
%Similarly, \citet{hoffer2019augment} and~\citet{berman2019multigrain} averaged losses for improving the generalization of models trained with large batches, and \cite{choi2019faster} did so for better hardware utilization. \cite{benton2020learning} averaged losses in order to learn invariances in neural networks.
%In this case, there is no difference in the expected training objective for averaging one sample vs.\ multiple samples, so \citet{benton2020learning} used a single sample.
%Last, the averaging of losses is applied in the context of vision transformers by \cite{touvron2021going}. 
Finally, the idea of averaging at test-time is more common and has been practiced for longer \citep[e.g.][]{krizhevsky2012imagenet,simonyan2014very,he2015delving,szegedy2015going,foster2020improving}.
%Averaging probabilities, which we found to perform slightly better than averaging logits, has been used at test time in many image classification works 
%Averaging logits is less widespread, though ``feature averaging'', which is closely related, was used by \cite{foster2020improving} to improve test time performance.
%To our knowledge, neither averaging logits nor probabilities has been used at training time.
% In order to learn invariances in neural networks, \cite{benton2020learning} averaged losses.
% For averaging losses, there is no difference in the expected training objective for averaging one sample vs.\ multiple samples, so \citet{benton2020learning} used a single sample.


%%obtaining an unbiased estimate of the objective based on a finite number of sampled augmentations required specific properties of the Gaussian process variational approximate posterior which are not available in our neural-network setting.
%%As such, 
%%Furthermore, 
%Precursors to this work created GP kernels that are actually invariant to a transformation group by averaging the logits for all elements of that group \citep{ginsbourger2012argumentwise,ginsbourger2013invariances,ginsbourger2016degeneracy}.

%\citet{wenzel2020good} wrote down the probabilistic generative model for averaging probabilities in Appendix K, but did not actually implement the resulting methods.
%Further, they considered only the trivial case of $\Ktrain=1$, under which averaging probabilities, averaging logits, and past heuristic approaches all have an equivalent training objectives (see Methods).
%As such, they were not able to distinguish the predictions for (and differential effects of cold posteriors on) these three different methods.

%Both \citet{van2018learning} and subsequent work in the neural network context \citep{benton2020learning} showed that it is possible to use similar objectives to learn partial invariances using gradient descent purely on the training dataset.
%However (and as discussed in the Background and Methods sections), \citet{benton2020learning} used the heuristic $\Llogprob$ objective.
%This objective does not correspond to a valid log-probability, so they were not able to evaluate basic quantities such as test-log-likelihood.
%In practice, they trained using $\Ktrain=1$, presumably because they saw no benefit in using higher values.
%As such, their training procedure was almost exactly equivalent to the standard neural-network setting (the key differences being that they differentiate with respect to the parameters of the augmentations, and included a term in the objective encouraging broader augmentations).
%In contrast, we give principled log-likelihood objectives under which we see strong benefits of training using multiple augmentations of the same underlying image.

%Critically, obtaining a unified, principled understanding of data augmentation as a log-likelihood is widely believed to be highly  important despite this prior work \citep{izmailov2021bayesian}.
%
A considerable body of past work on BNNs uses DA, with variational inference \citep{blundell2015weight,zhang2018noisy,osawa2019practical,ober2020global,unlu2021gradient}, Laplace approximations \citep{immer2021scalable}, and SGLD \citep[e.g.][]{zhang2019cyclical,fortuin2021bayesian,xi2021bayesian}.
However, as discussed in Sec.~\ref{sec:background} (Background), these methods simply replace the non-augmented data with augmented data and thus do not use a valid log-likelihood.
In contrast, we incorporated DA into the probabilistic generative model, and thus are able to give valid log-likelihoods based on averaging logits or averaging probabilities in the classification case.

%\section{Conclusions}
%We considered two principled probabilistic generative models incorporating DA. We found that they give improved performance in standard SGD settings with a fixed compute budget.
%In addition, we looked at their interaction with the cold posterior effect.
%The cold posterior effect did not go away when using principled DA, suggesting that the question of dataset size in DA may not provide an explanation of the cold posterior effect.
%Instead, these results are consistent with an alternative theory of the cold posterior effect, which introduces a probabilistic generative model describing data curation \citep{aitchison2020cold}.
%No particular negative social impacts are anticipated as this is largely theoretical work.

\section{Conclusion}
\label{sec:conclusions}
We have shown how DA can be properly incorporated into a generative model suitable for BNN inference, by deriving a lower bound on the log-likelihood of the augmentation-averaged network output. Empirically, we have seen that the CPE persists even when using our principled DA formulation, and in agreement with past work \citep{wenzel2020good,fortuin2021bayesian,izmailov2021bayesian}, we show that the CPE disappears without DA.

What do these results imply for the origin of the CPE?
Our models in principle have a clean log-likelihood which can be evaluated exactly in the finite-orbit setting, or which we estimate using tightened multi-sample bounds in the full-orbit setting.
This falsifies the hypothesis that the CPE is an artifact arising from DA giving a ``randomly perturbed log-likelihood [which] does not have a clean interpretation as a valid likelihood function''.

Indeed, it is worth stepping back and considering the original motivation for studying the CPE, namely that if we have the correct model, then Bayesian inference with $T=1$ should give optimal performance \citep{kolmogorov1950foundations,savage1954foundations,jaynes2003probability,wenzel2020good}.
Critically, we need the right model for us to expect optimal performance at $T=1$.
We now have two classes of model, with DA and without DA, so which is right(er)?
Given the significant and widely recognised performance benefits of DA, it seems very likely that the ``right'' model would include some form of DA. %, and that the model without DA is ``wrong(er)''.
If the model with DA is right(er), and that model displays the CPE, then the CPE still demands an explanation, and the presence or absence of the CPE in the wrong model without DA is immaterial.
As such, the presence of the CPE in models with DA remains an important problem, and is likely to be caused by one of the two other explanations discussed in Sec.~\ref{sec:intro} (Introduction): either data curation \citep{aitchison2020cold} or prior misspecification \citep{wenzel2020good,fortuin2021bayesian}.
Indeed, we would tentatively suggest the opposite of \citet{izmailov2021bayesian}: that it is in reality the \textit{lack} of a CPE without DA that is an artifact of using the wrong model (i.e.\ without DA).

Finally, note that the CPE is not always observed, e.g.\ in language classification \citep{izmailov2021bayesian}.
This is absolutely expected as the data-curation explanation of \citet{aitchison2020cold} only implies the CPE in fairly restricted settings; i.e.\ \textit{only} in the case of reasonably accurate approximate posterior inference, such as SGLD, in a BNN where the data has been curated by excluding datapoints with an ambiguous class-label.
Thus, \citet{aitchison2020cold} does not lead us to expect the CPE e.g.\ in latent variable models, in regression settings (where you typically do not curate data), or in hybrid models where we perform Bayesian inference over only a small subset of parameters.



% \subsection*{Acknowledgments}

% VF was supported by the Swiss Data Science Center, the Swiss National Science Foundation, and St.~John's College Cambridge.


%\pagebreak


%\section{Discussion}
%
%
%%Intuitively, we can think of averaging logits as willing to give a confident answer if it finds one augmentation under which the image-class is obvious.
%%Thus,  underlying network can be non-invariant
%%In contrast averaging probabilities is only willing to give a confident answer if the classification is reasonably
%
%Our work has important implications for a potential explanation of the cold posterior effect.
%There are ... potential explanations.
%
%
%Second, ...

%\begin{contributions} % will be removed in pdf for initial submission,
%                      % so you can already fill it to test with the
%                      % ‘accepted’ class option
%    Briefly list author contributions.
%    This is a nice way of making clear who did what and to give proper credit.
%
%    H.~Q.~Bovik conceived the idea and wrote the paper.
%    Coauthor One created the code.
%    Coauthor Two created the figures.
%\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
                        % so you can already fill it to test with the
                        % ‘accepted’ class option
VF was supported by the Swiss Data Science Center, the Swiss National Science Foundation, and St.~John's College Cambridge.
\end{acknowledgements}

% \bibliographystyle{icml2021}
\bibliography{refs}

% needed for supplementary
\nocite{welling2011bayesian, girard2003learning,mchutchon2011gaussian,damianou2016variational}

\end{document}
