\appendix

\section{Prediction Variance and Proof of Proposition 1}
\label{sec:proof}


Before proving the theoretical results in the main paper, we briefly summarize the theoretical findings from related works. 
\cite{futami2021loss} focus on particle variational inference and derive a second-order Jensen inequality (their Theorem 3), which for fixed $x$ reads 
\begin{equation*}
    J(q(\theta)) \ge \E_{q(\theta)}\left( \frac{\ln p(x|\theta) - \E_{q(\theta)} \ln p(x|\theta)}{2 h(x,\theta)} \right)^2
\end{equation*}
with
\begin{align*}
    h(x,\theta)^{-2} &= \exp\Big( \ln p(x|\theta) + \E_{q(\theta)}[\ln p(x|\theta)]%\\ 
    %&\qquad\qquad 
    - 2 \max_\theta \ln p(x|\theta)  \Big)  \enspace.
\end{align*}
The gap is thus lower bounded by a weighted variance of the loss function (in contrast to our result, which focuses on the variance of the predictions), and this bound is utilized as a `repulsion' loss term.
\cite{masegosa_model_misspecification} presents a lower bound on the Jensen gap in terms of the prediction variance (their Theorem 2) which is a special case of the results by \cite{liao2019sharpening}, which for fixed $x$ reads
\begin{equation*}
    J(q(\theta)) \ge \frac{1}{2 \max_{\theta} p(x|\theta)^2} \,\underbrace{\E_\theta[(p(x|\theta) - \E_\theta p(x|\theta))^2]}_{\sigma^2} \enspace.
\end{equation*}
We will make use of the latter inequality in our \Cref{theorem:JensenGap}.
The bounds in \cite{gao2017bounds} relate the Jensen gap to the (centered) moments of a random variable, but are not directly applicable in our setting.

% ----------------- %

Let us now restate \Cref{theorem:JensenGap} and prove it.
\begin{theoremApp}[Bounds on the Jensen Gap]
% \label{theorem:JensenGap}
Consider a parametrized distribution $p: (\mathcal{X} \times \mathcal{Y}) \times \Theta $, a posterior $q(\theta)$ over the parameter space $\Theta$, and input pairs $(x_n,y_n) \in (\mathcal{X} \times \mathcal{Y})$ for $n \in \{1,\dots,N\}$.
Assume that for each $n$, $p(y_n|x_n,\theta)$ satisfies $p(y_n|x_n,\theta) \in [a_n,1]$ for some $a_n > 0$ with 
\begin{itemize}
    \item $\mu_n = \mathbb{E}_\theta [p(y_n|x_n,\theta) ]$ (mean),
    \item $\absdev_{n} = \mathbb{E}_\theta[|p(y_n|x_n,\theta) - \mu_n|]$ (absolute deviation),
    \item $ \sigma_n^2 = \mathbb{E}_\theta[(p(y_n|x_n,\theta)  - \mu_n)^2] $ (variance).
\end{itemize}
Then, the Jensen gap $J(q(\theta))$ between the objectives is bounded by:
\begin{equation*}
\sum_{n=1}^N \max\left\{\frac{\sigma_n^2}{2},\delta_{p,n}\right\} \leq J(q(\theta)) \leq \sum_{n=1}^N \min\left\{\frac{\sigma_n^2}{2 a_n^2} , \frac{\absdev_n }{a_n} \right\}
\enspace ,
\end{equation*}
where for $p>1$ and $n \in \{1,\dots,N\}$
\begin{equation*}
    \delta_{p,n} 
    :=
 \ln \left( \frac{\mathbb{E}_{q(\theta)}[p(y_n| x_n, \theta)]}{\left(\mathbb{E}_{q(\theta)}\left[{p(y_n| x_n, \theta)^\frac{1}{p}}\right]\right)^p} \right) \ge 0 \enspace .
\end{equation*}

\end{theoremApp}

\begin{proof}
    We are interested in deriving upper and lower bounds on the Jensen gap of a random variable $X$ supported on $[a,b]$. 
    Let $X$ be a shorthand for the model prediction $p(y_n|x_n,\theta)$ (in dependence of the posterior $q(\theta)$) for some fixed data point $(x_n,y_n)$.
    Further, the support of $X$ satisfies $a>0$ by assumption and $b=1$ as we are analyzing a classification problem.

    We start with an \emph{upper bound} on the Jensen gap, involving the expected absolute deviation $\absdev$. 
    The Jensen gap itself reads % which reads
    \begin{align}
        J(X) &= \ln(\mathbb{E}[X]) - \mathbb{E}[\ln(X)] \\
        &= \int \ln(\mu) - \ln(x) \, \d P(x)
        \intertext{where $P$ denotes the probability distribution of the random variable $X$. We continue with}
        &\le \int \left|\, \ln(\mu) - \ln(x)  \,\right| \, \d P(x)
        \intertext{and by the Lipschitz-continuity of the logarithm on $[a,b]$ with Lipschitz constant $\frac{1}{a}$, i.e., $\forall x,y \in [a,b] : |\ln(x) - \ln(y)| \le |x-y|/a $, we conclude}
        \label{eq:upperBound}
        &\le \frac{1}{a} \int \left| \mu - x  \right| \, \d P(x) = \frac{1}{a} \absdev
    \end{align}
    in which $\absdev$ denotes the first absolute centered moment of $X$.

    We now turn to the \emph{lower bound} based on the $p$-compressed expectation spread $\delta_p$. 
    This bound is inspired by the self-improvement version of the AM-GM inequality, see,  e.g., \cite{AMGM_2009}.
    We consider the random variable $X^\frac{1}{p}$ for some $p>1$ (a `compressed' version of $X$) and start with the classical Jensen inequality
    \begin{align}
        \mathbb{E}_{q(\theta)} \left[\ln\left({X^\frac{1}{p} }\right) \right] &\leq \ln\left( \mathbb{E}_{q(\theta)}\left[X^\frac{1}{p} \right]\right) 
    \end{align}
    which implies
    \begin{align}
        \mathbb{E}_{q(\theta)} \left[\ln\left(X \right) \right] & \leq p \ln\left( \mathbb{E}_{q(\theta)}\left[X^\frac{1}{p}\right]\right) \\
         &= \ln \left( \left( \mathbb{E}_{q(\theta)}\left[X^\frac{1}{p} \right]\right)^p \right) \\
        &=
        \ln \left( \left( \mathbb{E}_{q(\theta)}\left[X^\frac{1}{p} \right]\right)^p \right)  %\notag
        + \ln \left( \mathbb{E}_{q(\theta)}[X] \right)- \ln \left(\mathbb{E}_{q(\theta)}[X] \right) \\
        &= \ln \left( \mathbb{E}_{q(\theta)}[X]\right)         -   \ln \left( \frac{\mathbb{E}_{q(\theta)}[X]}{\left(\mathbb{E}_{q(\theta)}\left[X^\frac{1}{p}\right]\right)^p} \right) \enspace .
    \end{align}
    Rearranging and invoking the definition of $X$ gives
    \begin{align}
            \underbrace{\ln \left(\mathbb{E}_{q(\theta)}[p(y_n| x_n, \theta)] \right)}_{\text{from } \ML} - \underbrace{\mathbb{E}_{q(\theta)}\left[\ln \left( p(y_n| x_n, \theta) \right)\right]}_{\text{from } \VI} 
        \geq \underbrace{\ln \left( \frac{\mathbb{E}_{q(\theta)}[p(y_n| x_n, \theta)]}{\left(\mathbb{E}_{q(\theta)}\left[p(y_n| x_n, \theta)^\frac{1}{p}\right]\right)^p} \right)}_{\eqqcolon \; \delta_{p,n}} \enspace .
        \label{eq:lowerBoundDelta}
    \end{align}

    The Jensen inequality guarantees the non-negativity of $\delta_{p,n}$ (by concavity of the $p$th root).

    The \emph{variance-based bounds} follow from a slight modification of the argumentation in \cite{masegosa_model_misspecification} (see statement above), which in turn is just a special case of the results by \cite{liao2019sharpening}. We include the proof for completeness: We utilize the Taylor series representation of the logarithm up to the second degree about $\mu = \E[X]$ with the Lagrange form of the remainder, which reads
    \begin{align}
        \ln(X) = \ln(\mu) + \frac{1}{\mu} (X - \mu) - \frac{1}{2 \xi^2} (X-\mu)^2
    \end{align}
    for some $\xi$ between $X$ and $\mu$. Taking the expectation
    \begin{align*}
        \E\ln(X) = \underbrace{\E\ln(\mu)}_{=\ln(\mu)} + \frac{1}{\mu} \underbrace{(\E X - \mu)}_{=0}
        - \E\bigg[\frac{1}{2 \xi^2}  (X-\mu)^2\bigg] %\enspace ,
    \end{align*}   %\vspace{-6mm}
    and noting that $a \le \xi \le b$ implies
    \begin{align}
        \E\bigg[\frac{1}{2 \xi^2}  (X-\mu)^2\bigg] &\le \E\bigg[\frac{1}{2 \min \xi^2}  (X-\mu)^2\bigg] = \frac{1}{2a^2}\sigma^2 \\
        \E\bigg[\frac{1}{2 \xi^2}  (X-\mu)^2\bigg] &\ge \E\bigg[\frac{1}{2 \max \xi^2}  (X-\mu)^2\bigg] = \frac{1}{2b^2}\sigma^2
    \end{align} %\vspace{-6mm}
    directly gives 
    \begin{align}
        J(X) &= \ln(\mu) - \E\ln(X) = \E\bigg[\frac{1}{2 \xi^2}  (X-\mu)^2\bigg] \\
        &\implies J(X) \in \left[\frac{1}{2b^2}\sigma^2, \frac{1}{2a^2}\sigma^2\right] \enspace . \label{eq:lowerBoundVar}
    \end{align}


All presented bounds hold for any $X_n = p(y_n| x_n, \theta)$ on $[a_n,1]$, with mean $\mu_n$, first absolute centered moment $\absdev_n$, variance $\sigma^2_n$ and $p$-compressed expectation spread $\delta_{p,n}$.
We can thus combine \Cref{eq:upperBound,eq:lowerBoundVar,eq:lowerBoundDelta} such that the Jensen gap satisfies
\begin{align}
\sum_{n=1}^N \max\left\{\frac{ \sigma_n^2}{2},\delta_{p,n}\right\} \leq J(q(\theta)) \leq \sum_{n=1}^N \min\left\{\frac{\sigma_n^2}{2 a_n^2} , \frac{\absdev_n }{a_n} \right\} ,
\end{align}
which concludes the proof.
\end{proof}


\begin{figure*}[bht]
    \centering
    \includegraphics[width=0.975\linewidth]{figs/jensen_gap_bounds.pdf}

    \caption{\textbf{The Jensen gap and a comparison of the presented bounds for various distributions}. 
    Depicted are the Jensen gap for the logarithm $J(X) = \ln(\E[X]) - \E\ln(X)$ and upper and lower bounds from \Cref{theorem:JensenGap} for $X$ following the specified distribution on $[a,b]$ (we simply take $a$ and $b$ to be the sample minimum/maximum; note that we include cases with $b \neq 1$).
    Depicted are the variance-based bounds from the Taylor expansion $ \frac{1}{2b^2}\sigma^2 \le J(X) \le \frac{1}{2a^2}\sigma^2$, the upper bound $J(X) \le \frac{1}{a}\absdev$ and the $p$-compressed expectation spread $\delta_p \le J(X)$ for $p \in \{2,5,10\}$.
    Results are based on 100 samples per distribution.}
    \label{fig:JensenGapComparison}
\end{figure*}


Notably, the function $\delta_p\big(\theta; (y_n, x_n)\big)$ quantifies the variations of the random variable $p(y_n| x_n, \theta)$ by comparing the expectation of the `compressed' random variable---by taking the ${p}$th root which pulls everything towards one---to the uncompressed expectation.
We therefore refer to this quantity as the \textit{p-compressed expectation spread}.
When the predictions are almost constant (little variation), we have $\mathbb{E}_{q(\theta)}[p(y_n| x_n, \theta)]^\frac{1}{p} \approx \mathbb{E}_{q(\theta)}[p(y_n| x_n, \theta)^\frac{1}{p}]$, such that $\delta_p \approx 0$.
For $p=2$ the bound relates to the variance (of $\sqrt{p(y_n| x_n, \theta)}$), and it usually becomes tighter with growing $p$.

A simple comparison of the different bounds is presented in \Cref{fig:JensenGapComparison}.
We see that the variance-based bounds following \cite{masegosa_model_misspecification,liao2019sharpening} become tight for very low variances (almost constant random variables), while the $p$-compressed expectation spread yields tighter lower bounds for higher variances and heavy-tailed distributions. 
In such settings, the bound based on the absolute deviation is usually tighter than the variance-based bound.
Thus, we decided to include both lower bounds in \Cref{theorem:JensenGap} to cater to both extremes.

To summarize the key point from \Cref{theorem:JensenGap}: The Jensen gap between the objectives of interest grows (and vanishes) with the variability in the predictions. In the limit, i.e., if $\forall n: \Var_{q(\theta)}[{p(y_n| x_n, \theta)}] = 0$, equality between $\ML$ and $\VI$ is reached again.


\section{Training details}
\label{app:training_details}
\paragraph{Training hyperparameters}
All models were trained with AdamW \citep{adamw} with an initial learning rate of 0.001. Note that for ResNet20 on CIFAR10 we used the example code by~\citet{krishnan2022bayesiantorch} without modifications to the hyperparameter settings. Note that in this implementation, the KL divergence of each layer is given as the mean KL divergence over the parameters in that layer and is hence down-weighted in comparison to the other models we used. All training runs were executed on NVIDIA A40 GPUs (a single A40 is sufficient for each single experiment). 


\begin{table}[ht]
    \centering
    \begin{tabular}{lrrrr}
\toprule
        Dataset &  Resolution & \#Classes & \#Train &\#Test \\ \midrule %\headrule
        MNIST &         28x28 & 10 & 60k & 10k \\
        FashionMNIST  & 28x28 & 10  & 60k & 10k \\
        CIFAR10 &       32x32& 10  & 50k & 10k \\
        PathMNIST &   224x224 & 9 & 89,996 & 7,180 \\
        DermaMNIST &  224x224 & 7 & 7,007 & 2,005 \\
        \bottomrule
    \end{tabular}
    \caption{\textbf{Datasets} used in this work.}
    \label{tab:datasets}
\end{table}

We used different numbers of epochs for the models trained on different datasets, as we observed that, although accuracy might already be at its highest value, the balance between the expectation term and the KL divergence is reached at a much later point in training. Therefore, we fixed the number of epochs to an amount where we saw the loss stagnating. That is, % for MVN after 50 epochs, 
on MNIST and FashionMNIST after 70 epochs for the FF architecture and 100 epochs for the FF-MVN architecture, for CIFAR10 after 100 epochs (both ResNet20 and FF),
% 100 epocchs, 
for PathMNIST after 120 epochs and for models trained on DermaMNIST after 150 epochs. Note that DermaMNIST is by far the smallest dataset, which explains the increased number of epochs (cf. \Cref{tab:datasets}).


\begin{figure}[!bht]
    \centering
     \def\colwidth{0.24} % 0.45 
        \begin{subfigure}{\colwidth\columnwidth}
            \includegraphics[width=\columnwidth]{figs/mnist_loss.pdf}
            \caption{Loss values}
        \end{subfigure}
        \begin{subfigure}{\colwidth\columnwidth}
            \includegraphics[width=\columnwidth]{figs/mnist_kld.pdf}
            \caption{KL term in loss}
        \end{subfigure}
        \begin{subfigure}{0.26\columnwidth}
            \includegraphics[width=\linewidth]{figs/logE.pdf}
            \caption{$\ln\E$}
        \end{subfigure}
      \begin{subfigure}{\colwidth\columnwidth}
        \includegraphics[width=\linewidth]{figs/Elog.pdf}
        \caption{$\E\ln$}
        \end{subfigure}
    \caption{\textbf{Value of the respective loss functions, KL divergence, $\ln\E$ and $\E\ln$ during training} (on MNIST). The same characteristics were observed for other models and datasets.}
    \label{fig:trainstats} 
\end{figure}

\paragraph{Comparing training metrics}
This paragraph presents training metrics for the simple feedforward architecture FF trained on MNIST~\citep{deng2012mnist}. Even though this is an easy task, we found the training behavior of this simple approach to be exemplary for all other models and datasets used in this paper. 


First, it is noted that training for 70 epochs resulted in a test set accuracy of roughly 0.98 for each objective type, such that all models perform reasonably well.
In \Cref{fig:trainstats} we see (a) the respective losses of the models throughout training and (b) the respective KL divergence between the learned weight distribution and the prior. The loss is grossly dominated by the Kullback-Leibler divergence, which is more strongly minimized for the model trained with $\VI$.


To investigate further differences during training, we tracked $\ln \E$ and $\E \ln$ for each model during training, see \Cref{fig:trainstats}(c) and (d). While the results for $\ln \E$, which $\ML$ uses in its loss objective, are comparable for all three training objectives, the results for $\E \ln$ deviate significantly. This indicates that the gap described in \Cref{theorem:JensenGap} is significantly increased for $\ML$, which is indicative of higher diversity in predictions.
The baseline and models trained with $\VI$ show a similar training behavior with respect to these metrics.



\begin{wrapfigure}{r}{0.4\textwidth}
% \begin{figure}[!hbt]
    \centering
    \vspace{-5mm}
    \includegraphics[width=1.0\linewidth]{figs/means_pathmnist.pdf}
    \caption{\textbf{Weights' mean values of the learned $q(\theta)$} for the DINOTopping model on PathMNIST.}
    \label{fig:mean} 
    \vspace{-1.5cm}
% \end{figure}
\end{wrapfigure}


\paragraph{Mean of the learned weight distributions} 
As mentioned in the main part of the paper, the distributions of the weights' mean values do not differ notably between the objectives of interest, as can be seen in \Cref{fig:mean}.


\section{Performance with other hyperparameters}
%


To ensure that the finding of increased prediction variance is not merely an artifact of our choice of prior or KL weighting, we experimented with
i) different values of $\lambda$ and ii) a change of the prior $p(\theta)$ to $\mathcal{N}(0, 3)$, and tested how these changes impact the accuracy and prediction variance of the models.

In \Cref{tab:accuracy_priors} we show the results for models trained on MNIST with MVN and observe that the reduction of $\lambda$ leads to higher accuracy (in line with findings regarding the ``cold posterior'' effect, see, e.g., the work by \cite{wenzel2020howgoodposterior}) while reducing the average per-sample variance compared to the standard setting. Since $\lambda$ scales the KL divergence, which hinders $q(\theta)$ from collapsing, the reduced prediction variance is expected.
Interestingly though, a higher prior variance does not translate to a notably higher per-sample prediction variance but decreases the test set accuracy.
We find this trend also to be true on the other data sets.

\begin{table}[!hbt]
\centering
\begin{tabular}{l ccc ccc}
\toprule
 & \multicolumn{2}{c}{Standard setting} & \multicolumn{2}{c}{ $\lambda=0.1$}& \multicolumn{2}{c}{$\sigma^2=3$}\\
& Accuracy & Avg var & Accuracy & Avg var & Accuracy & Avg var \\ \hline 
$\VI$  &    97.21 $\pm$ 0.11   & 0.02  &  98.39 $\pm$ 0.06 &  0.01 & 96.91 $\pm$ 0.12 &  0.02  \\
baseline &   97.01 $\pm$ 0.12    &  0.02 & 98.22 $\pm$ 0.06  &  0.01 & 96.67 $\pm$ 0.16 &  0.02 \\
$\ML$    &   97.21 $\pm$ 0.10    &  0.06 & 98.25 $\pm$ 0.11  &  0.02 & 96.83 $\pm$ 0.11 &  0.07 \\ 
                          \bottomrule \\
\end{tabular}%
\caption{\textbf{Influence of regularization strength $\lambda$ and prior variance $\sigma^2$} on test set accuracy (in \%) and average prediction variance (MNIST). 
We here report mean and standard deviation over 10 different seeds, trained with Adam.}
\label{tab:accuracy_priors}
\end{table}

\clearpage


\section{Illustration of Ensembles with low and high function space diversity}

\label{sec:Illustration_EnsembleVariance}
Here, we briefly recapitulate two prototypical scenarios for the behavior of ensemble members, as illustrated in \Cref{fig:schema}.
In the left column, the models predict classes in the same ordering,
in particular their predictions $\arg \max_c p(y_c|x, \theta_i)$ agree.
In contrast, 
% while 
models' predictions in the right column disagree.
This is possible even if \textit{ensemble predictions} are identical (top row) or the \textit{prediction variances} are identical (bottom row) in both scenarios.
The latter motivates our analysis of the function space diversity in \Cref{sec:ImplicationsTraining} (to answer the question whether models are just uncertain on some samples but still largely agree in prediction, or whether the models predict different classes).


\begin{figure}[!ht]
    \centering
        \includegraphics[width=0.75\linewidth]{figs/Schema_ensemble2.pdf}
        
        \includegraphics[width=0.75\linewidth]{figs/Schema_ensembleVariances.pdf}

    \caption{\textbf{Two scenarios depicting similar (left) and dissimilar behavior (right) across ensemble members} (model predictions indicated by small arrows on top).
    \textit{Top row:} Ensemble predictions are identical (grey lines show $\frac{1}{3}\sum_{i=1}^3 p(y|x,\theta_i)$). \textit{Bottom row:} Prediction variances of the ensemble are identical (in parentheses below).
    }
    \label{fig:schema} 
\end{figure}


\clearpage



\section{Examples of image corruptions}
In Figure~\ref{fig:corrupted_images} we show exemplary corruptions of the images which compose the OOD data sets used in~\Cref{subsec:OOD} of the main paper.

\begin{figure}[!htb]
\centering
\resizebox{0.56\linewidth}{!}{
\setlength{\tabcolsep}{2pt}
\begin{tabular}{ %p{2cm} b{2cm} b{2cm} b{2cm} b{2cm} b{2cm} b{2cm} b{2cm}}%
 l c c c c c c c c}
%\setlength{\tabcolsep}{0.05em}
\toprule
Perturbation &   \multicolumn{6}{c}{Severity} \\
\hline
& Benign & Level 1 & Level 2 & Level 3 & Level 4 & Level 5 \\
\midrule
\raisebox{0.7cm}{Gaussian noise} &
\includegraphics[width=0.12\columnwidth]{figs/corrupted/gaussian_noise_0_normal_1.png}  &
\includegraphics[width=0.12\columnwidth]{figs/corrupted/gaussian_noise_0_corrupted_1.png} & 
\includegraphics[width=0.12\columnwidth]{figs/corrupted/gaussian_noise_1_corrupted_1.png} & 
\includegraphics[width=0.12\columnwidth]{figs/corrupted/gaussian_noise_2_corrupted_1.png}  & 
\includegraphics[width=0.12\columnwidth]{figs/corrupted/gaussian_noise_3_corrupted_1.png} &
\includegraphics[width=0.12\columnwidth]{figs/corrupted/gaussian_noise_4_corrupted_1.png} \\
\raisebox{0.7cm}{Shot noise} &
\includegraphics[width=0.12\columnwidth]{figs/corrupted/gaussian_noise_0_normal_2.png}  &
\includegraphics[width=0.12\columnwidth]{figs/corrupted/shot_noise_0_corrupted_2.png} &
\includegraphics[width=0.12\columnwidth]{figs/corrupted/shot_noise_1_corrupted_2.png} & 
\includegraphics[width=0.12\columnwidth]{figs/corrupted/shot_noise_2_corrupted_2.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/shot_noise_3_corrupted_2.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/shot_noise_4_corrupted_2.png} \\
\raisebox{0.7cm}{Impulse noise} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_3.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/impulse_noise_0_corrupted_3.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/impulse_noise_1_corrupted_3.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/impulse_noise_2_corrupted_3.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/impulse_noise_3_corrupted_3.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/impulse_noise_4_corrupted_3.png} \\
\raisebox{0.7cm}{Defocus blur} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_4.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/defocus_blur_0_corrupted_4.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/defocus_blur_1_corrupted_4.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/defocus_blur_2_corrupted_4.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/defocus_blur_3_corrupted_4.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/defocus_blur_4_corrupted_4.png} \\
\raisebox{0.7cm}{Glass blur} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_5.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/glass_blur_0_corrupted_5.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/glass_blur_1_corrupted_5.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/glass_blur_2_corrupted_5.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/glass_blur_3_corrupted_5.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/glass_blur_4_corrupted_5.png} \\
\raisebox{0.7cm}{Motion blur} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_6.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/motion_blur_0_corrupted_6.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/motion_blur_1_corrupted_6.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/motion_blur_2_corrupted_6.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/motion_blur_3_corrupted_6.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/motion_blur_4_corrupted_6.png} \\
\raisebox{0.7cm}{Zoom blur} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_7.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/zoom_blur_0_corrupted_7.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/zoom_blur_1_corrupted_7.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/zoom_blur_2_corrupted_7.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/zoom_blur_3_corrupted_7.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/zoom_blur_4_corrupted_7.png} \\
\raisebox{0.7cm}{Snow} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_8.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/snow_0_corrupted_8.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/snow_1_corrupted_8.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/snow_2_corrupted_8.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/snow_3_corrupted_8.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/snow_4_corrupted_8.png} \\
\raisebox{0.7cm}{Frost} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_9.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/frost_0_corrupted_9.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/frost_1_corrupted_9.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/frost_2_corrupted_9.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/frost_3_corrupted_9.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/frost_4_corrupted_9.png} \\
\raisebox{0.7cm}{Fog} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_10.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/fog_0_corrupted_10.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/fog_1_corrupted_10.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/fog_2_corrupted_10.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/fog_3_corrupted_10.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/fog_4_corrupted_10.png} \\
\raisebox{0.7cm}{Brightness} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_11.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/brightness_0_corrupted_11.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/brightness_1_corrupted_11.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/brightness_2_corrupted_11.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/brightness_3_corrupted_11.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/brightness_4_corrupted_11.png} \\
\raisebox{0.7cm}{Contrast} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_12.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/contrast_0_corrupted_12.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/contrast_1_corrupted_12.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/contrast_2_corrupted_12.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/contrast_3_corrupted_12.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/contrast_4_corrupted_12.png} \\
\raisebox{0.7cm}{Elastic transform} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_13.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/elastic_transform_0_corrupted_13.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/elastic_transform_1_corrupted_13.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/elastic_transform_2_corrupted_13.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/elastic_transform_3_corrupted_13.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/elastic_transform_4_corrupted_13.png} \\
\raisebox{0.7cm}{Pixelate} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_14.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/pixelate_0_corrupted_14.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/pixelate_1_corrupted_14.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/pixelate_2_corrupted_14.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/pixelate_3_corrupted_14.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/pixelate_4_corrupted_14.png} \\
\raisebox{0.7cm}{JPEG compression} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/gaussian_noise_0_normal_15.png}  &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/jpeg_compression_0_corrupted_15.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/jpeg_compression_1_corrupted_15.png} & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/jpeg_compression_2_corrupted_15.png}  & 
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/jpeg_compression_3_corrupted_15.png} &
\includegraphics[width=0.12\columnwidth ]{figs/corrupted/jpeg_compression_4_corrupted_15.png} \\
\end{tabular}
}
\caption{\textbf{Example images of CIFAR10 when applying different corruptions with increasing corruption strength} based on~\cite{michaelis2019dragon}'s repository.}
\label{fig:corrupted_images}
\end{figure}

