%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{amsfonts}
% \usepackage[british]{babel}
\usepackage{longtable}
%% Custom commands
% Zero-argument shorthands are declared with \newcommand*, and every bold
% symbol is routed through the \bs alias for \boldsymbol.

% Bold Latin symbols and the bold zero
\newcommand*{\bs}{\boldsymbol}
\newcommand*{\x}{\bs{x}}
\newcommand*{\z}{\bs{z}}
\newcommand*{\m}{\bs{m}}
\newcommand*{\p}{\bs{p}}
\newcommand*{\h}{\bs{h}}
\newcommand*{\W}{\bs{W}}
\newcommand*{\0}{\bs{0}}
\newcommand*{\xbtilde}{\tilde{\bs{x}}}

% Bold Greek symbols
\newcommand*{\mub}{\bs{\mu}}
\newcommand*{\xib}{\bs{\xi}}
\newcommand*{\sigmab}{\bs{\sigma}}
\newcommand*{\Sigmab}{\bs{\Sigma}}
\newcommand*{\thetab}{\bs{\theta}}
\newcommand*{\thetam}{\thetab_m}
\newcommand*{\phib}{\bs{\phi}}
\newcommand*{\psib}{\bs{\psi}}
\newcommand*{\Psib}{\bs{\Psi}}
\newcommand*{\etab}{\bs{\eta}}

% Operators, delimiters and small notation helpers
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand*{\Var}{\mathrm{Var}}
\newcommand*{\diff}{\mathrm{d}}
\newcommand*{\given}{\,|\,}

% Distribution-name shorthands
% (presumably normal-inverse-Wishart and normal-inverse-gamma)
\newcommand*{\NIW}{\text{N-}\mathcal{W}^{-1}}
\newcommand*{\NIG}{\text{N-}\Gamma^{-1}}

% Densities of the Bayesian model; the *m variants additionally condition
% on the model index \mathcal{M}
\newcommand*{\prior}{p(\thetab)}
\newcommand*{\priorm}{p(\thetab \given \mathcal{M})}
\newcommand*{\lik}{p(\x \given \thetab)}
\newcommand*{\likm}{p(\x \given \thetab, \mathcal{M})}
\newcommand*{\post}{p(\thetab \given \x)}
\newcommand*{\postm}{p(\thetab \given \x, \mathcal{M})}
\newcommand*{\joint}{p(\thetab, \x)}
\newcommand*{\jointm}{p(\thetab, \x \given \mathcal{M})}

% Noise densities and the simulator term G(theta, xi)
\newcommand*{\noise}{p(\xib)}
\newcommand*{\noised}{p(\xib \given \thetab)}
\newcommand*{\model}{G(\thetab, \xib)}

% Neural approximators and their parameter triples
\newcommand*{\triple}{(\phib,\etab,\psib)}
\newcommand*{\tripleopt}{(\phib^*,\etab^*,\psib^*)}
\newcommand*{\lika}{l_{\etab}(\x \given \thetab)}
\newcommand*{\posta}{p_{\phib}(\thetab \given \x)}
\newcommand*{\postah}{p_{\phib}(\thetab \given \mathcal{H}_{\psib}(\x))}

\usepackage{placeins}
\usepackage{subcaption}
\usepackage{float}
\usepackage{forest}
\usepackage{adjustbox}

\usepackage{algorithm}
\usepackage{algorithmic}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}

\usepackage[backend=biber,
style=apa,
maxcitenames=2,
minbibnames=2,
maxbibnames=2, 
language=english,
doi=false,
isbn=false,
url=false,
uniquename=false
]{biblatex} 
\DeclareLanguageMapping{english}{english-apa} % mapping for apa
\addbibresource{radev_354.bib} % bib file

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-main}
\usepackage{xcolor}
\definecolor{darkblue}{RGB}{0,0,128}
\hypersetup{
    colorlinks=true,
	linkcolor=darkblue,
	filecolor=darkblue,
	urlcolor=darkblue,
	citecolor=darkblue,
	pdfauthor={},
	pdftitle={}
}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\title{JANA: Jointly Amortized Neural Approximation of Complex\\Bayesian Models\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\vspace{-2.5em}Stefan T.~Radev}
\author[2]{Marvin Schmitt}
\author[3]{Valentin Pratz}
\author[4]{Umberto Picchini}
\author[3]{\authorcr Ullrich Köthe$^*$}
\author[2]{Paul-Christian Bürkner$^*$}
% Add affiliations after the authors
\affil[1]{%
Cluster of Excellence STRUCTURES\\
Heidelberg University
}
\affil[2]{%
Cluster of Excellence SimTech\\
University of Stuttgart
}
\affil[3]{%
Visual Learning Lab\\
Heidelberg University
}
\affil[4]{%
Department of Mathematical Sciences\\
Chalmers University of Technology \& University of Gothenburg
}

\DeclareMathOperator*{\argmin}{arg\,min}

% \twomoonsposterior{<caption>}{<image path>}
%   #1  sub-caption text of the panel
%   #2  path of the graphic, scaled to the full panel width
% Typesets one top-aligned subfigure that is 0.40\linewidth wide.
% The % after the opening brace and after \end{subfigure} stops those line
% ends from becoming inter-word spaces, so the macro itself adds no extra
% horizontal space next to the explicit \hspace between panels.
\newcommand{\twomoonsposterior}[2]{%
\begin{subfigure}[t]{0.40\linewidth}
    \includegraphics[width=\linewidth]{#2}
    \caption{#1}
\end{subfigure}%
}


% \covidscenario{<caption>}{<image path>}
%   #1  sub-caption text of the panel
%   #2  path of the graphic, scaled to the full panel width and framed
% Typesets one top-aligned subfigure that is 0.40\linewidth wide, with a
% \frame drawn around the image.
% The % after the opening brace and after \end{subfigure} stops those line
% ends from becoming spurious inter-word spaces around the panel.
\newcommand{\covidscenario}[2]{%
\begin{subfigure}[t]{0.40\linewidth}
    \frame{\includegraphics[width=\linewidth]{#2}}
    \caption{#1}
\end{subfigure}%
}


% \appendixbenchmark{<number>}{<file stem>}{<display name>}
%   #1  benchmark number, printed in the caption as "Benchmark #1"
%   #2  file stem: selects plots/benchmarks/appendix/#2_*.pdf and forms the
%       label fig:app:benchmark:#2
%   #3  benchmark name printed in the caption
% Emits one figure float with three stacked panels: the loss history at
% 0.7\linewidth, then the posterior and joint calibration plots, each in a
% full-width subfigure with the graphic included at scale 0.25.
\newcommand{\appendixbenchmark}[3]{
    \begin{figure}
        \centering
        \begin{subfigure}[t]{0.7\linewidth}
            \centering
            \includegraphics[width=1.0\linewidth]{plots/benchmarks/appendix/#2_losses.pdf}
            \caption{Training and validation loss history.}
        \end{subfigure}
        \begin{subfigure}[t]{1.0\linewidth}
            \centering
            \includegraphics[scale=0.25]{plots/benchmarks/appendix/#2_posterior_calibration_diff_separate.pdf}
            \caption{Posterior calibration.}
        \end{subfigure}
        \begin{subfigure}[t]{1.0\linewidth}
            \centering
            \includegraphics[scale=0.25]{plots/benchmarks/appendix/#2_joint_calibration_diff_separate.pdf}
            \caption{Joint calibration.}
        \end{subfigure}
        \caption{\textbf{Benchmark #1, #3.} Loss history, posterior calibration, and joint calibration.}
        \label{fig:app:benchmark:#2}
    \end{figure}
}

% \appendixbenchmarkcompressed{<number>}{<file stem>}{<display name>}
%   #1  benchmark number, printed in the caption as "Benchmark #1"
%   #2  file stem: selects plots/benchmarks/appendix/#2_*.pdf and forms the
%       label fig:app:benchmark:#2
%   #3  benchmark name printed in the caption
% Same arguments, caption and label scheme as \appendixbenchmark, but with a
% more compact layout: the loss history sits on its own row, followed after
% 0.5cm by the posterior and joint calibration panels side by side, each
% 0.40\linewidth wide and separated by \hspace{1cm}.
\newcommand{\appendixbenchmarkcompressed}[3]{
    \begin{figure}
        \centering
        \begin{subfigure}[t]{0.7\linewidth}
            \centering
            \includegraphics[width=1.0\linewidth]{plots/benchmarks/appendix/#2_losses.pdf}
            \caption{Training and validation loss history.}
        \end{subfigure}\\
        \vspace{0.5cm}
        \begin{subfigure}[t]{0.40\linewidth}
            \includegraphics[width=\linewidth]{plots/benchmarks/appendix/#2_posterior_calibration_diff_separate.pdf}
            \caption{Posterior calibration.}
        \end{subfigure}
        \hspace{1cm}
        \begin{subfigure}[t]{0.40\linewidth}
            \includegraphics[width=\linewidth]{plots/benchmarks/appendix/#2_joint_calibration_diff_separate.pdf}
            \caption{Joint calibration.}
        \end{subfigure}
        \caption{\textbf{Benchmark #1, #3.} Loss history, posterior calibration, and joint calibration.}
        \label{fig:app:benchmark:#2}
    \end{figure}
}

\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix

\section{Frequently Asked Questions (FAQ)}

\textbf{Q: How can I reproduce the results?}\\[3pt]
Code to reproduce all results is available in the repository at \url{https://github.com/bayesflow-org/JANA-Paper}.

\vspace*{6pt}\textbf{Q: How can I apply JANA to my own Bayesian models?}\\[3pt]
Simulation-based algorithms for jointly amortized inference are implemented in the \texttt{BayesFlow} library. 
Take a look at the code and tutorials, available at:
\url{https://github.com/stefanradev93/BayesFlow}.


\vspace*{6pt}\textbf{Q: When should I use amortized inference instead of sequential methods?}\\[3pt]
Whenever you want to follow a principled Bayesian workflow and you have lots of data sets on which a Bayesian model needs to be applied independently.

\vspace*{6pt}\textbf{Q: Does amortization come at the cost of wasteful simulations?}\\[3pt]
Some previous papers assume that this is generally the case. 
On the contrary, we believe that wasteful simulations are primarily the consequence of poorly chosen priors, whereas modern neural networks actually profit from broader simulation scopes, as long as the priors are informative. Moreover, amortization makes a principled Bayesian workflow much easier than case-based inference.
Still, specifying sensible joint priors is not always easy.

\vspace*{6pt}\textbf{Q: Can you somehow combine the three networks and utilize weight sharing?}\\[3pt]
Finding a suitable weight sharing approach which is applicable to various model structures---such as exchangeable or Markovian---proves challenging.
Since JANA is an attempt at a universal method, we refrain from customizing the overall architecture to suit a particular model structure.
Instead of weight sharing, we exploit the \textit{probabilistic symmetries} of joint Bayesian learning, which are universal across all model structures (see Figure 1 of the main paper).
Although it remains a possible area for further investigation, we are uncertain whether weight sharing in our context is even desirable.

\vspace*{6pt}\textbf{Q: Can I use a different type of generative network for the posterior or likelihood networks?}\\[3pt]
JANA can operate with arbitrary conditional density approximators. 
However, it is important that these approximators are able to efficiently compute \textit{normalized densities} for the purpose of marginal likelihood and posterior predictive estimation.

\vspace*{6pt}\textbf{Q: Why do you need a summary network?}\\[3pt]
Because most real world data comes in various sizes and shapes. 
Thus, we need an interface between the Bayesian model and the posterior network which renders the latter applicable to various sizes and shapes.

\vspace*{6pt}\textbf{Q: Can you also use a summary network for the likelihood network?}\\[3pt]
It is possible and can be helpful if the parameter space of the reference Bayesian model requires some form of compression.
Indeed, in the second iteration of the paper, we included a Bayesian denoising experiment (\textbf{Experiment 5}) which equips the surrogate likelihood with a convolutional summary network.

\vspace*{6pt}\textbf{Q: Is it necessary to have normalized likelihood estimates or would a standard feedforward neural network suffice?}\\[3pt]
A normalized likelihood is necessary to estimate the expected log predictive density (ELPD) for approximating out-of-sample predictive performance via cross validation or log marginal likelihoods (LMLs) for approximating Bayes factors. Normalization is also needed to compare likelihoods obtained from different models (which might otherwise report unnormalized likelihoods at different scales). If none of these (log) likelihood metrics is needed for a particular analysis, normalization of the likelihood network is not strictly required.

\FloatBarrier
\clearpage
\section{Code}

The code and instructions for running and reproducing all experiments are available at the project's repository \url{https://github.com/bayesflow-org/JANA-Paper}.
We use fixed seeds for the random number generators of test (held-out) sets. 
Training uses no seeds, as we believe the methods to be stable enough to converge on any run.

\FloatBarrier
\clearpage
\section{Method Details}
\subsection{Pseudocode}

\begin{algorithm}[H]
\caption{Jointly amortized neural approximation: offline training using a pre-simulated training set}
\label{alg:jana}
\begin{algorithmic}[1]
\REQUIRE{Bayesian model $\joint$; summary network $\mathcal{H}_{\psib}$; posterior network $\mathcal{P}_{\phib}$; likelihood network $\mathcal{L}_{\etab}$; number of simulations $N$ (budget); batch size $B$}
\STATE{Initialize $\mathcal{D} = \{\}$.}
\FOR{$n=1,\ldots,N$}
	\STATE{Sample from prior: $\thetab_n \sim \prior$}
	\STATE{Sample from (implicit) likelihood: $\x_n \sim p(\x \given \thetab_n)$}
	\STATE{Add simulations to training data: $\mathcal{D} := \mathcal{D} \cup \{(\thetab_n, \x_n)\}$ \hfill}
\ENDFOR
\WHILE{not converged}
	\STATE{Sample batch from training data: $\{(\thetab_b, \x_b)\}_{b=1}^B \sim \mathcal{D}$}
    \STATE{Compute Monte Carlo estimate of loss function over batch (Eq.~12).}
    \STATE{Update neural network parameters $(\psib, \phib, \etab)$ via backpropagation.}
\ENDWHILE
\RETURN{trained networks $\lika, \postah, \mathcal{H}_{\psib}$}
\end{algorithmic}
\end{algorithm}

\subsection{Likelihood Networks for Exchangeable Data}

Exchangeable models generate IID data, that is, each run $\model$ with a fixed configuration $\thetab$ is independent of all other runs. 
Thus, for $N$ runs of such a (memoryless or stateless) model, the likelihood decomposes into the product of point-wise likelihoods:
\begin{equation}
    \lik = \prod_{n=1}^N p(\x_n \given \thetab)
\end{equation}

Accordingly, we can represent such data as unordered sets and simply apply the likelihood network exchangeably by concatenating each $\x_n$ with $\thetab$ in each coupling layer.
The forward pass for a single conditional affine coupling layer \autocite{ardizzone2019guided, radev2020bayesflow} of an exchangeable likelihood network is given by:
\begin{align} 
\z_n^{\mathcal{A}} &= \x_n^{\mathcal{A}} \odot \exp(S_1(\x_n^{\mathcal{B}}; \thetab)) + T_1(\x_n^{\mathcal{B}}; \thetab)\nonumber  \\ 
\z_n^{\mathcal{B}} &= \x_n^{\mathcal{B}} \odot \exp(S_2(\z_n^{\mathcal{A}}; \thetab)) + T_2(\z_n^{\mathcal{A}}; \thetab)\nonumber,
\end{align}
where $\x_n = (\x_n^{\mathcal{A}}, \x_n^{\mathcal{B}})$ is a disjoint partition of the input data at position $n$, $\z_n = (\z_n^{\mathcal{A}}, \z_n^{\mathcal{B}})$ is the  corresponding latent partition, and the functions $S_1$, $S_2$, $T_1$, $T_2$ are implemented as multi-headed fully connected (FC) neural networks (with trainable parameters suppressed for clarity). 
The forward pass for neural spline flows \autocite{durkan2019neural} is modified accordingly, such that the spline parameters are generated exchangeably, conditioned on the parameter vector $\thetab$.

\subsection{Likelihood Networks for Markovian Data}

The widely used family of Markovian models factorizes in a way that the probability of each data point depends on previous data points:
\begin{equation}
    \lik = \prod_{n=1}^Np(\x_n\given\thetab,\x_{1:{n-1}})
\end{equation}

Such models require a slightly different coupling layer design which respects their non-IID outputs.
To this end, we augment standard coupling layers with a conditional recurrent (GRU) memory $\h_n = M(\thetab, \x_n; \h_{n-1})$ which encodes temporal dependencies into a hidden state vector $\h$.

For instance, the forward pass for a single conditional affine coupling layer of the non-exchangeable likelihood network is then given by:
\begin{align} 
\h_n &= M(\thetab, \x_n; \h_{n-1})\nonumber\\
\z_n^{\mathcal{A}} &= \x_n^{\mathcal{A}} \odot \exp(S_1(\x_n^{\mathcal{B}}; \thetab, \h_{n-1})) + T_1(\x_n^{\mathcal{B}}; \thetab, \h_{n-1}) \nonumber\\ 
\z_n^{\mathcal{B}} &= \x_n^{\mathcal{B}} \odot \exp(S_2(\z_n^{\mathcal{A}}; \thetab, \h_{n-1})) + T_2(\z_n^{\mathcal{A}}; \thetab, \h_{n-1}) \nonumber,
\end{align}
where now each latent representation $\z_n$ at position $n$ depends on the preceding data points, as encoded by $\h_{n-1}$, and the functions $S_1$, $S_2$, $T_1$, $T_2$ are implemented as multi-headed fully connected (FC) neural networks.
The forward pass for neural spline flows \autocite{durkan2019neural} is modified accordingly, such that the spline parameters are generated using a recurrent memory, conditioned on the parameter vector $\thetab$.

\subsection{Correctness of Joint Simulation-Based Training}

To show that our jointly optimized criterion yields correct posterior, likelihood, and marginal likelihood inference, consider first the joint optimization of the posterior and the summary network:
\begin{align}
    (\phib^*, \psib^*) &= \argmin_{\phib, \psib}\mathbb{E}_{p^*(\x)}\left[\mathbb{KL}(\post\,||\,\postah) \label{eq:kl_full}
    \right] \\
    &= \argmin_{\phib, \psib}\mathbb{E}_{p^*(\x)}\left[ \mathbb{E}_{p(\thetab \given \x)}\left[\log \post - \log \postah \right]\right] \\
    &= \argmin_{\phib, \psib}\mathbb{E}_{p^*(\x)}\left[ \mathbb{E}_{p(\thetab \given \x)}\left[-\log \postah \right]\right] \label{eq:kl_reduced}
\end{align}
The above criterion (Eq.~\ref{eq:kl_full}) states that, in order to achieve proper amortized inference, we want to minimize the Kullback-Leibler (KL) divergence between the analytic and the approximate posterior density in expectation over all possible observations from the true data-generating distribution $p^*$.
This reduces to the expected negative log posterior (Eq.~\ref{eq:kl_reduced}), since the negative entropy of the analytic posterior $-\mathbb{H}\left[ \post \right] = \mathbb{E}_{p(\thetab \given \x)}\left[\log \post \right]$ does not depend on the neural network parameters $(\phib, \psib)$.

In order to make amortized posterior inference tractable under Eq.~\ref{eq:kl_full}, we need to assume that the true data-generating distribution $p^*$ and the model-implied (i.e., prior predictive) distribution $p(\x) = \mathbb{E}_{\prior}\left[\lik\right]$ match, that is, $p^*(\x) = p(\x)$ for any $\x$.
In other words, we invoke the so-called \textit{closed-world assumption}, which states the Bayesian model is a correct representation of the true data-generating distribution.
In that case, we can simply replace $p^*(\x)$ with $p(\x)$ and write our criterion as:
\begin{align}
    (\phib^*, \psib^*) &= \argmin_{\phib, \psib}\mathbb{E}_{p(\x)}\left[ \mathbb{E}_{p(\thetab \given \x)}\left[-\log \postah \right]\right] \\
    &= \argmin_{\phib, \psib}\mathbb{E}_{p( \thetab, \x)}\left[-\log \postah \right] \label{eq:kl_final}
\end{align}
We can now readily approximate the expectation with its empirical mean over a data set of simulations $(\thetab, \x) \sim \mathcal{D}$ generated from the Bayesian joint model $p(\thetab, \x)$. 
Thereby, we leverage the fact that we can directly evaluate $-\log \postah$ (and not a lower bound) due to the use of a normalizing flow (NF) for the approximate posterior.
Moreover, as shown by \textcite{radev2020bayesflow}, perfect convergence under Eq.~\ref{eq:kl_final} ensures that the summary network learns maximally informative (ideally sufficient) summary statistics and the posterior network samples from the analytic posterior.
Note, however, that if the key assumption of $p^*(\x) = p(\x)$ is violated for some $\x$, then the approximate posterior may no longer be a faithful representation of the analytic posterior in general. 
This situation motivates the introduction of the summary space distribution $p(\mathcal{H}_{\psib}(\x))$ (to be discussed shortly).

As for the likelihood network, we aim to minimize the KL divergence between the analytic and the approximate likelihood in expectation over all possible parameter configurations from the prior:
\begin{align}
    \etab^* &= \argmin_{\etab}\mathbb{E}_{\prior}\left[\mathbb{KL}(\lik\,||\,\lika) \label{eq:kl_full_lik}
    \right]
\end{align}
Following the same reasoning as for the posterior KL and leveraging the fact that the expectation runs over a model-implied quantity (i.e., the prior), the above criterion directly reduces to:
\begin{align}
    \etab^* = \argmin_{\etab}\mathbb{E}_{p( \thetab, \x)}\left[-\log \lika \right] \label{eq:kl_final_lik}
\end{align}
Observing that both optimization criteria (Eq.~\ref{eq:kl_final} and Eq.~\ref{eq:kl_final_lik}) include an expectation over the Bayesian joint $p(\thetab, \x)$, we arrive at our combined loss function:
\begin{align}
    \mathcal{L}_{\text{JANA}} := -\mathbb{E}_{p( \thetab, \x)}\big[\log \lika + \log \postah \big] \label{eq:kl_final_joint}
\end{align}
Thus, under the closed-world assumption, proper minimization of this loss ensures correct posterior and likelihood approximation.
However, in practice, we want to obtain some measure of the mismatch between $p^*(\x)$ and $p(\x)$.
Moreover, since $\x$ is typically a high dimensional data set (e.g., a data set of multivariate IID observations) and the posterior network only ``sees'' $\x$ through the lens of the summary network, it makes sense to measure the potential mismatch in the reduced summary space given by $\mathcal{H}_{\psib}(\x)$.
To make the detection task even easier, we want to re-structure the unrestricted $p(\mathcal{H}_{\psib}(\x))$ into a simple distribution (e.g., Gaussian) with a well-defined notion of an outlier.
Accordingly, we utilize the Maximum Mean Discrepancy \autocite[MMD;][]{Gretton2012}:
\begin{align}
    \mathbb{MMD}^2\big[p^*(\x)\,||\,p(\x)\big] =
    \mathbb{E}_{
    \x, \x' \sim p^*(\x)}\big[\kappa(\x, \x')\big]
    + \mathbb{E}_{
    \x, \x' \sim p(\x)}\big[\kappa(\x, \x')\big]
    - 2 \mathbb{E}_{
      \x \sim p^*(\x), \x' \sim  p(\x)}\big[\kappa(\x, \x')\big],
\end{align}
where $\kappa(\cdot, \cdot)$ is any reproducing kernel and we simply replace $\x$ with $\mathcal{H}_{\psib}(\x)$.
The MMD is a suitable alternative to the KL whenever we want to measure the distance between two distributions from which we can obtain samples but cannot evaluate explicitly.
% TODO star for analytic posterior
Our augmented loss function then becomes:
\begin{align}
    \mathcal{L}_{\text{JANA-MMD}} := -\mathbb{E}_{p( \thetab, \x)}\big[\log \lika + \log \postah \big] + \lambda \cdot \mathbb{MMD}^2\big[p(\mathcal{H}_{\psib}(\x))\,||\,\mathcal{N}(\boldsymbol{0}, \mathbb{I})\big]\label{eq:kl_final_joint_mmd},
\end{align}
where $\mathcal{N}(\boldsymbol{0}, \mathbb{I})$ denotes a spherical multivariate Gaussian distribution.
Note that, in theory, proper minimization of the MMD term does not trade off the performance of the posterior network, but simply implies a reparameterization $\phib \rightarrow \phib', \psib \rightarrow \psib'$, such that:
% TODO => star - hat
\begin{align}
    p(\thetab) = \int \postah\,p(\x)\,\diff\x = \int p_{\phib'}(\thetab \given \mathcal{H}_{\psib'}(\x))\, \mathcal{N}(\mathcal{H}_{\psib'}(\x)\,\given \,\boldsymbol{0}, \mathbb{I})\,\diff\x
\end{align}

In a particular empirical setting, neural network parameters $(\phib, \psib)$ may be more easily reachable by a given optimizer than corresponding parameters $(\phib', \psib')$, resulting in a practical trade-off. 
However, \textcite{schmitt2022bayesflow} did not observe a diminished performance of amortized posterior approximators trained with a structured summary space, warranting promising results and further investigation into latent summary spaces.

Finally, the correctness of the posterior and likelihood networks trivially implies a correct marginal likelihood (i.e., model evidence) due to the probabilistic change-of-variable resulting from Bayes' rule:
\begin{equation}
    \post = \frac{\lik\,\prior}{p(\x)} \Longleftrightarrow p(\x) = \lik\,\frac{\prior}{\post}
\end{equation}
Thus, assuming perfect convergence of the posterior and the likelihood network under either $\mathcal{L}_{\text{JANA}}$ (Eq.~\ref{eq:kl_final_joint}) or $\mathcal{L}_{\text{JANA-MMD}}$ (Eq.~\ref{eq:kl_final_joint_mmd}), we can compute the log marginal likelihood (LML) of $\x$ by using any single $\thetab \sim p(\thetab)$ as:
\begin{equation}
    \log p(\x) = \log \lika + \log \prior - \log \postah \label{eq:lml} 
\end{equation}
Moreover, it follows that we can use any violation of Eq.~\ref{eq:lml} to diagnose non-convergence and measure the joint approximation error incurred by the networks.


\FloatBarrier
\clearpage
\section{Implementation Details and Additional Results}

All experiments are implemented using the BayesFlow library \url{https://github.com/stefanradev93/BayesFlow} built on top of TensorFlow \autocite{abadi2016tensorflow}.
Throughout, we use an Adam optimizer \autocite{kingma2014adam} with an initial learning rate between $0.0005$ and $0.001$, default hyperparameters, and a cosine learning rate decay schedule.
All networks are trained on a single machine equipped with an NVIDIA\textsuperscript{\textregistered} T4 graphics accelerator with 16GB of GPU memory. 

\FloatBarrier
\subsection{Experiment 1: Ten Benchmarks}

We follow the model specifications from \textcite{lueckmann2021benchmarking}.
Model implementations are directly imported from the BayesFlow library under MIT license because this implementation has no dependencies on a particular deep learning framework.
For inspecting the software code for the benchmark implementations, we kindly refer the reader to the BayesFlow repository \url{https://github.com/stefanradev93/BayesFlow/tree/master/bayesflow/benchmarks}.
\autoref{tab:app:benchmarks:overview} contains an overview of the benchmarks and core network settings. The full network configurations can be inspected in the code section of the \textbf{Appendix}.

\begin{table}[h]
    \begin{center}
        \caption{Overview of the model and training configurations for \textbf{Experiment 1}.}
        \label{tab:app:benchmarks:overview}
        \begin{tabular}{r|l|c|c|c|c|c|l}
        \textbf{\#} & \textbf{Benchmark name} & \textbf{\# Dimensions}$^1$ & \textbf{Epochs} & \textbf{Batch size} & \textbf{LR} & \textbf{\# Coupling}$^2$ & \textbf{Results} \\
        \hline
             1 & Gaussian Linear & (10, 10) & 50 & 64  & 0.001 & (5, 5) & \autoref{fig:app:benchmark:gaussian_linear}\\
             2 & Gaussian Linear Uniform & (10, 10) & 50 & 64  & 0.001  & (5, 5) & \autoref{fig:app:benchmark:gaussian_linear_uniform}\\
             3 & SLCP$^3$ & (8, 5) & 100 & 32  & 0.0005 & (4, 6) & \autoref{fig:app:benchmark:slcp}\\
             4 & SLCP$^3$ with Distractors & (100, 5) & 60 & 32 & 0.001 & (6, 8) & \autoref{fig:app:benchmark:slcp_distractors}\\
             5 & Bernoulli GLM & (10, 10) & 50 & 32  & 0.0001 & (5, 8) & \autoref{fig:app:benchmark:bernoulli_glm}\\
             6 & Bernoulli GLM Raw & (100, 10) & 50 & 32 & 0.0001 & (8, 8) & \autoref{fig:app:benchmark:bernoulli_glm_raw}\\
             7 & Gaussian Mixture & (2, 2) & 150 & 64 & 0.0005  & (6, 6) & \autoref{fig:app:benchmark:gaussian_mixture}\\
             8 & Two Moons & (2, 2) & 50 & 32  & 0.0005 & (6,6)  & \autoref{fig:app:benchmark:two_moons}\\
             9 & SIR & (10, 2) & 250 & 32 & 0.0001 & (6,6)  &\autoref{fig:app:benchmark:sir}\\
             10 & Lotka-Volterra & (20, 4) & 150 & 128 & 0.001 & (8,6) & \autoref{fig:app:benchmark:lotka_volterra}\\
        \end{tabular}\\
    \end{center}
    \hspace*{0.5cm}{\footnotesize $^1$ Dimensionality of the Bayesian model, denoted as a tuple for $\x$ and $\thetab$, respectively.}\\
    \hspace*{0.5cm}{\footnotesize $^2$ Number of coupling layers, denoted as a tuple for the likelihood and posterior network, respectively.}\\
    \hspace*{0.5cm}{\footnotesize $^3$ Simple Likelihood, Complex Posterior.}
    %\hspace{2cm}{\footnotesize $^1$ minutes of wall time for training on an NVIDIA T4 GPU}
\end{table}

The following figures show the loss history (training and validation) as well as detailed calibration diagnostics for the posterior and joint learning tasks.
Note that the simulation budget is fixed at $10\,000$ simulations.
However, depending on the benchmark, the \emph{number of training steps} may vary (e.g., Gaussian Linear is trivial to learn and requires only a few epochs, in contrast to a more challenging benchmark, such as Lotka-Volterra).

Further, note that most of these models are \textit{not} meaningful for joint or likelihood estimation in their original formulation.
Still, we apply JANA to all benchmarks for the sake of completeness, as these experiments serve as a proof-of-concept for more advanced applications.

Special care is needed for the Bernoulli GLM Raw model, as its likelihood yields $N$ IID binary data points. 
These should neither be directly modeled as $N$-dimensional vectors (as this completely ignores the permutation-invariance of the data), nor as exchangeable inputs for coupling-based invertible networks (as the latter assumes at least two-dimensional continuous outputs).
In order to tackle the likelihood of this model, we augment each binary data point $x_n$ with an independent random variate $u_n \sim \mathcal{N}(0, 1)$ and use a SoftFlow architecture \autocite{kim2020softflow} for dequantization of the binary data.

\clearpage

\appendixbenchmark{1}{gaussian_linear}{Gaussian Linear}
\appendixbenchmark{2}{gaussian_linear_uniform}{Gaussian Linear Uniform}
\appendixbenchmark{3}{slcp}{Simple Likelihood Complex Posterior}
\appendixbenchmark{4}{slcp_distractors}{Simple Likelihood Complex Posterior with Distractors}
\appendixbenchmark{5}{bernoulli_glm}{Bernoulli GLM}
\appendixbenchmark{6}{bernoulli_glm_raw}{Bernoulli GLM raw}
\appendixbenchmarkcompressed{7}{gaussian_mixture}{Gaussian Mixture}
\appendixbenchmarkcompressed{8}{two_moons}{Two Moons}
\appendixbenchmarkcompressed{9}{sir}{SIR time series}
\appendixbenchmark{10}{lotka_volterra}{Lotka-Volterra}

\FloatBarrier
\clearpage
\subsection{Experiment 2: Two Moons}

\paragraph{Model details}
This experiment utilizes the two moons simulator from \textcite{greenberg2019automatic} -- not to be confused with the standard two moons data set used for unconditional estimation -- with the same experimental setup as described in \textcite{wiqvist2021sequential}.

\paragraph{Network and training details}

The posterior network is a neural spline flow with $4$ coupling layers and a Gaussian latent space.
The likelihood network uses an interleaved coupling architecture with $5$ coupling layers.
We train the networks in an offline fashion on the respective simulation budget ($2\,000$, $6\,000$ and $10\,000$ simulations) for $64$ epochs with a batch size of $32$ and a learning rate of $0.0005$. 

The wall-clock times on a consumer-grade CPU are listed in \autoref{tab:tm_wallclock}. 
While the JANA implementation in the BayesFlow framework would certainly benefit from GPU acceleration, the available implementations of SNPLA and SNVI do not come with GPU support out-of-the-box due to their APIs to dependent packages (i.e., issues with Pyro for SNVI and issues with PyTorch for SNPLA).
We repeat the training phase of each method $10$ times to further investigate the reliability of the methods.
We only conducted one repetition with SNL due to the prohibitively slow run time (see \autoref{tab:tm_wallclock}).

\begin{table}[ht]
    \centering
    \begin{tabular}{l||c|c|c|c|c|c|c}
        & NPE-C & SNPE-C & SNRE-B & SNL & SNVI & SNPLA & JANA \\
        \hline
        \textbf{Training (seconds)}  & 229 & 1151 & 5533 & 17492 & 198 & 496 & 435 \\
        \textbf{Posterior Inference (seconds)}  & 0.02 & 0.02 & 592.63 & 1890 & 0.60 & 0.01 & 0.13 \\
        \textbf{Posterior Predictive Inference (seconds)}  & --- & --- & --- & 1872 & 0.61 & 0.03 & 0.27 \\
    \end{tabular}
    \caption{Average wall-clock times (seconds) on a consumer-grade CPU for different neural methods. Training time is based on offline learning with $10\,000$ simulations. Posterior inference indicates wall-clock time for obtaining $1\,000$ samples from the approximate posterior on a single observation. Posterior predictive inference indicates wall-clock time for obtaining $1\,000$ samples from the approximate posterior and evaluating the approximate likelihood of each sample. Note that NPE-C and JANA are \textit{amortized}, so no further training is needed for applications on new observations.}
    \label{tab:tm_wallclock}
\end{table}


\begin{figure}
    \centering
    \twomoonsposterior{Repetition \#1 (main paper)}{plots/two_moons_1/tm_posterior_seed1.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#2}{plots/two_moons_1/tm_posterior_seed2.pdf}
    \\
    \twomoonsposterior{Repetition \#3}{plots/two_moons_1/tm_posterior_seed3.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#4}{plots/two_moons_1/tm_posterior_seed4.pdf}
    \\
    \twomoonsposterior{Repetition \#5}{plots/two_moons_1/tm_posterior_seed5.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#6}{plots/two_moons_1/tm_posterior_seed6.pdf}
    \\
    \twomoonsposterior{Repetition \#7}{plots/two_moons_1/tm_posterior_seed7.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#8}{plots/two_moons_1/tm_posterior_seed8.pdf}
    \\
    \twomoonsposterior{Repetition \#9}{plots/two_moons_1/tm_posterior_seed9.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#10}{plots/two_moons_1/tm_posterior_seed10.pdf}
    \caption{\textbf{Experiment 2.} $1\,000$ Posterior draws for all methods and repetitions of the experiment. The main paper shows repetition \#1, which is in line with the other runs. The axis limits represent the support of the uniform prior distribution.}
\end{figure}

\FloatBarrier
\paragraph{Different Simulator}

We repeat the experiment with the simulator from \textcite{lueckmann2021benchmarking}, which produces smaller moons with larger relative distance.
The only difference to \textcite{lueckmann2021benchmarking} is that we use a broader uniform prior with bounds $[-2, 2]$ (instead of $[-1,1]$) to further increase the difficulty of the task.
The results are largely equivalent to the ones reported in the main text.

\begin{figure}[H]
    \centering
    \twomoonsposterior{Repetition \#1 (main paper)}{plots/two_moons_2/tm_posterior_seed1.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#2}{plots/two_moons_2/tm_posterior_seed2.pdf}
    \\
    \twomoonsposterior{Repetition \#3}{plots/two_moons_2/tm_posterior_seed3.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#4}{plots/two_moons_2/tm_posterior_seed4.pdf}
    \\
    \twomoonsposterior{Repetition \#5}{plots/two_moons_2/tm_posterior_seed5.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#6}{plots/two_moons_2/tm_posterior_seed6.pdf}
    \\
    \twomoonsposterior{Repetition \#7}{plots/two_moons_2/tm_posterior_seed7.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#8}{plots/two_moons_2/tm_posterior_seed8.pdf}
    \\
    \twomoonsposterior{Repetition \#9}{plots/two_moons_2/tm_posterior_seed9.pdf}
    \hspace*{1cm}
    \twomoonsposterior{Repetition \#10}{plots/two_moons_2/tm_posterior_seed10.pdf}
    \caption{\textbf{Experiment 2.} $1\,000$ Posterior draws for all methods and repetitions of the experiment with a more challenging simulator. The axis limits represent the support of the uniform prior distribution.}
\end{figure}

\FloatBarrier
\clearpage
\subsection{Experiment 3: Exchangeable Diffusion Model}

\paragraph{Model details}
We focus on the drift diffusion model (DDM)---a cognitive model describing reaction times (RTs) in binary decision tasks \autocite{ratcliff2008}.
The DDM assumes that perceptual information for a choice alternative accumulates continuously according to a Wiener diffusion process. 
The change in information accumulation $\mathrm{d}x$ follows a random walk with drift and Gaussian noise:
\begin{equation}
    \mathrm{d}x = v\mathrm{d}t + \xi \sqrt{\mathrm{d}t}\quad\text{with} \quad\xi\sim\mathcal{N}(0, 1).
\end{equation}

The model consists of four parameters: drift-rate $v$, boundary separation $a$, non-decision time $t_0$ and bias (relative starting point) $w$. 
The model has the particularity of being very sensitive to early outliers, as all reaction times smaller than the non-decision time are considered impossible (i.e., have a likelihood of zero).
We employ the simple DDM, as its likelihood function is tractable \autocite{voss2007fast}, and place truncated normal priors over the parameters $\thetab = (v, a, t_0, w)$,
\begin{equation}
  v \sim \mathcal{TN}_{[-5,5]}(0, 10),\quad a \sim \mathcal{TN}_{[0.5,3]}(1, 1),\quad t_0 \sim \mathcal{TN}_{[0.2,1]}(0.4, 0.2),\quad w \sim \mathcal{TN}_{[0.3,0.7]}(0.5, 0.1),
\end{equation}
where $\mathcal{TN}_{[a, b]}(\mu, \sigma)$ denotes the truncated normal distribution with location $\mu$ and standard deviation $\sigma$ truncated within the interval $[a, b]$.
The summary network is a permutation-invariant network which reduces simulated IID RT data sets to $S=10$ summary statistics \autocite{radev2020bayesflow}.

\paragraph{Network and training details}

The summary network is a deep permutation-invariant network with $2$ equivariant modules followed by an invariant module \autocite{radev2020bayesflow, bloem2020probabilistic}.
The summary network reduces the IID RT data sets into $10$-dimensional learned summary statistics.
The posterior network is a conditional invertible neural network (cINN) with $5$ conditional coupling layers and a Student-$t$ latent space ($df = 50$).
The internal networks of the coupling layers are fully connected (FC) networks with $2$ hidden layers featuring $128$ units and \texttt{tanh} activation function.

The likelihood network is a cINN with $12$ conditional coupling layers, with smaller internal FC networks of $2$ hidden layers having $32$ units each, a \texttt{tanh} activation function, and a Student-$t$ latent space.
We train the networks in an offline fashion.
The likelihood network is trained for 20 epochs with a batch size of $64$ and a learning rate of $0.001$.
The posterior network is trained for 100 epochs with a batch size of $64$ and a learning rate of $0.002$.

\begin{figure}[H]
    \centering
    \begin{subfigure}[t]{0.20\linewidth}
        \includegraphics[width=\linewidth]{plots/diffusion_model/lik1.pdf}
        %\caption{\small$v{=}, a{=}, t_0{=}, w{=}$}
    \end{subfigure}
    \hspace*{0.2cm}
    \begin{subfigure}[t]{0.20\linewidth}
        \includegraphics[width=\linewidth]{plots/diffusion_model/lik2.pdf}
        %\caption{$v=1, a=0.5, t_0=0.5, w=0.5$}
    \end{subfigure}
    \hspace*{0.2cm}
    \begin{subfigure}[t]{0.20\linewidth}
        \includegraphics[width=\linewidth]{plots/diffusion_model/lik3.pdf}
        %\caption{$v=, a=, t_0=, w=$}
    \end{subfigure}
    \hspace*{0.2cm}
    \begin{subfigure}[t]{0.20\linewidth}
        \includegraphics[width=\linewidth]{plots/diffusion_model/lik4.pdf}
        %\caption{$v=, a=, t_0=, w=$}
    \end{subfigure}
    \caption{\textbf{Experiment 3.} JANA exhibits essentially perfect likelihood emulation for various parameter configurations.}
    \label{fig:app:ddm:likelihood}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.90\linewidth]{plots/diffusion_model/recovery_stan.pdf}
    \caption{\textbf{Experiment 3.} The parameter recovery of JANA is largely identical to the estimates obtained via the gold-standard HMC-MCMC implementation in Stan.}
    \label{fig:app:ddm:stan-recovery}
\end{figure}

% \begin{figure}
%     \centering
%     \includegraphics[width=0.95\linewidth]{plots/diffusion_model/pymc_dm.pdf}
%     \caption{\textbf{Experiment 3.}.}
%     \label{fig:app:ddm:pymc}
% \end{figure}

\FloatBarrier
\clearpage
\subsection{Experiment 4: Markovian Compartmental Model}

\paragraph{Model details} We use the model formulation from \textcite{radev2021amortized}, which consists of three components: 1) a latent sub-model; 2) an intervention sub-model; and 3) an observation sub-model.

First, the latent sub-model is an SIR-type system of non-linear ordinary differential equations (ODEs) with six population compartments representing the interactions between
susceptible ($S$), exposed ($E$), infected ($I$), carrier ($C$), recovered ($R$), and dead ($D$) individuals.
The interaction dynamics are governed by:
\begin{align}
    \frac{dS}{dt} &= -\lambda (t)\,\left(\frac{C + \beta\,I}{N}\right)\,S \\
    \frac{dE}{dt} &= \lambda (t)\,\left(\frac{C + \beta\,I}{N}\right)\,S - \gamma\,E \\
    \frac{dC}{dt} &= \gamma\,E - (1 - \alpha)\,\eta\,C - \alpha\,\theta\,C\\
    \frac{dI}{dt} &= (1 - \alpha)\,\eta\,C - (1-\delta)\,\mu\,I - \delta\,d\,I \\
    \frac{dR}{dt} &= \alpha\,\theta\,C + (1-\delta)\,\mu\,I \\
    \frac{dD}{dt} &= \delta\,d\,I
\end{align}
For simulating the system, we use $dt = 1$ which corresponds to a time scale of days.

Second, an \textit{intervention sub-model} accounts for changes in the transmission rate $\lambda(t)$ due to non-pharmaceutical policies.
It defines three change points for $\lambda(t)$ encoding an assumed transmission rate reduction in response to intervention measures imposed by the German authorities in 2020.
Each change point is a piece-wise linear function with three parameters: the effect strength and the boundaries defining the time interval for the effect to take place \autocite{radev2021outbreakflow}.

Third, the \textit{observation sub-model} assumes that only compartments $I$, $R$, and $D$ are potentially observable.
Moreover, it accounts for the fact that officially reported cases might not represent the true latent numbers of an outbreak:
\begin{align}
     I^{(obs)}_t &= I^{(obs)}_{t-1} + (1 - f_I(t))\,(1 - \alpha)\,\eta\,C_{t-L_I} + \sqrt{I^{(obs)}_{t-1}} \,\sigma_I\,\xi_t \\ 
    R^{(obs)}_t &= R^{(obs)}_{t-1} + (1 - f_R(t))\,(1-\delta)\,\mu\,I_{t-L_R}  + \sqrt{R^{(obs)}_{t-1}}\,\sigma_R\,\xi_t \\
    D^{(obs)}_t &= D^{(obs)}_{t-1} + (1 - f_D(t))\, \delta\,d\,I_{t-L_D} + \sqrt{D^{(obs)}_{t-1}}\,\sigma_D\, \xi_t 
\end{align}
In the above equations, $L_I$, $L_R$, and $L_D$ denote the reporting delays (lags), and $\sigma_I$, $\sigma_R$, and $\sigma_D$ denote the scales of multiplicative reporting noise for the respective compartments.
The noise variables $\xi_t$ follow a Student-\textit{t} distribution with 4 degrees of freedom. 
The weekly modulation of reporting coverage $f_{\mathcal{C}}(t)$ for each of the compartments $\mathcal{C} \in \{I, R, D\}$ is computed as follows:
\begin{align}
    f_{\mathcal{C}}(t) = (1 - A_{\mathcal{C}})\,\left(1 - \left| \sin \left( \frac{\pi}{7}t - 0.5\, \Phi_{\mathcal{C}} \right) \right|  \right)
\end{align}
This yields three additional unknown parameters for the weekly modulation amplitudes $A_I, A_R, A_D$, and phases $\Phi_I, \Phi_R, \Phi_D$, each. 

\paragraph{Network and training details}
The summary network is a combination of 1D convolutional and LSTM layers, which reduce the multivariate time series into a vector of $192$ learned summary statistics \autocite{radev2021outbreakflow}.
The posterior network is a conditional invertible neural network (cINN) with $6$ conditional affine coupling layers.
The internal networks of the coupling layers are fully connected (FC) networks with $2$ hidden layers of $128$ units and a \texttt{swish} activation function.
The likelihood network is a recurrent cINN with $8$ conditional coupling layers with the same structure as the coupling layers of the posterior network.
We use a gated recurrent unit (GRU) with $256$ hidden units for the internal recurrent memory.
We train the networks in an online fashion (i.e., on-the-fly simulations) for $100$ epochs with a batch size of $32$ and a learning rate of $0.0005$. 
This initial learning rate is reduced throughout the training phase following a cosine decay schedule with a minimum learning rate of $0$.

\begin{figure}
    \centering
    \includegraphics[width=0.9\linewidth]{plots/covid/loss_history.pdf}
    \caption{\textbf{Experiment 4.} Loss history}
    \label{fig:app:covid:loss}
\end{figure}


\begin{figure}
    \centering
    \covidscenario{Scenario I}{plots/covid/surrogate_0.pdf}
    \hspace*{1cm}
    \covidscenario{Scenario II}{plots/covid/surrogate_1.pdf}
    \\
    \covidscenario{Scenario III}{plots/covid/surrogate_2.pdf}
    \hspace*{1cm}
    \covidscenario{Scenario IV}{plots/covid/surrogate_3.pdf}
    \\
    \covidscenario{Scenario V}{plots/covid/surrogate_4.pdf}
    \hspace*{1cm}
    \covidscenario{Scenario VI}{plots/covid/surrogate_5.pdf}
    \\
    \covidscenario{Scenario VII}{plots/covid/surrogate_6.pdf}
    \hspace*{1cm}
    \covidscenario{Scenario VIII}{plots/covid/surrogate_7.pdf}
    \\
    \includegraphics[width=0.17\linewidth]{plots/covid/legend_a.pdf}
    \includegraphics[width=0.17\linewidth]{plots/covid/legend_b.pdf}
    \includegraphics[width=0.17\linewidth]{plots/covid/legend_c.pdf}
    \caption{\textbf{Experiment 4.} The likelihood network can emulate the simulator and its aleatoric uncertainty strikingly well. Each sub-panel depicts $1000$ runs from the original and the surrogate neural simulator given the same parameter configuration, each leading to a qualitatively different outbreak scenario.}
\end{figure}

\begin{figure}
    \centering
    \begin{subfigure}[t]{0.62\linewidth}
        \includegraphics[width=\linewidth]{plots/covid/sbc_post_ecdf.pdf}
        \caption{Posterior calibration}
    \end{subfigure}
    \begin{subfigure}[t]{0.62\linewidth}
        \includegraphics[width=\linewidth]{plots/covid/sbc_joint_ecdf.pdf}
        \caption{Joint calibration}
    \end{subfigure}
    \caption{\textbf{Experiment 4.} Posterior and joint calibration results}
    \label{fig:app:covid:calibration}
\end{figure}

\FloatBarrier
\clearpage
\subsection{Experiment 5: High-Dimensional Bayesian Denoising}

\paragraph{Model details} This experiment follows the problem formulation from \textcite{ramesh2022gatsbi, pacchiardi2022score}.
However, we choose the Fashion MNIST data set because of its richer and more interesting structure. 
In this Bayesian denoising setup, a simulated noisy camera applies a multidimensional Gaussian filter (i.e., a blur) to each Fashion MNIST image.
Thus, the original image represents the ``parameters'' $\thetab \in \mathbb{R}^{784}$ and its blurry version $\x \in \mathbb{R}^{784}$ represents the ``observation''.
In order to make the problem more challenging, we do not use the class label as an additional conditioning input for the networks.
We also do not process the image data optimally \autocite[e.g., by applying a Haar wavelet downsampling or using convolutional couplings, as in][]{ardizzone2019guided, kingma2018glow}, as our goal is not to perform high-quality image reconstruction, but simply to illustrate the applicability of JANA for analyzing potentially high-dimensional Bayesian models.

\paragraph{Network and training details}

Since both ``data'' and ``parameters'' are images with a (theoretically) lower intrinsic dimensionality than the total number of pixels, both the likelihood and the posterior network utilize a separate summary network with identical architecture.
For each, we use a $4$-layer fully convolutional network with a final global average pooling layer yielding a $128$-dimensional summary representation of the original and blurry image, respectively.
The posterior network is a conditional invertible neural network (cINN) comprising $12$ conditional affine coupling layers.
The internal networks of the coupling layers are fully connected (FC) networks with a single hidden layer of $512$ units and a \texttt{ReLU} non-linearity.
The likelihood network uses the same architecture as the posterior network.
Finally, we use a multivariate Student-$t$ latent space \autocite{alexanderson2020robust}, as it allows us to perform a much more stable maximum likelihood training with higher learning rates.

We train the networks on the official training set of $60\,000$ Fashion MNIST images for $120$ epochs with a batch size of $32$ and a learning rate of $0.001$. 
This initial learning rate is reduced throughout the training phase following a cosine decay schedule with a minimum learning rate of $0$.
For each batch, we add a small amount of Gaussian noise with a scale of $0.001$ as a form of dequantization \autocite{ardizzone2019guided}.
We use $500$ images from the test set as a validation set to estimate the generalization error during training.
We utilize the remaining $9\,500$ images from the test set for evaluating the approximation quality and calibration of the networks.

\begin{figure}
    \centering
    \includegraphics[width=0.95\linewidth]{plots/denoising/loss_history.pdf}
    \caption{\textbf{Experiment 5.} Loss history}
    \label{fig:app:denoisinhg}
\end{figure}

\begin{figure}
    \centering
    \begin{subfigure}[t]{0.49\linewidth}
        \includegraphics[width=\linewidth]{plots/denoising/appendix_1.pdf}
        \caption{Posterior estimation \#1}
    \end{subfigure}
    \begin{subfigure}[t]{0.49\linewidth}
        \includegraphics[width=\linewidth]{plots/denoising/appendix_2.pdf}
        \caption{Posterior estimation \#2}
    \end{subfigure}
    \begin{subfigure}[t]{0.49\linewidth}
        \includegraphics[width=\linewidth]{plots/denoising/appendix_3.pdf}
        \caption{Posterior estimation \#3}
    \end{subfigure}
    \begin{subfigure}[t]{0.49\linewidth}
        \includegraphics[width=\linewidth]{plots/denoising/appendix_4.pdf}
        \caption{Posterior estimation \#4}
    \end{subfigure}
    \begin{subfigure}[t]{0.49\linewidth}
        \includegraphics[width=\linewidth]{plots/denoising/appendix_5.pdf}
        \caption{Posterior estimation \#5}
    \end{subfigure}
    \begin{subfigure}[t]{0.49\linewidth}
        \includegraphics[width=\linewidth]{plots/denoising/appendix_6.pdf}
        \caption{Posterior estimation \#6}
    \end{subfigure}
    \begin{subfigure}[t]{0.49\linewidth}
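        % NOTE(review): this panel (\#7) loads the same file as panel \#5; presumably appendix_7.pdf was intended -- TODO confirm the file exists.
        \includegraphics[width=\linewidth]{plots/denoising/appendix_5.pdf}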
        \caption{Posterior estimation \#7}
    \end{subfigure}
    \begin{subfigure}[t]{0.49\linewidth}
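        % NOTE(review): this panel (\#8) loads the same file as panel \#6; presumably appendix_8.pdf was intended -- TODO confirm the file exists.
        \includegraphics[width=\linewidth]{plots/denoising/appendix_6.pdf}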
        \caption{Posterior estimation \#8}
    \end{subfigure}
    \caption{\textbf{Experiment 5.} Posterior (denoising) results on $8$ randomly selected sets of images from the official Fashion MNIST test set.}
    \label{fig:app:denoising:posterior}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.95\linewidth]{plots/denoising/likelihood_samples1.pdf}
    \caption{\textbf{Experiment 5.} Samples from the surrogate camera (i.e., the ``likelihood'' in the Bayesian denoising setup) given ten randomly selected clean images (i.e., ``parameters'') from each class.}
    \label{fig:app:denoising:likelihood1}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.95\linewidth]{plots/denoising/likelihood_samples2.pdf}
    \caption{\textbf{Experiment 5.} Further samples from the surrogate camera.}
    \label{fig:app:denoising:likelihood2}
\end{figure}

\FloatBarrier
\clearpage

\subsubsection*{References}
\printbibliography[heading=none]

\end{document}
