% \documentclass[accepted]{preprint_class} % for initial submission
\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage{amsfonts}

%% Custom commands
\newcommand{\bs}{\boldsymbol}
\newcommand{\x}{\boldsymbol{x}}
\newcommand{\z}{\boldsymbol{z}}
\newcommand{\m}{\boldsymbol{m}}
\newcommand{\p}{\boldsymbol{p}}
\newcommand{\h}{\boldsymbol{h}}
\newcommand{\mub}{\boldsymbol{\mu}}
\newcommand{\xib}{\boldsymbol{\xi}}
\newcommand{\sigmab}{\boldsymbol{\sigma}}
\newcommand{\Sigmab}{\boldsymbol{\Sigma}}
\newcommand{\thetab}{\boldsymbol{\theta}}
\newcommand{\phib}{\boldsymbol{\phi}}
\newcommand{\psib}{\boldsymbol{\psi}}
\newcommand{\Psib}{\boldsymbol{\Psi}}
\newcommand{\etab}{\boldsymbol{\eta}}
\newcommand{\0}{\boldsymbol{0}}
\newcommand{\W}{\boldsymbol{W}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\thetam}{\boldsymbol{\theta}_m}
\newcommand{\given}{\,|\,}
\newcommand{\lik}{p(\x \given \thetab)}
\newcommand{\priorm}{p(\thetab \given \mathcal{M})}
\newcommand{\likm}{p(\x \given \thetab, \mathcal{M})}
\newcommand{\post}{p(\thetab \given \x)}
\newcommand{\postm}{p(\thetab \given \x, \mathcal{M})}
\newcommand{\NIW}{\text{N-}\mathcal{W}^{-1}}
\newcommand{\NIG}{\text{N-}\Gamma^{-1}}
\newcommand{\xbtilde}{\tilde{\boldsymbol{x}}}
\newcommand{\prior}{p(\thetab)}
\newcommand{\joint}{p(\thetab, \x)}
\newcommand{\jointm}{p(\thetab, \x \given \mathcal{M})}
\newcommand{\noise}{p(\xib)}
\newcommand{\noised}{p(\xib \given \thetab)}
\newcommand{\model}{G(\thetab, \xib)}
\newcommand{\diff}{\mathrm{d}}
\newcommand{\tripleopt}{(\phib^*,\etab^*,\psib^*)}
\newcommand{\triple}{(\phib,\etab,\psib)}
\newcommand{\lika}{l_{\etab}(\x \given \thetab)}
\newcommand{\posta}{p_{\phib}(\thetab \given \x)}
\newcommand{\postah}{p_{\phib}(\thetab \given \mathcal{H}_{\psib}(\x))}

\usepackage{subcaption}

%% Some suggested packages, as needed:
%\usepackage{natbib} % has a nice set of citation styles and commands
%    \bibliographystyle{plainnat}
%    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage[backend=biber,
style=apa,
maxcitenames=2,
minbibnames=2,
maxbibnames=2, 
language=english,
doi=false,
isbn=false,
url=false,
uniquename=false
]{biblatex} 
\addbibresource{radev_354.bib} % bib file
\DeclareLanguageMapping{english}{english-apa} % mapping for apa
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xcolor}
\definecolor{darkblue}{RGB}{0,0,128}
\hypersetup{
    colorlinks=true,
	linkcolor=darkblue,
	filecolor=darkblue,
	urlcolor=darkblue,
	citecolor=darkblue,
	pdfauthor={},
	pdftitle={}
}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%% Uncomment to see new changes
% \newcommand{\new}[1]{\textcolor{blue}{#1}}
\newcommand{\new}[1]{\textcolor{black}{#1}}

\title{JANA: Jointly Amortized Neural Approximation of Complex\\Bayesian Models}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<stefan.radev93@gmail.com>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{\vspace{-1.5em}Stefan T.~Radev}
\author[2]{Marvin Schmitt}
\author[3]{Valentin Pratz}
\author[4]{Umberto Picchini}
\author[3]{\authorcr Ullrich K\"othe$^*$}
\author[2]{Paul-Christian B\"urkner\thanks{Shared senior authorship}}
% Add affiliations after the authors
\affil[1]{%
Cluster of Excellence STRUCTURES\\
Heidelberg University
}
\affil[2]{%
Cluster of Excellence SimTech\\
University of Stuttgart
}
\affil[3]{%
Visual Learning Lab\\
Heidelberg University
}
\affil[4]{%
Department of Mathematical Sciences\\
Chalmers University of Technology \& University of Gothenburg
}

\begin{document}
\maketitle
\begin{abstract}
This work proposes ``jointly amortized neural approximation'' (JANA) of intractable likelihood functions and posterior densities arising in Bayesian surrogate modeling and simulation-based inference.
We train three complementary networks in an end-to-end fashion: 1) a summary network to compress individual data points, sets, or time series into informative embedding vectors; 2) a posterior network to learn an amortized approximate posterior; and 3) a likelihood network to learn an amortized approximate likelihood.
Their interaction opens a new route to amortized marginal likelihood and posterior predictive estimation -- two important ingredients of Bayesian workflows that are often too expensive for standard methods.
We benchmark the fidelity of JANA on a variety of simulation models against state-of-the-art Bayesian methods and propose a powerful and interpretable diagnostic for joint calibration.
In addition, we investigate the ability of recurrent likelihood networks to emulate complex time series models without resorting to hand-crafted summary statistics.
\end{abstract}

\section{Introduction}
\label{sec:intro}

%SM and SBI from a Bayesian perspective
Surrogate modeling (SM) and simulation-based inference (SBI) are two %crucial 
ingredients of the new generation of methods for simulation science \autocite{lavin2021simulation}.
From a Bayesian perspective, SM seeks to approximate the intractable likelihood function, whereas SBI targets %aims to approximate 
the intractable posterior distribution of a complex probabilistic model.
Both problems are hard, as they involve 
%multidimensional 
integrals which cannot be solved with standard analytical or numerical methods.
Thus, specialized neural approximators have emerged as promising tools for taming the intractable \autocite{cranmer2020frontier}.

% Short teaser for what we do
\new{Neural networks trained on model simulations enable \textit{amortized inference}: A pre-trained network can be stored and re-used for Bayesian inference on millions of data sets \autocite{von2022mental}. Crucially, most previous neural approaches have tackled either SM or SBI in isolation, but little attention has been paid to learning both tasks simultaneously.}
\new{To address this gap}, we propose JANA (``Jointly Amortized Neural Approximation''), a Bayesian neural framework for \emph{simultaneously amortized} SM and SBI, and show how it enables \new{novel} solutions to challenging downstream tasks like the estimation of marginal and posterior predictive distributions (see~\autoref{fig:conceptual}).
JANA also presents a major qualitative upgrade to the BayesFlow framework \autocite{radev2020bayesflow}, which was originally designed for amortized SBI alone.
% \footnote{All capabilities explored in this work are implemented in the BayesFlow library: \url{https://github.com/stefanradev93/BayesFlow}.}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\linewidth]{plots/conceptual_overview_new.pdf}
    \caption{A conceptual illustration of our method for jointly amortized neural approximation (JANA). On the one hand, the summary and posterior network can perform amortized posterior estimation and detect model misspecification. On the other hand, the likelihood network can perform amortized likelihood estimation, surrogate simulations, and interact with probabilistic programming languages. Together, the two networks enable posterior predictive and marginal likelihood estimation, which allow for amortized Bayesian model comparison and validation.
    }
    \label{fig:conceptual}
\end{figure*}

It is commonly presumed that amortized inference is wasteful \autocite{greenberg2019automatic, papamakarios2016fast} and requires much larger simulation budgets than sequential inference to make up for the much larger prediction domain.
Our results challenge this premise.
Given identical simulation budgets, JANA outperforms or is on par with sequential (i.e., non-amortized) methods, such as ABC-SMC, SNL, and SNPE (see~\autoref{fig:two-moons:posterior}).
Furthermore, we hypothesize that modern neural networks benefit strongly from a broad simulation scope.
Thanks to their excellent generalization capabilities, they can exploit outcomes from the entire prior predictive distribution of a simulation to improve local accuracy for each specific case.
In this sense, amortized inference seems to be a natural by-product of deep probabilistic modeling, and the initial training effort more than repays with global diagnostics, nearly instant estimation at test time, and no loss in accuracy.

We show that JANA unlocks the potential of powerful Bayesian tools for model comparison, validation, and calibration, which are essential in Bayesian workflows \autocite{gelman2020bayesian}, but widely underutilized in current simulation-based analysis.
For one, JANA offers an efficient way to compute \textit{marginal likelihoods} via the probabilistic change-of-variables formula (instead of integration over the model's entire prior space) as a prerequisite for \textit{prior predictive} model selection (i.e., probabilistic Occam's razor).
For another, it can rapidly produce both posterior samples and normalized likelihood estimates of new data instances, as are needed in strong validation procedures of the \textit{posterior predictive} performance \autocite{vehtari_survey_2012}. 
In other words, JANA can directly quantify both prior and posterior predictive performance without resorting to Markov chain Monte Carlo (MCMC) sampling or costly model re-fits, in addition to the well-studied advantages of individual posterior or likelihood networks (see~\autoref{fig:conceptual}). 

In summary, our key contributions are:
\begin{enumerate}
    \item We develop a neural architecture for fully amortized joint posterior estimation and likelihood emulation;
    \item We propose a sensitive and interpretable method to test for joint calibration of the networks;
    \item We extensively validate our new architecture on analytic toy examples and complex simulation models;
    \item We show how our joint architecture solves the challenges of computing both out-of-sample predictive performance and intractable marginal likelihoods;
    \item We demonstrate a recurrent neural likelihood for surrogate simulations in a complex time series model.
\end{enumerate}

\section{Method}
\label{sec:method}

\subsection{Problem Formulation}

\paragraph{Bayesian Models}
We focus on generative Bayesian models specified as a triple $\mathcal{M}{=}\big( \model, \noised, \prior \big)$.
Such models yield observables $\x \in \mathcal{X}$ according to the system
\begin{equation}\label{eq:model}
    \x = \model \quad\textrm{with}\quad \xib \sim \noised,\; \thetab \sim \prior,
\end{equation}
where $G$ denotes a simulation program, $\xib \in \Xi$ denotes externalized randomness (i.e., noise or pseudorandom program states) with density function $\noised$, and $\prior$ encodes prior knowledge about plausible simulation parameters $\thetab \in \Theta$.

\paragraph{Forward Inference}
Running the simulator $G$ with a fixed parameter configuration $\thetab$ and different values of $\xib$ is equivalent to random draws from an \textit{implicit likelihood} $\lik$:
\begin{equation}
    \x \sim \lik \Longleftrightarrow \x =\model \quad \textrm{with} \quad \xib \sim \noised.
\end{equation}
In theory, implicit likelihoods can be obtained by marginalizing the joint distribution $p(\xib, \x \given \thetab)$ over all possible execution trajectories of the simulation program (i.e., over $\xib$), but this is typically intractable \autocite{cranmer2020frontier}.

\paragraph{Inverse Inference}
In Bayesian analysis, we want to infer a model's latent parameters $\thetab$ from manifest data $\x$ through the probabilistic factorization of the joint distribution into prior and (implicit) likelihood:
\begin{equation}
    \post \propto \joint = \prior \int_{\Xi} p(\xib, \x \given \thetab)\,\diff \xib.
\end{equation}
Since we assume that the likelihood is not available in closed form, we also cannot access the posterior $\post$ and perform parameter estimation through gold-standard Bayesian methods, such as MCMC \autocite{carpenter2017stan}.

\paragraph{Marginal Likelihoods}
In addition to estimating parameters, modelers often want to compare and assign preferences to competing models.
From a Bayesian perspective, the canonical measure of evidence for a given model is the \textit{marginal likelihood} (aka the \textit{prior predictive distribution}),
\begin{equation}
    p(\x) = \int_{\Theta} \int_{\Xi} \prior\,p(\xib, \x \given \thetab)\,\diff \xib\,\diff \thetab, \label{eq:marg_lik}
\end{equation}
which is doubly intractable for complex models because both involved integrals are highly difficult to approximate with sufficient precision \autocite{meng_simulating_1996}.
However, the estimation of the marginal likelihood is central to Bayesian model comparison, since it naturally embodies a probabilistic version of Occam’s razor by penalizing the prior complexity of a model \autocite{mackay2003information}.
Thus, it allows us to express our preference for a simpler model over a more complex one, given that both models can account for the observed data equally well.

\paragraph{Posterior Predictive Distribution}
Bayesian models can also be compared and validated on the basis of their posterior predictive performance \autocite{vehtari_survey_2012}.
However, many posterior predictive metrics rely on the likelihood density being available analytically. 
In particular, this is true for the expected log-predictive density (ELPD), which is a widely-applied, general-purpose metric to measure (out-of-sample) posterior predictive performance when no application-specific utilities are known \autocite{vehtari2017practical}. 
For $K$ (new) observations \smash{$\x^{(k)}_{\rm new}$} not previously seen by the model, the ELPD can be defined as
\begin{equation}\label{elpd}
    \text{ELPD} = \sum_{k=1}^{K} \log \int_{\Theta} p(\x^{(k)}_{\rm new} \given \thetab) \, \post \, \diff \thetab.
\end{equation}

The ELPD has a strong connection to information theory \autocite{vehtari_survey_2012} and is widely used in Bayesian cross-validation \autocite{vehtari2017practical}, where it is one of the most prominent sources of computational intractability.

\paragraph{Probabilistic Symmetry}
Our joint training will leverage the symmetry in the arguments of $\post$ and $\lik$, along with the fact that a single run of the simulator (Eq.~\ref{eq:model}) yields a reusable tuple of parameters and synthetic data $(\thetab, \x)$.
However, many simulation models are characterized by a relatively low-dimensional parameter space $\Theta$ (e.g., low-dimensional vectors) and a rather high-dimensional data space with a rich structure $\mathcal{X}$ (e.g., multivariate time series or sets of exchangeable observations).
Thus, we need different neural architectures, each separately aligned with the structural properties of $\post$ and $\lik$.

% We will employ two separate conditional invertible neural networks (cINNs) for approximating both $\post$ and $\lik$.
% We focus on cINNs due to their favorable theoretical properties \autocite{ardizzone2018analyzing, ardizzone2019guided, radev2020bayesflow} and successful practical applications in amortized SBI \autocite{shiono2021estimation, von2022mental, radev2021outbreakflow, bieringer2021measuring}, but any other conditional density approximator can be applied.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{plots/covid_teaser_nolegend.pdf}\\
    \includegraphics[width=0.3\linewidth]{plots/covid/legend_a.pdf}
    \includegraphics[width=0.3\linewidth]{plots/covid/legend_b.pdf}
    \includegraphics[width=0.3\linewidth]{plots/covid/legend_c.pdf}
    \caption{Recurrent likelihood networks can emulate complex Bayesian stochastic differential equation models of disease outbreaks (see~\textbf{Experiment 4}). The top and bottom rows each depict $1\,000$ simulations (same $\thetab$) from the surrogate and the actual simulator, respectively.}
    \label{fig:covid}
\end{figure}

\subsection{Posterior Network}
\label{sec:post_net}

The posterior network $\mathcal{P}_{\phib}$ implements a normalizing flow between $\thetab$ and a latent variable $\z_{\thetab}$ with a simple density (e.g., Gaussian) given observed or simulated data $\x$:
\begin{align}
\posta &= p(\z_{\thetab})\,\left|\det \left(\frac{\partial \z_{\thetab}}{\partial \thetab}\right)\right| \label{eq:post}\\
\z_{\thetab} &= \mathcal{P}_{\phib}(\thetab;\x).
\end{align} 
The normalizing flow is realized via a conditional invertible neural network (cINN) composed of a series of conditional coupling layers with affine and/or spline transformations.
Since the observed or simulated data will typically have a complex structure and/or contain varying numbers of observations, the posterior cINN includes a trainable summary network sub-module $\mathcal{H}_{\psib}$ \autocite[see][]{radev2020bayesflow}, which we optimize jointly with the remaining network parameters in an end-to-end manner to extract maximally informative data representations $\mathcal{H}_{\psib}(\x)$.

The design of the conditional coupling layers follows previous work \autocite{durkan2019neural, ardizzone2019guided, ardizzone2018analyzing}, since compositions of such layers exhibit favorable theoretical properties \autocite{draxler2022whitening} and remarkable empirical performance on high-dimensional unstructured data \autocite{kingma2018glow, dinh2016density} or complex Bayesian models in various domains \autocite{bellagente2022understanding, von2022mental, radev2021outbreakflow, bieringer2021measuring}.
However, any other coupling design can be used as a plug-in replacement.

\subsection{Likelihood Network}
\label{sec:lik_net}

The likelihood network $\mathcal{L}_{\etab}$ implements a normalizing flow between $\x$ and a (multivariate) Gaussian latent variable $\z_{\x}=\mathcal{L}_{\etab}(\x;\thetab)$ given a parameter configuration $\thetab$,
\begin{align}
\lika = p(\z_{\x})\,\left|\det \left(\frac{\partial \z_{\x}}{\partial \x}\right)\right|. \label{eq:lik}%\\
%\z_{\x} &= \mathcal{L}_{\etab}(\x;\thetab)
\end{align} 
This formulation is similar to the pushforward expression for the posterior network (Eq.~\ref{eq:post}), but with $\thetab$ swapped for $\x$.
The likelihood network, like the posterior network, is also implemented as a cINN. 
As the conditioning information is now the parameter vector $\thetab$ (and not a complex data structure), it can be fed directly to the conditional coupling layers of the cINN without an additional summary network. 

However, since the data $\x$ (i.e., simulator outputs) is typically in non-vector form, the design of the coupling layers needs to be tailored according to the probabilistic symmetry of $\lik$.
Learning $\lik$ in its raw form is typically much harder than learning the likelihood $p(\mathcal{H}(\x) \given \thetab)$ of some (learned or hand-crafted) summary statistics $\mathcal{H}(\x)$, since the latter are already in a compressed vector form and do not require specialized architectures. 
JANA can learn either $p(\mathcal{H}(\x) \given \thetab)$ or $\lik$, as required by the particular application or dictated by the (un-)availability of good summary statistics.
In our experiments, we directly target $\lik$ and the \textbf{Appendix} details how to design likelihood networks for exchangeable or Markovian data.

% \paragraph{Exchangeable Models}
% Certain models generate data exchangeably, that is, each run $\model$ with a fixed configuration $\thetab$ is independent of all other runs \autocite{turner2016bayesian}. 
% Thus, for $N$ runs of such a (memoryless or stateless) model, the likelihood decomposes into the product of point-wise likelihoods $\smash{\lik = \prod_{n=1}^N p(\x_n \given \thetab)}$.
% Accordingly, we can represent such data as unordered sets and simply apply Eq.~\ref{eq:lik} exchangeably by concatenating each $\x_n$ with $\thetab$ in each coupling layer.
% Accordingly, the forward pass for a single affine coupling layer of an exchangeable likelihood network is given by:
% \begin{align} 
% \z_n^{\mathcal{A}} &= \x_n^{\mathcal{A}} \odot \exp(S_1(\x_n^{\mathcal{B}}; \thetab)) + T_1(\x_n^{\mathcal{B}}; \thetab)\nonumber  \\ 
% \z_n^{\mathcal{B}} &= \x_n^{\mathcal{B}} \odot \exp(S_2(\z_n^{\mathcal{A}}; \thetab)) + T_2(\z_n^{\mathcal{A}}; \thetab)\nonumber,
% \end{align}
% where $\x_n = (\x_n^{\mathcal{A}}, \x_n^{\mathcal{B}})$ is a disjoint partition of the input data at position $n$, $\z_n = (\z_n^{\mathcal{A}}, \z_n^{\mathcal{B}})$ is the  corresponding latent partition, and the functions $S_1$, $S_2$, $T_1$, $T_2$ are implemented as multi-headed fully connected (FC) neural networks (with trainable parameters suppressed for clarity). 
% However, our framework is not restricted to likelihoods which factorize in this sample way, as we describe next.

% \paragraph{Non-Exchangeable Models}
% Models with memory \autocite[i.e., stateful models;][]{radev2021outbreakflow} might imply a more complex factorization of the likelihood. 
% For example, the widely used family of Markovian models factorize in a way that the probability of each data point depends on previous data points $\smash{\lik = \prod_{n=1}^Np(\x_n\given\thetab,\x_{1:{n-1}})}$.
% Such models require a slightly different coupling layer design which respects their non-IID outputs.
% To this end, we augment standard coupling layers with a conditional recurrent (LSTM) memory $\h_n = M(\thetab, \x_n; \h_{n-1})$ which encodes temporal dependencies into a hidden state vector $\h$.
% The forward pass for a single affine coupling layer of the non-exchangeable likelihood network is then given by:
% \begin{align} 
% \h_n &= M(\thetab, \x_n; \h_{n-1})\nonumber\\
% \z_n^{\mathcal{A}} &= \x_n^{\mathcal{A}} \odot \exp(S_1(\x_n^{\mathcal{B}}; \thetab, \h_{n-1})) + T_1(\x_n^{\mathcal{B}}; \thetab, \h_{n-1}) \nonumber\\ 
% \z_n^{\mathcal{B}} &= \x_n^{\mathcal{B}} \odot \exp(S_2(\z_n^{\mathcal{A}}; \thetab, \h_{n-1})) + T_2(\z_n^{\mathcal{A}}; \thetab, \h_{n-1}) \nonumber,
% \end{align}
% where now each latent representation $\z_n$ at position $n$ depends on the preceding data points, as encoded by $\h_{n-1}$.
% We set $\h_0$ as the zero vector, indicating empty memory for the very first observable $\x_1$ in a sequence.

\subsection{Simulation-based training}

In contrast to previous joint learning approaches \autocite{wiqvist2021sequential, glockler2022snvi}, we aim for a fully amortized approach: Once the networks have converged, we want to evaluate the normalized densities $\posta$ and $\lika$ for \textit{any} pair $(\thetab, \x)$ consistent with a generative Bayesian model $\mathcal{M}$.
In addition, we want to generate conditional random draws $\thetab \given \x$ and $\x \given \thetab$ from both networks for parameter estimation and surrogate modeling.
Finally, we want to prescribe a simple distribution to the summary network outputs $p\big(\mathcal{H}_{\psib}(\x)\big)$ in order to detect atypical data during inference (i.e., model misspecification) and highlight potential posterior errors \autocite{schmitt2022bayesflow}.
Thus, we minimize the following criterion:
\begin{equation}\label{eq:cost}
\begin{aligned}
    \min_{\phib, \psib, \etab} \mathbb{E}_{\joint}\Big[&-\big(\log p_{\phib}(\thetab \given \mathcal{H}_{\psib}(\x)) + \log l_{\etab}(\x \given \thetab)\big)\Big] \\&+ \lambda \cdot \mathbb{MMD}^2\big[p(\mathcal{H}_{\psib}(\x))\,\|\,\mathcal{N}(\boldsymbol{0}, \mathbb{I})\big],
\end{aligned}
\end{equation}
where $\mathbb{MMD}^2$ is the maximum mean discrepancy \autocite[MMD;][]{Gretton2012} between the distribution of summary network outputs and a unit Gaussian density.
This divergence imposes a probabilistic structure on the summary space learned by $\mathcal{H}_{\psib}(\x)$ and enables error detection and model criticism during inference \autocite[to be explained shortly, see also][]{schmitt2022bayesflow}.
\new{We approximate the expectation over $\joint$ via online or offline simulations from the generative model $\mathcal{M}$ and train the three networks until convergence (see the \textbf{Appendix} for a detailed derivation of simulation-based training).}

Proper minimization of the criterion in Eq.~\ref{eq:cost} results in correct posterior and likelihood approximation, along with an interpretable summary space.
However, the objective promises self-consistency only in the ``small world'', as it does not guarantee correct posterior inference or likelihood evaluation in the real world when there may be a severe simulation gap.
This is due to the fact that simulation-based training optimizes the expectation with respect to the Bayesian joint model $\joint$, but not (necessarily) the empirical data distribution $p^*(\x)$.
Thus, the MMD term allows us to detect potential simulation gaps during inference via distribution matching \autocite{schmitt2022bayesflow}.
Moreover, the posterior network can serve as a ``critic'' for the likelihood network by rejecting surrogate simulations which are judged to be highly unlikely under the true simulator. 

\subsection{Validation Methodology: Joint Calibration}
\label{sec:val}

Faithful uncertainty representation (i.e., calibration) is an essential precondition for self-consistent and interpretable simulation-based inference.  
Simulation-based calibration \autocite[SBC;][]{talts2018validating} is a general diagnostic method which considers the performance of a sampling algorithm over the entire joint distribution $p(\thetab, \x)$, regardless of the specific probabilistic structure of a model.

SBC leverages the generative nature of Bayesian models as well as the self-consistency of the Bayesian joint model $\joint$ in the following sense: For all quantiles $q \in (0, 1)$, all uncertainty regions $U_q(\thetab \given \x)$ of $\post$ are well calibrated, as long as the generating distribution of the assumed model is equal to the true data-generating distribution and posterior computation is exact \autocite{talts2018validating}.
We can formally write this property as
\begin{equation}
\label{eq:sbc}
  q = \int_{\mathcal{X}} \int_{\Theta} \mathbb{I}_{\left[ \thetab^* \in U_q(\thetab \given \x) \right]} \, p(\x \given \thetab^*) \, p(\thetab^*) \, \diff \x \, \diff \thetab^*,
\end{equation}
where $\thetab^*$ is the true data-generating parameter and $\mathbb{I}_{\left[\cdot\right]}$ is the indicator function.
%Accordingly, if the posterior network and the likelihood network are perfectly converged, then the average posterior distribution in expectation over $\lika\,\prior$ (as approximated via random draws from the prior and the surrogate simulator) should always recover the prior distribution $p(\thetab^*)$ for any fixed $\thetab^*$:
%\begin{equation}
%    p(\thetab^*) = \int_{\mathcal{X}} \int_{\Theta} p_{\phib}(\thetab^* \given \mathcal{H}_{\psib}(\x))\,l_{\etab}(\x \given \thetab)\,\prior\,\diff\thetab\,\diff\x \label{eq:sbc}
%\end{equation}
If the posterior network $\mathcal{P}_{\phib}$ generates draws from the true posterior and the likelihood network $\mathcal{L}_{\etab}$ mimics the simulator perfectly, then the equality implied by Eq.~\ref{eq:sbc} holds regardless of the particular form of the true likelihood or the true posterior. 
Thus, any violation of this equality indicates some error incurred by joint training, so we refer to our validation procedure as joint simulation-based calibration (JSBC).

The reasons for faulty JSBC can be any combination of (i) inaccurate representation of the posterior; (ii) inaccurate representation of the likelihood; or (iii) an erroneous implementation of the simulation model itself. 
To differentiate between (i) and (ii), we can first run standard SBC for the posterior network using data draws from the actual simulator instead of the likelihood network. 
If this check passes, but subsequently JSBC fails, the calibration problems must stem from the likelihood network.
Thereby, we can use the posterior network for \emph{model criticism} of the likelihood network, which would otherwise be infeasible for most Bayesian models.
%Otherwise, checking the calibration of the likelihood network alone would require expensive MCMC sampling, which would be infeasible for most Bayesian models.

As part of a Bayesian workflow \autocite{gelman2020bayesian}, calibration procedures can quickly become infeasible for non-amortized methods, as they require independent posterior draws from hundreds or thousands of simulated data sets.
However, we can effortlessly assess the calibration of amortized methods, since we can obtain many posterior draws from thousands of data sets in a matter of seconds.
In practice, we follow \textcite{sailynoja2022graphical} by transforming the posterior draws into fractional rank statistics and computing their empirical cumulative distribution functions (ECDFs).
This method provides \textit{simultaneous confidence bands} and eliminates the need to manually select a binning parameter (e.g., as required by histogram-based methods).


\subsection{Use Cases for Joint Learning}

% In the following, we describe several use cases of our joint learning method which can neither be tackled by a likelihood nor posterior network alone.

\paragraph{Posterior Predictive Estimation}
Estimating the expected predictive performance of a Bayesian model (Eq.~\ref{elpd}) requires an analytic expression for the pointwise \smash{$p(\x^{(k)}_{\rm new} \given \thetab)$} at arbitrary new data \smash{$\x^{(k)}_{\rm new}$} \autocite{burkner_nfloo_2021}.
For this reason, the ELPD cannot be computed for Bayesian models with intractable likelihoods or sequential neural estimators.

Moreover, even if the likelihood itself were analytic, the integral in Eq.~\ref{elpd} would still be intractable for most models.
It can be efficiently approximated using posterior draws, but doing so in the context of cross-validation requires importance sampling or costly model refits \autocite{vehtari2017practical}.
Hence, evaluating the ELPD for arbitrary cross-validation schemes critically requires both the amortized likelihood and posterior approximator.

Given data used for model fitting $\x$ and upcoming data \smash{$\x^{(k)}_{\rm new}$}, the two networks can estimate a model’s expected predictive performance in two steps.
First, we can obtain a large number $S$ of random draws from the amortized posterior given $\x$:
\begin{equation}
    \thetab^{(s)} \sim \postah \textrm{ for } s = 1,\dots,S.
\end{equation}
Then, the likelihood network can approximate the ELPD at all $\x_{\rm new}^{(k)}$ given $\{\thetab^{(s)}\}$ via its Monte Carlo estimate:
\begin{equation}\label{eq:elpd}
    \widehat{\textrm{ELPD}} = \sum_{k=1}^{K} \log \frac{1}{S} \sum_{s=1}^S l_{\psib}(\x^{(k)}_{\rm new} \given \thetab^{(s)})
\end{equation}
In the context of cross-validation (CV), $\x$ and \smash{$\x_{\rm new}$} refer to a random data split, and we can estimate the predictive performance of a Bayesian model by summing over the \smash{$\widehat{\textrm{ELPDs}}$} from all data splits. 
In \textbf{Experiment 3}, we demonstrate this for leave-one-out (LOO)-CV, which is one of the most expensive validation methods.

\paragraph{Marginal Likelihood Estimation}
Bayesian (prior) predictive model comparison depends on computing a marginal likelihood (Eq.~\ref{eq:marg_lik}).
We can leverage the probabilistic change of variable, which results directly from Bayes' rule:
\begin{align}\label{eq:log_marg}
    \log \widehat{p}(\x) &= \log \lika + \log\prior\\ &- \log \postah \nonumber.
\end{align}
Thus, for any data set, we can obtain an estimate of the log marginal likelihood by evaluating Eq.~\ref{eq:post} and Eq.~\ref{eq:lik}, along with the prior density $\prior$.
Evaluating all above terms is infeasible with standard Bayesian methods, since either the normalized posterior, the likelihood, or both quantities are typically intractable. 
Bridge sampling \autocite{meng_simulating_1996} enables the approximation of marginal likelihoods from posterior draws, but only works for models with analytical likelihoods and in tandem with non-amortized MCMC.

From a Bayesian perspective, evaluating Eq.~\ref{eq:log_marg} across multiple data sets amounts to \textit{amortized bridge sampling}.
At the same time, we can use Eq.~\ref{eq:log_marg} for assessing non-convergence or problems during inference by evaluating the right-hand side for a fixed $\x$ and different $\thetab$ drawn from the approximate posterior.
Under perfect convergence, the right-hand side of Eq.~\ref{eq:log_marg} is independent of $\thetab$, so any ensuing variation is a measure of pure approximation error.

\paragraph{Surrogate Simulators}
In some modeling scenarios, the simulator might be a large-scale computer program implementing a complex generative algorithm \autocite{lavin2021simulation}.
Thus, a simulation-based inference workflow might be severely limited by the inability to obtain a large number of simulations in a reasonable time.
In such cases, an amortized surrogate simulator can generate additional data for the posterior network or a black-box optimizer \autocite{gutmann2016bayesian}.
A notable advantage of neural surrogate simulators is that they can directly emulate complex data without summary statistics (see~\autoref{fig:covid}).
In addition, they can render a non-differentiable simulator differentiable for downstream tasks, such as amortized design optimization \autocite{ivanova2021implicit}, or interact with MCMC samplers \autocite{fengler2021likelihood, boelts2022flexible}.
% \begin{figure*}[t]
%     \centering
%       \begin{subfigure}[t]{0.49\textwidth}
%               \includegraphics[width=\textwidth]{plots/benchmarks/gaussian_mixture_diff_ind_post.pdf}
%               \caption{Gaussian Mixture: Posterior Calibration}
%       \end{subfigure}
%       \begin{subfigure}[t]{0.49\textwidth}
%               \includegraphics[width=\textwidth]{plots/benchmarks/gaussian_mixture_diff_ind_joint.pdf}
%               \caption{Gaussian Mixture: Joint Calibration}
%       \end{subfigure}
%       \hfill
%       \begin{subfigure}[t]{0.49\textwidth}
%               \includegraphics[width=\textwidth]{plots/benchmarks/sir_diff_ind_post.pdf}
%               \caption{SIR: Posterior Calibration}
%       \end{subfigure}
%       \begin{subfigure}[t]{0.49\textwidth}
%               \includegraphics[width=\textwidth]{plots/benchmarks/sir_diff_ind_joint.pdf}
%               \caption{SIR: Joint Calibration}
%       \end{subfigure}
%     \caption{\textbf{Experiment 1.} Example calibration tests for 2 of the more challenging benchmarks. Panels (a) and (b) depict the good posterior and joint calibration of JANA for the Gaussian Mixture model. Panels (c) and (d) demonstrate how posterior and joint calibration can be used in tandem to detect an underperforming likelihood network with the aid of ECDF difference plots. The posterior network alone induces no systematic deviations when applied to simulator outputs (c), but it tends to overestimate the actual parameters given the outputs of the surrogate network (d).}
%     \label{fig:benchmarks}
% \end{figure*}

\section{Related Work}
\label{sec:rel_work}

% The following section presents a very brief overview of previous approaches to simulation-based inference and surrogate modeling, with a particular focus on neural methods, since the latter are most relevant to our work. 
% It starts with non-amortized methods for SBI and then peruses the literature on amortized neural posterior and likelihood estimation \autocite[for thorough reviews of contemporary approaches to SBI and SM, see][]{cranmer2020frontier, lavin2021simulation}.

\paragraph{Approximate Bayesian Computation} An established approach to SBI is embodied by approximate Bayesian computation \autocite[ABC;][]{marin2012approximate, sisson2018handbook}. 
ABC is a family of algorithms where the simplest one, ``ABC rejection'', generates draws from an approximate posterior by repeatedly proposing parameters from the prior distribution, and then simulating a corresponding synthetic data set by running the simulator with the proposed parameters.
% If a simulated data set is sufficiently similar to observed data (as measured by a user-defined distance function or a kernel function), the corresponding proposed parameter is retained as a random draw from the approximate posterior, and is otherwise rejected.  
More sophisticated ABC samplers are Sequential Monte Carlo \autocite[ABC-SMC;][]{beaumont2009adaptive,smc,del2012adaptive,picchini2022guided} and Markov chain Monte Carlo ABC \autocite[ABC-MCMC;][]{marjoram2003markov,picchini2014inference}. 
In ABC, raw data are typically reduced via summary functions.
However, \textit{hand-crafted} summary statistics are often insufficient, which results in a loss of information about the parameters \autocite{marin2018likelihood}.
Recent work has used neural networks to learn informative summary statistics of model parameters in ABC \autocite{jiang2017learning, wiqvist2019partially,chen2020neural}. 

\paragraph{Synthetic Likelihoods and Particle MCMC} Despite being intuitive to grasp and use, the above ABC methods are notoriously inefficient, typically requiring millions of model simulations, which can be prohibitive for expensive simulators. 
Another established SBI alternative, also based on data-reduction via summary statistics, is \textit{synthetic likelihood} \autocite{wood2010statistical,price2018bayesian}, which is more suitable for high-dimensional summary statistics. 
Since synthetic likelihood is typically implemented in tandem with an MCMC sampler where multiple data sets are simulated at each proposed $\thetab$, it can also be computationally intensive. 
Particle MCMC \autocite{andrieu2010particle} is a simulation-based method for exact Bayesian inference which has found considerable success, especially for state-space models. 
However, particle MCMC could be infeasible when multiple inference runs are required to separately fit several different data sets.

\begin{figure}[t]
    \centering
    \begin{minipage}[c]{0.05\linewidth}
    \,
    \end{minipage}
    \begin{minipage}[c]{0.46\linewidth}
        \centering Posterior Calibration
    \end{minipage}
    \begin{minipage}[c]{0.46\linewidth}
        \centering \quad Joint Calibration
    \end{minipage}\\
    \begin{minipage}[c]{0.05\linewidth}
        %\adjustbox{minipage=3cm,rotate=90}{Gaussian Mixture}
        \rotatebox{90}{Gaussian Mixture}
    \end{minipage}
    \begin{minipage}[c]{0.46\linewidth}
        \includegraphics[width=\linewidth]{plots/benchmarks/gaussian_mixture_diff_ind_post.pdf}
    \end{minipage}
    \begin{minipage}[c]{0.46\linewidth}
        \includegraphics[width=\linewidth]{plots/benchmarks/gaussian_mixture_diff_ind_joint.pdf}
    \end{minipage}\\
    \begin{minipage}[c]{0.05\linewidth}
        \rotatebox{90}{SIR}
    \end{minipage}
    \begin{minipage}[c]{0.46\linewidth}
        \includegraphics[width=\linewidth]{plots/benchmarks/sir_diff_ind_post.pdf}
    \end{minipage}
    \begin{minipage}[c]{0.46\linewidth}
        \includegraphics[width=\linewidth]{plots/benchmarks/sir_diff_ind_joint.pdf}
    \end{minipage}
    \caption{\textbf{Experiment 1.} Example calibration tests for 2 of the more challenging benchmarks. \textit{Top row:} Good posterior and joint calibration of JANA for the Gaussian mixture model. \textit{Bottom row:} Posterior and joint calibration can be used in tandem to detect an underperforming likelihood network in the SIR model. The posterior network alone induces no systematic deviations when applied to simulator outputs (bottom left), but overestimates the parameters given the outputs of the surrogate network (bottom right).}
    \label{fig:benchmarks}
\end{figure}

\paragraph{Neural Posterior Estimation}

Methods for neural posterior estimation either specialize a neural approximator for inference on a single observation\footnote{The term \textit{observation} may refer to an entire data set, depending on how the data is used to update the posterior. \new{For instance, typical toy models (e.g., two moons) use a single data point, whereas realistic model applications typically use a set of data points for inference.}} \autocite{papamakarios2016fast, lueckmann2017flexible, greenberg2019automatic, durkan2020contrastive, deistler2022truncated}, or inference across arbitrarily many observations \autocite{ardizzone2018analyzing, gonccalves2020training, radev2020bayesflow, pacchiardi2022score, avecilla2022neural}.
The former methods perform \textit{sequential estimation} by iteratively refining the prior to generate simulations in the vicinity of the observation.
Thus, they are \textit{not amortized}, as each new observation necessitates a costly re-training of the neural approximator.
In contrast, the latter methods can perform \textit{amortized inference}, as the neural approximator is trained to generalize over the entire prior predictive distribution and can be queried for any observation assumed to arise from the Bayesian model.
Importantly, amortization can be performed over any aspect of the model, including data sets \autocite{gonccalves2020training} or other contextual factors, such as the number of observations in a data set or the number of time points in a time series \autocite{radev2020bayesflow}.

\paragraph{Neural Likelihood Estimation}
A related family of neural methods directly targets the intractable likelihood function instead of the posterior \autocite{papamakarios2019sequential, lueckmann2019likelihood, hermans2020likelihood, fengler2021likelihood, boelts2022flexible, munk2022probabilistic}.
The endpoint of these methods is an \textit{amortized likelihood approximator} which can mimic a complex simulator or be used in tandem with non-amortized MCMC samplers for posterior estimation.
The latter can be prohibitively time-consuming, since it not only requires expensive simulation-based training, but also integrating likelihood approximators into MCMC.
This makes validating the posteriors \autocite[e.g., via simulation-based calibration; SBC;][]{talts2018validating, sailynoja2022graphical} challenging or even impossible in practice.
Nevertheless, likelihood approximators have certain advantages over posterior approximators, for instance, they do not need to be retrained for different priors and can emulate the behavior of large-scale simulators \autocite{lavin2021simulation}.

\paragraph{Neural Posterior and Likelihood Estimation}

\new{In a pioneering work, \textcite{wiqvist2021sequential} attempt to embody the best of both worlds by jointly training two networks for sequential neural posterior and likelihood approximation (SNPLA).
A potential shortcoming of SNPLA is that it optimizes the reverse Kullback-Leibler (rKL) divergence, which is prone to mode collapse and instabilities \autocite{arjovsky2017wasserstein}.
Sequential neural variational inference \autocite[SNVI;][]{glockler2022snvi} improves on SNPLA by targeting the forward KL (fKL) and using an importance sampling correction of the posterior estimates.
JANA also optimizes the mode-covering fKL by approximating an expectation over the Bayesian joint model (Eq.~\ref{eq:cost}).
In addition, JANA operates in a fully amortized manner, such that the posterior network can be applied to any set of observations (i.e., data sets; potentially with different sizes) and the likelihood network can produce instantaneous surrogate simulations given any parameter configuration.
This enables us to amortize some of the most costly procedures in Bayesian analysis, such as simulation-based calibration and leave-one-out cross-validation. 
In contrast, both SNPLA and SNVI focus on sequential (non-amortized) inference and employ a likelihood network only to support posterior estimation.
}

% “A recent other work has addressed joint posterior and surrogate modelling, with a similar loss to the authors [1]. This work seems relevant to discuss. The authors should elaborate how this work is different from theirs.”

% Thank you for pointing us to this interesting paper, which has escaped our attention. Our method deviates from the referenced method in [1] quite fundamentally, among others, in the following ways:

% We learn not only a direct amortized surrogate for the (intractable) simulator, but also an amortized surrogate for the (intractable) posterior;
% We use the two amortized surrogates in tandem to perform log marginal likelihood estimation and Bayesian cross validation;
% We propose a diagnostic for testing the consistency of the networks in the closed-world setting;

% From this perspective, JANA is fully compatible with the framework proposed in [1], since a PSN network which has been trained to emulate the stochastic traces of a probabilistic program can be used in place of our direct probabilistic approximation of the implicit likelihood. Indeed, we believe this to be an intriguing avenue for future research and we will discuss the related paper along with the general idea of inference compilation (IC) in the camera-ready version of our manuscript.


\section{Experiments}
\label{sec:experiment}

In the following, we will illustrate the utility of JANA in thirteen Bayesian models across five experiments.
For \textbf{Experiments 1--3}, we train the networks without the Maximum Mean Discrepancy (MMD) criterion in Eq.~\ref{eq:cost} (i.e., $\lambda = 0$), because our validations feature no model misspecification.
The code for running and reproducing all experiments is available at \url{https://github.com/bayesflow-org/JANA-Paper}.
JANA is implemented in the BayesFlow library.

\subsection{Ten Benchmark Experiments}
\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{plots/two_moons_1/tm_posterior_seed1.pdf} %_7
    \caption{\textbf{Experiment 2}. Samples from the approximate posterior distribution (Two Moons, repetition \#1). \new{No evident advantage of non-amortized over amortized neural methods (i.e., NPE-C and JANA).}}
    \label{fig:two-moons:posterior}
\end{figure}

\paragraph{Setup} 
This experiment demonstrates the fidelity of our proposed architecture as well as the utility of our calibration checks to diagnose approximation faults on a set of ten benchmark simulation models proposed by \textcite{lueckmann2021benchmarking}.
Since these benchmarks were originally designed for gauging the performance of (non-amortized) posterior estimation, we deviate from the original problem setting by (i) approximating both posterior and likelihood; and (ii) validating our results on a much larger held-out set of $1\,000$ simulations (as compared to just 10).

For each benchmark, we train our networks with a fixed budget of $10\,000$ simulations, as we consider this to be a challenging practical setup with low-to-medium training data availability.
Importantly, our goal here is \textit{not to propose a better method for posterior estimation}, but to demonstrate the feasibility of joint amortization and the utility of our joint calibration diagnostic on a set of popular and rather diverse models.
See the \textbf{Appendix} and the accompanying code for more details and diagnostics.

\begin{figure}[t]
   \centering
    \begin{subfigure}[t]{0.62\linewidth}
    \includegraphics[width=\linewidth]{plots/two_moons_1/tm_boxplot_posterior_nsim10000.pdf}
        \caption{Posterior MMD}
        \label{fig:two-moons:boxplots:posterior}
    \end{subfigure}
    \hfill
    \begin{subfigure}[t]{0.37\linewidth}
        \includegraphics[width=\linewidth]{plots/two_moons_1/tm_boxplot_posterior_predictive_nsim10000.pdf}
        \caption{Joint MMD}
        \label{fig:two-moons:boxplots:posterior-predictive}
    \end{subfigure}
    \caption{\textbf{Experiment 2}. Performance with $N{=}10\,000$ training simulations, as indexed by the empirical Maximum Mean Discrepancy (MMD) estimate (lower is better).}
    \label{fig:two-moons:boxplots}
\end{figure}

\begin{figure*}[t]
    \centering
    \begin{subfigure}[t]{0.22\linewidth}
        \includegraphics[width=\linewidth]{plots/diffusion_model/Likelihood_comparison_01.pdf}
        \caption{Likelihood emulation}
        \label{fig:diffusion-model:likelihood}
    \end{subfigure}
    \hspace*{0.25cm}
    \begin{subfigure}[t]{0.22\linewidth}
        \includegraphics[width=\linewidth]{plots/diffusion_model/ECDF_Joint_Stacked.pdf}
        \caption{Joint calibration}
        \label{fig:diffusion-model:calibration}
    \end{subfigure}
    \hspace*{0.25cm}
    \begin{subfigure}[t]{0.22\linewidth}
        \includegraphics[width=\linewidth]{plots/diffusion_model/LML_comparison.pdf}
        \caption{Prior predictive (LML)}
        \label{fig:diffusion-model:lml}
    \end{subfigure}
    \hspace*{0.25cm}
    \begin{subfigure}[t]{0.22\linewidth}
    \includegraphics[width=\linewidth]{plots/diffusion_model/LOO_PSIS-LOO_comparison.pdf}
        \caption{Post. predictive (ELPD)}
        \label{fig:diffusion-model:loo}
    \end{subfigure}
    \caption{\textbf{Experiment 3.}
    The true and synthetic likelihood align almost perfectly (\subref{fig:diffusion-model:likelihood}).
    The joint approximation of all parameters is well calibrated (\subref{fig:diffusion-model:calibration}).
    Both prior predictive (\subref{fig:diffusion-model:lml}) and posterior predictive (\subref{fig:diffusion-model:loo}) estimates of JANA closely approximate those obtained via gold standard bridge sampling and Pareto smoothed importance sampling (PSIS). Each point in (\subref{fig:diffusion-model:lml}) and (\subref{fig:diffusion-model:loo}) represents one out of 100 held-out simulations.}
\end{figure*}

\paragraph{Results} 
Overall, we observe stable training and good calibration across the ten benchmark models, with the SIR model exhibiting systematic joint miscalibration due to likelihood approximation errors.
\autoref{fig:benchmarks} illustrates the utility of our calibration diagnostic to reveal both good calibration (i.e., ECDF trajectories completely contained in the confidence ellipse for the Gaussian Mixture benchmark) as well as systematic deviations owing to the likelihood network (i.e., ECDF trajectories partially outside the confidence ellipse for SIR).
Moreover, due to the inherent interpretability of the ECDF calibration plots, we can pinpoint the reasons for joint miscalibration of the SIR model:
The likelihood network tends to generate more rapid synthetic outbreaks than the actual model, which leads to the posterior network overestimating the parameters of surrogate simulations.
\subsection{Two Moons: Method Comparison}
\paragraph{Setup}
Here, we focus specifically on the Two Moons benchmark \autocite{greenberg2019automatic, lueckmann2021benchmarking} and use the code from \textcite{wiqvist2021sequential} to compare JANA with the popular sequential methods SNL \autocite{papamakarios2019sequential}, SNPE-C \autocite{greenberg2019automatic}, SNRE-B \autocite{durkan2020contrastive}, SNPLA \autocite{wiqvist2021sequential}, SNVI \autocite{glockler2022snvi}, and a recent ABC-SMC algorithm with ``guided particles'' \autocite[here abbreviated with g-SMC, which is the method called ``hybrid'' in ][]{picchini2022guided}.
The model is characterized by a bimodal posterior with two separated crescent moons for the observed point $\x_{\text{new}} = (0, 0)^\top$ which a posterior approximator needs to recover.
We train SNL, SNPE-C, SNRE-B, SNVI, SNPLA, g-SMC, and JANA following the same setup from \textcite{wiqvist2021sequential}.\footnote{For comparability with \textcite{wiqvist2021sequential}, the setup differs from \textcite{lueckmann2021benchmarking} in terms of location and size of the moons. The results of \textbf{Experiment 2} with the implementation of \textcite{lueckmann2021benchmarking} are comparable, see the \textbf{Appendix}.}
For each method, we repeat the experiment ten times using a fixed budget of $2\,000$, $6\,000$, and $10\,000$ simulations and subsequently obtain $1\,000$ posterior draws from the converged methods.
For a numerical evaluation, we compute the MMD between the approximate and analytical distributions.

%TODO - Update results
\paragraph{Results}
JANA consistently explores both crescent moons throughout all repetitions and already captures the local patterns of the posterior after $2\,000$ training samples (see \autoref{fig:two-moons:posterior}).
\new{With respect to posterior performance, JANA is on par with all sequential methods which are tailored to one observed data set (see \autoref{fig:two-moons:boxplots:posterior}).
In terms of joint (posterior predictive) performance, JANA outperforms non-amortized sequential methods, see \autoref{fig:two-moons:boxplots:posterior-predictive}.}
In light of these and previous results, \textit{amortization across data sets} seems to be a reasonable choice even with limited simulation budgets, especially since sequential (non-amortized) methods may be infeasible for large data \autocite{hermans2021}.
\new{The \textbf{Appendix} contains wall-clock times and further details for training and inference.}

%The mean (SD) MMD after training on $N=10\,000$ training samples for SNL, SNPLA, and our amortized posterior and likelihood approximator across the ten repetitions of the experiment follow as (lower is better): SNL 0.xxx (0.xxx), SNPLA 0.xxx (0.xxx), Ours 0.xxx (0.xxx).

% \begin{figure}[t]
%     \centering
%     \includegraphics[width=0.85\linewidth]{plots/diffusion_model/ECDF_Joint_Stacked.pdf}
%     \caption{\textbf{Experiment 3}. Well-calibrated joint approximation of all parameters of the exchangeable diffusion model.}
%     \label{fig:ddm_cal}
% \end{figure}

% \begin{figure}[t]
%     \centering
%     \begin{subfigure}[t]{0.49\linewidth}
%     \includegraphics[width=\linewidth]{plots/diffusion_model/LML_comparison.pdf}
%     \caption{LML}
%     \label{fig:diffusion-model:lml}
%     \end{subfigure}
%     \begin{subfigure}[t]{0.49\linewidth}
%     \includegraphics[width=\linewidth]{plots/diffusion_model/LOO_PSIS-LOO_comparison.pdf}
%     \caption{ELPD}
%     \label{fig:diffusion-model:loo}
%     \end{subfigure}
%     \caption{\textbf{Experiment 3}. Each point represents one of 100 held-out simulation. Both the LML (a) and LOO ELPD (b) estimates of JANA closely approximate those obtained via bridge sampling and PSIS-LOO, respectively.}
%     \label{fig:ddm_ml_loo}
% \end{figure}
\subsection{Exchangeable Diffusion Model}
\paragraph{Setup} 
This example demonstrates amortized log marginal likelihood (LML) and expected log predictive density (ELPD) estimation based on a mechanistic model of decision making: the diffusion model \autocite{ratcliff2008}.
We benchmark our results against state-of-the-art likelihood-based methods.
First, we compare our marginal likelihood estimates with those obtained with bridge sampling \autocite{gronau2017bridgesampling}.
Second, we compare our leave-one-out (LOO)-ELPD estimates (Eq.~\ref{eq:elpd}) with those obtained using Pareto smoothed importance sampling \autocite{vehtari2017practical}.
Both methods use random draws obtained via MCMC, as implemented in Stan \autocite{carpenter2017stan}.

% Here, we use random draws obtained via Stan \autocite{carpenter2017stan}, which we consider the current gold-standard for likelihood-based Bayesian inference.
% We use \texttt{fast-dm} \autocite{voss_assessing_2015} via the \texttt{rtdists} \autocite{singmann2022rtdists} R-package to simulate data sets for training and evaluation.
% We use 200 held-out simulations for the benchmark results and 5000 held-out simulations for JSBC.
% The DM is an ample example of a complex exchangeable model, which has been previously tackled in the context of both posterior estimation \autocite{radev2020bayesflow} or likelihood estimation \autocite{fengler2021likelihood}.

\paragraph{Results}
Our results indicate well-calibrated joint approximation (see~\autoref{fig:diffusion-model:calibration}) as well as accurate posterior and likelihood estimation (see~\autoref{fig:diffusion-model:lml} and \ref{fig:diffusion-model:loo}).
For the approximation of marginal likelihoods, we first perform amortized posterior sampling on the 100 held-out data sets. 
% Since we use bounded uniform priors, we reject samples with $\prior = 0$. 
We then evaluate the approximate likelihood on these samples, and finally apply Eq.~\ref{eq:log_marg} to compute the LML.
Our numerical results reveal a very close correspondence between our neural log marginal likelihoods and those obtained via MCMC-based bridge sampling (see \autoref{fig:diffusion-model:lml}). 
Furthermore, our amortized LOO-CV estimates align very closely with the estimates obtained via PSIS-LOO (see \autoref{fig:diffusion-model:loo}).

\paragraph{MCMC Integration}
Surrogate likelihoods provide all information that is needed for MCMC sampling. 
We provide an interface to PyMC \autocite{salvatier_probabilistic_2016} to allow for easy model building and use of existing samplers. 
Note that the performance of gradient-based samplers, such as Hamiltonian Monte Carlo, critically depends on the precision of partial log-likelihood derivatives.
Using PyMC's No-U-Turn sampler (NUTS) with our neural likelihood, we obtained results similar to those using Stan. 
If gradient-based sampling methods fail, we advise using gradient-free sampling methods, such as slice sampling.
For detailed information, see the \textbf{Appendix}.

\begin{figure*}[h]
    \centering
    \includegraphics[width=0.99\textwidth]{plots/denoising/main.pdf} %_7
    \caption{\textbf{Experiment 5}. Example denoising results from each class of Fashion MNIST. \textit{First row:} Original image acting as the ``parameters'' of the noisy camera simulator. \textit{Second row:} Blurred image, acting as the output of the camera simulator. \textit{Third and fourth rows:} Means and standard deviations of the posteriors estimated from the corresponding blurry ``observations''. \textit{Note:} For standard deviations, darker regions indicate larger variability in the outputs.}
    \label{fig:denoising}
\end{figure*}

\subsection{Markovian Compartmental Model}

\paragraph{Setup} This experiment demonstrates surrogate simulations of a complex non-exchangeable model of infectious diseases.
The model features 34 parameters and thus represents a considerable extension of the two-parameter toy SIR model \autocite{lueckmann2021benchmarking, radev2020bayesflow}.
We use the model specification and posterior network from \textcite{radev2021outbreakflow}.
We implement the likelihood network as a recurrent cINN (see Section~\ref{sec:lik_net}) to test its ability to emulate raw and noisy time series.
Further, we train the summary network with the MMD criterion (Eq.~\ref{eq:cost}) with $\lambda = 1$ to judge the quality of the surrogate simulations numerically.

\paragraph{Results} Upon convergence, we use the likelihood network to generate synthetic outbreak trajectories and compare them visually with the outputs of the original simulator.
We observe good emulation across a variety of different parameter configurations, each leading to a qualitatively different simulated scenario (see~\autoref{fig:covid} for an example and the \textbf{Appendix} for detailed results).
Moreover, it seems that the surrogate network is not only able to accurately approximate the median trajectory, but also the variability (i.e., \textit{aleatoric uncertainty}) in simulated trajectories.
 
Beyond purely visual comparisons, we also compute the posterior and joint calibration of the two networks using joint SBC on $1\,000$ held-out simulations.
We confirm the good posterior calibration observed by \textcite{radev2021outbreakflow}.
In addition, the joint calibration results help us highlight some subtle deficiencies of the likelihood network.
For instance, it tends to overestimate the variability of simulated time series, thus ``tricking'' the posterior network into estimating higher values for the noise parameters (see \textbf{Appendix}).
We attribute this deficiency to the extremely wide magnitude range of the simulated data (incidence on the order of millions) which is not captured by our simple input standardization procedure.

\subsection{High-Dimensional Bayesian Denoising}
\paragraph{Setup} The last experiment demonstrates the feasibility of JANA for tackling high-dimensional Bayesian models with relatively low simulation budgets. Similarly to \textcite{ramesh2022gatsbi}, we consider a Bayesian denoising setup on the Fashion MNIST data set, where the ``parameter vector'' $\thetab \in \mathbb{R}^{784}$ represents the original image and the ``observation'' $\x \in \mathbb{R}^{784}$ is a blurry version of the image generated by a simulated noisy camera. 

We train a JANA architecture comprising two fully connected affine coupling architectures operating on the flattened images (as they would, if the Bayesian model were a scientific simulator with $784$ parameters). 
Since both ``parameters'' and ``data'' in this unusual example are images, we use two simple convolutional networks as summary networks for both the posterior and likelihood networks. 

\paragraph{Results} We evaluate the performance of the networks on the official Fashion MNIST test set. 
To summarize their calibration, we report the average expected calibration error \autocite{radev2020bayesflow} for the posterior ($\approx 0.03 \pm 0.02$) and joint samples ($\approx 0.04 \pm 0.03$), indicating reasonable approximation fidelity and slightly increased joint miscalibration.
We also inspect the visual quality of random samples generated from the posterior and the synthetic likelihood (see~\autoref{fig:denoising} for an example of posterior estimation). 
These results suggest that the networks have captured the basic structure of the problem, with ``core features'' being easier to reconstruct than ``details''.
An extended description and more results are provided in the \textbf{Appendix}.

\section{Conclusion}
\label{sec:conclusion}

We investigated the utility of JANA for Bayesian surrogate modeling and simulation-based inference within the BayesFlow framework.
% Amortization across data sets allows users to perform efficient validation and calibration diagnostics, which are typically infeasible with non-amortized methods.
% Moreover, the combination of the two networks enables amortized computation of key Bayesian quantities (i.e., marginal likelihood and log-predictive densities), which present a major source of intractability in traditional Bayesian workflows.
We believe that JANA can greatly enrich applications of amortized Bayesian inference.
Future work should investigate weight sharing schemes for the various network components and advance a framework-independent benchmark database for joint estimation of non-trivial scientific models.

\clearpage
\begin{acknowledgements}
    We thank Samuel Wiqvist for the fruitful discussions and his help with running the SNPLA experiments.
    We thank Manuel Gloeckler for the forthcoming assistance with the SNVI implementation of Experiment 2.
    We thank the reviewers for their thought-provoking feedback which has enabled us to improve the manuscript considerably.
    STR was supported by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany’s Excellence Strategy -- EXC-2181 - 390900948 (the Heidelberg Cluster of Excellence STRUCTURES) and Google Cloud through the Academic Research Grants program.
    MS was supported by the Cyber Valley Research Fund (grant number: CyVy-RF-2021-16).
    MS and PCB were supported by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under Germany’s Excellence Strategy -- EXC-2075 - 390740016 (the Stuttgart Cluster of Excellence SimTech).
    VP was supported by the state of Baden-Württemberg through bwHPC and the German Research Foundation (DFG) through grant INST 35/1597-1 FUGG. 
    UP was supported by the Swedish National Research Council (Vetenskapsrådet 2019-03924) and the Chalmers AI Research Centre.
    UK was supported by the Informatics for Life initiative funded by the Klaus Tschira Foundation.
    The authors gratefully acknowledge the support and funding.
\end{acknowledgements}

% Conjure a magic robust page break.

% By the power of the written word,
% I call forth a new page, unfurled.
% Let the ink flow and the pages turn,
% Bringing forth a new chapter, so learn.

% Page break, page break,
% Let the next story take.
% Appear before me, with magic's grace,
% A new page for a new space.


% That does the trick :)
% References
%\bibliography{uai2023-template}
\subsubsection*{References}
\printbibliography[heading=none]
\end{document}
