\onecolumn

\title{Expectation Programming: Adapting Probabilistic Programming Systems to Estimate Expectations Efficiently (Supplementary Material)}
\maketitle


\appendix

\section{Annealed Importance Sampling}
\label{apd:anis}

Annealed importance sampling (AnIS) \citepsupp{neal1998Annealed} is an inference algorithm which was
developed with the goal of efficiently estimating the normalization constant $Z$ of an 
unnormalized density $\gamma(x)$. It works by defining a sequence of annealing 
distributions $\pi_0(x), \dots, \pi_n(x)$ which interpolate between a simple 
base distribution $\pi_0(x)$ (typically the prior 
for a Bayesian model) and the 
complex target density $\pi_n(x) \propto \gamma(x)$. The most common scheme 
is to take
\begin{equation}
\pi_i(x) \propto \lambda_i(x) = \pi_0(x)^{1-\beta_i} \gamma(x)^{\beta_i},
\end{equation}
with $0\!=\!\beta_0\!<\!\dots\!<\!\beta_n\!=\!1$.
The algorithm further requires the definition of Markov chain transition 
kernels $\tau_1(x, x'), \dots, \tau_{n-1}(x, x')$ and proceeds to generate 
the $j^{\text{th}}$ weighted sample as follows.
First, sample an initial particle $x^{(1)}_j \sim \pi_0(x)$; then,
for $i = 1, \ldots, (n-1)$, generate $x^{(i+1)}_j \sim \tau_i(x^{(i)}_j, \cdot)$; and, finally, return the sample $x^{(n)}_j$ with weight
\begin{equation}
    w_j = \frac{\lambda_1(x_j^{(1)})\lambda_2(x_j^{(2)}) \dots \lambda_n(x_j^{(n)})}{\pi_0(x_j^{(1)})\lambda_1(x_j^{(2)}) \dots \lambda_{n-1}(x_j^{(n)})}
\end{equation}
      
We can estimate expectations with the weights and samples just as in importance 
sampling.
Thus we can estimate the expectation and the normalization constant as 
\begin{align*}
\begin{split}
\mathbb{E}_{\pi(x)}[f(x)] \approx \frac{\sum_{j=1}^{N} w_j f(x^{(n)}_j)}{\sum_{j=1}^{N} w_j}\quad\text{and}\quad
Z \approx \frac{1}{N} \sum_{j=1}^{N} w_j.
\end{split}
\end{align*}

\subsection{Implementation Details of Turing Inference Engine}

The implementation of our new Turing inference engine is available at \url{https://github.com/treigerm/AnnealedIS.jl}.
It is a stand-alone package that can be used completely independently from EPT and is therefore useful for any Turing user who wishes to run AnIS on their model.
Furthermore, our implementation leverages the modularity of the Turing ecosystem by using existing MCMC transition kernels from the packages \texttt{AdvancedMH.jl} \citepsupp{turingdevelopmentteam2020TuringLang}
and \texttt{AdvancedHMC.jl} \citepsupp{xu2020AdvancedHMC}.

Keeping the same notation as above, given a Turing model the AnIS inference creates Julia functions for the prior density $\pi_0(x)$ and the unnormalized density $\gamma(x)$.
The unnormalized density of the program is evaluated as described in Section~\ref{sec:turing_details} and the prior density is evaluated similarly but ignores all the `likelihood' terms $h_j(y_j \mid \phi_j)$ and all the terms added with the \ept{@addlogprob!} primitive.
Once we have Julia functions for $\pi_0(x)$ and $\gamma(x)$ it is straightforward to create a function for the intermediate targets $\lambda_i(x)$ for a given $\beta_i$.
The Julia function for the intermediate targets $\lambda_i(x)$ can then be used by one of the MCMC samplers in \texttt{AdvancedMH.jl} or \texttt{AdvancedHMC.jl} to collect samples from the intermediate distributions.

\section{Theoretical Details}
\label{apd:theory_details}

\subsection{Assumptions in Definition~\ref{def:expectation_program}}
\label{sec:apd_assumptions}

To ensure correctness most PPSs assume that a particular inference 
algorithm will converge to the distribution of $F$ (i.e. the distribution over return values).
A standard PPS Monte Carlo inference engine will now produce a sequence of samples $F_n,~n=1,2,\dots$ and consistency requires that $F_n$ converges in distribution to $F$ as $n\to\infty$.
This is equivalent to requiring that for \emph{any} integrable function $h$, $\mathbb{E}[h(F_n)]\to \mathbb{E}[h(F)]$;
and it presupposes that the distribution of $F$ is a finite measure, i.e., $\mathbb{E}[F]$ is finite.
We thus see our assumption is strictly weaker than that of standard PPSs that allow return values from programs: we only need convergence in the case where $h$ is the identity mapping, not all integrable functions.


\subsection{Proof for Theorem~\ref{thm:tabi_valid}}

% \tabi*
\begin{restatable}{theorem}{tabi}
% \label{thm:tabi_valid}
Let $\mathcal{E}$ be a valid expectation program in EPT with unnormalized density $\gamma(x_{1:n})$, defined on possible traces $x_{1:n}\in\mathcal{X}$, with return value $F=f(x_{1:n})$.
Then
$\gamma_1^+(x_{1:n}):=\gamma(x_{1:n})\max(0,f(x_{1:n}))$, $\gamma_1^-(x_{1:n}):=-\gamma(x_{1:n})\min(0,f(x_{1:n}))$, and $\gamma_2(x_{1:n}):=\gamma(x_{1:n})$ are all valid unnormalized probabilistic program densities.
Further, if $\{\hat{Z}_1^+\}_{m}$, $\{\hat{Z}_1^-\}_{m}$, $\{\hat{Z}_2\}_{m}$ are sequences of estimators for $m \in \mathbb{N}^+$ such that
\vspace{-5pt}
\begin{align*}
\{\hat{Z}_1^\pm\}_{m} &\overset{p}{\to}
\int_\mathcal{X} \gamma^\pm_1(x_{1:n}) d\mu(x_{1:n}),
\\
\{\hat{Z}_2\}_{m} &\overset{p}{\to}
\int_\mathcal{X} \gamma_2(x_{1:n}) d\mu(x_{1:n})
\end{align*}
\vspace{-15pt}

where $\overset{p}{\to}$ means convergence in probability as $m\to \infty$, then
$(\{\hat{Z}_1^+\}_{m}-\{\hat{Z}_1^-\}_{m})/\{\hat{Z}_2\}_{m} \overset{p}{\to} \mathbb{E}[F].$
\end{restatable}
\begin{proof}
We start by noting that as $\gamma_2(x_{1:n})$ is identical to $\gamma(x_{1:n})$, it is by assumption a valid unnormalized program density. 
Meanwhile, by construction, $\gamma_1^+(x_{1:n}),\gamma_1^-(x_{1:n}) \geq 0, \forall x_{1:n} \in \mathcal{X}$.
Further, each can be written in the form of~\eqref{eq:ppl_density} by taking the corresponding definition of $\gamma(x_{1:n})$ and adding in factors $\exp(\psi_{K+1})=\max(0,f(x_{1:n}))$ and $\exp(\psi_{K+1})=-\min(0,f(x_{1:n}))$ for $\gamma_1^+(x_{1:n})$ and $\gamma_1^-(x_{1:n})$ respectively.
To finish the proof that $\gamma_1^{\pm}(x_{1:n})$ are valid densities, we show that $0<Z_1^\pm<\infty$.

Starting with the standard definition of an expectation for arbitrary random variables, we can express $\mathbb{E}[F]$ as
\begin{align}
  \int_\mathcal{X} f(x_{1:n}) d\mathbb{P}(x_{1:n}) 
    &= \int_\mathcal{X} f^+(x_{1:n}) d\mathbb{P}(x_{1:n})  - \int_\mathcal{X} f^-(x_{1:n}) d\mathbb{P}(x_{1:n}). \label{eq:pos_neg_breakdown} \\
\intertext{
Noting that if $F$ is integrable then by the definition of the Lebesgue integral $\int_\mathcal{X} f^+(x_{1:n}) d\mathbb{P}(x_{1:n}) < \infty$ and $\int_\mathcal{X} f^-(x_{1:n}) d\mathbb{P}(x_{1:n}) < \infty$. Now inserting the distribution the program defines over $x_{1:n}$,}
    &= \int_\mathcal{X} f^+(x_{1:n}) \pi(x_{1:n}) d\mu(x_{1:n}) - \int_\mathcal{X} f^-(x_{1:n}) \pi(x_{1:n}) d\mu(x_{1:n}) \label{eq:dist_to_density} \\ 
\intertext{and noting that $\gamma(x_{1:n}) \geq 0$ for all $x_{1:n} \in \mathcal{X}$ and $0 < \int_\mathcal{X} \gamma(x_{1:n}) d\mu(x_{1:n}) < \infty$,}
    &= \frac{\int_\mathcal{X} f^+(x_{1:n}) \gamma(x_{1:n}) d\mu(x_{1:n}) - \int_\mathcal{X} f^-(x_{1:n}) \gamma(x_{1:n}) d\mu(x_{1:n})}{\int_\mathcal{X} \gamma(x_{1:n}) d\mu(x_{1:n})} \label{eq:unnormalized} \\
    &= \frac{\int_\mathcal{X} \gamma^+_1(x_{1:n}) d\mu(x_{1:n}) - \int_\mathcal{X} \gamma_1^-(x_{1:n}) d\mu(x_{1:n})}{\int_\mathcal{X} \gamma_2(x_{1:n}) d\mu(x_{1:n})} \label{eq:relabeling}
    =: \frac{Z^+_1 - Z^-_1}{Z_2}.
\end{align}
In our theorem statement we have assumed that $\{\hat{Z}_1^+\}_{m}\overset{p}{\to} Z_1^+$,
$\{\hat{Z}_1^-\}_{m}\overset{p}{\to} Z_1^-$, and
$\{\hat{Z}_2\}_{m}\overset{p}{\to} Z_2$, from which it now follows by Slutsky's Theorem that
\begin{align}
    \frac{\{\hat{Z}_1^+\}_{m}-\{\hat{Z}_1^-\}_{m}}{\{\hat{Z}_2\}_{m}} \overset{p}{\to} \frac{Z^+_1 - Z^-_1}{Z_2} = \mathbb{E}[F]
\end{align}
as required.
\end{proof}

\subsection{Details about Equation~(\ref{eq:ppl_density})}

Any probabilistic program defines a `density' function in the form of Equation~\eqref{eq:ppl_density}. 
This definition makes sense for a large class of programs, permitting branching on random variables, higher-order functions, recursion, stochastic memoization, and conditioning on internally sampled variables \citepsupp[\S4.3]{rainforth2017Automating}.
However, for this function to correspond to a valid unnormalized probability density we need to assume that a) the program halts with probability 1 and b) that the integral over the entire domain of $\gamma$ with respect to the implicitly defined reference measure is finite, i.e. $Z= \int_\mathcal{X} \gamma(x_{1:n}) d\mu(x_{1:n}) < \infty$ where $\mu$ is the reference measure and $\mathcal{X}$ denotes the space of valid program traces.

We further need to clarify our usage of the term `density function.'
In general, probabilistic programs denote measures (or kernels if there are free variables) \citepsupp{kozen1979semantics,staton2016semantics,borgstrom2011measure}.
When we talk about the density function of a probabilistic program, formally we are referring to the Radon-Nikodym derivative of the measure denoted by this program with respect to an appropriate reference measure, where this reference measure is itself implicitly defined by the program.

\section{Estimating Expectations in Turing}
\label{apd:turing_expectation}

\subsection{Standard approach}
\label{apd:turing_monte_carlo}


\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x} 
@model function model(y=2)
    x |$\sim$| Normal(0, 1) 
    y |$\sim$| Normal(x, 1)
end

num_samples = 1000
posterior_samples = sample(model(), NUTS(0.65), num_samples)

f(x) = x^3
posterior_x = Array(posterior_samples[:x])
expectation_estimate = mean(map(f, posterior_x))
\end{minted}

Full example of the estimation of an expectation with the Turing language. 
The user first defines the model, then conditions it on some observed data, 
computes posterior samples and then uses these samples to compute a Monte Carlo
estimate of the expectation.

\subsection{Using generated quantities function}

When we designed the API, Turing largely ignored the \ept{return} statements in 
the model definition. In the meantime Turing introduced a convenience function 
\ept{generated_quantities}. Given a model and $N$ samples it returns a list of 
the $N$ return values generated by running the program on each sample. 
Note that \ept{generated_quantities} reruns the entire \ept{model} function for 
each posterior sample to compute the return value. This means that for models 
which have an expensive likelihood computation the use of 
\ept{generated_quantities} might incur a significant overhead.

It is important to note that \ept{generated_quantities} is merely a convenience 
function and does not change how Turing interprets model definitions. In fact, 
the \ept{generated_quantities} function provides complementary functionality and 
Turing models generated with EPT can use this function without problems.

The example from Section~\ref{apd:turing_monte_carlo} can be rewritten to use 
\ept{generated_quantities}:
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x} 
@model function model(y=2)
    x |$\sim$| Normal(0, 1) 
    y |$\sim$| Normal(x, 1)
    return x^3
end

num_samples = 1000
posterior_samples = sample(model(), NUTS(0.65), num_samples)

expectation_estimate = mean(generated_quantities(model(), posterior_samples))
\end{minted}


\section{Full Example of Macro Transformation}
\label{apd:macro_transformation}

The expectation
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2,highlightlines={4}]{eptlexer.py:EPTLexer -x} 
@expectation function expt_prog(y)
    x |$\sim$| Normal(0, 1) 
    y |$\sim$| Normal(x, 1)
    return x^3
end
\end{minted}
\vspace{-8pt}
gets transformed into
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2,highlightlines={4-8,14-18,24}]{eptlexer.py:EPTLexer -x} 
@model function gamma1_plus(y)
    x |$\sim$| Normal(0, 1) 
    y |$\sim$| Normal(x, 1)
    tmp = x^3
    if _context isa Turing.DefaultContext
        @addlogprob!(log(max(tmp, 0)))
    end
    return tmp
end

@model function gamma1_minus(y)
    x |$\sim$| Normal(0, 1) 
    y |$\sim$| Normal(x, 1)
    tmp = x^3
    if _context isa Turing.DefaultContext
        @addlogprob!(log(-min(tmp, 0)))
    end
    return tmp
end

@model function gamma2(y)
    x |$\sim$| Normal(0, 1) 
    y |$\sim$| Normal(x, 1)
    return x^3
end

expt_prog = Expectation(
    gamma1_plus,
    gamma1_minus,
    gamma2
)
\end{minted}
The type \ept{Expectation} is simply used to have one common 
object which stores the three different Turing models. Notice that for 
\ept{gamma2} the function body is identical to the original function.

For \ept{gamma1_plus} and \ept{gamma1_minus} we also have to check in what 
\ept{_context} the model is executed. Turing allows the model to be executed with 
different contexts which change the model behaviour. For example, there is a 
\ept{PriorContext} which essentially ignores the tilde statements which have 
observed data on the LHS. This is useful for evaluating the prior probability of 
some parameters. However, by default the \ept{@addlogprob!} macro ignores the model 
context. As a consequence, if a Turing model includes an \ept{@addlogprob!} macro 
and is executed with a \ept{PriorContext} then it no longer calculates the log
prior probability but instead the log prior probability plus whatever value was 
added with the \ept{@addlogprob!} statement. 
Since we want to use the Turing model with Annealed Importance Sampling 
we need to be able to extract the prior from our model and 
hence we need to ensure that we do not call \ept{@addlogprob!} when executed in a \ept{PriorContext}. 
This is what the added \ept{if} clause ensures.

\section{Different Estimators for $Z_1^+$, $Z_1^-$ and $Z_2$}
\label{apd:positive_target_function}


The target function $f(x)=x^2$ in the following expectation is always positive:
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x} 
@expectation function expt_prog(y)
    x |$\sim$| Normal(0, 1)
    y |$\sim$| Normal(x, 1)
    return x^2
end
\end{minted}
\vspace{-8pt}
Therefore, we already know that $Z_1^-=0$,
so it would be wasteful to spend computational resources on estimating $Z_1^-$. EPT
allows users to specify the marginal likelihood estimator for each of the terms in TABI
separately which means if the user knows that the target function is always 
positive they can specify that 0 samples should be used to estimate $Z_1^-$:
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x} 
expct_estimate, diagnostics = estimate_expectation(
    expt_prog(2), TABI(
        TuringAlgorithm(AnIS(), num_samples=1000), # $Z_1^+$
        TuringAlgorithm(AnIS(), num_samples=0),    # $Z_1^-$
        TuringAlgorithm(AnIS(), num_samples=1000)  # $Z_2$
))
\end{minted}
\vspace{-8pt}
It is easy to see how this can be adapted to the case in which we have $Z_1^+=0$.
This interface is not just useful for avoiding unnecessary computation; in some 
cases the user might also want to have different marginal likelihood estimators 
for each term. 
This allows the user to further tailor the inference algorithm for 
the given target function $f(x)$.

\section{Hyperparameters for Experiments}
\label{apd:exp_hyperparams}

EPT
runs standard annealed importance sampling twice: one time to estimate $Z^{+}_1$
and the other time to estimate $Z_2$. For each of the problems 
we always use the same hyperparameters for the annealed importance sampling 
algorithm both to run AnIS and for the two estimates in EPT.

\subsection{Posterior Predictive}

For the annealed importance sampling, we use a MH transition kernel with an 
isotropic Gaussian with covariance $0.5I$ as a proposal and $5$ MH steps on each 
annealing distribution. We use $100$ uniformly spaced annealing distributions.
For the MCMC, we collect $5\cdot10^7$ samples in total. To parallelise sampling 
we run $5\cdot10^3$ chains with $10^4$ samples each in parallel, 
discarding the first $10^3$ samples as burn-in. We use a MH transition kernel with 
standard normal proposal.

\subsection{SIR Model}
\label{apd:sir_hyperparamas}

For the annealed importance sampling estimators we use HMC 
transition kernels with a step size of 0.05, 10 leapfrog steps and 10 MCMC steps 
on each annealing distribution. We use 100 geometrically spaced annealing 
distributions.

For the MCMC model we collect $10^6$ samples in total with Turing's 
implementation of NUTS and a target acceptance rate of 65\%.\footnote{\url{https://turing.ml/dev/docs/library/\#Turing.Inference.NUTS}} 
We parallelise sampling over $10^2$ chains with $10^4$ samples and discard the
first $10^3$ samples as burn-in.

The ground truth is computed using importance sampling with $10^8$ samples and 
the prior as a proposal distribution. See Equation~\eqref{eq:bayesian_sir}
for the full SIR model including the priors.
The observed data was generated from the model described in \eqref{eq:bayesian_sir}
with $\beta = 0.25$, $I_0 = 100$, $N=10^4$ and $\phi = 10$ as the overdispersion parameter 
of the SIR model. We generate data for 15 time steps.

\subsection{Radon model}

We run EPT and AnIS with 200 intermediate distributions and one step of the 
dynamic HMC transition kernel \citepsupp{betancourt2018conceptual,hoffman2014TheNS} on 
each intermediate distribution with a step size of $0.044$. The step size was 
informed by running adaptive MCMC on the target distribution.

\section{SIR Experiment}
\label{apd:sir_exp}

We assume we are given data in the form of observations $y_i$, 
the number of observed newly infected people on day $i$. Fixing $\gamma=0.25$, this gives us the statistical model
\begin{subequations}
\label{eq:bayesian_sir}
\begin{align}
    \beta &\sim \text{TruncatedNormal}(2, 1.5^2, [0, \infty]),
    &I_0 &\sim \text{TruncatedNormal}(100, 100^2, [0, 10000]), \\
    S_0 &= 10000 - I_0, &R_0 &= 0,\\
    \mathbf{x} &= \texttt{ODESolve}(\beta, \gamma, S_0, I_0, R_0), 
    &y_i &\sim \text{NegativeBinomial}(\mu=x_i, \phi=0.5).
\end{align}
\end{subequations}
Here $\texttt{ODESolve}$ indicates a call to a numerical ODE solver 
which solves the set of equations~\eqref{eq:sir_diffeq}. It outputs $x_i$,
the predicted number of newly infected people on day $i$. We assume the observation process is noisy and model it using a negative binomial distribution, which is parametrised by a mean $\mu$ and 
an overdispersion coefficient $\phi$. 
For an in-depth discussion about doing Bayesian parameter inference in the SIR model we 
refer the reader to the case study of \citet{grinsztajn2020Bayesian}.

We are further given a cost function in terms of $R_0$, $\text{cost}(R_0) = 10^{12} \cdot \text{logistic}(10R_0 - 30)$.
Intuitively, the cost initially increases exponentially with $R_0$. However, 
the total cost also saturates for very large $R_0$ (as the entire population becomes infected).

\section{Hierarchical Radon Model}
\label{apd:radon_exp}

The data for this problem was taken from: \url{https://github.com/pymc-devs/pymc-examples/blob/main/examples/data/radon.csv} (the repository uses an MIT license; the data contains no personally identifiable information).
The original data contains information about houses in 85 counties.
In order to make estimating normalization constants more tractable we reduce the 
number of counties to 20. 

Our target function is a function of predicted radon levels $y_i$ for a typical 
house with a basement (i.e. $x_i = 0$) in county $i$; $y_i$ is calculated using 
the predictive equation given in~\eqref{eq:radon_pred}.
We apply the function 
\begin{align*}
    f(y_i) = \frac{1}{1 + \exp(5 (y_i - 4))}
\end{align*}
to all the predicted radon levels and then take the product of all the $f(y_i)$.
Finally, to avoid floating point underflow we set a minimum value of $1\mathrm{e}{-200}$.

\section{Multiple Expectations and Restrictions on $f(\cdot)$}
\label{apd:multiple_expectations}

The user is not restricted to defining only one expectation per model. 
By specifying multiple return values the user can specify multiple expectations.
The \ept{@expectation} macro can recognise multiple return values and generates 
an expectation for each of them.
The user can then estimate each expectation independently using \ept{estimate_expectation}:
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x} 
@expectation function expt_prog(y)
    x |$\sim$| Normal(0, 1)
    y |$\sim$| Normal(x, 1)
    return x, x^2, x^3
end
y_observed = 3
expt_prog1, expt_prog2, expt_prog3 = expt_prog
expct1 = expt_prog1(y_observed)
expct1_estimate, diagnostics = estimate_expectation(
  expct1, method=TABI(marginal_likelihood_estimator=TuringAlgorithm(
    AnIS(), num_samples=1000)))
\end{minted}

\section{Posterior Predictive Model in EPT}
\label{apd:post_pred_ept}

The expectation from Section~\ref{sec:exp_post_pre} can be defined 
in just 5 lines of code with EPT:
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x} 
@expectation function expt_prog(y)
    x |$\sim$| MvNormal(zeros(length(y)), I)       # $\mathbf{x} \sim \mathcal{N}(\mathbf{x}; 0, I)$
    y |$\sim$| MvNormal(x, I)                      # $\mathbf{y} \sim \mathcal{N}(\mathbf{y}; \mathbf{x}, I)$
    return pdf(MvNormal(x, 0.5*I), -y)     |$\phantom{\sim}$|# $f(\mathbf{x}) = \mathcal{N}(-\mathbf{y}; \mathbf{x}, \frac{1}{2}I)$
end
\end{minted}

\section{Syntax Design}
\label{apd:syntax}

Prior works have considered two families of syntax design corresponding to the semantics required by EPT.
\citet{gordon2014Probabilistic} define the semantics for expectation computation 
via the syntax of a probabilistic program's return expression, which is the approach we adopted in the design of EPT.
\citet{zinkov2017Composinga} take a different route and define the expectation semantics
via the use of syntax \mintinline{eptlexer.py:EPTLexer -x}{expect(m, f)} 
where \mintinline{eptlexer.py:EPTLexer -x}{m} is the program defining a measure and 
\mintinline{eptlexer.py:EPTLexer -x}{f} is the target function.

While designing the interface of EPT we considered two different designs for 
defining the target function: either letting users specify the target function 
implicitly through the return values of the function or allowing users to 
specify a target function \ept{f} externally. The external function could then 
be passed to the \ept{estimate_expectation} function explicitly.

For EPT, we decided to adopt the former of the two designs mainly due to the simplicity of the resulting user interface and implementation.
In particular, it allows for simple-to-execute program transformations of 
the \mintinline{eptlexer.py:EPTLexer -x}{@expectation} macro 
into valid Turing programs to represent the individual densities, 
and thus the ability to use native Turing inference algorithms. 
Adopting the other approach
would additionally require designing and specifying the interface between 
the function signature \mintinline{eptlexer.py:EPTLexer -x}{f(.)} and 
the values of the named random draws performed by the model 
\mintinline{eptlexer.py:EPTLexer -x}{m}.
This would result in a more complex user-facing interface,
in exchange for the slight advantage of improved compositionality of models and functions.

\section{SIR Discussion}
\label{apd:sir_discussion}

\begin{figure*}[h!]
    \centering
    \begin{subfigure}{0.49\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/sir_joint_dist_mcmc.png}
        \vspace{-20pt}
        \caption{MCMC samples.}
        \label{fig:sir_mcmc}
    \end{subfigure}
    %\qquad
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/sir_joint_dist_mcmc_with_burn_in.png}
        \vspace{-20pt}
        \caption{MCMC samples including burn-in samples (in black).}
        \label{fig:sir_mcmc_burn_in}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/sir_anis_Z2_joint_samples.png}
        \vspace{-20pt}
        \caption{AnIS samples.}
        \label{fig:sir-Z2}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/sir_Z1_joint_samples.png}
        \vspace{-20pt}
        \caption{EPT samples for $Z_1$.}
        \label{fig:sir-Z1}
    \end{subfigure}
    % \vspace{-10pt}
    \caption{Samples from the different algorithms for the SIR model. Note that 
    for Figure~\ref{fig:sir_mcmc_burn_in} some burn-in samples lie outside the boundaries 
    of the plot but we adjusted the axis limits so that they are the same for all 
    plots to allow for easier comparison.}
	\vspace{-5pt}
    \label{fig:sir_samples}
\end{figure*}

In the SIR experiment AnIS achieved a significantly lower RSE than MCMC 
even though both are non-target-aware. 
Figure~\ref{fig:sir_samples} shows samples from the different algorithms.
The EPT samples for $Z_1$ visualise well in which regions of parameter space 
both the posterior and the target function have sufficient mass ($\beta \in [0.5, 2.0]$). 
The samples 
from AnIS and MCMC suggest that most of the posterior mass is located in the interval 
$\beta \in [0.3, 0.7]$. However, AnIS also generates a significant number of 
samples in the parameter region $\beta \in [1.0, 1.5]$. The samples in this 
second ``mode'' are directly in the region of the target-aware samples. 
Further, the plots suggest that AnIS generates more samples in this region than 
MCMC which is what allows AnIS to achieve a lower RSE. 
However, it seems that AnIS represents the second ``mode'' disproportionately.
Specifically looking at the burn-in samples from 
MCMC in Figure~\ref{fig:sir_mcmc_burn_in} shows that MCMC will converge to the 
parameter space in $\beta \in [0.3, 0.7]$ even if the initial parameter samples 
are around $\beta \in [1.0, 1.5]$. This indicates that this is not a failure of MCMC 
to detect another mode but rather that there is 
negligible posterior mass in that parameter region.
Therefore the better performance of AnIS compared to MCMC 
seems to occur mostly because AnIS got lucky by accidentally generating samples in the 
right parameter region. 

\subsection{A Note on MCMC ESS}

The SIR experiment provides a good example of how the MCMC ESS \citepsupp{vehtari2020rank}
is unreliable for our use case. As detailed in Section~\ref{apd:sir_hyperparamas}
for MCMC we run $100$ chains with $10{,}000$ samples each. This is replicated $5$ 
times to get estimates on the variability in behaviour. 
After discarding the burn-in samples for each chain the $5$ replications give 
us the following final ESS estimates: $[631{,}360; \, 805{,}868; \, 873{,}269; \, 665{,}683; \, 5{,}114]$.
We observe that all but one replication give disproportionately high ESS estimates.
We found that the replication which gives a more conservative ESS estimate of $5{,}114$ is the 
replication which generated samples in the parameter region $\beta \in [1.0, 1.5]$
(see Figure~\ref{fig:sir_mcmc}). 
More importantly, the MCMC ESS estimates do not seem to show 
any correlation with the RSE values (see Figure~\ref{fig:sir_experiment}) which 
is the more important metric because it directly measures the error in our 
estimate. Therefore, we decided against using the MCMC ESS in our evaluation 
because it can give the impression that MCMC is performing well when it is actually 
failing dramatically (in terms of RSE).

\subsection{Additional Stan MCMC Baseline}

\begin{wraptable}{r}{0.6\linewidth}
    \vspace{-2pt}
	\caption{Quantiles of the RSE for different methods (the same performance metric as Figure~\ref{fig:sir_experiment}, left); computed over 5 runs.}
    \vspace{-10pt}
	\label{tab:additional_mcmc_baseline}
	\begin{center}
		\begin{small}
			\begin{sc}
				\begin{tabular}{llll}
					\toprule
					Method & 25\% Quantile & Median & 75\% Quantile \\
					\midrule
					EPT           & $2.96\mathrm{e}{-6}$ & $8.10\mathrm{e}{-6}$ & $2.92\mathrm{e}{-4}$ \\
					AnIS          & $0.02$ & $0.13$ & $0.15$ \\
					MCMC (Turing) & $0.96$ & $0.97$ & $0.97$ \\
					MCMC (Stan)   & $1.00$ & $1.00$ & $1.00$ \\
					\bottomrule
				\end{tabular}
			\end{sc}
		\end{small}
	\end{center}
	\vspace{-10pt}
\end{wraptable}

To validate our MCMC baseline we reimplemented the SIR model in Stan and used Stan's built-in default MCMC sampler.
We expressed the expectation within the \texttt{generated\_quantities} block leveraging the functionality described in Section~\ref{sec:related_work}.
We have picked Stan because its built-in MCMC sampler can be reasonably considered the state-of-the-art in its domain and has been extensively tested for correctness.
As shown in Table~\ref{tab:additional_mcmc_baseline}, Stan gives results that are similar to our current MCMC baseline (and potentially even a little worse). 
This demonstrates that the differences between existing PPSs are negligible compared to the effect of making inference target-aware.

\section{Effective Sample Size}

In Figure~\ref{fig:ess_details} we plot all the individual ESS values for EPT and the AnIS baseline.
Plotting each ESS value separately shows that the performance of AnIS is severely limited by its ability to generate samples in regions in which the target function $f(x)$ is large. 
This is indicated by the low values for $\text{ESS}_{Z_1}^{\text{AnIS}}$.

\begin{figure*}[h!]
    \centering
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/post_pred_ess_plot_not_min_more_samples.pdf}
        \vspace{-10pt}
        \caption{Gaussian Posterior Predictive.}
    \end{subfigure}
    %\qquad
    \hfill
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/sir_ess_not_min.pdf}
        \vspace{-10pt}
        \caption{SIR.}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/radon_ess_not_min.pdf}
        \vspace{-10pt}
        \caption{Radon.}
    \end{subfigure}
    % \vspace{-10pt}
    \caption{Individual ESS values as defined in Section~\ref{sec:experiments} for the
    three different experiments. Instead of taking $\min(\text{ESS}_{Z_1}, \text{ESS}_{Z_2})$ for EPT and 
    $\min(\text{ESS}_{Z_1}^{\text{AnIS}}, \text{ESS}_{Z_2}^{\text{AnIS}})$ for AnIS we 
    plot each value individually.}
	\vspace{-15pt}
    \label{fig:ess_details}
\end{figure*}


\section{Positive and Negative Target Functions}

To demonstrate that EPT is also beneficial for target functions which are positive 
and negative we provide a brief description of a synthetic experiment.
We assume the following model which gives us a banana shaped density (see Figure~\ref{fig:banana_experiment}):
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x}
@expectation function banana()
    x1 |$\sim$| Normal(0, 4)
    x2 |$\sim$| Normal(0, 4)
    @addlogprob!(banana_density(x1, x2))
    return banana_f(x1, x2)
end

banana_density(x1, x2) = -0.5*(0.03*x1^2+(x2/2+0.03*(x1^2-100))^2)
\end{minted}
\vspace{-8pt}
Note that there is no observed data in this experiment which is why we chose to 
express the banana distribution as an unnormalized density
(i.e. use the \ept{@addlogprob!} primitive). 
Our target function is given by
\vspace{-8pt}
\begin{minted}[breaklines,escapeinside=||,mathescape=true,numbersep=3pt,gobble=2]{eptlexer.py:EPTLexer -x}
function banana_f(x1, x2)
    cond = 1 / (1 + exp(50 * (x2 + 5)))
    return cond * (x1 - 2)^3
end
\end{minted}
\vspace{-8pt}
Note that the target function can be positive and negative. Figure~\ref{fig:banana_experiment}
shows the RSE for EPT and AnIS. We used an MH transition kernel and 200 
intermediate potentials for the Annealed Importance Sampling estimators. The RSE 
of AnIS does not improve because it fails to generate samples in the regions in 
which the target $f(x)$ is large. 
\cite{rainforth2020Target} provide a comparison to MCMC on a similar problem 
so we omit it here.

\begin{figure*}[h!]
    \centering
    \begin{subfigure}{0.49\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/banana_density.pdf}
        \label{fig:banana_density}
    \end{subfigure}
    %\qquad
    \hfill
    \begin{subfigure}{0.49\textwidth}
        \centering
        \includegraphics[clip,trim=1.0cm 0cm 0cm 0cm,width=\textwidth]{figures/banana_error_plot.pdf}
        \label{fig:banana_error}
    \end{subfigure}
    \vspace{-20pt}
    \caption{Banana experiment. [Left] Heatmap of the density of the model. [Right] 
    Relative Squared Error for EPT and AnIS.}
	\vspace{-8pt}
    \label{fig:banana_experiment}
\end{figure*}

\bibliographysupp{references}
\bibliographystylesupp{plainnat}