%\documentclass{uai2022} % for initial submission
\documentclass[accepted,onecolumn]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{xr}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts,amsmath,amssymb,amsthm}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{dsfont}
\usepackage{bm}
\usepackage{siunitx}
\usepackage[ruled]{algorithm2e}
\DeclareUrlCommand\UScore{\urlstyle{rm}}
\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}
\usepackage[capitalise]{cleveref}
\input{texdef}
\externaldocument{munk_22}

\title{Probabilistic Surrogate Networks for Simulators with Unbounded Randomness - Supplementary Material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<amunk@cs.ubc.ca>?Subject=Probabilistic Programming, Surrogate Modeling}{Andreas Munk}}
\author[1]{Berend Zwartsenberg}
\author[1]{Adam \'Scibior$^{2,}$}
\author[3]{At{\i}l{\i}m G{\"u}ne{\c s} Baydin}
\author[4]{Andrew Stewart}
\author[4]{\qquad Goran Fernlund}
\author[5]{Anoush Poursartip$^{4,}$}
\author[1]{Frank Wood$^{2,6,}$}
% Add affiliations after the authors
\affil[1]{%
  Department of Computer Science\\
  University of British Columbia
  \quad
  $^{2}$Inverted AI Ltd.
}

\affil[3]{%
  Department of Engineering Science\\
  University of Oxford
}
\affil[4]{%
    Convergent Manufacturing Technologies Inc.
}
\affil[5]{%
   Composites Research Network\\
   University of British Columbia
   \quad
   $^{6}$Mila, CIFAR AI Chair
}


\begin{document}
\appendix
\maketitle
\section{Proofs}
\subsection{Proof of Theorem~\ref{theorem:1}}
\label{app:proof-1}

For an address $a$ define $\mathcal{C}$ and $\mathcal{K}$ as specified in
\cref{sec:psn}. That is $\mathcal{C}$ is the set of address transitions we know
are possible and $\mathcal{K}$ is the set of newly encountered address
transitions found in a sample of traces drawn from a reference simulator. Let
$C=\abs{\mathcal{C}}$ and $K=\abs{\mathcal{K}}$ be the size of each set
respectively. We consider the set of previously unknown address transitions
$\mathcal{U}$ and denote the new set of unknown transitions
$\tilde{\mathcal{U}}=\mathcal{U}\setminus \mathcal{K}$. Finally, define the
probability measures $\mathbb{P}$ and $\tilde{\mathbb{P}}$ both associated with
the sample space $\Omega$ and $\sigma$-algebra $\mathcal{F}$ according to

\begin{align*}
  \mathbb{P}(E) & = \frac{1}{Z} \begin{cases}
    e^{\vvv_{\gamma(c)}}, &\quad  \text{if}~E=\tub{c}~\text{and}~c \in \mathcal{C}\\
   e^{\vvv_{C+1}}, &\quad \text{if}~E=\mathcal{U}
   \end{cases} \\
    \tilde{\mathbb{P}}(E) & = \frac{1}{\tilde{Z}} \begin{cases}
      e^{\vvv_{\gamma(c)}}, &\quad  \text{if}~E=\tub{c}~\text{and}~c \in \mathcal{C}\\
      e^{\vvv_{C+1} - \log\paren{K+1}}, &\quad  \text{if}~E=\tub{k}~\text{and}~ k\in\mathcal{K}\\
    e^{\vvv_{C+1} - \log\paren{K+1}}, &\quad \text{if}~E=\tilde{\mathcal{U}},
    \end{cases}
\end{align*}
where $\vvv\in\real^{C+1}$, $Z$ and $\tilde{Z}$ are normalization constants, and
$\gamma:\mathcal{C}\rightarrow\tub{1,\dots,C}$ is a mapping from observed
addresses to a unique ``address index''.

Observe that the relationship between $\tilde{\mathbb{P}}$ and $\mathbb{P}$ is
equivalent to the relationship between $\mathbb{P}_{a_{t}}^{\tilde{\zeta}}$ and
$\mathbb{P}_{a_{t}}^{\zeta}$ defined in \cref{sec:psn}. In particular, we
consider the functional mapping $h:\mathcal{G}\rightarrow\mathcal{G}$ such that $\tilde{\zeta}=h(\zeta)$, where
$\tilde{\zeta},\zeta\in\mathcal{G}$. The
proof of \cref{theorem:1} therefore reduces to proving that for all
$E\in\mathcal{B}=2^{\mathcal{C}}\cup\tub{\mathcal{U}}\subseteq\mathcal{F}$, $\tilde{\mathbb{P}}(E)=\mathbb{P}(E)$ holds.

We start by comparing the normalization constants:

\begin{align}
  \label{eq:proof-norm}
  \tilde{Z} &= \sum_{c\in\mathcal{C}}e^{\vvv_{\gamma(c)}} + \sum_{k\in\mathcal{K}}e^{\vvv_{C+1}-\log(K+1)} + e^{\vvv_{C+1}-\log(K+1)}\nonumber \\
            &=  \sum_{c\in\mathcal{C}}e^{\vvv_{\gamma(c)}} + (K+1)e^{\vvv_{C+1}-\log(K+1)}\nonumber \\
            &= \sum_{c\in\mathcal{C}}e^{\vvv_{\gamma(c)}} + e^{\vvv_{C+1}} \nonumber \\
            &= Z,
\end{align}

leading to,
\begin{figure*}[t!]
  \centering
  \includegraphics[scale=0.2]{program-to-surrogate.pdf}
  \caption{ Illustration of the equivalence between a simple generative model
    and a probabilistic surrogate network. The red arrows represent
    what is extracted from the program and fed to the surrogate network during
    training. Generally, this would be an address $a$ and the distribution type
    $d_{a}$ at that address. This extraction happens at every address
    encountered when executing the program while training the surrogate. The
    dashed arrows represent possible extractions after one step of running the
    PSN. Which extraction occurs depends on the sampled value \texttt{c}. If $c=1$
    then $a_{2}=\alpha_{2}$ and the blue dashed arrow extraction happens;
    otherwise $a_{2}=\alpha_{3}$ and the green dashed arrow extraction happens.}
  \figlab{construct}
\end{figure*}
\begin{align}
  \tilde{\mathbb{P}}(\{c\}) &= \frac{1}{\tilde{Z}}e^{\vvv_{\gamma(c)}} = \frac{1}{Z} e^{\vvv_{\gamma(c)}} = \mathbb{P}(\{c\}) \quad \forall c\in\mathcal{C} \label{eq:single-eq}\\
  \tilde{\mathbb{P}}(\mathcal{K}\cup \tilde{\mathcal{U}}) &= \tilde{\mathbb{P}}(\mathcal{U}) = \tilde{\mathbb{P}}(\tilde{\mathcal{U}}) + \tilde{\mathbb{P}}(\mathcal{K}) \nonumber\\
                            &= \frac{1}{\tilde{Z}}\paren{e^{\vvv_{C+1}-\log(K+1)} +\sum_{k\in\mathcal{K}}e^{\vvv_{C+1}-\log(K+1)}} \nonumber\\
  &= \frac{1}{Z}e^{\vvv_{C+1}} = \mathbb{P}(\mathcal{U}) \label{eq:unknown-eq}.
\end{align}

Since all events $\tub{\{c\}\mid c\in\mathcal{C}}$ are mutually exclusive, it follows
from \cref{eq:single-eq} that

\begin{equation}\label{eq:subsets-eq}
  \tilde{\mathbb{P}}(E) = \sum_{e\in E}\tilde{\mathbb{P}}(\{e\}) = \sum_{e\in E}\mathbb{P}(\{e\}) = \mathbb{P}(E), \quad \forall E\in 2^{\mathcal{C}}.
\end{equation}

Combining \cref{eq:unknown-eq} and \cref{eq:subsets-eq}, we arrive at the final result,

\[
\tilde{\mathbb{P}}(E)=\mathbb{P}(E), \quad \forall E\in\mathcal{B}=2^{\mathcal{C}}\cup\tub{\mathcal{U}},
\]

which completes the proof. \qed

\subsection{Proof of Theorem~\ref{theorem:any-evaluation}}
\label{app:proof-2}
The proof of \cref{theorem:any-evaluation} only requires the consideration of
two possible scenarios regarding a trace $(\xxx,\aaa)$: (1) the trace contains
only address transitions observed during the training of $s(\xxx,\aaa)$, in
which case its evaluation is straightforward; (2) $(\xxx,\aaa)$ contains
addresses and transitions not encountered during training. In the latter case,
we would simply expand our PSN to account for those new transitions according to
\cref{eq:new-probs}.\qed

\section{Algorithms}
\label{app:alg}
The procedure we use to expand the address transition distribution at address $a_t$ upon encountering a set of yet unseen transitions $\mathcal{K}_{a_t}$ is outlined in \cref{algo:psn-expand}. The procedure is applied to the final layer of a neural network which follows an intermediate layer of size $n_{emb}$. The operation $\mathrm{detach}(\cdot)$ denotes duplication without copying the gradient information, hence detaching the argument from the computational graph. The $\mathrm{concat}(\cdot,\cdot)$ operation concatenates the second argument to the first, and re-attaches the newly created matrix or vector to the computational graph as a leaf.
\begin{algorithm}[h]
  \DontPrintSemicolon
  \KwIn{A set $\mathcal{K}_{a_t}$ of new address transitions with size $K = \abs{\mathcal{K}_{a_t}}$}
  \KwIn{Weights $\vec{W} \in \mathbb{R}^{ (C+1) \times n_{emb}}$ and biases $\vec{b} \in  \mathbb{R}^{C+1}$, with $C = \abs{\mathcal{C}_{a_t}}$}
  % \KwOut{The largest element in the set}
  $\vec{w}^{u} = \mathrm{detach}\left(\vec{w}_{C+1}\right)$ \tcp*{$\vec{w}_{C+1}$ denotes row $C+1$ of $\vec{W}$}
  $b^{u} = \mathrm{detach}\left(b_{C+1}\right) - \log (1 + K)$ \tcp*{${b}_{C+1}$ denotes element $C+1$ of $\vec{b}$}
  $\vec{W} = \vec{W}_{:C}$ \tcp*{$\vec{W}_{:C}$ denotes the first $C$ rows of $\vec{W}$}
  $\vec{b} = \vec{b}_{:C}$ \tcp*{$\vec{b}_{:C}$ denotes the first $C$ elements of $\vec{b}$}
  \For{$k=1$ \KwTo $K+1$} {
	$\vec{W} = \mathrm{concat}(\vec{W}, \vec{w}^{u})$ \\
	$\vec{b} = \mathrm{concat}(\vec{b}, b^{u})$ \\
      }
  \caption{PSN address transitions expansion. Definitions of the $\mathrm{detach}$ and $\mathrm{concat}$ operations are given in \appref{alg}.}
  \label{algo:psn-expand}
\end{algorithm}

\section{Surrogate Network Architecture}
\label{app:psn-design}

The PSN architecture is dynamically constructed during training and uses an LSTM
core as well as embeddings of the addresses, distribution types, and other
random variables. These embeddings are referred to as $a_{i}$, $d_{i}$, $x_{i}$
respectively. In particular, each address is associated with a fixed
distribution type. These deterministic and fixed pairings between addresses and
distribution types are stored and made part of the surrogate model. In other
words, when constructing the PSN we know the distribution type associated with
each address. The dynamic construction is driven by the program, where the
embeddings are fed to the LSTM core whose output is then fed to so-called
``distribution layers'' $\xi_{a_t}$ and $\zeta_{a_{t}}$, which for each unique
address $a_t$ produce the parameters for
$s(x_{a_t}|\xi_{a_t}(x_{<a_{t}},a_{\leq t},\theta))$ and
$s(a_{t+1}|\zeta_{a_{t}}(x_{<a_{t+1}},a_{\leq t},\theta))$ respectively. Note
that the value sampled from $s(x_{a_t}|\xi_{a_t}(x_{<a_{t}},a_{\leq t},\theta))$
is additionally fed to $\zeta_{a_{t}}$. In practice, this means that all
conditional probabilities of the PSN are conditioned on the distribution types
and therefore their embeddings $d_{i}$. While not part of the problem
formulation of PSN, as they are not theoretically necessary, we use them as
additional inputs to the LSTM as they might help training. This construction is
illustrated in \cref{fig:construct}. New embeddings and distribution layers are
created upon encountering new addresses during training. In practice this is
implemented by sweeping through the samples used to calculate the gradient
estimator. It is similarly during these sweeps that new address transitions are
identified. For each address $a_{t}$ we construct $\mathcal{K}_{a_{t}}$ when new
address transitions are found. \cref{algo:psn-expand} is then used for each of
those addresses.

When replacing the reference simulator with the PSN, it is initialized using
$h_{0}$ and embeddings $x_0$, $d_0$, and $a_0$. These initial values are
typically set to zero, but could be learnable parameters. The unique first
address $a_1$ (which is guaranteed to be unique as the first point of
stochasticity in a program is always the same) is fed to the PSN and the
surrogate program starts its execution. At each subsequent time step $t$ the
PSN produces a sample $x_{a_t}$ and address $a_{t+1}$, which then propagates
the PSN forward until an \texttt{end-execution} address is sampled. This
process is illustrated in~\cref{fig:construct}.

\section{Experiments}
\label{app:exp}

Here we provide various model, training, and validation specifications, along
with additional results and evidence that support the claims made in the main
paper.
\subsection{Model Specifications}

We largely use the default specifications found in PyProb~\citep{pyprob}. We
report the configurations whenever they differ from those default values.
We use the same configuration names found in PyProb, so that they are
directly transferable from this paper. A description of each configuration will
be given the first time the configuration appears and only when the configuration
is not obvious (such as learning rate and optimizer).

\subsubsection{Stochastic Control Flow Experiment}
\label{app:model_spec_flow}

\cref{fig:toy_learning_curve} shows training learning curves
for (a) the PSN and (b) the inference network. For this experiment we
continuously generate traces during training in an online
fashion. Therefore there is no risk of overfitting to a specific dataset and no
validation set is used.

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=1.0]{toy_learning_curves.pdf}
  \caption{Learning curves for (a) the PSN and (b) the
    inference network associated with the stochastic control flow experiment.}
  \figlab{toy_learning_curve}
\end{figure*}

\begin{table*}[h!]
  \caption{Experiment configuration for the stochastic control flow experiment}
  \tablab{exp-config-toy}
  \centering
  \begin{tabular}{p{0.40\linewidth}p{0.14\linewidth}p{0.14\linewidth}p{0.2\linewidth}}
    \toprule
    Parameter/setting & IC & PSN & Description \\
    \midrule
    Optimizer & Adam & Adam & \\
    Learning rate & $\num{5d-4}$ & $\num{5d-4}$ & \\
    Training data size & 500,000 & 500,000 & \\
    Batch Size & 512 & 512 & \\
    \texttt{sample\_embedding\_dim} & 10 & 10 & The size of each variable embedding \\
    \texttt{address\_embedding\_dim} & 24 & 24 & The size of the address embedding which are learnable parameters \\
    \texttt{distribution\_type\_embedding\_dim} & 24 & 24 & The size of the distribution type embedding which are learnable parameters \\
    \texttt{observe\_embedding} & \{x: \{depth: 4, dim: 10, hidden\_dim: 10\}\} & N/A & \texttt{depth} is the number of linear layers mapping from the value $x$ each with \texttt{hidden\_dim} number of neurons. The output size (going into the LSTM) is \texttt{dim} \\
    \texttt{lstm\_depth} & 1 & 1 & Number of stacked LSTMs \\
    \texttt{lstm\_dim} & 150 & 150 & Size of hidden state in each LSTM \\
    \texttt{inf\_variable\_embedding} & \{theta: \{num\_layers: 2, hidden\_dim: 50\}\} & N/A & The names should be self-explanatory and are similar to \texttt{observe\_embedding} except the input to these layers are the output from the LSTM\\
    \texttt{surr\_variable\_embedding} & N/A & \{theta: \{num\_layers: 2, hidden\_dim: 50\}\} &  Same meaning as above but for the PSN \\
    \bottomrule
  \end{tabular}
\end{table*}

\clearpage

\subsubsection{Process Simulation of Composite Materials}
\label{app:model_spec_raven}
\cref{fig:raven_learning_curve} shows learning curves (training and validation)
for (a) the PSN and (b) the inference network. In this experiment we construct
a training set containing 200,000 traces which is iterated through until the number of traces
specified in~\tabref{exp-config-raven} has been encountered. The validation set
contains 7680 traces.

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=1.0]{raven_learning_curves.pdf}
  \caption{Training and validation learning curves for (a) the PSN and (b) the
    inference network associated with the process simulation of composite
    materials experiment.}
  \figlab{raven_learning_curve}
\end{figure*}

\begin{table*}[h!]
  \caption{Experiment configuration for the process simulation of composite materials experiment}
  \tablab{exp-config-raven}
  \centering
  \begin{tabular}{p{0.35\linewidth}p{0.28\linewidth}p{0.20\linewidth}}
    \toprule
    Parameter/setting & IC & PSN \\
    \midrule
    Optimizer & Adam & Adam \\
    Learning rate & $\num{d-3}$ & $\num{d-4}$ \\
    Training data size & 500,000 & 1,000,000 \\
    Batch Size & 256 & 256 \\
    \texttt{sample\_embedding\_dim} & 256 & 256 \\
    \texttt{address\_embedding\_dim} & 24 & 24 \\
    \texttt{distribution\_type\_embedding\_dim} & 24 & 24 \\
    \texttt{observe\_embedding} & \{temps\_bottom: \{depth: 2, dim: 500, hidden\_dim: 500\}, air\_temp\_bot: \{depth: 2, dim: 500, hidden\_dim: 500\}, air\_temp\_top: \{depth: 2, dim: 500, hidden\_dim: 500\}, temps\_config: \{dim: 10, hidden\_dim: 256\}\} & N/A \\
    \texttt{lstm\_depth} & 2 & 2 \\
    \texttt{lstm\_dim} & 512 & 512 \\
    \texttt{inf\_variable\_embedding} & \{config: \{hidden\_dim: 256\}\} & N/A  \\
    \texttt{surr\_variable\_embedding} & N/A & \{latent\_temps: \{num\_layers: 2, hidden\_dim: 500\}, temps\_config: \{hidden\_dim: 256\}\} \\
    \bottomrule
  \end{tabular}
\end{table*}

\clearpage

\subsubsection{Program Synthesis Experiment}
\label{app:model_spec_program}

The configurations used for training the surrogate in the program synthesis
experiment are the same as those found in \cref{tab:exp-config-toy}, while
\cref{synthesis_learning_curve} presents learning curves for the trained
surrogate.

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=0.9]{program_synthesis_learning_curves.pdf}
  \caption{Learning curves for the PSN.}
  \label{synthesis_learning_curve}
\end{figure*}

\subsection{Running Times for Process Simulation of Composite Materials}
\label{app:running_times}

\begin{table}[h!]
  \caption{Runtime [traces/s] comparisons. We calculate the number of traces
    produced per second when (1) running just the simulator or PSN and (2) when
    performing SIS in either model. We see a slowdown in traces per second for
    the PSN when performing inference, as the inference engine adds additional
    overhead. However, as the simulator is
    considerably slower, it remains the computational bottleneck during
    inference. The reported run-times are achieved using an Intel(R) Xeon(R) CPU
    E3-1505M v5 @ 2.80GHz.}
  \tablab{runtime}
  \centering
  \begin{tabular}{llll}
    \toprule
    & Simulator ($t_{\mr{sim}}[\mr{traces}/\si{\second}]$)  & PSN ($t_{\mr{PSN}}[\mr{traces}/\si{\second}]$) & \textbf{Speedup} [$\nicefrac{t_{\mr{PSN}}}{t_{\mr{sim}}}$] \\
    \midrule
    PSN & 0.32 & 28.87 & \textbf{90.16}  \\
    IC in PSN & 0.31 & 4.75 & \textbf{15.32} \\
    \bottomrule
  \end{tabular}
\end{table}
\clearpage
\subsection{Results for the Process Simulation of Composite Materials Experiment}
\label{app:raven}

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=0.81]{heatmap_guess_annotated.pdf}
  \caption{Illustration of a process simulation of composite materials. Each
    subfigure shows a temperature profile in degrees Celsius as a function of
    time along the $x$-axis and depth along the $y$-axis. (a) shows the output
    of the Convergent Composite material simulator
    RAVEN~\citep{Technologies2019}, simulating the curing process of a
    particular part. (b) shows the same process but originating from our
    \textit{probabilistic surrogate network}. We perform inference in this
    process, where we infer the expected temperature in a specific time window
    (black box) conditioned on observed surface temperature measurements (blue
    boxes).}
  \label{fig:guess}
\end{figure*}
\cref{fig:guess} compares output from our PSN and the reference simulator. As
these outputs are indistinguishable, it provides further evidence that our PSN
accurately models the reference simulator.

\clearpage
\subsection{Stochastic Control Flow Address Transitions}
\label{app:transitions}

\begin{figure}[h!]
  \centering
  \includegraphics[scale=1.0, trim={0 0 4.5cm 0}, clip]{toy_program.pdf}
  \caption{Program containing stochastic control flow in the form of a for-loop
    with a nested if-else statement. The task here would be to perform posterior
    inference of $\theta$ given the observed value of $x$.}
  \figlab{prog}
\end{figure}

For reference we re-illustrate in~\cref{fig:prog} the program also shown in the
main paper. The program contains two nested layers of stochastic control flow,
allowing for an assessment of PSNs' capacity to learn the associated address
transitions. \cref{fig:address-transitions}(a) and (b) complement the results
reported in the main paper by showing that the address transition paths and their associated estimated
probabilities (using 50,000 traces each) of the program and the trained PSN are nearly indistinguishable. Only for long traces do small deviations begin to appear. It is reasonable to expect slight discrepancies between
the address transition probabilities for increasingly long traces. The address
occurrence probability decreases exponentially in the number of times $n$ the
original program stays in the for-loop -- \ie $\theta^n$. Therefore we can
expect (with reasonable probability) either the PSN or the program to produce
addresses not produced by the other, when those addresses originate from
executions with large \texttt{for loop} iterations. We conclude that these
results show that the PSN indeed has learned accurate address transitions and
support the claim made in the main paper.

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=0.95]{address_transition.pdf}
  \caption{(a) Address transitions sampled from the original model shown
    in~\cref{fig:prog} with \tabref{orig_address_dict} mapping the address id
    A[i] to the actual address. (b) Address transitions sampled from the PSN,
    with \tabref{surr_address_dict} mapping the address id A[i] to the actual
    address allowing us to compare (a) and (b). For each plot the address
    transition probabilities are estimated across 50,000 traces.}
  \figlab{address-transitions}
\end{figure*}
\clearpage
\input{orig_address_id_to_variable.txt}
\input{surr_address_id_to_variable.txt}
\clearpage

\subsection{Program Synthesis Details}
\label{app:prog-synthesis-details}
The Python code describing the generative model we approximate with a surrogate
is given in \cref{fig:simulator}. Note that the $\texttt{depth\_allow\_else}$
data structure is in effect a stack that keeps track of the nesting of
$\texttt{if}$ and $\texttt{else}$ statements. To generate valid programs, the
surrogate has to learn that valid programs can only sample an $\texttt{else}$
statement if an $\texttt{if}$ statement has preceded it on the same nesting
level. Furthermore, in our generative model, a valid program can only end at the
lowest nesting level. Expanding on the results presented in the main text,
additional example programs for both the original and the surrogate are
displayed in \cref{fig:example_programs}. Address transitions for the synthetic
programs can be found in \cref{fig:address_transitions}. The structure of these
transitions makes it clear that the program can only finish from specific
addresses, corresponding to those sampled at the lowest nesting level. It is
evident from the transitions presented for the surrogate that these dependencies
are accurately captured.

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=0.55]{simulator.pdf}
  \caption{Model describing the program synthesis generative model.}
  \label{fig:simulator}
\end{figure*}

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=1.0]{combined_address_transition_synthesis.pdf}
  \caption{(a) Address transitions sampled from the original model shown
    in~\cref{fig:simulator}. (b) Address transitions sampled from the PSN. For
    each plot the address transition probabilities are estimated across 50,000
    traces.}
  \label{fig:address_transitions}
\end{figure*}

\begin{figure*}[h!]
  \centering
  \includegraphics[scale=0.9]{combined_sample_programs.pdf}
  \caption{(a) Example programs sampled from the original model shown
    in~\cref{fig:simulator}. (b) Example programs sampled from the learned
    surrogate.}
  \label{fig:example_programs}
\end{figure*}
\clearpage
\bibliography{bibtex.bib}

\end{document}
