% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}



% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz,tikz-cd} % nice language for creating drawings and diagrams
\usetikzlibrary{backgrounds}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{cao_757}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\usepackage{listings}
% Colors used by the listings style below (comments and strings).
% They are not defined anywhere else in this preamble, so a listing that
% contains a comment or a string would otherwise stop with an
% "Undefined color" error. \providecolor (xcolor, already loaded through
% tikz) only defines a color when it does not exist yet, so a definition
% supplied elsewhere still wins.
\providecolor{dkgreen}{rgb}{0,0.6,0}
\providecolor{mauve}{rgb}{0.58,0,0.82}
% Global style for all lstlisting environments: framed top and bottom,
% Caml syntax, small typewriter font, line numbers on the left.
\lstset{frame=tb,
  language=Caml,
  aboveskip=3.5mm,
  belowskip=0mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numberstyle=\tiny\color{black},
  keywordstyle=\color{black},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  breaklines=true,
  tabsize=3,
  numbers=left,
  xleftmargin=2em,
  framexleftmargin=1.5em,
  % text between <@ and @> inside a listing is handed back to LaTeX
  escapeinside={<@}{@>}
}

\usepackage{ebproof}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{stmaryrd}
\usepackage{verbatim}
\usepackage{verbatimbox}
\usepackage{forest}
\usetikzlibrary{arrows.meta}
\usetikzlibrary{positioning}
\usepackage[pdf]{graphviz}

\usepackage{svg}
\usetikzlibrary{shapes,arrows}
\usepackage{psfrag}

% TikZ styles for drawing BDDs, collected in a single \tikzset call.
\tikzset{
  % picture-wide settings for a BDD drawing
  bdd/.style={
    >=latex, thick, >=stealth, font=\small, auto, scale=0.9,
    every node/.style={scale=0.9, line width=0.5},
  },
  % decision node: white-filled circle
  % (inner sep used to be 0pt, font=small added)
  bddnode/.style={
    draw, circle, inner sep=1.5pt, fill=white, minimum size=5.5mm, font=\small,
  },
  % root marker: thin rectangle
  root/.style={
    draw, rectangle, thin, inner sep=2pt, minimum size=5.5mm, font=\small,
  },
  % generic arrowed edge (a `line width=0.9' setting is disabled here)
  % NOTE(review): `filled' does not look like a standard TikZ key; this
  % style is not used by the pictures visible in this file -- confirm
  % before using it.
  edge/.style={
    ->, filled,
  },
  % high (then) edge: solid line
  highedge/.style={
    line width=0.9,
  },
  % low (else) edge: dotted line
  lowedge/.style={
    line width=0.9, dotted,
  },
  % terminal node: gray-filled box (inner sep used to be 2.5pt)
  bddterminal/.style={
    draw, fill=gray!20, inner sep=3.5pt, font=\small,
  },
}


% Math-mode labels for the two BDD terminals.
\newcommand*{\true}{\mathtt{T}}
\newcommand*{\false}{\mathtt{F}}

% Forest style `BDT': applied to a tree, it configures every node via
% `for tree'.
% NOTE(review): no `high' TikZ/forest style is defined in the visible part
% of this preamble (only `highedge'), and this style is not used by the
% pictures visible in this file -- confirm both before relying on it.
\forestset{
  BDT/.style={
    for tree={
      {minimum size=2em, inner sep = 2em},
      if n children=0{ minimum size=1.8 em}{inner sep=0.15em},,% leaves get minimum size 1.8em, all other nodes get inner sep 0.15em
      % draw,% draw every node
      edge={
        high,% edge style for every edge (see the review note above)
      },
      % if n=1{
      %   edge+={low},% if the child is the first one, add the 0 my edge style (dashed)
      % }{},
      % font=\sffamily,% use sans serif for node text
    }
  },
}

\usepackage{xpatch}
\makeatletter
% \addFileDependency{<file name with extension>}
%   Announces the file with \typeout (as "(<file>)"), hands it to
%   \@addtofilelist, and prints "No file <file>." when it cannot be found.
%   Every line of the body ends in `%' so that the macro contributes no
%   space tokens when it expands in the middle of a paragraph.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother
% Run \addFileDependency on `<#2>.dot' at the start of every \digraph call.
% `#2' is a parameter of \digraph itself (presumably the graph name --
% confirm against the graphviz package).
\xpretocmd{\digraph}{\addFileDependency{#2.dot}}{}{}

% Theorem-like environments. Each one has its own counter, since none of
% them shares a counter through \newtheorem's optional argument.
% theorem/lemma/proposition are declared in whatever theorem style is
% active at this point (amsthm's default unless the class changed it --
% confirm); definition explicitly uses the `definition' style.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\theoremstyle{definition}
\newtheorem{definition}{Definition}

% \op{<name>}{<arg>}: upright operator <name> applied to <arg>, with
% auto-sized parentheses. The empty \mathopen/\mathclose atoms surround the
% \left...\right group.
\newcommand*{\op}[2]{%
  \operatorname{#1}\mathopen{}\left(#2\right)\mathclose{}%
}
% \opsub{<name>}{<subscript>}{<arg>}: like \op, with a subscript on the
% operator name.
\newcommand*{\opsub}[3]{%
  \operatorname{#1}_{#2}\mathopen{}\left(#3\right)\mathclose{}%
}
% Low and high child of a BDD node.
\newcommand*{\low}[1]{\op{low}{#1}}
\newcommand*{\high}[1]{\op{high}{#1}}
% \lsb{<i>}{<j>}: typesets LSB_<i>(<j>).
\newcommand*{\lsb}[2]{\opsub{LSB}{#1}{#2}}

% \ftobdd{<f>}: <f> wrapped in auto-sized double brackets.
\newcommand*{\ftobdd}[1]{%
  \left\llbracket #1\right\rrbracket
}
% \bddrt{<B>}{<i>}: typesets <B>_<i>(\vec{x}).
\newcommand*{\bddrt}[2]{%
  {#1}_{#2}(\vec{x})%
}
% \ftobddrt{<f>}{<i>}: \bddrt applied to \ftobdd{<f>}.
\newcommand*{\ftobddrt}[2]{%
  \bddrt{\ftobdd{#1}}{#2}%
}

% Styling hook for the keywords if/then/else (currently the identity).
\newcommand*{\itesty}[1]{#1}
% \ite{<cond>}{<then>}{<else>}: inline "if ... then ... else ..." expression.
% The line breaks below stand for the single spaces between the pieces.
\newcommand*{\ite}[3]{
  \text{\itesty{if} \quad }
  #1
  \text{\quad \itesty{then} \quad}
  #2
  \text{\quad \itesty{else} \quad}
  #3
}
% Parenthesized variant of \ite.
\newcommand*{\itep}[3]{\left(\ite{#1}{#2}{#3}\right)}
% \mathcolorbox{<color>}{<math>}: display-style math on a colored background.
\newcommand*{\mathcolorbox}[2]{\colorbox{#1}{$\displaystyle #2$}}
% \mhl marks the part of a formula changed in a proof step. Highlighting is
% switched off; the colored variant is kept for reference:
% \newcommand{\mhl}[1]{\mathcolorbox{cyan!15}{#1}}
\newcommand*{\mhl}[1]{#1}

% Encoding name with a trailing space baked into the replacement text.
\newcommand{\BitwiseHoltzen}{BITWISE\_INT }
% Same name without the trailing space. TeX skips spaces after a control
% word, so write `\BWH{}' (or `\BWH\ ') when a space must follow.
\newcommand{\BWH}{BITWISE\_INT}

% Concatenation symbol: a \frown set as a superscript of the preceding atom.
\newcommand{\concat}{^\frown}

% `propositionhalf' environment: a Proposition whose printed number is the
% text passed as the mandatory argument, e.g.
% \begin{propositionhalf}{1 (first half)}. It works by overriding the
% counter representation of the helper environment before opening it.
\newtheorem{innerpropositionhalf}{Proposition}
\newenvironment{propositionhalf}[1]
  {\renewcommand\theinnerpropositionhalf{#1}\innerpropositionhalf}
  {\endinnerpropositionhalf}
  
% Math operator that typesets the name "Beta".
\DeclareMathOperator\Beta{Beta}

\title{Scaling Integer Arithmetic in Probabilistic Programs\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{William X. Cao}
\author[1]{Poorva Garg$^*$}
\author[2]{Ryan Tjoa$^*$}
\author[3]{Steven Holtzen}
\author[1]{Todd Millstein}
\author[1]{Guy Van den Broeck}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    University of California\\
    Los Angeles, California, USA
}
\affil[2]{
    Department of Computer Science\\
    University of Washington\\
    Seattle, Washington, USA
}
\affil[3]{%
    Department of Computer Science\\
    Northeastern University\\
    Boston, Massachusetts, USA
}

  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix




% This application naturally fits our probabilistic programming language with the process described above: we can easily model a Bayesian network with random flips drawn from Beta priors, representing our prior belief for each parameter. Observations on the network variables will then update our beliefs, from which we can get an expectation in the manner described above.
% The distribution over the internal program variables representing the parameters of the Beta distribution is a mixture over Beta distributions that is the exact posterior to our problem. Compared to common sampling-based approaches for representing the Beta distribution, 
% An example of this learning, on the \verb#survey#~\citep{survey} Bayesian network, is given in Figure ~\ref{beta_example_plot}. 



% We have implemented such an application on the \verb#survey#~\citep{survey} Bayesian network, on which we can scale to roughly 7 missing data points \sh{plots? this feels unfinished}. 



\section{Experimental Details}
All experiments were run on a server with a 2.20GHz CPU and 504GB RAM. WebPPL experiments were run on WebPPL v0.9.15; Psi experiments were run on Psi version \texttt{ec2cfc14a62a168afe7ce1d7269b92cf2882b830}. \verb#Dice.jl# experiments were run using the code available at \url{https://github.com/Juice-jl/Dice.jl/tree/arithmetic}. The implementations of each model run are available at the same repository. 


\section{Benchmark Models}
We provide a small description of our benchmarks in this section with details of their sources. The code implementations of all models are available at the repository \url{https://github.com/Juice-jl/Dice.jl/tree/arithmetic}. 

\begin{enumerate}
    \item book: A model of flipping towards a target page in a book adapted from the Psi test directory~\citep{PsiDirectory}. 
    
    \item tugofwar: Adapted from a traditional tug-of-war example~\citep{huang2021aqua}, with values made discrete.
    \item caesar: The Caesar-cipher example from Dice~\citep{HoltzenOOPSLA20}, with different numbers of characters being observed.

    \item ranking: A model for learning a ranking system, adapted from \cite{KisaLTPM14}.

\item radar1: A model of radar reception, adapted from a continuous model from Psi's~\citep{gehr2016psi} benchmark suite.

\item floydwarshall: An implementation of the Floyd-Warshall algorithm~\citep{10.1145/367766.368168} on a graph with edges of random weight. 

\item linear-extensions: A model counting linear extensions~\citep{https://doi.org/10.48550/arxiv.1802.06312} where we observe a partial order and get an output distribution over all matching total orders.

\item triangle: A model categorizing a triangle of random side lengths, adapted from the Psi test directory~\citep{PsiDirectory}.

\item gcd: A model checking if two random numbers are coprime implementing Euclid's algorithm~\citep{doi:10.1080/00029890.1938.11990797}.

\item disease: A discrete disease model taken from existing works~\citep{10.1007/978-3-030-44914-8_14}.

\item luhn: A probabilistic model of student IDs leveraging the Luhn algorithm~\citep{luhn}. 
\end{enumerate}


\section{Additional Experimental Results}
We provide additional experimental results supplementing those in the main paper. 

BDD size serves as a proxy for how well knowledge compilation can exploit structure: the more compact the BDD, the smaller the representation of our function, and the faster weighted model counting can be executed. Table~\ref{bddsizes} compares the resulting BDD sizes for various models when compiled in Dice.jl using a binary or a one-hot encoding. We can see that in almost all cases the binary encoding results in a smaller BDD, in some cases a much smaller one. Note that these models are those for which the one-hot encoding did not time out; for the models that did time out, it is likely that the true compiled BDD size under the one-hot encoding is much larger than the binary-encoded BDD size.

\begin{table}[ht]
\centering
\caption{BDD sizes for probabilistic models using a binary vs one-hot encoding}\label{bddsizes}
\scriptsize
\begin{tabular}{lrr}\toprule
Benchmarks              &binary & one-hot  \\\midrule
tugofwar                &\textbf{2400}    &2821\\\cmidrule{1-3}
caesar-small            &\textbf{1304}    &3879 \\
caesar-medium           &\textbf{6344}             &17879  \\
caesar-large            &\textbf{12644}             & 35379 \\\cmidrule{1-3}
ranking-small           &\textbf{691 }   &1146  \\
ranking-medium          &\textbf{6218}     & 8297 \\
ranking-large           &\textbf{11491 }    &15680\\\cmidrule{1-3}
radar1                  &\textbf{181}     &332 \\\cmidrule{1-3}
floydwarshall-small     &\textbf{10}    &\textbf{10}  \\
floydwarshall-medium    &341    &\textbf{237} \\\cmidrule{1-3}
linear extensions-small &\textbf{29}     & 53\\
linear extensions-medium &\textbf{133}   & 257 \\
linear extensions-large &\textbf{997}    & 1538 \\\cmidrule{1-3}
triangle-small          &\textbf{10273 }   &150089 \\
triangle-medium         &\textbf{40419 }    & 1156785\\\cmidrule{1-3}
luhn-small              &\textbf{518 }   &1010 \\
luhn-medium             &\textbf{2899 }   &7361  \\
\bottomrule
\end{tabular}
\end{table}


\section{BDD Sizes of Integer Distribution Encodings (Proposition 1)}

\subsection{Notation and Definitions}

A $b$-rooted BDD $B$ with $m$ decision variables computes some function $\{0, 1\}^m \to \{0, 1\}^b$ at its roots.

Let the values of the roots of $B$, given truth assignments to decision variables $\vec{x}$, be denoted $\bddrt{B}{1}, \dots, \bddrt{B}{b}$. Let the variable order of $B$ match the order of the components of $\vec{x}$. A BDD can thus be specified by a tuple of functions $(\{0, 1\}^m \to \{0, 1\})^b$. Let $B(\vec{x})$ denote this tuple, i.e.\ $(\bddrt{B}{1}, \dots, \bddrt{B}{b})$.

As $b$-length bitvectors are isomorphic to $b$-bit unsigned machine integers, we can also specify a BDD by a function from assignments to $b$-bit unsigned integers. Given $f : \{0, 1\}^m \to \{0, \dots, 2^b - 1\}$, let $\ftobdd{f}$ denote the BDD defined such that $\ftobddrt{f}{i} = \lsb{i}{f(\vec{x})}$, where $\lsb{i}{j}$ denotes the $i$th least significant bit of $j$.

Let $\vec{p} \in [0, 1]^m$ assign a probability to each decision variable. For each $p_i$, let $X_i$ denote an independent $\operatorname{Bernoulli}(p_i)$ random variable. We can construct a random variable for the result of $B$ with $B(\vec{X})$ (passing in $X_i$ for each $x_i$). Note that the indexing of the decision variable $x_i$, the corresponding probability $p_i$, and the corresponding random variable $X_i$ may vary to make notation clearer: for CATEG\_INT, these will be zero-indexed; for \BWH, these will be indexed by bit strings.

Let $\mathcal{D}_b$ denote a distribution over $b$-length bit vectors. Let $v^\star$ be an arbitrary random variable sampled from $\mathcal{D}_b$, interpreted as an unsigned integer. Denote the bits of $v^\star$, from least to most significant, as $v_1^\star, \dots, v_b^\star$.

\begin{definition}[Integer distribution encoding]
A BDD and probability assignments $(B, \vec{p})$ \textit{encode} a distribution $\mathcal{D}_b$ if $B(\vec{X}) \sim \mathcal{D}_b$.
\end{definition}



\subsection{Categorical Encoding of Integer Distributions}
\begin{definition}
Define $\operatorname{encode_{CATEG}}(\mathcal{D}_b) = (\ftobdd{g}, \vec{p})$ such that $p_i = \Pr(v^\star = i | v^\star \geq i)$, $0 \leq i \leq 2^b-2$, and that $g$ is defined as follows.
\begin{gather*}
    g(\vec{x}) = \begin{cases}
        0 & x_0\\
        1 & \lnot x_0 \land x_1\\
        2 & \lnot x_0 \land \lnot x_1 \land x_2\\
        \dots\\
        j & x_j \land \bigwedge\limits_{i=0}^{j-1}\lnot x_i \operatorname{\ and\ } 0 \leq j \leq 2^b-2\\
        \dots\\
        2^b - 1 & \bigwedge\limits_{i=0}^{2^b-2}\lnot x_i
    \end{cases}
\end{gather*}
We could also write this as the following, which is also how it would likely be implemented in a probabilistic programming language.
\begin{gather*}
    g(\vec{x}) = \begin{cases}
        0 & \operatorname{if\ }x_0\\
        1 & \operatorname{else\ if\ }x_1\\
        2 & \operatorname{else\ if\ }x_2\\
        \dots\\
        2^b - 2 & \operatorname{else\ if\ }x_{2^b - 2}\\
        2^b - 1 & \operatorname{else}
    \end{cases}
\end{gather*}
\end{definition}

\begin{lemma} \label{geqinductive}
 If $\operatorname{Img}(v) \subseteq \mathbb{N}_0$, then $\prod_{i=0}^{x-1} \Pr(v \neq i | v \geq i) = \Pr(v \geq x)$. Proof omitted; proceed by induction.
\end{lemma}
\begin{lemma} \label{sbkencodes}
For any distribution $\mathcal{D}_b$ over unsigned integers, $\operatorname{encode_{CATEG}}(\mathcal{D}_b)$ encodes $\mathcal{D}_b$.
\end{lemma}
\begin{proof}
Let $\operatorname{encode_{CATEG}}(\mathcal{D}_b) = (\ftobdd{g}, \vec{p})$. We prove that for all $0 \leq j \leq 2^b - 1$, $\Pr(g(\vec{X})=j) = \Pr(v^\star=j)$.

\textit{Case 1:} $0 \leq j \leq 2^b-2$.
\begin{align*}
    \Pr(g(\vec{X})=j) &= \Pr\left(X_{j} \land \bigwedge\limits_{i=0}^{j-1}\lnot X_i\right)\\
    &=  \Pr(X_{j})\prod_{i=0}^{j-1}\Pr(\lnot X_i) && \text{(Independence)}\\
    &=\Pr(v^\star=j|v^\star\geq j)\prod_{i=0}^{j - 1}\Pr(v^\star \neq i \mid v^\star \geq i) && \text{(As $\Pr(X_i) = p_i$)}\\
    &=\Pr(v^\star=j|v^\star\geq j)\Pr(v^\star \geq j) && \text{(Lemma \ref{geqinductive})}\\
    &=\Pr(v^\star=j)
\end{align*}

\textit{Case 2:} $j = 2^b-1$.
\begin{align*}
    \Pr(g(\vec{X})=2^b-1) &= \Pr\left(\bigwedge\limits_{i=0}^{2^b-2}\lnot X_i\right)\\
    &=\prod_{i=0}^{2^b-2}\Pr(\lnot X_i) && \text{(Independence)}\\
    &=\prod_{i=0}^{2^b-2}\Pr(v^\star \neq i | v^\star \geq i) && \text{(As $\Pr(X_i) = p_i$)}\\
    &= \Pr(v^\star \geq 2^b - 1) && \text{(Lemma \ref{geqinductive})}\\
    &= \Pr(v^\star = 2^b - 1)
\end{align*}
\end{proof}

\subsection{BDD Size of the CATEG\_INT Encoding}


We prove the first half of Proposition 1.
\begin{propositionhalf}{1 (first half)} \label{sbksize}
A discrete distribution over the integers $\{0, 1, \ldots, 2^b - 1\}$ compiles to a BDD of size $\Theta(b2^b)$ when represented using CATEG\_INT (Algorithm 1).
\end{propositionhalf}

\begin{lemma} \label{sbkstructure}
Let $\operatorname{encode_{CATEG}}(\mathcal{D}_b) = (\ftobdd{g}, \vec{p})$. $\ftobdd{g}$ exactly matches a BDD $C$ with the following reduced structure.

For $1 \leq i \leq b$, $C$ has nodes $N_{i, 0}, N_{i, 1}, \dots, N_{i, 2^b - 2^{i-1} - 1}$, at the levels of decision variables $x_0, x_1, \dots, x_{2^b - 2^{i - 1} - 1}$, respectively.
\begin{align*}
    \low{N_{i, j}} &= \text{if \quad } j < 2^b - 2^{i-1} - 1 \text{\quad then \quad} N_{i, j + 1} \text{\quad else \qquad} 1\\
    \high{N_{i, j}} &= \lsb{i}{j}
\end{align*}
Let the roots of $C$ be placed such that $\bddrt{C}{i}$ matches $N_{i,0}$ given assignments to decision variables $\vec{x}$.
\end{lemma}

As an example, the BDD for $b = 2$ follows (terminal nodes visually duplicated for clarity).
\vspace{-10pt}
\begin{center}
    \begin{minipage}[t]{0.15\textwidth}
    \vspace{0pt}
        \begin{center}
        \begin{tikzpicture}
          \def\lvl{35pt}
        \node (c1) at (0, 0) [root] {$C_1$};
        \node (n10) at ($(c1) + (0bp, -30pt)$) [bddnode] {$N_{1,0}$};
        \node (n11) at ($(n10) + (0bp, -\lvl)$) [bddnode] {$N_{1,1}$};
        \node (n12) at ($(n11) + (0bp, -\lvl)$) [bddnode] {$N_{1,2}$};
        \node (true) at ($(n12) + (-15bp, -30pt)$) [bddterminal] {$\true$};
        \node (false) at ($(n12) + (15bp, -30pt)$) [bddterminal] {$\false$};
        
        \begin{scope}[on background layer]
          \draw [-stealth] (c1) -- (n10);
          \draw [lowedge] (n10) -- (n11);
          \draw [lowedge] (n11) -- (n12);
          \draw [lowedge] (n12) -- (true);
          \draw [highedge] (n12) -- (false);
          
          \path[-] (n11) edge [bend right, highedge] node {} (true);
          \path[-] (n10) edge [bend left, highedge] node {} (false);

        \end{scope}
        \end{tikzpicture}
        \end{center}
    \end{minipage}
    \begin{minipage}[t]{0.15\textwidth}
    \vspace{0pt}
    
        \begin{tikzpicture}
          \def\lvl{35pt}
        \node (c2) at (0, 0) [root] {$C_2$};
        \node (n20) at ($(c2) + (0bp, -30pt)$) [bddnode] {$N_{2,0}$};
        \node (n21) at ($(n20) + (0bp, -\lvl)$) [bddnode] {$N_{2,1}$};
        \node (true) at ($(n21) + (-15bp, -30pt-\lvl)$) [bddterminal] {$\true$};
        \node (false) at ($(n21) + (15bp, -30pt-\lvl)$) [bddterminal] {$\false$};
        
        \begin{scope}[on background layer]
          \draw [-stealth] (c2) -- (n20);
          \draw [lowedge] (n20) -- (n21);
          \draw [lowedge] (n21) -- (true);
          \draw [highedge] (n21) -- (false);
          
          \path[-] (n20) edge [bend left, highedge] node {} (false);

          % \draw [highedge] (
        \end{scope}
        \end{tikzpicture}
    % \begin{forest}
    %     bdd
    %     [$C_2$
    %         [$N_{2,0}$, tikz={\draw [high] () [bend left] to (z.north) ; }, bddnode
    %             [$N_{2,1}$, edge=low, bddnode
    %                 [1, name=o, edge=low]
    %                 [0, name=z]
    %             ]
    %         ]
    %     ]
    % \end{forest}\vspace{4.25em}
    \end{minipage}
\end{center}

\begin{proof} We will show for all $\vec{x}$,  $\bddrt{C}{i} = \lsb{i}{g(\vec{x})}$.

Let $S_{i,j}(\vec{x})$ denote the state of $C_i$ at the level of $x_j$, given an assignment to $\vec{x}$. This is either a node at level $x_j$, or possibly one at a lower level if there was a low/high edge that skipped levels. See the following BDD and corresponding states as an example.


\begin{center}
    \begin{minipage}{0.3\textwidth}
    \begin{center}
        \begin{alignat*}{4}
        S_{1,0}(&x_0,x_1,x_2) &&= N_{1,0}\\
        S_{1,1}(&0, x_1, x_2) &&= N_{1,1}\\
        S_{1,1}(&1, x_1, x_2) &&= 0\\
        S_{1,2}(&0, 0, x_2) &&{}={} N_{1,2}\\
        S_{1,2}(&0, 1, x_2) &&{}={} 1\\
        S_{1,2}(&1, x_1, x_2) &&= 0
        \end{alignat*}

    \end{center}
    \end{minipage}
    \begin{minipage}{0.4\textwidth}

    % \begin{minipage}[t]{0.15\textwidth}
    \vspace{0pt}
        \begin{center}
        
        \begin{tikzpicture}
          \def\lvl{35pt}
        \node (c1) at (0, 0) [root] {$C_1$};
        \node (n10) at ($(c1) + (0bp, -30pt)$) [bddnode] {$N_{1,0}$};
        \node (n11) at ($(n10) + (0bp, -\lvl)$) [bddnode] {$N_{1,1}$};
        \node (n12) at ($(n11) + (0bp, -\lvl)$) [bddnode] {$N_{1,2}$};
        \node (true) at ($(n12) + (-15bp, -30pt)$) [bddterminal] {$\true$};
        \node (false) at ($(n12) + (15bp, -30pt)$) [bddterminal] {$\false$};
        
        \begin{scope}[on background layer]
          \draw [-stealth] (c1) -- (n10);
          \draw [lowedge] (n10) -- (n11);
          \draw [lowedge] (n11) -- (n12);
          \draw [lowedge] (n12) -- (true);
          \draw [highedge] (n12) -- (false);
          
          \path[-] (n11) edge [bend right, highedge] node {} (true);
          \path[-] (n10) edge [bend left, highedge] node {} (false);
        \end{scope}
        \end{tikzpicture}
        \end{center}
    % \end{minipage}
    \end{minipage}
\end{center}

Let $P(t)$ be the statement, $S_{i,t} = \ite{ g(\vec{x}) < t \lor t > 2^b-2^{i-1} - 1}
                                            { \lsb{i}{g(\vec{x})} }
                                            { N_{i,t} } $.

$P(0)$ holds as the initial node of all $\bddrt{C}{i}$ is always $N_{i,0}$.

Assume $P(t)$ for an arbitrary $t \leq 2^b - 2$ (so that $x_t$ is a decision variable).

Consider going to the next node. If we are at a terminal node, then the state stays the same, otherwise we go to the low or high edge based on $x_t$. Thus, to advance state, we replace $N_{i,t}$ with $\text{if \quad } x_t \text{\quad then \quad} \high{N_{i,t}}\text{\quad else \quad} \low{N_{i,t}}$.
\begin{align*}
    S_{i,t + 1} {}={} 
        &\text{if \quad  } g(\vec{x}) < t \lor t > 2^b-2^{i-1} - 1 \\
        &\text{then \quad} \mhl{\lsb{i}{g(\vec{x})}} \\
        &\text{else \quad} \mhl{\itep{ x_t }{ \high{N_{i,t}} }{ \low{N_{i,t}} }}
\end{align*}
We substitute for $\high{N_{i,t}}$ and $\low{N_{i,t}}$.
\begin{align*}
    S_{i,t + 1} {}={} 
        &\text{if \quad } g(\vec{x}) < t \lor t > 2^b-2^{i-1} - 1 \\
        &\text{then \quad} \lsb{i}{g(\vec{x})} \\
        &\text{else \quad} \itep { x_t }{\mhl{ \lsb{i}{t} }}{\mhl{ \itep{t < 2^b - 2^{i - 1} - 1}{N_{i,t+1}}{1} }}
\end{align*}
$x_t \land \lnot (g(\vec{x}) < t)$ implies $g(\vec{x}) = t$.
\begin{align*}
    S_{i,t + 1} {}={} 
        &\text{if \quad } g(\vec{x}) < t \lor t > 2^b-2^{i-1} - 1\\
        &\text{then \quad} \lsb{i}{g(\vec{x})} \\
        &\text{else \quad} \itep { x_t }{ \lsb{i}{\mhl{ g(\vec{x}) }} }{ \itep{t < 2^b - 2^{i - 1} - 1}{N_{i,t+1}}{1} }
\end{align*}

Two branches are now equivalent ($\lsb{i}{g(\vec{x})}$) and can be combined.
\begin{align*}
    S_{i,t + 1} {}={} 
        &\text{if \quad } g(\vec{x}) < t \lor t > 2^b-2^{i-1}-1 \mhl{ \lor x_t }\\
        &\text{then \quad} \lsb{i}{g(\vec{x})} \\
        &\text{else \quad} \mhl{ \itep{t < 2^b - 2^{i - 1} - 1}{N_{i,t+1}}{1} }
\end{align*}
$g(\vec{x}) < t \lor x_t$ is equivalent to $g(\vec{x}) < t + 1$.
\begin{align*}
    S_{i,t + 1} {}={} 
        &\text{if \quad } \mhl{ g(\vec{x}) < t + 1 } \lor t > 2^b-2^{i-1}-1 \\
        &\text{then \quad} \lsb{i}{g(\vec{x})} \\
        &\text{else \quad} \itep{t < 2^b - 2^{i - 1} - 1}{N_{i,t+1}}{1}
\end{align*}

In the outer else, $t \leq 2^b-2^{i-1} - 1$ (by the condition), so the inner condition $t < 2^b-2^{i-1} - 1$ is equivalent to $t \neq 2^b-2^{i-1} - 1$; that is, in the inner else, $t$ is also $\geq 2^b-2^{i-1} - 1$ and hence equal to it.
\begin{align*}
    S_{i,t + 1} {}={} 
        &\text{if \quad } g(\vec{x}) < t + 1 \lor t > 2^b-2^{i-1} - 1 \\
        &\text{then \quad} \lsb{i}{g(\vec{x})} \\
        &\text{else \quad} \itep{t \mhl{\neq} 2^b - 2^{i - 1} - 1}{N_{i,t+1}}{1}
\end{align*}

In the inner else, $g(\vec{x}) > t$ by the outer condition and $t = 2^b - 2^{i-1} - 1$ by the inner condition; together, $g(\vec{x}) > 2^b - 2^{i-1} - 1$. For all such $g(\vec{x})$, $\lsb{i}{g(\vec{x})} = 1$ (as $2^b - 1 - 2^{i - 1}$ is the largest number at most $2^b - 1$ with the $i^\text{th}$ bit set to 0). Thus, we can merge two more branches.
\begin{align*}
    S_{i,t + 1} {}={} 
        &\text{if \quad } g(\vec{x}) < t + 1 \lor t \mhl{ + 1 } > 2^b-2^{i-1} - 1\\
        &\text{then \quad} \lsb{i}{g(\vec{x})} \\
        &\text{else \quad} \mhl{N_{i,t+1}}
\end{align*}

Thus $P(t) \to P(t + 1)$ and thus $P(t)$ holds for all $1 \leq t \leq 2^b - 1$.

Consider $P(2^b - 1)$.
\begin{align*}
S_{i,2^b - 1} &= \ite{ g(\vec{x}) < 2^b - 1 \lor 2^b - 1 > 2^b-2^{i-1} - 1}
                                            { \lsb{i}{g(\vec{x})} }
                                            { N_{i,2^b - 1} } \\
    S_{i,2^b - 1} &=  \lsb{i}{g(\vec{x})}
\end{align*}

Therefore $\ftobdd{g}$ and $C$ are equivalent.
\end{proof}

\textbf{Proposition 1 (first half).}

\begin{proof}
Let $\operatorname{encode_{CATEG}}(\mathcal{D}_b) = (B, \vec{p})$. $B$ has $b2^b-2^b+1$ decision nodes.

It follows directly from Lemma~\ref{sbkstructure} that $\sum_{i=1}^{b} \left(2^b-2^{i-1}\right) = b2^b - 2^b + 1$ nodes are needed for $B$ as well.
\end{proof}


\subsection{Bitwise Encoding of Integer Distributions}

Let $s_i$ denote the $i^\text{th}$ bit of a 1-indexed bit string; for example, $0010_3 = 1$. Let $|s|$ denote the length of a bit string. Let $\concat$ denote concatenation for bits and bit strings.

\begin{definition}
Define $\operatorname{encode_{\BWH}}(\mathcal{D}_b) = (B, \vec{p})$ such that:

For each bit string $s$, $0 \leq |s| < b$, there is a decision variable $x_s$ with truth probability $p_s = \Pr(v_{|s|+1}^\star \mid  \bigwedge\limits_{i=1}^{|s|} v_i^\star = s_i)$. 
Intuitively, the bits are chosen left-to-right, and each decision variable chooses the next bit given a bit string of past choices. Consider the following examples, where the empty bit string is denoted as $\varepsilon$.
\begin{align*}
    \Pr(x_\varepsilon) &= \Pr(v_{1}^\star)\\
    \Pr(x_0) &= \Pr(v_{2}^\star \mid \lnot v_{1}^\star)\\
    \Pr(x_1) &= \Pr(v_{2}^\star \mid v_{1}^\star)\\
    \Pr(x_{0100}) &= \Pr(v_{5}^\star \mid \lnot v_{1}^\star \land v_{2}^\star \land \lnot v_{3}^\star \land \lnot v_{4}^\star)
\end{align*}
 We specify $\bddrt{B}{i}$ for $1 \leq i \leq b$:
\begin{align*}
    \bddrt{B}{i} = x_{\bddrt{B}{1} \concat \bddrt{B}{2} \concat \hdots \concat \bddrt{B}{i - 1}}
\end{align*}
For example, $\bddrt{B}{1} = x_\varepsilon$, $\bddrt{B}{2} = x_{\bddrt{B}{1}}$, $\bddrt{B}{3} = x_{\bddrt{B}{1} \concat \bddrt{B}{2}}$, and so on. Note that the specification above is equivalent to the following, which is a more natural representation in probabilistic programming, and an ``unrolled'' version of Algorithm 2.
\begin{align*}
    \bddrt{B}{1} &= x_\varepsilon\\
    \bddrt{B}{2} &= \text{if } x_\varepsilon \text{ then } x_1 \text { else } x_0\\
    \bddrt{B}{3} &= \text{if } x_\varepsilon \text{ then } (\text{if } x_1 \text{ then } x_{11} \text{ else } x_{10}) \text{ else } (\text{if } x_0 \text{ then } x_{01} \text{ else } x_{00})\\
    \vdots\\
    \bddrt{B}{b} &= \dots
\end{align*}
\end{definition}

\begin{lemma} \label{bwhencodes}
For any distribution $\mathcal{D}_b$ over unsigned integers, $\operatorname{encode_{\BWH}}(\mathcal{D}_b)$ encodes $\mathcal{D}_b$.
\end{lemma}

\begin{proof}
We prove that for any possible assignment to bits $\vec{a} \in \{0, 1\}^b$, $\Pr(B(\vec{X}) = \vec{a}) = \Pr(\vec{v^\star} = \vec{a})$.
\begin{align*}
    \Pr(B(\vec{X}) = \vec{a}) &= \prod_{i=1}^b \Pr(B_i(\vec{X}) = a_i \mid \bigwedge_{j=1}^{i-1} B_j(\vec{X}) = a_j) && \text{(Chain rule of probability)}\\
    &= \prod_{i=1}^b \Pr(\mhl{X_{B_1(\vec{X})\concat \dots \concat B_{i-1}(\vec{X})}} = a_i \mid \bigwedge_{j=1}^{i-1} B_j(\vec{X}) = a_j) && \text{(Bit specification)}\\
    &= \prod_{i=1}^b \Pr(X_{\mhl{a_1 \concat \dots \concat a_{i-1}}} = a_i \mid \bigwedge_{j=1}^{i-1} B_j(\vec{X}) = a_j) && \text{(Condition)}\\
    &= \prod_{i=1}^b \Pr(X_{a_1\concat \dots \concat a_{i-1}} = a_i \mid \bigwedge_{j=1}^{i-1} \mhl{X_{B_1(\vec{X})\concat \dots \concat B_{j-1}(\vec{X})}} = a_j) && \text{(Bit specification)}\\
    &= \prod_{i=1}^b \Pr(X_{a_1\concat \dots \concat a_{i-1}} = a_i) && \text{(Independence)}\\
    &= \prod_{i=1}^b \Pr(v^\star_i = a_i \mid \bigwedge_{j=1}^{i-1} v^\star_j=a_j) && \text{(As $\Pr(X_s) = p_s$)}\\
    &= \Pr(\vec{v^\star} = \vec{a}) && \text{(Chain rule of probability)}
\end{align*}
\end{proof}



\subsection{BDD Size of the BITWISE\_INT  encoding}

We prove the second half of Proposition 1.
\begin{propositionhalf}{1 (second half)} \label{bwhsize}
A discrete distribution over the integers $\{0, 1, \ldots, 2^b - 1\}$ compiles to a BDD of size $\Theta(2^b)$ when represented using \BWH{} (Algorithm 2).
\end{propositionhalf}


\begin{lemma} \label{bwhstructure}
Let $\operatorname{encode_{\BWH}}(\mathcal{D}_b) = (B, \vec{p})$.

$B$ exactly matches a BDD $C$ with the following reduced structure.

For all $1 \leq i \leq b$, for all bit strings $s$ of length less than $i$, let $C$ have a node $N_{i,s}$ corresponding to decision variable $x_s$.
\begin{align*}
    \low{N_{i, s}} &= \ite{ |s| = i - 1 }{ 0 }{N_{i, s\concat 0}}\\
    \high{N_{i, s}} &= \ite{|s| = i - 1} {1}{N_{i, s\concat 1}}
\end{align*}
Let the roots of $C$ be placed such that $\bddrt{C}{i}$ matches $N_{i,\varepsilon}$ given assignments to decision variables $\vec{x}$.

Any ordering of decision variables is valid as long as they are ordered by increasing bit string lengths. We arbitrarily choose to order bit strings of the same length by their lexicographical order.
\end{lemma}

As an example, the BDD for $b = 2$ follows (terminal nodes visually duplicated for clarity).
\vspace{-10pt}
% \input{uai23/bdds/supp_sbk1.pdf_tex}
% \psfrag{C1}{$C_1$}
% \psfrag{N10}{$N_{10}$}
% \psfrag{N11}{$N_{11}$}
% \psfrag{N12}{$N_{12}$}
% \psfrag{0}{0}
% \psfrag{1}{1}
% \includegraphics[width=.15\textwidth]{uai23/bdds/supp_sbk1.eps}

\begin{center}
    \begin{minipage}[t]{0.15\textwidth}
    \vspace{0pt}
        \begin{center}
        \begin{tikzpicture}
          \def\lvl{30pt}
          \def\offset{15bp}
            \node (c1) at (0, 0) [root] {$C_1$};
            \node (n1e) at ($(c1) + (0bp, -30pt)$) [bddnode] {$N_{1,\varepsilon}$};
            \node (true) at ($(n1e) + (\offset, -\lvl-30pt)$) [bddterminal] {$\true$};
            \node (false) at ($(n1e) + (-\offset, -\lvl-30pt)$) [bddterminal] {$\false$};
            
            \begin{scope}[on background layer]
              \draw [-stealth] (c1) -- (n1e);
              \draw [lowedge] (n1e) -- (false);
              \draw [highedge] (n1e) -- (true);
        \end{scope}
        \end{tikzpicture}
        \end{center}
    \end{minipage}
    \begin{minipage}[t]{0.15\textwidth}
    \vspace{0pt}
    \begin{tikzpicture}
      \def\lvl{30pt}
      \def\offset{15bp}
        \node (c2) at (0, 0) [root] {$C_2$};
        \node (n2e) at ($(c2) + (0bp, -30pt)$) [bddnode] {$N_{2,\varepsilon}$};
        \node (n20) at ($(n2e) + (-\offset, -\lvl)$) [bddnode] {$N_{2,0}$};
        \node (n21) at ($(n2e) + (\offset, -\lvl)$) [bddnode] {$N_{2,1}$};
        \node (true) at ($(n21) + (0bp, -30pt)$) [bddterminal] {$\true$};
        \node (false) at ($(n20) + (0bp, -30pt)$) [bddterminal] {$\false$};
        
        \begin{scope}[on background layer]
          \draw [-stealth] (c2) -- (n2e);
          \draw [lowedge] (n2e) -- (n20);
          \draw [lowedge] (n20) -- (false);
          \draw [lowedge] (n21) -- (false);
          \draw [highedge] (n2e) -- (n21);
          \draw [highedge] (n20) -- (true);
          \draw [highedge] (n21) -- (true);
    \end{scope}
    \end{tikzpicture}
    \end{minipage}
\end{center}


\begin{proof}


Let $S_{i,j}(\vec{x})$ denote the state of $C_i$ at the level of the first decision variable whose bit string is of length $j$. See the following BDD and corresponding states as an example.
\begin{center}
    \begin{minipage}{0.3\textwidth}
    \begin{center}
        \begin{alignat*}{4}
        S_{2,0}(&x_\varepsilon,x_0,x_1) &&= N_{2,\varepsilon}\\
        S_{2,1}(&0, x_0, x_1) &&= N_{2,0}\\
        S_{2,1}(&1, x_0, x_1) &&= N_{2,1}\\
        S_{2,2}(&0, x_0, x_1) &&= x_0\\
        S_{2,2}(&1, x_0, x_1) &&= x_1
        \end{alignat*}
    \end{center}
    \end{minipage}
    \begin{minipage}{0.4\textwidth}
    \begin{center}
    \begin{tikzpicture}
      \def\lvl{30pt}
      \def\offset{15bp}
        \node (c2) at (0, 0) [root] {$C_2$};
        \node (n2e) at ($(c2) + (0bp, -30pt)$) [bddnode] {$N_{2,\varepsilon}$};
        \node (n20) at ($(n2e) + (-\offset, -\lvl)$) [bddnode] {$N_{2,0}$};
        \node (n21) at ($(n2e) + (\offset, -\lvl)$) [bddnode] {$N_{2,1}$};
        \node (true) at ($(n21) + (0bp, -30pt)$) [bddterminal] {$\true$};
        \node (false) at ($(n20) + (0bp, -30pt)$) [bddterminal] {$\false$};
        
        \begin{scope}[on background layer]
          \draw [-stealth] (c2) -- (n2e);
          \draw [lowedge] (n2e) -- (n20);
          \draw [lowedge] (n20) -- (false);
          \draw [lowedge] (n21) -- (false);
          \draw [highedge] (n2e) -- (n21);
          \draw [highedge] (n20) -- (true);
          \draw [highedge] (n21) -- (true);
    \end{scope}
    \end{tikzpicture}
    \end{center}
    \end{minipage}
\end{center}


Let $P(t)$ be the statement, $S_{i,t}(\vec{x}) =  \ite{ t \geq i }{ \bddrt{B}{i} }{ N_{i,\bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}}}$.

$P(0)$ holds as there is no bit index less than or equal to 0 and $S_{i,0}(\vec{x}) = N_{i,\varepsilon}$ by the placement of the roots.

Assume $P(t)$, which specifies $S_{i, t}$. Consider advancing to the next node in the BDD based on the assignment to decision variables (which does nothing if we are already at a terminal node):
\begin{align*}
    \operatorname{next}(S_{i,t}) {}={} 
        &\text{if \quad } t \geq i\\
        &\text{then \quad}
            \bddrt{B}{i}\\
        &\text{else \quad} \itep
        { x_{\bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}} } 
        { \high{N_{i,\bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}}} }
        { \low{N_{i,\bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}}} }
\end{align*}

We replace $x_{\bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}}$ with $\bddrt{B}{t + 1}$.
\begin{align*}
    \operatorname{next}(S_{i,t}) {}={} 
        &\text{if \quad } t \geq i\\
        &\text{then \quad}
            \bddrt{B}{i}\\
        &\text{else \quad} \itep{ \bddrt{B}{t + 1} }
            { \high{N_{i,\bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}}} }
            { \low{N_{i,\bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}}} }
\end{align*}

We replace the high and low edges by the definition:
\begin{align*}
    \operatorname{next}(S_{i,t}) {}={} 
        &\text{if \quad } t \geq i &&\\
        &\text{then \quad} \bddrt{B}{i} &&\\
        &\text{else \quad} (&&\text{if \quad } \bddrt{B}{t + 1}\\
            & &&\text{then \quad}
                \left(
                \text{if \quad } t = i - 1 \text{\quad then \quad} 1 \text{\quad else \qquad} N_{i, \bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}\concat 1}
                \right)\\
            & &&\text{else \quad}
                \left(
                \text{if \quad } t = i - 1 \text{\quad then \quad} 0 \text{\quad else \qquad} N_{i, \bddrt{B}{1} \concat \dots \concat \bddrt{B}{t}\concat 0}
                \right)
            )
\end{align*}

We rearrange the if conditions:
\begin{align*}
    \operatorname{next}(S_{i,t}) {}={} 
        &\text{if \quad } t \geq i &&\\
        &\text{then \quad} \bddrt{B}{i} &&\\
        &\text{else \quad} (&&\text{if \quad } t=i-1\\
            & &&\text{then \quad}
                \left(
                \text{if \quad } \bddrt{B}{t+1} \text{\quad then \quad} 1 \text{\quad else \qquad} 0
                \right)\\
            & &&\text{else \quad}
                \left(
                \text{if \quad } \bddrt{B}{t+1} \text{\quad then \quad} N_{i, \bddrt{B}{1} \concat \dots \concat \bddrt{B}{t} \concat 1} \text{\quad else \qquad} N_{i, \bddrt{B}{1} \concat \dots \concat \bddrt{B}{t} \concat 0}
                \right)
            )\\
    \operatorname{next}(S_{i,t}) {}={} 
        &\text{if \quad } t \geq i &&\\
        &\text{then \quad} \bddrt{B}{i} &&\\
        &\text{else \quad} (&&\text{if \quad } t=i-1\\
            & &&\text{then \quad}
                \bddrt{B}{t+1}\\
            & &&\text{else \quad}
                N_{i, \bddrt{B}{1} \concat \dots \concat \bddrt{B}{t+1}}
            )
\end{align*}

The first two $\operatorname{then}$ branches collapse:
\begin{align*}
    \operatorname{next}(S_{i,t}) {}={} 
        &\text{if \quad } t + 1 \geq i &&\\
        &\text{then \quad} \bddrt{B}{i} &&\\
        &\text{else \quad}
                N_{i, \bddrt{B}{1} \concat \dots \concat \bddrt{B}{t+1}}
\end{align*}

As the only node we now reach has bit string length $t+1$, $\operatorname{next}(S_{i,t}) = S_{i,t+1}$. Therefore $P(t) \to P(t+1)$.

Consider $P(b-1)$: $S_{i,b-1} =  \text{if \quad } b-1 \geq i \text{\quad then \quad} \bddrt{B}{i} \text{\quad else \quad} N_{i,\bddrt{B}{1} \concat \dots \concat \bddrt{B}{b-1}}$.

For all $i \in \{1, \dots, b-1\}$, we see that the root labeled $\bddrt{C}{i}$ reaches the value $\bddrt{B}{i}$. For $i = b$, the state reaches $N_{b, \bddrt{B}{1}\concat \dots \concat \bddrt{B}{b-1}}$, whose high and low edges are 1 and 0, respectively. The last step goes to $\itep{ x_{\bddrt{B}{1}\concat \dots \concat \bddrt{B}{b-1}} }{ 1 }{ 0 } = \bddrt{B}{b}$. Therefore $B$ and $C$ are equivalent.
\end{proof}

\textbf{Proposition 1 (second half).}

\begin{proof}

Let $\operatorname{encode_{\BWH}}(\mathcal{D}_b) = (B, \vec{p})$. $B$ has $2^{b+1}-b-2$ decision nodes.

It directly follows from Lemma~\ref{bwhstructure} that $B$ requires the following number of nodes.
\begin{gather*}
    \sum_{i=1}^b (\text{\# of bit strings of length less than $i$}) = 2^{b+1}-b-2
\end{gather*}
\end{proof}


\subsection{Encoding Uniform Integer Distributions}
Let ${U_{n}}$ denote the discrete uniform distribution over the integers $\{0, 1, \dots, n-1\}$:
\begin{flalign}
\;\;\;\;\;\;\;\;\;\;&{U_{n}}(i) = \begin{cases} 
      \frac{1}{n} & i \in \{0, 1, \dots, n-1\}   \\
      0 & \text{otherwise} 
  \end{cases}&&\nonumber
\end{flalign}

\begin{lemma} \label{uniform1}
For all non-negative integers $b$ and $i$ such that $0 < i \leq b$, and $x_{i} \in \{0, 1\}$,
\[U_{2^b}(\langle x_b, x_{b-1}, \ldots, x_{1}\rangle) =  \frac{1}{2} \cdot U_{2^{b-1}}(\langle x_{b-1}, \ldots, x_{1}\rangle) \]
\end{lemma}
\begin{proof}
Let $a = \langle x_b, x_{b-1}, \ldots, x_{1}\rangle$, then
\begin{flalign*}
U_{2^b}(a) &= \frac{1}{2^b} = \frac{1}{2} \cdot \frac{1}{2^{b-1}} = \frac{1}{2} \cdot U_{2^{b-1}}(a - x_b 2^{b-1}) = \frac{1}{2} \cdot U_{2^{b-1}}(\langle x_{b-1}, \ldots, x_1\rangle)
\end{flalign*}
\end{proof}

\begin{lemma} \label{uniform2} For integers $b$ and $i$ such that $0 < i \leq b$, $x_i \in \{0, 1\}$ and $b > 1$,
\[ [\mathit{UNIFORM}(b)](x_b, x_{b-1}, ..., x_1) = \frac{1}{2} [\mathit{UNIFORM}(b-1)](x_{b-1}, ..., x_1) \]
\end{lemma}
\begin{proof}
Consider the case when $x_b = 0$. The proof for the other case when $x_b = 1$ would go in a similar fashion.
Let $\pi$ be the set of executions of UNIFORM($b$) resulting in $[0, x_{b-1}, ..., x_1]$. Let $\pi'$ be the set of executions of UNIFORM($b-1$) resulting in $[x_{b-1}, ..., x_1]$. Since $b > 1$ by assumption, $X_b$ gets assigned flip$_1$ evaluating to false; we return $0::$UNIFORM($b-1$).
\begin{flalign*}
&[\mathit{UNIFORM}(b)](0, x_{b-1}, ..., x_1)\\ 
 &= \sum_{p \in \pi} w(p) = \frac{1}{2}\sum_{p \in \pi'} w(p) = \frac{1}{2} [\mathit{UNIFORM}(b-1)](x_{b-1}, ..., x_1)
\end{flalign*}
\end{proof}

\begin{propositionhalf}{2}\label{uniform}
$\forall b > 0$, $[\mathit{UNIFORM}(b)] = U_{2^{b}}$.
\end{propositionhalf}
\begin{proof}
The proof is by induction on $b$.\\
\textbf{Base Case:} For $b = 1$,
\[ U_{2}(a) = \begin{cases}
    \frac{1}{2} & a = 0\\
    \frac{1}{2} & a = 1\\
    0 & \text{otherwise}
    \end{cases}
\]
Executing UNIFORM(1) would result in $X_{1} = \text{flip}\ \frac{1}{2}.$
So, $\Pr(X_{1} = 1) = \frac{1}{2}$ and $\Pr(X_{1} = 0) = \frac{1}{2}$.\\
\textbf{Induction Hypothesis:} $\forall b' < b\; [\mathit{UNIFORM}(b')] = U_{2^{b'}}$\\
\textbf{Inductive Step:} To prove $[\mathit{UNIFORM}(b)] = U_{2^{b}}$\\
Let the output of UNIFORM($b$) be $a$, where $a = \langle x_b, x_{b-1}, \dots, x_1\rangle$.

\underline{Case 1:} $\forall 0 \leq a < 2^{b-1}$\\
Since $a < 2^{b-1}, x_{b} = 0$
\begin{flalign*}
&[\text{UNIFORM}(b)](0, x_{b-1}, ..., x_1)&&\\
&= \frac{1}{2}[\text{UNIFORM}(b-1)](x_{b-1}, ..., x_1)
= \frac{1}{2}\cdot U_{2^{b-1}}(\langle x_{b-1}, ..., x_1\rangle) && \tag{by Lemma~\ref{uniform2} and I.H.}\\
&= U_{2^b}(\langle 0, x_{b-1}, ..., x_1\rangle) && \tag{by Lemma~\ref{uniform1}}
\end{flalign*}

\underline{Case 2:} $\forall 2^{b-1} \leq a < 2^{b}$\\
Since $a \geq 2^{b-1}$, $x_{b} = 1$, the proof is similar to that for Case 1.
\end{proof}

\section{Beta Prior Application: Bayesian Network Parameter Learning}



  %   \begin{subfigure}[b]{0.2\linewidth}
  %   %
  %   \begin{tabular}{c|c|c|c|c|c}
  %       A & S & E & O & R & T \\
  %       \hline
  %       1  & ?  & ? & 1 & 1 & 1 \\
  %       1  & 0  & ? & 1 & 0 & 1 \\
  %       1  & 0  & 1 & ? & 0 & 1 \\
  %       1  & ?  & 1 & 0 & ? & 1 \\
  %       0  & 0  & 1 & 1 & ? & 1 \\
  %       0  & 1  & ? & 1 & 1 & ? \\
  %       % 1  & ?  & 0 & 0 & 1 & 0 \\
  %       % 0  & ?  & 1 & ? & ? & ? \\
  %       % 1  & 1  & 1 & ? & 1 & ? \\
  %       % 1  & 0  & 0 & ? & 1 & 1 \\
  %       \hline
  %   \end{tabular}
  %   \end{subfigure}
  %   \\
  %   % \begin{minipage}{1cm}
  %   \begin{subfigure}[b]{0.2\linewidth}
  %   \begin{tabular}{c|c|c}
  %       $\alpha$ & $\beta$ & Pr(.)  \\
  %       \hline
  %       9 & 3 & 0.226\\
  %       4 & 8 & 0.035\\
  %       \hline
  %   \end{tabular}
  %   \end{subfigure}
  %   &
  %   \begin{subfigure}[b]{0.2\linewidth}
  %   \begin{tikzpicture}
  %   \begin{axis}[
  %       height=3cm,
  %       width=4cm,
  %       grid=major,
  %       xmode=linear,
  %       ymode=linear,
  %       % xtick={2,4,6,8,10},
  %       % ytick={0.001, 0.01, 0.1, 1, 10, 100, 1000},
  %       xlabel={$\theta_{O|E}$}
  %       % ylabel={PDF}
  %       % legend columns=4,
  %       % legend style={at={(-5em,-7em)},anchor=north west}
  %   ]
  %   \addplot[mark=none, thick, blue] table [x index={0}, y index={1}] {\pdfdata}; %
  %   % \addlegendentry{WebPPL};
    
  %   \end{axis}
  % \end{tikzpicture}
  % \end{subfigure}
  %   \\
  % \end{tabular}
    


% \william{steven: either priors, or highly structured constraints (priors are all sampling based, struggle with hd-discrete constraints}

% \william{typically people use things like stan, we do this exactly}

One natural application of a Beta prior is as a prior distribution for learning the parameters of a (binary) Bayesian network. The task of Bayesian network parameter learning can be described as follows: given a set of data consisting of instantiations of network variables, we want to find the network parameters maximizing the probability of this data. One interesting case is when our data is incomplete; that is, there are some variables not given a value. In this setting, a Bayesian approach to parameter learning must consider all (exponentially many) possible instantiations of these missing values~\citep{darwiche_2009}.

Using our Beta prior implementation described in the main paper, this setting can naturally be modeled within \texttt{Dice.jl}. A Bayesian network can be expressed in the probabilistic program with Beta priors on each network parameter; a dataset can then be observed. By returning the distribution over our Beta parameters $\alpha$ and $\beta$, we obtain our posterior, a mixture over Beta distributions. These can then be manually combined to obtain the exact posterior density function.

We provide a brief example of this on the survey Bayesian network\footnote{\url{https://www.bnlearn.com/bnrepository/}}. The structure of this network is shown below; we simplify the network by making all variables binary so that the Beta is a suitable prior. We note that we can actually generalize to the non-binary case with a Dirichlet prior using an approach similar to that used for the Beta.

\begin{center}
    \begin{tikzpicture}[node distance=0.75cm, baseline=-1em]


      \node (A) at (0, 0) [bddnode] {\textsc{a}};
      \node (S) at ($(A) + (40bp, 0)$) [bddnode] {\textsc{s}};
      \node (E) at ($(A) + (20bp, -20bp)$) [bddnode] {\textsc{e}};
      \node (O) at ($(E) + (-20bp, -20bp)$) [bddnode] {\textsc{o}};
      \node (R) at ($(O) + (40bp, 0)$) [bddnode] {\textsc{r}};
      \node (T) at ($(R) + (-20bp, -20bp)$) [bddnode] {\textsc{t}};

      \draw[->] (A) -- (E);
      \draw[->] (S) -- (E);
      \draw[->] (E) -- (O);
      \draw[->] (E) -- (R);
      \draw[->] (O) -- (T);
      \draw[->] (R) -- (T);

    \end{tikzpicture}
\end{center}


Suppose we want to learn parameters from the dataset given below; the entry ? indicates a missing value. We focus on the specific parameter $\theta_{o|e} = \Pr(O=1|E=1)$, which we give a uniform prior $\Beta(1, 1)$. 

% \begin{center}
%     %
%     \centering
%     \begin{tabular}{c|c|c|c|c|c}
%         A & S & E & O & R & T \\
%         \hline
%         1 & 0 & 1 & 1 & 0 & 1 \\
%         1 & 1 & 1 & 0 & 1 & 1 \\
%         1 & 1 & 1 & 1 & 0 & 1 \\
%         1 & 1 & 0 & 1 & 0 & 1 \\
%         0 & 1 & 0 & 0 & 0 & 0 \\
%         \hline
%     \end{tabular}
% \end{center}

% Based on our knowledge of the Beta prior's conjugate property, our posterior should be $\Beta(3, 2)$: when $E = 1$, we see $O=1$ twice and $O=0$ once. If we write this as a \texttt{Dice.jl} program in the manner described in the main paper and return the parameters of the correct Beta prior, we get the following output:
% \begin{center}
%     \begin{tabular}{c|c|c}
%         $\alpha$ & $\beta$ & Pr(.)  \\
%         \hline
%         5 & 2 & 0.4\\
%         4 & 3 & 0.4\\
%         3 & 4 & 0.2 \\
%         \hline
%     \end{tabular}
% \end{center}

% What we have is in fact a mixture over Beta distributions --- $0.4\Beta(5, 2) + 0.4\Beta(4, 3) + 0.2\Beta(3, 4)$; this results from our method of representing the Beta; in the update step, we conditionally update our internal representation of the Beta parameters. If the flips we generate are fully observed, our output distribution will be the posterior, however, they are not in this case, resulting in a mixture. This is not an issue, however: looking at the corresponding density functions, we can see they are actually equivalent. 


\begin{center}
    %
    \centering
    \begin{tabular}{c|c|c|c|c|c}
        A & S & E & O & R & T \\
        \hline
        1 & ? & ? & 1 & 1 & 1 \\
        1 & 0 & ? & 1 & 0 & 1 \\
        1 & 0 & 1 & ? & 0 & 1 \\
        1 & ? & 1 & 0 & ? & 1 \\
        0 & 0 & 1 & 1 & ? & 1 \\
        0 & 1 & ? & 1 & 1 & ? \\
        1 & ? & 0 & 0 & 1 & 0 \\
        0 & ? & 1 & ? & ? & ? \\
        1 & 1 & 1 & ? & 1 & ? \\
        1 & 0 & 0 & ? & 1 & 1 \\
        \hline
    \end{tabular}
\end{center}

By running the program representing this task, we get the following large output distribution over Beta parameters --- a mixture over Beta distributions. Note that it does not contain as many entries as possible instantiations, as some complete variable instantiations result in the same posterior Beta.

\begin{center}
    \begin{tabular}{c|c|c}
        $\alpha$ & $\beta$ & Pr(.)  \\
        \hline
        9 & 3 & 0.226\\
        8 & 4 & 0.207\\
        10 & 2 & 0.177 \\
        7 & 5 & 0.160 \\
        6 & 6 & 0.109 \\
        5 & 7 & 0.066 \\
        4 & 8 & 0.035 \\
        3 & 9 & 0.016 \\
        2 & 10 & 0.005 \\
        \hline
    \end{tabular}
\end{center}

If we plot the corresponding mixture of Betas, we can get the following posterior PDF --- note that this is an exact posterior recovered from the Beta parameters, in contrast to the approximate result one would get from sampling-based inference methods.

The code used in this example is available in the same \texttt{Dice.jl} repository (\url{https://github.com/Juice-jl/Dice.jl/tree/arithmetic}).

\begin{center}
    \includegraphics[scale=0.5]{data/posterior.png}
\end{center}

\bibliography{cao_757}

\end{document}
