% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent

\makeatletter
% \blfootnote{<text>}: a footnote without a mark. The current footnote mark
% (\@thefnmark) is globally made empty, then <text> is picked up by
% \@footnotetext as its argument.
\def\blfootnote{\global\let\@thefnmark\@empty \@footnotetext}
\makeatother



% \newcommand\blfootnote[1]{%
%   \begingroup
%   \renewcommand\thefootnote{}\footnote{#1}%
%   \addtocounter{footnote}{-1}%
%   \endgroup
% }

% 


\usepackage[american]{babel}
\usepackage{svg, amsfonts}

\usepackage{subcaption}
\usepackage[titlenumbered,ruled,resetcount,noend]{algorithm2e}
\usepackage{algpseudocode}
% algorithmicx block pair \Indent ... \EndIndent (block name SUBALG): the
% start text is empty and the end text is \algorithmicend.
% NOTE(review): algorithm2e is loaded just above together with algpseudocode;
% the two families are normally not mixed -- confirm both are really needed.
\algdef{SE}[SUBALG]{Indent}{EndIndent}{}{\algorithmicend\ }%
% Starred \algtext: presumably removes the lines \Indent and \EndIndent would
% otherwise print, so that only the extra indentation remains -- verify
% against the algorithmicx manual.
\algtext*{Indent}
\algtext*{EndIndent}
% Theorem-like environments. Lemma, proposition and corollary are numbered
% on the theorem counter (optional argument [theorem]).
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
% Disabled custom theorem style, kept for reference:
% \newtheoremstyle{Definition} % <name>
% {}% <Space above>
% {}% <Space below>
% {}% <Body font>
% {}% <Indent amount>
% {\upshape}% <Theorem head font>
% {:}% <Punctuation after theorem head>
% {}% <Space after theorem headi>
% {}% <Theorem head spec (can be left empty, meaning `normal')>
% Definitions are numbered on their own counter.
\newtheorem{definition}{Definition}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{wrapfig}
\usepackage{float}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{listings}
% \lstset{basicstyle=\ttfamily, }
% Colours used by commentstyle/stringstyle below. Nothing earlier in this
% preamble defines them, so the first comment or string inside a listing
% would stop with "Undefined color". \providecolor only defines a colour
% that does not exist yet, so a definition made elsewhere is left untouched.
\providecolor{dkgreen}{rgb}{0,0.6,0}
\providecolor{mauve}{rgb}{0.58,0,0.82}
% Default appearance of every lstlisting in the paper: rules above and below,
% small typewriter text, line numbers in the left margin.
\lstset{frame=tb,
  language=Caml,
  aboveskip=3.5mm,
  belowskip=0mm,
  showstringspaces=false,
  columns=flexible,
  basicstyle={\small\ttfamily},
  numberstyle=\tiny\color{black},
  keywordstyle=\color{black},
  commentstyle=\color{dkgreen},
  stringstyle=\color{mauve},
  breaklines=true,
  tabsize=3,
  numbers=left,
  xleftmargin=2em,
  framexleftmargin=1.5em,
  escapeinside={<@}{@>} % text between <@ and @> is handed to LaTeX
}

\usetikzlibrary{patterns,calc,backgrounds}


%
% Styles for the circuit (NNF) drawings, declared with \tikzset instead of
% the older \tikzstyle syntax; the option lists are unchanged.
\tikzset{
  % picture-level style: stealth tips, small font, everything scaled to 70%
  nnf/.style={
    >=stealth, font=\small, auto, scale=0.7,
    every node/.style={scale=0.7}
  },
  % white circular node
  extnode/.style={draw, circle, inner sep=2pt, fill=white},
  % light-gray box
  leafnode/.style={draw, fill=gray!20, inner sep=3.5pt},
  % white box
  constnode/.style={draw, fill=white, inner sep=3.5pt},
  % NOTE(review): this replaces TikZ's own `label' key with a plain style;
  % confirm no picture relies on the built-in label=<text> option.
  label/.style={fill=white, inner sep=2.5pt},
}

% Both arrow styles draw their tip with the `markings' decoration, which
% lives in the decorations.markings library. No other line of this preamble
% loads that library, so the styles would fail on first use without it.
\usetikzlibrary{decorations.markings}
\tikzset{
  % Thin edge with a small latex tip placed at the very end of the path.
  acarrow/.style={
    decoration={markings,mark=at position 1 with {\arrow[scale=0.6]{>}}},
    postaction={decorate},
    shorten >=0.4pt,
    >=latex,
    line width=0.1
  },
  % Same construction with a thicker line and a larger tip.
  bnarrow/.style={
    decoration={markings,mark=at position 1 with {\arrow[scale=1.5]{>}}},
    postaction={decorate},
    shorten >=0.7pt,
    >=latex,
    line width=0.3
  },
}
% Styles for Bayesian-network and generic graph drawings (\tikzset form of
% the former \tikzstyle declarations; option lists unchanged).
\tikzset{
  % picture-level style: latex tips, thick lines
  bayesnet/.style={>=latex, thick, auto},
  % elliptical node, at least 7mm in size
  bnnode/.style={
    draw, ellipse, minimum size=7mm, inner sep=1pt, font=\small
  },
  % footnote-sized text
  cpt/.style={font=\footnotesize},
  % picture-level style for plain graphs, unscaled
  graph/.style={
    >=stealth, font=\small, auto, scale=1,
    every node/.style={scale=1}
  },
  % white circular node
  node/.style={draw, circle, inner sep=3pt, fill=white},
}

%

% Styles for binary decision diagram (BDD) drawings, written with \tikzset;
% the option lists are the same as before.
\tikzset{
  % picture-level style, scaled to 90%; the later >=stealth is the tip
  % setting that takes effect
  bdd/.style={
    >=latex, thick, >=stealth, font=\small, auto, scale=0.9,
    every node/.style={scale=0.9}
  },
  % white circular node of at least 5.5mm
  bddnode/.style={
    draw, circle, inner sep=0pt, fill=white, minimum size=5.5mm
  },
  % solid edge
  highedge/.style={line width=0.9},
  % dotted edge of the same width
  lowedge/.style={line width=0.9, dotted},
  % light-gray box
  bddterminal/.style={draw, fill=gray!20, inner sep=2.5pt, font=\small},
  % white box
  bddroot/.style={draw, fill=white, inner sep=2.5pt, font=\small},
}

% Compact listing style: tiny typewriter text; select it with style=compact.
% A listings style body must be a key=value list, so the font commands are
% given through basicstyle rather than written bare.
\lstdefinestyle{compact}{
  basicstyle=\ttfamily\tiny
}

\usepackage{forest}
\usepackage{tikz}
\usetikzlibrary{shapes,arrows}
\usetikzlibrary{arrows.meta}
\usetikzlibrary{positioning}
\tikzset{
  % solid edge ending in a Stealth arrow tip
  high/.style={-{Stealth[]}},
  % the same edge, drawn densely dashed
  low/.style={densely dashed, high},
}
% Forest style BDT: styles every node of the tree it is applied to and draws
% all edges with the `high' TikZ style.
\forestset{
  BDT/.style={
    for tree={
      % NOTE(review): this braced group reaches forest as one single option;
      % presumably it is forwarded to the node as TikZ options, and its shape
      % and size settings are repeated or overridden by the conditional on
      % the next line -- confirm it is still needed.
      {rectangle, minimum size=1.9em, inner sep = 2em},
      if n children=0{rectangle, minimum size=1.9 em}{circle, inner sep=0.2em},% nodes without children are rectangles, all others circles
      draw,% draw the outline of every node
      edge={
        high,% draw edges with the `high' style (arrow tip)
      },
      % Disabled: draw the edge to the first child with the dashed `low' style.
      % if n=1{
      %   edge+={low},% if the child is the first one, add the 0 my edge style (dashed)
      % }{},
      % font=\sffamily,% use sans serif for node text
    }
  },
}


% \lstset{frame=tb,
%   language=Caml,
%   aboveskip=3.5mm,
%   belowskip=0mm,
%   showstringspaces=false,
%   columns=flexible,
%   basicstyle={\small\ttfamily},
%   numberstyle=\tiny\color{black},
%   keywordstyle=\color{black},
%   commentstyle=\color{dkgreen},
%   stringstyle=\color{mauve},
%   breaklines=true,
%   tabsize=3,
%   numbers=left,
%   xleftmargin=2em,
%   framexleftmargin=1.5em,
%   escapeinside={<@}{@>}
% }

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Extra colour used in the plots.
\definecolor{amethyst}{rgb}{0.6, 0.4, 0.8}

% Draft annotations: \draftnote{<colour>}{<name>}{<text>} prints
% "<name>: <text>" in bold and in the given colour. Each author has a
% one-argument shorthand built on it.
\newcommand{\draftnote}[3]{\textbf{\textcolor{#1}{{#2: #3}}}}
\newcommand{\guy}[1]{\draftnote{red}{guy}{#1}}
\newcommand{\todd}[1]{\draftnote{orange}{todd}{#1}}
\newcommand{\poorva}[1]{\draftnote{blue}{poorva}{#1}}
\newcommand{\steven}[1]{\draftnote{green}{steven}{#1}}
\newcommand{\william}[1]{\draftnote{red}{william}{#1}}
\newcommand{\ryan}[1]{\draftnote{yellow}{ryan}{#1}}
% \sh notes are switched off: the argument is discarded.
\newcommand{\sh}[1]{}
% To silence another author's notes, define the shorthand the same way:
% \newcommand{\todd}[1]{}
% \newcommand{\guy}[1]{}
% \newcommand{\william}[1]{}
% \newcommand{\ryan}[1]{}

% Bold red asterisk marking a place where a citation is still missing.
\newcommand{\addcite}{\textbf{\textcolor{red}{{*}}}}

% Boolean constants for math mode.
\newcommand{\true}{\mathtt{T}}
\newcommand{\false}{\mathtt{F}}


% Upright operator names for math mode.
\DeclareMathOperator{\flip}{flip}
\DeclareMathOperator{\Beta}{Beta}
\DeclareMathOperator{\uniform}{uniform}
% Cross mark: glyph 55 of the pifont dingbats.
\usepackage{pifont}
\newcommand{\xmark}{\ding{55}}
\usepackage{pgfplots}
\usetikzlibrary{plotmarks}
% \pgfuseplotmark{asterisk}
% \pgfuseplotmark{star}
% Load the comma-separated data files once, each into its own table macro,
% so that the plots can refer to the macro instead of re-reading the file.
\pgfplotstableread[col sep=comma]{data/lessthan.csv}{\lessthandata}
\pgfplotstableread[col sep=comma]{data/equals.csv}{\equalsdata}
\pgfplotstableread[col sep=comma]{data/plus.csv}{\plusdata}
\pgfplotstableread[col sep=comma]{data/luhn.csv}{\luhndata}
\title{Scaling Integer Arithmetic in Probabilistic Programs}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Authors; the optional argument is the index of the affiliation below.
% The $^*$ marks the two authors named in the equal-contribution footnote.
\author[1]{William X. Cao}
\author[1]{Poorva Garg$^*$}
\author[2]{Ryan Tjoa$^*$}
\author[3]{Steven Holtzen}
\author[1]{Todd Millstein}
\author[1]{Guy Van den Broeck}
% Add affiliations after the authors
% Each \affil opens with "{%" so that the line break after the brace does
% not put a space in front of the affiliation text.
\affil[1]{%
    Department of Computer Science\\
    University of California\\
    Los Angeles, California, USA
}
\affil[2]{%
    Department of Computer Science\\
    University of Washington\\
    Seattle, Washington, USA
}
\affil[3]{%
    Khoury College of Computer Sciences\\
    Northeastern University\\
    Boston, Massachusetts, USA
}


\begin{document}
\maketitle

\blfootnote{$^*$These authors contributed equally to this work.}

\begin{abstract}
Distributions on integers are ubiquitous in probabilistic modeling but remain challenging for many of today's probabilistic programming languages (PPLs). The core challenge comes from discrete structure: many of today's PPL inference strategies rely on enumeration, sampling, or differentiation in order to scale, which fail for high-dimensional complex discrete distributions involving integers. 
Our insight is that there is structure in arithmetic that these approaches are not using. We present a binary encoding strategy for discrete distributions that exploits the rich logical structure of integer operations like summation and comparison. We leverage this structured encoding with knowledge compilation to perform exact probabilistic inference, and show that this approach scales to much larger integer distributions with~arithmetic. 
\end{abstract}

\begin{figure*}
\centering
\begin{minipage}[b]{.59\textwidth}
\begin{lstlisting}
id = [discrete([0.72, 0.01, 0.01, 0.01, 0.01, 
                     0.01, 0.2, 0.01, 0.01, 0.01]),...,
        discrete([0.01, 0.01, 0.05, 0.01, 0.01,
                     0.63, 0.2, 0.01, 0.01, 0.05])]
check_digit = id[0]
remaining_id = id[1:] //tail of the array
check_val = luhn_checksum(remaining_id)
observe((check_digit + (check_val % 10)) == 10)
return id
\end{lstlisting}
\caption{A probabilistic program for the student ID probabilistic inference problem using integer random variables (discrete), integer arithmetic (the Luhn algorithm function), and Bayesian conditioning (observe).}\label{example2} 
% \sh{I'm not a huge fan of the Python-style array indexing here; it could be confusing to people that are not familiar with those rules. Let's add a comment telling them what [1:] means}
\end{minipage}
\quad
\begin{minipage}[b]{.37\textwidth}
\begin{lstlisting}[language=Python]
def luhn_checksum(id)
  sum = 0
  for i in 0..length(id) - 1
    if i % 2 == length(id) % 2:
      if id[i] > 4:
        sum += 2 * id[i] - 9
      else:
        sum += 2 * id[i] 
    else: 
        sum += id[i]
  return sum 
\end{lstlisting}
\caption{Luhn algorithm implementation}\label{example1}
\end{minipage}
\end{figure*}


% \begin{figure}
% \centering
% \begin{subfigure}[b]{width = 0.3\textwidth}
% \centering
% \begin{lstlisting}
% function luhn_checksum(id)
%     sum = 0
%     for i in 0..length(id)-1
%         digit = id[i]
%         if i % 2 == length(id) % 2:
%             if digit > 4:
%                 sum += 2*digit - 9
%             else:
%                 sum += 2*digit 
%         else: 
%             sum += digit
%     return sum 
% \end{lstlisting}
% \caption{compute luhn checksum}
% \end{subfigure}
% \hfill
% % [numbers=left,
% %   stepnumber=1,
% %   numbersep=10pt,
% %   tabsize=4,
% %   showspaces=false,
% %   showstringspaces=false, 
% %   basicstyle=\small]
% \begin{subfigure}[]{0.3\textwidth}
% \begin{lstlisting} 
% id = [discrete([0.72, 0.01, 0.01, 0.01, 0.01, 0.01, 0.2, 0.01, 0.01, 0.01]), 
%       ..,
%       discrete([0.01, 0.01, 0.05, 0.01, 0.01, 0.63, 0.2, 0.01, 0.01, 0.05])]
% check_digit = id[0]
% remaining_id = id[1:]
% check_val = luhn_checksum(remaining_id)
% observe((check_digit + (check_val % 10)) == 10)
% return id
% \end{lstlisting}
% \end{subfigure}
% \caption{A probabilistic program implementing a student ID model using integer arithmetic \william{needs formatting, I want these to be side-by-side subfigures}}
% \label{example1}
% \end{figure}

\section{Introduction}\label{sec:intro}
Probabilistic programming languages (PPLs) are expressive languages for defining
probability distributions. The core idea of a PPL is to enrich a programming 
language with the ability to define, observe, and compute with random variables: hence, the program itself defines a probabilistic model.
This paper focuses on a particular programming feature: scaling inference for programs with random integers and integer arithmetic. Integers are very challenging for today's approaches to probabilistic inference.
The relationships between integer-valued random variables can be very complex: they can be added,
multiplied, compared, etc. This rich structure is opaque to today's inference strategies. Trace-based sampling methods such as Markov chain Monte Carlo, importance sampling, and sequential Monte Carlo all collapse integer distributions
to a single sampled point~\citep{gelman2015stan, bingham2019pyro, dillon2017tensorflow,van2018introduction,lew2019trace}. These approximate inference strategies can scale well in many cases, but they struggle to find valid sampling regions in the presence of low-probability observations and non-differentiability (e.g., 
observing the sum of two large random integers to be a constant)~\citep{gelman2015stan, bingham2019pyro, dillon2017tensorflow}. 
Exact inference strategies work by preserving the global structure of the distribution, but here there is a challenge: \emph{what is the right strategy for efficiently representing and manipulating distributions on integers}?
Today's PPLs that support exact inference and integer manipulation -- such as {Dice}~\citep{HoltzenOOPSLA20}, {ProbLog}~\citep{de2007problog}, Psi~\citep{gehr2016psi}, and WebPPL~\citep{dippl} -- model integer distributions using what is essentially a one-hot categorical encoding (i.e., an integer distribution $[\texttt{0} \mapsto 0.25, \texttt{1} \mapsto 0.25, \texttt{2} \mapsto 0.25, \texttt{3} \mapsto 0.25]$ is represented simply as a vector). This encoding style is not capable of exploiting the structure of addition: adding two random variables effectively requires full enumeration.

% Integers are ubiquitous in traditional programs, and correspondingly it is very natural for probabilistic programs to manipulate distributions over integers, for example to model rankings, text manipulations, and distributions over graphs.  
% Most PPLs have the means to express distributions over integers as well as arithmetic operations, but unfortunately programs with complex arithmetic often pose serious scalability challenges for today's approaches to inference.  Integer distributions are inherently {\em discrete}, so languages whose inference approaches rely on forms of differentiability typically resort to enumerating all possible discrete values~\citep{gelman2015stan, bingham2019pyro, dillon2017tensorflow}.  Further, even languages that support exact reasoning about discrete distributions typically model integer distributions as categoricals and integer operations as explicit transformations on such categoricals~\citep{dippl, HoltzenOOPSLA20}, which also reduces to value enumeration. As a result, existing PPLs do not scale well on programs that manipulate integers, thereby preventing their usage on many natural problems.

% A commonality among these prior approaches is that they treat integer operations as {\em black boxes}.  Our key insight is that integer arithmetic is
% itself inherently full of structure that can be exploited to scale probabilistic inference. In particular, the binary representation of integers and integer operations exposes this structure; by extending this binary encoding to manipulate {\em distributions} over integers, we naturally expose conditional independences that can be used to factorize inference. 

% This paper develops a new efficient strategy for representing distributions on integers. 
Our first contribution is a new representation of distributions on integers as distributions on \emph{binary encodings}.
For instance, in the above example, rather than representing the distribution as an exhaustive map from integer values to probabilities, we represent it as a joint distribution on binary bits
$
    [\texttt{00} \mapsto 0.25, \texttt{01} \mapsto 0.25, \texttt{10} \mapsto 0.25, \texttt{11} \mapsto 0.25].
$
% A core element of discrete probabilistic programs is the ability to represent a Bernoulli random variable, something which is often given the syntax \verb#flip(p)#. While integers are traditionally represented as sequences of bits, one can analogously replace these bits with logical expressions over \verb#flip#s to instead represent distributions over integers.  Arithmetic can then be implemented within the program using standard logic circuits as are used in computer hardware.  
The upsides of this seemingly-equivalent representation are twofold. First, we can more efficiently represent the joint distribution itself when it has certain structure. In this case, because the distribution is uniform, we can represent it as a product of two independent Bernoulli distributions, one for each bit: we will show that the ability to factorize the distribution in this manner leads to significant performance improvements. Second, this binary representation reveals the structure of arithmetic: for instance, we can compare two integers by \emph{independently} comparing each of their binary digits and aggregating the results. 

Clearly a binary representation of integers reveals structure, but how can we automatically find and exploit this structure during inference in a PPL?
As our second contribution, we show that two of today's PPLs -- Dice~\citep{HoltzenOOPSLA20} and ProbLog \citep{de2007problog,Fierens2015} -- are already capable of exploiting this structure if it is properly encoded into the program, by virtue of their \emph{knowledge compilation} approach to inference.
We give a lightweight strategy for encoding integer distributions, and show empirically that when using our new binary-encoded distributions these two languages scale to significantly larger and more complex integer distributions without essential modifications to their existing inference~strategies. 
% We show how to adapt existing exact inference methods for discrete probabilistic programs based on \emph{knowledge compilation}  naturally identify and exploit the structure exposed by this new probabilistic integer representation~\citep{HoltzenOOPSLA20, Raedt2007, chavira2006compiling, Chavira2008, Fierens2015}. The knowledge compilation approach reduces inference to weighted model counting (WMC) on weighted Boolean formulas (WBFs). Typically these formulas are represented using data structures such as binary decision diagrams (BDDs), which exploit conditional independences to represent distributions compactly and support linear-time WMC in the size of the~BDD.

% To evaluate our approach, we have implemented Dice~\citep{HoltzenOOPSLA20} and its semantics as a shallow embedding in Julia~\citep{Julia-2017} along with the support for binary encoding for integers. We refer to this implementation as \verb#Dice.jl# in the rest of the paper. We compare the performance of \verb#Dice.jl# to that of other PPLs that support exact discrete inference, on a range of probabilistic models over integers taken from the literature and derived from natural examples. Our approach is the fastest in the majority of benchmarks and in many cases provides orders-of-magnitude speedups. We also demonstrate the benefits that our binary representation provides for the existing knowledge compilation based languages Dice and ProbLog ~\citep{Fierens2015}.
% We also empirically show that the gains from our approach of binarising integers generalize to other systems using knowledge compilation. \sh{what is this referring to? Be concrete}

As our third contribution, we show that scalable support for random integer arithmetic allows us to push the boundaries of discrete probabilistic programming systems in surprising ways. We demonstrate how to model a Beta distribution, a continuous distribution, using probabilistic integers. This modeling method exploits the conjugacy property of the Beta distribution, through which we can always characterize the distribution through its (integral) sufficient statistics. By doing so, we can use the Beta distribution as a prior for Bayesian learning.

% We demonstrate how this approach allows us to use the Beta distribution as a prior distribution, and apply it to the problem of Bayesian learning of Bayesian network parameters given incomplete data. We also explore how this model belongs to a general class of urn models \citep{polya}, which can also be used to represent other common distributions such as the negative binomial and hypergeometric.

% A direct result of having a better approach to handling integer distributions is the ability to handle much larger distributions than before. One surprising use case is that of representing Beta distribution, a continuous distribution not supported by discrete languages. Beta distribution 

% This distribution is one of a class of distributions that can be reasoned about using its sufficient statistics, which can now be modelled using our representation. To showcase this, we provide an implementation of a Beta-Bernoulli process, which we test on the task of Bayesian network parameter learning with incomplete data. We also explore several other such distributions that can be represented in the same manner. \william{i still hate this section...}

The structure of this paper is as follows: Section~\ref{sec:motivation} gives a motivating example for integer arithmetic. Section~\ref{sec:math} explains our integer representation and explores how common integer operations on this representation have structure exploitable by knowledge compilation. Section~\ref{section:experiments} empirically evaluates our representation strategy against existing PPLs. Section~\ref{sec:betabern} explores the representation of a continuous Beta prior with random integers. Sections~\ref{sec:related} and~\ref{sec:conclusion} discuss related work and conclude, respectively.  

\section{Motivation}\label{sec:motivation}
We begin with a motivating example highlighting how integer distributions are used in probabilistic programs.
Consider the following probabilistic model based on student ID numbers. Suppose that an optical character recognition system is attempting to parse a handwritten student ID number. For each digit of the ID, it produces a probability distribution representing its beliefs about what the digit could be. Combining this output, we get a probability distribution over all possible student IDs. 

The Luhn algorithm~\citep{luhn} is a commonly used method of validating various ID numbers including student IDs. Given a starting ID such as 70733428, the algorithm provides for a way to compute a sum over the ID, giving us a check digit (4) which is then prepended to the original ID to get a final ID: 470733428. This ID is the one actually issued to a student; when provided with an ID, we can validate it by recomputing the sum and looking at the check digit. 

We wish to use the fact that the student ID can be validated to additionally inform our single-digit distributions from the OCR system. We can implement this as a probabilistic program like the one in Figure~\ref{example2}.  Figure~\ref{example1} implements a function \verb#luhn_checksum# that takes as input a list representing the digits of the student ID, excluding the check digit. It then follows the Luhn algorithm to compute a sum over the digits, which it returns. Figure~\ref{example2} then uses this function: we create a list \verb#id# which contains distributions over integers derived from the OCR system. The syntax \verb#discrete(v)# for a vector $v = [p_0, \dots, p_n]$ creates a distribution over the numbers $0, \dots, n$, in which the number $i$ has the probability $p_i$, and is used in our program to represent said OCR distributions. We call the \verb#luhn_checksum# function on these integer distributions to get a distribution over checksums and condition using the \verb#observe# keyword in line~8 of Figure~\ref{example2} to get an updated distribution over IDs. 
% Now whenever a system encounters an ID, it can check this property in the ID number


% We wish to combine these two sources of information to have a more reliable prediction of student ID number. Figure 1 shows the probabilistic program that captures this problem. Figure 1a)


% It is a commonly used method of validating various kinds of ID numbers including student IDs. 

% Given a starting ID, the algorithm provides for a way to compute a check digit, which is then appended to the initial number to form a final ID. One can then verify that the ID number by recomputing the check digit and comparing it to the corresponding entry of the ID, providing protection against misreads or typos. 

% We wish to leverage this fact that there is a Luhn-algorithm-computed checksum within the student ID to refine our distribution over possible student IDs. In order to do this, we can write a probabilistic program like the one in Figure \ref{example1}. Figure 1a implements a function \verb#luhn_checksum# that takes as input a list representing the digits of the student ID, excluding the check digit. It then does computation according to the Luhn algorithm to get some sort of sum over the digits, which is then returned. Figure 1b then uses this function: we create a list \verb#id# which contains not deterministic integers, but rather distributions over integers. The syntax \verb#discrete(v)# for a vector $v = [p_0, .., p_n]$ represents a distribution over the numbers $0, .., n$, in which the number $i$ has the probability $p_i$, and is used in our program to represent the distribution over possible digits given by the OCR system. By passing the (non-check) digits of ID to our function, we obtain \verb#check_val#, which is a distribution over potential sums. We then use the \verb#observe# keyword to condition the output of the program on the checksum condition, before returning the final distribution over IDs. \william{help..}

% Timing plot for the student-ID example: one curve per system, each read
% from a column of the \luhndata table and plotted against column 0.
\begin{figure}[t]
    \centering
    \begin{tikzpicture}
    \begin{axis}[
        height=4cm,
        width=6cm,
        grid=major,
        xmode=linear,
        % logarithmic time axis
        ymode=log,
        xtick={2,4,6,8,10},
        ytick={0.001, 0.01, 0.1, 1, 10, 100, 1000},
        yticklabel style = {font=\footnotesize},
        xlabel={number of digits},
        ylabel={time (s)},
        legend columns=1,
        % legend position is given in absolute (em) units
        legend style={at={(12em,4em)},anchor=north west},
        yminorticks=false,
    ]
    % column 1 of the table
    \addplot[mark=square*, thick, red, densely dashdotted, mark options={scale=1}] table [x index={0}, y index={1}] {\luhndata}; %
    \addlegendentry{WebPPL};
    % column 2 of the table
    \addplot[mark=triangle*, thick, magenta, densely dotted, mark options={scale=1.3}] table [x index={0}, y index={2}] {\luhndata}; %
    \addlegendentry{Psi (DP)};
    % column 3 of the table
    \addplot[mark=star, thick, blue, dashed, 
        mark options={scale=2}] table [x index={0}, y index={3}] {\luhndata}; %
    \addlegendentry{Psi};
    % column 4 of the table
    \addplot[mark=diamond*, thick, amethyst, 
        mark options={scale=1.5}] table [x index={0}, y index={4}] {\luhndata}; %
    \addlegendentry{{Dice.jl}};
    \end{axis}
  \end{tikzpicture}
    \caption{Single-marginal performance for ID example on increasing ID lengths. WebPPL and Psi scale exponentially due to having to enumerate all paths.}
    \label{luhn_scaling_plot}
\end{figure}

If we implement this program in today's probabilistic programming languages, we will run into a problem. Even if we only wish to compute the marginal probability over a single digit of the ID, Figure~\ref{luhn_scaling_plot} shows that the runtime will scale exponentially in the number of digits in the student ID. Each additional digit multiplies the total number of possible ID instantiations, meaning that any approach involving enumeration is inherently exponential. In practice, this means that programs containing student IDs of a realistic length (9--10 digits) will not run. 

The fact that such straightforward programs fail to scale on existing probabilistic programming systems is the primary motivation behind our work.  We have implemented our encoding of integer distributions in \verb#Dice.jl#, a discrete PPL embedded in Julia that uses the
same knowledge compilation approach as Dice~\citep{HoltzenOOPSLA20}. Figure \ref{luhn_scaling_plot} shows that our technique allows inference for such programs to scale for a larger, more realistic number of digits.

% \section{Introduction}\label{sec:intro}

% \begin{itemize}
% \item integer arithmetic is currently unsupported - DONE
% \item important in real life PPLs - DONE
% \item we import binary arithmetic in PPLs to deal with discrete integers - DONE
% \item give semantics of probabilistic add, instead of enumeration or path based + - DONE
% \item how far can we push it to support continuous
% \item can be done by exploiting conjugate priors
% \end{itemize}


% Probabilistic programming languages are domain specific languages that allow for the specification of and inference on probability distributions given observed data. One notable family of distributions are those over integers which occur in numerous real-life settings like that of ranking places based on reviews or decoding an encryption scheme \poorva{add citations}. Existing probabilistic programming systems can handle distribution over integers but as categorical distributions. This involves enumerating all the integers and their probabilities which poses the issue of scalability. We present a scalable solution to perform inference on probabilistic models with integers using binary arithmetic.

% Binary representation of numbers has been the building block for all of computer science. All the operations that can be done on integers have their binary analogous. In this paper, we describe how binary representation of numbers and integer arithmetic can be imported in probabilistic programming to have a better support for integers. First, we transform distribution over integers to distribution over numbers represented as binary which can further be represented as distribution over tuples of Boolean random variables. The binary representation enables two different aspects of probabilistic program over integers. First, it enables an efficient encoding for distributions over integers. Second, it enables representing operations over integers as Boolean circuits. Both these features naturally compose with existing discrete PPL Dice which is known to work well with programs with Boolean random variables.

% Exact inference algorithms are desirable due to being exact: we can be sure that our final probability distribution is accurate \poorva{ry combining last line with this one too motivate exact algorithms more}. However, current PPLs which implement exact inference do not scale well on large distributions of integers. For example, WebPPL computes an exact distribution over a discrete program by enumerating all paths of the program. Consider the probabilistic program shown in Figure \ref{exampleprog}, which probabilistically adds two uniform distributions over integers and checks if the result is less than a value. In order to compute the resulting distribution through enumeration, we must go over $2^{22}$ possible combinations of our two integers $a$ and $b$, and so it becomes clear that such an approach will fail to scale to larger distributions. Other approaches to integer implementation such as that of Psi must also work through all possible integer values, and so will similarly fail to scale \poorva{Do we now for sure?}. 

% % \begin{wrapfigure}{r}{0.6\textwidth}
% % \begin{lstlisting}[mathescape=true]
% % <@\color{teal}// uniformInt(a, b) is a uniform distribution on a, .., b-1@>
% % <@\color{teal}a = uniformInt(2500, 5000)@>
% % <@\color{teal}b = uniformInt(0, 2500)@>
% % <@\color{teal}c = if flip(0.7) then a+b else b@>
% % <@\color{teal}return c < 2500@>
% % \end{lstlisting}
% % \caption{A motivating example for better computation on random integers}
% % \label{exampleprog}
% % \end{wrapfigure}

% If we take a close look at this program, we see that computing the resultant probability is actually relatively straightforward: the minimum value that \verb#a+b# can take is 2500, while the maximum value that \verb#b# can take is 2499. Therefore, \verb#c < 2500# is entirely dependent on the result of the flip: it will be false if the flip is true, and true if the flip is false, and so it seems from this simple analysis that actually enumerating all possible integer values is unnecessary! The failure of enumeration based approaches is that they essentially treat random integers as categorical distributions which we can add, multiply, and so forth. In doing so, they fail to utilize the inherent structure of numbers and arithmetic, such as the transitivity that allowed us to make the argument above. 






% % \begin{wrapfigure}{l}{0.4\textwidth}

% % \begin{subfigure}[b]{0.4\textwidth}
% % \begin{lstlisting}[mathescape=true]
% % <@\color{teal}a = uniformInt(0, $2^n$)@>
% % <@\color{teal}b = uniformInt(0, $2^n$)@>
% % <@\color{teal}return a<b@>
% % \end{lstlisting}
% % \caption{Simple program}
% % \label{lt_prog}
% % \end{subfigure}

% % \begin{subfigure}[b]{0.4\textwidth}
% % \centering
% % \input{bdd.tex}
% % \caption{Compiled BDD for the above program when n=3}
% % \label{lt_bdd}
% % \end{subfigure}

% % \caption{Program and compiled BDD}
% % \label{lt}


% % \end{wrapfigure}

% Binary numbers suggest a natural approach for arithmetic: using the existing circuits that are used for arithmetic computation in hardware \poorva{Instead use the term binary arithmetic}. In order to do \poorva{perform} such computation, the binary numbers must contain the same number of bits; in order to enforce this requirement, we parameterize our random integers with their bit-width, and only allow for arithmetic on numbers of matching bit-width. To address issues of overflow, we can simply upcast the numbers by padding bits to the front of the representation \poorva{This is too much detail}. By doing this, we implement operations such as addition multiplication and their inverses, as well as comparisons such as equality and less-than. 

% To see how this all comes together, consider the very simple arithmetic program in Figure \ref{lt_prog}. Were we to naively compute this probability through enumeration, we would have to enumerate $2^{2n}$ values. However, our efficient uniform representation combined with BDD compilation means that the compiled BDD for the program is very small: as can be seen in Figure \ref{lt_bdd}, it has a simple repeated structure and has size linear in the number of bits $n$. In this case, our representation uses the structure of the uniform and less-than comparison to achieve massive speedup. While in the worst case we still essentially resort to enumeration, in many cases like this we can successfully exploit structure for much faster arithmetic computation \poorva{nice}. 

% \section{Motivation}
% Give a working example - Luhn's algorithm and describe how existing approaches do not support it (enumeration or approximation)
% Mini Experiment

\section{Representing \& Manipulating Integer Distributions}\label{sec:math}

This section describes the key technical details behind a binary encoding approach and explains how such an encoding allows the knowledge compilation inference strategy used by Dice and ProbLog to automatically exploit arithmetic structure. We first provide a brief introduction to inference via knowledge compilation. We then demonstrate and analyze various approaches to constructing distributions over integers within probabilistic programs. Finally, we show how the binary encoding can be leveraged by knowledge compilation to identify and exploit conditional independencies for inference over distributions with integer arithmetic.

% Our work is built upon the knowledge compilation approach to discrete probabilistic inference~\citep{HoltzenOOPSLA20, Raedt2007, chavira2006compiling, Chavira2008, Fierens2015}. This approach reduces inference on a discrete probabilistic program to weighted model counting (WMC) --- a task commonly done on binary decision diagrams (BDDs), which support WMC in linear time in the BDD size.  Therefore, the ability to scale inference is directly related to the ability to compile to a compact BDD. The BDD construction process identifies and exploits a ubiquitous property of distributions --- conditional independencies --- to reduce BDD size. In this section, we show that such independencies are widespread in integer distributions and thus can be exploited for scalable inference. We propose the use of a standard binary encoding of integers as a method that exposes such independencies to the knowledge compilation system. 

% We first provide a brief introduction to knowledge compilation and demonstrate how distributions over binary-encoded integers are converted to BDDs. We then present two methods of compactly converting distributions over integers to a distribution over binary bits: one for arbitrary distributions, and one for the special case of the uniform distribution. Finally, we show how the binary encoding can be leveraged to identify and exploit conditional independencies for inference over distributions that employ integer arithmetic. 

% This section resolves two technical hurdles for exploiting the structure of integer operation in today's PPLs. First, we give a generic procedure for representing arbitrary binary-encoded probability distributions as a collection of discrete Bernoulli random variables. This representation enables a lightweight strategy for adding our new encoding to existing PPLs with support for Bernoulli variables. Next, we show how to efficiently implement integer operations such as comparison and addition, and finally we show how Dice's and ProbLog's inference algorithms are already capable of exploiting these new representations in order to scale.

% Our work is built upon the knowledge compilation approach to discrete probabilistic inference~\citep{HoltzenOOPSLA20, Raedt2007, chavira2006compiling, Chavira2008, Fierens2015}. This approach reduces inference on a discrete probabilistic program to weighted model counting (WMC) --- a task commonly done on binary decision diagrams (BDDs), which support WMC in linear time in the BDD size.  Therefore, the ability to scale inference is directly related to the ability to compile to a compact BDD. The BDD construction process identifies and exploits a ubiquitous property of distributions --- conditional independencies --- to reduce BDD size. In this section, we show that such independencies are widespread in integer distributions and thus can be exploited for scalable inference. We propose the use of a standard binary encoding of integers as a method that exposes such independencies to the knowledge compilation system. 

% This section describes our key technical contributions, which exploit the inductive binary structure of integers to scale probabilistic inference.
% We first provide a brief introduction to knowledge compilation and demonstrate how distributions over binary-encoded integers are converted to BDDs. We then present two methods of compactly converting distributions over integers to a distribution over binary bits: one for arbitrary distributions, and one for the special case of the uniform distribution. Finally, we show how the binary encoding can be leveraged to identify and exploit conditional independencies for inference over distributions that employ integer arithmetic. 


% After some preliminaries, we provide a binary encoding of distributions on integers and highlight the resulting compact representation. We then show how the binary encoding can be leveraged to identify and exploit conditional independencies for inference over distributions that employ integer arithmetic. 

\subsection{Integer Distributions via BDDs}


\begin{figure}
\centering
\begin{subfigure}[b]{\linewidth}
     \centering
    \begin{tikzpicture}
      \def\lvl{20pt}

    
    
    \node (f1) at (0, 0) [bddnode] {$f_1$};
    \node (f2) at ($(f1) + (-7bp, -\lvl)$) [bddnode] {$f_2$};
    \node (f3) at ($(f2) + (-7bp, -\lvl)$) [bddnode] {$f_3$};
    \node (f4) at ($(f3) + (-0bp, -\lvl)$) [bddnode] {$f_4$};

    \node[draw, rounded corners] at ($(f4) + (-30bp, 60bp)$)  {.1};
    \node[draw, rounded corners] at ($(f4) + (-30bp, 40bp)$)  {.11};
    \node[draw, rounded corners] at ($(f4) + (-30bp, 20bp)$)  {.25};
    \node[draw, rounded corners] at ($(f4) + (-30bp, 0bp)$)  {.5};

    
     

    % \node (f5) at ($(f3) + (0bp, -\lvl)$) [bddnode] {$f_5$};
    % \node[draw, rounded corners] at ($(f5) + (21bp, 0bp)$)  {.5};
    
    \node (false) at ($(f4) + (30bp, -1*\lvl)$) [bddterminal] {$\false$};
    % \node (true2) at ($(true) + (60bp, 0bp)$) [bddterminal] {$\true$};
    % \node (true3) at ($(true2) + (40bp, 0bp)$) [bddterminal] {$\true$};
    % \node[draw, rounded corners] at ($(true) + (-21bp, 0bp)$)  {1};

    \node (true) at ($(f4) + (90bp, -1*\lvl)$) [bddterminal] {$\true$};
    % \node (false2) at ($(false) + (60bp, 0bp)$) [bddterminal] {$\false$};
    % \node (false3) at ($(false2) + (60bp, 0bp)$) [bddterminal] {$\false$};
    % \node[draw, rounded corners] at ($(false) + (21bp, 0bp)$)  {0};

    \node (f6) at ($(f1) + (20bp, 0bp)$) [bddnode] {$f_1$};
    \node (f7) at ($(f6) + (10bp, -\lvl)$) [bddnode] {$f_2$};
    \node (f8) at ($(f7) + (20bp, -\lvl)$) [bddnode] {$f_3$};
    \node (f9) at ($(f8) + (45bp, -\lvl)$) [bddnode] {$f_4$};

    \node (f10) at ($(f6) + (35bp, 0bp)$) [bddnode] {$f_{1}$};
    \node (f11) at ($(f10) + (40bp, -\lvl)$) [bddnode] {$f_{2}$};
    \node (f12) at ($(f11) + (-20bp, -\lvl)$) [bddnode] {$f_{3}$};
    % \node (f13) at ($(f12) + (0bp, -\lvl)$) [bddnode] {$f_{4}$};

    \node (b2) at ($(f1) + (0bp, 20bp)$) [bddroot] {$b_2$};
    \node (b1) at ($(f6) + (0bp, 20bp)$) [bddroot] {$b_1$};
    \node (b0) at ($(f10) + (0bp, 20bp)$) [bddroot] {$b_0$};

    
    \begin{scope}[on background layer]
      \draw [highedge] (f1) -- (false);
      \draw [lowedge] (f1) -- (f2);
      \draw [highedge] (f2) -- (false);
      \draw [lowedge] (f2) -- (f3);
      \draw [highedge] (f3) -- (false);
      \draw [lowedge] (f3) -- (f4);

      \draw [highedge] (f4) -- (false);
      % \draw [highedge] (f5) -- (true);
      \draw [lowedge] (f4) -- (true);
      \draw [highedge] (f6) -- (false);
      \draw [lowedge] (f6) -- (f7);
      \draw [highedge] (f7) -- (false);
      \draw [lowedge] (f7) -- (f8);
      \draw [highedge] (f8) -- (true);
      \draw [lowedge] (f8) -- (f9);
      \draw [highedge] (f9) -- (true);
      \draw [lowedge] (f9) -- (false);

      \draw [highedge] (f10) -- (false);
      \draw [lowedge] (f10) -- (f11);
      \draw [highedge] (f11) -- (true);
      \draw [lowedge] (f11) -- (f12);
      \draw [highedge] (f12) -- (false);
      \draw [lowedge] (f12) -- (f9);

      \draw[-stealth] ($(f1) + (0bp, 20bp)$) -- (f1);
      \draw[-stealth] ($(f6) + (0bp, 20bp)$) -- (f6);
      \draw[-stealth] ($(f10) + (0bp, 20bp)$) -- (f10);
      % \draw [highedge] (f13) -- (true3);
      % \draw [lowedge] (f13) -- (f9);
      
      % \draw [lowedge] (f5) -- (false);

       
    \end{scope}
    \end{tikzpicture}
    \caption{
      Multi-rooted BDD for a CATEG\_INT encoded integer.
    }
    \label{fig:sbk_bdd}
  \end{subfigure}
\vfill

\begin{subfigure}[b]{\linewidth}
     \centering
    \begin{tikzpicture}
      \def\lvl{20pt}

    
    \node (f1) at (0, 0) [bddnode] {$f_1$};
    \node (f2) at ($(f1) + (40bp, 0bp)$) [bddnode] {$f_1$};
    \node (f3) at ($(f2) + (-15bp, -20bp)$) [bddnode] {$f_2$};
    \node (f4) at ($(f2) + (20bp, 0bp)$) [bddnode] {$f_1$};
    \node (f5) at ($(f4) + (20bp, -20bp)$) [bddnode] {$f_2$};
    \node (f6) at ($(f5) + (-5bp, -40bp)$) [bddnode] {$f_4$};
    \node (f7) at ($(f5) + (-15bp, -20bp)$) [bddnode] {$f_3$};

    \node (false) at ($(f7) + (-20bp, -40bp)$) [bddterminal] {$\false$};
    \node (true) at ($(false) + (-45bp, 00bp)$) [bddterminal] {$\true$};

    \node (b1) at ($(f1) + (0bp, 20bp)$) [bddroot] {$b_2$};
    \node (b2) at ($(f2) + (0bp, 20bp)$) [bddroot] {$b_1$};
    \node (b3) at ($(f4) + (0bp, 20bp)$) [bddroot] {$b_0$};

    \node[draw, rounded corners] at ($(f1) + (-30bp, 0bp)$)  {.3};
    \node[draw, rounded corners] at ($(f1) + (-30bp, -20bp)$)  {.714};
    \node[draw, rounded corners] at ($(f1) + (-30bp, -40bp)$)  {.5};
    \node[draw, rounded corners] at ($(f1) + (-30bp, -60bp)$)  {.6};

    \begin{scope}[on background layer]
      \draw [highedge] (f1) -- (true);
      \draw [lowedge] (f1) -- (false);
      \draw [highedge] (f2) -- (false);
      \draw [lowedge] (f2) -- (f3);
      \draw [highedge] (f3) -- (true);
      \draw [lowedge] (f3) -- (false);

      \draw [highedge] (f4) -- (false);
      \draw [highedge] (f5) -- (f6);
      \draw [lowedge] (f4) -- (f5);
      \draw [lowedge] (f5) -- (f7);
      \draw [highedge] (f6) -- (true);
      \draw [lowedge] (f6) -- (false);
      \draw [highedge] (f7) -- (true);
      \draw [lowedge] (f7) -- (false);

      \draw[-stealth] (b1) -- (f1);
      \draw[-stealth] (b2) -- (f2);
      \draw[-stealth] (b3) -- (f4);

       
    \end{scope}

    
    % \node[draw, rounded corners] at ($(f1) + (-40bp, 0bp)$)  {.1};

    % \node (f2) at ($(f1) + (20bp, -\lvl)$) [bddnode] {$f_2$};
    % \node[draw, rounded corners] at ($(f2) + (-60bp, 0bp)$)  {.11};

    % \node (f3) at ($(f2) + (20bp, -\lvl)$) [bddnode] {$f_3$};
    % \node[draw, rounded corners] at ($(f3) + (-80bp, 0bp)$)  {.25};

    % \node (f4) at ($(f3) + (20bp, -\lvl)$) [bddnode] {$f_4$};
    % \node[draw, rounded corners] at ($(f4) + (-100bp, 0bp)$)  {.5};

    % % \node (f5) at ($(f3) + (0bp, -\lvl)$) [bddnode] {$f_5$};
    % % \node[draw, rounded corners] at ($(f5) + (21bp, 0bp)$)  {.5};
    
    % \node (true) at ($(f4) + (0bp, -\lvl)$) [bddterminal] {$\true$};
    % \node (true2) at ($(true) + (60bp, 0bp)$) [bddterminal] {$\true$};
    % \node (true3) at ($(true2) + (40bp, 0bp)$) [bddterminal] {$\true$};
    % % \node[draw, rounded corners] at ($(true) + (-21bp, 0bp)$)  {1};

    % \node (false) at ($(f4) + (-20bp, -\lvl)$) [bddterminal] {$\false$};
    % \node (false2) at ($(false) + (60bp, 0bp)$) [bddterminal] {$\false$};
    % \node (false3) at ($(false2) + (60bp, 0bp)$) [bddterminal] {$\false$};
    % % \node[draw, rounded corners] at ($(false) + (21bp, 0bp)$)  {0};

    % \node (f6) at ($(f4) + (0bp, 3*\lvl)$) [bddnode] {$f_1$};
    % \node (f7) at ($(f6) + (20bp, -\lvl)$) [bddnode] {$f_2$};
    % \node (f8) at ($(f7) + (20bp, -\lvl)$) [bddnode] {$f_3$};
    % \node (f9) at ($(f8) + (30bp, -\lvl)$) [bddnode] {$f_4$};

    % \node (f10) at ($(f6) + (70bp, 0bp)$) [bddnode] {$f_{1}$};
    % \node (f11) at ($(f10) + (20bp, -\lvl)$) [bddnode] {$f_{2}$};
    % \node (f12) at ($(f11) + (20bp, -\lvl)$) [bddnode] {$f_{3}$};
    % % \node (f13) at ($(f12) + (0bp, -\lvl)$) [bddnode] {$f_{4}$};

    
    
    \end{tikzpicture}
    \caption{
      Multi-rooted BDD for a BITWISE\_INT encoded integer.
    }
    \label{fig:bwh_bdd}
  \end{subfigure}
\caption{BDDs representing the integer distribution $[\texttt{0} \mapsto 0.1, \texttt{1} \mapsto 0.1, \texttt{2} \mapsto 0.2, \texttt{3} \mapsto 0.3, \texttt{4} \mapsto 0.3]$ resulting from the CATEG\_INT and BITWISE\_INT encoding methods. BITWISE\_INT achieves a smaller BDD by compactly representing higher-order bits.
% \william{also maybe one-hot bdd} \william{uniform bdd?} \sh{We need to render these BDDs better. We can label internal nodes in a way that is less confusing. Also, include PDFs not PNGs (you can have graphviz output PDFs with the right arugment)}  \todd{Also the root notes - the names l, rl, etc. will not be understood.}  
}\label{bddexample}
\end{figure}

Thus far we have seen how binary-represented distributions expose structure and can enable effective scaling in practice. In this section we explain exactly how this performance improvement is achieved during inference. 
In particular, we show how \emph{knowledge compilation} is capable of automatically finding and exploiting the structure of integer distributions and operations. Inference via knowledge compilation is currently the state-of-the-art approach to exact discrete probabilistic inference in certain classes of probabilistic programs~\citep{HoltzenOOPSLA20, Raedt2007, chavira2006compiling, Chavira2008, Fierens2015}. 
The heart of inference via knowledge compilation is a reduction from inference to \emph{weighted model counting} (WMC). Let $\varphi$ be a Boolean formula and $w$ be a map from literals in $\varphi$ to real-valued weights; the pair $(\varphi, w)$ is called a \emph{weighted Boolean formula}. Then, the \emph{weighted model count} $\mathtt{WMC}(\varphi, w)$ is a weighted sum of models of $\varphi$:

\begin{align}
    \mathtt{WMC}(\varphi, w) = \sum_{m \models \varphi} \prod_{\ell \in m} w(\ell).
\end{align}
This reduction to WMC is not useful on its own, however: the WMC task is \#P-hard for an arbitrary Boolean formula $\varphi$. This is where knowledge compilation comes into the picture: $\varphi$ is compiled into a data structure that supports weighted model counting in time polynomial in the size of the data structure. A common example of such a knowledge-compilation data structure is a binary decision diagram (BDD), which supports linear-time WMC, but there are many others~\citep{Darwiche_2002}.
PPLs like Dice and ProbLog work by compiling a program into a BDD or related compilation target and thereby reducing probabilistic program inference to WMC on that target~\citep{chavira2005compiling,sang2005performing}. 

The cost of the knowledge compilation approach to inference is almost entirely determined by the structure of the program; the more structure that exists, the more compact the resulting BDD or related data structure can be.
This leads to our core contribution: a new logical representation of integer distributions that 
is amenable to efficient compilation into BDDs. 
To demonstrate how BDDs can encode integer distributions,
Figure~\ref{bddexample} shows two different multi-rooted BDDs that represent \emph{the same distribution} on 
integers.
In both cases the roots of each BDD represent random variables for the binary digits: $b_0$ is the 0-order (least significant) digit,
$b_1$ is the 1-order digit, and $b_2$ is the 2-order (most significant) digit.
The positive weight of each literal is shown on the left (with the negative weight being 1 minus
the positive weight); dotted edges represent a false assignment and solid edges represent a true assignment.
Intuitively, a binary representation has the potential to be more compact than a naive categorical
representation due to the reduction in the number of roots: for instance, Dice~\citep{HoltzenOOPSLA20} 
requires one root for each possible integer value.

As an example of how to use these data structures, consider computing the marginal probability of the high-order bit $b_2$ being true.  This is 
$\mathtt{WMC}(b_2, w)$, which is $0.3$---this is clear in Figure~\ref{fig:bwh_bdd}, since the sole path from $b_2$ to the true node has weight $0.3$, but it also holds in Figure~\ref{fig:sbk_bdd}, where the sole path from $b_2$ to the true node has weight $0.9 \times 0.89 \times 0.75 \times 0.5 \approx 0.3$.
In general, to compute the probability of an arbitrary integer, we convert it into binary and 
conjoin the appropriate roots: for instance, to compute the probability of the integer \texttt{0}, 
we compute $\mathtt{WMC}(\overline{b_0} \land \overline{b_1} \land \overline{b_2}, w)$.


% This approach reduces inference on a discrete probabilistic program to weighted model counting (WMC) --- a task commonly done on binary decision diagrams (BDDs), which support WMC in linear time in the BDD size ~\cite{Darwiche_2002}.  Therefore, the ability to scale inference is directly related to the ability to compile to a compact BDD. The BDD construction process identifies and exploits conditional independencies to reduce BDD size. 

% Consider a binary encoding of integers, where we have a series of bits each corresponding to a power of two. It is clear that we can extend this to a probabilistic space by replacing each (deterministic) bit instead with a distribution; the resulting joint distribution over all bits will correspond to a distribution over integers. It is this observation that our integer representation approach uses. 

% From this perspective, inference on a distribution over binary-encoded integers is equivalent to inference on a distribution over a tuple of bits. 
% \todd{Technically you mean inference over a distribution on tuples of bits.} 
% For a knowledge compilation system using BDDs as a compilation target such as Dice ~\citep{HoltzenOOPSLA20}, this manifests as compilation to and inference over a multi-rooted BDD, where the roots of the BDD correspond to the bits of the integer. The structure of the BDD will depend on encoding-specific details: specifically, how the distribution over bits is constructed. Some example multi-rooted BDDs, corresponding to different encoding methods, are given in Figure \ref{bddexample}; the encoding methods they correspond to will be explored in future sections. 

% In the Figure ~\ref{bddexample} BDDs, the roots correspond to the three bits of the number being represented, here notated $b_2$, $b_1$, and $b_0$ from most to least significant. Each is distributed according to various biased coin flips; these flips are represented here by the intermediate decision nodes labeled $f_i$. Looking at the bit $b_0$, the possible paths through the decision nodes to the terminal nodes $0$ and $1$ encode the manner in which $b_0$'s value depends on these random flips. The other bits $b_2$ and $b_1$ have similar decision paths depending on the same random flips. Decision nodes are reused when possible, with this reuse corresponding to repeated structure in the decision functions for each bit. 

% In Dice, the structure of the BDD is derived from the program, with the decision nodes being the random flip variables. Given this data structure, WMC for inference is simply recursively computing the probability of reaching the terminal node $1$ for each intermediate node given the biases. As noted before, this can be done in time linear in the size of the BDD~\citep{Darwiche_2002}, and so a more compact BDD means faster inference --- something we will use to justify encoding performance in the future sections. 


\subsection{Integer Encodings}

The previous section demonstrated the potential for binary encodings in knowledge compilation, but how do we connect this to probabilistic programs? In this section we give lightweight encoding strategies for translating 
\verb#discrete(..)# syntax for an arbitrary distribution over integers into distributions on Booleans, which knowledge-compilation-based languages like Dice and ProbLog are already capable of representing. How might such distributions be represented in a probabilistic programming language in practice? To make the problem concrete, we define the integer representation problem as follows: given an input vector $[p_0, \ldots, p_{w-1}]$, we want a method that returns a distribution over integers taking on value $i$ with probability $p_i$. While this formulation is relatively restrictive in demanding that the distribution be contiguous with lowest value 0, we can convert it to other distributions, for example by adding an offset or multiplying by a constant. 

% \todd{I don't like the next several paragraphs because the reader has no idea where we're going.  For example, there's no indication until many paragraphs in to the narrative that this first approach is undesirable and not what we do.  Also there's a weird conflation of how to represent an integer distribution at the source level with how to encode it -- the if expressions are a source-level thing, but then you're using that to express an encoding algorithm.  This is confusing.  Why do we even need the source-level thing?  Instead, just assume we have the discrete syntax for defining arbitrary integer distributions, which was already introduced earlier, and directly talk about the SBK encoding.  You could still show the example of a sequence of ifs but it should be made clear that you're just giving intuition on SBK.}
% Consider the problem of representing an arbitrary distribution over integers in a probabilistic program; as integers are commonly used in programs, providing an easy syntax to represent distributions over integers provides a great ease of life for a user. To make the problem more concrete, we define the integer representation problem as follows: given an input vector $[p_0, .., p_l]$, provide a method for returning a single random integers that takes on the value $i$ with probability $p_i$. While this is relatively restricted by demanding that our distribution is contiguous with lowest value 0, we can convert this to other distributions (for example) by adding an offset or multiplying by a constant value. 
% \subsubsection{A Basic Approach}

\subsubsection{A First Approach}
One natural way of constructing such a categorical distribution is as a sequence of if-else statements, with each branch corresponding to a different value. For example, the following probabilistic program snippet would correspond to the integer distribution with probability vector $[0.1, 0.2, 0.3, 0.4]$. The syntax $\flip(\theta)$ used in the program is commonly used in discrete PPLs to represent a Bernoulli random variable with bias $\theta$. 

\begin{lstlisting}[mathescape=true]
if flip(0.1) // Bernoulli(0.1)
    return 0
elseif flip(0.2/0.9)
    return 1
elseif flip(0.3/0.7)
    return 2
else
    return 3 
\end{lstlisting}

We use a sequence of these random flips as arguments to the if-else statements to generate the mixture of numbers; note that we renormalize the flip probability at each step to get the correct distribution. This approach is generalized in Algorithm \ref{alg:categ_int}. This and future algorithms should be interpreted as a general method to represent a distribution over integers in any probabilistic programming language supporting Bernoulli random variables and (non-probabilistic) integers. 
% , which can be viewed as a function in a probabilistic program that takes in as input a probability vector and returns the corresponding distribution over integers. 
Note that representing a categorical variable in this way is a probabilistic program framing of the SBK encoding presented by \citet{10.5555/1619332.1619409}.

\begin{algorithm}
    \caption{CATEG\_INT ($v \in [0, 1]^{w}$)}
    \label{alg:categ_int}
    \DontPrintSemicolon
    \KwIn{Vector $v$ such that $v[i] \propto $ pr($i$)}
    \KwOut{Distribution over the integers $0,\ldots,w-1$ matching $v$}
    \color{black}
    \eIf{$w$ == $1$ {\normalfont \textbf{or}} $\flip\left(\frac{v[0]}{\sum v}\right)$}
        {\Return $0$}
        {//\ Recurse on the remainder of $v$\\
        \Return 1 + CATEG\_INT($v$[1:]) 
        % \guy{instead of passing l around can we return 1 + SBK?}
        } 
        
\end{algorithm}

What would occur if we use Algorithm \ref{alg:categ_int} to represent a distribution over binary-encoded integers in a language such as Dice? The BDD for one distribution represented using this approach is given in Figure \ref{fig:sbk_bdd}. Note that for each bit, the decision diagram is essentially a linear chain; intuitively, this corresponds to checking each if-else guard in sequence. In addition, notice that there is almost no node reuse occurring in this BDD; each root has its own linear chain.

% \todd{It would help to break up the text and make the narrative clearer if you have subsections.  This would start a new subsection, and you'd have an earlier one on the SBK algorithm.}

\subsubsection{A More Compact Encoding}

We propose an alternative method of representing integers from a probability vector that produces provably more compact BDDs. Rather than constructing our mixture by a linear pass through the probability vector, we can instead divide the vector into two parts, using a divide-and-conquer approach. 
% \todd{The next few sentences are too specific -- you are describing the algorithm in English, which is difficult to really understand.  And all the "said" stuff is very distracting.  I would just give the high level idea.}  By splitting at the largest power of two contained in the distribution, we can differentiate between those terms which contain said power of two, and those which do not. We can then conditionally add said power of two, and recursively construct the distributions corresponding to the subvectors. 
Consider the same example as above, where we are again given as input a probability vector $[0.1, 0.2, 0.3, 0.4]$. To get the desired distribution, we can conditionally add the value $2$ with probability $\frac{0.3+0.4}{0.1+0.2+0.3+0.4}$, corresponding to the latter half of the vector. Depending on whether $2$ is added, we then conditionally add the value $1$, with probability derived from the subvector $[0.3, 0.4]$ if $2$ was added and from the subvector $[0.1, 0.2]$ otherwise. An example program implementing this is given below: 

\begin{lstlisting}[mathescape=true]
num = 0
if flip(0.7) // 0.3 + 0.4
    num += 2
    if flip(0.4/0.7)
        num += 1
else
    if flip(0.2/0.3)
        num +=1 
return num 
\end{lstlisting}

% We conditionally   
This approach is formalized in Algorithm~\ref{alg:categbits}. For the sake of simplicity, we assume the input vector is always of length $2^b$ for some number $b$; this means that we always divide the vector into its two halves. For an arbitrary input vector, we can simply pad it with zero-probability entries to fulfill this condition; in practice, this algorithm can easily be adapted to work without this explicit padding. 
% be implemented without the assumption.  

% \todd{Does "without the assumption" mean that it's correct as is even for non-powers of 2, or that there is a way to update it to handle that case?} 

\begin{algorithm}
    \caption{BITWISE\_INT ($v \in [0, 1]^{2^b}$)}
    \label{alg:categbits}
    \DontPrintSemicolon
    \KwIn{Vector $v$ such that $v[i] \propto $ pr($i$)}
    \KwOut{Distribution over the integers $\{0, \ldots, 2^b - 1\}$ matching $v$}
        \color{black}
        $p \gets \frac{\sum_{i = 2^{b-1}}^{2^b - 1} v[i]}{\sum_{i = 0}^{2^b - 1} v[i]}$\;
        % $f \gets \flip(p)$\; 
        % \eIf{b == 1}{pvec \gets []}{
        \eIf{$\mathit{length}(v)==1$}{\Return $0$}{\eIf{$\flip(p)$}{
        % // Recurse on the second half of $v$ with offset
        //\ Recurse on second half $\geq 2^{b-1}$
        \\\Return BITWISE\_INT($v[2^{b-1}:2^b]$) + $2^{b-1}$ }
        {
        % // Recurse on the first half of $v$
        //\  Recurse on first half $< 2^{b-1}$
        \;\Return BITWISE\_INT($v[0:2^{b-1}] $)}}
    % }
\end{algorithm}

% \begin{algorithm}
%     \caption{BITWISE\_INT ($v \in [0, 1]^{2^b}$, $lo (default 0)$, $hi (default (2^b-1)$)}
%     \label{alg:categbits}
%     \DontPrintSemicolon
%     \KwIn{Vector $v$ such that $v[i] \propto $ pr($i$)}
%     \KwOut{Distribution over the integers $\{0, \ldots, 2^b - 1\}$ matching $p$ }
%         \color{black}
%         $mid \gets ceil(\frac{hi-lo}2)$\;
%         $p \gets \frac{\sum_{i = ceil(\frac{hi-lo}2)}}{\sum_{i = lo}^{hi} v[i]}$\;
%         % $f \gets \flip(p)$\; 
%         % \eIf{lo == hi}{pvec \gets []}{
%         \eIf{$\mathit{lo==hi}$}{\Return $lo$}{\eIf{$\flip(p)$}{
%         % // Recurse on the second half of $v$ with offset
%         //\ Return value $\geq 2^{b-1}$
%         \\\Return BITWISE\_INT($v$, $mid$, $hi$)}
%         {
%         % // Recurse on the first half of $v$
%         //\  Return value $< 2^{b-1}$
%         \;\Return BITWISE\_INT($v$, $lo$, $mid - 1$)}}
%     % }
% \end{algorithm}

% Note that when we conditionally add the largest power of two, we are essentially setting the value of the most significant binary bit needed to represent the numbers. \todd{I think these two sentences are confusing.  I would just say directly that when implementing this algorithm for a representation of integer distributions as distributions on tuples of bits, we can simply do blah...}  In practice, this algorithm is implemented in this manner by conditionally setting the values of bits corresponding to the tuples. 


Note that while Algorithm~\ref{alg:categbits} uses arithmetic to produce the distribution, it only ever adds or returns powers of two, which directly correspond to the bits of the integer. Therefore, when implementing the algorithm as a distribution on a tuple of bits, we encode each such addition by simply setting the appropriate bit. For the same example as above, our implementation constructs a tuple of bits $(b_1, b_0)$ corresponding to a binary number such that $b_1 = \flip(0.7)$ and $b_0 =$ if $b_1$ then $\flip(\frac{0.4}{0.7})$ else $\flip(\frac{0.2}{0.3})$. 

How does this method of representing integer distributions differ from the one given before? To see this, we look at the BDD for a distribution written in this manner given in Figure~\ref{fig:bwh_bdd}. We can see a clear difference between this BDD and that for the approach given in Algorithm~\ref{alg:categ_int}. The most significant bit corresponds to a BDD depending on only one flip, as this corresponds to the largest power of two: only one flip is used to determine its value. For the less significant bits, we add an additional layer of variables for each one, with the number of layers in total corresponding to the number of bits needed to represent the input distribution. This is in contrast to the CATEG\_INT encoding, which requires checking a linear chain of variables for each bit; BITWISE\_INT therefore achieves a much more compact BDD representation. 

We formalize this difference in BDD size in Proposition \ref{th:bdd_bound}. Note that variable order can greatly influence the size of a BDD, and finding the optimal variable order is an NP-hard problem~\citep{meinel1998algorithms}; 
% \guy{I don't think variable orders are relevant at this point. introduce them later when it matters, which is really only when specifying the size bounds.}
 we follow the Dice convention of ordering logical variables using (strict, left-to-right) evaluation order. For example, in Figure~\ref{example2}, the Boolean variables encoding the \texttt{discrete} distribution on Line 1 occur before the variables in the distribution on Line 2 in the order.
% we will follow the Dice convention of taking program order for the variables of a BDD \sh{This is not clear; what does it mean to ``take program order''? We need to be very precise here. We should say something like ``We follow the Dice convention of ordering logical variables using lexical order: for example, in Figure~\ref{example1}, the Boolean variables encoding the \texttt{discrete} distribution on Line 1 occur before the variables in the distribution on Line 2 in the order.}. \todd{What about the issue of the variable order?}

\begin{proposition} \label{th:bdd_bound}
A discrete distribution over the integers $\{0, 1, \ldots, 2^b - 1\}$ compiles to a BDD of size $\Theta(b2^b)$ when represented using CATEG\_INT (Algorithm~\ref{alg:categ_int}) and a BDD of size $\Theta(2^b)$ when represented using BITWISE\_INT (Algorithm~\ref{alg:categbits}), with variables in flip evaluation order.
\end{proposition}

It is BITWISE\_INT that we have implemented in \texttt{Dice.jl} and experimentally evaluate in the next section.

% We have implemented the  algorithm in \verb#Dice.jl# and it is the encoding that we experimentally evaluate in the next section.

% To provide intuition for our encoding of integer distributions,
% consider the probability distribution $\pi$ over the integers $\{0, 1, 2, 3\}$ such that $\pi(0)=0.1, \pi(1)=0.2, \pi(2) = 0.3$ and $\pi(3) = 0.4$. These probabilities can also be represented as a probability vector $[\pi(0), \pi(1), \pi(2), \pi(3)]$. We wish to represent this distribution using a tuple of two Boolean random variables $(b_1, b_0)$ such that $\Pr(b_1 = 1, b_0 = 1)$ represents the probability of (1, 1) (the binary representation of 3), namely $0.4$, and similarly for the other integers. That can be achieved by defining $b_1 = \flip(\frac{7}{10})$ and $b_0 =$ if $b_1$ then $\flip(\frac{4}{7})$ else $\flip(\frac{2}{3})$.  For example, $\Pr(b_1 = 1, b_0 = 1)$ is $(\frac{7}{10})(\frac{4}{7}) = 0.4$.


% Using the above example, we would like to draw attention to two main insights. First, the most significant bit is a single coin flip whose probability is equal to the ratio of the elements in the second half of the probability vector to the total sum of the vector. Second, conditioning on the most significant bit renders us two different distributions on smaller supports. Combining these insights, we arrive at Algorithm \ref{alg:categbits} shown below, which converts an arbitrary integer distribution (given as vector of probabilities for input) into a binary encoding.  The algorithm first computes the distribution on the most significant bit that is required, and then conditions on it to recurse on the two halves of the distribution. The algorithm assumes that the given distribution has an upper bound that is a power of 2, but we can always pad distributions that do not satisfy this with trailing zero probabilities.



% \begin{algorithm}
%     \caption{Adapted SBK ($v \in [0, 1]^{l}$)}
%     \label{alg:categ_int}
%     \DontPrintSemicolon
%     \KwIn{Vector $v$ such that $v[i] = $ pr($i$)}
%     \KwOut{Distribution over integers $0,1,..,l-1$}
%     \color{black}
%     \eIf{len(v) == 1 or flip$\left(\frac{p[0]}{\sum p}\right)$}
%         {\Return $l$}
%         {
%         \Return SBK($v$ without first element, $l$ + 1)}
% \end{algorithm}







% \begin{proposition} \label{th:bwhcorrect}
% For a discrete distribution over the integers $\{0, 1 \ldots, 2^b - 1\}$, INT\_DIST is correct.
% \end{proposition}

% We prove that Algorithm \ref{alg:categbits} is correct in the supplementary material.  
% We provide an upper bound on the size of the BDDs compiled from the resulting tuples of Boolean random variables for the variable order described in the algorithm.

% \begin{proposition} \label{th:bwhbound}
% For a discrete distribution over the integers $\{0, 1 \ldots, 2^b - 1\}$, the resulting tuple of Boolean random variables of INT\_DIST compiles to a BDD of size $\Theta(2^b)$ when variable order matches flip instantiation order.
% \end{proposition}
% \guy{I think the reader will have a hard time imagining what this multi-rooted BDD looks like for a random number. UAI people are not super comfortable with BDDs.}
% \sh{This still does not mention variable order? Also, this is a bit confusing because a tuple compiles to a multi-rooted BDD, so we should say ``compiles to a tuple of BDDs of total size ...'' or something like that.}
% Notably, an alternate approach in the literature for encoding categorical distributions such as integers ~\citep{10.5555/1619332.1619409} leads to a BDD that is at least larger by a linear factor. We provide more details of these proofs in the appendix.

% \begin{proposition} \label{th:sbkbound}
% A discrete distribution over integers represented using the approach of \cite{10.5555/1619332.1619409}, with variables ordered by integer order, gets compiled to a BDD of size $\Theta(b2^b)$.
% \end{proposition}

% \guy{People get less annoyed when you call a simple theorem a proposition. You should only have 1 or 2 theorems in a paper and the rest are probably propositions.}



% \todd{I also wish the "is correct" part were more formal, but that would require introducing some more notation.}



% means that we look at the first and second halves of the original vector separately.


% We observe that we can assume $v$ to be of length $2^b$ for some $b$, something which can always be accomplished by padding 0s. This allows us to correspond nicely to a distribution over $b$ bits. \ryan{Less emphasis on padding in the two sentences before? ``We first observe that we can assume $v$ to be of length $2^b$ for some $b$, and thus correspond to a distribution over $b$ bits, by left-padding with 0s if necessary."} We also note that the probability of the most significant bit needed is easy to compute: it is simply the ratio of the elements in the second half of the vector to the total sum of the vector. Finally, conditioning on the most significant bit means that we look at the first and second halves of the original vector separately. Combining these insights, we arrive at Algorithm \ref{alg:categbits}. In this algorithm, we compute the distribution on the most significant bit needed to represent the vector before recursively calling on the two halves of the distribution. 

% We capture this intuition in Algorithm \ref{alg:categbits}.




% Consider a pair of Bernoulli random variables $(\flip(0.3), \flip(0.4))$, where the argument of $\flip$ indicates the bias of the random variable. If we interpret this sequence as a binary number $n = b_1b_0$ where $b_1 = \flip(0.3),  b_0 = \flip(0.4))$, what we get is actually a distribution over integers: $\Pr(n=3) = \Pr(b_1 = 1 \wedge b_0 = 1) = 0.3 * 0.4 = 0.12$; $\Pr(n=1) = (1-0.3) * 0.4 = 0.28$ \ryan{delete... } in similar manner \ryan{...until here?}, and the rest of the distribution can be computed in a similar manner. We can see that distributions over integers can therefore be represented by sequences of random flips. 

% To be able to use such a binary representation for probabilistic arithmetic, we must be able to construct a binary representation for arbitrary (bounded) integer distributions. This problem can be summarized as the following task: given a vector $v=[p_0, .., p_n]$, we want a sequence of random bits $b_k..b_0$ such that $\Pr(b_k..b_0 = i) = p_i$. 

% Consider the case $v = [0.1, 0.2, 0.3, 0.4]$. This corresponds to a distribution over $\{0, 1, 2, 3\}$, which requires exactly 2 bits. Therefore, we want to return two bits $b_1$ and $b_0$. Suppose we start by trying to express $b_1$; clearly, it should be true with probability $0.3+0.4=0.7$. Therefore, we start by letting $b_1 = \flip(0.7)$. If we try to construct $b_0$ in a similar manner, we run into a problem: the overall probability of $b_0$ is $0.4+0.2 = 0.6$, but if we let $b_0 = \flip(0.6)$, $\Pr(b_1b_0 = 3) = 0.7 * 0.6 = 0.42 \neq 0.4$, and every other probability will be similarly incorrect. What we must instead do is condition $b_0$'s value on $b_1$: if 
% $b_0 = \flip(0.4/(0.4+0.3))$ when $b_1 = 1$ and $b_0 = \flip(0.2/(0.2+0.1))$ when $b_1 = 0$, we will get the wanted distribution. In a PPL, this can be expressed with a conditional expression such as \verb#b0 = if b1 then flip(4/7) else flip(2/3)#. 

% How can we generalize the above approach to any vector $v$? We observe that we can assume $v$ to be of length $2^b$ for some $b$, something which can always be accomplished by padding 0s. This allows us to correspond nicely to a distribution over $b$ bits. \ryan{Less emphasis on padding in the two sentences before? ``We first observe that we can assume $v$ to be of length $2^b$ for some $b$, and thus correspond to a distribution over $b$ bits, by left-padding with 0s if necessary."} We also note that the probability of the most significant bit needed is easy to compute: it is simply the ratio of the elements in the second half of the vector to the total sum of the vector. Finally, conditioning on the most significant bit means that we look at the first and second halves of the original vector separately. Combining these insights, we arrive at Algorithm \ref{alg:categbits}. In this algorithm, we compute the distribution on the most significant bit needed to represent the vector before recursively calling on the two halves of the distribution. 
% \sh{I see some typos in this code, have a look}

\subsubsection{Uniform Integers} \label{subs:uniform}
% \sh{We can pump up the contribution in this section a little bit. We can say ``the previous encoding strategy works for arbitray distributions on integers, but in practice one often encounters common highly-structured distributions such as the uniform. One advantage of our approach is that we can exploit the structure of certain kinds of distributions in order to scale significantly better than the general approach in Algorithm~\ref{alg:categbits}.}

The previous encoding strategy works for arbitrary distributions on integers, but in practice one often encounters common highly-structured distributions such as the uniform. One advantage of our approach is that we can exploit the structure of such distributions in order to scale significantly better than the general approach presented in Algorithm~\ref{alg:categbits}. In particular, the structure of the uniform distribution allows for a special encoding with fully independent flips. 

Since the probability of every integer is equal, we can encode a uniform distribution over integers $\{0, 1, \ldots, 2^b-1\}$ by adding each of the values $2^0, 2^1, \ldots, 2^{b-1}$ independently with probability $0.5$. From a bitwise perspective, this is the same as independently setting each bit of the number to be true with probability $0.5$. As an example, consider the uniform distribution over integers $\{0, 1, \ldots, 15\}$.  Clearly, each possible instantiation of $(\mathit{flip}(0.5), \mathit{flip}(0.5), \mathit{flip}(0.5), \mathit{flip}(0.5))$ is equally likely, and thus so is each integer in the range. 
% Using this idea, we present Algorithm \ref{alg:uniformgen}, a method to generate a uniform distribution using a lesser number of flips. 

\begin{algorithm}
\SetAlgoLined
    \caption{UNIFORM($n$)}
    \label{alg:uniformgen}
    \DontPrintSemicolon
    \KwIn{Positive integer $n$}
    \KwOut{Integer uniformly distributed over $0, \ldots, n-1$}
        $b \gets \lfloor\log_2(n)\rfloor$\;
        \eIf{ $\flip\left(\frac{2^b}{n}\right)$}{
            $\mathit{sum} \gets 0$\;
            \For{$i \gets 0$ \KwTo $b-1$}{
                \uIf{$\flip\left(\frac{1}{2}\right)$} {$\mathit{sum} \gets \mathit{sum} + 2^i$}}
                
        \Return $\mathit{sum}$\;
        }
        {\Return UNIFORM($n - 2^b$) + $2^b$} 
\end{algorithm}

The method described above works for uniform distributions whose range has size $2^b$ for some $b$; for ranges whose size is not a power of 2, we use the fact that we can decompose any natural number into a sum of powers of 2. This enables a uniform distribution over any range to be represented as a mixture of multiple uniform distributions over smaller power-of-two ranges. We 
% capture this insight \sh{we use the word ``insight'' too much. Find an alternative} 
formalize this idea in Algorithm \ref{alg:uniformgen}, which gives a method for representing uniform distributions starting at 0; the correctness of this approach is shown in the supplementary material. We can then use this approach to achieve any uniform distribution by adding an offset. Just like the previous algorithms, this algorithm is implemented by constructing sequences of bits in a manner equivalent to arithmetic. 

We note that unlike the BITWISE\_INT algorithm, where less significant bits have a dependence on more significant bits, our uniform algorithm leverages independence between the bits. Therefore, the BDD obtained when using UNIFORM is more compact than for our other algorithms, and fewer variables are needed to represent such a distribution. 

%Notably, this method of expressing a uniform distributions uses $\mathcal{O}(\log^2(n))$ flips, in contrast to the $\mathcal{O}(n)$ flips used by INT\_DIST. We also provide proof of this result in the supplementary material. 

% Consider an 4-bit unsigned binary integer. Now, suppose that we replace each bit of this integer with a Bernoulli variable (henceforth also referred to as a "flip") of bias $0.5$ (or $\flip(0.5)$). It is clear that any possible instantiation of the 4 bits is equally likely - that is, they are just as likely to be $0000$ as $1111$, or any other combination. Therefore, the resulting integer will be equally likely to take on any of the values $0, .., 15$, each with probability $\frac{1}{16}$. 

% \begin{definition} \label{discrete-uniform}
% The (discrete) uniform distribution, ${U_{a,b}}$ over the integers $\{a, a+1, .., b-1, b\}$
% \begin{flalign}
% \;\;\;\;\;\;\;\;\;\;&{U_{a,b}}(i) = \begin{cases} 
%       \frac{1}{b-a} & i \in {a, a+1, .., b}   \\
%       0 & \text{otherwise} 
%   \end{cases}&&\nonumber
% \end{flalign}
% We call $b-a$ the \emph{width} of this distribution. \william{can justify this better}
% \end{definition}


% The example given has a special property - it is the \emph{discrete uniform distribution} ${U_{0,15}}$ from 0 to 15, as defined in Definition \ref{discrete-uniform}. We will refer to this simply as the uniform distribution as we are working purely in the discrete setting.  From the example above, it is clear that we can represent any uniform distribution ${U_{0, 2^b}}$ with a bitwise representation simply by making each of $b$ bits $\flip(0.5)$. Notably, this only requires a logarithmic number of flips in the width of the distribution. 


\subsection{Efficient Integer Operations}


% The two terminal nodes with values $0$ and $1$ correspond to the possible values these bits can take on; each intermediate is a vari  The intermediate nodes correspond to biased coin flips, each with some probability taken from the program; 

% \todd{Pointing to these examples without explaining them is useless.  I was expecting something like a small example of a BDD representing a distribution over a two-bit integer to give a flavor for how BDDs represent distributions, which seemed to be the point being made by this paragraph.  Otherwise, what is the point of this paragraph?} \guy{Yeah we should explain how to read these, say that the value of the first bit depends on the outcome of X biased coin flips, explain how it depends, and explain how later bits share some of the same coin flip values. Then explain that WMC is just computing the probability of reaching the 1-terminal from each node recursively} \todd{Also will you describe the weights and how they are used somewhere?  Overkill to describe WMC in detail but maybe at least showing how to compute the weight of a single accepting path on a small example would give some intuition.}

% In the future sections, we will use BDDs to justify encoding performance: as inference can be done in time linear in the size of a BDD, a more compact BDD means faster inference. 

% We will use certain conventions in this discussion. Depending on context, a BDD and BDD size could refer to either the structure and size of a multi-rooted BDD (when doing inference on a distribution over integers) or a BDD with a single root (such as when computing less-than over two integers, which is a binary function). \guy{I don't see the point of the previous, isn't a single rooted BDD just a special case of a multi-rooted BDD, why special case it?} 




% To that end, we employ a binary encoding of integer distributions, whereby such a distribution is represented as a tuple of Boolean random variables.  Our approach will compile such tuples into \emph{multi-rooted BDDs}, with one root per Boolean random variable in the tuple.  Finally, we will encode integer operations as logical circuits that use this representation for its inputs and outputs.

% In the following subsections, we describe construction of needed probability distributions using probabilistic programs. These programs make use of the syntax \emph{flip($r$)} which is treated as a non-deterministic choice between true and false associating weights $r$ and $1-r$ respectively. This choice induces different executions of a program, one considering the particular \textit{flip} to be \texttt{true} and other considering the \textit{flip} to be \texttt{false}. Following these divergent executions, these programs return a joint probability distribution over the random variables $X_b, ... X_1$. The weight of each execution (denoted by $w$ in the proofs) is the product of the flip weights along that execution path. Further, the probability of any event B can be obtained by summing up the weights of all the executions in which event B occurs. 


% We represent distributions over integers as tuples of Boolean random variables where each Boolean random variable can be compiled to a weighted Boolean formula. This allows us to capture conditional independencies between bits for inference. In this spirit, the algorithms described in the subsequent sections take a description of the probability distribution and construct a tuple of Boolean random variables for the integer distributions. The integer arithmetic operations combine these tuples of Boolean random variables into a logical circuit which we describe further in the later section. 


% \subsection{Integer Arithmetic}
While the binary representations of discrete and uniform distributions over integers are interesting, they do not by themselves necessarily give much advantage. If adding two such distributions still requires an explicit enumeration of all possible sums, then we have not gained much over the existing inference approaches. However, the binary encoding enables us to leverage the structure of integers to do much better than this for common operations. In this section, we demonstrate this for integer comparisons and addition.
% While we now have a method of expressing distributions over integers, there still remains the inference task to actually compute these values. A knowledge compilation approach to compiling a probabilistic program converts the program into a weighted Boolean formula. With our integer representation, each binary digit actually has its own WBF; to compute a distribution over possible integers, we are actually interested in the joint distribution over the binary digits. 

\subsubsection{Integer Comparisons}

The comparison operator on binary tuples can be implemented using logic circuits like those in computer hardware. Suppose we compute $a < b$ for two binary numbers $a = 001$ and $b = 100$. The circuit first compares the most significant bits (MSBs) of these numbers, which are 0 and 1 respectively --- enough to know that $a < b$ is true. If the two numbers instead had the same MSB, we would need to start this comparison over on the remaining bits. This process of computing $a < b$ highlights its key conditional independences. First, given that the MSBs of the operands are different, the result of $a < b$ does not depend on the remaining bits. Second, given that the MSBs of the operands are the same, the computation on the remaining bits does not depend on the value of the MSBs. This structure gets automatically exploited when we use this standard logic circuit to compare integer distributions, where the inputs are now weighted Boolean formulas represented as BDDs, rather than bits.

More concretely, consider the following probabilistic program which defines two random variables having a uniform distribution over the integers $\{0, 1, \ldots, 7\}$ and then outputs the probability of one integer being less than the other.

\label{equality_prog}
\begin{lstlisting} 
a = uniform(0, 8)
b = uniform(0, 8)
return (a < b)
\end{lstlisting}

% \begin{figure}
%     \centering
%     \includesvg[width = 0.25\textwidth]{equal_bdd.svg}
%     \caption{BDD for Equality}
%     \label{equality}
% \end{figure}

Enumerating all the values that $a$ and $b$ can take in the above program would lead to 64 combinations. In contrast, the BDD for the comparison operation has size linear in the number of bits, as it exploits conditional independences.  We later present empirical results demonstrating that this leads directly to better scalability for discrete inference.

\subsubsection{Integer Addition}

%As the previous section did for comparison, this section describes addition and the structure that falls out of it which can be exploited for probabilistic inference. 
Consider two binary numbers $a = 001$ and $b = 100$ that we wish to add. %Computing $a+b$ involves computing 4 new bits, each a function of the bits of $a$ and $b$.  
The least significant bit (LSB) of $a+b$ is computed as the \emph{xor} of the LSBs of $a$, $b$ and 0 (the initial carry bit). The carry, computed as the \emph{and} of the LSBs of $a$ and $b$, is passed on to the next bit and the same process is repeated for the remaining bits. The process described above shows that given the carry bit, each bit of the result is independent of the less significant bits of the operands.  Similar to the comparison operation, encoding addition on integer distributions as a logical circuit directly exploits such conditional independences to produce a compact BDD, which in turn leads to significant performance gains. 
The manner in which addition corresponds to a compact BDD has been explored before; \citet{WEGENER2004229} show that for an optimal variable ordering, there is a linear bound on the size of the BDD for addition. 
% \todd{The last sentence seems like a non sequitur.  Clearly we are not guaranteeing an optimal variable ordering so it's unclear how it's relevant.} \guy{actually the optimal order is known, so we do guarantee it if we want to}

% \todd{I commented out the program that was here because we don't actually do anything with it.}

%\begin{verbatim}
%    a = uniformInt(0, 7)
%    b = uniformInt(0, 7)
%    result = a + b
%    return result
%\end{verbatim}

%Now consider adding two integer distributions as shown in the above program. Since the distributions corresponding to $a$ and $b$ already have a binary representation (as described in Section~\ref{subs:uniform}, we can directly combine the weighted Boolean formulas for their bits to get the bits of the resulting distributions. This process automatically encodes the conditional independence described above resulting in considerable scaling of integer addition. We demonstrate the speedups gained by exploiting this structure in a later section.

% \subsubsection{Binary encoded integers as BDDs}
% % \william{title..}


% Consider a binary encoding of integers, where we have a series of bits each corresponding to a power of two. It is clear that we can extend this to a probabilistic space by replacing each (deterministic) bit instead with a distribution; the resulting joint distribution over all bits will correspond to a distribution over integers. It is this observation that we leverage with our integer representation approach. 

% From this perspective, inference on a distribution over binary-encoded integers is equivalent to inference on a distribution over a tuple of bits. 
% % \todd{Technically you mean inference over a distribution on tuples of bits.} 
% For a knowledge compilation system using BDDs as a compilation target such as that used in Dice ~\citep{HoltzenOOPSLA20}, this manifests as compilation to and inference over a multi-rooted BDD, where the roots of the BDD correspond to the bits of the integer. The structure of the BDD will depend on encoding-specific details; specifically, how the distribution over bits is constructed. Some example multi-rooted BDDs, corresponding to different encoding methods, are given in Figure \ref{bddexample}; the encoding methods they correspond to will be further explored in the future sections. 

% In the Figure ~\ref{bddexample} BDDs, the roots correspond to the three bits of the number being represented, here notated $b_2$, $b_1$, and $b_0$ from most to least significant. Each is distributed according to various biased coin flips; these flips are represented here by the intermediate decision nodes labeled $f_i$. Looking at the bit $b_0$, the possible paths through the decision nodes to the terminal nodes $0$ and $1$ encode the manner in which $b_0$'s value depends on these random flips. The other bits $b_2$ and $b_1$ have similar decision paths depending on the same random flips. Decision nodes are reused when possible, with this reuse corresponding to repeated structure in the decision functions for each bit. 

% In Dice, the structure of the BDD is derived from the program, with the decision nodes being the random flip variables. Given this data structure, WMC for inference is simply recursively computing the probability of reaching the terminal node $1$ for each intermediate node given the biases. This can be done in time linear in the size of the BDD~\cite{Darwiche_2002}, and so a more compact BDD means faster inference --- something we will use to justify encoding performance in the future sections. 


In this section, we described the structure of two arithmetic operations, comparison and addition, independently. When composing these operations together, the compilation of weighted Boolean formulas will naturally compose as described in previous work~\citep{HoltzenOOPSLA20}. 
The sizes of the resulting BDDs depend highly on the variable ordering --- but even when the variable ordering is not optimal, conditional independences can still be identified and exploited, as shown by the experiments in the next section.


\section{Empirical Evaluation}\label{section:experiments}
\begin{figure*}[t]
  \centering
\begin{minipage}[t]{0.45\linewidth}
\vspace{0pt}
  \centering
  \begin{tikzpicture}
    \begin{axis}[
        height=3.5cm,
        width=4.5cm,
        grid=major,
        xmode=linear,
        ymode=log,
        xtick={2,4,6,8,10,12,14},
        % xtick={2, 4, 6, 8, 12},
        ytick={0.001, 0.01, 0.1, 1, 10, 100, 1000},
        xlabel={$a<b$},
        ylabel={\small time (s)},
        legend columns=1,
        yticklabel style = {font=\scriptsize},
        xticklabel style = {font=\scriptsize},
        legend style={at={(-15em,2.5em)},anchor=north west, font=\tiny},
        yminorticks=false
    ]
    \addplot[mark=none, thick, blue, mark size = 1.0pt] table [x index={0}, y index={2}] {\lessthandata}; %
    \addlegendentry{\small ProbLog (binary)};
    \addplot[mark=none, thick, blue, densely dashed, mark size = 1.0pt] table [x index={0}, y index={1}] {\lessthandata}; %
    \addlegendentry{\small ProbLog (native)};
    \addplot[mark=none, thick, red, mark size = 1.0pt] table [x index={0}, y index={3}] {\lessthandata}; %
    \addlegendentry{\small {Dice.jl} (binary)};
    \addplot[mark=none, thick, red, densely dashed, mark size = 1.0pt] table [x index={0}, y index={4}] {\lessthandata}; %
    \addlegendentry{\small {Dice.jl} (one-hot)};
    \end{axis}
  \end{tikzpicture}
  % \caption{$a < b$}
  \label{fig:lessthan}
\end{minipage}~
\begin{minipage}[t]{0.25\linewidth}
\vspace{0pt}
  \centering
  \begin{tikzpicture}
    \begin{axis}[
		height=3.5cm,
		width=4.5cm,
		grid=major,
    xmode=linear,
    ymode=log,
    xtick={2,4,6,8,10,12,14},
    ytick={0.001, 0.01, 0.1, 1, 10, 100, 1000},
    yticklabel style = {font=\scriptsize},
    xticklabel style = {font=\scriptsize},
    xlabel={$a==b$},
    yminorticks=false
    ]
    \addplot[mark=none, thick, blue] table [x index={0}, y index={2}] {\equalsdata}; %
    \addplot[mark=none, thick, blue, densely dashed] table [x index={0}, y index={1}] {\equalsdata}; %
    \addplot[mark=none, thick, red] table [x index={0}, y index={3}] {\equalsdata}; %
    \addplot[mark=none, thick, red, densely dashed] table [x index={0}, y index={4}] {\equalsdata}; %
    \end{axis}
  \end{tikzpicture}
  % \caption{$a==b$}
  \label{fig:laddernetscale}
\end{minipage}~
\begin{minipage}[t]{0.2\linewidth}
\vspace{0pt}
  \centering
      \begin{tikzpicture}
    \begin{axis}[
		height=3.5cm,
		width=4.5cm,
		grid=major,
    xmode=linear,
    ymode=log,
    xtick={2,4,6,8,10,12, 14},
    ytick={0.001, 0.01, 0.1, 1, 10, 100, 1000},
    yticklabel style = {font=\scriptsize},
    xticklabel style = {font=\scriptsize},
    xlabel={$\mathbb{E}[a + b]$},
    yminorticks=false
    ]
    \addplot[mark=none, thick, blue] table [x index={0}, y index={2}] {\plusdata}; %
    \addplot[mark=none, thick, blue, densely dashed] table [x index={0}, y index={1}] {\plusdata}; %
    \addplot[mark=none, thick, red] table [x index={0}, y index={3}] {\plusdata}; %
    \addplot[mark=none, thick, red, densely dashed] table [x index={0}, y index={4}] {\plusdata}; %
  \end{axis}
\end{tikzpicture}
% \caption{$\mathbb{E}[a + b]$}
\label{fig:motiv_scale}
\end{minipage}
\caption{Time needed to compute the given operation on two random integers with varying bitwidth (x-axis). 
% X-axis is the bit-width. %The binary representation provides a benefit over existing native representation methods. 
% \todd{Mention the reason that expectation is used for the addition experiment?}
}
  \label{fig:scaling}
\end{figure*}
In this section, we empirically evaluate our integer compilation strategy. While we have demonstrated that a binary encoding exposes structure that knowledge compilation can exploit, it remains to be seen if this can improve the performance of probabilistic programs. In addition, we have yet to show how our approach compares to other inference methods on larger, more complex arithmetic models. We seek to answer the following questions. 

\begin{enumerate}[label=\textbf{\arabic*}),noitemsep]
    \item Does a binary encoding benefit existing knowledge compilation based languages?
    
    \item Does our approach outperform those of existing PPLs that support exact discrete inference?
\end{enumerate}

To this end, we have implemented our integer representation in \verb#Dice.jl#, a PPL embedded in Julia that uses the same knowledge compilation approach as Dice~\citep{HoltzenOOPSLA20}. 
% \sh{we can trim this odd discussion if we only compare against Dice.jl} 
In \verb#Dice.jl# (binary), unsigned random integers are implemented using the strategy described in the previous section; signed random integers are additionally implemented as a natural extension. The language provides the syntax \verb#discrete(..)# for arbitrary integer distributions and \verb#uniform(..)# for uniform distributions, implemented using the algorithms in Section~\ref{sec:math}, as well as the operators $=, <, +, -, *, /,$ and $\%$. Simple operators are implemented as logical circuits, while more complex operators are implemented by composing simpler ones.\footnote{Our implementation and code for all experiments are available at \url{https://github.com/Juice-jl/Dice.jl/tree/arithmetic}.}

Reported runtimes are a median over at least 5 runs; all experiments were run with a 1 hour timeout. 
% The experiments were run on a server with a 2.20GHz CPU and 504GB RAM. 
A best-effort attempt was made for each language to implement benchmarks in a maximally performant manner. More experimental details are available in the supplementary material. 

% \vspace{-100pt}



\subsection{Improving Knowledge Compilation Languages with a Binary Encoding}
% \william{name section better}

% \begin{figure*}[t]
%     \minipage{\textwidth}
%       \includegraphics[width=\linewidth]{dice.png}
%     \endminipage
%     \caption{Dice performance on micro-benchmarks.}
%     \label{dice_results}
% \end{figure*}

% \begin{figure*}[t]
%     \minipage{\textwidth}
%       \includegraphics[width=\linewidth]{problog.png}
%     \endminipage
%     \caption{ProbLog performance on micro-benchmarks.}
%     \label{problog_results}
% \end{figure*}

% \sh{I thought we were only comparing Dice.jl and no longer against Dice? We should be precise here}
In this section, we demonstrate the benefit a binary encoding brings to knowledge compilation based languages. We compare our integer representation method to the methods of the PPL Dice~\citep{HoltzenOOPSLA20} and the probabilistic logic programming language ProbLog~\citep{Fierens2015}, both of which use knowledge compilation as their approach to inference. We do this by comparing the time needed to compute the simple arithmetic operations $a+b$, $a<b$, and $a==b$ on random integers of width $2^n$ for varying $n$ from 1 to 15. Here, the runtime of each method corresponds to the time needed to compile and run inference on each representation, effectively measuring how well the knowledge compilation based inference can exploit the structure of each simple function. For addition, we use the expectation of the sum as our target computation to avoid an output distribution with an exponentially increasing support. 

To allow for a fair comparison between ProbLog's native integer representation and our binary representation, we implement equivalent ProbLog programs computing the arithmetic operations, one using native ProbLog encodings and one using a binary representation. Both programs can then be run using ProbLog, controlling for the specific knowledge compilation system. To compare with Dice's native one-hot integer encoding, we implement both the one-hot encoding and our binary encoding in \verb+Dice.jl+. We then run the simple arithmetic programs using both encodings. % In both cases, arithmetic for the binary encoding is implemented as a logical circuit over the bits of the representation. 

The results of these experiments are presented in Figure~\ref{fig:scaling}. 
We can clearly see that our binary encoding outperforms the existing integer representation strategy used in each language; while at small distribution widths (on the order of $2^4$), they are roughly comparable, our approach scales much better to larger integer distributions. 


\subsection{Complex Arithmetic Models}
\begin{table*}[!htp]\centering
\caption{Runtimes in seconds for probabilistic models using integers in various PPLs. \xmark\, indicates a timeout (over 1 hour).}\label{baselines}
\scriptsize
\begin{tabular}{lrrrrr}\toprule
Benchmarks              &{Dice.jl} (binary)         & {Dice.jl} (one-hot)   &WebPPL &Psi (DP) &Psi \\\midrule
book                    &\textbf{5.297}    &\xmark  &\xmark                  &\xmark &\xmark \\\cmidrule{1-6}
tugofwar                &\textbf{0.106}    &2660.373 &21.012             &\xmark &\xmark \\\cmidrule{1-6}
caesar-small            &\textbf{0.041}    &4.968 &0.074 &2.022       &402.196 \\
caesar-medium           &0.239             &39.518 &\textbf{0.135}     &12.505 &\xmark \\
caesar-large            &0.556             &122.109 &\textbf{0.227}     &30.387 &\xmark \\\cmidrule{1-6}
ranking-small           &\textbf{0.007}    &0.025 &0.83 &103.572      &\xmark \\
ranking-medium          &\textbf{0.022}     &0.077&\xmark &318.658         &\xmark \\
ranking-large           &\textbf{0.048}     &0.150&\xmark &330.51          &\xmark \\\cmidrule{1-6}
radar1                  &\textbf{0.034}     &0.664&118.002            &394.525 &2.517 \\\cmidrule{1-6}
floydwarshall-small     &\textbf{0.009}    &0.152 &\textbf{0.009}     &0.115 &113.467 \\
floydwarshall-medium    &\textbf{0.515}    &624.220 &9.51               &2792.14 &\xmark \\
floydwarshall-large     &\textbf{3.406}     &\xmark&\xmark                  &\xmark &\xmark \\\cmidrule{1-6}
linear extensions-small &\textbf{0.003}     &0.004&0.016              &0.351 &5.153 \\
linear extensions-medium &\textbf{0.007}   &0.013 &0.465              &111.38 &\xmark \\
linear extensions-large &\textbf{0.072}    &0.164 &162.009            &\xmark &\xmark \\\cmidrule{1-6}
triangle-small          &\textbf{0.086}     &102.544&3.693              &616.746 &482.14 \\
triangle-medium         &\textbf{0.455}     &1123.171&28.354             &\xmark &\xmark \\
triangle-large          &\textbf{17.365}   &\xmark &\xmark           &\xmark &\xmark \\\cmidrule{1-6}
gcd-small               &2.876             &\xmark &\textbf{0.189}     &24.33 &\xmark \\
gcd-medium              &103.614           &\xmark &\textbf{2.501}     &467.581 &\xmark \\
gcd-large               &\xmark            &\xmark &\textbf{46.626}    &\xmark & \xmark\\\cmidrule{1-6}
disease-small           &7.91              &\xmark &\textbf{1.093}     &109.242 &1009.848 \\
disease-medium          &764.212           &\xmark &\textbf{327.545}   &\xmark &\xmark \\\cmidrule{1-6}
luhn-small              &\textbf{0.039}    &0.594 &0.428              &44.164 &\xmark \\
luhn-medium             &\textbf{4.575}    &23.933 &42.372             &\xmark &\xmark \\
\bottomrule
\end{tabular}
\end{table*}

We also evaluated \verb+Dice.jl+ on more complex models involving distributions over integers. The models were taken from a variety of sources. These include examples involving integers from the existing PPL literature, various examples using continuous distributions adapted to a discrete space, natural modeling tasks using integers such as ranking and text manipulation, and traditional algorithms in a probabilistic setting. A short description of our benchmarks and their sources is given in the supplementary material.

As a point of comparison, we use two other PPLs supporting exact discrete inference. We identify two major classes of exact inference approaches used for discrete probabilistic programs: enumerative methods, which work by enumerating all paths through the program, and symbolic methods, which represent and compute the probability distribution through symbolic expressions. We compare against WebPPL~\citep{dippl} from the former category and Psi~\citep{gehr2016psi} from the latter. 
% \footnote{We ran on WebPPL v0.9.15 and Psi version \texttt{ec2cfc14a62a168afe7ce1d7269b92cf2882b830}.}

We also compare against a version of \verb#Dice.jl# that uses a one-hot encoding of integer distributions as a proxy for existing knowledge compilation approaches; this comparison avoids language-specific differences in performance. Compiled BDD sizes for the programs are provided in the supplementary material as an additional metric. 
% An additional metric for knowledge compilation based approaches, compiled BDD sizes, is provided in the supplementary material. 

Our results are summarized in Table~\ref{baselines}. Many of our benchmark models can naturally be scaled to different sizes; they are implemented in small, medium, and large (corresponding to model size) variants to display the scaling behavior. Psi supports two exact inference algorithms: a default symbolic exact inference algorithm (``Psi'') and its specialized dynamic programming inference algorithm (``Psi (DP)''). 

For the majority of benchmarks, our approach outperforms the current exact inference approaches, often achieving an orders-of-magnitude speedup. This result empirically validates the ability of our compilation strategy to exploit arithmetic structure in order to improve inference performance. We observe that the binary encoding outperforms the one-hot encoding with the same underlying knowledge compilation approach, demonstrating its superiority in exposing arithmetic structure. 

We note that \verb#Dice.jl# does not always outperform WebPPL, the enumeration-based approach. We make special note of these examples. The caesar example introduces many random integers but immediately makes an observation on their value, thereby reducing the enumeration task and making this tractable for all approaches. The disease example contains parametric distributions on integers; for example, a binomial distribution with parameter $n$ distributed by a uniform distribution. These distributions have much less structure to exploit, and so our approach becomes essentially enumerative, but with additional overhead in compilation. The GCD example, which makes repeated use of the mod ($\%$) operator, similarly has a harder-to-exploit structure. However, we can see in general that our approach scales well to larger examples and outperforms existing PPLs that support exact discrete inference.  





\section{Enabling Continuous Priors with Discrete Distributions}\label{sec:betabern}
% \william{steven: continuous priors through discrete distributions?}
% \william{todd: make sure to highlight in intro}

Previous sections presented an inference strategy for integer distributions allowing for the scaling of integer arithmetic. We now demonstrate an interesting application of these integers: representing the continuous Beta distribution in a discrete space. 
Continuous priors are an essential part of Bayesian reasoning.
% In this section we show how we can leverage our new efficient encoding of integers to efficiently implement continuous posterior updates.
One particularly useful prior for discrete PPLs like \texttt{Dice.jl} is the \emph{Beta prior} $\Beta(\alpha,\beta)$, which is conjugate to the Bernoulli. The $\Beta$ distribution is continuous and thus not amenable to direct representation in \texttt{Dice.jl}.  However, we observe that if a Beta prior for a Bernoulli random variable has integral parameters $\alpha$ and $\beta$, then the posterior distribution is also a Beta with integral parameters. This section explains how we use this observation to represent the Beta distribution in \texttt{Dice.jl}. 

% series of transformations in \verb#Dice.jl# that demonstrate how these integers can be used to increase the expressiveness of the language. \william{todd: too vague, surprising/interesting application of these integers}



Assume we have the following program where \texttt{Dice.jl} is extended to permit restricted classes of $\Beta$ priors, where the parameters $\alpha$ and $\beta$ must be constant integers:
\begin{lstlisting}[mathescape=true]
$\theta$ = Beta(1, 2)
x = flip($\theta$)
observe(x)
return $\theta$
\end{lstlisting}
We can perform inference for the posterior by exploiting well-known conjugacy results between $\Beta$ priors and Bernoullis. In particular, observing that \texttt{x} is true, as is done on Line 3 above, increases the \emph{pseudocount} $\alpha$ by 1, making the posterior for $\theta$ become $\Beta(2,2)$. Similarly, observing that \texttt{x} is false increases the pseudocount $\beta$ by 1.  

% \sh{I stopped writing here: please propagate the above changes through} 

To automate this approach in \texttt{Dice.jl}, we introduce program variables A and B to represent the pseudocounts $\alpha$ and $\beta$, respectively. We then conditionally update these pseudocounts after each \verb+flip+: increment $\alpha$ if the \verb+flip+ returns true; otherwise increment $\beta$.  Doing so ensures that later observations will have the desired effect on the pseudocounts.

The only remaining challenge is that discrete PPLs that employ knowledge compilation, like \verb+Dice.jl+, only support \verb+flip+s whose parameters are constants, so the \verb+flip+$(\theta)$ on Line 2 above is not supported.  To encode it, we use the fact that $\mathbb{E}[\Beta(\alpha, \beta)] = \frac{\alpha}{\alpha+\beta}$, so the \verb+flip+ on Line 2 can be simplified to \verb+flip+$(\frac{A}{A+B})$. 
Unfortunately, \texttt{Dice.jl} still does not support this construct, since $\frac{A}{A+B}$ is not a constant.  However, $A+B$ is always a deterministic integer, since each observation increases it by exactly 1.  Therefore, we can introduce a variable T representing A+B and encode \verb+flip+$(\frac{A}{A+B})$ as $\uniform(0, T) < A$ (where $\uniform(0, n)$ is the uniform distribution over the integers $\{0,\dots,n-1\}$). Finally, we observe that it is not necessary to maintain both variables A and B, since B is derivable from A and T. The final transformed version of the program is as follows: 

% which can be rewritten using our previous transformation as $\uniform(0, A+B) < A$.

% A flip with parameter taken from this distribution will be true with probability the expectation of the distribution  $a/(a+b)$ \sh{this sentence does not typecheck for me. what is the expectation over?}. 
% We can therefore use our transformation from above to represent such a flip as $\uniform(0, A+B) < A$, representing the parameters $a$ and $b$ as program variables A and B. As they are deterministic values, this is efficient \sh{this sentence is unclear for me. try to be more specific about which transformations you're using, and give a bit more justification for why it is efficient.}. 

% This flip is not interesting on its own; we also require an update to obtain the correct posterior on Line 4. To do this, we can simply leverage the conjugacy property discussed above: the program variables A and B represent the pseudo-counts $\alpha$ and $\beta$, respectively. We conditionally update these pseudocounts on the outcome of the flip: if true, increment $\alpha$, otherwise increment $\beta$. 



\begin{lstlisting}[mathescape=true]
A = 1, T = 3 
x = uniform(0, T) < A 
A = if x then A+1 else A
T = T + 1   
observe(x)
return (A, T-A)
\end{lstlisting}

If we wish to draw another flip on the same Beta prior, we can simply repeat the code in lines 2--4 above. In this way, what we have actually implemented is a Beta-Bernoulli process via a P\'olya urn model --- something analyzed in detail in probabilistic programs by \cite{staton}. We also note that this representation strategy --- an implementation of an urn model --- can also be used for many other distributions in addition to the Beta~\citep{polya}. An example application of this Beta prior --- learning Bayesian network parameters --- is given in the supplementary material. 

% So long that our prior parameters are integers, they will always remain in the integer domain, and so can be entirely represented with discrete values. 
% Note that it is also possible to handle rational-valued parameters by multiplying each term by an appropriate scaling factor.

% \william{priors? random?}}
% Consider a parameterised Boolean random variable \verb#flip(k/n)#, a flip with rational number bias where $k$ and $n$ are program variables. \verb#Dice.jl# does not support this construct: the language requires the arguments to a $\flip$ to be a fixed constant at compile time. Since integers are the only numerical construct we have access to, we cannot represent such a division within the program.

% To solve this, we make the key observation that $\uniform(0, n) < k$ (where $\uniform(0, n)$ is a uniform distribution over the integers ${0,..,n-1}$) has the same probability of being true as $\flip(\frac{k}{n})$. We can therefore use this transformation to represent the parameterised flip. 
% % We recall that $k$ can be a random number, and as long as $n$ is deterministic, the uniform distribution and the less-than operator have compact representations, making this an efficient representation. A varying 
% As $k$ can be a random value, this can be used to represent distribution over possible parameters, allowing us to now encode prior distributions over flip parameters. As long as $n$ is deterministic, we can use our approach in Algorithm \ref{alg:uniformgen} to represent such a uniform distribution efficiently. 


% \begin{figure}
% \centering
% \begin{subfigure}[b]{.45\textwidth}
% \begin{lstlisting}
% a = Beta(1, 2)
% x1 = flip(a)
% x2 = flip(a)
% observe(x1)
% observe(x2)
% return expectation(a) 
% \end{lstlisting}
% \caption{A probabilistic program using the Beta distribution as a prior}\label{example3} 
% \end{subfigure}
% \quad
% \begin{subfigure}[b]{.45\textwidth}
% \begin{lstlisting}
% A = 1
% T = 3 // = A + B = 1 + 2
% x1 = uniform(0, T) < A
% A = if x1 A + 1 else A 
% T = T + 1
% x2 = uniform(0, T) < A
% A = if x1 A + 1 else A 
% T = T + 1
% observe(x1)
% observe(x2)
% return uniform(0, T) < A
% \end{lstlisting}
% \caption{The program from \ref{example3} transformed to use an integer-based representation}\label{example4}
% \end{subfigure}
% \caption{Two probabilistic programs demonstrating an integer-based representation of the Beta distribution
% % \william{trying to decide if one large example works better than small code samples}
% }\label{betaexample}
% \end{figure}

% \william{move this to start of whole section}
 


% This single flip by itself is not interesting: a prior distribution is meant to be used for learning. By the conjugate prior property of the Beta distribution, we know if this flip is true, we should have a posterior distribution of $\Beta(a+1, b)$; if false, $\Beta(a, b+1)$. We can represent this fact by including the parameters of the Beta distribution as program variables; we can then conditionally update these parameters based on the flip, incrementing $A$ by 1 if true and $B$ by 1 if false. Now, observing a value of this flip will give us the correct Bayesian posterior parameters for the Beta distribution in the variables A and B. 

% However, suppose we want to make multiple observations - we will need another flip \sh{this discussion is confusing. You probably need to add another example program}! However, this conditional update means that $A$ and $B$ are no longer deterministic, meaning that our flip transformation will no longer be efficient. To address this, we make the observation that while our numerator A could take on the values $a$ or $a+1$, depending on the outcome of the flip, our denominator A+B is either $(a+1)+b$, or $a+(b+1)$ - the same value! Therefore, we can keep track of our total denominator in an entirely deterministic variable. 



% The flips we draw are equivalent to Bernoulli random variables with the same Beta-distributed parameter. This means that making an observation on the flip will implicitly update the underlying Beta prior. As an unobserved flip is true with probability equal to the expectation of the underlying Beta, we can get the expectation of the posterior distribution by computing the probability of this flip --- a point estimate of the true Bayesian posterior. 

% \subsection{An Application: Bayesian Network Parameter Learning}

% \begin{figure}[t]
%     \centering
%     % \includegraphics[scale=0.15]{filler.jpg}
%     % \begin{subfigure}[b]{0.2\textwidth}
%     \begin{tabular}{cc}
    
%     \begin{subfigure}[b]{0.2\linewidth}
    
%     \begin{tikzpicture}[node distance=0.75cm, baseline=-1em]
%       %
%       %
%       % \node[draw, circle] at (0, 0) {\textsc{a}};
%       % \node[draw, circle] at (20bp, 0) {\textsc{s}};
%       % \node[draw, circle] (E) {\textsc{e}};

%       % \node (f1) at (0, 0) [bddnode] {$f_1$};

%       \node (A) at (0, 0) [bddnode] {\textsc{a}};
%       \node (S) at ($(A) + (40bp, 0)$) [bddnode] {\textsc{s}};
%       \node (E) at ($(A) + (20bp, -20bp)$) [bddnode] {\textsc{e}};
%       \node (O) at ($(E) + (-20bp, -20bp)$) [bddnode] {\textsc{o}};
%       \node (R) at ($(O) + (40bp, 0)$) [bddnode] {\textsc{r}};
%       \node (T) at ($(R) + (-20bp, -20bp)$) [bddnode] {\textsc{t}};
      
%       % \node[draw, circle] (C) {\textsc{c}};
%       % \node[draw, circle, right of=C, above of=C] (S) {\textsc{s}};
%       % \node[draw, circle, left of=C, above of=C] (P) {\textsc{p}};
%       % \node[draw, circle, left of=C, below of=C] (X) {\textsc{x}};
%       % \node[draw, circle, right of=C, below of=C] (D) {\textsc{d}};
%       %
%       %
%       %
%       %
%       %
    
%       \draw[->] (A) -- (E);
%       \draw[->] (S) -- (E);
%       \draw[->] (E) -- (O);
%       \draw[->] (E) -- (R);
%       \draw[->] (O) -- (T);
%       \draw[->] (R) -- (T);
%       %

%       %
%       %

%       %
%       %

%       %
%       %
%       %
%       %
%       %
%       %
%       %
%     \end{tikzpicture}
%     \end{subfigure}
%     &
%     \begin{subfigure}[b]{0.2\linewidth}
%     %
%     \begin{tabular}{c|c|c|c|c|c}
%         A & S & E & O & R & T \\
%         \hline
%         1  & ?  & ? & 1 & 1 & 1 \\
%         1  & 0  & ? & 1 & 0 & 1 \\
%         1  & 0  & 1 & ? & 0 & 1 \\
%         1  & ?  & 1 & 0 & ? & 1 \\
%         0  & 0  & 1 & 1 & ? & 1 \\
%         0  & 1  & ? & 1 & 1 & ? \\
%         % 1  & ?  & 0 & 0 & 1 & 0 \\
%         % 0  & ?  & 1 & ? & ? & ? \\
%         % 1  & 1  & 1 & ? & 1 & ? \\
%         % 1  & 0  & 0 & ? & 1 & 1 \\
%         \hline
%     \end{tabular}
%     \end{subfigure}
%     \\
%     % \begin{minipage}{1cm}
%     \begin{subfigure}[b]{0.2\linewidth}
%     \begin{tabular}{c|c|c}
%         $\alpha$ & $\beta$ & Pr(.)  \\
%         \hline
%         9 & 3 & 0.226\\
%         4 & 8 & 0.035\\
%         \hline
%     \end{tabular}
%     \end{subfigure}
%     &
%     \begin{subfigure}[b]{0.2\linewidth}
%     \begin{tikzpicture}
%     \begin{axis}[
%         height=3cm,
%         width=4cm,
%         grid=major,
%         xmode=linear,
%         ymode=linear,
%         % xtick={2,4,6,8,10},
%         % ytick={0.001, 0.01, 0.1, 1, 10, 100, 1000},
%         xlabel={$\theta_{O|E}$}
%         % ylabel={PDF}
%         % legend columns=4,
%         % legend style={at={(-5em,-7em)},anchor=north west}
%     ]
%     \addplot[mark=none, thick, blue] table [x index={0}, y index={1}] {\pdfdata}; %
%     % \addlegendentry{WebPPL};
    
%     \end{axis}
%   \end{tikzpicture}
%   \end{subfigure}
%     \\
%   \end{tabular}
%     \caption{Example posterior on the survey Bayesian network computed using our representation method. \texttt{Dice.jl} outputs a distribution over Beta parameters which can be combined to get the posterior probability density function.}
%     \label{beta_example_plot}
% \end{figure}


% % \william{steven: either priors, or highly structured constraints (priors are all sampling based, struggle with hd-discrete constraints}

% % \william{typically people use things like stan, we do this exactly}

% One natural application of a Beta prior is for the parameters of a (binary) Bayesian network. The task of Bayesian network parameter learning can be described as follows: given a set of data consisting of instantiations of network variables, we want to find the network parameters maximizing the probability of this data. One interesting case is when our data is incomplete; that is, the instantiations have missing values. In this setting, a Bayesian approach to parameter learning must consider all (exponentially many) possible missing value instantiations~\cite{darwiche_2009}.

% This setting can naturally be modeled within \texttt{Dice.jl} using the pipeline described in Figure ~\ref{beta_example_plot}. A Bayesian network can be expressed in the probabilistic program with Beta priors on each network parameter; a dataset can then be observed. By returning the distribution over our Beta parameters $\alpha$ and $\beta$, we obtain our posterior, a mixture over Beta distributions. These can then be manually combined to obtain the exact posterior density function. 

% % This application naturally fits our probabilistic programming language with the process described above: we can easily model a Bayesian network with random flips drawn from Beta priors, representing our prior belief for each parameter. Observations on the network variables will then update our beliefs, from which we can get an expectation in the manner described above.
% % The distribution over the internal program variables representing the parameters of the Beta distribution is a mixture over Beta distributions that is the exact posterior to our problem. Compared to common sampling-based approaches for representing the Beta distribution, 
% % An example of this learning, on the \verb#survey#~\citep{survey} Bayesian network, is given in Figure ~\ref{beta_example_plot}. 



% % We have implemented such an application on the \verb#survey#~\citep{survey} Bayesian network, on which we can scale to roughly 7 missing data points \sh{plots? this feels unfinished}. 

% In the previous section, we described how a series of integer transformations allows us to represent a Beta prior in a manner suitable for a learning task, an expansion of the expressive capability of \verb#Dice.jl#. We note that this representation strategy --- an implementation of an urn model --- can also be used for many other distributions \citep{polya}, further increasing the space of distributions \verb#Dice.jl# can handle. 


\section{Related Work}\label{sec:related}

% Our approach is the first one to handle probabilistic integers using their binary encoding. Existing PPLs support integer distributions in distinctive ways as surveyed below. 
\emph{PPL Inference.}~~
Knowledge-compilation-based PPLs are most closely related to this work~\citep{HoltzenOOPSLA20, Raedt2007, Fierens2015, saad2021sppl, pfanschilling2022sum}. All these languages stand to benefit from our new binary encoding. Other PPLs perform exact discrete inference by eliminating discrete variables via enumeration or variable elimination; these approaches lose global structure and hence cannot exploit arithmetic structure as in our approach~\citep{dippl, bingham2019pyro}. Symbolic methods support integers by representing them as a symbolic formula or program~\citep{gehr2016psi,narayanan2016probabilistic}; we believe that in principle it may be possible to adapt these symbolic representations to use a binary representation, but currently these systems do not. Recent work uses probability generating functions (PGFs) to represent (potentially unbounded) discrete distributions~\citep{EPIUGF2023}. 
PGFs represent the distribution symbolically, but do not appear to be compatible with our strategy for binary encodings.
Sampling-based inference algorithms work very well for probabilistic programs with continuous distributions, but do not exploit the global structure of integer arithmetic~\citep{KANTAS2009774, hoffman2014no, Arouna+2004+1+24,10.5555/3061053.3061128}.
Finally, there are algorithms that seek to efficiently model integer distributions with recursion~\citep{KY76, pmlr-v108-saad20a}; these approaches are orthogonal to ours, as we do not use recursion.
% I know the reviewer asked for this, but it's really not relevant.


\emph{Graphical models.}~~
Probabilistic graphical model (PGM) based inference methods~\citep{pfeffer2009figaro, McCallum2009, sanner2012symbolic, koller2009probabilistic} support integers by treating them as categorical distributions. PGMs struggle to represent arithmetic: for instance, a CPT for adding two $n$-bit numbers requires $O(2^n)$ entries.
% \poorva{Following paragraph was promised in rebuttal}

% \textit{

%Other related works would include work based on probability generating function and beta problog.

\section{Conclusion}\label{sec:conclusion}
We presented a strategy for encoding random integers in probabilistic programs via a binary representation, which allows arithmetic operations to be performed through standard Boolean circuits. When combined with the knowledge compilation approach to probabilistic inference, this strategy naturally exploits structure in arithmetic that current approaches do not account for. We showed empirically that this allows existing discrete PPLs to scale to significantly more complex probabilistic models. One interesting consequence is that we can now leverage conjugacy to represent the Beta distribution, a continuous distribution, in purely discrete programs.

% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    % \william{waiting on the guy blurb from honghua}
    This work was funded in part by the DARPA PTG Program under contract number HR00112220005, NSF grants \#IIS-1943641, \#IIS-1956441, \#CCF-1837129, \#CCF-2220408, and a gift from RelationalAI. GVdB discloses a financial interest in RelationalAI.
    
    % \sh{Please reference NSF Grant }

    % \emph{All} acknowledgements go in this section.
\end{acknowledgements}

% References
\bibliography{cao_757}
\end{document}
