\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{subcaption}
\usepackage{float}
\usepackage{amsthm}
\usepackage{amsfonts}


\usepackage[ruled,noend]{algorithm2e}
\SetKwInOut{Input}{Input}
\SetKwInOut{Output}{Output}

\usepackage{xspace}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}

\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{arrows}

\DeclareMathOperator*{\E}{\mathbb{E}}

\newcommand{\be}{\begin{eqnarray}}
\newcommand{\ee}{\end{eqnarray}}
\newcommand{\bes}{\begin{eqnarray*}}
\newcommand{\ees}{\end{eqnarray*}}
\newcommand{\complexset}{\mathbb{C}}
\newcommand{\natset}{\mathbb{N}}
\newcommand{\realset}{\mathbb{R}}
\newcommand{\A}{\mathbb{A}}
\newcommand{\B}{\mathbb{B}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\setminusx}{\!\setminus\!}
\newcommand{\tO}{\tilde{O}}

\newcommand{\cM}{\mathcal{M}}
\newcommand{\cQ}{\mathcal{Q}}

\DeclareMathOperator{\coef}{coef}
\DeclareMathOperator{\diag}{diag}
\newcommand{\col}{\mathit{col}}
\newcommand{\row}{\mathit{row}}

\usepackage{stackengine} 
\newcommand\osym[1]{\stackMath\mathbin{\stackinset{c}{0ex}{c}{0ex}{#1}{\raisebox{-0.5mm}{\scalebox{1.7}{$\bigcirc$}}}}}




\title{On Inference and Learning With Probabilistic Generating Circuits}

\author[1]{\href{mailto:<juha.harviainen@helsinki.fi>?Subject=Your UAI 2023 paper}{Juha Harviainen}{}}
\author[2]{Vaidyanathan Peruvemba Ramaswamy}
\author[1]{Mikko Koivisto}
\affil[1]{%
    Department of Computer Science\\
    University of Helsinki\\
    Helsinki, Finland
}
\affil[2]{%
    Faculty of Informatics\\
    TU Wien\\
    Vienna, Austria
}
  
  \begin{document}
\maketitle

\begin{abstract}
Probabilistic generating circuits (PGCs) are economical representations of multivariate probability generating polynomials (PGPs). They unify and extend decomposable probabilistic circuits and determinantal point processes, admitting tractable computation of marginal probabilities. However, the need for addition and multiplication of high-degree polynomials incurs a significant additional factor in the complexity of inference. Here, we give a new inference algorithm that eliminates this extra factor. Specifically, we show that it suffices to keep track of the highest degree coefficients of the computed polynomials, rendering the algorithm linear in the circuit size. In addition, we show that determinant-based circuits need not be expanded to division-free circuits, but can be handled by division-based fast algorithms. While these advances enhance the appeal of PGCs, we also discover an obstacle to learning them from data: it is NP-hard to recognize whether a given PGC encodes a PGP. We discuss the implications of our ambivalent findings and sketch a method, in which learning is restricted to PGCs that are composed of moderate-size subcircuits.
\end{abstract}

\section{Introduction}


Much recent research on probabilistic modeling has focused on the computational tractability of inference. A class of inference queries---such as marginal probabilities or most probable explanations---is said to be \emph{tractable} in a model family if any query can be computed in time polynomial in the model size. For this concept to be useful, one would require succinct yet expressive models. 


To this end, two significant model families supporting tractable marginal probability queries are (decomposable) \emph{probabilistic circuits} (PCs) \citep{Choi20} and \emph{determinantal point processes} (DPPs) \citep{Borodin05,Kulesza12}. PCs unify a range of models composed of conditional independences and mixture distributions, including arithmetic circuits \citep{Darwiche03}, sum-product networks \citep{Peharz19,Poon11}, probabilistic sentential decision diagrams \citep{Kisa14}, and cutset networks \citep{Rahman16}. DPPs, on the other hand, represent global negative correlations among the variables through the determinant of a kernel matrix.
The two families are distinct: there are PCs whose representations as DPPs are exponentially larger, and vice versa \citep{Zhang20}. Another recent model family is the mixture of all trees that supports tractable likelihood computation and estimation of marginals \citep{Selvam23}.

Both decomposable PCs and DPPs can be succinctly represented by \emph{probabilistic generating circuits} (PGCs) \citep{Zhang21}. A PGC encodes a probability distribution into a \emph{probability generating polynomial} (PGP) using a circuit that consists of four types of nodes: sum nodes, product nodes, indeterminates, and constants. The key difference to PCs is that the sum nodes in PGCs can also take negative weights. This significantly boosts the expressiveness of the model family: a circuit with negation can be exponentially smaller than a monotone circuit representing the same polynomial \citep{Valiant80}. The ability to economically represent DPPs is (just) one remarkable incarnation of this power of PGCs.  
%

Marginal probabilities remain tractable in PGCs: they can be computed by a single pass over the circuit, using arithmetic over univariate polynomials \citep[Thm~1]{Zhang21}. However, since the degree of the polynomials can be as large as the number of variables---say, hundreds or thousands in practice---the computational overhead can be significant in comparison to the highly efficient inference algorithms available for decomposable PCs \citep{Peharz20}. 
%
While the expensive polynomial arithmetic can be avoided in special, somewhat basic PGC architectures (called SimplePGCs) \citep{Zhang21}, slow inference hampers the deployment of more general PGCs.


As the first contribution of this paper, we give a new inference algorithm. Our algorithm is linear in the circuit size, thus removing the extra factor due to polynomial arithmetic. 
%
While this complexity is the best possible for PGCs given as explicit circuits, one might hope for even faster algorithms if the circuit admits a smaller, implicit representation. The determinant of an $n \times n$ matrix is a prime example: it can be evaluated in time $O(n^3)$ using division, whereas division-free circuits, practical for moderate $n$, have size $\Omega(n^4)$; see, e.g., Bird's [\citeyear{Bird11}] algorithm.\footnote{The asymptotically fastest known division-free algorithms reduce the additional factor from $n$ to around $n^{0.3}$ \citep{Kaltofen05}.}  
%
As a second contribution, we show that PGCs that involve determinants, indeed, need not be compiled into explicit division-free circuits---we can employ the fastest division-based algorithms! 


The latter half of this paper investigates the task of learning a PGC from data. We consider the setting in which the structure of the circuit is given but model parameters---the weights of the addition nodes---are to be fitted to the data. Here a standard method would be to (locally) maximize a penalized likelihood function using (stochastic) gradient descent; implementing this method is straightforward  \citep[Sect.~5]{Zhang21} when all points in the ``natural'' parameter space are feasible solutions, i.e., the PGC encodes a PGP. 
%
Unfortunately, allowing arbitrary negative weights turns out to jeopardize that method: as we will show, not only do there exist infeasible PGCs, but recognizing whether a given PGC is feasible is NP-hard. As a remedy, we end our paper by sketching a framework for learning restricted classes of PGCs: we argue that optimization or sampling in the parameter space can be made computationally feasible for PGCs that are composed of moderate-size subcircuits using the sum, product, and hierarchical composition operations \citep[Prop.~2 and 3]{Zhang21}. 


In this paper, we restrict ourselves to analytic, theoretical considerations. While we can, no doubt, expect our inference algorithms to bring orders-of-magnitude speedups in practice, there remain numerous intriguing implementation issues related to, e.g., numerical stability and good use of modern hardware; these are best addressed by devoted engineering research (cf.\ \cite{Peharz20}). Likewise, we expect that implementing and experimenting with the sketched approach to learn PGCs will require a significant dedicated effort, which is beyond the present work. 

\begin{figure*}[t!]
	\captionsetup[subfigure]{position=b}
	\subcaptionbox{Joint probability table \label{fig:joint}}{\begin{subfigure}[p]{.5\textwidth}
	    \small
	    \centering
	    \begin{tabular}{cccc}
	    	\toprule
	    	$X_1$ & $X_2$ & $X_3$ & $\Pr(\cdot)$\\
	    	\midrule
	    	0 & 0 & 0 & $0.050$\\
	    	1 & 0 & 0 & $0.300$\\
	    	0 & 1 & 0 & $0.050$\\
	    	1 & 1 & 0 & $0.250$\\
	    	0 & 0 & 1 & $0.025$\\
	    	1 & 0 & 1 & $0.150$\\
	    	0 & 1 & 1 & $0.025$\\
	    	1 & 1 & 1 & $0.150$\\
	    	\bottomrule
	    \end{tabular}
	\end{subfigure}}
	\subcaptionbox{One possible PGC \label{fig:pgc}}{\begin{subfigure}[p]{.5\textwidth}
		\small
		\begin{tikzpicture}
	        \node[shape=circle,draw,inner sep=2] (P1) at (0,0) {$+$};
	        \node[shape=circle,draw,inner sep=2] (T1) at (-1,-1) {$\times$};
	        \node[shape=circle,draw,inner sep=2] (T2) at (1,-1) {$\times$};
	        \node[shape=circle,draw,inner sep=2] (T3) at (0,-2) {$\times$};
	        \node[shape=circle,draw,inner sep=2] (P2) at (-2,-2) {$+$};
	        \node[shape=circle,draw,inner sep=2] (P3) at (-1,-3) {$+$};
	        \node[shape=circle,draw,inner sep=2] (P4) at (1,-3) {$+$};
	        \node[shape=circle,draw,inner sep=2] (Z11) at (1,-2) {$z_1$};
	        \node[shape=circle,draw,inner sep=2] (Z12) at (2,-2) {$z_2$};
	        \node[shape=circle,draw,inner sep=2] (Z13) at (-2,-3) {$z_1$};
	        \node[shape=circle,draw,inner sep=2] (Z2) at (-1,-4) {$z_2$};
	        \node[shape=circle,draw,inner sep=2] (Z3) at (2,-4) {$z_3$};
	        \node[shape=circle,draw,inner sep=2.9] (C1) at (-3,-3) {$1$};
	        \node[shape=circle,draw,inner sep=2.9] (C2) at (-2,-4) {$1$};
	        \node[shape=circle,draw,inner sep=2.9] (C3) at (1,-4) {$2$};
	        \path [-triangle 45] (T1) edge node[left,pos=0.7] {$0.025$} (P1);
	        \path [-triangle 45] (T2) edge node[right,pos=0.7] {$-0.05$} (P1);
	        \path [-triangle 45] (Z11) edge node[right,pos=0.7] {} (T2);
	        \path [-triangle 45] (Z12) edge node[right,pos=0.7] {} (T2);
	        \path [-triangle 45] (P2) edge node[right,pos=0.7] {} (T1);
	        \path [-triangle 45] (P3) edge node[right,pos=0.7] {} (T3);
	        \path [-triangle 45] (P4) edge node[right,pos=0.7] {} (T3);
	        \path [-triangle 45] (T3) edge node[right,pos=0.7] {} (T1);
	        \path [-triangle 45] (C1) edge node[left,pos=0.6] {$1$} (P2);
	        \path [-triangle 45] (C2) edge node[left,pos=0.6] {$1$} (P3);
	        \path [-triangle 45] (C3) edge node[left,pos=0.6] {$1$} (P4);
	        \path [-triangle 45] (Z13) edge node[right,pos=0.6] {$6$} (P2);
	        \path [-triangle 45] (Z2) edge node[right,pos=0.6] {$1$} (P3);
	        \path [-triangle 45] (Z3) edge node[right,pos=0.6] {$1$} (P4);
	    \end{tikzpicture}
	    \centering
	\end{subfigure}}
	\caption{The joint probability table and one possible PGC for the polynomial of Eq.~(\ref{eq:1}).}
	\label{fig:example}
\end{figure*}


\section{Preliminaries}

This section reviews the main concepts of PGCs, following \cite{Zhang21}.

\subsection{Representation}

Let $X_1, \dots, X_n$ be binary random variables and $\Pr(\cdot)$ a probability distribution over them. Then, the \emph{probability generating polynomial} (PGP) of the distribution with indeterminates $z_1, \dots, z_n$ is
\[ g(z_1, \dots, z_n) \coloneqq \sum_{S \subseteq \{1, \dots, n\}} c_S z^S, \]
where $z^S = \prod_{i \in S} z_i$ and
\[ c_S = \Pr\big( \{X_i = 1\}_{i \in S}, \{X_i = 0\}_{i \not\in S} \big). \]

\emph{Probabilistic generating circuits} (PGCs) are one way of representing these polynomials. They are directed acyclic graphs (DAGs) with edges oriented towards a single sink node, and each node is of one of four types:
\begin{itemize}[label={}]
	\item $\osym{+}$: Sum of two real-weighted inputs;
	\item $\osym{\times}$: Product of two inputs;
	\item $\osym{c}$: Constant real value $c$;
	\item $\osym{z_i}$: Indeterminate $z_i$.
\end{itemize}
The \emph{scope} of a node in a PGC is the set of indeterminates on which the node depends (i.e., the descendants in the graph). 
The \emph{scope of a PGC} is the scope of its sink node. We may refer to a scope by the index set of the indeterminates.    
%
The \emph{size} of a PGC is the number of edges in the graph. 

As an example, Figure~\ref{fig:example} contains the joint probability table and one possible PGC for the generating polynomial 
\begin{align}
	\begin{split}\label{eq:1}
		&0.15 z_1 z_2 z_3 + 0.025 z_2 z_3 + 0.15 z_1 z_3\\
		&\quad + 0.025 z_3 + 0.25 z_1 z_2 + 0.05 z_2 + 0.3 z_1 + 0.05
	\end{split}
\end{align}
that stems from rewriting the polynomial in the form
\bes
	0.025(1 + 6z_1)(1 + z_2)(2 + z_3) - 0.05z_1z_2\,.
\ees	


\subsection{Composition}\label{se:comp}

Let $f_k$ be a PGP with scope $S_k$, for $k = 1, \ldots, \ell$. 
We obtain a new PGP with scope $S_1 \cup \cdots \cup S_\ell$ by any of the following composition operations  \citep{Zhang21}: 
\begin{description}
\item[]\emph{Sum:} $w_1 f_1 + \cdots + w_\ell f_\ell$, where the weights $w_k$ are nonnegative and sum up to $1$. 
\item[]\emph{Product:}  $f_1 \cdots f_\ell$ for pairwise disjoint scopes $S_k$. 
\item[]\emph{Hierarchical:} $h(f_1, \ldots, f_\ell)$ for pairwise disjoint scopes $S_k$ and some $\ell$-variate PGP $h$. 
\end{description}

As an example of a hierarchical composition, consider the generating polynomial  
\be \label{eq:dpp}
	h(y_1, \ldots, y_\ell) = \det\big(I + L \diag(y_1, \ldots, y_\ell)\big)
\ee	  
of a DPP with kernel $L$. Now, let $f_1, \ldots, f_\ell$ be PGPs with disjoint scopes that partition the set of indeterminates $z_1, \ldots, z_n$. By the substitutions $y_k := f_k$ we obtain an $n$-variate PGP. Referring to the representations of the involved polynomials as circuits, \cite{Zhang21} call the compound model a  \emph{determinantal PGC} (DetPGC).

\subsection{Inference}

PGCs support polynomial-time marginal probability queries. The idea is to extract a particular coefficient of a univariate polynomial obtained by assigning appropriate values to the indeterminates of the circuit. For a univariate polynomial $p(t) = b_0 + b_1 t + \cdots + b_n t^n$, we write $\coef_d p(t)$ for the coefficient $b_d$ of the term $t^d$.

\begin{lemma}[\cite{Zhang21}]\label{lem:coef}
	Let $\Pr(\cdot)$ be a probability distribution with a PGP $g(z_1, \dots, z_n)$. For disjoint index sets $A, B \subseteq \{1, \dots, n\}$, define a univariate polynomial in an indeterminate $t$ by 
	%\begin{equation}\label{eq:2}
	\[
		g_{A,B}(t) \!:=\! g\big(\{z_i \!= t\}_{i \in A}, \{z_i \!= 0\}_{i \in B}, \{z_i \!= 1\}_{i \not\in A \cup B}\big)\,.
	\]
	%\end{equation}
	Then, the marginal probabilities are given by 
	%\bes
	\[ 
		\Pr\big( \{X_i = 1\}_{i \in A}, \{X_i = 0\}_{i \in B} \big) = \coef_{|A|} g_{A,B}(t)\,.
	\]
	%\ees	 
\end{lemma}

We can compute the marginal probability for a given query $(A, B)$ by a bottom-up pass through the PGC: first we set each indeterminate (in source nodes) to $t$, $0$, or $1$; then in sum and product nodes, respectively, we add and multiply the two polynomials represented by the child subcircuits. Since two degree-$n$ polynomials can be added in time $O(n)$ and multiplied in time $O(n \log n \log \log n)$ \citep{Cantor91,Schonhage71}, the polynomial $g_{A, B}(t)$ can be computed in time $O(m n \log n \log \log n)$ in a circuit of size $m$. Note that any terms of degree higher than $n$ can be safely ignored as they get cancelled out eventually by the definition of a PGP.

\section{Faster Inference}

We present three improved algorithms for answering marginal probability queries $(A, B)$ in PGCs. Our algorithms make use of Lemma~\ref{lem:coef}, however avoiding the per-node arithmetic on univariate polynomials. 

Our first algorithm is based on point evaluations and applies to arbitrary PGCs. 
Evaluating a PGC of size $m$ at a given point takes time $O(m)$. Since the corresponding univariate polynomial (after the substitution) is of degree at most $|A| \le n$, it is uniquely determined by $n+1$ evaluations over different values of the variable $t$.
\begin{theorem}\label{thm:pointeval}
	Any marginal probability in a PGC of size $m$ and $n$ variables can be computed in time $O(m n)$.
\end{theorem}
\begin{proof}
	Evaluate the circuit at the points $t = 0, 1, \dots, |A|$. This takes $O(m n)$ time. Finally, apply fast polynomial interpolation \citep{Horowitz72} in time $O(n \log^2 n)$. Since the circuit mentions each indeterminate at least once, we have $m \geq n$. The asymptotic bound follows. 
\end{proof}

\begin{remark}\label{rem:pointeval}
The algorithm only needs a way to evaluate the circuit at a given point---the evaluation method need not restrict its operations to additions and multiplications. Supposing one evaluation takes time $T$, any marginal probability can be computed in time $O(T n)$.  
\end{remark}

\subsection{Linear-time Inference}

We next further expedite the computations by a factor of $n$, yielding a truly linear running time $O(m)$. 
%
Our algorithm requires that the PGC is decomposable: 
%
\begin{definition}
{\rm 
A product node in a PGC is \emph{decomposable} if its children have disjoint scopes. A PGC is \emph{decomposable} if all its product nodes are decomposable.
} 
\end{definition}

Decomposable PGCs are a significant subclass of PGCs. All example PGCs considered in this and previous work \citep{Zhang21} are decomposable. Note that the composition operations (Section~\ref{se:comp}) preserve decomposability. 

The key observation is that for every node in the circuit, the degree of the corresponding univariate polynomial is at most $|A|$, as the scopes are disjoint.\footnote{A non-decomposable PGC may contain a product node that corresponds to a multivariate polynomial where    some indeterminate has degree larger than $1$. Such terms cancel out at the end.} 
%
This observation allows us to only track the coefficients of the highest degree terms (with a slight abuse of terminology allowing zero-valued coefficients---we consider $t^2 - t^2$ to have degree $2$ in a sense, with a coefficient $0$).

The algorithm operates on a semiring $\langle \mathbb{R} \times (\mathbb{N} \cup \{0\}), +, \times \rangle$ where the elements $(v, d)$ describe the coefficient $v$ of the highest degree term $t^d$ that has been seen. Thus, the operators are defined as follows: Let $(v_1, d_1)$ and $(v_2, d_2)$ be elements of this semiring. Then, 
	\[ (v_1, d_1) + (v_2, d_2) \coloneqq \left\lbrace
	\begin{aligned}
		&(v_1, d_1), &&\textrm{if } d_1 > d_2,\\
		&(v_2, d_2), &&\textrm{if } d_1 < d_2,\\
		&(v_1 + v_2, d_1), &&\textrm{if } d_1 = d_2\\
	\end{aligned}
	\right. \]	
	and
	\[ (v_1, d_1) \times (v_2, d_2) \coloneqq (v_1v_2, d_1 + d_2)\,. \]
	We also define scalar multiplication by $w \in \mathbb{R}$ for the weighted addition:
	$w \cdot (v, d) \coloneqq (wv, d).$
	
	Similar structures have been used before, e.g., for counting optimal variable assignments in graphical models \citep{Marinescu19} and with infinitesimal numbers in probabilistic programming \citep{Martires23}.
	
	The algorithm proceeds by replacing $t$ by $(1, 1)$ and any constant $c$ by $(c, 0)$. Then, the sum and product nodes are evaluated using the operations of the semiring as described in Algorithm~\ref{algo:coef}. Clearly, the whole process takes $O(m)$ time.
	
\begin{algorithm}[t!]
{\small
    \Input{A PGC with a sink node $s$ after the substitution.}
    \Output{A pair $(v, d)$.}
    \If{$s$ \textup{has children}}{
    	Call Algorithm~\ref{algo:coef} with the children as sink nodes\;
    	Save outputs as $(v_1, d_1)$ and $(v_2, d_2)$\;
    	\If{$s$ \textup{is a product node}}{    
	    	\Return $(v_1, d_1) \times (v_2, d_2)$\;
	    }
	    \Else{
	    	\Return $w_1 \cdot (v_1, d_1) + w_2 \cdot (v_2, d_2)$ with $w_1, w_2$ being the corresponding weights for addition\;
		}
	}
	\If{$s$ \textup{has value} $c$}{
		\Return $(c, 0)$\;
	}
	\Return $(1, 1)$\;
    \caption{Fast inference in decomposable PGCs}
    \label{algo:coef}
}
\end{algorithm}

\newcommand{\extract}{\varphi}

	We claim that the desired coefficient is obtained from the output $(v, d)$ by the function \[
	\extract_{|A|}(v, d) \coloneqq \left\lbrace\begin{aligned}
		&v, &&\textrm{if $d = |A|$},\\
		&0, &&\textrm{otherwise}.
	\end{aligned}\right.
	\]
	
	The results are proven formally in the following theorem:

\begin{theorem}
	Any marginal probability in a decomposable PGC of size $m$ can be computed in time $O(m)$.
\end{theorem}
\begin{proof}
	We prove that Algorithm~\ref{algo:coef} computes the marginal probability correctly by induction on the size of the circuit. Consider the four types the sink node can have:
	\begin{itemize}
		\item Constant $c$: The polynomial is $c$ and the corresponding semiring element $(c, 0)$. In both cases the output will be $c$ if and only if $|A| = 0$ and $0$ otherwise.
		\item Indeterminate $z_i$: Similar to above with semiring element $(1, 1)$ and $|A| = 1$.
		\item Addition: Let $p$ and $q$ be the univariate polynomials that are the inputs of the node and $w_1$ and $w_2$ the corresponding weights for addition. The equality \[ \coef_{|A|} (w_1 \cdot p + w_2 \cdot q) = w_1 \cdot \coef_{|A|} p + w_2 \cdot \coef_{|A|} q \] implies that it suffices to compute the sum of the coefficients from two smaller circuits. A straightforward case analysis for the outputs of the subcircuits proves the result.
		\item Multiplication: Similar to addition, let $p$ and $q$ be the input polynomials and $(v_1, d_1)$ and $(v_2, d_2)$ the outputs for the subcircuits, respectively. Additionally, let $A_p$ be the set of indeterminates with $z_i = t$ in the subcircuit of $p$, and $A_q$ this set for $q$. The circuit is decomposable, and thus $A_p$ and $A_q$ are disjoint. If $A_p \cup A_q = A$, then 
		\[ \coef_{|A|} (p \cdot q) = (\coef_{|A_p|} p)(\coef_{|A_q|} q)\,, \]
		and by induction,
		\begin{align*}
			\!(\coef_{|A_p|}\! p)(\coef_{|A_q|}\! q)
			= &\, \extract_{|A_p|}(v_1, d_1) \cdot \extract_{|A_q|}(v_2, d_2)\\
			= &\, \extract_{|A_p| + |A_q|}(v_1 v_2, d_1 + d_2)\\
			= &\, \extract_{|A_p \cup A_q|}(v_1 v_2, d_1 + d_2)\\
			= &\, \extract_{|A|}(v_1 v_2, d_1 + d_2)\,.
		\end{align*}
		Otherwise
		\[	\coef_{|A|} (p \cdot q) = \extract_{|A|}(v_1 v_2, d_1 + d_2) = 0\]
		because $d_1 + d_2 < |A|$, completing the proof.
		%\qedhere  %savespace
	\end{itemize}	  
\end{proof}

To see why the linear-time algorithm can fail in a non-decomposable PGC, consider the PGP
\[\tfrac{1}{3}(z_1z_2 + z_1z_3 + z_2z_3) = \tfrac{1}{3}\big((z_1+z_2)(z_1+z_3)-z_1^2\big)\,. \]
Assigning $z_1 = t$ and $z_2 = z_3 = 1$ yields a term $z_1^2 = t^2$ on the right-hand side. This disrupts the computation because $z_1^2$ is always cancelled out when the formula is expanded, but the algorithm still finds a semiring element $(0, 2)$.

However, it is possible to modify the algorithm to allow overlap between the scopes. Let $(v, d)$ be the output from evaluating Algorithm~\ref{algo:coef} with $z_i = t$ for all $i$ on a PGC. Then, $\mu \coloneqq \max\{0,d - n\}$ measures the \emph{degree of disjointness} of the PGC in a sense, higher values meaning more overlap. It is also an upper bound for the degree of disjointness of all its subcircuits. This suggests that maintaining the coefficients of the $\mu+1$ highest-degree terms suffices for evaluating marginal probabilities in general PGCs in time $O(m \mu \log \mu \log \log \mu)$ with fast polynomial multiplication---nearly linear time in $m$ for moderately small overlap. In the extreme, this reduces to the algorithm of \cite{Zhang21} as it needs to maintain all terms of the polynomial.

\subsection{Inference for Determinantal Forms}

We next present faster inference for PGCs containing determinants. We first define the form we require the circuit to have:

\begin{definition}\label{def:sf}
\rm Let $P = (p_{ij})$ be a matrix where each entry is a polynomial over indeterminates $y_1, \ldots, y_\ell$ of degree at most one, that is,
\bes
	p_{ij} = p_{ij0} + \sum_{k \in S_{ij}} p_{ijk} y_k\,, 
\ees 
with some scopes $S_{ij} \subseteq \{1,\ldots,\ell\}$ and real coefficients $p_{ijk}$. 
We say that $P$ is \emph{scope-disjoint} if each indeterminate $y_k$ appears in at most one row or column, that is, $k \in S_{ij}$ implies that $k \not\in S_{rs}$ if $r \neq i$ and $s \neq j$.  
\end{definition}

\begin{remark}
The degree restriction is here for convenience of exposition. Definition~\ref{def:sf} and Algorithm~\ref{alg:detPCG} can be extended in a straightforward manner to matrices where the entries are multilinear polynomials, the running time scaling with the number of terms in the polynomials.
\end{remark}

For example, in the DPP representation in Eq.~(\refeq{eq:dpp}) the matrix $I + L \diag(y_1, \ldots, y_\ell)$ with a kernel $L = (l_{ij})$ is scope-disjoint, since its diagonal and off-diagonal entries are, respectively, of the form $1 + l_{ii} y_i$ and $l_{ij} y_j$. 

A more complex example is provided by the matrix
\[
\frac{1}{13}
\begin{pmatrix}
y_1+2y_2 & y_1 & y_1 & -y_1\\
1+y_2 & -y_3 & y_5 & 0\\
y_2 & 1 & 2 & y_4\\
-y_2 & 0 & y_5 & 0
\end{pmatrix},
\]
demonstrating that scope-disjoint matrices are more expressive than those of DPPs: Here, the probability that $y_k = 0$ for all $k$ is zero, whereas for DPPs this has to be nonzero. (Verifying that the determinant of the matrix is a PGP is left as an exercise to the reader.)

More generally, the properties of a scope-disjoint matrix guarantee that its determinant is a multilinear polynomial, thus holding potential to encode a probability distribution.

\begin{definition}
{\rm 
A PGC is a \emph{determinantal PGC} if it is a hierarchical composition 
$h(f_1, \ldots, f_\ell)$, where $h = \det P$ with a scope-disjoint square matrix $P$ over $\ell$ indeterminates  
and $f_1, \ldots, f_\ell$ are PGPs with disjoint scopes over $z_1, \dots, z_n$. 
We call $P$ the \emph{parent} matrix and each $f_k$ a \emph{child} PGP. 
}
\end{definition}


Consider the problem of computing the marginal probability in a determinantal PGC for a given query $(A, B)$. 
%
So far, we have considered two different approaches. 
%
One is to apply Theorem~\ref{thm:pointeval} and Remark~\ref{rem:pointeval}: first evaluate the PGC in $O(n)$ points, and then perform one univariate polynomial interpolation at the end.  
%
The other approach is to represent the determinant as an explicit PGC which incurs overhead due to the size of division-free circuits.
%
Could we evaluate the determinant with fast algorithms without the need for multiple evaluations, getting the best of both worlds?

We achieve this by an approach similar to the fast inference of PGCs: we only track the highest-degree terms. However, the fast algorithms for the determinant are designed for real matrices, so we need to find a way of incorporating the information about the degrees of the terms. Note that because the matrix is scope-disjoint, each $f_k$ must appear in $P$ in at most one row (column). Thus, we can safely ignore entries on that row (column) that do not contain $f_k$ as a term, as degree $|A|$ cannot be achieved without picking an entry containing $f_k$ from that row (column). It turns out this is sufficient for the fast computation of marginal probabilities. This is more formally represented in Algorithm~\ref{alg:detPCG} and proven in Theorem~\ref{thm:detPGC}.

\begin{algorithm}[t!]
{\small
    \Input{ A determinantal PGC over $X_1, \ldots, X_n$, with an $m \times m$ parent matrix $(p_{ij})$ and child PGPs $f_1, \ldots, f_\ell$; disjoint subsets $A, B \subseteq \{1,\ldots,n\}$.}
    \Output{ The probability of $\{X_i = 1\}_{i\in A}, \{X_i = 0\}_{i\in B}$.}
    \For{$k = 1, \dots, \ell$}{
    	% Replace these ad-hoc white spaces by a proper alignment macro
    	\hspace*{0.1pt}$A_k \gets \textrm{the intersection of $A$ and the scope of $f_k$}$\;
    	$B_k \gets \textrm{the intersection of $B$ and the scope of $f_k$}$\;
    	\hspace*{3pt}$c_k \gets \textrm{the marginal probability for $(A_k, B_k)$ in $f_k$}$\;
    }
    \For{$i = 1, \dots, m$}{
       	\hspace*{-2.8pt}$\row_i \gets \{k : \textrm{$f_k$ appears only on row $i$ and $A_k \neq \emptyset$} \}$\;
    }
    \For{$j = 1, \dots, m$}{
    	$\col_j \gets \{k : \textrm{$f_k$ appears only on column $j$ and $A_k \neq \emptyset$} \}$\;
    }
    $c_0 \gets 1$\;
    \For{$i = 1, \dots, m$}{
    	\For{$j = 1, \dots, m$}{
		% $A_{ij} \gets \bigcup_{k \in S_{ij}} A_k$\;
		\hspace*{3pt}$r_{ij} \gets 0$\;
		\For{\textup{each} $k \in \{0\} \cup S_{ij}$}{
			%\If{$A_{ij} = A_k$}{
			\If{$\row_i, \col_j \subseteq \{k\}$}{ 
				$r_{ij} \gets r_{ij} + p_{ijk} c_k$\;
			}
		}
	 }
     }
     \Return the determinant of the matrix $(r_{ij})$
    \caption{Fast inference in determinantal PGCs}
    \label{alg:detPCG}
}
\end{algorithm}

As an example of the execution of Algorithm~\ref{alg:detPCG}, consider the earlier matrix where each $y_k$ is substituted by $f_k(z_k) = z_k$. Then, the transformation of $P$ into $R$ with $A = \{1, 2, 3\}$ and $B = \{4\}$ goes as follows:
\[
\begin{pmatrix}
z_1+2z_2 & z_1 & z_1 & -z_1\\
1+z_2 & -z_3 & z_5 & 0\\
z_2 & 1 & 2 & z_4\\
-z_2 & 0 & z_5 & 0
\end{pmatrix}
\mapsto
\begin{pmatrix}
0 & 0 & 1 & -1\\
0 & -1 & 0 & 0\\
1 & 0 & 2 & 0\\
-1 & 0 & 1 & 0
\end{pmatrix}
\]

Now, the only permutations with a nonzero value are $(4, 2, 1, 3)$ and $(4, 2, 3, 1)$, encoding terms $z_1z_2z_3z_5$ and $2z_1z_2z_3$ in the determinant of $P$.

\begin{theorem}\label{thm:detPGC}
Any marginal probability in a determinantal PGC can be computed in time $O(m^3 + m^2 \ell + T \ell)$, assuming  
the parent matrix is of size $m \times m$ and in each child PGP any marginal probability can be computed in time $T$.
\end{theorem}
\begin{proof}
Consider Algorithm~\ref{alg:detPCG}. All for-loops can clearly be implemented to run in the claimed time.
%
It remains to show that the algorithm is correct.
 
By Lemma~\ref{lem:coef}, it is sufficient to show that for the matrix $R = (r_{ij})$ we have 
\be \label{eq:Rdet}
	\det R = \coef_{|A|} \det P(t)\,,
\ee
where the entries of $P(t)$ are given by 
\bes
	p_{ij}(t) = p_{ij}\big(\{z_i=t\}_{i\in A}, \{z_i=0\}_{i\in B}, \{z_i=1\}_{i\not\in A\cup B}\big)\,;
\ees
here we view each entry of $P$ as a multivariate polynomial in the indeterminates $z_1, \ldots, z_n$. 
For convenience, we omit showing the dependence on the sets $A$ and $B$ in the notation. 

Recall that the determinant can be written as the sum
\[ \det R = \sum_{\sigma} \mathrm{sgn}(\sigma) \prod_{i=1}^m r_{i,\sigma(i)} \]
over the permutations $\sigma$ on $\{1,\ldots,m\}$, where $\mathrm{sgn}(\sigma)$ is the sign of the permutation. Fix an arbitrary permutation $\sigma$. It suffices to show that
\begin{equation}\label{eq:detequal}
	\prod_{i=1}^m r_{i,\sigma(i)} = \coef_{|A|} \prod_{i=1}^m p_{i,\sigma(i)}(t)\,.
\end{equation}

Let $A_{ij} \coloneqq \bigcup_{k \in S_{ij}} A_k$. Say that a permutation $\sigma$ \emph{partitions} $A$ if $|A| = \sum_{i=1}^m |A_{i,\sigma(i)}|$.

Assume first that $\sigma$ does not partition $A$. Then the right-hand side of Eq.~(\refeq{eq:detequal}) is zero, as the degree of the polynomial cannot reach $|A|$. Accordingly, we can select an $f_k$ such that $A_k \neq \emptyset$ and $k \not\in S_{i,\sigma(i)}$ for all $i$. Because $P$ is scope-disjoint, we can further select an $i$ such that $f_k$ appears only on row $i$ or column $j := \sigma(i)$. Now, as $k \not\in S_{ij}$ but $k \in \row_i$ or $k \in \col_j$, the algorithm assigns $r_{ij}$ to zero, thus rendering the left-hand side of Eq.~(\refeq{eq:detequal}) zero, as desired.

For the rest of the proof, assume that $\sigma$ partitions $A$. Since $P$ is scope-disjoint, we care only about the highest-degree terms in the factors of the product on the right-hand side of Eq.~(\ref{eq:detequal}). Therefore, it can be rewritten as
\[
\prod_{i=1}^m \coef_{|A_{i,\sigma(i)}|} p_{i,\sigma(i)}(t).
\]

Consider a single factor of this product. By Lemma~\ref{lem:coef} and the definition of $p_{ij}$, the coefficient corresponds to a weighted sum over the precomputed marginal probabilities $c_k$:
\[
	\coef_{|A_{ij}|} p_{ij}(t) = \sum_{\substack{k \in \{0\} \cup S_{ij} \\ A_k = A_{ij}}} p_{ijk} \cdot c_k,
\]
where $A_0 \coloneqq \emptyset$.

We separate two cases: If $A_{ij} = \emptyset$, then all indeterminates appear only in other rows and columns. Thus, $\row_i = \col_j = \emptyset$ and
\[ r_{ij} = \sum_{k \in \{0\} \cup S_{ij}} p_{ijk} \cdot c_k\,, \]
as desired. 

Otherwise $A_{ij} \neq \emptyset$. Then we can have $A_k = A_{ij}$ for at most one $k \in S_{ij}$, since the scopes of $f_k$ are disjoint. 

Now, if such a $k$ exists, then
\[ \coef_{|A_{ij}|} p_{ij}(t) = p_{ijk} \cdot c_k\,.\]
But because we assumed that $\sigma$ partitions $A$, and because $P$ is scope-disjoint, the indeterminates in $A \setminus A_{ij}$ must appear in other columns and rows. Thus, $\row_i, \col_{j} \subseteq \{k\}$ and, thereby, the algorithm correctly assigns $r_{ij}$ to $p_{ijk} \cdot c_k$.

Otherwise, no such $k$ exists and there is an $l \in S_{ij}$ with a nonempty set $A_{l} \subseteq A_{ij} \setminus A_k$. Now, $l \in \row_i$ or $l \in \col_{j}$, meaning that \[ r_{ij} = 0 = \coef_{|A_{ij}|} p_{ij}(t), \] concluding the proof.
\end{proof}

\section{Learning}


Let us now turn to the task of finding parameter values for a given PGC structure so as to (locally) maximize a penalized likelihood function. Or, equally, consider the task of drawing a sample of parameter values (approximately) from a posterior distribution obtained by multiplying the likelihood by a prior. 
If the parameters were subject to only simple constraints, such as nonnegativity, then one could apply standard gradient-based optimization methods or Markov chain Monte Carlo methods, such as those implemented in Adam \citep{Kingma14} and Stan \citep{Carpenter17}.
%
Unfortunately, in our case the constraints are complicated: as we will show next, checking whether a given PGC encodes a probability distribution is NP-hard. Motivated by this obstacle, we end this section with a sketch of how to learn PGCs that are composed of moderate-size subcircuits.
%



\subsection{Recognizing Feasible PGCs Is Hard}

We call a PGC \emph{feasible} if it encodes a PGP, that is, all the coefficients in the generating polynomial are nonnegative. 
Thus, for recognizing that a given PGC is \emph{infeasible}, it would be sufficient to exhibit a negative coefficient. While a single coefficient can be efficiently extracted using inference algorithms, it turns out to be hard to figure out which of the exponentially many coefficients one should compute:
\begin{theorem}
	Recognizing whether a given PGC is infeasible is an NP-complete problem.
\end{theorem}
\begin{proof}
	We prove the statement with a reduction from $k$-CNF-SAT by showing that efficient recognition would yield an efficient algorithm for $k$-CNF-SAT. The main idea is to encode the SAT instance into a PGC that is infeasible exactly when the instance is satisfiable. We will ignore normalizing the distribution here: it is easy to do by dividing by the value obtained from assigning each indeterminate to $1$. See Figure~\ref{fig:reduction} for a visualization of the construction given in the following paragraphs.
    
    Essentially, we are building a probability mass function that expresses the number of unsatisfied clauses for each truth assignment. We start by associating each truth assignment \[ \{x_i = 1\}_{i \in S}, \{x_j = 0\}_{j \not\in S} \] with the monomial $\prod_{i \in S} z_i = z^S$. A truth assignment is unsatisfying if any of the clauses is unsatisfied. This leads to the following idea: Add weight $-1$ to all assignments
and then, for each clause, increment the weight of every assignment that does not satisfy the clause. Thus, only a satisfying assignment has weight $-1$.
    
	Initializing all assignments with $-1$ corresponds to the product
\begin{equation}\label{eq:product}
	-(1+z_1)(1+z_2)\cdots(1+z_n)\,.
\end{equation}    
    
    Next, encode each clause into a polynomial whose set of terms matches the set of truth assignments that do not satisfy the clause: a positive literal $x_i$ has to be false ($1$) and a negative literal $\bar{x}_j$ true ($z_j$), while every variable $x_l$ not occurring in the clause is free ($1 + z_l$). For example, clause $x_1 \lor x_2 \lor \bar{x}_3$ corresponds to \[ 1 \cdot 1 \cdot z_3 \cdot (1 + z_4)(1 + z_5)\cdots(1 + z_n)\,.\]

    Now, if any clause is unsatisfied for a truth assignment, the corresponding term is added at least once to Eq.~(\ref{eq:product}), and thus it is no longer negative. However, the term of a truth assignment satisfying all clauses remains negative after taking the sum over all clauses. If we were able to find it in polynomial time, we would have solved $k$-CNF-SAT efficiently.
\end{proof}

\begin{figure}
	\small
	\centering
	\begin{tikzpicture}%[yscale=.75]  %savespace
	    \node[shape=circle,draw,inner sep=2] (P1) at (0,0) {$+$};
	    \node[shape=circle,draw,inner sep=2] (T1) at (-2,-1.5) {$+$};
	    \node[shape=rectangle,draw,inner sep=2] (C1) at (-3,-3) {$C_1$};
	    \node[shape=rectangle,draw,inner sep=2] (C2) at (-2,-3) {$C_2$};
	    \node[shape=rectangle,draw,inner sep=2] (C3) at (-1,-3) {$C_3$};
	    \node[] (T2) at (2,-1.5) {$(1+z_1)(1+z_2)\cdots(1+z_n)$};
	    \path [->] (T1) edge node[left,pos=0.6] {$1\ $} (P1);
	    \path [->] (T2) edge node[right,pos=0.6] {$-1$} (P1);
	    \path [->] (C1) edge node[left,pos=0.48] {$1$} (T1);
	    \path [->] (C2) edge node[left,pos=0.5] {$1$} (T1);
	    \path [->] (C3) edge node[left,pos=0.48] {$1$} (T1);
	    \draw [decorate,decoration={brace,amplitude=5pt}]
  (-3.5,-3.5) -- (1.5,-3.5) node[midway]{};
  		\node[] (C3O) at (-1, -4) {$C_3 = x_1 \lor \overline{x_4} \lor x_n$};
  		\node[shape=circle,draw,inner sep=2] (CT) at (-1, -6) {$\times$};
  		\node[shape=circle,draw,inner sep=2] (CV) at (-2, -7.5) {$\times$};
  		\node[] (CO) at (1.5, -7.5) {$(1 + z_2)(1 + z_3)(1 + z_5) \dots (1 + z_{n - 1})$};
  		\node[] (V1) at (-3,-9) {$1$};
	    \node[] (V2) at (-2,-9) {$z_4$};
	    \node[] (V3) at (-1,-9) {$1$};
  		\node[] (A1) at (-3,-9.5) {$(x_1)$};
	    \node[] (A2) at (-2,-9.5) {$(\overline{x_4})$};
	    \node[] (A3) at (-1,-9.5) {$(x_n)$};
	    \path [->] (V1) edge node[left,pos=0.48] {} (CV);
	    \path [->] (V2) edge node[left,pos=0.5] {} (CV);
	    \path [->] (V3) edge node[left,pos=0.48] {} (CV);
	    \path [->] (CV) edge node[left,pos=0.5] {} (CT);
	    \path [->] (CO) edge node[left,pos=0.48] {} (CT);
	    \draw[-implies,double equal sign distance] (-1, -4.5) -- (-1,-5.5);
	    \node[] (MAP) at (0,-5) {$\left\lbrace\begin{aligned}&x_i \mapsto 1,\\&\overline{x_i} \mapsto z_i\end{aligned}\right.$};
    \end{tikzpicture}
    \caption{An example of our reduction from a $k$-CNF-SAT instance $C_1 \land C_2 \land C_3$ to recognizing whether a given PGC is infeasible.}
    \label{fig:reduction}
\end{figure}


\begin{remark}\label{rem:coNP}
Equivalently, recognizing whether a given PGC is feasible is a co-NP-complete problem.
\end{remark}

Note that we actually showed a slightly stronger result: the constructed circuit is decomposable, so recognizing infeasibility remains NP-complete even when restricted to decomposable PGCs.

The hardness result in itself would leave open the possibility of finding a (worst-case) moderately exponential algorithm running, e.g., in time $O(1.23^n m)$. However, the reduction from CNF-SAT is sufficiently tight to render that difficult. Indeed, such an algorithm would give an equally fast algorithm for CNF-SAT, thus refuting the conjectured \emph{strong exponential time hypothesis} \citep{Impagliazzo01}:
%
\begin{corollary}\label{cor:seth}
	Assuming the strong exponential time hypothesis holds, no algorithm can recognize feasible PGCs in time $O(c^n)$ for any $c < 2$.
\end{corollary}

\subsection{Learning Compound Circuits}


We next sketch an approach to learn \emph{compound PGCs} that are composed of moderate-size subcircuits. 
The key observation is that in such models, the check for feasibility can be done separately for each subcircuit. 

Suppose we are given a PGC that is composed of some PGCs with generating polynomials $g_1, \ldots, g_s$, recursively using the sum, product, and hierarchical composition. We view each $g_j$ as a function of its weight parameters, which we denote by $\theta_j$. We refer to $\theta = (\theta_1, \ldots, \theta_s)$ as a \emph{solution}. Recall that \emph{if} every $g_j$ is a PGP, then also the compound PGC encodes a PGP. 

Consider the following template algorithm, which can be instantiated either to local optimization or to approximate posterior sampling: 
%
\begin{enumerate}
\item \emph{Initialize.} Let $\theta$ be any feasible solution; e.g., assign all weights a nonnegative value from the range $[0, 1]$.
\item \emph{Move.} Let $\theta'$ be a candidate solution in a local neighborhood of $\theta$; e.g., take a small step in the direction of the gradient of the likelihood.
\item \emph{Accept or reject.} If, under the candidate $\theta'$, every $g_j$ is a PGP, update $\theta$ to $\theta'$; otherwise keep $\theta$. 
\item \emph{Iterate.} Go to step~2.
\end{enumerate}
%
In the next paragraphs we discuss each step in detail. 

Initialization with nonnegative weights guarantees that the PGC encodes a PGP (up to a normalizing constant). Ideally, there would be no need to select a ``good'' initial solution, as the iterations would quickly move the solution to regions of high likelihood or posterior. 
An implicit assumption here is that such good regions can be---at least in theory---reached also from poor initial solutions. 
Characterizing the connectedness of the feasible region is a question for future research.

Efficient moves can be critical for the performance of the algorithm. A simple implementation would generate a candidate solution in a random walk manner, perturbing the values in the current solution only in one or a few dimensions. While this would be extremely fast, it would also miss the opportunity to generate better candidates by computationally more demanding algorithms. Namely, the accept--reject step that follows is expected to dominate the computational complexity. It would be beneficial to generate candidates that are feasible with good probability. 
For gradient-based moves it would be useful if the inference algorithms supported automatic differentiation; it is not immediate what the status of our present algorithms is in this respect.

Deciding whether to accept or reject the proposed candidate solution is the most (or only) nontrivial step of this algorithm. Here we assume that the feasibility of the candidate was not ensured in the move step, but was left to be checked for in this step. Clearly, feasibility needs to be checked only for those subcircuits whose weights were changed; the number of affected subcircuits can be small or large depending on the type of the move. The main challenge is that deciding whether a subcircuit encodes a PGP is a hard problem: by Corollary~\ref{cor:seth}, in the worst case, the running time of our algorithm would scale as $2^s$ for a subcircuit on $s$ variables. 
On the positive side, the complexity is practical when $s$ is small, say $s < 20$. In practice, it may be harder to ensure that a candidate is feasible (no polynomial-size certificate) than to verify that the candidate is infeasible---finding fast, practical algorithms for these tasks is an intriguing research question. 
It is worth noting that feasibility checks need to be faster than likelihood computations, which also can be relatively expensive if there is a large number of data points. 

In the present form, the algorithm template does not specify any stopping rule. One can apply any criteria commonly used in local optimization and Markov chain simulation (to approximate sampling from a posterior). However, since each iteration is expected to be computationally expensive, the number of iterations may have to be kept relatively small in practice. This setting favors parallel schemes, such as annealed importance sampling \citep{Neal01}, supposing multiple processors are available.  



\section{Concluding Remarks}

Probabilistic generating circuits (PGCs) are a recently proposed model class that subsumes probabilistic circuits (PCs) and determinantal point processes (DPPs) \citep{Zhang21}. In this paper, we took a closer look at the computational complexity of inference and learning in PGCs. 

We observed that in a PGC on $n$ random variables, any marginal probability can be computed by $n+1$ point evaluations of the circuit, followed by a single, relatively inexpensive polynomial interpolation. Compared to a previous algorithm, the saving in the asymptotic complexity is by a factor logarithmic in $n$. In practice, for $n$ in the hundreds or thousands, we may expect an orders-of-magnitude speedup given the simplicity of scalar arithmetic, as compared to multiplication of polynomials. 

In determinantal PGCs, our inference algorithm would circumvent the requirement to represent determinants as division-free circuits, an idea suggested in a previous work. However, we gave an even faster algorithm that avoids multiple point evaluations and interpolation altogether. We showed that, in essence, it suffices to evaluate a single determinant of a matrix with real-valued entries.

Using similar ideas, we also gave a linear-time inference algorithm in what we call decomposable PGCs. This removes a factor of $n$ from the time complexity of our evaluation--interpolation algorithm. Arguably, decomposable PGCs are a significant subclass of PGCs: they are strictly more expressive than PCs and DPPs \citep[implied by the proof of Thm~2]{Zhang21}, and the class is closed under the three composition operations. 

As to the learning of PGCs from data, we pointed out a notable obstacle: it is NP-hard to recognize whether even a decomposable PGC encodes a probability distribution. On the other hand, we sketched an approach to learn PGCs composed of moderate-size subcircuits. Our inspection suggests that standard methods of (stochastic) local search may be practical, in particular if typical instances of the NP-hard recognition problem can be solved fast in practice.  

\begin{contributions} % will be removed in pdf for initial submission,
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    All authors collaborated on writing the paper. The faster inference algorithms for decomposable, determinantal, and general PGCs were devised by J.\ Harviainen, V.\ Peruvemba Ramaswamy, and M.\ Koivisto, respectively. J.\ Harviainen gave the NP-completeness proof, and the learning algorithm was outlined by M.\ Koivisto.
\end{contributions}

\begin{acknowledgements}
    Research supported by grants from the Academy of Finland (projects 316771 and 351156) and the Austrian Science Fund (project W1255).
\end{acknowledgements}

\bibliography{harviainen_353}

\end{document}
