% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} 
% after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{tablefootnote}
\usepackage{subfig}
\usepackage{caption}
\usepackage{transparent}
% \usepackage{subcaption}
\usepackage{sidecap}
\sidecaptionvpos{figure}{c}
\usepackage{pgfplots,wrapfig}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{shapes.geometric}
\usetikzlibrary {arrows.meta}
\usepackage{bbm}
\usepackage{derivative}
\usepackage{upgreek}
\usepackage{amssymb,amsmath, amsthm}
\usepackage{enumitem}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newtheorem{definition}{Definition} 
\newtheorem{property}{P.}  
\newtheorem{query}{Q.} 
\newtheorem*{remark}{Remark} 
\newtheorem{lemma}{Lemma} 
\newtheorem{theorem}{Theorem}
\newtheorem*{corollary}{Corollary}  

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\hypersetup{hidelinks}
% \title{Unified Deep Probabilistic Models:\\ On the Integration of Probabilistic Circuits with Normalizing Flows}
\title{Probabilistic Flow Circuits:\\ Towards Unified Deep Models for Tractable Probabilistic Inference}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:sahil.sidheekh@utdallas.edu?Subject=Your UAI 2023 paper}{Sahil Sidheekh}{}}
\author[2,3]{Kristian Kersting}
\author[1]{Sriraam Natarajan}

% Add affiliations after the authors
\affil[1]{%
    Erik Jonsson School of Engineering \& Computer Science\\
    The University of Texas at Dallas
}
\affil[2]{%
    Department of Computer Science \\
    TU Darmstadt
}
\affil[3]{
Centre for Cognitive Science, TU Darmstadt, and Hessian Center for AI 
}

\begin{document}
% \onecolumn
\maketitle

\begin{abstract}
 We consider the problem of increasing the expressivity of probabilistic circuits by augmenting them with the successful generative models of normalizing flows. To this effect, we theoretically establish the requirement of decomposability for such combinations to retain tractability of the learned models. Our model, called Probabilistic Flow Circuits, essentially extends circuits by allowing for normalizing flows at the leaves. Our empirical evaluation clearly establishes the expressivity and tractability of this new class of probabilistic circuits. %proposed approach.
\end{abstract}

\section{Introduction}
%Probability theory has been well-accepted as one of the most principled, if not only, way of dealing with the uncertainties involved with the events we observe. 
Building flexible models for representing probability distributions has been a long-standing goal in AI. 
Driven by the need for reliable decision making under uncertainty, the early efforts in this direction were aimed towards building efficient models~\citep{pgm} and algorithms~\citep{chowliu-tree,welch1995introductionkalman} that enabled tractable probabilistic inference. %Utilizing the notions of factors and mixtures,  
Several tractable generative models that describe probability distributions in the form of graphs were recently developed~\citep{poon2011spn,ProbCirc20,rahman2014cutset,darwiche2003differential-ac}.
% ,trapp2020deepgaussianprocess,yu2021uai_momogps}.

In particular, \cite{ProbCirc20} introduced \textit{Probabilistic Circuits} (PCs) as a unified notion encompassing such models that describe distributions in the form of computational graphs such as cutset networks \citep{rahman2014cutset}, arithmetic circuits \citep{darwiche2003differential-ac} and sum-product networks~\citep{poon2011spn}.  Several subsequent works have 
shown how to learn (a subset of) PCs 
\citep{gens2013learnspn, peharz2016latentvariablespn, bayesian-learning-spn} and have increased their flexibility \citep{molina2018mixed, molina2017poisson}, reliability \citep{credal-spn} as well as scalability \citep{peharz20a-rat-spn, peharz_20_einsum, dang2022sparse, liu2023scaling}.

While tractable, these PCs are still less expressive than deep generative neural models (DGMs) such as GANs~\citep{GANs}, VAEs~\citep{VAE} and Normalizing Flows (NFs)~\citep{tabak2013flow1, papamakarios2021normalizingFlowreview}. Among these, NFs have recently gained attention due to their use of invertible functions that enable maximum likelihood training, resulting in more stable models that avoid the mode collapse and vanishing gradient problems of other deep generative models. While powerful, these deep models are not efficient at performing the inference tasks that tractable models such as PCs are quite adept at.

%So it is indeed natural that one can envision the combination of PCs with DGMs.
So it is not surprising that combinations of PCs with DGMs have been explored.
 \cite{tan2019hierarchical-spn-vae} explored the integration of VAEs with PCs, while \cite{trapp2020deepgaussianprocess,yu2021uai_momogps} explored the integration of 
 Gaussian processes with PCs. Along similar lines, \citet{correia2023continuous} recently proposed building continuous mixtures of tractable probabilistic models to improve their representational flexibility. However, the added expressivity of PCs due to these models came at the cost of their tractability. Integrating new transformations within PCs was first explored by \citet{sharir2018sum-product-quotient}, who proposed the addition of \textit{quotient} nodes to improve expressivity, while retaining tractability. Most recently, \citet{sptn} proposed the addition of invertible \textit{transform nodes}, representing normalizing flows. They augmented PCs by placing invertible affine transformations arbitrarily within the circuit, and demonstrated the resulting gain in modeling flexibility. However, as we prove, %show theoretically, 
this understanding of integrating flows with PCs is {\bf incomplete, and does not guarantee tractability}. 

To establish tractability, we first define the notion of \textbf{$\uptau$-decomposability} that specifically considers the decomposability of transform nodes.  Building upon this definition, we then propose a method for integrating PCs with NFs by defining a new class of circuits, called \emph{Probabilistic Flow Circuits} (PFCs). These models follow a similar structure to that of a PC by employing sum and product nodes in the circuit while using NFs at the leaves. The advantage is that since NFs allow for effectively modeling arbitrarily complex distributions, the resulting model enhances the modeling capabilities of a PC. As an example, consider the data shown in Figure~\ref{fig:motivation-}. A simple PC that learns a mixture of four distributions cannot model the depicted data distribution faithfully.  However, our method that uses the same structure can better model the data as it learns multimodal distributions at the leaves. An additional advantage is that the tractability of a PC is retained as the NFs are defined at the leaves rather than in the inner nodes. %We show that the \textbf{$\uptau$-Decomposability} is preserved 
%Thus these probabilistic flow circuits possess the twin advantages of tractability and flexibility.


\begin{figure}[t!]
    \centering
    \begin{tikzpicture}
    %  --- center 
    \node[circle,draw=white!50,fill=white!5,inner sep=0pt,minimum size=0.75cm, line width=1.0pt] (base) at (0,-0.7) {};
    % \node[circle,draw=gray!50,fill=white!5,inner sep=0pt,minimum size=0.6cm, line width=1.0pt] (base) at (0,-0.7) {\small{$N$}};
    
    \node[draw=red!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (x1) at (0,0) {\small{$L^1_{x}$}};
    \node[draw=blue!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (y1) at (1,0) {\small{$L^1_{y}$}};
    
    \node[draw=red!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (x2) at (3.,0) {\small{$L^2_{x}$}};
    \node[draw=blue!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (y2) at (2.,0) {\small{$L^2_{y}$}};
    
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p1) at (0,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p2) at (1,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p3) at (2,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p4) at (3,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (s) at (1.5,2.3) {$+$};

    % \node[circle,draw=gray!50,fill=white!5,inner sep=0pt,minimum size=0.75cm, line width=1.0pt] (ny2) at (0.,3.7) {\small{$N$}};
    
    \path[draw=black,-] (p1) edge (x1);
    \path[draw=black,-] (p1) edge (y1);
    \path[draw=black,-] (p2) edge (x1);
    \path[draw=black,-] (p2) edge (y2);
    \path[draw=black,-] (p3) edge (x2);
    \path[draw=black,-] (p3) edge (y1);
    \path[draw=black,-] (p4) edge (x2);
    \path[draw=black,-] (p4) edge (y2);
    \path[draw=black,-] (s) edge (p1);
    \path[draw=black,-] (s) edge (p2);
    \path[draw=black,-] (s) edge (p3);
    \path[draw=black,-] (s) edge (p4);
    \end{tikzpicture} \ \ 
    \begin{tikzpicture}
    %  --- center 
    \node[ draw=red!90,fill=white!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (x1) at (0,0) {\small{$L^1_{x}$}};
    \node[draw=blue!90,fill=white!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (y1) at (1,0) {\small{$L^1_{y}$}};
    \node[ draw=red!90,fill=white!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (x2) at (3.,0) {\small{$L^2_{x}$}};
    \node[ draw=blue!90,fill=white!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (y2) at (2.,0) {\small{$L^2_{y}$}};
    
    \node[circle,draw=black!80,fill=gray!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (nx1) at (0,-1.6) {{\transparent{0.3}\includegraphics[width=.025\textwidth]{figures/intoriduction/gaussian-pdf.png}}};
    \node[circle,draw=black!80,fill=gray!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (ny1) at (1,-1.6) {{\transparent{0.3}\includegraphics[width=.025\textwidth]{figures/intoriduction/gaussian-pdf.png}}};
    \node[circle,draw=black!80,fill=gray!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (nx2) at (3.,-1.6) {{\transparent{0.3}\includegraphics[width=.025\textwidth]{figures/intoriduction/gaussian-pdf.png}}};
    \node[circle,draw=black!80,fill=gray!5,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (ny2) at (2.,-1.6) {{\transparent{0.3}\includegraphics[width=.025\textwidth]{figures/intoriduction/gaussian-pdf.png}}};
    
    % \draw[boxed] plot[domain=-0.5:0.5] ({\x},{exp(-2*\x*\x)});
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p1) at (0,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p2) at (1,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p3) at (2,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (p4) at (3,1.2) {$\times$};
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.7cm, line width=1.0pt] (s) at (1.5,2.3) {$+$};
    \path[draw=black,-] (p1) edge (x1);
    \path[draw=black,-] (p1) edge (y1);
    \path[draw=black,-] (p2) edge (x1);
    \path[draw=black,-] (p2) edge (y2);
    \path[draw=black,-] (p3) edge (x2);
    \path[draw=black,-] (p3) edge (y1);
    \path[draw=black,-] (p4) edge (x2);
    \path[draw=black,-] (p4) edge (y2);
    \path[draw=black,-] (s) edge (p1);
    \path[draw=black,-] (s) edge (p2);
    \path[draw=black,-] (s) edge (p3);
    \path[draw=black,-] (s) edge (p4);
    \path[draw=red!50,-] (x1) edge (nx1);
    \path[draw=red!50,-] (x2) edge (nx2);
    \path[draw=blue!50,-] (y1) edge (ny1);
    \path[draw=blue!50,-] (y2) edge (ny2);
    \node[rectangle,draw=white!90,fill=gray!10,opacity=0.6,minimum width=3.5cm,minimum height=0.1cm] at (1.5,-0.8) {\small{\textit{normalizing flow}}};
    \end{tikzpicture}  

     \begin{tikzpicture}
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.cm,minimum height=0.1cm] at (-0.5,0) {\small{Real Data}};
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.cm,minimum height=0.1cm] at (2.2,0) {\small{PC}};
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.cm,minimum height=0.1cm] at (5,0) {\small{PFC}};
     \end{tikzpicture}
     
     \includegraphics[width=0.325\linewidth]{figures/intoriduction/latest/joint_grid_CIRCLE.png}
        \includegraphics[width=0.325\linewidth]{figures/intoriduction/joint_grid_EinsumNet_CIRCLE.png}
        \includegraphics[width=0.325\linewidth]{figures/intoriduction/latest/joint_grid_LinearSplineEinsumFlow_CIRCLE.png}
    
    \caption{ Modeling a 2D data distribution (bottom left) using a PC (top left) and a PFC (top right). The red and blue marginal plots show the distribution captured by the corresponding leaves ($L$). The PFC, with its multimodal leaf densities, is able to better model the data than the PC.
    % Integrating normalizing flows helps improve the flexibility of probabilistic circuits, by modeling complex multi-modal distributions at the leaves.
    }
    \label{fig:motivation-}
\end{figure}

% Overall, We make the following key contributions: (1) We present the formalism for combining PCs and NFs to develop probabilistic flow circuits that retain their respective advantages of tractability and flexibility. (2) We %prove theoretically that the %current combination that uses  current use of 
% show that the existing combination that uses invertible affine transformations breaks the decomposability property,
% % We also prove
% and that \textbf{$\uptau$-decomposability} is a necessary condition for retaining the tractability of PCs. (3) We present a recommendation for a family of invertible functions by employing linear rational splines at the leaves of PCs.  (4) We demonstrate the efficacy and efficiency of the flow circuits empirically in density estimation and sample generation tasks using high-dimensional data. 
Overall, we make the following key contributions: 
\begin{itemize}
    \item  We present the formalism for combining PCs and NFs to develop probabilistic flow circuits that retain their respective advantages of tractability and flexibility.
    \item  We show that the existing combination that uses invertible affine transformations breaks the decomposability property, and that \textbf{$\uptau$-decomposability} is a necessary condition for retaining the tractability of PCs.
    \item We present a recommendation for a family of invertible functions by employing linear rational splines at the leaves of PCs.
    \item We demonstrate the efficacy and efficiency of the flow circuits empirically in density estimation and sample generation tasks using high-dimensional data. 
\end{itemize}

We proceed as follows: we start off by reviewing probabilistic circuits and normalizing flows. Then we introduce $\uptau$-decomposability and the probabilistic flow circuits. Before concluding, we present our empirical evaluation.  


\section{Background}
\textbf{Notation}. We use $X$ to denote a random variable, $x$ to denote its value, bold letters to denote sets of random variables, i.e., $\boldsymbol{X}= \{ X_i \}_{i=1}^{d}$, $\boldsymbol{x}$ to denote an assignment of values to all variables in $\boldsymbol{X}$, $\mathbf{val}(X)$ to denote the set of all values that $X$ can take, and $\mathbf{val}(\boldsymbol{X})$ to denote the Cartesian product $\mathbf{val}(X_1) \times \mathbf{val}(X_2) \times \cdots \times \mathbf{val}(X_d)$.  As normalizing flows are %naturally 
most commonly defined over continuous spaces, we will consider only continuous random variables. With mild overload of notation, we interchangeably refer to $\boldsymbol{X}$ as a $d$-dimensional real random vector and to $\boldsymbol{x} = (x_1, \ldots, x_d) \in \R^d$ as the value it takes. For $A \subseteq \{1, \ldots, d\}$, we denote by $\boldsymbol{X}_{A} = \{ X_i \}_{i \in A} \subseteq \boldsymbol{X}$ the subset of random variables in $\boldsymbol{X}$ indexed by $A$. Analogously, we denote by $\boldsymbol{\Pi}_{A}(\boldsymbol{x}) \in \R^{|A|}$ the projection of $\boldsymbol{x}$ onto $A$, i.e., $\boldsymbol{x}_{A} = \boldsymbol{\Pi}_{A}(\boldsymbol{x}) = (x_i)_{i \in A}$ is the $|A|$-dimensional vector obtained by indexing $\boldsymbol{x}$ along the dimensions contained in $A$.  We use $P(\boldsymbol{X})$ to denote a probability distribution over $\boldsymbol{X}$ and $p(\boldsymbol{x})$ its density.  

%We begin the discussion with a brief introduction to the two generative modeling paradigms that we unite in this work: tractable probabilistic models and normalizing flows.

\paragraph{Tractable Probabilistic Models:} A generative model $\theta$ that approximates a probability distribution $P(\boldsymbol{X})$ is said to be tractable on a given probabilistic inference task $\boldsymbol{Q}$, where %which can be written as some function of the modeled distribution $P_{\theta}$, i.e.,  
$\boldsymbol{Q} = f(P_{\theta}(\boldsymbol{X}))$, if $\boldsymbol{Q}$ can be computed in time polynomial in the size of the model. Tractability is thus a spectrum governed by both the nature of the inference tasks (or $f$) and properties of $\theta$. 

 \textbf{Probabilistic Inference Tasks:}
Utilizing a probabilistic generative model ($P_{\theta}(\boldsymbol{X})$) to make decisions in real life often requires querying the model to infer properties of the underlying distribution or extracting the uncertainty associated with events defined by its random variables. We elaborate on the prevalent types of inference queries below.

 Let $\boldsymbol{X}_e,\boldsymbol{X}_q \subseteq \boldsymbol{X}$ such that $\boldsymbol{X} = \boldsymbol{X}_e \cup \boldsymbol{X}_q$ and $\boldsymbol{X}_e \cap \boldsymbol{X}_q = \emptyset$. First, given a full assignment of values $\boldsymbol{x}$ to all variables in $\boldsymbol{X}$, computing the density $p_{\theta}(\boldsymbol{X}=\boldsymbol{x})$ is called \emph{evidential inference}. 
 Second, we might be interested in computing the marginal distribution over a subset of variables $\boldsymbol{X}_e$, for instance, 
$ p_{\theta}(\boldsymbol{X}_e=\boldsymbol{x}_e)=\int_{\boldsymbol{x}_q \in \mathbf{val}(\boldsymbol{X}_q)} p_{\theta}(\boldsymbol{X}_e=\boldsymbol{x}_e, \boldsymbol{X}_q=\boldsymbol{x}_q)\,\mathrm{d}\boldsymbol{x}_q$, which is known as \emph{marginal inference}.
Third, we might be interested in computing conditional distributions, e.g., \\
$p_{\theta}(\boldsymbol{X}_e=\boldsymbol{x}_e \mid \boldsymbol{X}_q=\boldsymbol{x}_q)$ $= \frac{p_{\theta}(\boldsymbol{X}_e=\boldsymbol{x}_e,\boldsymbol{X}_q=\boldsymbol{x}_q)}{\int_{\boldsymbol{x}'_e \in \mathbf{val}(\boldsymbol{X}_e)} p_{\theta}(\boldsymbol{X}_e=\boldsymbol{x}'_e, \boldsymbol{X}_q=\boldsymbol{x}_q)\,\mathrm{d}\boldsymbol{x}'_e}$,
which is known as \emph{conditional inference}. Finally, given a partial assignment $\boldsymbol{x}_e$ to a set of variables $\boldsymbol{X}_e$, finding the most probable assignment for the remaining variables $\boldsymbol{X}_q$, i.e.,
$\boldsymbol{x}_q = \operatorname*{arg\,max}_{\boldsymbol{x}'_q \in \mathbf{val}(\boldsymbol{X}_q)} \ p_{\theta}(\boldsymbol{X}_q=\boldsymbol{x}'_q \mid \boldsymbol{X}_e=\boldsymbol{x}_e)$, is known as \textit{maximum a posteriori} or \emph{MAP inference}. Note that while we consider only the above queries, more complex tasks such as computing marginal-MAP, moments, etc.\ exist and we refer the reader to \cite{ProbCirc20} for an elaborate review.
% \begin{query}[\textbf{Evidencial}]
% \end{query}
% \begin{query}[\textbf{Marginal}]
% \end{query}
% \begin{query}[\textbf{Conditional}]
% \end{query}
% \begin{query}[\textbf{MAP}]
% \end{query}

\textbf{Probabilistic Circuits (PCs):} PCs are tractable probabilistic models, defined as rooted directed acyclic graphs (DAGs), in which leaf nodes represent univariate probability distributions and non-terminal nodes represent either a mixture (or
states of an observed variable in case of a deterministic
circuit) or an independence relation of their children.
More formally:
\begin{definition}
\label{def:probabilistic-circuit}
 % Let $\boldsymbol{X} \in \R^d$ be a $d-$dimensional random variable. Let us denote by $\boldsymbol{x}$ the value taken by $\boldsymbol{X}$ and $P(\boldsymbol{X})$ the joint distribution over $\boldsymbol{X}$.
 % For $\psi \subset \{1, 2, \dots d \}$ let $\boldsymbol{\Pi}_{\psi}(\boldsymbol{x}) \in \R^{|\psi|}$ denote the projection of the $\boldsymbol{x}$ on $\psi$, i.e., $\boldsymbol{x}_{\psi} = \boldsymbol{\Pi}_{\psi}(\boldsymbol{x})$ is the $|\psi|-$dimensional vector obtained by indexing $\boldsymbol{x}$ along the dimensions contained in $\psi$.  
A \textbf{probabilistic circuit} (PC or simply $\mathcal{C}$) is a computational graph that is composed of three types of nodes: sum nodes $\mathcal{S}$, product nodes $\mathcal{P}$ and leaf nodes $\mathcal{L}$. Each node in the graph computes a non-negative function over a set of variables $\psi \subseteq \{ X_i\}_{i=1}^{d}$, which is defined as its scope. Taken together, $\mathcal{C}$ encodes a probability distribution over $\boldsymbol{X}$, the probability density (or mass) function of which is given by the value of its root node and is defined recursively as follows:
\begin{enumerate}[topsep=0pt,itemsep=0ex,partopsep=1ex,parsep=1pt,leftmargin=5mm,label=\textbf{\arabic*}.]
    \item The value of a sum node ($\mathcal{S}$) is equal to a convex combination of the values of its children, i.e., $\mathcal{S}(\boldsymbol{x}) = \sum_{N_i \in \boldsymbol{ch(S)}} w_i N_i(\boldsymbol{x})$, where $0\leq w_i\leq 1$ and $\sum_{i}w_i = 1$.
    
    \item The value of a product node ($\mathcal{P}$) is equal to the product of the values of its children, i.e., $\mathcal{P}(\boldsymbol{x}) = \prod_{N_i \in \boldsymbol{ch(P)}}  N_i(\boldsymbol{x_{\psi_{N_i}}}) $
    % = \prod_{N_i \in \boldsymbol{ch(P)}}  N_i(\boldsymbol{\Pi}_{_{\psi_{N_i}}}(\boldsymbol{x}))$
    
    \item A leaf node $\mathcal{L}$ represents a simple probability distribution over its scope with an analytically computable leaf density, such as a Gaussian. For a given $\boldsymbol{x}$, the value of the leaf node equals the probability density (or mass if discrete) of $\boldsymbol{x}$ w.r.t.\ the leaf distribution.
\end{enumerate}
 Here, $\mathcal{N}$ represents an arbitrary node, $\boldsymbol{ch(N)}$ refers to the children of $\mathcal{N}$ in the graph and $\boldsymbol{\psi_N}$ refers to the scope of $\mathcal{N}$.
\end{definition}
To ensure that $\mathcal{C}$ encodes a valid distribution and enables tractability for inference tasks, there are certain structural properties that it often needs to satisfy:

\begin{property}[\textbf{Smoothness}]
$\mathcal{C}$ is smooth if its sum nodes are defined over children having the same scope, i.e. $\forall \mathcal{S} \in \mathcal{C}, \psi(\boldsymbol{N_i}) = \psi(\boldsymbol{N_j}) \forall \boldsymbol{N_i},\boldsymbol{N_j} \in \boldsymbol{ch(S)}$.
\end{property}
\begin{property}[\textbf{Decomposability}]
$\mathcal{C}$ is decomposable if its product nodes are defined over children having disjoint scopes, i.e., $\forall \mathcal{P} \in \mathcal{C}, \ \text{and} \ \forall \boldsymbol{N_i},\boldsymbol{N_j} \in \boldsymbol{ch(P)}, \  \psi(\boldsymbol{N_i}) \cap \psi(\boldsymbol{N_j}) = \emptyset$.
\end{property}
Smoothness and decomposability are necessary and sufficient conditions for $\mathcal{C}$ to enable tractability for marginal and conditional inference \citep{vergari2021acompositional}. Circuits that support MAP inference are additionally required to satisfy determinism. %, a property called determinism.
\begin{property}[\textbf{Determinism}]
$\mathcal{C}$ is deterministic if, for every sum node, the value of only one of its children is nonzero for a given fully observed input $\boldsymbol{x}$, i.e., $\forall \mathcal{S} \in \mathcal{C}, \sum_{\boldsymbol{N_i} \in \boldsymbol{ch(S)}} \mathbbm{1}_{\boldsymbol{N_i}(\boldsymbol{x})>0} = 1$.
\end{property}

\paragraph{Normalizing Flows (NFs):} NFs constitute a class of deep-generative models that build flexible probability distributions over continuous spaces by utilizing the change of variables formula and invertible transformations.
Specifically,
% if $\boldsymbol{B}_{\boldsymbol{u}}$ denotes a simple base distribution over $\boldsymbol{u} \in \R^{d}$, such as a Gaussian, a normalizing flow defines a bijective transformation $f: \R^{d} \rightarrow \R^{d}$  
% by pushing a simple base distribution ($\mathcal{B}$) through a series of invertible transformations $T$. 
a normalizing flow transforms a complex distribution $p_{\boldsymbol{X}}$ defined over $\boldsymbol{x} \in \R^{d}$ into a simple base distribution $p_{\boldsymbol{Z}}$, through a bijective transformation $g$, such that $\boldsymbol{z} = g(\boldsymbol{x})$. Further, both $g$ and $g^{-1}$ must be continuous and differentiable, i.e., $g$ must be a diffeomorphism. The probability density in the $\boldsymbol{X}$ space is given by the change of variables formula as:
% p_{\boldsymbol{X}}(\boldsymbol{x}) = p_{\boldsymbol{z}}(\boldsymbol{z})|\det\left( \dfrac{\partial f}{\partial \mathbf{z}}\right)|^{-1}
\begin{equation} 
   p_{\boldsymbol{X}}(\boldsymbol{x}) = p_{\boldsymbol{Z}}(\boldsymbol{z})|\det J_g|
\end{equation}

Here, $p_{\boldsymbol{Z}}(\boldsymbol{z})$ is assumed to be a simple distribution such as a Gaussian and $|\det J_g|$ denotes the absolute value of the determinant of the Jacobian of the transformation $g$ evaluated at $\boldsymbol{x}$. Sampling from the distribution $p_{\boldsymbol{X}}$ can be achieved by sampling $\boldsymbol{z}^* \sim p_{\boldsymbol{Z}}$ and applying the transformation $g^{-1}$, i.e., $\boldsymbol{x}^* = g^{-1}(\boldsymbol{z}^*)$. Since diffeomorphisms are closed under composition, we can ``flow'' the base distribution through multiple compositions of invertible transformations to attain higher representational flexibility. Building efficient (neural) parameterizations for the invertible transformations ($g$) with efficiently computable Jacobian determinants is the key research question in the field of normalizing flows. 


\section{Integration of PC\lowercase{s} and Flows}

%\subsubsection*{A Formalism for Integrating Probabilistic Circuits with Normalizing Flows}
%\subsection{Proposed Framework}
%Normalizing flows have attracted much research interest in recent years, with increasingly efficient and deep parameterizations making them almost as expressive as other deep generative models. 
A key factor that distinguishes NFs from other deep models is that they support exact density evaluation, i.e., NFs are tractable models for evidential inference tasks. On the other hand, PCs are less expressive than NFs but tractable for more inference tasks under appropriate structural constraints. %Thus, it is natural to ask whether we can utilize the key concepts from each of these generative models to create a new class of more expressive and tractable models. 

One way to combine NFs and PCs would be to integrate the invertible transformations of flows within PCs, which was first explored by \citet{sptn}. They called the resulting model a sum-product-transform network (SPTN).
\begin{definition}
\label{def:sptn}
%Consider a probabilistic circuit $\mathcal{C}$ as defined by definition \ref{def:probabilistic-circuit}, comprising of sum nodes, product nodes, and leaf nodes. A sum-product-transform 
An SPTN is a PC obtained by augmenting $\mathcal{C}$ with an additional node type, called a transform node. Each transform node ($\mathcal{T}$) has a single child $\mathcal{N}$ and can be introduced arbitrarily over any node in $\mathcal{C}$, and $\psi_{\mathcal{T}}=\psi_{\mathcal{N}}$. The value of a transform node is defined as $\mathcal{T}(\mathcal{N}(\boldsymbol{x})) = \mathcal{N}(g(\boldsymbol{x}))|\det J_g|$, where $g$ denotes a diffeomorphic transformation associated with $\mathcal{T}$ and $|\det J_g|$ denotes the absolute value of the determinant of the Jacobian of $g$. 
\end{definition}
 \cite{sptn} proposed to model the transformations $g$ associated with $\mathcal{T}$ as invertible affine transformations. To enable tractability, they considered PCs that are smooth and decomposable, with Gaussian distributions at the leaves, and argued that the addition of transform nodes maintained the properties. They reasoned that Gaussian distributions are closed under product operations and affine transformations, i.e., the product of two Gaussians is a Gaussian, an affine transformed Gaussian is a Gaussian, and hence in the induced tree view \citep{ProbCirc20} of a PC, the addition of transform nodes does not introduce any change. 
 
 Though it appears intuitive, we argue that this construction is {\bf incomplete} and can result in violating the structural properties of the PC. We formalize this issue first before discussing the solution to overcome it.

\begin{lemma}
\label{lemma:spn-non-decomposable}
An SPTN with invertible affine transformations, as constructed by Def.~\ref{def:sptn}, can violate the decomposability property of a PC and is thus not guaranteed to be a tractable model for marginal and conditional inference.
\end{lemma}
\begin{center}
    \resizebox {0.5\linewidth} {0.47\linewidth}{
    \begin{tikzpicture}
    %  --- LEFT 
    \node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti) at (3.2,2.0) {\small{$\mathcal{T}^{i}$}};
    
    \node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Si) at (3.2,0.9) {\small{$\mathcal{S}^{i}$}};
    
    \node[circle,draw=black,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Pi1) at (1.5,-0.2) {\small{$\mathcal{P}^{i}_{1}$}};
    \node[circle,draw=black,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Pi2) at (3,-0.2) {\small{$\mathcal{P}^{i}_{2}$}};
    \node[circle,draw=white,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Pi3) at (4,-0.2) {\small{$\boldsymbol{\ldots}$}};
    \node[circle,draw=black,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Pik) at (5,-0.2) {\small{$\mathcal{P}^{i}_{K}$}};
    
    % \node[circle,draw=grey,fill=white,inner sep=0pt,minimum size=0.8cm] (Ni13) at (1.2,-1.6) {\small{$\ldots$}};
    \node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm] (Ni1r) at (0.8,-1.3) {\small{$\mathcal{N}^{iR}_{1}$}};
    \node[circle,draw=gray,fill=gray!10,inner sep=0pt,minimum size=0.8cm] (Ni12) at (1.1,-1.8) {\small{$\ldots$}};
    \node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm] (Ni11) at (1.5,-2.2) {\small{$\mathcal{N}^{i1}_{1}$}};
    
    % \node[circle,draw=black,fill=grey!20,inner sep=0pt,minimum size=0.8cm] (Ni21) at (3,-2.2) {\small{$\mathcal{N}^{i}_{2}$}};
    \node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm] (Ni2r) at (2.3,-1.3) {\small{$\mathcal{N}^{iR}_{2}$}};
    \node[circle,draw=gray,fill=gray!10,inner sep=0pt,minimum size=0.8cm] (Ni22) at (2.6,-1.8) {\small{$\ldots$}};
    \node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm] (Ni21) at (3,-2.2) {\small{$\mathcal{N}^{i1}_{2}$}};
    
    \node[circle,draw=white,fill=white!20,inner sep=0pt,minimum size=0.8cm] (Ni31) at (4,-2.2) {\small{$\boldsymbol{\ldots}$}};
    
    % \node[circle,draw=black,fill=grey!20,inner sep=0pt,minimum size=0.8cm] (Nik1) at (5,-2.2) {\small{$\mathcal{N}^{i}_{K}$}};
    \node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm] (Nikr) at (4.8,-1.3) {\small{$\mathcal{N}^{iR}_{K}$}};
    \node[circle,draw=gray,fill=gray!10,inner sep=0pt,minimum size=0.8cm] (Nik2) at (5.1,-1.8) {\small{$\ldots$}};
    \node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm] (Nik1) at (5.5,-2.2) {\small{$\mathcal{N}^{i1}_{K}$}};
    
    \path[draw=black,-] (Ti) edge (Si);
    \path[draw=black,-] (Si) edge (Pi1);
    \path[draw=black,-] (Si) edge (Pi2);
    % \path[draw=grey,->] (Si) edge [dotted] (Pi3);
    \path[draw=black,-] (Si) edge (Pik);
    
    \path[draw=black,-] (Pi1) edge (Ni11);
    \path[draw=black,-] (Pi2) edge (Ni21);
    \path[draw=black,-] (Pik) edge (Nik1);
    
    \path[draw=black,-] (Pi1) edge (Ni12);
    \path[draw=black,-] (Pi1) edge (Ni1r);
    
    \path[draw=black,-] (Pi2) edge (Ni22);
    \path[draw=black,-] (Pi2) edge (Ni2r);
    
    \path[draw=black,-] (Pik) edge (Nik2);
    \path[draw=black,-] (Pik) edge (Nikr);
    \end{tikzpicture}
    }
\end{center}

\begin{proof}
Consider an arbitrary section of an SPTN comprising a sum node $\mathcal{S}^i$, product nodes $\{\mathcal{P}^i_j\}_{j=1}^K$, and a transform node $\mathcal{T}^i$, as shown in the figure above.

 We will show that the transform node causes the scopes of the children of each $\mathcal{P}^i_j$ to overlap, thus violating the decomposability property. Let $g^i$ denote the invertible affine transformation associated with $\mathcal{T}^i$. Let $\Omega$ be the scope of $\mathcal{T}^i$, i.e., $\psi_{\mathcal{T}^i} = \Omega = \psi_{\mathcal{S}^i} = \psi_{\mathcal{P}^{i}_{j}} \ \forall j \in \{1, \ldots, K\}$, and let $\boldsymbol{x} = (x_1, x_2, \ldots, x_{|\Omega|}) \in \R^{|\Omega|}$. For $\psi \subset \Omega$, let $\boldsymbol{\Pi}_{\psi}(\boldsymbol{x}) \in \R^{|\psi|}$ denote the projection of $\boldsymbol{x}$ onto $\psi$, i.e., $\boldsymbol{x}_{\psi} = \boldsymbol{\Pi}_{\psi}(\boldsymbol{x}) = (x_m)_{m \in \psi}$ is the $|\psi|$-dimensional vector obtained by indexing $\boldsymbol{x}$ along the dimensions contained in $\psi$. We have,
\begin{align*}
    \mathcal{T}^i&(\mathcal{S}^i (\boldsymbol{x})) = \mathcal{S}^i(g^i(\boldsymbol{x}))|\det J_{g^i}| \\
    &=  \sum_{j=1}^{K} w_j \mathcal{P}^i_j(g^i(\boldsymbol{x}))|\det J_{g^i}| \\
    &=  \sum_{j=1}^{K} w_j \left[ 
     \prod_{k=1}^{R} \mathcal{N}^{ik}_j(\boldsymbol{\Pi}_{\psi_{\mathcal{N}^{ik}_j}}(g^i(\boldsymbol{x}))) \right] |\det J_{g^i}|
\end{align*}
Let $\boldsymbol{z}^{ik}_{j}$ denote the input to $\mathcal{N}^{ik}_j$. Thus, for $g^i(\boldsymbol{x}) = \boldsymbol{W^i}(\boldsymbol{x}) + \boldsymbol{b^i}$, we have
\begin{align*}
\boldsymbol{z}^{ik}_{j} &= \boldsymbol{\Pi}_{\psi_{\mathcal{N}^{ik}_j}}(g^i(\boldsymbol{x})) = \boldsymbol{\Pi}_{\psi_{\mathcal{N}^{ik}_j}}(\boldsymbol{W^i}(\boldsymbol{x}) + \boldsymbol{b^i}) \\
&= \Bigl( \sum_{l=1}^{|\Omega|}\boldsymbol{W}^i_{ml}x_l + b^i_m \Bigr)_{m \in \psi_{\mathcal{N}^{ik}_j}}
\end{align*}
where we use the notation $ \boldsymbol{z} = (z_m)_{m=1}^{n}$ for an $n$-dimensional vector $\boldsymbol{z} \in \R^{n}$. Clearly, each child $\mathcal{N}^{ik}_{j}$, $k \in \{1, \ldots, R\}$, of $\mathcal{P}^i_j$ computes a function over the whole of $\boldsymbol{x} \in \R^{|\Omega|}$, and thus the children have overlapping scopes. Hence, $\mathcal{P}^i_j$ is not decomposable, and consequently the SPTN is not decomposable.
\end{proof}
Note that while we showed that SPTNs with affine transformations violate decomposability, this is true for any invertible transformation $\boldsymbol{y} = g(\boldsymbol{x})$ that computes each dimension $y_i$ of $\boldsymbol{y}$ as some function of $\boldsymbol{x}$. Thus we need to define further structural properties when transform nodes are introduced to a PC to maintain its decomposability. 

A close look at the proof for Lemma~\ref{lemma:spn-non-decomposable} tells us that we can overcome this issue, if $\mathcal{T}$ was constrained in a manner such that $\mathcal{T}(\mathcal{P}(x))$ independently transforms the subset of variables involved in the scope of the children of $\mathcal{P}$. We formalize this notion as a property, which we call $\uptau$-decomposability.
\begin{definition}
Let $\boldsymbol{x} \in \R^d, \ \ g:\R^d \rightarrow \R^d$ and $\boldsymbol{y} = g(\boldsymbol{x})$. We call $g$ decomposable w.r.t.\ a collection ($\Omega$) of disjoint subsets of $\{1, \ldots, d\}$ if $\boldsymbol{y}_{\psi_i} \perp \boldsymbol{x}_{\psi_j} \ \forall \psi_i, \psi_j \in \Omega, i \ne j$, where $\boldsymbol{y}_{\psi} \in \R^{|\psi|}$ denotes the vector obtained by indexing $\boldsymbol{y}$ along the dimensions contained in $\psi$. We use $\boldsymbol{y}_{\psi_i} \perp \boldsymbol{x}_{\psi_j}$ to mean that $\frac{\partial y_a}{\partial x_b}=0 \ \forall a \in \psi_i, b \in \psi_j$, %. Thus, in other words,
% $g$ can be decomposed as stack of groupwise transformations $g_{\psi_i}$ that 
i.e., $g$ transforms the sets of dimensions contained in $\Omega$ independently: $\boldsymbol{y}_{\psi_i} = g_{\psi_i}(\boldsymbol{x}_{\psi_i})$, where $g_{\psi_i}:\R^{|\psi_i|} \rightarrow \R^{|\psi_i|}$.
\end{definition}
\begin{property}[\textbf{$\uptau$-decomposability}]
 A sum product transform network $\mathcal{C_{SPTN}}$ is $\uptau$-decomposable iff all of its transform nodes are $\uptau$-decomposable. A transform node $(\mathcal{T})$ is $\uptau$-decomposable if, when defined over a decomposable product node $\mathcal{P}$, the transformation $g$ associated with $\mathcal{T}$ is decomposable w.r.t the collection of the scopes of children of $\mathcal{P}$.
 % We call a transform node $(\mathcal{T})$ to be $\uptau$-decomposable with respect to a collection of disjoint subsets  if, when defined over a product node $\mathcal{P}$, $\mathcal{T}$ transforms the variables involved in the scope of children of $\mathcal{P}$ independently, i.e., $\forall \mathcal{T} \in \mathcal{S}, \mathcal{T}(\mathcal{P}(\boldsymbol{x})) = \prod_{\mathcal{N}_i \in \boldsymbol{ch(P)}} \mathcal{N}_i(g_{\psi_{\mathcal{N}_i}}(\boldsymbol{x}_{\psi_{\mathcal{N}_i}}))|\det J_g|$, where $g(\boldsymbol{x}) = [g_{\psi_{\mathcal{N}_i}}(\boldsymbol{x}_{\psi_{\mathcal{N}_i}})]_{\mathcal{N}_i \in \boldsymbol{ch(P)}}$.
\end{property}
\begin{lemma}
\label{lemma:tau-decomposbility-necessary}
$\uptau$-decomposability is a necessary condition for an SPTN to be decomposable.
\end{lemma}
%\begin{proof} 
The proof is a generalization of the proof of Lemma~\ref{lemma:spn-non-decomposable} and is presented in the appendix. Since decomposability is a necessary condition for a PC to be tractable over marginal and conditional queries, Lemma~\ref{lemma:tau-decomposbility-necessary} implies that $\uptau$-decomposability is a necessary condition for an SPTN to be tractable w.r.t.\ conditional and marginal inference tasks.  We know that all element-wise transformations are, by definition, $\uptau$-decomposable. But how does $\uptau$-decomposability affect the expressiveness of $\mathcal{C_{SPTN}}$? The following theorem provides some insight.
\begin{theorem}
\label{thm:sptn-flow-equivalence}
A $\uptau$-decomposable SPTN is equivalent to a PC 
comprising sum nodes and product nodes retaining the same graph structure, but with all transform nodes from the root to each leaf now pushed down as a composition of transformations (i.e. a normalizing flow) defined just over the corresponding leaf nodes.
\end{theorem}



\begin{proof}
Let $\mathcal{C_{SPTN}}$ be a $\uptau$-decomposable sum product transform network, and let $\mathcal{S}, \mathcal{P}, \mathcal{T}, \mathcal{L}  \in \mathcal{C_{SPTN}}$ denote a sum node, product node, transform node, and leaf node respectively.
For any arbitrary node $\mathcal{N}$, we will show that defining transformations over $\mathcal{N}$ reduces to transformations over the children of $\mathcal{N}$.



\textit{Case 1}: When $\mathcal{N}=\mathcal{S}$, we have, 
\begin{align*}
    \mathcal{T}(\mathcal{S}(\boldsymbol{x})) &= \mathcal{S}(g(\boldsymbol{x}))|\det J_g| \\
    &= \underset{\mathcal{N}_i \in ch(\mathcal{S})}{\sum}w_i \mathcal{N}_i(g(\boldsymbol{x}))|\det J_g| \\
    &= \underset{\mathcal{N}_i \in ch(\mathcal{S})}{\sum}w_i \mathcal{T}(\mathcal{N}_i(\boldsymbol{x}))
\end{align*}
\textit{Case 2}: When $\mathcal{N}=\mathcal{P}$, we have, 
\begin{align*}
    \mathcal{T}(\mathcal{P}(\boldsymbol{x})) &= \mathcal{P}(g(\boldsymbol{x}))|\det J_g| \\
    &= \left[ \underset{\mathcal{N}_i \in ch(\mathcal{P})}{\prod}\mathcal{N}_i(\Pi_{\psi_{\mathcal{N}_i}}(g(\boldsymbol{x}))) \right] |\det J_g|\\
    &= \left[ \underset{\mathcal{N}_i \in ch(\mathcal{P})}{\prod}\mathcal{N}_i(g_{\psi_{\mathcal{N}_i}}(\boldsymbol{x}_{\psi_{\mathcal{N}_i}}))  \right] |\det J_g|
\end{align*}
Since $g$ is decomposable w.r.t the collection ($\Omega$) of scopes of children of $\mathcal{P}$, the Jacobian of $g$ is a block diagonal matrix, each block corresponding to the jacobian of the independent transformations, $J_{g_{\psi_i}}$ for $\psi_i \in \Omega$. Thus, $|\det J_g| = \underset{\mathcal{N}_i \in ch(\mathcal{P})}{\prod} | \det J_{g_{\psi_{\mathcal{N}_i}}}|$. Hence,
\begin{align*}
    \mathcal{T}(\mathcal{P}(\boldsymbol{x})) 
    % \left[ \underset{\mathcal{N}_i \in ch(\mathcal{P})}{\prod}\mathcal{N}_i(g_{\psi_{\mathcal{N}_i}}(\boldsymbol{x}_{\psi_{\mathcal{N}_i}}))  \right]  \underset{\mathcal{N}_i \in ch(\mathcal{P})}{\prod} | \det J_{g_{\psi_{\mathcal{N}_i}}}| \\ 
    &= \underset{\mathcal{N}_i \in ch(\mathcal{P})}{\prod} \left[ \mathcal{N}_i(g_{\psi_{\mathcal{N}_i}}(\boldsymbol{x}_{\psi_{\mathcal{N}_i}}))| \det J_{g_{\psi_{\mathcal{N}_i}}}| \right]\\
    &= \underset{\mathcal{N}_i \in ch(\mathcal{P})}{\prod} \mathcal{T}_{\psi_{\mathcal{N}_i}}(\mathcal{N}_i(\boldsymbol{x}))
\end{align*}

\textit{Case 3}: When $\mathcal{N}=\mathcal{T}^{i-1}$, let us denote by $\mathcal{N}^{i-1}$ the child node of $\mathcal{T}^{i-1}$. We have, 
\begin{align*}
    \mathcal{T}^i(\mathcal{T}^{i-1}&(\mathcal{N}^{i-1}(\boldsymbol{x}))) = \mathcal{T}^{i}\bigl(\mathcal{N}^{i-1}(g^{i-1}(\boldsymbol{x}))|\det J_{g^{i-1}}|\bigr) \\
    &= \mathcal{N}^{i-1}(g^{i-1}(g^{i}(\boldsymbol{x})))|\det J_{g^{i-1}}||\det J_{g^{i}}| \\
    &= \mathcal{N}^{i-1}(\overline{g}(\boldsymbol{x}))|\det J_{\overline{g}}|, \ \text{where} \  \overline{g} = g^{i-1} \circ g^{i}\\
    &= \overline{\mathcal{T}}(\mathcal{N}^{i-1}(\boldsymbol{x}))
 \end{align*}
i.e. $\mathcal{T}^{i}$ with transformation $g^{i}$ and $\mathcal{T}^{i-1}$ with transformation $g^{i-1}$ can be combined into a single transform node $\overline{\mathcal{T}}$ with an associated transformation $\overline{g}= g^{i-1} \circ g^{i}$. This follows from the fact that compositions of invertible transformations are invertible and the jacobian of composition of two transformations can be written as the product of the jacobians of the individual transformations.

Thus, by recursively pushing down transformations along the path from the root to each leaf node, we end up with compositions of transformations defined only over the leaf nodes. Let $l_i$ be the number of transform nodes along the path from the root to the leaf $\mathcal{L}^i$ and let $\overline{\mathcal{T}}$ denote the composition  $\mathcal{T}^1 \circ \mathcal{T}^2 \circ \ldots \circ \mathcal{T}^{l_i}$. $\mathcal{C_{SPTN}}$ has now reduced to containing only sum nodes, product nodes, and transformed leaf nodes, where each new leaf node $\overline{\mathcal{L}}$ is defined as:
$\overline{\mathcal{L}}(\boldsymbol{x}) = \overline{\mathcal{T}}(\mathcal{L}(\boldsymbol{x})) 
= \mathcal{L}(g^{1} \circ g^{2} \circ \ldots \circ g^{l_i}(\boldsymbol{x}))\prod_{m=1}^{l_i} |\det J_{g^m}|$.


\begin{center}
\begin{tikzpicture}
\node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.9cm,minimum height=0.5cm] at (-0.5,0) {\textbf{Case 1}};    \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.8cm,minimum height=0.5cm] at (2.5,0) {\textbf{Case 2}};    \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=1.5cm,minimum height=0.5cm] at (4.8,0) {\textbf{Case 3}};    
\end{tikzpicture}

\resizebox {0.36\linewidth} {0.38\linewidth} {
\begin{tikzpicture}
%  --- LEFT 
\node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti) at (3.0,2.1) {\small{$\mathcal{T}^{i}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Si) at (3.0,0.9) {\small{$\mathcal{S}^{i}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni1) at (1.8,-0.2) {\small{$\mathcal{N}^{i}_{1}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni2) at (2.8,-0.2) {\small{$\mathcal{N}^{i}_{2}$}};
\node[circle,draw=white,fill=white!10,inner sep=0pt,minimum size=0.5cm, line width=1.0pt] (Ni3) at (3.55,-0.2) {\small{$\boldsymbol{\ldots}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Nik) at (4.3,-0.2) {\small{$\mathcal{N}^{i}_{K}$}};
\path[draw=black,-] (Ti) edge (Si);
\path[draw=black,-] (Si) edge (Ni1);
\path[draw=black,-] (Si) edge (Ni2);
\path[draw=black,-] (Si) edge (Nik);
\path[draw=black,->, line width=0.2mm] (3.0,-1) -- (3.0,-1.5);
\end{tikzpicture} \ \ \ 
}
\resizebox {0.36\linewidth} {0.38\linewidth} {
\begin{tikzpicture}
%  --- center 
\node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti) at (3.0,2.1) {\small{$\mathcal{T}^{i}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Pi) at (3.0,0.9) {\small{$\mathcal{P}^{i}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni1) at (1.8,-0.2) {\small{$\mathcal{N}^{i}_{1}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni2) at (2.8,-0.2) {\small{$\mathcal{N}^{i}_{2}$}};
\node[circle,draw=white,fill=white!10,inner sep=0pt,minimum size=0.6cm, line width=1.0pt] (Ni3) at (3.6,-0.2) {\small{$\boldsymbol{\ldots}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Nik) at (4.4,-0.2) {\small{$\mathcal{N}^{i}_{K}$}};
\path[draw=black,-] (Ti) edge (Pi);
\path[draw=black,-] (Pi) edge (Ni1);
\path[draw=black,-] (Pi) edge (Ni2);
\path[draw=black,-] (Pi) edge (Nik);
\path[draw=black,->, line width=0.2mm] (3.0,-1) -- (3.0,-1.5);
\end{tikzpicture} \ \ \ 
}
\resizebox {0.1\linewidth} {0.38\linewidth} {
\begin{tikzpicture}
%  --- right 
\node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti) at (3.5,2.1) {\small{$\mathcal{T}^{i}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti-1) at (3.5,0.9) {\small{$\mathcal{T}^{j}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni) at (3.5,-0.2) {\small{$\mathcal{N}^{i}$}};
\path[draw=black,-] (Ti) edge (Ti-1);
\path[draw=black,-] (Ti-1) edge (Ni);
\path[draw=black,->, line width=0.2mm] (3.5,-1) -- (3.5,-1.5);
\end{tikzpicture}
}

\resizebox {0.36\linewidth} {0.3\linewidth} {
\begin{tikzpicture}
%  --- LEFT 
\node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Si) at (3.0,2.1) {\small{$\mathcal{S}^{i}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti1) at (1.8,0.9) {\small{$\mathcal{T}^{i}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti2) at (2.8,0.9) {\small{$\mathcal{T}^{i}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Tik) at (4.3,0.9) {\small{$\mathcal{T}^{i}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni1) at (1.8,-0.2) {\small{$\mathcal{N}^{i}_{1}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni2) at (2.8,-0.2) {\small{$\mathcal{N}^{i}_{2}$}};
\node[circle,draw=white,fill=white!10,inner sep=0pt,minimum size=0.5cm, line width=1.0pt] (Ni3) at (3.55,-0.2) {\small{$\boldsymbol{\ldots}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Nik) at (4.3,-0.2) {\small{$\mathcal{N}^{i}_{K}$}};
\path[draw=black,-] (Si) edge (Ti1);
\path[draw=black,-] (Si) edge (Ti2);
\path[draw=black,-] (Si) edge (Tik);
\path[draw=black,-] (Ti1) edge (Ni1);
\path[draw=black,-] (Ti2) edge (Ni2);
\path[draw=black,-] (Tik) edge (Nik);
\end{tikzpicture} \ \ \  
}
\resizebox {0.36\linewidth} {0.3\linewidth} {
\begin{tikzpicture}
%  --- center 
\node[circle,draw=black!90,fill=white!10,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Pi) at (3.0,2.1) {\small{$\mathcal{P}^{i}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti1) at (1.8,0.9) {\small{$\mathcal{T}^{i}_{\psi_1}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti2) at (2.8,0.9) {\small{$\mathcal{T}^{i}_{\psi_2}$}};
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Tik) at (4.3,0.9) {\small{$\mathcal{T}^{i}_{\psi_K}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni1) at (1.8,-0.2) {\small{$\mathcal{N}^{i}_{1}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni2) at (2.8,-0.2) {\small{$\mathcal{N}^{i}_{2}$}};
\node[circle,draw=white,fill=white!10,inner sep=0pt,minimum size=0.5cm, line width=1.0pt] (Ni3) at (3.55,-0.2) {\small{$\boldsymbol{\ldots}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Nik) at (4.3,-0.2) {\small{$\mathcal{N}^{i}_{K}$}};
\path[draw=black,-] (Pi) edge (Ti1);
\path[draw=black,-] (Pi) edge (Ti2);
\path[draw=black,-] (Pi) edge (Tik);
\path[draw=black,-] (Ti1) edge (Ni1);
\path[draw=black,-] (Ti2) edge (Ni2);
\path[draw=black,-] (Tik) edge (Nik);
\end{tikzpicture} \ \ \
}
\resizebox {0.1\linewidth} {0.18\linewidth} {
\begin{tikzpicture}
%  --- right 
\node[circle,draw=black,fill=white!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ti-1) at (3.5,0.9) {\small{$\overline{\mathcal{T}}$}};
\node[circle,draw=black,fill=gray!20,inner sep=0pt,minimum size=0.8cm, line width=1.0pt] (Ni) at (3.5,-0.2) {\small{$\mathcal{N}^{i}$}};
\path[draw=black,-] (Ti-1) edge (Ni);
\end{tikzpicture}
}
\end{center}

Thus, each leaf node now becomes a normalizing flow.
\end{proof}
Each leaf in a PC typically encodes a tractable distribution over a single random variable. Thus, though it is intuitive to think of integrating NFs with PCs by the introduction of transform nodes arbitrarily in the circuit as done in \cite{sptn}, Thm.~\ref{thm:sptn-flow-equivalence} shows that, when ensuring tractability, such circuits reduce to a PC with a new type of leaf node. %We summarize such circuits below:
%under the definition of a \textit{Probabilistic Flow Circuit} below: 
\begin{definition}[\textbf{Probabilistic Flow Circuit}]
    A probabilistic flow circuit (PFC) is a computational graph that defines a probability distribution over a set of continuous random variables ($\boldsymbol{X}$). It comprises three types of nodes: (1) Sum nodes $\mathcal{S}$ that compute a convex combination of the values of their children. (2) Product nodes $\mathcal{P}$ that compute a product of the values of their children. (3) Leaf nodes ($\mathcal{L}_{\mathcal{F}}$) defined as normalizing flows over individual features $X_i \in \boldsymbol{X}$.
\end{definition}

NFs are expressive deep models that can effectively model arbitrarily complex probability distributions. Thus, a PFC augments the expressivity of PCs by allowing complex distributions to be modeled at the leaf nodes. It also retains the tractability of PCs as the transformations are restricted only to the leaves. However, does integrating any invertible transformation at the leaf provide the same added expressivity? 

To understand this,  consider the invertible affine transformations that were proposed by \cite{sptn}. We know that compositions of affine transformations can be reduced to a single affine transformation. We also know that an affine transformed Gaussian is still a Gaussian. Thus, the addition of invertible affine transformations at leaf nodes, still only gives us the ability to model Gaussian distributions at the leaf, and does not seem to help improve expressivity. Now, with the framework for integrating NFs with PCs constructed, let us define expressive flow transformations. 
% and discuss properties that are desirable for them in this context.

\textbf{Probabilistic Flow Circuits via Linear Rational Splines}\\
The requirement at the leaf node for a PFC is that it should enable tractable computation of probability densities. Any diffeomorphic transformation enables computing the exact density using the change of variables formula. Thus, any expressive normalizing flow architecture can, in principle, be used to implement a probabilistic flow circuit. 

However, the nature of inference tasks that we need tractability for can demand additional characteristics that are desirable of the diffeomorphisms defined at the leaves. For example, in order to support MAP inference, deterministic PCs require that the argmax over the leaf density is easily computable. In  PCs, since leaf distributions over continuous variables are typically parameterized as Gaussians, the argmax (or the mode of the distribution) corresponds to the mean. However, by defining distributions using normalizing flows, the leaf nodes in a PFC now model arbitrarily complex probability distributions. Since they are no longer restricted to be uni-modal, computing the argmax over this distribution becomes a challenging task. 
Thus, one design principle for probabilistic flow circuits could be defining diffeomorphisms that support easy computation of modes, without imposing significant limitations on its expressivity.

We propose to parameterize  $g$ 
 using a family of invertible piecewise functions, known as \textit{linear rational splines (LRS)}. Spline-based normalizing flows \citep{lrs-flow, nsf} have attracted much research interest in recent years and are among the state-of-the-art. % flow models. 
We will adapt the computationally efficient LRS transformations defined by \cite{lrs-flow} to define expressive leaf distributions with easily computable modes. 

\begin{definition}
An invertible linear rational spline transformation is a piecewise function that divides the input space ($X$) into bins (or intervals) and defines a monotonic rational function of the form $y = \frac{a_1 x+b_1}{a_2 x + b_2}$ within each bin, such that they are knotted together at the boundaries so as to define a differentiable function. Formally, given a set of monotonically increasing points $\{(x^k, y^k)\}_{k=0}^K$ known as knots, derivatives $\{ d^k > 0 \}_{k=0}^K$ at the knots, and parameters $\{ \lambda^k \in (0,1) \}_{k=0}^{K-1}$,  the LRS transformation within each bin, say for $ x \in [x^k, x^{k+1}]$, is defined as:
\begin{equation*}
     y =  g_{lrs}(\phi), \text{where} \  \phi = \frac{(x - x^k)}{(x^{k+1} - x^{k})},  \text{and} \ 
\end{equation*}  
\begin{equation*}
     g_{lrs}(\phi) =  \begin{cases} 
      \frac{w^k y^k (\lambda^k - \phi) + w^m y^m \phi}{w^k (\lambda^k - \phi) + w^m\phi} & 0\leq \phi \leq \lambda^k \\
      \\
      \frac{w^m y^m (1 - \phi) + w^{k+1} y^{k+1} (\phi - \lambda^k)}{w^m (1 - \phi) + w^{k+1}(\phi - \lambda^k)} & \lambda^k \leq \phi \leq 1 \\
   \end{cases}
\end{equation*}    
\end{definition}

Given the knots and the derivatives, the parameters $w^k, w^m, w^{k+1}, \lambda^k $ and $y^m$ are constructed in such a manner \citep{lrs-flow} that the overall function defined is differentiable everywhere. The width and height of each bin, the derivatives at the knots, and $\lambda$ are defined as learnable parameters for an LRS flow. The form for the analytical inverse of $g_{lrs}$ and its jacobian are given in \cite{lrs-flow}. 
% We will  use $g_{lrs}$ to refer to the LRS transformation.
Defining a flow at the leaf also requires specifying a base distribution. 
While any distribution with an analytically computable density can be used, for theoretical purposes, we will use a Student's-t distribution with $3$ degrees of freedom, as the form of its density function simplifies the analysis when used with $g_{lrs}$. 
 % Motivated by the recent success of spline-based normalizing flows \cite{lrs-flow, nsf}, we propose to parameterize the invertible transformations $g$ using a family of invertible piecewise functions, known as \textit{linear rational splines (LRS)}.
 % In this work, we consider a class of normalizing flows defined using invertible linear rational spline (LRS) transformations and 
  % While well explored in the field of data interpolation, \cite{lrs-flow} recently showed that LRS can be adapted to define expressive and computationally efficient normalizing flows. 
 % A key factor contributing to the invertibility of $g_{lrs}$ is that it is monotonic within each bin. When used in a flow, we can exploit the monotonicity property of $g_{lrs}$, to reduce the complexity of computing the mode of the transformed distribution. More specifically, instead of searching for the mode over the entire support, we only need to evaluate and compare the densities at the knots. Thus, when using $g_{lrs}$ with $K$ intervals, the mode of the distribution can be computed in $O(K)$. 
We summarize the tractability of our model below before we present empirical results. %to experimentally establish its modeling flexibility.
\begin{lemma}
    A PFC ($\mathcal{C}_{\mathcal{F}}$) with leaf distributions defined using $g_{lrs}$ transformations and a Student's-t distribution with $\nu = 3$ as the base distribution is a tractable model for (a) evidential inference if $\mathcal{C}_{\mathcal{F}}$ is smooth, (b) marginal and conditional inference if $\mathcal{C}_{\mathcal{F}}$ is smooth and decomposable, (c) MAP inference if $\mathcal{C}_{\mathcal{F}}$ is smooth, decomposable, and deterministic.
    \label{lemma:pfc-tractability}
\end{lemma}
\begin{proof}
Deferred to the supplementary.
\end{proof}
\section{Experiments \& Results}
\begin{figure*}[t!]
    \centering
    % \begin{tabular}{cc}
    % \subfloat[\\Interlocked-Circles]{\includegraphics[width = 0.16\linewidth]{figures/datasets/3d/InterlockedCIRCLES_GT.png}}
    % \subfloat[\\Helix]{\includegraphics[width = 0.16\linewidth]{figures/datasets/3d/HELIX_GT.png}}
    % \subfloat[\\Bent-Lissajous]{\includegraphics[width = 0.16\linewidth]{figures/datasets/3d/BentLISSAJOUS_GT.png}}
    % \subfloat[\\Disjoint-Circles]{\includegraphics[width = 0.16\linewidth]{figures/datasets/3d/DisjointCIRCLES_GT.png}}
    % \subfloat[\\Twisted-Eight]{\includegraphics[width = 0.16\linewidth]{figures/datasets/3d/TwistedEIGHT_GT.png}}
    % \subfloat[\\Knotted]{\includegraphics[width = 0.16\linewidth]{figures/datasets/3d/KNOTTED_GT.png}}
    % \end{tabular}
    \includegraphics[width = 0.16\linewidth]{figures/datasets/3d/InterlockedCIRCLES_GT.png}
    \includegraphics[width = 0.16\linewidth]{figures/datasets/3d/HELIX_GT.png}
    \includegraphics[width = 0.16\linewidth]{figures/datasets/3d/BentLISSAJOUS_GT.png}
    \includegraphics[width = 0.16\linewidth]{figures/datasets/3d/DisjointCIRCLES_GT.png}
    \includegraphics[width = 0.16\linewidth]{figures/datasets/3d/TwistedEIGHT_GT.png}
    \includegraphics[width = 0.16\linewidth]{figures/datasets/3d/KNOTTED_GT.png}
    \begin{tikzpicture}
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.5cm,minimum height=0.5cm] at (-0.5,0) {\small{Interlocked-Circles}};
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.5cm,minimum height=0.5cm] at (2.3,0) {\small{Helix}};
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.5cm,minimum height=0.5cm] at (5,0) {\small{Bent-Lissajous}};
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.5cm,minimum height=0.5cm] at (8.1,0) {\small{Disjoint-Circles}};
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.5cm,minimum height=0.5cm] at (11.2,0) {\small{Twisted-Eight}};
         \node[rectangle,draw=white!90,fill=gray!10,opacity=0.9,minimum width=2.5cm,minimum height=0.5cm] at (14,0) {\small{Knotted}};
     \end{tikzpicture}
    \caption{Visualizations of the $6$ \textbf{3D data distributions} over complex manifolds considered in this work.}
    \label{3D-datasets}
\end{figure*}
\begin{figure}
    \centering
    % \includegraphics[width=0.3\linewidth]{figures/learning/DisjointCIRCLES.pdf}
    % \includegraphics[width=0.3\linewidth]{figures/learning/InterlockedCIRCLES.pdf}
    % \includegraphics[width=0.495\linewidth]{figures/learning/TwistedEIGHT.pdf}
    \includegraphics[width=0.495\linewidth]{figures/learning/KNOTTED.pdf}
    \includegraphics[width=0.495\linewidth]{figures/learning/HELIX.pdf}
    % \includegraphics[width=0.3\linewidth]{figures/learning/BentLISSAJOUS.pdf}
    \caption{ \textbf{Learning curves} of - (a)\emph{ EinsumNet} (b)\emph{ EinsumNet+Affine}, and (c) \emph{EinsumNet+LRS} on the \textbf{3D datasets} (mean validation log-likelihood across training epochs). The shaded regions depict the standard deviation across 3 trials.}
    \label{fig:learning-curve-synthetic}
\end{figure}
\begin{table*}[h]
\centering
\begin{tabular}{@{}lllllll@{}}
\toprule
                    & \multicolumn{1}{c}{Helix} & \multicolumn{1}{c}{Knotted} & \multicolumn{1}{c}{\begin{tabular}[c]{@{}c@{}}Disjoint-\\ Circles\end{tabular}} & \multicolumn{1}{c}{\begin{tabular}[c]{@{}c@{}}Interlocked-\\ Circles\end{tabular}} & \multicolumn{1}{c}{\begin{tabular}[c]{@{}c@{}}Twisted-\\ Eight\end{tabular}} & \multicolumn{1}{c}{\begin{tabular}[c]{@{}c@{}}Bent-\\ Lissajous\end{tabular}} \\ \midrule
EinsumNet              & $-2.94 \pm 0.24$             & $-5.23 \pm 0.07$               & $-1.71 \pm 0.19$                                                                   & $-2.88 \pm 0.07$                                                                      & $-3.02 \pm 0.24$                                                                & $-3.02 \pm 0.06$                                                                 \\
EinsumNet + Affine     & $-2.47 \pm 0.09$             & $-5.18 \pm 0.11$               & $-1.18 \pm 0.06$                                                                   & $-2.52 \pm 0.05 $                                                                     & $-2.79 \pm 0.07$                                                                & $-2.69 \pm 0.05 $                                                                \\
EinsumNet + LRS  & $-1.04 \pm 0.02$             & $-4.09 \pm 0.03 $              & $-0.81 \pm 0.03$                                                                   & $-2.44 \pm 0.02$                                                                      & $-2.43 \pm 0.01 $                                                               & $-2.53 \pm 0.02 $                                                                \\ \bottomrule
\end{tabular}
 \caption{ \textbf{Performance Evaluation} of  - (a)\emph{ EinsumNet} (b)\emph{ EinsumNet+Affine}, and (c) \emph{EinsumNet+LRS} on the \textbf{3D datasets}, in terms of mean test log-likelihood (\textbf{$\uparrow$}), $ \pm $ the standard deviation across 3 trials.}
 \label{table:test-ll-3d}
\end{table*}

% To show the benefits of probabilistic flow circuits, 
We experimentally validated the efficacy of probabilistic flow circuits in modeling a wide range of data distributions. We used einsum networks \citep{peharz_20_einsum} with Gaussian leaves to parameterize PCs. 
Einsum networks vectorize the leaf distributions in a PC and implement the sum and product operations using a monolithic \textit{einsum}-operation defined over a probability tensor. The balanced tree structure of its computational graph allows computing the leaf densities in parallel and processing it through stacked einsum-layers, similar to transforming an input through a deep neural network. 
We instantiated our PFC by integrating a linear rational spline flow as the input layer within einsum networks. To enable parallel computation of all the leaf densities, we implemented our input layer as a single conditional NF, conditioned on the index of the leaf distribution. 

We will refer to the base PC as EinsumNet and our probabilistic flow circuits as EinsumNet+LRS. As our objective is to evaluate the added expressivity brought in by the flexible leaf distributions, we used the same circuit structure for both EinsumNet and EinsumNet+LRS, and compared their performances on a multitude of datasets. The two key parameters constraining the structure and expressivity of an einsum network are the number of vector components ($k$) and the number of replicas ($r$) (see \cite{peharz_20_einsum} for more details). We used $k=r=10$ for our $3$D data experiments and $k=r=20$ for our experiments on higher-dimensional data. As einsum networks are differentiable like neural networks, we used end-to-end backpropagation and trained all our models using the Adam optimizer, with a learning rate of $10^{-3}$.
We defer further implementation details to the supplementary material. Our code is implemented using PyTorch and Pyro \citep{bingham2019pyro}, is adapted from the implementation of \cite{peharz_20_einsum}, and is publicly available\footnote{\href{https://github.com/sahilsid/probabilistic-flow-circuits}{https://github.com/sahilsid/probabilistic-flow-circuits}}.

We aim to answer the following questions empirically:
\begin{description}
    \item[Q1] Do probabilistic flow circuits provide more accurate density estimates for complex distributions?
    \item[Q2]  If so, are the inference tasks still tractable with these models while generating better quality samples?
    \item[Q3]  Can we expect this added flexibility to hold in general, or is it closely coupled with the expressivity of the underlying PC?
\end{description}
\subsection{Density Estimation}
\textbf{$3$D Manifold Data}.  We first considered $6$ data distributions over complex $3$D manifolds, adapted from \cite{vqflows-sidheekh22a}. These data distributions are  easy to visualize (see Fig.~\ref{3D-datasets}), being in a low dimensional space, but also challenging to model, owing to their complex manifold structure. They are thus useful distributions to evaluate and compare generative models. We utilized these data distributions to empirically validate (a) the performance improvement achieved by having flows as leaf distributions and (b) the better expressivity offered by linear rational spline transformations over the invertible affine transformations proposed in \cite{sptn}.
We generated $20{,}000$ data points for each of the $6$ $3$D datasets, $10{,}000$ of which we used for training and $5{,}000$ each for validation and testing.  We trained three models: (a) EinsumNet, (b) EinsumNet + Affine,  and (c) EinsumNet + LRS on each dataset, where EinsumNet + Affine refers to the probabilistic flow circuit obtained by integrating invertible affine transformations as the leaf flow. To ensure invertibility, we implemented the affine transformations in their SVD-decomposed form, as proposed in \cite{sptn}, utilizing the Householder parameterization for generating unitary matrices. \\
Fig.~\ref{fig:learning-curve-synthetic} shows the validation log-likelihood achieved by each model across the training epochs on two $3$D datasets. As one can see, EinsumNet + Affine performs similarly to, or only slightly better than, EinsumNet in many cases. This is expected, as affine-transformed Gaussian leaf distributions are still Gaussian. EinsumNet + LRS, on the other hand, not only achieves significantly better performance than both EinsumNet and EinsumNet + Affine, but also does so much faster. Similar learning curves on the other $4$ datasets are provided in the supplementary material. Tab.~\ref{table:test-ll-3d} provides further quantitative evidence in terms of the test log-likelihood of each of the three models on the 3D datasets, thus validating the added flexibility of EinsumNet + LRS.
\begin{table*}[]
\centering
\begin{tabular}{@{}lcccccc@{}}
\toprule
                     & \multicolumn{1}{c}{POWER} & \multicolumn{1}{c}{GAS} & \multicolumn{1}{c}{MINIBOONE} & \multicolumn{1}{c}{HEPMASS} & \multicolumn{1}{c}{MNIST} & \multicolumn{1}{c}{Fashion-MNIST} \\ \midrule
MADE                & $-3.08        \pm 0.03$  & $3.56  \pm 0.04$ & $-15.59  \pm 0.50$ & $-20.98  \pm 0.02$ & $-1380.8 \pm 4.8$          & \multicolumn{1}{c}{-}     \\
Real NVP            & $-0.02        \pm 0.01$  & $4.78  \pm 1.80$ & $-13.55  \pm 0.49$ & $-19.62  \pm 0.02$ & $-1323.2 \pm 6.6$          & \multicolumn{1}{c}{-}   \\
MAF                 & $ \ \ \ 0.14  \pm 0.01$  & $9.07  \pm 0.02$ & $-11.75  \pm 0.44$ & $-17.70  \pm 0.02$ & $-1300.5 \pm 1.7$          & \multicolumn{1}{c}{-}                        \\ \midrule
EinsumNet           & $ \ \ \ 0.20  \pm 0.01$  & $3.57  \pm 0.08$ & $-35.93  \pm 0.06$ & $-22.79  \pm 0.05$ & $-1015.1 \pm 0.9$          & $649.1 \pm 0.3$\\
Einsum+LRS          & $ \ \ \ 0.36  \pm 0.01$  & $4.79  \pm 0.04$ & $-34.21  \pm 0.01$ & $-22.46  \pm 0.01$ & $-959.4  \pm 1.4$          & $655.6 \pm 0.6$\\ \bottomrule
\end{tabular}
\caption{\textbf{Performance Evaluation} of - (a)\emph{ EinsumNet} and (b) \emph{EinsumNet+LRS} on the \textbf{tabular} \& \textbf{image} datasets, in terms of mean test log-likelihood (\textbf{$\uparrow$}), $ \pm $ the standard deviation across 3 trials.
}
\label{tab:high-dim-test-ll}
\end{table*}
% \begin{table}[t]
% \centering
% \begin{tabular}{lll}
% \hline
%                     & \multicolumn{1}{c}{MNIST} & \multicolumn{1}{c}{F-MNIST} \\ \hline
% MADE                & $-1380.8 \pm 4.8$            & \multicolumn{1}{c}{-}             \\
% Real NVP            & $-1323.2 \pm 6.6$            & \multicolumn{1}{c}{-}             \\
% MAF                 & $-1300.5 \pm 1.7$            & $613.3$                             \\ \hline
% Einsum Net          & $-1069.4 \pm 0.2$          & $613.4 \pm 0.2$                     \\
% Einsum + LRS (Ours) & $-960.9 \pm 0.3$           & $633.4 \pm 0.6$                   \\ \hline
% \end{tabular}
% \caption{\textbf{Performance Evaluation} of - (a)\emph{ Einsum Network} and (b) \emph{Einsum Network + LRS} on the \textbf{Image datasets}, in terms of mean log-likelihood on the test set (\textbf{\textit{higher the better}}), $ \pm $ the standard deviation across 3 independent trials.}
% \end{table}
% \begin{table*}[]
% \centering
% \small
% \begin{tabular}{@{}lllll@{}}
% \toprule
%                     & \multicolumn{1}{c}{POWER} & \multicolumn{1}{c}{GAS} & \multicolumn{1}{c}{MINIBOONE} & \multicolumn{1}{c}{HEPMASS} \\ \midrule
% MADE                & $-3.08 \pm 0.03$  & $3.56  \pm 0.04$ & $-15.59  \pm 0.50$ & $-20.98  \pm 0.02$ \\
% Real NVP            & $-0.02 \pm 0.01$  & $4.78  \pm 1.80$ & $-13.55  \pm 0.49$ & $-19.62  \pm 0.02$ \\
% MAF                 & $ \ \ \ 0.14  \pm 0.01$  & $9.07  \pm 0.02$ & $-11.75  \pm 0.44$ & $-17.70  \pm 0.02$ \\ \midrule
% EinsumNet          & $ \ \ \ 0.17  \pm 0.02$  & $3.64  \pm 0.21$ & $-35.93  \pm 0.06$ & $-22.79  \pm 0.05$ \\
% Einsum+LRS & $ \ \ \ 0.27  \pm 0.01$  & $4.78  \pm 0.02$ & $-34.21  \pm 0.01$ & $-22.46  \pm 0.01$ \\ \bottomrule
% \end{tabular}
% \caption{\textbf{Performance Evaluation} of - (a)\emph{ Einsum Network} and (b) \emph{Einsum Network + LRS} transformations on the \textbf{UCI tabular datasets}, in terms of mean log-likelihood on the test set (\textbf{\textit{higher the better}}), $ \pm $ the standard deviation across 3  trials.}
% \end{table*}
\begin{figure}
    \centering
    \begin{tabular}{ccc}
    \subfloat[Ground-Truth]{\includegraphics[width=0.31\linewidth]{figures/samples/ground_truth-mnist.png}}\hspace{-1.5pt}
    \subfloat[Einsum Net]{ \includegraphics[width=0.31\linewidth]{figures/samples/unconditional-einsum-net-25.png}}\hspace{1.5pt}
    \subfloat[Einsum+LRS]{\includegraphics[width=0.31\linewidth]{figures/samples/unconditional-einsum-spline-lrs-30.png}}
    \end{tabular}
    \caption{ \textbf{Qualitative Evaluation} - samples generated by \emph{ EinsumNet} and \emph{EinsumNet+LRS}  on the MNIST dataset.}
    \label{fig:unconditional-samples}
\end{figure}

\textbf{High dimensional data}. Having established the utility of probabilistic flow circuits for modeling 3D data distributions, we now proceed to study how the expressivity scales when learning data distributions in higher-dimensional spaces. We considered $2$ image datasets---MNIST \citep{mnist} and Fashion-MNIST \citep{fashion-mnist}---and $4$ UCI tabular datasets commonly used to evaluate density estimation in the normalizing flow literature. We followed the preprocessing done in \cite{papamakarios2017masked} for each of these datasets. We trained the two models---EinsumNet and EinsumNet+LRS---for $100$ epochs over the tabular datasets and $50$ epochs over the image datasets. Tab.~\ref{tab:high-dim-test-ll} reports the mean test log-likelihood achieved by both models on these datasets. One can clearly see that EinsumNet+LRS consistently achieves better performance than EinsumNet on all $6$ datasets. We also analyzed how these tractable models compare against deep models that are expressive but not quite tractable, in Tab.~\ref{tab:high-dim-test-ll} (values taken from \cite{papamakarios2017masked}). Specifically, we considered MADE \citep{germain2015made}, which is a deep autoregressive generative model, and two classic normalizing flow models, RealNVP \citep{dinh2017density} and MAF \citep{papamakarios2017masked}.  Interestingly, EinsumNet+LRS achieves performance similar to that of RealNVP on the GAS dataset and even performs better than the deep models on the POWER and MNIST datasets. We would like to add that even though these deep models may not be the current state of the art, they are still models that trade off tractability for expressivity. These results clearly suggest that integrating NFs as leaf distributions within PCs can help achieve the best of both worlds and bridge the gap between tractable and expressive models.

\subsection{Sample Generation}
\begin{figure}
    \centering
    \begin{tabular}{ccc}
    \subfloat[MNIST]{\includegraphics[width=0.32\linewidth]{figures/samples/cond_gen_mnist_lrs-2.png}}
    \subfloat[Fashion-MNIST]{ \includegraphics[width=0.32\linewidth]{figures/samples/cond_gen_fmnist_lrs.png}}
    \subfloat[Helix]{\includegraphics[width=0.32\linewidth]{figures/samples/cond_gen_helix_lrs.png}}
    \end{tabular}
    \caption{Generating samples from conditional distributions using \emph{EinsumNet+LRS}.}
    \label{fig:cond_gen}
\end{figure}

The ability to sample new data points from the underlying distribution is one of the key desirable features of any generative model. To understand how the integration of flows affects the quality of sample generation, we show samples generated randomly from an EinsumNet and EinsumNet+LRS trained on the MNIST dataset in Fig.~\ref{fig:unconditional-samples}. As one can see, EinsumNet+LRS is able to generate samples of higher fidelity than EinsumNet. 
Further, as probabilistic flow circuits are tractable models, one can utilize their tractability to gain better control over the data generation process. This can be helpful in many real-world scenarios, for example in the case of missing data or generating data from regions having certain properties. In Fig.~\ref{fig:cond_gen} (a) and (b), we occlude the top half of images taken from MNIST and Fashion-MNIST datasets, and use an EinsumNet+LRS to generate samples from the conditional distribution over the occluded pixels given the non-occluded pixels. Similarly, in Fig.~\ref{fig:cond_gen} (c), we used an EinsumNet+LRS trained on the $3$D Helix dataset to generate datapoints having the property that their projection to the $XY$ plane equals the black arc plotted in the figure. Note that due to the lack of tractability, controlled sample generation like this is not possible for other deep generative models such as GANs, VAEs, or NFs.

\subsection{Ablation Study}
We also conducted ablation experiments to study how the expressivity of a probabilistic flow circuit varies when the underlying PC is made more complex. To this end, we considered $K_s =\{1, 2, 4, 8, 16, 32 \}$ and trained EinsumNet and  EinsumNet+LRS on the MNIST dataset by setting the structure parameters $k=r=K$ for each $K \in K_s$.

The test log-likelihood of the models for each parameter setting is shown in Fig.~\ref{fig:ablation-mnist}. We observe that, irrespective of the complexity of the underlying PC, the integration of flows helps better model the data. EinsumNet+LRS performs well even when $K=1$. This suggests that the expressivity of a probabilistic flow circuit is not tightly coupled with the complexity of the PC.
\begin{SCfigure}[][t!]
    \includegraphics[width=0.6\linewidth]{figures/ablation/ablation_MNIST.pdf}
    \caption{Ablation study demonstrating how the effectiveness of \emph{EinsumNet+LRS} scales as we increase the expressivity of the base PC on the MNIST dataset.}
    \label{fig:ablation-mnist}
\end{SCfigure}

\section{Summary \& Future Work}
We presented a theoretically grounded approach to construct deep and tractable generative models using probabilistic circuits and normalizing flows. We empirically validated its added expressivity and tractability in modeling complex data distributions. 
In many real-world scenarios, tractability might be of interest only for a subset of random variables. In such cases, defining multivariate leaf flows over the remaining variables can help further the expressivity while retaining only the tractability \emph{needed} for the task, which we leave for future work. Our work thus lays the formalisms and foundations that can enable seamless interpolation on the tractability-expressivity spectrum. It also paves the way towards compositional learning, where hybrid data distributions modeled via normalizing flows can now be integrated as leaves in a unified probabilistic circuit.

\begin{acknowledgements}     
    \emph{SS} and \emph{SN} acknowledge the support by the U. S. Army Research Laboratory and the U. S. Army Research Office (ARO) under grant number W911NF2010224.
    
    \emph{KK} acknowledges the support by the Hessian Ministry of Higher Education, Research, Science and the Arts (HMWK; projects “The Third Wave of AI” and “The Adaptive Mind”), and the Hessian research priority programme LOEWE within the project “WhiteBox”.
\end{acknowledgements}
 
% References
\bibliography{sidheekh_526}
% \clearpage

% \appendix
% \section{PROOFS}

% \subsection{Lemma \ref{lemma:tau-decomposbility-necessary}}
% \emph{
% $\uptau$-decomposability is a necessary condition for $\mathcal{C_{SPTN}}$ to be decomposable.}

% \begin{proof}
%     Let $ \mathcal{C_{SPTN}}$ be a decomposable sum-product transform the network. $\implies \ \forall \mathcal{P} \in \mathcal{C_{SPTN}}, \mathcal{P}$ is decomposable $\implies \ \forall \mathcal{N}_i,\mathcal{N}_j \in ch(\mathcal{P}), i \ne j, \psi_{\mathcal{N}_i} \cap \psi_{\mathcal{N}_j} = \emptyset $. 
% Now, let $\mathcal{T} \in \mathcal{C_{SPTN}}$ be a transform node ( and $g$ be its associated transformation) that is not $\uptau$-decomposable. i.e. when defined over a product node $\mathcal{P}$, there exists at least one pair $\psi_{i^{'}}, \psi_{j^{'}} \in \{\psi_{{\mathcal{N}}_i}\}_{\mathcal{N}_i \in ch(\mathcal{P})}, i^{'} \ne j^{'} $, such that for $\boldsymbol{x} \in \R^{|\psi_P|}$ and $\boldsymbol{y}=g(\boldsymbol{x}), \boldsymbol{y}_{\psi_{i^{'}}} \not\perp \boldsymbol{x}_{\psi_{j^{'}}} \implies \Pi_{\psi_{i^{'}}}(\boldsymbol{y}) = f(\boldsymbol{x}_{\psi_{j^{'}}})$ for some function $f$. 
% Now, we have,

% \begin{align*}
% \mathcal{T}(\mathcal{P(\boldsymbol{x})}) &= \mathcal{P}(g(\boldsymbol{x}))|\det J_g|\\
% &= \prod_{N_i \in ch(\mathcal{P})} \mathcal{N}_i(\Pi_{\psi_{\mathcal{N}_i}}(g(\boldsymbol{x})))|\det J_g|
% \end{align*}
% Thus the child $\mathcal{N}_{i^{'}}$ of $\mathcal{P}$ computes a function over $\boldsymbol{x}_{\psi_{j^{'}}}  \implies \psi_{N_{i^{'}}} \supset \psi_{N_{j^{'}}} \implies \psi_{N_{i^{'}}} \cap \psi_{N_{j^{'}}} \not= \emptyset \implies \mathcal{P}$ is not decomposable, thus resulting in a contradiction. Thus, for a sum product transform network $ \mathcal{C_{SPTN}}$ to be decomposable, all transform nodes must be $\uptau$-decomposable. 

% \end{proof}

% \subsection{Lemma \ref{lemma:pfc-tractability}}
% \emph{
%     A $\mathcal{C}_{\mathcal{F}}$ with leaf distributions defined using $g_{lrs}$ transformations and Student's-t distribution of order $3$ as the base distribution is a tractable model for (a) evidential inference if $\mathcal{C}_{\mathcal{F}}$ is smooth, (b) Marginal and conditional inference if $\mathcal{C}_{\mathcal{F}}$ is smooth and decomposable, (c) MAP inference if $\mathcal{C}_{\mathcal{F}}$ is smooth, decomposable, and deterministic.
% }
% \begin{proof}
%     % The tractable computation of evidential queries require that the leaf nodes compute a valid probability density over its scope.  
%     The tractability of evidential, marginal and conditional inference for $\mathcal{C}_{\mathcal{F}}$ follows trivially from the fact that $\mathcal{C}_{\mathcal{F}}$ inherits the tractability offered by the circuit properties of a PC under the structural constraints of smoothness and decomposability. We elaborate on this further below.
    
%     (a) \textbf{Evidential inference:} In order to tractably perform evidential inference, $\mathcal{C}_{\mathcal{F}}$ requires that the leaf nodes compute a valid probability density over its scope. A normalizing flow supports exact density evaluation using the change of variables formula and hence enables tractable evidential inference. Smoothness of $\mathcal{C}_{\mathcal{F}}$ further ensures that its sum nodes compute valid mixture densities. However, note that smoothness is not a necessary condition, as a non smooth PC can, in polynomial time, be converted to a smooth PC (\cite{ProbCirc20}).
    
%     (b) \textbf{Marginal and Conditional inference:} For a smooth and decomposable $\mathcal{C}_{\mathcal{F}}$, marginalizing out a variable $X_i$ from its modeled density reduces to marginalizing out the corresponding leaf distribution. This is because marginalization of $X_i$ involves integrating the model density over $\emph{val}(X_i)$, and as proved in \cite{ProbCirc20}, the integral over the circuit reduces to integrals over the leaf distributions having $X_i$ in their scope, when the circuit is smooth and decomposable. Note that each leaf nodes in $\mathcal{C}_{\mathcal{F}}$ represents a probability distribution over a single variable and marginalizing it out is equivalent to setting the corresponding leaf density to 1. Thus, $\mathcal{C}_{\mathcal{F}}$ supports tractable marginal inference. Also, the tractability of conditional inference naturally follows from the tractability of evidential and marginal inference.
    
%      (c) \textbf{MAP inference:} Along the same lines, computation of MAP queries for $\mathcal{C}_{\mathcal{F}}$ reduces to computing argmax over leaf densities if $\mathcal{C}_{\mathcal{F}}$ is smooth, decomposable and deterministic \cite{ProbCirc20}. Thus, if we can compute the mode of the distribution modeled by the leaf nodes, we can ensure tractability for MAP inference. 
%      For $ x \in [ x^i, x^{i+1} ]$, let $\phi=\frac{(x-x^{i})}{x^{i+1} - x^i}$, so that $\phi \in [0,1]$, and let $g$ denote the linear rational spline transformation associated with the bin, which has the form $g(\phi)  = \frac{q(\phi)}{r(\phi)} = \frac{a_1 \phi + b_1}{a_2 \phi + b_2}$. Let $S_t$ denote a Student's-t distribution with $3$ degrees of freedom. 
%      The pdf of a Student's-t distribution with $\nu$ degrees of freedom is given by:
%  \begin{equation*}
%  p(x;\nu) = \frac{\Gamma\left(\frac{\nu+1}{2}\right)}{
%    \sqrt{\nu\pi}\,\Gamma\left(\frac{\nu}{2}\right)}
%    \left(1+\frac{x^2}{\nu}\right)^{-(\frac{\nu+1}{2})}
% \end{equation*}
%      We have,
%     \begin{align*}
%         p(x) &= S_t(g(\phi)).|\frac{\partial{g(\phi)}}{\partial{x}}|\\
%              &= \dfrac{1}{(x^{i+1}-x^i)} S_t(g(\phi)).|\frac{\partial{g(\phi)}}{\partial{\phi}}|\\
%              &= C_1 S_t\left(\frac{q(\phi)}{r(\phi)}\right)(r(\phi))^{-2}\\
%              &= \dfrac{ C_1\Gamma(2)}{\sqrt{3\pi}\Gamma(\frac{3}{2})}\left[1+\frac{1}{3}\left(\frac{q(\phi)}{r(\phi)}\right)^2\right]^{-2} (r(\phi))^{-2} \\
%      %       &= C_1 C_2 \left[r(x)+\frac{1}{3}\frac{q(x)^2}{r(x)}\right]^{-2}\\
%             &= C_2 \left[3r(\phi)+ q(\phi)g(\phi) \right]^{-2} 
%     \end{align*}
%     Where, $C_1, C_2$ are constants. Thus, we have $ \log p(x) = \log C_2 -2 \log (3r(\phi)+ q(\phi)g(\phi))$. Now, $\log p(x)$ is maximized when $ f(\phi) = 3r(\phi)+ q(\phi)g(\phi)$ is minimized. 
%     % We will show that $f$ is monotonic for $\phi \in [0,1]$ (i.e. for $x \in [x^{i}, x^{i+1}]$), or equivalently the derivative of $f$ does not change its sign.
%     % We have
%     % \begin{align*}
%     %     \frac{\partial{f(\phi)}}{\partial{\phi}} &= 3a_2 + a_1 g(\phi) + q(\phi) \frac{\partial{g(\phi)}}{\partial{\phi}}\\
%     %     &= 3a_2 + a_1 \dfrac{q(\phi)}{r(\phi)} + q(\phi) \frac{\partial{g(\phi)}}{\partial{\phi}}
%     % \end{align*}
%     Differentiating and equating to zero, we have,
%     \begin{align*}
%         & 3r'(\phi)+ q'(\phi)g(\phi) + q(\phi)g'(\phi) = 0 \\
%         % & \implies 3a_2 + a_1 g(\phi) + q(\phi) \frac{C_1}{r(\phi)^2} = 0 \\   
%         & \implies 3a_2r(\phi)^2 + a_1 q(\phi)r(\phi) + (b_2a_1 - a_2b_1)q(\phi) = 0 \\   
%     \end{align*}
%     Note that as $q(\phi), r(\phi)$ are linear in $\phi$,  the above equation is quadratic in $\phi$. Thus, we can check if any of its real roots lie within the interval $ [0, 1]$. If it does, then the maximum density within that bin is given by the density at the root. If not, then the maximum occurs at either of the interval boundaries. Thus, we can compute the maximum within each bin analytically. The maximum density across all the bins gives the mode of the distribution.   
%     % Note that all $q(x), r(x)$ are linear functions of $x$, and the above equation is quadratic is $x$. Thus we can check if it has any real roots within the interval [$ x^i, x^{i+1} $]. If it does, then the maximum density within that bin is given by the density at the root. If not, then the maximum occurs at either of the interval boundaries. Thus, we can compute the maximum within each bin analytically and the maximum over all the bins gives the mode of the distribution.
% \end{proof}

% \section{Results}
% \begin{figure*}
%     \centering
%     \includegraphics[width=0.3\linewidth]{figures/learning/HELIX.pdf}
%     \includegraphics[width=0.3\linewidth]{figures/learning/DisjointCIRCLES.pdf}
%     \includegraphics[width=0.3\linewidth]{figures/learning/InterlockedCIRCLES.pdf}
%     \includegraphics[width=0.3\linewidth]{figures/learning/KNOTTED.pdf}
%     \includegraphics[width=0.3\linewidth]{figures/learning/TwistedEIGHT.pdf}
%     \includegraphics[width=0.3\linewidth]{figures/learning/BentLISSAJOUS.pdf}
%     \caption{\textbf{Learning curves} of the three models - (a)\emph{ Einsum Networks} (b)\emph{ Einsum Networks with Invertible Affine} transformations at the leaves (SPTN) and (c) \emph{Einsum Networks with Spline} transformations at the leaves (ours) on the \textbf{synthetic 3D datasets}, in terms of average log-likelihood (\textbf{\textit{higher the better}}) on the validation set across training epochs. The shaded regions depict the standard deviation across 5 independent trials.}
% \end{figure*}

\end{document}


PROOF OF LEMMA 2

Let $ \mathcal{C_{SPTN}}$ be a decomposable sum-product transform network. $\implies \ \forall \mathcal{P} \in \mathcal{C_{SPTN}}, \mathcal{P}$ is decomposable $\implies \ \forall \mathcal{N}_i,\mathcal{N}_j \in ch(\mathcal{P}), i \ne j, \psi_{\mathcal{N}_i} \cap \psi_{\mathcal{N}_j} = \emptyset $. 
Now, let $\mathcal{T} \in \mathcal{C_{SPTN}}$ be a transform node (and $g$ be its associated transformation) that is not $\uptau$-decomposable, i.e., when defined over a product node $\mathcal{P}$, there exists at least one pair $\psi_{i^{'}}, \psi_{j^{'}} \in \{\psi_{{\mathcal{N}}_i}\}_{\mathcal{N}_i \in ch(\mathcal{P})}, i^{'} \ne j^{'} $, such that for $\boldsymbol{x} \in \R^{|\psi_P|}$ and $\boldsymbol{y}=g(\boldsymbol{x}), \boldsymbol{y}_{\psi_{i^{'}}} \not\perp \boldsymbol{x}_{\psi_{j^{'}}} \implies \Pi_{\psi_{i^{'}}}(\boldsymbol{y}) = f(\boldsymbol{x}_{\psi_{j^{'}}})$ for some function $f$. 
Now, we have,

\begin{align*}
\mathcal{T}(\mathcal{P}(\boldsymbol{x})) &= \mathcal{P}(g(\boldsymbol{x}))|\det J_g|\\
&= \prod_{\mathcal{N}_i \in ch(\mathcal{P})} \mathcal{N}_i(\Pi_{\psi_{\mathcal{N}_i}}(g(\boldsymbol{x})))|\det J_g|
\end{align*}
Thus the child $\mathcal{N}_{i^{'}}$ of $\mathcal{P}$ computes a function over $\boldsymbol{x}_{\psi_{j^{'}}}  \implies \psi_{\mathcal{N}_{i^{'}}} \supset \psi_{\mathcal{N}_{j^{'}}} \implies \psi_{\mathcal{N}_{i^{'}}} \cap \psi_{\mathcal{N}_{j^{'}}} \neq \emptyset \implies \mathcal{P}$ is not decomposable, thus resulting in a contradiction. Thus, for a sum-product transform network $ \mathcal{C_{SPTN}}$ to be decomposable, all transform nodes must be $\uptau$-decomposable. 