% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023}  % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{subfig}
\usepackage{caption}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{shapes.geometric}
\usetikzlibrary {arrows.meta}
\usepackage{bbm}
\usepackage{derivative}
\usepackage{upgreek}
\usepackage{amssymb,amsmath, amsthm}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newtheorem{definition}{Definition} 
\newtheorem{property}{P.}  
\newtheorem{query}{Q.} 
\newtheorem*{remark}{Remark} 
\newtheorem{lemma}{Lemma} 
\newtheorem{theorem}{Theorem}
\newtheorem*{corollary}{Corollary}  

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Probabilistic Flow Circuits:\\ Towards Unified Deep Models for Tractable Probabilistic Inference \\ Supplementary Material }

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:sahil.sidheekh@utdallas.edu?Subject=Your UAI 2023 paper}{Sahil Sidheekh}{}}
\author[2,3]{Kristian Kersting}
\author[1]{Sriraam Natarajan}

% Add affiliations after the authors
\affil[1]{%
    Erik Jonsson School of Engineering \& Computer Science\\
    The University of Texas at Dallas
}
\affil[2]{%
    Department of Computer Science \\
    TU Darmstadt
}
\affil[3]{
Centre for Cognitive Science, TU Darmstadt, and Hessian Center for AI 
}



\begin{document}
\maketitle

In this supplementary material, we furnish further details pertaining to the theory and implementation that were left out of the main paper due to space constraints.

% \section{Theoretical Results}
\section{Theoretical Results}
\textbf{Lemma 2.} \textit{
$\uptau$-decomposability is a necessary condition for an SPTN to be decomposable.}

\begin{proof}
    Let $ \mathcal{C_{SPTN}}$ be a decomposable sum-product transform network. Then every product node $\mathcal{P} \in \mathcal{C_{SPTN}}$ is decomposable, i.e., $\forall \mathcal{N}_i,\mathcal{N}_j \in ch(\mathcal{P}), i \ne j, \psi_{\mathcal{N}_i} \cap \psi_{\mathcal{N}_j} = \emptyset $.
Now, let $\mathcal{T} \in \mathcal{C_{SPTN}}$ be a transform node (and $g$ be its associated transformation) that is not $\uptau$-decomposable, i.e., when defined over a product node $\mathcal{P}$, there exists at least one pair $\psi_{i^{'}}, \psi_{j^{'}} \in \{\psi_{{\mathcal{N}}_i}\}_{\mathcal{N}_i \in ch(\mathcal{P})}, i^{'} \ne j^{'} $, such that for $\boldsymbol{x} \in \R^{|\psi_{\mathcal{P}}|}$ and $\boldsymbol{y}=g(\boldsymbol{x}), \boldsymbol{y}_{\psi_{i^{'}}} \not\perp \boldsymbol{x}_{\psi_{j^{'}}} \implies \Pi_{\psi_{i^{'}}}(\boldsymbol{y}) = f(\boldsymbol{x}_{\psi_{j^{'}}})$ for some function $f$.
Thus, we have,

\begin{align*}
\mathcal{T}(\mathcal{P}(\boldsymbol{x})) &= \mathcal{P}(g(\boldsymbol{x}))|\det J_g|\\
&= \prod_{\mathcal{N}_i \in ch(\mathcal{P})} \mathcal{N}_i(\Pi_{\psi_{\mathcal{N}_i}}(g(\boldsymbol{x})))|\det J_g|.
\end{align*}
Thus the child $\mathcal{N}_{i^{'}}$ of $\mathcal{P}$ computes a function over $\boldsymbol{x}_{\psi_{j^{'}}}  \implies \psi_{\mathcal{N}_{i^{'}}} \supset \psi_{\mathcal{N}_{j^{'}}} \implies \psi_{\mathcal{N}_{i^{'}}} \cap \psi_{\mathcal{N}_{j^{'}}} \neq \emptyset \implies \mathcal{P}$ is not decomposable, thus resulting in a contradiction. Thus, for a sum-product transform network $ \mathcal{C_{SPTN}}$ to be decomposable, all transform nodes must be $\uptau$-decomposable.

\end{proof}

\textbf{Lemma 3.} \textit{
    A PFC ($\mathcal{C}_{\mathcal{F}}$) with leaf distributions defined using $g_{lrs}$ transformations and a Student's-t distribution with $\nu = 3$ as the base distribution is a tractable model for (a) evidential inference if $\mathcal{C}_{\mathcal{F}}$ is smooth, (b) Marginal and conditional inference if $\mathcal{C}_{\mathcal{F}}$ is smooth and decomposable, (c) MAP inference if $\mathcal{C}_{\mathcal{F}}$ is smooth, decomposable, and deterministic.
}
\begin{proof}
    % The tractable computation of evidential queries require that the leaf nodes compute a valid probability density over its scope.  
    The tractability of evidential, marginal and conditional inference for $\mathcal{C}_{\mathcal{F}}$ follows trivially from the fact that $\mathcal{C}_{\mathcal{F}}$ is a probabilistic circuit and hence inherits the tractability offered by the circuit properties of a PC under the structural constraints of smoothness and decomposability.
    We elaborate this further below.
    
    (a) \textbf{Evidential inference:} In order to tractably perform evidential inference, $\mathcal{C}_{\mathcal{F}}$ requires that the leaf nodes compute a valid probability density over their scope. A normalizing flow supports exact density evaluation using the change of variables formula and hence enables tractable evidential inference. Smoothness of $\mathcal{C}_{\mathcal{F}}$ further ensures that its sum nodes compute valid mixture densities. However, note that smoothness is not a necessary condition, as a non-smooth PC can, in polynomial time, be converted to a smooth PC \citep{ProbCirc20}.
    
    (b) \textbf{Marginal and Conditional inference:} For a smooth and decomposable $\mathcal{C}_{\mathcal{F}}$, marginalizing out a variable $X_i$ from its modeled density reduces to marginalizing out the corresponding leaf distribution. This is because marginalization of $X_i$ involves integrating the model density over $\boldsymbol{val}(X_i)$, and as proved in \cite{ProbCirc20}, the integral over the circuit reduces to integrals over the leaf distributions having $X_i$ in their scope, when the circuit is smooth and decomposable. Note that each leaf node in $\mathcal{C}_{\mathcal{F}}$ represents a probability distribution over a single variable and marginalizing it out is equivalent to setting the corresponding leaf density to 1. Thus, $\mathcal{C}_{\mathcal{F}}$ supports tractable marginal inference. Also, the tractability of conditional inference naturally follows from the tractability of evidential and marginal inference.
    
     (c) \textbf{MAP inference:} Along the same lines, computation of MAP queries for $\mathcal{C}_{\mathcal{F}}$ reduces to computing argmax over leaf densities if $\mathcal{C}_{\mathcal{F}}$ is smooth, decomposable and deterministic \citep{ProbCirc20}. Thus, if we can compute the mode of the distribution modeled by the leaf nodes, we can ensure tractability for MAP inference.
     For $ x \in [ x^i, x^{i+1} ]$, let $\phi=\frac{(x-x^{i})}{x^{i+1} - x^i} \in [0,1]$, and  let $g$ denote the linear rational spline transformation associated with the bin, which has the form $g(\phi)  = \frac{q(\phi)}{r(\phi)} = \frac{a_1 \phi + b_1}{a_2 \phi + b_2}$. Let $S_t$ denote a Student's-t distribution with $3$ degrees of freedom.
     The pdf of a Student's-t distribution with $\nu$ degrees of freedom is given by:
 \begin{equation*}
 p(x;\nu) = \frac{\Gamma\left(\frac{\nu+1}{2}\right)}{
   \sqrt{\nu\pi}\,\Gamma\left(\frac{\nu}{2}\right)}
   \left(1+\frac{x^2}{\nu}\right)^{-(\frac{\nu+1}{2})}
\end{equation*}
     Thus, we have the following form for the density modeled at the leaf distributions,
    \begin{align*}
        p(x) &= S_t(g(\phi)) \cdot \left|\frac{\partial{g(\phi)}}{\partial{x}}\right|\\
             &= \dfrac{1}{(x^{i+1}-x^i)} S_t(g(\phi)) \cdot \left|\frac{\partial{g(\phi)}}{\partial{\phi}}\right|\\
             &= C_1 S_t\left(\frac{q(\phi)}{r(\phi)}\right)(r(\phi))^{-2}\\
             &= \dfrac{ C_1\Gamma(2)}{\sqrt{3\pi}\Gamma(\frac{3}{2})}\left[1+\frac{1}{3}\left(\frac{q(\phi)}{r(\phi)}\right)^2\right]^{-2} (r(\phi))^{-2} \\
     %       &= C_1 C_2 \left[r(x)+\frac{1}{3}\frac{q(x)^2}{r(x)}\right]^{-2}\\
            &= C_2 \left[3r(\phi)+ q(\phi)g(\phi) \right]^{-2} 
    \end{align*}
    where $C_1, C_2$ are constants. Thus, we have $ \log p(x) = \log C_2 -2 \log (3r(\phi)+ q(\phi)g(\phi))$. Now, $\log p(x)$ is maximized when $ f(\phi) = 3r(\phi)+ q(\phi)g(\phi)$ is minimized.
    % We will show that $f$ is monotonic for $\phi \in [0,1]$ (i.e. for $x \in [x^{i}, x^{i+1}]$), or equivalently the derivative of $f$ does not change its sign.
    % We have
    % \begin{align*}
    %     \frac{\partial{f(\phi)}}{\partial{\phi}} &= 3a_2 + a_1 g(\phi) + q(\phi) \frac{\partial{g(\phi)}}{\partial{\phi}}\\
    %     &= 3a_2 + a_1 \dfrac{q(\phi)}{r(\phi)} + q(\phi) \frac{\partial{g(\phi)}}{\partial{\phi}}
    % \end{align*}
    Differentiating and equating to zero, we have,
    \begin{align*}
        & 3r'(\phi)+ q'(\phi)g(\phi) + q(\phi)g'(\phi) = 0 \\
        % & \implies 3a_2 + a_1 g(\phi) + q(\phi) \frac{C_1}{r(\phi)^2} = 0 \\   
        & \implies 3a_2r(\phi)^2 + a_1 q(\phi)r(\phi) + (b_2a_1 - a_2b_1)q(\phi) = 0
    \end{align*}
    Note that as $q(\phi), r(\phi)$ are linear in $\phi$, the above equation is quadratic in $\phi$. Thus, we can check if any of its real roots lie within the interval $ [0, 1]$. If any do, then the maximum density within that bin is the largest of the densities at those roots and at the interval boundaries. If not, then the maximum occurs at one of the interval boundaries. Thus, we can compute the maximum within each bin analytically. The maximum density across all the bins gives the mode of the distribution.
    
    Note that the above analysis also extends to compositions of LRS transformations, as a composition of linear rational functions is a linear rational function, i.e., for $g_1(\phi)  = \frac{a_1 \phi + b_1}{a_2 \phi + b_2}$ and $g_2(\phi) = \frac{c_1 \phi + d_1}{c_2 \phi + d_2}$, we have,
    \begin{align*}
        g_2(g_1(\phi)) &= \frac{c_1 \left(\frac{a_1 \phi + b_1}{a_2 \phi + b_2} \right) + d_1}{c_2 \left( \frac{a_1 \phi + b_1}{a_2 \phi + b_2} \right) + d_2}\\
        &= \frac{(c_1 a_1 + d_1 a_2)\phi + c_1 b_1 + d_1 b_2}{(c_2 a_1 + d_2 a_2)\phi + c_2 b_1 + d_2 b_2}
    \end{align*}
    is also a linear rational function.
    % Note that all $q(x), r(x)$ are linear functions of $x$, and the above equation is quadratic is $x$. Thus we can check if it has any real roots within the interval [$ x^i, x^{i+1} $]. If it does, then the maximum density within that bin is given by the density at the root. If not, then the maximum occurs at either of the interval boundaries. Thus, we can compute the maximum within each bin analytically and the maximum over all the bins gives the mode of the distribution.
\end{proof}

\section{Implementation Details}
We implemented our code using PyTorch, adapting from \citet{peharz_20_einsum}. We used the Pyro probabilistic programming package \citep{bingham2019pyro}, which is built on top of PyTorch, to implement the linear rational spline transformations. The hyperparameters defining the structure of the PC in einsum networks \citep{peharz_20_einsum} are the depth ($D$) of the circuit, the number of vector components ($K$) (i.e., the number of leaf distributions per variable and the number of sum nodes in an einsum-layer) and the number of replicas ($R$). We use the same PC architecture for both the EinsumNet and EinsumNet+LRS. There are two hyperparameters associated with the linear rational spline transformation: the number of intervals ($I$) and the bounds ($B$) within which it is defined. Outside of $[-B,B]$ the transformation is defined to be the identity. For a dataset with $n$ features, we set depth $D = \max(1, \lfloor \log_2(n) \rfloor)$. We use end-to-end backpropagation and train all our models using an Adam optimizer, with a learning rate of $10^{-3}$. Further dataset-specific details are summarized below:

\textbf{3D Manifold Data} We sampled $20{,}000$ data points for each of the $6$ 3D datasets, $10{,}000$ of which we used for training and $5{,}000$ each for validation and testing. We used $K=10,R=10$ as the underlying PC structure for each model on the 3D datasets. We used a single linear rational spline transformation with $I=16,B=20$ at the leaves of EinsumNet+LRS. We used a batch size of $100$ and trained all models for $200$ epochs. The learning curves of the models on the 3D datasets that were left out of the main paper are given in Figure~\ref{fig:learning-curve-all}.

\textbf{UCI Tabular Datasets} We used $K=20,R=20$ as the underlying PC structure for all models on the tabular datasets. We used a single linear rational spline transformation with $I=16,B=16$ at the leaves of EinsumNet+LRS. We used a batch size of $200$ for the GAS, MINIBOONE and HEPMASS datasets, and a batch size of $500$ for the POWER dataset. We trained all models for $100$ epochs, stopping early if there was no improvement in the validation performance for over $5$ epochs. The details regarding the number of features and the number of data points within each split of the $4$ UCI tabular datasets considered can be found in \citet{papamakarios2017masked}. We also followed the same preprocessing for the datasets as given by \citet{papamakarios2017masked}.

\textbf{Image Datasets} We used the PD structure \citep{poon2011spn,peharz_20_einsum} with $\triangle=[7, 28]$ and $K=20,R=20$ to define the underlying PC structure for all the models on the two image datasets - MNIST and Fashion-MNIST. We used two linear rational spline transformations with $I=16,B=16$ at the leaves of EinsumNet+LRS.  We used a batch size of $100$ and trained all models for $50$ epochs. As we consider continuous leaf distributions, we applied the logit transformations as done in \cite{papamakarios2017masked} to make the data continuous.


\begin{figure*}[h!]
    \centering
    \includegraphics[width=0.3\linewidth]{figures/learning/HELIX.pdf}
    \includegraphics[width=0.3\linewidth]{figures/learning/DisjointCIRCLES.pdf}
    \includegraphics[width=0.3\linewidth]{figures/learning/InterlockedCIRCLES.pdf}
    \includegraphics[width=0.3\linewidth]{figures/learning/KNOTTED.pdf}
    \includegraphics[width=0.3\linewidth]{figures/learning/TwistedEIGHT.pdf}
    \includegraphics[width=0.3\linewidth]{figures/learning/BentLISSAJOUS.pdf}
    \caption{\textbf{Learning curves} of (a) \emph{Einsum Network}, (b) \emph{Einsum Network + Affine} transformations at the leaves and (c) \emph{Einsum Network + LRS} transformations at the leaves on the \textbf{3D datasets}, in terms of average log-likelihood (\textbf{\textit{higher is better}}) on the validation set across training epochs. The shaded regions depict the standard deviation across 3 independent trials. We can observe that \emph{Einsum Network + LRS} achieves superior performance much faster than the other two models on all datasets.}
    \label{fig:learning-curve-all}
\end{figure*}


\begin{figure*}[h!]
    \centering
    \includegraphics[width=0.23\linewidth]{figures/datasets/3d/HELIX_GT.png}
    \includegraphics[width=0.23\linewidth]{figures/samples/lrs-cond-helix/cond_3.png}
    \includegraphics[width=0.23\linewidth]{figures/samples/lrs-cond-helix/cond_1.png}
    \includegraphics[width=0.23\linewidth]{figures/samples/lrs-cond-helix/cond_2.png}
    \caption{Controlled Sample Generation Using an EinsumNet+LRS trained on the Helix dataset. The tractability of EinsumNet+LRS allows generating data with certain properties, e.g., the second, third and fourth subfigures show data generated such that its projection onto the XY plane is the black curve plotted.}
    \label{fig:cond-samples}
\end{figure*}

% References
\bibliography{sidheekh_526}
\end{document}
