%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised version; also before submission to see how the non-anonymous paper would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer Modern (has noticable issues) \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon ptmx; less tested, no support) NOTE: Only keep *one* line above as appropriate, as it will be replaced automatically for papers to be published. Do not make any other change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsmath,amssymb} \usepackage[capitalise]{cleveref}
\usepackage{caption} \usepackage{subcaption}

% for cross referencing the main text PLEASE ONLY USE xr IN THE SUPPLEMENTARY
% MATERIAL. In the main paper, hard code any cross-reference to the
% supplementary material.
\usepackage{xr}

% Helper code that announces external files to latexmk as dependencies.
\makeatletter
% \addFileDependency{<file name with extension>}
%   Writes "(<file>)" to the log, records the file via \@addtofilelist, and
%   prints "No file <file>." when the file does not exist (yet).
%   Every line of the definition ends in % so that no stray space tokens
%   become part of the macro body.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}% latexmk will find this if $recorder=0
  % (however, in that case it will ignore #1 if it is a .aux or .pdf file etc.
  % and it exists; if it doesn't exist, it will appear in the list of
  % dependents regardless)
  %
  % Write the following if you want it to appear in \listfiles --- although
  % not really necessary and latexmk doesn't use this
  \@addtofilelist{#1}%
  % latexmk will find this message if #1 doesn't exist (yet)
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<base name without extension>}
%   Imports the labels of the external document through xr's
%   \externaldocument and registers both <base>.tex and <base>.aux as
%   dependencies.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}
% ------------End of helper code--------------

% put all the external documents here!
\myexternaldocument{ventola_118}
% \externaldocument{main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
% redefining \footnotesize, we provide the original \footnotesize using this
% macro. (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Probabilistic Circuits That Know What They Don't
  Know\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide more space for
% long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.


\usepackage[capitalise]{cleveref}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{pgfplots}

\usepackage{pifont}% http://ctan.org/pkg/pifont
% \cmark / \xmark: pifont dingbats 51 and 55 (presumably a check mark and a
% cross, judging by the macro names), coloured with tab10green / tab10red and
% followed by 0.5em of horizontal space. Both colours are defined further down
% in the preamble, which works because they are only looked up when the macro
% is used. The trailing % keeps the line end from adding a space.
\newcommand{\cmark}{{\color{tab10green}\ding{51}}\hspace{0.5em}}%
\newcommand{\xmark}{{\color{tab10red}\ding{55}}\hspace{0.5em}}%

% TODONOTES
\usepackage{xargs}
%\usepackage[colorinlistoftodos,textsize=normalsize,disable]{todonotes} % Disabled
\usepackage[colorinlistoftodos,textsize=normalsize]{todonotes} % Enabled
%\setlength{\marginparwidth}{1.7cm}  % TODO: comment out when todonotes is disabled
% Coloured wrappers around \todo from todonotes. Each macro takes an optional
% first argument (default: empty) that is appended to the \todo options, and a
% mandatory second argument with the note text, which is prefixed by a fixed
% tag (TODO, UNSURE, CHANGE, INFO, IMPROVEMENT).
% NOTE(review): the line break after the opening brace of the \todo argument
% is not protected by %, so the note text presumably starts with a space.
\newcommandx{\todoc}[2][1=]{{\todo[linecolor=orange,backgroundcolor=orange!25,bordercolor=orange,#1]{
      TODO: #2}}}
\newcommandx{\unsure}[2][1=]{{\todo[linecolor=yellow,backgroundcolor=yellow!25,bordercolor=yellow,#1]{
      UNSURE: #2}}}
\newcommandx{\change}[2][1=]{{\todo[linecolor=blue,backgroundcolor=blue!25,bordercolor=blue,#1]{
      CHANGE: #2}}}
\newcommandx{\info}[2][1=]{{\todo[linecolor=green,backgroundcolor=green!25,bordercolor=green,#1]{
      INFO: #2}}}
\newcommandx{\improvement}[2][1=]{{\todo[linecolor=violet,backgroundcolor=violet!25,bordercolor=violet,#1]{
      IMPROVEMENT: #2}}}
% Same interface, but passes the `disable' option to \todo.
\newcommandx{\thiswillnotshow}[2][1=]{{\todo[disable,#1]{THIS WILL NOT SHOW:
      #2}}}

% \usepackage[pdftex]{graphicx}
\usepackage{bm}

% --- For algorithms ---
\usepackage{algorithm} \usepackage[noend]{algpseudocode}
% --- ----

\usepackage{amsmath,amssymb}
% arg max operator; the starred declaration places sub-/superscripts as limits
% in display style.
\DeclareMathOperator*{\argmax}{arg\,max}

% --- For tikz plots ---
\usetikzlibrary{colorbrewer} \usetikzlibrary{positioning}
\usetikzlibrary{shapes.geometric} \usetikzlibrary{arrows}
\usetikzlibrary{arrows.meta} \usetikzlibrary{patterns,decorations.pathreplacing}
\usetikzlibrary{fit} \usetikzlibrary{backgrounds}
\usetikzlibrary{shapes,backgrounds,calc} \usetikzlibrary{shadows}
\usetikzlibrary{patterns} \definecolor{mygray}{RGB}{190,190,190}
% --- ----

% math definitions
% Generic PC node symbol.
\newcommand{\node}{\mathsf{N}}
% "Dropout" node symbols are currently plain aliases of the regular symbols;
% the tilde-decorated alternatives are kept commented out for reference.
%\newcommand{\nodeD}{\widetilde{\node}}
\newcommand{\nodeD}{\node}
\newcommand{\ndi}{\nodeD_i}
\newcommand{\ndj}{\nodeD_j}
% Sum, product, and leaf node symbols.
% NOTE(review): \si may clash with siunitx's \si if that package (commented
% out near the top of the preamble) is ever enabled -- confirm before enabling.
\newcommand{\sumnode}{\mathsf{S}}
\newcommand{\si}{\sumnode_i}
\newcommand{\sj}{\sumnode_j}
\newcommand{\sk}{\sumnode_k}
\newcommand{\prodnode}{\mathsf{P}}
\newcommand{\leafnode}{\mathsf{L}}
%\newcommand{\sumnodeD}{\widetilde{\sumnode}}  % Dropout version with tilde
%\newcommand{\prodnodeD}{\widetilde{\prodnode}}  % Dropout version with tilde
%\newcommand{\leafnodeD}{\widetilde{\leafnode}}  % Dropout version with tilde
\newcommand{\sumnodeD}{\sumnode}  % Dropout version (alias, no tilde)
\newcommand{\prodnodeD}{\prodnode}  % Dropout version (alias, no tilde)
\newcommand{\leafnodeD}{\leafnode}  % Dropout version (alias, no tilde)
\newcommand{\Ks}{K_{S}}
\newcommand{\Kp}{K_{P}}
% Expectation, variance, and covariance with auto-sized brackets.
% The default forms insert \! to tighten the gap before the bracket; the
% "ns" forms omit it. Var/Cov use \operatorname so that they stay upright in
% any surrounding text font and receive operator spacing.
\newcommand{\E}[1]{\mathbb{E}\!\left[#1\right]}
\newcommand{\Esi}{\E{\sumnode_i}}
\newcommand{\Esj}{\E{\sumnode_j}}
\newcommand{\Esk}{\E{\sumnode_k}}
\newcommand{\Ens}[1]{\mathbb{E}\left[#1\right]}  % no negative space
\newcommand{\Var}[1]{\operatorname{Var}\!\left[#1\right]}
\newcommand{\Varns}[1]{\operatorname{Var}\left[#1\right]} % no negative space
\newcommand{\Cov}[1]{\operatorname{Cov}\!\left[#1\right]}
\newcommand{\Covns}[1]{\operatorname{Cov}\left[#1\right]} % no negative space

% Inline alternatives that do not have huge brackets which increase between-line spacing
\newcommand{\Einline}[1]{\mathbb{E}[#1]}
\newcommand{\Varinline}[1]{\operatorname{Var}[#1]}
\newcommand{\Covinline}[1]{\operatorname{Cov}[#1]}



\newcommand{\ci}{c_i}  % Class priors
\newcommand{\cj}{c_j}  % Class priors

% Conditioning bar with thin spaces on both sides.
\newcommand{\cbar}{\,|\,}
% Bold upright symbols: upper case X, Y, Z and lower case x, y, z.
\newcommand{\X}{\mathbf{X}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\Z}{\mathbf{Z}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\z}{\mathbf{z}}

\newcommand{\data}{\mathcal{D}}

\newcommand{\ind}{\perp\!\!\!\!\perp} % Independence symbol
% \fn{f}{x}: typesets f(x) with auto-sized parentheses.
\newcommand{\fn}[2]{#1\! \left( #2 \right)}

\newcommand{\spn}{\mathcal{S}}
\newcommand{\spnfn}[1]{\spn\!\left( #1 \right)}
\newcommand{\scope}[1]{\mathbf{sc}\left(#1\right)}

\newcommand{\PC}{\mathcal{P}}
\newcommand{\graph}{\mathcal{G}}
\newcommand{\tree}{\mathcal{T}}
% \newcommand{\scope}{\psi}
\newcommand{\pa}{\mathbf{pa}}
\newcommand{\ch}{\mathbf{ch}}
\newcommand{\val}{\mathbf{val}}


% The definitions below use \mathbf / \textit instead of the obsolete
% two-letter font switches (\bf, \it), which are only provided for backward
% compatibility by some classes.
\newcommand{\ones}{\mathbf{1}}
\newcommand{\reals}{\mathbf{R}}
\newcommand{\integers}{\mathbf{Z}}
\newcommand{\symm}{\mathbf{S}}  % symmetric matrices

\newcommand{\nullspace}{\mathcal{N}}
\newcommand{\range}{\mathcal{R}}
\newcommand{\Rank}{\mathop{\mathbf{Rank}}}
\newcommand{\Tr}{\mathop{\mathbf{Tr}}}
% \newcommand{\diag}{\mathop{\bf diag}}
\newcommand{\card}{\mathop{\mathbf{card}}}
\newcommand{\rank}{\mathop{\mathbf{rank}}}
\newcommand{\conv}{\mathop{\mathbf{conv}}}
\newcommand{\prox}{\mathbf{prox}}

\newcommand{\Expect}{\mathop{\mathbf{E}{}}}
\newcommand{\Prob}{\mathop{\mathbf{Prob}}}
\newcommand{\Co}{{\mathop{\mathbf{Co}}}} % convex hull
\newcommand{\dist}{\mathop{\mathbf{dist}{}}}
% \newcommand{\argmin}{\mathop{\rm argmin}}
% \newcommand{\argmax}{\mathop{\rm argmax}}
\newcommand{\epi}{\mathop{\mathbf{epi}}} % epigraph
\newcommand{\Vol}{\mathop{\mathbf{vol}}}
\newcommand{\dom}{\mathop{\mathbf{dom}}} % domain
\newcommand{\intr}{\mathop{\mathbf{int}}}
\newcommand{\sign}{\mathop{\mathbf{sign}}}

% Italic Latin abbreviations for running text.
\newcommand{\cf}{\textit{cf.}}
\newcommand{\eg}{\textit{e.g.}}
\newcommand{\ie}{\textit{i.e.}}
\newcommand{\etc}{\textit{etc.}}

%% Math
% Bold-symbol shorthands. \mathbold{<x>} is a thin wrapper around
% \boldsymbol{<x>}; every \mb<name> below expands to \mathbold{<name>}.
\newcommand{\mathbold}[1]{\boldsymbol{#1}}
% Lower-case Latin letters a-z.
\newcommand{\mba}{\mathbold{a}}
\newcommand{\mbb}{\mathbold{b}}
\newcommand{\mbc}{\mathbold{c}}
\newcommand{\mbd}{\mathbold{d}}
\newcommand{\mbe}{\mathbold{e}}
\newcommand{\mbf}{\mathbold{f}}
\newcommand{\mbg}{\mathbold{g}}
\newcommand{\mbh}{\mathbold{h}}
\newcommand{\mbi}{\mathbold{i}}
\newcommand{\mbj}{\mathbold{j}}
\newcommand{\mbk}{\mathbold{k}}
\newcommand{\mbl}{\mathbold{l}}
\newcommand{\mbm}{\mathbold{m}}
\newcommand{\mbn}{\mathbold{n}}
\newcommand{\mbo}{\mathbold{o}}
\newcommand{\mbp}{\mathbold{p}}
\newcommand{\mbq}{\mathbold{q}}
\newcommand{\mbr}{\mathbold{r}}
\newcommand{\mbs}{\mathbold{s}}
\newcommand{\mbt}{\mathbold{t}}
\newcommand{\mbu}{\mathbold{u}}
\newcommand{\mbv}{\mathbold{v}}
\newcommand{\mbw}{\mathbold{w}}
\newcommand{\mbx}{\mathbold{x}}
\newcommand{\mby}{\mathbold{y}}
\newcommand{\mbz}{\mathbold{z}}

% Upper-case Latin letters A-Z.
\newcommand{\mbA}{\mathbold{A}}
\newcommand{\mbB}{\mathbold{B}}
\newcommand{\mbC}{\mathbold{C}}
\newcommand{\mbD}{\mathbold{D}}
\newcommand{\mbE}{\mathbold{E}}
\newcommand{\mbF}{\mathbold{F}}
\newcommand{\mbG}{\mathbold{G}}
\newcommand{\mbH}{\mathbold{H}}
\newcommand{\mbI}{\mathbold{I}}
\newcommand{\mbJ}{\mathbold{J}}
\newcommand{\mbK}{\mathbold{K}}
\newcommand{\mbL}{\mathbold{L}}
\newcommand{\mbM}{\mathbold{M}}
\newcommand{\mbN}{\mathbold{N}}
\newcommand{\mbO}{\mathbold{O}}
\newcommand{\mbP}{\mathbold{P}}
\newcommand{\mbQ}{\mathbold{Q}}
\newcommand{\mbR}{\mathbold{R}}
\newcommand{\mbS}{\mathbold{S}}
\newcommand{\mbT}{\mathbold{T}}
\newcommand{\mbU}{\mathbold{U}}
\newcommand{\mbV}{\mathbold{V}}
\newcommand{\mbW}{\mathbold{W}}
\newcommand{\mbX}{\mathbold{X}}
\newcommand{\mbY}{\mathbold{Y}}
\newcommand{\mbZ}{\mathbold{Z}}

% Lower-case Greek letters (including the var- forms).
\newcommand{\mbalpha}{\mathbold{\alpha}}
\newcommand{\mbbeta}{\mathbold{\beta}}
\newcommand{\mbdelta}{\mathbold{\delta}}
\newcommand{\mbepsilon}{\mathbold{\epsilon}}
\newcommand{\mbchi}{\mathbold{\chi}}
\newcommand{\mbeta}{\mathbold{\eta}}
\newcommand{\mbgamma}{\mathbold{\gamma}}
\newcommand{\mbiota}{\mathbold{\iota}}
\newcommand{\mbkappa}{\mathbold{\kappa}}
\newcommand{\mblambda}{\mathbold{\lambda}}
\newcommand{\mbmu}{\mathbold{\mu}}
\newcommand{\mbnu}{\mathbold{\nu}}
\newcommand{\mbomega}{\mathbold{\omega}}
\newcommand{\mbphi}{\mathbold{\phi}}
\newcommand{\mbpi}{\mathbold{\pi}}
\newcommand{\mbpsi}{\mathbold{\psi}}
\newcommand{\mbrho}{\mathbold{\rho}}
\newcommand{\mbsigma}{\mathbold{\sigma}}
\newcommand{\mbtau}{\mathbold{\tau}}
\newcommand{\mbtheta}{\mathbold{\theta}}
\newcommand{\mbupsilon}{\mathbold{\upsilon}}
\newcommand{\mbvarepsilon}{\mathbold{\varepsilon}}
\newcommand{\mbvarphi}{\mathbold{\varphi}}
\newcommand{\mbvartheta}{\mathbold{\vartheta}}
\newcommand{\mbvarrho}{\mathbold{\varrho}}
\newcommand{\mbxi}{\mathbold{\xi}}
\newcommand{\mbzeta}{\mathbold{\zeta}}

% Upper-case Greek letters.
\newcommand{\mbDelta}{\mathbold{\Delta}}
\newcommand{\mbGamma}{\mathbold{\Gamma}}
\newcommand{\mbLambda}{\mathbold{\Lambda}}
\newcommand{\mbOmega}{\mathbold{\Omega}}
\newcommand{\mbPhi}{\mathbold{\Phi}}
\newcommand{\mbPi}{\mathbold{\Pi}}
\newcommand{\mbPsi}{\mathbold{\Psi}}
\newcommand{\mbSigma}{\mathbold{\Sigma}}
\newcommand{\mbTheta}{\mathbold{\Theta}}
\newcommand{\mbUpsilon}{\mathbold{\Upsilon}}
\newcommand{\mbXi}{\mathbold{\Xi}}


% \d{x}: upright differential "d" followed by x.
% NOTE(review): this replaces the existing \d (it is a \renewcommand);
% presumably that is the kernel's dot-under accent, which bibliography entries
% relying on it would then no longer get -- confirm against the .bib file.
\renewcommand{\d}[1]{\ensuremath{\operatorname{d}\!{#1}}}
% \g: conditioning bar with thin spaces (same body as \cbar).
\newcommand{\g}{\,|\,}
% \gg: double bar with thin spaces. This replaces the existing \gg
% (it is a \renewcommand).
\renewcommand{\gg}{\,\|\,}
% \dif: differential operator "d" with operator spacing before it.
\newcommand\dif{\mathop{}\!\mathrm{d}}
% Upright names typeset with \textrm.
\newcommand{\diag}{\textrm{diag}}
\newcommand{\supp}{\textrm{supp}}
\newcommand{\Gam}{\textrm{Gam}}
\newcommand{\InvGam}{\textrm{InvGam}}
% \DeclareMathOperator*{\argmax}{arg\,max}
% \DeclareMathOperator*{\argmin}{arg\,min}
% \KL{p}{q}: typesets KL(p || q) with auto-sized parentheses; usable in text
% and math thanks to \ensuremath.
\DeclareRobustCommand{\KL}[2]{\ensuremath{\textrm{KL}\left(#1\;\|\;#2\right)}}
% \indep: relation symbol built from two overlapping \perp glyphs;
% \mathpalette passes the current math style to the helper \independenT.
\newcommand\indep{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
% \newcommand{\E}[1]{\mathbb{E}\left[ #1 \right]}


% \pushbottom: inserts infinitely stretchable (filll) vertical glue.
\newcommand{\pushbottom}{\vskip0pt plus 1filll}


% Tikz definitions
% Lengths for the circuit drawings; the product- and leaf-node distances are
% defined as fractions of \nodedist.
\newlength{\minnodesize}
\setlength{\minnodesize}{1.0cm}

\newlength{\nodethickness}
\setlength{\nodethickness}{1.5pt}

\newlength{\nodedist}
\setlength{\nodedist}{1.75cm}

\newlength{\prodnodedist}
\setlength{\prodnodedist}{0.95\nodedist}

\newlength{\leafnodedist}
\setlength{\leafnodedist}{0.75\nodedist}

% Color
% Named colours given as HTML hex codes; tab10green and tab10red are used by
% \cmark and \xmark.
\definecolor{tab10green}{HTML}{2CA02C}
\definecolor{tab10blue}{HTML}{1f77b4}
\definecolor{tab10red}{HTML}{d62728}


%
% Add authors
\author[1]{Fabrizio Ventola*}
\author[1]{Steven Braun*}
\author[1]{Zhongjie Yu}
\author[1,2]{Martin Mundt}
\author[1,2,3,4]{Kristian Kersting}
% Add affiliations after the authors
\affil[1]{%
    % Computer Science Dept.\\
    Department of Computer Science\\
    TU Darmstadt\\
    Darmstadt, Germany
}
\affil[2]{%
    Hessian Center for AI (hessian.AI)\\
    Darmstadt, Germany
}
\affil[3]{%
    German Research Center for Artificial Intelligence (DFKI)\\
    Darmstadt, Germany
}
\affil[4]{%
    Centre for Cognitive Science\\
    TU Darmstadt\\
    Darmstadt, Germany
}


% \blfootnote{<text>}: footnote without a visible mark. It globally sets the
% footnote mark \@thefnmark to empty and then hands <text> to \@footnotetext.
\makeatletter
\def\blfootnote{\gdef\@thefnmark{}\@footnotetext}
\makeatother



\begin{document}

\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do
% not append the Supplementary Material to the main paper.

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross
% referenced using \texttt{xr}.

\appendix
% Uncomment for accepted version
\blfootnote{* indicates equal contribution}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% CONTENT OF THE SUPPLEMENTARY MATERIAL %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

Our paper's supplementary material contains various supporting materials and
complementary empirical evidence for our main paper's findings. Specifically,
the appendix consists of six sections:

\begin{itemize}
  \item[\textbf{A}] \textbf{Tractable dropout inference derivations:} We provide
        the full derivations behind the presented TDI equations of our main
        body's Section 3.
  \item[\textbf{B}] \textbf{Tractable dropout inference pseudocode algorithm:}
        We present the pseudocode algorithm of the TDI procedure.
  \item[\textbf{C}] \textbf{Experimental setup:} The section contains additional
        details with respect to the experimental setup for our empirical
        evaluations of the main body.
  \item[\textbf{D}] \textbf{PCs with TDI can detect OOD data:} We provide a
        complementary view on the ability of TDI in detecting OOD data, showing
        the predictive entropy and the predictive uncertainty of a PC with TDI.
  \item[\textbf{E}] \textbf{PCs with TDI are more robust to corruptions:} We
        present additional quantitative evidence in form of the remaining
        corruptions not shown in the main body to further support our findings
        that PCs with TDI are more robust on corrupted data.
  \item[\textbf{F}] \textbf{Societal impact:} We briefly discuss the societal
        impact of our contributions.
\end{itemize}
\section{Tractable Dropout Inference Derivations}
\label{supp:appendix_a}
Appendix A contains the full set of derivations behind our closed-form solutions
of TDI. Accordingly, the subsequent subsections follow the notation and
structure of our main body's Section 3.

\subsection{Monte Carlo Dropout Uncertainty}
\label{sec:mc-dropout}
With Monte Carlo dropout we can estimate model uncertainty by measuring the
variance of the probability computed by the PC's root node. This is done by
replacing all sum node computations with:
%
\begin{align}
  \label{eq:sumnode-dropout}
  \sumnodeD &= \sum_{i}\delta_{i}w_{i}\ndi \, ,
\end{align}
where $\delta_{i} \sim \text{Bern}\!\left(1-p\right)$ is the result of a
Bernoulli trial with probability $1-p$. Here, $p$ is the dropout chance and
$\ndi$ is the value of the $i$-th child. Note that the product node computations
remain unchanged since they have no model parameters associated and simply
compute factorizations.
In the traditional Monte Carlo dropout applied to PCs, the forward pass for a
single input is then repeated $L$ times to finally compute the expected value
and variance of the root node (i.e., the PC probability):
%
\begin{align}
  \E{\nodeD_{\text{root}}} &= \frac{1}{L}\sum_{i=1}^{L} \nodeD_{\text{root},i} \, , \\
  \Var{\nodeD_{\text{root}}} &= \frac{1}{L}\sum_{i=1}^{L} \left(\nodeD_{\text{root},i} - \E{\nodeD_{\text{root}}}\right)^{2} \, ,
\end{align}
%
where $\nodeD_{\text{root},i}$ is the $i$-th random Monte Carlo trial of
\cref{eq:sumnode-dropout}.

Since the Monte Carlo dropout formulation is based on repeated Bernoulli trials,
we can now look at it from a different perspective. We can define $\sumnode$ to
be a function of the Bernoulli random variables $\delta_{i}$. As the expectation
and variance for Bernoulli random variables are well known and easy to compute,
we only need to formulate how they propagate bottom-up through the PC in a
hierarchical manner, through sum and product nodes. We formally derive this in
the next section.

\subsection{TDI's Analytical Solution to Dropout Uncertainty}
\label{sec:cf-dropout}

To derive the analytical solution of expectation, variance, and covariance
propagation for TDI, we make use of the basic rules of how the expectation,
variance, and covariance behave under addition, multiplication, and constant
scaling. Recall that the goal of the derivation is to express the expectation,
variance, and covariance of a dropout node $\ndi$ in terms of the expectation,
variance, and covariance of its children
$\E{\ndj}, \Var{\ndj}, \Cov{\ndj, \nodeD_{j'}}$ for
$\ndj \in \text{children}\!\left( \ndi \right)$. This means that if we have a
closed-form solution for the expectation, variance, and covariance of the leaf
nodes, we can calculate the expectation, variance, and covariance of any
arbitrary node in the PC by the bottom-up propagation of $\E{\ndj}$,
$\Var{\ndj}$, and $\Cov{\ndj, \nodeD_{j'}}$.

\subsubsection{Expectation: The Point Estimate}
\label{sec:tdi:expectation}
\paragraph{Sum Nodes:}
Using the linearity of the expectation, we can move the expectation into the sum
and make use of the independence between the child nodes $\nodeD_{i}$ and the
Bernoulli RVs $\delta_{i}$ to extract $\Einline{\delta_i} = q$, where
$q = 1 - p$ is the probability that a child is kept:
%
\begin{align}
  \E{\sumnodeD} &= \E{   \sum_{i} \delta_{i}w_{i}\ndi   } \\
                &=   \sum_{i} \E{\delta_{i}w_{i}\ndi} \\
                &=   \sum_{i} \E{\delta_{i}}w_{i}\E{\ndi} \\
                &=   \sum_{i} qw_{i}\E{\ndi} \\
                &=   q\sum_{i} w_{i}\E{\ndi} \label{eq:sumnode-final} \quad .
\end{align}
From \cref{eq:sumnode-final}, we can see that the expected outcome of the sum
node is a convex combination with the original weights $w_{i}$ of the expected
outcome of its children $\ndi$, scaled by the expected drop in likelihood $q$.

\paragraph{Product Nodes:}
The decomposability of product nodes ensures the independence of their children
w.r.t. each other. This in turn leads to the product node expectation simply
becoming the product over the expectations of its children, i.e.,
\begin{align}
  \E{\prodnodeD} = \E{   \prod_{i} \ndi   } =    \prod_{i} \E{\ndi} \quad .
\end{align}

\subsubsection{Variance: The Uncertainty Proxy}
\label{sec:tdi:variance}
Similar to the original work on Monte Carlo dropout in neural networks
by~\citet{gal_2016}, we use the variance as the proxy for the uncertainty.

\paragraph{Sum Nodes:}
The sum node variance decomposes into a sum of two terms. The first term is
based on the variances and expectations of its children and the second term
accounts for the covariance between the combinations of all children:
\begin{align}
  \label{eq:sum-node-variance}
  \Var{\sumnodeD} &= \Var{ \sum_{i} \delta_{i}w_{i}\ndi   } \\
                  &=    \sum_{i} \Var{ \delta_{i}w_{i}\ndi}  + \sum_{i\neq j}\Cov{\delta_{i}w_{i}\ndi, \delta_{j}w_{j}\ndj} \\
                  &=    \sum_{i} w_{i}^{2}\Var{\delta_{i}\ndi}  + \sum_{i\neq j}w_{i}w_{j}\Cov{\delta_{i}\ndi, \delta_{j}\ndj}\quad . \label{eq:sum-node-variance:last}
\end{align}
%
For the first term of \cref{eq:sum-node-variance:last}, we can further resolve
$\Var{ \delta_{i}\ndi }$ by employing the rules for the variance computation:
\begin{align}
  \Var{ \delta_{i}\ndi} &= \E{\delta_{i}^{2}}\E{\ndi^{2}} - \E{\delta_{i}}^{2}\E{\ndi}^{2}  \\
                        &= q\E{\ndi^{2}} - q^{2}\E{\ndi}^{2}  \\
                        &= q \left(  \Var{\ndi} + \E{\ndi}^{2}  \right) - q^{2}\E{\ndi}^{2}  \\
                        &= q \left( \Var{\ndi} + \E{\ndi}^{2}  - q\E{\ndi}^{2}  \right) \\
                        &= q \left( \Var{\ndi} + \left( 1 - q \right)\E{\ndi}^{2}  \right) \\
                        &= q \left( \Var{\ndi} + p\E{\ndi}^{2}  \right) \quad . \label{eq:sum-node-variance:left-subterm}
\end{align}
The second term of \cref{eq:sum-node-variance:last}, the summation over
$\Cov{\delta_{i}\ndi, \delta_{j}\ndj}$, includes the covariance between all
child nodes of $\sumnodeD$:
\begin{align}
  \label{eq:cov-sum-node-children}
  \Cov{\delta_{i}\ndi, \delta_{j}\ndj} &= \E{\delta_{i}\ndi \delta_{j}\ndj} - \E{\delta_{i}\ndi}\E{\delta_{j}\ndj} \\
                                       &= \E{\delta_{i}}\E{\delta_{j}}\E{\ndi\ndj} - \E{\delta_{i}}\E{\ndi}\E{\delta_{j}}\E{\ndj} \\
                                       &= q^{2} \left(\E{\ndi\ndj} - \E{\ndi}\E{\ndj} \right) \\
                                       &= q^{2} \Cov{\ndi, \ndj} \quad . \label{eq:cov-sum-node-children:last}
\end{align}
Thus, by putting the derivations of the two terms together, we obtain the
expression for the variance of a sum node:
\begin{align}
  \label{eq:sum-variance-deriv}
  \Var{\sumnodeD} = q\sum_{i} w_{i}^{2} \left( \Var{\nodeD_{i}} + p\E{\nodeD_{i}}^{2}  \right) + q^{2}\sum_{i\neq j}w_{i}w_{j}\Cov{\nodeD_{i}, \nodeD_{j}} \, .
\end{align}
\paragraph{Product Nodes:}
Similarly to the case of a sum node, the product node variance decomposes into
two product terms. By applying the rule for the variance of a product of
independent random variables, we obtain:
\begin{align}
  \Var{\prodnodeD} &= \Var{ \prod_{i} \ndi} \\
                   &= \prod_{i} \E{\ndi^{2}} - \prod_{i} \E{\ndi}^{2} \\
                   &= \prod_{i} \left( \Var{\ndi} + \E{\ndi}^{2} \right) - \prod_{i} \E{\ndi}^{2} \quad .
\end{align}


\subsubsection{Covariance: The Evil}
\label{sec:tdi:covariance}
\paragraph{Sum Nodes:}
The covariance of two sum nodes, $\sumnodeD^{A}$ and $\sumnodeD^{B}$, neatly
decomposes into a weighted sum of all covariance combinations of the sum node
children, as shown in the following:
\begin{align}
  \label{eq:cov-two-sum-nodes}
  \Cov{\sumnodeD^{A}, \sumnodeD^{B}}  &= \Cov{\sum_{i}w_{i}^{A}\delta_{i}^{A}\nodeD_{i}^{A}, \sum_{j}w_{j}^{B}\delta_{j}^{B}\nodeD_{j}^{B}} \\
                                      &= \E{\sum_{i}w_{i}^{A}\delta_{i}^{A}\nodeD_{i}^{A} \sum_{j}w_{j}^{B}\delta_{j}^{B}\nodeD_{j}^{B}} - \E{\sum_{i}w_{i}^{A}\delta_{i}^{A}\nodeD_{i}^{A}}\E{\sum_{j}w_{j}^{B}\delta_{j}^{B}\nodeD_{j}^{B}} \\
                                      &= \sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\E{\delta_{i}^{A}\nodeD_{i}^{A} \delta_{j}^{B}\nodeD_{j}^{B}} - \sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\E{\delta_{i}^{A}\nodeD_{i}^{A}}\E{\delta_{j}^{B}\nodeD_{j}^{B}} \\
                                      &= \sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\E{\delta_{i}^{A} \delta_{j}^{B}}\E{\nodeD_{i}^{A}\nodeD_{j}^{B}} - \sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\E{\delta_{i}^{A}}\E{\nodeD_{i}^{A}}\E{\delta_{j}^{B}}\E{\nodeD_{j}^{B}} \\
                                      &= q^{2}\left(\sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\E{\nodeD_{i}^{A}\nodeD_{j}^{B}} - \sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\E{\nodeD_{i}^{A}}\E{\nodeD_{j}^{B}} \right) \\
                                      &= q^{2}\left(\sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\left(\E{\nodeD_{i}^{A}\nodeD_{j}^{B}} - \E{\nodeD_{i}^{A}}\E{\nodeD_{j}^{B}} \right)\right) \\
                                      &= q^{2}\sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\Cov{\nodeD_{i}^{A},\nodeD_{j}^{B}} \quad . \label{eq:cov-two-sum-nodes:final}
\end{align}

\paragraph{Product Nodes:}
As detailed in the paper's main body, we are unfortunately unable to provide a
closed-form solution of the covariance between two product nodes for an
arbitrary graph. This is due to the first expectation in the product node
covariance:
\begin{align}
  \label{eq:6}
  \Cov{\prodnode^{A},\prodnode^{B}} &= \Cov{\prod_{i}\nodeD_{i}^{A}, \prod_j \nodeD_{j}^{B}} \\
                                    &= \E{\prod_{i}\nodeD_{i}^{A} \prod_j \nodeD_{j}^{B}} - \E{\prod_{i}\nodeD_{i}^{A}} \E{\prod_j \nodeD_{j}^{B}}\\
                                    &= \E{\prod_{i}\nodeD_{i}^{A} \prod_j \nodeD_{j}^{B}} - \prod_{i}\E{\nodeD_{i}^{A}} \prod_j \E{\nodeD_{j}^{B}} \quad\label{eq:6:final} .
\end{align}
%
Further resolving the term $\E{\prod_{i}\nodeD_{i}^{A} \prod_j \nodeD_{j}^{B}}$
without additional knowledge about specific node substructures is not possible.
More specifically, pairs of $\nodeD_{i}^{A}$ and $\nodeD_{j}^{B}$ may not be
independent and may share a common subset of nodes, deeper down in the PC
structure. In this case, as alternatives, we have provided three possible
solutions to solve~\cref{eq:6} in the main body. For convenience, we briefly
re-iterate. The first solution consists of exploiting the circuit's
structural knowledge, if available. Alternatively, when such information is not
known, we can compute tractable bounds using the Cauchy-Schwarz inequality.
These two solutions are now presented in more detail in the following text.
Finally, recall that the third solution consists of ``augmenting'' the structure
of the circuit. Here, node copies for the shared nodes are used, as discussed in
paragraph \textbf{c)} of Section 3.2.3 of the main paper.

\subsubsection{Exploiting Structural Knowledge for Exact Covariance Computation}
\label{supp:structural_knowledge}
\paragraph{Tree-structured PCs:}
To simplify \cref{eq:6} we can directly exploit structural knowledge of the DAG.
A simple solution could be provided when product nodes $\prodnodeD^{A}$ and
$\prodnodeD^{B}$ are not common ancestors of any node, resulting in the
independence $\prodnodeD^{A} \ind \prodnodeD^{B}$ holding and thus
$\Covinline{\prodnodeD^{A}, \prodnodeD^{B}} = 0$. Such a constraint is always
given in tree-structured PCs. Notably, the most common structure learners for
SPNs, such as LearnSPN~\citep{gens_2013}, ID-SPN~\citep{rooshenas_2014}, and
SVD-SPN~\citep{adel_2015}, pursue the simplicity bias and induce tree structures.

\paragraph{Random and Tensorized Binary Tree Structures:}
For random and tensorized (RAT) binary tree structures~\citep{peharz_2020}, we
need to account for the existence of cross-products, to compute the
variance of product nodes. For binary tree structures of this kind, we can thus
leverage the characteristic that product nodes factorize two completely
independent partitions. In the context of this particular structure with binary
partitioning, the variance for a RAT sum node, with its children having their
scope on graph partitions $L$ and $R$, can be rewritten in the following way:
\begin{align}
  \label{eq:2}
  \Var{\sumnode} &= \Var{\sum_{l}\sum_{r}w_{l,r}\delta_{l,r} \prodnodeD_{l,r}} \\
                 &= \sum_{l}\sum_{r} w_{l,r}^{2} \Var{\delta_{l,r}\prodnodeD_{l,r}} + q^{2}\sum_{l,r}\sum_{\left(l',r'\right) \neq \left( l,r \right)} w_{l,r}w_{l',r'} \Cov{\prodnodeD_{l,r}, \prodnodeD_{l',r'}} \label{eq:2:final} \, .
\end{align}
Consequently, the covariance term between two product nodes in~\cref{eq:2:final}
can be simplified to:
\begin{align}
  \label{eq:5}
  \Cov{\prodnodeD_{l,r}, \prodnodeD_{l',r'}} &= \E{\prodnodeD_{l,r} \prodnodeD_{l',r'}} -  \E{\prodnodeD_{l,r}}\E{\prodnodeD_{l',r'}} \\
                                             &= \E{\sumnodeD^{L}_{l}\sumnodeD^{R}_{r} \sumnodeD^{L}_{l'} \sumnodeD^{R}_{r'}} - \E{\sumnodeD^{L}_{l}\sumnodeD^{R}_{r}}\E{\sumnodeD^{L}_{l'} \sumnodeD^{R}_{r'}} \, .
\end{align}
%
Since nodes $\sumnodeD^{L}$ and $\sumnodeD^{R}$ are from two different
partitions, they are independent, and the expectation terms can be separated:
\begin{alignat}{2}
  \label{eq:7}
  \Cov{\prodnodeD_{l,r}, \prodnodeD_{l',r'}} &= &&\E{\sumnode^{L}_{l} \sumnode^{L}_{l'}} \E{\sumnode^{R}_{r} \sumnode^{R}_{r'}} - \E{\sumnode^{L}_{l}}\E{\sumnode^{R}_{r}}\E{\sumnode^{R}_{r'}}\E{\sumnode^{L}_{l'}} \\
                                             &= &&\left( \E{\sumnode^{L}_{l}}\E{\sumnode^{L}_{l'}} + \Cov{\sumnode^{L}_{l}, \sumnode^{L}_{l'}} \right) \left( \E{\sumnode^{R}_{r}}\E{\sumnode^{R}_{r'}} + \Cov{\sumnode^{R}_{r}, \sumnode^{R}_{r'}} \right) \notag \\
                                             & &&-  \E{\sumnode^{L}_{l}}\E{\sumnode^{R}_{r}}\E{\sumnode^{L}_{l'}}\E{\sumnode^{R}_{r'}} \\
                                             &= &&\E{\sumnode^{L}_{l}}\E{\sumnode^{R}_{r}}\E{\sumnode^{L}_{l'}}\E{\sumnode^{R}_{r'}} \notag\\
                                             & &&+\Cov{\sumnode^{L}_{l}, \sumnode^{L}_{l'}} \E{\sumnode^{R}_{r}} \E{\sumnode^{R}_{r'}} \notag\\
                                             & &&+\Cov{\sumnode^{R}_{r}, \sumnode^{R}_{r'}} \E{\sumnode^{L}_{l}} \E{\sumnode^{L}_{l'}} \notag\\
                                             & &&+\Cov{\sumnode^{L}_{l}, \sumnode^{L}_{l'}} \Cov{\sumnode^{R}_{r}, \sumnode^{R}_{r'}} \notag\\
                                             & &&- \E{\sumnode^{L}_{l}}\E{\sumnode^{R}_{r}}\E{\sumnode^{L}_{l'}}\E{\sumnode^{R}_{r'}} \\
                                             &= &&\Cov{\sumnode^{L}_{l}, \sumnode^{L}_{l'}} \E{\sumnode^{R}_{r}} \E{\sumnode^{R}_{r'}} \notag\\
                                             & &&+\Cov{\sumnode^{R}_{r}, \sumnode^{R}_{r'}} \E{\sumnode^{L}_{l}} \E{\sumnode^{L}_{l'}} \notag\\
                                             & &&+\Cov{\sumnode^{L}_{l}, \sumnode^{L}_{l'}} \Cov{\sumnode^{R}_{r}, \sumnode^{R}_{r'}}  \, .
\end{alignat}
%
We can see that the covariance of two product nodes now depends on the
covariance of the input sum nodes of the same partition (i.e., $L$ or $R$), for
which we can plug in \cref{eq:cov-two-sum-nodes:final}.

\subsubsection{It's Somewhere in Here -- Covariance Bounds}
\label{covariance_bounds}
As previously described, knowledge about the specific PC structure can
facilitate the covariance computation that otherwise could result in a
combinatorial explosion. When such knowledge is not available, we can
alternatively obtain a lower and upper bound of the covariance in a tractable
way, making use of the Cauchy-Schwarz inequality:
\begin{alignat}{2}
  \label{eq:cauchy-schwarz}
  &\Cov{\ndi, \ndj}^{2} &&\leq \Var{\ndi}\Var{\ndj} \\
  \Leftrightarrow \quad &\Cov{\ndi, \ndj} &&\in \left[ - \sqrt{\Var{\ndi}\Var{\ndj}}, + \sqrt{\Var{\ndi}\Var{\ndj}} \right] \quad . \label{eq:cauchy-schwarz:last}
\end{alignat}

\subsubsection{Leaf Nodes}
\label{sec:tdi:leaf-nodes}
Finally, as the leaf nodes are free of any dropout Bernoulli variables, their
expectation reduces to the leaf node value, while their variance and covariance
are zero, i.e.:
\begin{align}
  \E{\leafnode} & = \leafnode \, , \\
  \Var{\leafnode} & = 0 \, , \\
  \Cov{\leafnode_{i}, \leafnode_{j}} & = 0 \, .
\end{align}

\subsubsection{Classification Uncertainty}
\label{sec:tdi:classification}
For classification in PCs, for a given sample $\x \sim \X$, it is common to
obtain the data conditional class confidence $p(y_{i} | \x)$ by using Bayes'
rule with the class conditional root nodes $p(\x | y_{i})$ and the class priors
$p(y_{i}) = \ci$. That is, every class $y_{i}$ has a corresponding root node
$\si$, representing $p(\x | y_{i})$. Following the earlier sections'
elaborations, we can then similarly derive the variance as an uncertainty proxy
in a classification context. Making use of Bayes' rule, we obtain the posterior
for class $y_{i}$ as follows:
\begin{align}
  \label{eq:posterior}
  p(y_{i} | \x) = \frac{p(\x | y_{i}) p(y_{i})}{\sum_{j} p(\x | y_{j}) p(y_{j})} = \frac{\si\ci}{\sum_{j}\sj\cj} \, .
\end{align}
Following existing notation, we will abbreviate the $i$-th root node with
$\si$ and the $i$-th class prior with $\ci$ for simplicity.


The expectation and variance of the posterior are that of a random variable
ratio, $\E{\frac{A}{B}}$ and $\Var{\frac{A}{B}}$, with $A = \si\ci$ and
$B = \sum_{j}\sj\cj$. This ratio is generally not well-defined, but can
be approximated with a second-order Taylor approximation~\citep{ratio_seltman}:
%
\begin{align}
  \E{\frac{A}{B}} &\approx \frac{\E{A}}{\E{B}} - \frac{\Cov{A, B}}{\left( \E{B} \right)^{2}} + \frac{\Var{B}\E{A}}{\left( \E{B} \right)^{3}} \, , \label{eq:exp-rv-ratio-suppl}\\
  \Var{\frac{A}{B}} &\approx \frac{\E{A}^{2}}{\E{B}^{2}} \left[ \frac{\Var{A}}{\E{A}^{2}} - 2 \frac{\Cov{A, B}}{\E{A}\E{B}} + \frac{\Var{B}}{\E{B}^{2}} \right] \, . \label{eq:var-rv-ratio-suppl}
\end{align}
%
We will now resolve every component of \cref{eq:exp-rv-ratio-suppl,eq:var-rv-ratio-suppl}.
The expectations can be expressed directly as:
%
\begin{align}
  \E{A} &= \E{\si\ci} = \Esi\ci \label{eq:exp-x} \, , \\
  \E{B} &= \E{\sum_{j}\sj\cj} = \sum_{j}\Esj\cj \, .
          \label{eq:exp-y}
\end{align}
%
For the variances we obtain:
\begin{align}
  \Var{A} &= \Var{\si\ci} = \Var{\si}\ci^{2} \label{eq:var-x} \, , \\
  \Var{B} &= \Var{\sum_{j}\sj\cj} \\
          &= \sum_{j}\Var{\sj}\cj^{2} + \sum_{j_1 \neq j_2} \Cov{\sumnode_{j_{1}}, \sumnode_{j_{2}}}c_{j_{1}}c_{j_{2}} \, .
            \label{eq:var-y}
\end{align}
%
Finally, following~\cref{eq:cov-two-sum-nodes}, the covariance term between a
root node and the sum of all root nodes can be decomposed as follows:
%
\begin{align}
  \label{eq:cov-x-y}
  \Cov{A, B} &= \Cov{\si\ci, \sum_{j}\sj\cj} \\
             &= \E{\si \ci \cdot \sum_{j}\sj\cj} - \E{\si\ci}\E{\sum_{j}\sj\cj} \\
             &= \E{\sum_{j}\sj\si\ci\cj} - \E{\si\ci}\E{\sum_{j}\sj\cj} \\
             &= \ci\sum_{j} \cj \E{\si\sj} - \ci\Esi \sum_{j} \cj \Esj \\
             &= \ci\sum_{j} \cj \left( \Cov{\si,\sj} + \Esi\Esj \right) - \ci\Esi \sum_{j} \cj \Esj \\
             &= \ci \left(\left(\sum_{j} \cj \Cov{\si,\sj}\right) + \left(\Esi\sum_{j}\cj\Esj   \right) - \left(\Esi \sum_{j} \cj \Esj  \right)   \right) \\
             &= \ci \sum_{j} \cj \Cov{\si,\sj} \quad .
\end{align}
As previously described but repeated for emphasis, the last term
$\Cov{\si, \sj}$ can be resolved with one of the three methods
presented in Section 3.2.3 of the main paper, i.e., by exploiting structural
knowledge (see also~\cref{supp:structural_knowledge}), or by computing bounds
with \cref{eq:cauchy-schwarz} of~\cref{covariance_bounds}, or by ``augmenting''
the structure with node copies of shared nodes.

While \cref{eq:var-rv-ratio-suppl} is seemingly simple, this particular
formulation implies statistical independence between $A$ and $B$. Since $B$ is
a sum over all $A$, this independence naturally does not hold. Therefore, the
solution given here is only an approximation of the true second-order Taylor
approximation. In the following we extend the formulation of
\cite{ratio_seltman} and take into account the dependencies between root nodes
$\si$ and their sum $\sum_{i}\si$.
For simplicity, we express \cref{eq:posterior} as a function $f$ of variables
$\sumnode_{1}, \dots, \sumnode_{C}$ and expand it around a point
$\mbtheta = (\theta_{1}, \dots, \theta_{C})$:
\begin{align}
  \fn{f}{\sumnode_{1}, \dots, \sumnode_{C} } = \fn{f}{ \mbtheta } + \sum_{i} \frac{\partial}{\partial \si} \fn{f}{ \mbtheta } \left( \si - \theta_{i} \right) + \sum_{i}\sum_{j} \frac{\partial^{2}}{\partial \si \partial \sj} \fn{f}{\mbtheta} \left(\si - \theta_{i} \right) \left( \sj - \theta_{j} \right) + R \quad ,
\end{align}
%
where $R$ is the remainder of higher-order terms. To abbreviate equations, we introduce $Z = \sum_{j} \sj c_{j}$ and
$Z_{\backslash i} = \sum_{j \neq i} \sj c_{j}$.
To resolve the first-order term, we need the first partial derivative at $\si$. %
\newcommand{\Zni}[0]{Z_{\backslash i}}
\begin{align}
  \frac{\partial}{\partial \si} \fn{f}{\sumnode_{1}, \dots, \sumnode_{C}} &= \frac{Z \frac{\partial \si c_{i}}{\partial \si} - \si c_{i} \frac{\partial}{\partial \si} Z}{Z^{2}} \\
 &= \frac{ c_{i}Z  - \si c_{i}^{2} }{Z^{2}} \\
 &= c_{i} \frac{ Z  - \si c_{i} }{Z^{2}} \\
 &=  c_{i} \frac{ \Zni  }{ Z^{2}}  \label{eq:first-partial-derivative} \quad .
\end{align}
%
We continue with the second partial derivative at $S_{k}$ and make a distinction
for $k = i$ and $k \neq i$. For the second partial derivative at $S_{i}$ we get
%
\begin{align}
  \frac{\partial^{2}}{\partial \si \partial \si} \fn{f}{\sumnode_{1}, \dots, \sumnode_{C} } &= \frac{Z^{2} \left( \frac{\partial}{\partial \si} \Zni \right) - \Zni \left( \frac{\partial}{\partial \si} Z^{2} \right)}{Z^{4}} \\
  &= \frac{- 2 c_{i} \Zni Z}{Z^{4}} \\
  &= - 2 c_{i} \frac{\Zni}{Z^{3}} \quad . \label{eq:second-partial-derivative-i}
\end{align}
%
For the second partial derivative at $S_{k}$ with $k \neq i$ we get
\begin{align}
  \frac{\partial^{2}}{\partial \si \partial \sk} \fn{f}{\sumnode_{1}, \dots, \sumnode_{C} } &= \frac{Z^{2} \left( \frac{\partial}{\partial \sk} \Zni \right) - \Zni \left( \frac{\partial}{\partial \sk} Z^{2} \right)}{Z^{4}} \\
  &= \frac{c_{k} Z^{2} - 2 c_{k} Z \Zni}{Z^{4}} \\
  &= \frac{c_{k} Z - 2 c_{k} \Zni}{Z^{3}} \\
  &= c_{k} \frac{Z - 2 \Zni}{Z^{3}} \\
  &= c_{k} \frac{Z - 2 \left( Z - \si c_{i} \right)}{Z^{3}} \\
  &= c_{k} \frac{ - Z + 2 \si c_{i} }{Z^{3}} \\
  &= - c_{k} \frac{ Z - 2 \si c_{i} }{Z^{3}} \label{eq:second-partial-derivative-k} \quad .
\end{align}
%
Using
\cref{eq:first-partial-derivative,eq:second-partial-derivative-i,eq:second-partial-derivative-k},
and $\theta_{i} = \Esi$ we obtain
%
\begin{multline}
  \fn{f}{\sumnode_{1}, \dots, \sumnode_{C} } = \fn{f}{\E{\sumnode_{1}}, \dots, \E{\sumnode_{C}}} + \sum_{i} \left( \frac{c_{i} \E{\Zni}}{ \E{Z}^{2} } \right) \left( \si - \Esi \right) \\
  + \sum_{i} \Biggl( \sum_{j \neq i} \left( - c_{j} \frac{\E{Z} - 2 \Esi c_{i}}{\E{Z}^{3}} \left( \si - \Esi \right) \left( \sj - \Esj \right) \right) \\
  - 2 c_{i} \frac{\E{\Zni}}{\E{Z}^{3}}  \left( \si - \Esi \right)^{2}\Biggr) + R \quad .
\end{multline}
%
Now that we have derived the second order Taylor approximation of
\cref{eq:posterior}, we need to take the expectation and variance of $f$. We
begin with the expectation as follows
%
\begin{multline}
  \E{\fn{f}{\sumnode_{1}, \dots, \sumnode_{C} }} = \fn{f}{\E{\sumnode_{1}}, \dots, \E{\sumnode_{C}}} + \sum_{i} \left( \frac{c_{i} \E{\Zni}}{\E{Z}^{2}} \right) \overbrace{\E{\si - \Esi}}^{= \Ens{\si} - \Ens{\si} = 0} \\
  + \sum_{i} \Biggl( \sum_{j \neq i} \left( - c_{j} \frac{\E{Z} - 2\Esi c_{i}}{ \E{Z}^{3} } \overbrace{\E{ \left( \si - \Esi \right) \left( \sj - \Esj \right)  }}^{= \Covns{\si,\sj}}\right)  \\
  - 2 c_{i} \frac{\E{\Zni}}{ \E{Z}^{3}} \overbrace{\E{\left(\si - \Esi\right)^{2}}}^{= \Varns{\si}} \Biggr) \quad ,
\end{multline}
%
which simplifies to
%
\begin{equation}
  \label{eq:second-order-taylor-exp-final}
  \E{\fn{f}{\sumnode_{1}, \dots, \sumnode_{C} }} = \fn{f}{\E{\sumnode_{1}}, \dots, \E{\sumnode_{C}}} + \sum_{i} \left( \sum_{j \neq i} \left( - c_{j} \frac{\E{Z} - 2\Esi c_{i}}{ \E{Z}^{3} } \Cov{\si,\sj}\right)
  - 2 c_{i} \frac{\E{\Zni}}{  \E{Z}^{3}} \Var{\si} \right) \, .
\end{equation}
%
We continue with the variance of $f$
%
\begin{multline}
  \Var{\fn{f}{\sumnode_{1}, \dots, \sumnode_{C} }} = \Var{\fn{f}{\E{\sumnode_{1}}, \dots, \E{\sumnode_{C}}}} + \sum_{i} \Var{c_{i} \frac{\E{\Zni}}{\E{Z}^{2}} \left( \si - \E{\si} \right) } \\
  + \sum_{i} \Biggl( \sum_{j \neq i} \left( - c_{j} \frac{\E{Z} - 2\E{\si}c_{i}}{\E{Z}^{3}} \right)^{2} \Var{\left( \si - \E{\si} \right) \left( \sj - \E{\sj} \right)} \\
  + \left( - 2 c_{i} \frac{\E{\Zni}}{\E{Z}^{3}} \right)^{2} \Var{\left( \si - \E{\si} \right)^{2}} \Biggr) \quad .
\end{multline}
%
Resolving each variance term simplifies the equation to
%
\begin{multline}
  \label{eq:second-order-taylor-var-final}
  \Var{\fn{f}{\sumnode_{1}, \dots, \sumnode_{C} }} = \sum_{i} \left( c_{i} \frac{\E{\Zni}}{\E{Z}^{2}} \right)^{2} \Var{\si} \\
  + \sum_{i} \Biggl( \sum_{j \neq i} \left( - c_{j} \frac{\E{Z} - 2\E{\si}c_{i}}{\E{Z}^{3}} \right)^{2} \left(\Var{\si\sj} - \E{\si}^{2}\Var{\sj} - \E{\sj}^{2}\Var{\si}\right) \\
  + \left( - 2 c_{i} \frac{\E{\Zni}}{\E{Z}^{3}} \right)^{2} \left(\Var{\si^{2}} - 4\E{\si}^{2}\Var{\si}\right) \Biggr) \quad .
\end{multline}
%
We now end up with a variance term that needs further resolution: the product of
two sum nodes.
%
\newcommand{\wi}[1]{w_{#1}^{\sumnode_i}}
\newcommand{\wj}[1]{w_{#1}^{\sumnode_j}}
\newcommand{\Ni}[1]{\node_{#1}^{\sumnode_i}}
\newcommand{\Nj}[1]{\node_{#1}^{\sumnode_j}}
\begin{align}
  \label{eq:var-sum-sum}
  \Var{\si\sj} &= \Var{\sum_{k_{i}}\wi{k_{i}}\Ni{k_{i}}  \sum_{k_{j}}\wj{k_{j}}\Nj{k_{j}}} \\
  &= \Var{\wi{1}\wj{1}\Ni{1}\Nj{1} + \dots + \wi{1}\wj{\left|\sj\right|}\Ni{1}\Nj{\left|\sj\right|} + \dots + \wi{\left|\si\right|}\wj{\left|\sj\right|}\Ni{\left|\si\right|}\Nj{\left|\sj\right|}} \\
  &= \sum_{k_{i}}\sum_{k_{j}} \Var{\wi{k_{i}}\wj{k_{j}}\Ni{k_{i}}\Nj{k_{j}}} \\
  &= \sum_{k_{i}}\sum_{k_{j}} \left(\wi{k_{i}}\wj{k_{j}}\right)^{2} \Var{\Ni{k_{i}}\Nj{k_{j}}} \quad .
\end{align}
%
Analogous to the discussions in \cref{sec:tdi:covariance}, the complexity of computing the variance of the product of two product nodes can be addressed through the incorporation of supplementary structural knowledge (refer to \cref{supp:structural_knowledge} for further details) or by utilizing crude approximations that induce local independence. An example of such an approximation is expressed as $\Var{\Ni{k_{i}}\Nj{k_{j}}} \approx \Var{\Ni{k_{i}}}\Var{\Nj{k_{j}}}$. With \cref{eq:second-order-taylor-exp-final,eq:second-order-taylor-var-final,eq:var-sum-sum} we have now reduced the second-order Taylor posterior approximation to quantities that we are able to compute from a single bottom-up pass through the PC graph.

\newpage

\section{Tractable Dropout Inference Pseudocode Algorithm}
Analogously to the conventional probabilistic inference on PCs, TDI is performed
by evaluating the circuit bottom-up. For clarity and completeness, we present
the pseudocode of the bottom-up pass with TDI in
\cref{algo:tractable_dropout_inference}. While the bottom-up pass can be
implemented in a variety of different ways, we have decided on a recursive
version in the presented pseudocode for shortness and clarity. We start by
initializing empty maps \texttt{E}, \texttt{Var}, and \texttt{Cov} that map nodes to
their expectation, variance, and covariance values. Given a node $\nodeD$, some
data sample $\mbx$, and the dropout probability $p$, we then traverse the
graph below $\nodeD$ and require that the \texttt{tdi} procedure has been
called on all children, ensuring that the \texttt{E}, \texttt{Var},  and
\texttt{Cov} of all $\nodeD_{i} \in \ch\left(\nodeD\right)$ are populated. We
then continue to compute the covariance between all child nodes of $\nodeD$,
followed by the expectation and finally the variance of $\nodeD$. All three
procedure calls, \texttt{expectation}, \texttt{variance}, and
\texttt{covariance}, refer to the equations given in the main body in
\cref{sec:tdi:derivation} for sum, product, and leaf nodes respectively.

\begin{algorithm}[h]
    \caption{Tractable Dropout Inference: \texttt{tdi}$\left(\nodeD, \mbx, p\right)$}
    \label{algo:tractable_dropout_inference}
    \begin{algorithmic} %[1]
      \Require PC root node $\nodeD$; Data sample $\mbx$; Dropout probability
      $p$; Empty maps \texttt{E}, \texttt{Var}, \texttt{Cov}.
      \ForAll{$\ndi \in \ch\left(\nodeD\right)$}
        \State{\texttt{tdi}$\left(\nodeD_{i}, \mbx, p\right)$}
        \Comment{Populate maps for child nodes recursively}
      \EndFor

      \ForAll{$\ndi \in \ch\left(\nodeD\right)$}
        \ForAll{$\ndj \in \ch\left(\nodeD\right)$}
          \State{$\text{\texttt{Cov}}\!\left[ \ndi, \ndj \right] = \text{\texttt{covariance}}\left(\nodeD_{i}, \nodeD_{j}, \mbx, p\right)$}
          \Comment{See \cref{eq:sum-covariance,eq:prod-covariance,eq:leaf-exp-var-cov}}
        \EndFor
      \EndFor

      \State{$\text{\texttt{E}}\!\left[ \nodeD \right] = \text{\texttt{expectation}}\left( \nodeD, \mbx, p \right)$}
      \Comment{See \cref{eq:sum-expectation,eq:prod-expectation,eq:leaf-exp-var-cov}}
      \State{$\text{\texttt{Var}}\!\left[ \nodeD \right] = \text{\texttt{variance}}\left( \nodeD, \mbx, p \right)$}
      \Comment{See \cref{eq:sum-variance,eq:prod-variance,eq:leaf-exp-var-cov}}
    \end{algorithmic}
  \end{algorithm}

  \section{Experimental Setup}
  \label{supp:appendix_b}
  For our experiments, we implemented TDI based on the publicly-available
  PyTorch implementation of
  RAT-SPNs\footnote{\url{https://github.com/SPFlow/SPFlow/tree/master/src/spn/experiments/RandomSPNs_layerwise}}
  in the \emph{SPFlow} software package~\citep{molina_2019}.
  Our code is available at \url{https://github.com/ml-research/tractable-dropout-inference}.
  All our models were trained for 200 epochs with the Adam
  optimizer~\citep{kingma2015adam}, employing a batch size of 200 and a learning
  rate of $10^{-3}$. To assess the robustness of probabilistic circuits as probabilistic
  discriminators, we have run all our experiments with the RAT hyperparameter
  $\lambda$ set to $1$. The latter corresponds to maximizing the class
  conditional likelihood of a sample computed by the head attributed to the
  observed label. Regarding the RAT-SPN graph, we make use of the following
  hyperparameters: $S=20, I=20, D=5, R=5$ with Gaussian leaves. This
  configuration returns circuits with 1.65M edges and 1.83M learnable parameters
  for SVHN (614k are Gaussian leaf parameters), and 1.42M edges and 1.38M
  parameters for MNIST (157k are Gaussian leaf parameters).
  % mnist 1376800 params 1422400 edges svhn 1834400 params 1651200 edges
  We emphasize that all previously described hyperparameter choices correspond
  to standard, previously considered, choices in the literature. No additional
  or excessive hyperparameter tuning has been conducted on our side. We refer to
  the original work of \cite{peharz_2020} for further details on RAT-SPN
  training and its respective hyperparameters.

  For our TDI experiments, we select the dropout parameter $p$ to be $0.1$ for
  SVHN and $0.2$ for MNIST. This range is adequate to preserve accuracy and
  simultaneously provide valuable uncertainty estimates. Note how such a choice
  is consistent and not fundamentally different from dropout probabilities in
  neural networks (i.e. typically on the same scale in every layer, unless it is
  the last layer, where sometimes a $p$ of up to 0.5 can be found in the
  literature). However, we further note that probabilistic circuits are
  typically sparser than their neural counterparts. A small dropout parameter is
  thus generally sufficient but nevertheless remains a hyperparameter to be
  chosen.

\section{PCs with TDI Can Detect OOD Data}
In the main paper, we have shown that PCs tend to be overconfident on OOD
instances while PCs + TDI can overcome this challenge and can detect
when an instance comes from a completely different distribution compared to what
was observed during training. This facilitates the separation between ID and OOD
instances with a threshold, a property that is also particularly useful in
practice in many domains. Here, we provide a complementary view of this ability
of PCs with TDI. We show the predictive entropy in
\cref{fig:predictive-entropy-uncertainty:entropy} and the predictive uncertainty
in \cref{fig:predictive-entropy-uncertainty:uncertainty} obtained on the ID data
and on several OOD datasets.

\begin{figure}[h]
  \centering
  \begin{subfigure}[t]{0.5\linewidth}
    \centering
    \includegraphics[width=1.0\linewidth]{./figs/histograms-pred-entropy.pdf}
    \caption{}
    \label{fig:predictive-entropy-uncertainty:entropy}
  \end{subfigure}%
  \begin{subfigure}[t]{0.5\linewidth}
    \centering \includegraphics[width=1.0\linewidth]{./figs/pred-uncertainty.pdf}
    \caption{}
    \label{fig:predictive-entropy-uncertainty:uncertainty}
  \end{subfigure}%
  \caption{
  % (a)
  (\subref{fig:predictive-entropy-uncertainty:entropy}) Histograms for the predictive entropy of PCs and PCs + TDI on ID (shaded) and OOD (colored) datasets in a classification context.
  PCs (left panel) assign similar, largely indistinguishable predictive entropy to OOD and ID data. In contrast, PCs + TDI (right panel) provide high uncertainty on OOD data, pushing the predictive entropy close to the obtainable maximum and effectively separating them from the known ID dataset.
  % (b)
  (\subref{fig:predictive-entropy-uncertainty:uncertainty}) Predictive
  uncertainty (standard deviation as square root of \cref{eq:var-rv-ratio}) for the predictions
  provided by PC + TDI on ID (shaded) and OOD data (colored). Complementing the
  illustration of predictive entropy in
  (\subref{fig:predictive-entropy-uncertainty:entropy}), the values support the
  picture that PCs + TDI are more unsure about OOD data and can thus make a
  distinction. }
\label{fig:predictive-entropy-uncertainty}
\end{figure}



\section{PCs with TDI Are More Robust To Corruptions}
\label{supp:appendix_c}
We have applied 15 non-trivial natural and synthetic corruptions, each at
five levels of severity, as shown in the main body's experimental evidence
section. For convenience, we re-iterate that the latter are commonly employed in
the literature~\citep{hendrycks_2019} to test models' robustness. More
specifically, corruptions have been used to demonstrate standard neural
networks' inability to effectively handle corrupted data. Since we have observed
an initial similar inability in PCs, they thus form a sensible test to evaluate
the robustness of PCs versus PCs + TDI. To provide some visual intuition, a
respective SVHN illustration is shown in~\cref{fig:svhn_corruptions}.

In our main body, we have shown the performance of PCs and PCs + TDI for 4 such
corruptions and thus have provided empirical evidence for the robustness of PCs
with TDI and their ability to attribute increased uncertainty to corrupted data.
Here, we provide the full set of experiments for all 15 corruptions. Recall that
for a model to successfully detect the corruption, it should assign a
progressive increase in predictive entropy in accordance with the corruption
severity.

The full quantitative results of \cref{fig:svhn_corruptions_pred_entropy}
empirically affirm our conclusions drawn in the main body: PCs with TDI are
generally equally or more robust at various severity levels for all corruptions,
generally preserving equal or even higher predictive accuracy in several cases.
At the same time, the assigned predictive entropy is substantially larger with
an increasing level of corruption with TDI. Finally, we emphasize that in the
three cases where PCs with TDI do not provide a large and successively
increasing measure of entropy, i.e. zoom blur, pixelation, and jpeg compression,
their respective accuracy in \cref{fig:svhn_corruptions_pred_entropy} is almost
fully preserved. In other words, the model does not provide a large uncertainty
because it already provides a correct and robust prediction.

\begin{figure}[!h]
  \centering
  \includegraphics[width=0.6\linewidth]{./figs/svhn_corruptions_samples.pdf}
  \caption{Visual illustration of the 15 different corruptions with five
    increasing levels of severity as introduced in~\cite{hendrycks_2019} on an
    SVHN test sample.}
  \label{fig:svhn_corruptions}
\end{figure}

\begin{figure*}[!h]
  \centering \includegraphics[width=1.0\linewidth]{./figs/figure-1-supp.pdf}
  \caption{ Predictive entropy (left y-axis) and accuracy (right y-axis) of PCs
    (blue curve, circle markers) and PCs + TDI (orange, triangles) for
    increasingly corrupted SVHN data. 15 natural and synthetic corruptions are
    introduced at five severity levels. PCs with TDI can detect the distribution
    shift by assigning higher predictive entropy with increasing severity, while
    at the same time being more robust in predictive accuracy against the
    corruption, compared to PCs. We emphasize that in the three cases where PCs
    + TDI do not provide a large and successively increasing measure of entropy,
    i.e. zoom blur, pixelation, and jpeg compression, their respective accuracy
    (dashed) is almost fully preserved. The model is already correct and
    certain.}
  \label{fig:svhn_corruptions_pred_entropy}
\end{figure*}

\section{Societal Impact}
We believe that quantification of model uncertainty, as a direction to
contribute to overall robustness, can have a broad positive societal impact.
Colloquially speaking, an indication of when to ``trust'' the model is a
valuable tool for users and practitioners, especially in safety-critical
applications. That noted, gauging model uncertainty does not absolve users from
careful considerations of other potentially harmful effects, particularly, the
involvement of various forms of bias through training data, as these are then
considered ``certain'' by definition.


\clearpage
\bibliography{ventola_118}

\end{document}
