% \documentclass[accepted]{uai2023}   % for initial submission
\documentclass[accepted]{uai2023}     % after acceptance, for a revised
                                      % version; also before submission to
                                      % see how the non-anonymous paper
                                      % would look like


%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%\newcommand{\swap}[3][-]{#3#1#2} % just an example

\usepackage[capitalise]{cleveref}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{pgfplots}

\usepackage{pifont}% http://ctan.org/pkg/pifont
% Colored check mark (dingbat 51) and cross (dingbat 55), each followed by a
% fixed 0.5em gap. The colors tab10green/tab10red are defined later in the
% preamble; they are only looked up when the macro is used.
\newcommand*{\cmark}{%
  {\color{tab10green}\ding{51}}%
  \hspace{0.5em}%
}%
\newcommand*{\xmark}{%
  {\color{tab10red}\ding{55}}%
  \hspace{0.5em}%
}%

% TODONOTES
\usepackage{xargs}
\usepackage[colorinlistoftodos,textsize=normalsize,disable]{todonotes} % Disabled
% \usepackage[colorinlistoftodos,textsize=normalsize]{todonotes} % Enabled
%\setlength{\marginparwidth}{1.7cm}  % TODO: comment out when todonotes is disabled
% Color-coded wrappers around \todo. Each takes two arguments:
%   #1 (optional, default empty): extra key=value options forwarded to \todo
%   #2 (mandatory): the note text, prefixed with a fixed label
% With todonotes loaded using the `disable` option (see the \usepackage line
% above), these notes produce no output.
% Orange: open tasks.
\newcommandx{\todoc}[2][1=]{{\todo[linecolor=orange,backgroundcolor=orange!25,bordercolor=orange,#1]{
      TODO: #2}}}
% Yellow: statements that still need to be verified.
\newcommandx{\unsure}[2][1=]{{\todo[linecolor=yellow,backgroundcolor=yellow!25,bordercolor=yellow,#1]{
      UNSURE: #2}}}
% Blue: requested changes.
\newcommandx{\change}[2][1=]{{\todo[linecolor=blue,backgroundcolor=blue!25,bordercolor=blue,#1]{
      CHANGE: #2}}}
% Green: informational remarks.
\newcommandx{\info}[2][1=]{{\todo[linecolor=green,backgroundcolor=green!25,bordercolor=green,#1]{
      INFO: #2}}}
% Violet: possible improvements.
\newcommandx{\improvement}[2][1=]{{\todo[linecolor=violet,backgroundcolor=violet!25,bordercolor=violet,#1]{
      IMPROVEMENT: #2}}}
% Passes `disable` to this individual note, independent of the package option.
\newcommandx{\thiswillnotshow}[2][1=]{{\todo[disable,#1]{THIS WILL NOT SHOW:
      #2}}}

% \usepackage[pdftex]{graphicx}
\usepackage{bm}

% --- For algorithms ---
\usepackage{algorithm} \usepackage[noend]{algpseudocode}
% --- ----

\usepackage{amsmath,amssymb}
% arg max operator; the starred declaration places subscripts underneath the
% operator in display style (like \max).
\DeclareMathOperator*{\argmax}{arg\,max}

% --- For tikz plots ---
\usetikzlibrary{colorbrewer} \usetikzlibrary{positioning}
\usetikzlibrary{shapes.geometric} \usetikzlibrary{arrows}
\usetikzlibrary{arrows.meta} \usetikzlibrary{patterns,decorations.pathreplacing}
\usetikzlibrary{fit} \usetikzlibrary{backgrounds}
\usetikzlibrary{shapes,backgrounds,calc} \usetikzlibrary{shadows}
\usetikzlibrary{patterns} \definecolor{mygray}{RGB}{190,190,190}
% --- ----

% math definitions
% Symbols for a generic node (N), sum node (S), product node (P) and leaf node (L).
\newcommand{\node}{\mathsf{N}}
%\newcommand{\nodeD}{\widetilde{\node}}
% Dropout variant of a node: currently an alias of \node (the tilde-decorated
% alternative is kept commented out above).
\newcommand{\nodeD}{\node}
\newcommand{\ndi}{\nodeD_i}
\newcommand{\ndj}{\nodeD_j}
\newcommand{\sumnode}{\mathsf{S}}
\newcommand{\prodnode}{\mathsf{P}}
\newcommand{\leafnode}{\mathsf{L}}
%\newcommand{\sumnodeD}{\widetilde{\sumnode}}  % Dropout version with tilde
%\newcommand{\prodnodeD}{\widetilde{\prodnode}}  % Dropout version with tilde
%\newcommand{\leafnodeD}{\widetilde{\leafnode}}  % Dropout version with tilde
\newcommand{\sumnodeD}{\sumnode}  % Dropout version; currently typeset without tilde
\newcommand{\prodnodeD}{\prodnode}  % Dropout version; currently typeset without tilde
\newcommand{\leafnodeD}{\leafnode}  % Dropout version; currently typeset without tilde
\newcommand{\Ks}{K_{S}}
\newcommand{\Kp}{K_{P}}
% Expectation, variance and covariance with auto-sized square brackets;
% \! removes the thin space that would otherwise precede \left[.
% NOTE(review): \text{Var}/\text{Cov} follow the surrounding text font (e.g.
% they become italic inside italic text) -- confirm this is intended.
\newcommand{\E}[1]{\mathbb{E}\!\left[#1\right]}
\newcommand{\Var}[1]{\text{Var}\!\left[#1\right]}
\newcommand{\Cov}[1]{\text{Cov}\!\left[#1\right]}
% Inline alternatives that do not have huge brackets which increase between-line spacing
\newcommand{\Einline}[1]{\mathbb{E}[#1]}
\newcommand{\Varinline}[1]{\text{Var}[#1]}
\newcommand{\Covinline}[1]{\text{Cov}[#1]}



% Class priors
\newcommand*{\ci}{c_i}
\newcommand*{\cj}{c_j}

% Conditioning bar with a thin space on either side
\newcommand*{\cbar}{\,|\,}

% Bold upright X, Y, Z in upper and lower case
\newcommand*{\X}{\mathbf{X}}
\newcommand*{\Y}{\mathbf{Y}}
\newcommand*{\Z}{\mathbf{Z}}
\newcommand*{\x}{\mathbf{x}}
\newcommand*{\y}{\mathbf{y}}
\newcommand*{\z}{\mathbf{z}}

% Dataset symbol
\newcommand*{\data}{\mathcal{D}}

% Independence symbol
\newcommand*{\ind}{\perp\!\!\!\!\perp}

% Circuit symbol, the circuit applied to an argument, and the scope of a node
\newcommand*{\spn}{\mathcal{S}}
\newcommand{\spnfn}[1]{\spn\!\left(#1\right)}
\newcommand{\scope}[1]{\mathbf{sc}\left(#1\right)}

% Calligraphic symbols \PC, \graph, \tree
\newcommand*{\PC}{\mathcal{P}}
\newcommand*{\graph}{\mathcal{G}}
\newcommand*{\tree}{\mathcal{T}}
% \newcommand{\scope}{\psi}
% Bold identifiers pa, ch and val
\newcommand*{\pa}{\mathbf{pa}}
\newcommand*{\ch}{\mathbf{ch}}
\newcommand*{\val}{\mathbf{val}}


% Bold-face set symbols and named operators.
% NOTE(review): most of these rely on the two-letter font switch \bf, which is
% an obsolete LaTeX 2.09 command; the \mbox-based symbols also do not shrink in
% sub-/superscripts. Left untouched here because rewriting could alter the
% operator spacing/centering -- revisit if any of them is actually used.
\newcommand{\ones}{\mathbf 1}
\newcommand{\reals}{{\mbox{\bf R}}}
\newcommand{\integers}{{\mbox{\bf Z}}}
\newcommand{\symm}{{\mbox{\bf S}}}  % symmetric matrices

\newcommand{\nullspace}{{\mathcal N}}
\newcommand{\range}{{\mathcal R}}
\newcommand{\Rank}{\mathop{\bf Rank}}
\newcommand{\Tr}{\mathop{\bf Tr}}
% \newcommand{\diag}{\mathop{\bf diag}}
\newcommand{\card}{\mathop{\bf card}}
\newcommand{\rank}{\mathop{\bf rank}}
\newcommand{\conv}{\mathop{\bf conv}}
\newcommand{\prox}{\mathbf{prox}}

% Operators typeset via \mathop, i.e. with operator spacing around them.
\newcommand{\Expect}{\mathop{\bf E{}}}
\newcommand{\Prob}{\mathop{\bf Prob}}
\newcommand{\Co}{{\mathop {\bf Co}}} % convex hull
\newcommand{\dist}{\mathop{\bf dist{}}}
% \newcommand{\argmin}{\mathop{\rm argmin}}
% \newcommand{\argmax}{\mathop{\rm argmax}}
\newcommand{\epi}{\mathop{\bf epi}} % epigraph
\newcommand{\Vol}{\mathop{\bf vol}}
\newcommand{\dom}{\mathop{\bf dom}} % domain
\newcommand{\intr}{\mathop{\bf int}}
\newcommand{\sign}{\mathop{\bf sign}}

% Italic Latin abbreviations for running text.
% \textit{...} replaces the obsolete font switch {\it ...}, which additionally
% resets weight and family to \normalfont (e.g. it drops bold inside bold text).
\newcommand{\cf}{\textit{cf.}}
\newcommand{\eg}{\textit{e.g.}}
\newcommand{\ie}{\textit{i.e.}}
\newcommand{\etc}{\textit{etc.}}

%% Math
% \mathbold{<symbol>}: bold math symbol via \boldsymbol. Every \mb<name> macro
% below is shorthand for \mathbold applied to the corresponding symbol.
\newcommand{\mathbold}[1]{\boldsymbol{#1}}

% Bold lower-case Latin letters
\newcommand*{\mba}{\mathbold{a}}
\newcommand*{\mbb}{\mathbold{b}}
\newcommand*{\mbc}{\mathbold{c}}
\newcommand*{\mbd}{\mathbold{d}}
\newcommand*{\mbe}{\mathbold{e}}
\newcommand*{\mbf}{\mathbold{f}}
\newcommand*{\mbg}{\mathbold{g}}
\newcommand*{\mbh}{\mathbold{h}}
\newcommand*{\mbi}{\mathbold{i}}
\newcommand*{\mbj}{\mathbold{j}}
\newcommand*{\mbk}{\mathbold{k}}
\newcommand*{\mbl}{\mathbold{l}}
\newcommand*{\mbm}{\mathbold{m}}
\newcommand*{\mbn}{\mathbold{n}}
\newcommand*{\mbo}{\mathbold{o}}
\newcommand*{\mbp}{\mathbold{p}}
\newcommand*{\mbq}{\mathbold{q}}
\newcommand*{\mbr}{\mathbold{r}}
\newcommand*{\mbs}{\mathbold{s}}
\newcommand*{\mbt}{\mathbold{t}}
\newcommand*{\mbu}{\mathbold{u}}
\newcommand*{\mbv}{\mathbold{v}}
\newcommand*{\mbw}{\mathbold{w}}
\newcommand*{\mbx}{\mathbold{x}}
\newcommand*{\mby}{\mathbold{y}}
\newcommand*{\mbz}{\mathbold{z}}

% Bold upper-case Latin letters
\newcommand*{\mbA}{\mathbold{A}}
\newcommand*{\mbB}{\mathbold{B}}
\newcommand*{\mbC}{\mathbold{C}}
\newcommand*{\mbD}{\mathbold{D}}
\newcommand*{\mbE}{\mathbold{E}}
\newcommand*{\mbF}{\mathbold{F}}
\newcommand*{\mbG}{\mathbold{G}}
\newcommand*{\mbH}{\mathbold{H}}
\newcommand*{\mbI}{\mathbold{I}}
\newcommand*{\mbJ}{\mathbold{J}}
\newcommand*{\mbK}{\mathbold{K}}
\newcommand*{\mbL}{\mathbold{L}}
\newcommand*{\mbM}{\mathbold{M}}
\newcommand*{\mbN}{\mathbold{N}}
\newcommand*{\mbO}{\mathbold{O}}
\newcommand*{\mbP}{\mathbold{P}}
\newcommand*{\mbQ}{\mathbold{Q}}
\newcommand*{\mbR}{\mathbold{R}}
\newcommand*{\mbS}{\mathbold{S}}
\newcommand*{\mbT}{\mathbold{T}}
\newcommand*{\mbU}{\mathbold{U}}
\newcommand*{\mbV}{\mathbold{V}}
\newcommand*{\mbW}{\mathbold{W}}
\newcommand*{\mbX}{\mathbold{X}}
\newcommand*{\mbY}{\mathbold{Y}}
\newcommand*{\mbZ}{\mathbold{Z}}

% Bold lower-case Greek letters (including the var- forms)
\newcommand*{\mbalpha}{\mathbold{\alpha}}
\newcommand*{\mbbeta}{\mathbold{\beta}}
\newcommand*{\mbdelta}{\mathbold{\delta}}
\newcommand*{\mbepsilon}{\mathbold{\epsilon}}
\newcommand*{\mbchi}{\mathbold{\chi}}
\newcommand*{\mbeta}{\mathbold{\eta}}
\newcommand*{\mbgamma}{\mathbold{\gamma}}
\newcommand*{\mbiota}{\mathbold{\iota}}
\newcommand*{\mbkappa}{\mathbold{\kappa}}
\newcommand*{\mblambda}{\mathbold{\lambda}}
\newcommand*{\mbmu}{\mathbold{\mu}}
\newcommand*{\mbnu}{\mathbold{\nu}}
\newcommand*{\mbomega}{\mathbold{\omega}}
\newcommand*{\mbphi}{\mathbold{\phi}}
\newcommand*{\mbpi}{\mathbold{\pi}}
\newcommand*{\mbpsi}{\mathbold{\psi}}
\newcommand*{\mbrho}{\mathbold{\rho}}
\newcommand*{\mbsigma}{\mathbold{\sigma}}
\newcommand*{\mbtau}{\mathbold{\tau}}
\newcommand*{\mbtheta}{\mathbold{\theta}}
\newcommand*{\mbupsilon}{\mathbold{\upsilon}}
\newcommand*{\mbvarepsilon}{\mathbold{\varepsilon}}
\newcommand*{\mbvarphi}{\mathbold{\varphi}}
\newcommand*{\mbvartheta}{\mathbold{\vartheta}}
\newcommand*{\mbvarrho}{\mathbold{\varrho}}
\newcommand*{\mbxi}{\mathbold{\xi}}
\newcommand*{\mbzeta}{\mathbold{\zeta}}

% Bold upper-case Greek letters
\newcommand*{\mbDelta}{\mathbold{\Delta}}
\newcommand*{\mbGamma}{\mathbold{\Gamma}}
\newcommand*{\mbLambda}{\mathbold{\Lambda}}
\newcommand*{\mbOmega}{\mathbold{\Omega}}
\newcommand*{\mbPhi}{\mathbold{\Phi}}
\newcommand*{\mbPi}{\mathbold{\Pi}}
\newcommand*{\mbPsi}{\mathbold{\Psi}}
\newcommand*{\mbSigma}{\mathbold{\Sigma}}
\newcommand*{\mbTheta}{\mathbold{\Theta}}
\newcommand*{\mbUpsilon}{\mathbold{\Upsilon}}
\newcommand*{\mbXi}{\mathbold{\Xi}}


% Differential "d<arg>" with an upright d.
% NOTE(review): this overrides the existing \d command (in standard LaTeX the
% dot-below text accent) -- verify that no bibliography entry or name in the
% text relies on the accent.
\renewcommand{\d}[1]{\ensuremath{\operatorname{d}\!{#1}}}
% Single conditioning bar with thin spaces on both sides.
\newcommand{\g}{\,|\,}
% Double bar with thin spaces on both sides.
% NOTE(review): this overrides the existing \gg command (in standard LaTeX the
% "much greater than" relation) -- confirm that symbol is not needed.
\renewcommand{\gg}{\,\|\,}
% Upright differential d; the empty \mathop supplies operator spacing on the
% left and \! removes the space it would add on the right.
\newcommand\dif{\mathop{}\!\mathrm{d}}
% Upright text labels for use in math.
\newcommand{\diag}{\textrm{diag}}
\newcommand{\supp}{\textrm{supp}}
\newcommand{\Gam}{\textrm{Gam}}
\newcommand{\InvGam}{\textrm{InvGam}}
% \DeclareMathOperator*{\argmax}{arg\,max}
% \DeclareMathOperator*{\argmin}{arg\,min}
% \KL{p}{q}: typesets KL(p || q) with auto-sized parentheses; declared robust
% and wrapped in \ensuremath, so it also works outside math mode.
\DeclareRobustCommand{\KL}[2]{\ensuremath{\textrm{KL}\left(#1\;\|\;#2\right)}}
% Independence relation: two \perp symbols overlaid with a 2mu horizontal
% offset. \mathpalette passes the current math style as #1 to the helper.
\newcommand\indep{\protect\mathpalette{\protect\independenT}{\perp}}
% Helper for \indep: #1 = math style, #2 = symbol to duplicate.
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
% \newcommand{\E}[1]{\mathbb{E}\left[ #1 \right]}


% Vertical glue of natural height 0pt with filll-order stretch.
\newcommand{\pushbottom}{\vskip0pt plus 1filll}


% Tikz definitions
% Length registers shared by the circuit drawings.
\newlength{\minnodesize}
\newlength{\nodethickness}
\newlength{\nodedist}
\newlength{\prodnodedist}
\newlength{\leafnodedist}

% Values. \prodnodedist and \leafnodedist are fractions of \nodedist, so
% \nodedist has to be set before them.
\setlength{\minnodesize}{1.0cm}    % used as `minimum size` of the nodes
\setlength{\nodethickness}{1.5pt}  % used as `line width` of the nodes
\setlength{\nodedist}{1.75cm}      % base offset used when positioning nodes
\setlength{\prodnodedist}{0.95\nodedist}
\setlength{\leafnodedist}{0.75\nodedist}

% Color
\definecolor{tab10green}{HTML}{2CA02C}
\definecolor{tab10blue}{HTML}{1f77b4}
\definecolor{tab10red}{HTML}{d62728}


\title{Probabilistic Circuits That Know What They Don't Know}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Fabrizio Ventola*}
\author[1]{Steven Braun*}
\author[1]{Zhongjie Yu}
\author[1,2]{Martin Mundt}
\author[1,2,3,4]{Kristian Kersting}
% Add affiliations after the authors
\affil[1]{%
    % Computer Science Dept.\\
    Department of Computer Science\\
    TU Darmstadt\\
    Darmstadt, Germany
}
\affil[2]{%
    Hessian Center for AI (hessian.AI)\\
    Darmstadt, Germany
}
\affil[3]{%
    German Research Center for Artificial Intelligence (DFKI)\\
    Darmstadt, Germany
}
\affil[4]{%
    Centre for Cognitive Science\\
    TU Darmstadt\\
    Darmstadt, Germany
}


% \blfootnote{<text>}: footnote without a mark. It globally sets \@thefnmark
% to empty and then hands its argument to \@footnotetext.
% \makeatletter/\makeatother are needed because the internal names contain @.
\makeatletter
\def\blfootnote{\gdef\@thefnmark{}\@footnotetext}
\makeatother


\begin{document}
\maketitle

\begin{abstract}
Probabilistic circuits (PCs) are models that allow exact and tractable
probabilistic inference. In contrast to neural networks, they are often assumed
to be well-calibrated and robust to out-of-distribution (OOD) data. In this
paper, we show that PCs are in fact not robust to OOD data, i.e., they don't
know what they don't know. 
We then show how this challenge can be overcome by model uncertainty quantification. To this end, we
propose tractable dropout inference (TDI), an inference procedure to estimate
uncertainty by deriving an analytical solution to Monte Carlo dropout (MCD)
through variance propagation. Unlike MCD in neural networks,
which comes at the cost of multiple network evaluations, TDI provides
tractable sampling-free uncertainty estimates in a single forward pass.
TDI improves the robustness of PCs to distribution shift and OOD data,
demonstrated through a series of experiments evaluating the classification
confidence and uncertainty estimates on real-world data.
\end{abstract}


\section{Introduction}
\blfootnote{*indicates equal contribution}
The majority of modern machine learning research concentrates on a closed-world setting \citep{boult_2019}.
Here, the value of a model is judged by its performance on a dedicated train-validation-test split from a joint data distribution. Such a focus discounts crucial requirements for real-world inference, where data with a shift in distribution, conceptually novel data, or various combinations of unfiltered corruptions and perturbations are typically encountered \citep{boult_2019, hendrycks_2019}. It is well known that the latter scenarios impose a significant challenge for current practice, attributed to a common culprit referred to as overconfidence \citep{matan_90}. Specifically, popular discriminative approaches like SVMs~\citep{scheirer_2013,scheirer_2014} and neural networks~\citep{nguyen_2015, amodei_2016, guo_2017} chronically assign probabilities close to unity to their predictions, even when a category does not yet exist in the present model.

\begin{figure}
    \centering
    \input{./figs/id_vs_ood_acc_merged.pgf}
    \caption{Conventional PCs are incapable of identifying OOD datasets (solid colors, outlier percentage should be large) while retaining correct predictions for ID data (dashed red, outlier percentage needs to be low to avoid rejection) independently of the decision threshold on the model's predictive entropy. Whereas their OOD detection precision drops rapidly with increasing threshold, TDI successfully distinguishes ID from OOD data across a wide range.}
    \label{fig:precision-ood}
\end{figure}

Unfortunately, the above challenge is not limited to discriminative models and
has recently resurfaced in the context of generative models. Various works
\citep{nalisnick_2019, ovadia_2019, mundt_2022} have empirically demonstrated
that different deep models, such as variational
auto-encoders~\citep{kingma2014auto} or normalizing flows~\citep{kobyzev2020normalizing, papamakarios2021normalizing}, have
analogous difficulties in separating data from arbitrary distributions from
those observed during training. Intuitively speaking, these models ``\emph{don't know what they don't know}''. In this paper, we show that a fairly new
family of generative models, probabilistic circuits (PCs)~\citep{choi_2020},
suffer from the same fate. Until now, these models have been generally assumed
to overcome the overconfidence problem faced by their deep neural network
counterparts, see e.g.~\citep{peharz_2020_icml}, ascribed primarily to PCs' ability for
\emph{tractable} and \emph{exact} inference. This assumption should
clearly be challenged, as highlighted by our empirical evidence in the left
panel of \cref{fig:precision-ood}. Here, we show the percentage of samples recognized as
outliers based on the predictive entropy $-\sum_c p\left(y_c|x\right)\log p\left(y_c|x\right)$, with labels $y$, classes $c$, and data $x$,
obtained by a PC in a classification scenario trained on an in-distribution (ID) dataset and tested on several out-of-distribution (OOD) datasets. When correctly identifying, e.g., 95\% of SVHN as ID, the PC successfully detects only 24\% of the LSUN dataset as OOD.

Inspired by the neural network literature, we consequently posit that PCs' inability to successfully recognize OOD instances is due to a lack of uncertainty quantification in their original formulation. More specifically, the ability to gauge the uncertainty in the model's parameters (also called \emph{epistemic} uncertainty) \citep{kendall_2017, hullermeier_2021} is required to indicate when the output is expected to be volatile and should thus not be trusted. In Bayesian neural networks~\citep{mackay_1992}, such uncertainty is achieved by placing a distribution on the weights and capturing the observed variation given some data. In arbitrary deep networks, the popular Monte Carlo dropout (MCD) \citep{gal_2016} measures uncertainty by conducting stochastic forward passes through a model with dropout \citep{srivastava_2014}, as a practical approximation based on Bernoulli distributions on the parameters.

In our work, we draw inspiration from the MCD approach and build upon its
success to quantify the uncertainty in PCs. However, in a crucial difference to neural networks and as the key
contribution of our paper, we
derive a closed-form solution to MCD leveraging the PC structure's clear probabilistic semantics. We refer to the derived procedure as \emph{tractable dropout inference} (TDI), which provides tractable uncertainty quantification in an efficient single forward pass by means of variance propagation. The right panel of \cref{fig:precision-ood} highlights how TDI successfully alleviates the overconfidence challenge of PCs and remarkably improves the OOD detection precision over the entire range of threshold values.
Measured over all OOD detection thresholds, TDI improves OOD precision over PCs without TDI by 2.2$\times$ on CIFAR-100 and CINIC, and 2.7$\times$ on LSUN (see \cref{sec:experimental_evaluation} for details). In summary, our key contributions are:
\begin{enumerate}
    \item We show that PCs suffer from overconfidence and fall short in distinguishing ID from OOD data.
    \item We introduce TDI, a novel inference routine that provides tractable uncertainty estimation. For this purpose, we derive a sampling-free analytical solution to MCD in PCs via variance propagation.
    \item We demonstrate TDI's robustness to distribution shifts and OOD data in three key scenarios: OOD data from different datasets, perturbed instances, and corrupted instances.
\end{enumerate}

\section{Related Work}
\label{sec:related_work}
The primary purpose and goal of our work is to introduce tractable uncertainty quantification to PCs to effectively perform OOD detection, an important aspect that has previously received little to no attention.
{On a broader scale, the current literature reflects minimal endeavors targeted at enhancing the robustness of PCs. These works
undertake the challenge from different perspectives.
For instance, to tackle overconfidence on in-domain misclassified samples when data is scarce,
\cite{maua_2017} introduced costly inference routines based
on credal sets that forgo tractable and general inference.
Recently, \cite{peddi_2022} presented a likelihood loss to involve artificially
perturbed samples during optimization. Although sharing the common aspect of challenging the robustness of PCs,
none of these works has provided an efficient tractable uncertainty quantification method to make arbitrary PCs more robust to different OOD
data without altering the model's parameters or compromising inference tractability. Thus, our most closely related works reside} in the respective area of uncertainty quantification, in particular, the closely related approximations made in neural network counterparts \citep{blundell_2015, gal_2016, kendall_2017}. Gauging such model uncertainty in turn provides substantial value in various tasks, including OOD detection, robustness to corrupted and perturbed data, and several downstream applications. We provide a brief overview of the latter for the purpose of completeness, before concentrating on the essence of our paper in terms of immediately related methodology to estimate model uncertainty.

\textbf{OOD Detection and Use of Uncertainty:}
Uncertainty quantification based on Bayesian methods provides a theoretical foundation to assess when a model lacks
confidence in its parameters and predictions. Alternatively, several other directions have been proposed to deal 
with OOD inputs in the inference phase. Notably, several works across the decades have proposed to include 
various forms of reject options in classification.
These methods are often criticized for their lack of theoretical grounding, leading to a separate thread advocating 
for the challenge to be addressed through open set recognition.
We point to the recent review of \cite{boult_2019} for an overview of techniques. In a similar spirit,
assessment of uncertainty has been shown to be foundational in the application to
e.g. active learning \citep{gal_2017} or continual learning \citep{ebrahimi_2020}.
We emphasize that these techniques and applications are complementary to our work and are yet to be explored in PCs.
Similarly to prior neural network-based efforts of \cite{ovadia_2019} and \cite{nalisnick_2019}, we first show
that PCs are incapable of inherent OOD detection, before leaning on uncertainty to overcome the challenge with our TDI.

\textbf{Uncertainty Quantification:}
In a simplified picture, methods to estimate uncertainty can be grouped into two main categories: methods falling into a Bayesian framework and alternative non-Bayesian ones. As the categorization suggests, the latter do not ground their principle in Bayesian statistics and provide quantification in different forms, such as the size of a prediction interval or a score~\citep{osband_2021, yu_2022_uai}. In contrast, Bayesian methods rely on a solid theoretical ground that allows for a clear interpretation. When applied to computation graphs like neural networks and PCs, the key
concept is to have a probability distribution over the parameters, in this
context, the weights of the graph. More formally, the model parameterization is framed as picking the parameters $\bm{\theta}$ (subject to optimization) from a prior probability distribution $p(\bm{\theta})$. We are then interested in the parameter configuration that most likely represents the data $\data$, i.e.,
$\argmax_{\bm{\theta}} p(\bm{\theta}|\data)$. To account for model uncertainty, it would be necessary to integrate over the parameters, which is intractable for many models.


A considerable number of works have pursued this direction for
deep neural architectures. Bayesian neural networks~\citep{mackay_1992, neal_2012} have
initially paved the way to model uncertainty, but given the immense computational cost, alternatives focus on several
cheap approximations. Popular ways are to back-propagate uncertainty through
the gradients~\citep{blundell_2015, hernandez_lobato_2015, mishkin_2018, maddox_2019}, make use of variational inference~\citep{graves_2011, louizos_2016} or draw connections to Gaussian processes~\citep{gal_2016, khan_2019}.
A related approximate approach is based on ensembles, where the underlying idea is to relate the
model uncertainty with the statistics computed over the various ensemble components. To obtain uncertainty estimates, most of these
approaches need to train multiple ensemble components~\citep{hansen_1990, lakshminarayanan_2017} or larger
overparameterized singletons and treat them as an ensemble of
subnetworks~\citep{antoran_2020, daxberger_2021}.
Among the Bayesian methods for learning a PC on propositional knowledge bases, \cite{cerutti_2022} deal with uncertainty estimation for conditional boolean queries by attaching a second circuit to the PC. 

\textbf{Monte Carlo Dropout:}
The natural question for Bayesian methods is how to sample
the parameters $\bm{\theta}$ from the posterior $p\left(\bm{\theta}\cbar\data\right)$, taking the
high-dimensional and highly non-convex nature of the probability distribution for complex networks into account, which renders standard
sampling methods intractable~\citep{izmailov_2021}. \cite{gal_2016}
have reframed dropout \citep{srivastava_2014} as a Bayesian approximation to assess model uncertainty. Originally, dropout is a method proposed to avoid overfitting and
improve generalization by including a stochastic chance $p$ of removing a connection between units of adjacent layers. The key realization of \cite{gal_2016} is that dropout allows one to cheaply sample from the posterior under the assumption of a Bernoulli distribution on the weights.
In essence, MCD approximates the integration over the parameters with a summation over a finite set of $n$ drawn sets of parameters
$\bm{\theta}_i \sim p\left(\bm{\theta}\cbar\data\right)$. By using the set of $n$ predicted
values, the empirical mean and variance (i.e., the first raw and the second central moment) can be computed. The former is then used as the prediction and the latter as an estimate of model uncertainty.

The essential advantage of MCD is its simple applicability, which has led to a wide range of immediate applications \citep{kendall_2017, gal_2017, kendall2017a, miller_2018}. In our work, we draw inspiration from MCD and its vast impact. However, instead of approximating the uncertainty with a Monte Carlo simulation in PCs, we perform variance propagation from the leaf to the root nodes with a single pass, with which we derive a sampling-free, closed-form solution to model uncertainty.

\section{Tractable Dropout Inference}
\label{sec:tdi}
In this section, we first introduce preliminaries with respect to PCs, before continuing to delve into a step-by-step derivation of how to obtain sampling-free uncertainties with TDI.

\subsection{Preface: Probabilistic Circuits}
\label{sec:tdi:pcs}

In this work, we refer to a relevant class of PCs, i.e.,
sum-product networks (SPNs)~\citep{poon_2011}. In the family of
tractable probabilistic models, SPNs stand out for their inference capabilities
and great representational power~\citep{delalleau_2011}.
They hold important structural properties
such as \emph{smoothness} and \emph{decomposability}
that enable the efficient encoding of a valid
probability distribution. In the following, we first formally introduce SPNs and their important properties.

\begin{figure*}
  \centering
  \begin{subfigure}[t]{0.333\linewidth}
% %%%%% GRAPH PC %%%%%
\centering \scalebox{0.3333}{
    \begin{tikzpicture}[minimum size=8mm, inner sep=0pt,
      align=center,
prod/.style={circle,tab10blue, draw, fill=tab10blue!10,line width=\nodethickness, minimum size=\minnodesize, path picture={
    \draw[tab10blue]
    (path picture bounding box.south east) -- (path picture bounding box.north west) (path picture bounding box.south west) -- (path picture bounding box.north east);
    }},
    sum/.style={circle, tab10green, draw, fill=tab10green!10, line width=\nodethickness, minimum size=\minnodesize, path picture={
    \draw[tab10green]
    (path picture bounding box.south) -- (path picture bounding box.north) (path picture bounding box.west) -- (path picture bounding box.east);
    }},
    leaf/.style={circle, draw, line width=\nodethickness, minimum
      size=\minnodesize},
    pcedge/.style={thick,{Stealth[scale=1.25]}-, font=\fontsize{16}{0}\selectfont},
    ]

    % NODE sum1 layer
    \node[sum, label={[font=\fontsize{16}{0}\selectfont]north:$\spn$}] (s1) {};

    % NODE prod1 layer
    \node[prod] (p11) [below left=\nodedist and 1.25\nodedist of s1] {};
    \node[prod] (p12) [below right=\nodedist and 1.25\nodedist of s1] {};

    % NODE sum2 layer
    \node[sum] (s21) [below left=\prodnodedist and 0.75\prodnodedist of p11] {};
    \node[leaf] (l22) [below right=\prodnodedist and 0.75\prodnodedist of p11] {\Large $\X_{3}$};
    \node[leaf] (l23) [below left=\prodnodedist and 0.75\prodnodedist of p12] {\Large $\X_{1}$};
    \node[sum] (s24) [below right=\prodnodedist and 0.75\prodnodedist of p12] {};

    % NODE prod2 layer
    \node[prod] (p21) [below left=\nodedist and 0.5\nodedist of s21] {};
    \node[prod] (p22) [below right=\nodedist and 1.25\nodedist of s21] {};
    \node[prod] (p23) [below left=\nodedist and 1.25\nodedist of s24] {};
    \node[prod] (p24) [below right=\nodedist and 0.5\nodedist of s24] {};

    % NODE leaf layer
    \node[leaf] (l31) [below left=1.25\nodedist and 0.3\leafnodedist of p21] {\Large $\X_{1}$};
    \node[leaf] (l32) [below right=1.25\nodedist and 0.3\leafnodedist of p21] {\Large $\X_{2}$};
    \node[leaf] (l33) [below left=1.25\nodedist and 0.3\leafnodedist of p22] {\Large $\X_{1}$};
    \node[leaf] (l34) [below right=1.25\nodedist and 0.3\leafnodedist of p22] {\Large $\X_{2}$};
    \node[leaf] (l35) [below left=1.25\nodedist and 0.3\leafnodedist of p23] {\Large $\X_{2}$};
    \node[leaf] (l36) [below right=1.25\nodedist and 0.3\leafnodedist of p23] {\Large $\X_{3}$};
    \node[leaf] (l37) [below left=1.25\nodedist and 0.3\leafnodedist of p24] {\Large $\X_{2}$};
    \node[leaf] (l38) [below right=1.25\nodedist and 0.3\leafnodedist of p24] {\Large $\X_{3}$};

    % EDGE sum to split
    \draw[pcedge] (s1) -- (p11) node[midway,fill=white] {$\prodnode_{1}$};
    \draw[pcedge] (s1) -- (p12) node[midway,fill=white] {$\prodnode_{2}$};

    % EDGE prod1 to sum2
    \draw[pcedge] (p11) -- (s21) node[midway,fill=white] {$\sumnode_{2}$};
    \draw[pcedge] (p11) -- (l22) node[midway,fill=white] {$\leafnode_{1}^{3}$};
    \draw[pcedge] (p12) -- (l23) node[midway,fill=white] {$\leafnode_{2}^{1}$};
    \draw[pcedge] (p12) -- (s24) node[midway,fill=white] {$\sumnode_{3}$};

    % EDGE sum2 to prod2
    \draw[pcedge] (s21) -- (p21) node[midway,fill=white] {$\prodnode_{3}$};
    \draw[pcedge] (s21) -- (p22) node[midway,fill=white] {$\prodnode_{4}$};
    \draw[pcedge] (s24) -- (p23) node[midway,fill=white] {$\prodnode_{5}$};
    \draw[pcedge] (s24) -- (p24) node[midway,fill=white] {$\prodnode_{6}$};

    % EDGE prod2 to leaf
    \draw[pcedge] (p21) -- (l31) node[midway,fill=white] {$\leafnode_{3}^{1}$};
    \draw[pcedge] (p21) -- (l32) node[midway,fill=white] {$\leafnode_{4}^{2}$};
    \draw[pcedge] (p22) -- (l33) node[midway,fill=white] {$\leafnode_{5}^{1}$};
    \draw[pcedge] (p22) -- (l34) node[midway,fill=white] {$\leafnode_{6}^{2}$};
    \draw[pcedge] (p23) -- (l35) node[midway,fill=white] {$\leafnode_{7}^{2}$};
    \draw[pcedge] (p23) -- (l36) node[midway,fill=white] {$\leafnode_{8}^{3}$};
    \draw[pcedge] (p24) -- (l37) node[midway,fill=white] {$\leafnode_{9}^{2}$};
    \draw[pcedge] (p24) -- (l38) node[midway,fill=white] {$\leafnode_{10}^{3}$};
  \end{tikzpicture}
} 
    \caption{Probabilistic inference}
    \label{fig:graphs:pc}
  \end{subfigure}%
  \begin{subfigure}[t]{0.333\linewidth}
% %%%%% GRAPH MCD %%%%%
\centering \scalebox{0.3333}{
    \begin{tikzpicture}[minimum size=8mm, inner sep=0pt,
      align=center,
    prod/.style={circle,tab10blue, draw, fill=tab10blue!10,line width=\nodethickness, minimum size=\minnodesize, path picture={
    \draw[tab10blue]
    (path picture bounding box.south east) -- (path picture bounding box.north west) (path picture bounding box.south west) -- (path picture bounding box.north east);
    }},
    proddrop/.style={circle,tab10blue!30, draw, fill=tab10blue!05,line width=\nodethickness, minimum size=\minnodesize, path picture={
    \draw[tab10blue!30]
    (path picture bounding box.south east) -- (path picture bounding box.north west) (path picture bounding box.south west) -- (path picture bounding box.north east);
    }},
    sum/.style={circle, tab10green, draw, fill=tab10green!10, line width=\nodethickness, minimum size=\minnodesize, path picture={
    \draw[tab10green]
    (path picture bounding box.south) -- (path picture bounding box.north) (path picture bounding box.west) -- (path picture bounding box.east);
    }},
    leafdrop/.style={circle, gray!30, draw, line width=\nodethickness, minimum size=\minnodesize},
    leaf/.style={circle, draw, line width=\nodethickness, minimum
      size=\minnodesize},
    pcedge/.style={thick,{Stealth[scale=1.25]}-, font=\fontsize{16}{0}\selectfont},
    pcedgedrop/.style={thick,{Stealth[scale=1.25]}-, gray!50, font=\fontsize{16}{0}\selectfont},
    ]


    % NODE sum1 layer
    \node[sum, label={[font=\fontsize{16}{0}\selectfont]north:$\spn$}] (s1) {};

    % NODE prod1 layer
    \node[prod] (p11) [below left=\nodedist and 1.25\nodedist of s1] {};
    \node[prod] (p12) [below right=\nodedist and 1.25\nodedist of s1] {};

    % NODE sum2 layer
    \node[sum] (s21) [below left=\prodnodedist and 0.75\prodnodedist of p11] {};
    \node[leaf] (l22) [below right=\prodnodedist and 0.75\prodnodedist of p11] {\Large $\X_{3}$};
    \node[leaf] (l23) [below left=\prodnodedist and 0.75\prodnodedist of p12] {\Large $\X_{1}$};
    \node[sum] (s24) [below right=\prodnodedist and 0.75\prodnodedist of p12] {};

    % NODE prod2 layer
    \node[proddrop] (p21) [below left=\nodedist and 0.5\nodedist of s21] {};
    \node[prod] (p22) [below right=\nodedist and 1.25\nodedist of s21] {};
    \node[prod] (p23) [below left=\nodedist and 1.25\nodedist of s24] {};
    \node[proddrop] (p24) [below right=\nodedist and 0.5\nodedist of s24] {};

    % NODE leaf layer
    \node[leafdrop] (l31) [below left=1.25\nodedist and 0.3\leafnodedist of p21] {\Large $\X_{1}$};
    \node[leafdrop] (l32) [below right=1.25\nodedist and 0.3\leafnodedist of p21] {\Large $\X_{2}$};
    \node[leaf] (l33) [below left=1.25\nodedist and 0.3\leafnodedist of p22] {\Large $\X_{1}$};
    \node[leaf] (l34) [below right=1.25\nodedist and 0.3\leafnodedist of p22] {\Large $\X_{2}$};
    \node[leaf] (l35) [below left=1.25\nodedist and 0.3\leafnodedist of p23] {\Large $\X_{2}$};
    \node[leaf] (l36) [below right=1.25\nodedist and 0.3\leafnodedist of p23] {\Large $\X_{3}$};
    \node[leafdrop] (l37) [below left=1.25\nodedist and 0.3\leafnodedist of p24] {\Large $\X_{2}$};
    \node[leafdrop] (l38) [below right=1.25\nodedist and 0.3\leafnodedist of p24] {\Large $\X_{3}$};

    % EDGE sum to split
    \draw[pcedge] (s1) -- (p11) node[midway,fill=white] {$\prodnodeD_{1}$};
    \draw[pcedge] (s1) -- (p12) node[midway,fill=white] {$\prodnodeD_{2}$};

    % EDGE prod1 to sum2
    \draw[pcedge] (p11) -- (s21) node[midway,fill=white] {$\sumnodeD_{2}$};
    \draw[pcedge] (p11) -- (l22) node[midway,fill=white] {$\leafnodeD_{1}^{3}$};
    \draw[pcedge] (p12) -- (l23) node[midway,fill=white] {$\leafnodeD_{2}^{1}$};
    \draw[pcedge] (p12) -- (s24) node[midway,fill=white] {$\sumnodeD_{3}$};

    % EDGE sum2 to prod2
    \draw[pcedgedrop] (s21) -- (p21) node[midway,fill=white] {$\prodnodeD_{3}$};
    \draw[pcedge] (s21) -- (p22) node[midway,fill=white] {$\prodnodeD_{4}$};
    \draw[pcedge] (s24) -- (p23) node[midway,fill=white] {$\prodnodeD_{5}$};
    \draw[pcedgedrop] (s24) -- (p24) node[midway,fill=white] {$\prodnodeD_{6}$};

    % EDGE prod2 to leaf
    \draw[pcedgedrop] (p21) -- (l31) node[midway,fill=white] {$\leafnodeD_{3}^{1}$};
    \draw[pcedgedrop] (p21) -- (l32) node[midway,fill=white] {$\leafnodeD_{4}^{2}$};
    \draw[pcedge] (p22) -- (l33) node[midway,fill=white] {$\leafnodeD_{5}^{1}$};
    \draw[pcedge] (p22) -- (l34) node[midway,fill=white] {$\leafnodeD_{6}^{2}$};
    \draw[pcedge] (p23) -- (l35) node[midway,fill=white] {$\leafnodeD_{7}^{2}$};
    \draw[pcedge] (p23) -- (l36) node[midway,fill=white] {$\leafnodeD_{8}^{3}$};
    \draw[pcedgedrop] (p24) -- (l37) node[midway,fill=white] {$\leafnodeD_{9}^{2}$};
    \draw[pcedgedrop] (p24) -- (l38) node[midway,fill=white] {$\leafnodeD_{10}^{3}$};
  \end{tikzpicture}
}
    \caption{Monte Carlo dropout inference}
    \label{fig:graphs:mcd}
  \end{subfigure}%
  \begin{subfigure}[t]{0.333\linewidth}
\newcommand{\prodlabel}[1]{$\Varinline{\prodnodeD_{#1}}$}
\newcommand{\sumlabel}[1]{$\Varinline{\sumnodeD_{#1}}$}
\newcommand{\leaflabel}[2]{$\Varinline{\leafnodeD_{#1}^{#2}}$}
% %%%%% GRAPH CF %%%%%
\centering \scalebox{0.3333}{
    \begin{tikzpicture}[minimum size=8mm, inner sep=0pt,
      align=center,
prod/.style={circle,tab10blue, draw, fill=tab10blue!10,line width=\nodethickness, minimum size=\minnodesize, path picture={
    \draw[tab10blue]
    (path picture bounding box.south east) -- (path picture bounding box.north west) (path picture bounding box.south west) -- (path picture bounding box.north east);
    }},
    sum/.style={circle, tab10green, draw, fill=tab10green!10, line width=\nodethickness, minimum size=\minnodesize, path picture={
    \draw[tab10green]
    (path picture bounding box.south) -- (path picture bounding box.north) (path picture bounding box.west) -- (path picture bounding box.east);
    }},
    leaf/.style={circle, draw, line width=\nodethickness, minimum
      size=\minnodesize},
    pcedge/.style={thick,{Stealth[scale=1.25]}-, font=\fontsize{16}{0}\selectfont},
    ]

    % NODE sum1 layer
    \node[sum, label={[font=\fontsize{16}{0}\selectfont]north:$\Varinline{\spn}$}] (s1) {};

    % NODE prod1 layer
    \node[prod] (p11) [below left=\nodedist and 1.25\nodedist of s1] {};
    \node[prod] (p12) [below right=\nodedist and 1.25\nodedist of s1] {};

    % NODE sum2 layer
    \node[sum] (s21) [below left=\prodnodedist and 0.75\prodnodedist of p11] {};
    \node[leaf] (l22) [below right=\prodnodedist and 0.75\prodnodedist of p11] {\Large $\X_{3}$};
    \node[leaf] (l23) [below left=\prodnodedist and 0.75\prodnodedist of p12] {\Large $\X_{1}$};
    \node[sum] (s24) [below right=\prodnodedist and 0.75\prodnodedist of p12] {};

    % NODE prod2 layer
    \node[prod] (p21) [below left=\nodedist and 0.5\nodedist of s21] {};
    \node[prod] (p22) [below right=\nodedist and 1.25\nodedist of s21] {};
    \node[prod] (p23) [below left=\nodedist and 1.25\nodedist of s24] {};
    \node[prod] (p24) [below right=\nodedist and 0.5\nodedist of s24] {};

    % NODE leaf layer
    \node[leaf] (l31) [below left=1.25\nodedist and 0.3\leafnodedist of p21] {\Large $\X_{1}$};
    \node[leaf] (l32) [below right=1.25\nodedist and 0.3\leafnodedist of p21] {\Large $\X_{2}$};
    \node[leaf] (l33) [below left=1.25\nodedist and 0.3\leafnodedist of p22] {\Large $\X_{1}$};
    \node[leaf] (l34) [below right=1.25\nodedist and 0.3\leafnodedist of p22] {\Large $\X_{2}$};
    \node[leaf] (l35) [below left=1.25\nodedist and 0.3\leafnodedist of p23] {\Large $\X_{2}$};
    \node[leaf] (l36) [below right=1.25\nodedist and 0.3\leafnodedist of p23] {\Large $\X_{3}$};
    \node[leaf] (l37) [below left=1.25\nodedist and 0.3\leafnodedist of p24] {\Large $\X_{2}$};
    \node[leaf] (l38) [below right=1.25\nodedist and 0.3\leafnodedist of p24] {\Large $\X_{3}$};

    % EDGE sum to split
    \draw[pcedge] (s1) -- (p11) node[midway,fill=white] {\prodlabel{1}};
    \draw[pcedge] (s1) -- (p12) node[midway,fill=white] {\prodlabel{2}};

    % EDGE prod1 to sum2
    \draw[pcedge] (p11) -- (s21) node[midway,fill=white] {\sumlabel{2}};
    \draw[pcedge] (p11) -- (l22) node[midway,fill=white] {\leaflabel{1}{3}};
    \draw[pcedge] (p12) -- (l23) node[midway,fill=white] {\leaflabel{2}{1}};
    \draw[pcedge] (p12) -- (s24) node[midway,fill=white] {\sumlabel{3}};

    % EDGE sum2 to prod2
    \draw[pcedge] (s21) -- (p21) node[midway,fill=white] {\prodlabel{3}};
    \draw[pcedge] (s21) -- (p22) node[midway,fill=white] {\prodlabel{4}};
    \draw[pcedge] (s24) -- (p23) node[midway,fill=white] {\prodlabel{5}};
    \draw[pcedge] (s24) -- (p24) node[midway,fill=white] {\prodlabel{6}};

    % EDGE prod2 to leaf
    % \draw[pcedge] (p21) -- (l31) node[midway,fill=white, xshift=-0.4cm] {\leaflabel{3}{1}};
    % \draw[pcedge] (p21) -- (l32) node[midway,fill=white, xshift=0.4cm] {\leaflabel{4}{2}};
    % \draw[pcedge] (p22) -- (l33) node[midway,fill=white, xshift=-0.4cm] {\leaflabel{5}{1}};
    % \draw[pcedge] (p22) -- (l34) node[midway,fill=white, xshift=0.4cm] {\leaflabel{6}{2}};
    % \draw[pcedge] (p23) -- (l35) node[midway,fill=white, xshift=-0.4cm] {\leaflabel{7}{2}};
    % \draw[pcedge] (p23) -- (l36) node[midway,fill=white, xshift=0.4cm] {\leaflabel{8}{3}};
    % \draw[pcedge] (p24) -- (l37) node[midway,fill=white, xshift=-0.4cm] {\leaflabel{9}{2}};
    % \draw[pcedge] (p24) -- (l38) node[midway,fill=white, xshift=0.4cm] {\leaflabel{10}{3}};

    \draw[pcedge] (p21) -- (l31) node[midway,fill=white, xshift=-0.3cm] {\leaflabel{3}{1}};
    \draw[pcedge] (p21) -- (l32) node[midway,fill=white, xshift=0.3cm] {\leaflabel{4}{2}};
    \draw[pcedge] (p22) -- (l33) node[midway,fill=white, xshift=-0.3cm] {\leaflabel{5}{1}};
    \draw[pcedge] (p22) -- (l34) node[midway,fill=white, xshift=0.3cm] {\leaflabel{6}{2}};
    \draw[pcedge] (p23) -- (l35) node[midway,fill=white, xshift=-0.3cm] {\leaflabel{7}{2}};
    \draw[pcedge] (p23) -- (l36) node[midway,fill=white, xshift=0.3cm] {\leaflabel{8}{3}};
    \draw[pcedge] (p24) -- (l37) node[midway,fill=white, xshift=-0.3cm] {\leaflabel{9}{2}};
    \draw[pcedge] (p24) -- (l38) node[midway,fill=white, xshift=0.3cm] {\leaflabel{10}{3}};
  \end{tikzpicture}
}
    \caption{Tractable dropout inference}
    \label{fig:graphs:cf}
  \end{subfigure}%
  \caption{Forward pass illustration of a PC (\subref{fig:graphs:pc}), a PC with
    MCD sampling (\subref{fig:graphs:mcd}), and a PC with TDI based on variance propagation through the graph (\subref{fig:graphs:cf}). Whereas a single MCD forward pass only simulates one instantiation of a possible
    subgraph, the PC with TDI directly propagates the variances through the graph in a
    single pass (expectations and covariances are left out for visual clarity in the illustration, see \cref{sec:tdi:derivation} for full equations).}
  \label{fig:graphs}
\end{figure*}

\textbf{Definition:}
An SPN $\spn$ is a computational graph defined by a rooted directed acyclic graph (DAG), encoding
a probability distribution $p_{\X}$ over a set of random variables (RVs)
$\X = \{X_1, \ldots, X_n\}$, where inner nodes are either sum nodes $\sumnode$ or product
nodes $\prodnode$ over their children,
 and leaves $\leafnode$ are valid probability distributions defined on a
subset of the RVs $\Z \subseteq \X$. Each node $\node \in \spn$ has a \emph{scope},
$\scope{\node} \subseteq \X$, defined as the set of RVs appearing in
its descendant leaves. Each edge $(\sumnode_{i} \rightarrow \node_{j})$ connecting a sum node
$\sumnode_i$ to one of its children $\node_j \in \ch\left(\sumnode_i\right)$ has a non-negative weight $w_{ij}$, with
$\sum_j w_{ij} = 1$. Sum nodes represent mixtures over the probability
distributions encoded by their children, while product nodes represent
factorizations over contextually independent distributions. In summary, an SPN
can be viewed as a deep hierarchical mixture model of different factorizations.
An illustration of this kind of PC is shown in \cref{fig:graphs:pc}.

To encode a valid probability distribution, an SPN has to fulfill
two structural requirements~\citep{poon_2011}. One is
\textit{decomposability}, i.e., the scopes of the children of each product node
need to be disjoint, which allows distributing the NP-hard computation of
integrals (e.g., the partition function) to the leaves, where we require
this computation to be tractable. This condition can be fulfilled either by an explicit form, e.g., from an exponential family distribution, or by the architectural design of the leaf
density estimators. The second requirement is \textit{smoothness}, constraining the scopes of
the children of each sum node to be identical (this is also referred
to as \emph{completeness}). This constraint is important to encode a
valid distribution that does not over- or underestimate the states of some RVs. In a
valid SPN, the probability assigned to a given state $\x$ of the RVs
$\X$ is given by the root node and will be denoted as $\spn(\x) = p_{\X}(\X = \x)$.

\textbf{Tractable Inference:}
Given an SPN $\spn$, $\spn\!\left(\x\right)$ is computed
by evaluating the network bottom-up. When evaluating a leaf node $\leafnode_{i}$
with scope $X_j$, $\leafnode_{i}\!\left(x_j\right)$ corresponds to the probability of the
state $x_j$. The value of a product node $\prodnode_{i}$
corresponds to the product of its children's values:
$\prodnode_{i}\!\left(\x_{|\scope{\prodnode_{i}}}\right)=\prod_{\node_{j} \in \ch\left(\prodnode_i\right)} \node_{j}(\x_{|\scope{\node_{j}}})$.
The value of a sum node $\sumnode_{i}$ is computed as the weighted sum
of its children's values:
$\sumnode_{i}\left(\x_{|\scope{\sumnode_{i}}}\right) = \sum_{\node_{j} \in \ch\left(\sumnode_i\right)}w_{ij} \node_{j}(\x_{|\scope{\node_{j}}})$.
All exact marginal and conditional probabilities, also with different amounts
of evidence, the exact partition function, and even approximate most probable explanation and maximum a posteriori
states can be computed in time and space linear in the network's \emph{size}, i.e., the number of edges~\citep{poon_2011, peharz_2015}.

Being part of the PC family, SPNs share the basic building
blocks and procedures when performing inference with other classes of PCs. Thus, the contributions
that we introduce in this paper can also be easily applied to other
models of the PC family~\citep{choi_2020}. We further note that because TDI is presently framed as an inference routine, training follows conventional algorithms. Whereas the inclusion of uncertainties to guide training itself is certainly intriguing, we defer this prospect to future work and concentrate on the means to quantify uncertainty to overcome overconfidence and detect OOD data.  

\subsection{Deriving TDI}
\label{sec:tdi:derivation}
One of the crucial aspects in which PCs stand out compared to neural networks is
that they have clear probabilistic semantics. Leaf nodes are normalized
tractable distributions and sum nodes represent weighted
mixtures over different factorizations, encoded by the product nodes, on the
same scope. When dropout is applied to PCs, it thus entails an easier
interpretation. At first look, similar to the case for neural networks, MCD
at sum nodes would perform a sort of model averaging between a randomly selected set
of models (sub-graphs). Naively, we could follow the same procedure as proposed
in \cite{gal_2016} and conduct stochastic forward passes with a more intuitive
interpretation on mixtures over different factorizations (compared to a
black-box neural network); see \cref{fig:graphs:mcd} for an illustration. The
variance of the different models is then interpreted as the uncertainty w.r.t. a
specific input. However, we will now show that it is also possible to derive TDI as a
tractable uncertainty estimate in a single forward pass in PCs. With our closed-form derivations, we thus provide an
analytical solution to this measure of uncertainty.

\textbf{Core Idea -- Sampling-free Uncertainty:}
The general idea is to derive closed-form expressions of the expectation,
variance, and covariance for sum and product nodes as a function of their children.
The model uncertainty at the root nodes is then recursively computed by
performing variance propagation from the leaf nodes to the root nodes
in a single bottom-up pass through the graph structure (see \cref{fig:graphs:cf}). This procedure results in
uncertainty estimates without sampling from the graph multiple times, as is required by MCD.

In TDI we start by viewing sum nodes as linear combinations of RVs over their Bernoulli
dropout RVs and their children:
%
\begin{align}
  \label{eq:sum-prod-definitions}
  \sumnodeD = \sum_{i} \delta_{i} w_{i} \prodnode_{i} \, ,
\end{align}
%
where $\delta_{i} \sim \text{Bern}\!\left(q\right)$ and $p = 1-q$ corresponds to
the dropout probability.

We will now provide the expectation, variance, and covariance closed-form
solutions for sum, product, and leaf nodes.
Here, we present the most relevant derivations
for these quantities. For full derivations and theoretical details, we refer to Appendix A.

\subsubsection{Expectation: The Point Estimate}
\label{sec:tdi:derivation:expectation}
Using the linearity of the expectation, we push the expectation into the sum
and make use of the independence between the child nodes $\nodeD_{i}$ and the
Bernoulli RVs $\delta_{i}$ to extract $\Einline{\delta_i} = q$:
%
\begin{align}
  \label{eq:sum-expectation}
  \E{\sumnodeD} = q \sum_{i} w_{i}\E{\nodeD_{i}} \, .
\end{align}
%

The decomposability of product nodes ensures the independence of their children
w.r.t. each other, which leads to the product node expectation simply becoming the
product over the expectations of its children, i.e.,
%
\begin{align}
  \label{eq:prod-expectation}
  \E{\prodnodeD} =  \prod_{i} \E{\nodeD_{i}} \, .
\end{align}
%

\subsubsection{Variance: The Uncertainty Proxy}
\label{sec:tdi:derivation:variance}
Similar to the original work in neural networks of~\cite{gal_2016},
we will use the variance as
the proxy for the uncertainty. The sum node variance decomposes
into a sum of two terms. The first term is based on the variances and
expectations of its children and the second term accounts for the covariance
between the combinations of all children, i.e.,
%
\begin{alignat}{2}
  \label{eq:sum-variance}
  \Var{\sumnodeD} &= && q\sum_{i} w_{i}^{2} \left( \Var{\nodeD_{i}} + p\E{\nodeD_{i}}^{2}  \right) \notag\\
                  &  &&+ q^{2}\sum_{i\neq j}w_{i}w_{j}\Cov{\nodeD_{i}, \nodeD_{j}} \, .
\end{alignat}
%
Analogously, the product node variance decomposes into two product terms. By
applying the product of independent variables rule, we obtain:
\begin{align}
  \label{eq:prod-variance}
  \Var{\prodnodeD} &= \prod_{i} \left( \Var{\nodeD_{i}} + \E{\nodeD_{i}}^{2} \right) - \prod_{i} \E{\nodeD_{i}}^{2} \, .
\end{align}
%
\subsubsection{Covariance: The Evil}
\label{sec:tdi:derivation:covariance}
The covariance between two sum nodes, $\sumnodeD^{A}$ and $\sumnodeD^{B}$,
neatly decomposes into a weighted sum of all covariance combinations of the sum
node children:
\begin{align}
  \label{eq:sum-covariance}
  \text{Cov}\!\left[ \sumnodeD^{A}, \sumnodeD^{B}\right] = q^2\sum_{i}w_{i}^{A}\sum_{j}w_{j}^{B}\Cov{\nodeD_{i}^{A},\nodeD_{j}^{B}} \, .
\end{align}
%
For an arbitrary graph, we are unable to provide a closed-form solution of the covariance between two product
nodes due to the first expectation in the product node
covariance:
%
\begin{alignat}{2}
  \Cov{\prodnodeD^{A},\prodnodeD^{B}} &= &&\E{\prod_{i}\nodeD_{i}^{A} \prod_j \nodeD_{j}^{B}} \notag\\
                                      & &&- \prod_{i}\E{\nodeD_{i}^{A}} \prod_j \E{\nodeD_{j}^{B}} \, , \label{eq:prod-covariance}
\end{alignat}
%
which cannot be simplified without any structural knowledge about independencies
between the children of $\prodnodeD^{A}$ and $\prodnodeD^{B}$, as they may share
a common subset of nodes deeper down in the PC structure. Fortunately, there are
three possible ways to resolve \cref{eq:prod-covariance}, which we explore in the following; variants b) and c) are always applicable.

\textbf{a) Structural Knowledge:}
To simplify \cref{eq:prod-covariance} we can easily exploit structural knowledge
of the DAG. The simplest solution is a structure in which we know that two
product nodes $\prodnodeD^{A}$ and $\prodnodeD^{B}$ are not common ancestors of
any node, resulting in the independence $\prodnodeD^{A} \ind \prodnodeD^{B}$ and thus $\Covinline{\prodnodeD^{A}, \prodnodeD^{B}} = 0$. This
constraint is always satisfied in tree-structured PCs. Because they are simple in practice,
tree structures are generated by the most common structure learners for SPNs, such
as LearnSPN~\citep{gens_2013}, ID-SPN~\citep{rooshenas_2014}, and
SVD-SPN~\citep{adel_2015}.

For binary tree random and tensorized (RAT) structures \citep{peharz_2020}, \cref{eq:prod-covariance} can be simplified to
%
\begin{alignat}{2}
  \Cov{\prodnodeD_{l,r}, \prodnodeD_{l',r'}} =&\, &&\Cov{\sumnodeD^L_{l}, \sumnodeD^L_{l'}} \E{\sumnodeD^R_{r}} \E{\sumnodeD^R_{r'}} \notag \\
  &+\, &&\Cov{\sumnodeD^R_{r}, \sumnodeD^R_{r'}} \E{\sumnodeD^L_{l}} \E{\sumnodeD^L_{l'}} \notag \\
  &+\, &&\Cov{\sumnodeD^L_{l}, \sumnodeD^L_{l'}} \Cov{\sumnodeD^R_{r}, \sumnodeD^R_{r'}} \, . \label{eq:cov-rat}
\end{alignat}
%
The covariance of two product nodes now only depends on the covariance of the
input sum nodes of the same graph partition ($L$ or $R$) for which we can
plug in \cref{eq:sum-covariance}.

\textbf{b) It's Somewhere in Here -- Covariance Bounds:}
Whereas knowledge about the specific PC structure can facilitate the covariance
computation, when not available, we can
alternatively obtain a lower and upper bound of the covariance, making use of
the Cauchy-Schwarz inequality:
%
\begin{alignat}{2}
  \label{eq:cauchy-schwarz}
  &\Cov{\ndi, \ndj}^{2} &&\leq \Var{\ndi}\Var{\ndj} \\
  \Leftrightarrow \quad &\Cov{\ndi, \ndj} &&\in \left[ - \sqrt{\Var{\ndi}\Var{\ndj}}, \right. \notag\\
  & && \left.+ \sqrt{\Var{\ndi}\Var{\ndj}} \right] \quad . \label{eq:cauchy-schwarz:last}
\end{alignat}
%

\textbf{c) The Copy-paste Solution:}
A third alternative to using structural knowledge or giving covariance
bounds is via a ``copy-paste'' augmentation of the DAG, that enforces the
covariance between two nodes, $\nodeD_{A}$ and $\nodeD_{B}$, to be zero
by treating their common child as two separate nodes. That
is, for each node $\nodeD_{C}$ where a
$\text{Path}_{A} := \nodeD_{A} \rightarrow \nodeD_{C}$ and a second
$\text{Path}_{B} := \nodeD_{B} \rightarrow \nodeD_{C}$ exists, we can ``copy''
$\nodeD_{C}$ to obtain an equivalent node $\nodeD_{C'}$ and replace the original
$\nodeD_{C}$ in $\text{Path}_{B}$ with the copy $\nodeD_{C'}$. With this simple
procedure, we can enforce a tree structure on the PC, so that the
covariance between any two children of a node $\nodeD$ is zero.
In practice, we do not need to modify the DAG. Instead, we can simply ignore the covariance
terms and thereby obtain this DAG transformation during the TDI procedure implicitly.

\subsubsection{Leaf Nodes}
\label{sec:tdi:derivation:leaf-nodes}

As leaf nodes are free of any dropout Bernoulli variables, their expectation
reduces to the leaf node value, while their variance and covariance are zero,
i.e.,
%
\begin{align}
  \label{eq:leaf-exp-var-cov}
  \E{\leafnodeD} = \leafnodeD, \quad \Var{\leafnodeD} = 0, \quad \Cov{\leafnodeD_{i},\leafnodeD_{j}} = 0 \, .
\end{align}
%
While the above is a valid choice, this framework
further allows including prior knowledge about aleatoric and epistemic
uncertainty, by setting $\Varinline{\leafnodeD} >0$ and
$\Covinline{\leafnodeD_{i}, \leafnodeD_{j}} \neq 0$. This additionally
highlights the advantage over the MCD procedure, where the inclusion of prior knowledge
is not possible.

\subsubsection{Classification Uncertainty}
\label{sec:tdi:derivation:classification}
For classification in PCs, we can express the class conditionals $p\left(\x \cbar y_i\right) = \sumnodeD_i$ as root nodes with class priors $p\left(y_i\right)=c_i$ and obtain the posterior  via Bayes' rule, i.e.,
%
\begin{align}
  \label{eq:1}
  p\left(y_{i} \cbar \x\right) = \frac{p\left(\x \cbar y_{i}\right) p\left(y_{i}\right)}{\sum_{j} p\left(\x \cbar y_{j}\right) p\left(y_{j}\right)} = \frac{\sumnodeD_{i}\ci}{\sum_{j}\sumnodeD_{j}\cj} \, .
\end{align}
%
In our case, the expectation and variance of the posterior are those of a ratio of random
variables, $\E{\frac{A}{B}}$ and $\Var{\frac{A}{B}}$, with $A = \sumnodeD_{i}\ci$ and
$B = \sum_{j}\sumnodeD_{j}\cj$. This ratio is generally not well-defined, but can
be approximated with a second-order Taylor
approximation~\citep{ratio_seltman}: 
%
\begin{align}
  \E{\frac{A}{B}} &\approx \frac{\E{A}}{\E{B}} - \frac{\Cov{A, B}}{\left( \E{B} \right)^{2}} + \frac{\Var{B}\E{A}}{\left( \E{B} \right)^{3}} \label{eq:exp-rv-ratio}\\
  \Var{\frac{A}{B}} &\approx \frac{\E{A}^{2}}{\E{B}^{2}} \left[ \frac{\Var{A}}{\E{A}^{2}} - 2 \frac{\Cov{A, B}}{\E{A}\E{B}} + \frac{\Var{B}}{\E{B}^{2}} \right] . \label{eq:var-rv-ratio}
\end{align}
%
We will now resolve every component of \cref{eq:exp-rv-ratio,eq:var-rv-ratio}.
The expectations are straightforward:
%
\begin{align}
  \E{A} &= \E{\sumnodeD_{i}\ci} = \E{\sumnodeD_{i}}\ci \label{eq:exp-x}\\
  \E{B} &= \E{\sum_{j}\sumnodeD_{j}\cj} = \sum_{j}\E{\sumnodeD_{j}}\cj \, . \label{eq:exp-y}
\end{align}
%
For the variances we obtain:
%
\begin{align}
  \Var{A} &= \Var{\sumnodeD_i}\ci^{2} \label{eq:var-x}\\
  \Var{B} &= \sum_{j}\Var{\sumnodeD_j}\cj^{2} + \sum_{j_1 \neq j_2} \Cov{\sumnodeD_{j_{1}}, \sumnodeD_{j_{2}}}c_{j_{1}}c_{j_{2}} \, . \label{eq:var-y}
\end{align}
%
Following \cref{eq:sum-covariance}, the covariance term between a root node and the sum of all root nodes can be decomposed as follows:
%
\begin{align}
  \label{eq:cov-x-y}
  \Cov{A, B} &= \ci \sum_{j} \cj \Cov{\sumnodeD_{i},\sumnodeD_{j}} \quad ,
\end{align}
which in turn can be resolved with one of the methods provided in \cref{sec:tdi:derivation:covariance}.

While \cref{eq:exp-rv-ratio,eq:var-rv-ratio} are seemingly simple, their particular
formulation implies statistical independence between $A$ and $B$. Since $B$ is
a sum over all $A$, this independence naturally does not hold. Therefore, the
solution given here is only an approximation of the true second-order Taylor
approximation. In Appendix A.2.7 we extend the formulation of
\cite{ratio_seltman} and take into account the dependencies between root nodes $\sumnode_i$ and their sum $\sum_{i}\sumnode_i$. 

\subsubsection{Tractability}
\label{sec:tdi:derivation:tractability}
We re-emphasize that PCs are tractable probabilistic models where, in general,
inference is at most polynomial in the network size.
Specifically, in the PC family, SPNs perform a wide range of queries in linear time
in the network size.

Thanks to the compact representation of PCs, all formulations derived for TDI in
\cref{sec:tdi:derivation:expectation,sec:tdi:derivation:variance,sec:tdi:derivation:covariance,sec:tdi:derivation:leaf-nodes,sec:tdi:derivation:classification}
have polynomial space and time complexity, specifically, at most quadratic.
In fact, the expectation of a single sum node, given by \cref{eq:sum-expectation}, adds a negligible single
floating point multiplication (by $q$) as we can reuse the sum node output. The expectation
of a single product, in \cref{eq:prod-expectation}, does not require any additional operations. For the
computation of variance and covariance, the cost depends on the actual PC structure.
In sparse structures such as trees, the cost is linear with respect to the number of
input nodes, as the covariance term becomes zero.
However, for cases where all child input covariance combinations need to be computed, as in \cref{eq:sum-variance}, the cost can be
locally quadratic with respect to the number of sum node inputs (see also \cref{eq:sum-covariance}). Applying solution c) of
\cref{sec:tdi:derivation:covariance}, i.e.,
the implicit ``copy-paste'' augmentation of the DAG, reduces the cost of \cref{eq:sum-variance}
to linear instead of quadratic.
This renders a full bottom-up pass tractable,
which can be performed in parallel with the standard bottom-up
probabilistic inference procedure. We provide pseudocode for the bottom-up TDI in Appendix B.

\section{Experimental Evaluation}
\label{sec:experimental_evaluation}
To demonstrate that PCs generally suffer from overconfidence under various forms of
distribution shift, and to show the benefits of TDI in these circumstances,
we investigate the following three common experimental scenarios:

\begin{enumerate}
    \item \textbf{OOD datasets:} Following popular practice to assess whether a
    model can successfully distinguish known data from unknown OOD instances
    \citep{bradshaw_2017, nalisnick_2019}, we train the circuits on SVHN~\citep{svhn} and then additionally test on several popular color image datasets: CIFAR-100~\citep{cifar100}, CINIC~\citep{cinic}, and LSUN~\citep{lsun}.
    \item \textbf{Perturbations:} Inspired by recent works that investigate predictions in the context of increasingly perturbed data instances \citep{ovadia_2019, antoran_2020, daxberger_2021}, we evaluate our models when rotating MNIST \citep{lecun_1998} digits are introduced for inference.
    \item \textbf{Corrupted inputs:} In the spirit of recent works that
    demonstrate standard neural networks' inability to effectively handle
    naturally corrupted data \citep{hendrycks_2019, michaelis_2019}, we
    include a set of 15 different non-trivial corruptions to the SVHN dataset
    for inference with PCs and PCs + TDI. Each of these corruptions features five different levels of severity.
\end{enumerate}

\paragraph{Experimental Setup.}
For our experiments, we implemented TDI based on RAT-SPNs in PyTorch and \emph{SPFlow}~\citep{molina_2019}. We use $S=20, I=20, D=5, R=5$ for the RAT-SPN structure and train our models for 200 epochs with a mini-batch size of 200, a learning rate of 1e-3 with the Adam~\citep{kingma2015adam} optimizer, and a PC + TDI dropout value of 0.2 for MNIST and 0.1 for SVHN. 
A detailed description is provided in Appendix C and our code is available at \url{https://github.com/ml-research/tractable-dropout-inference}.

\subsection{PCs with TDI Detect OOD Data}
Following our outlined first scenario, we first train on the SVHN dataset. We then
evaluate the predictive entropy obtained on samples of the unseen test set and on instances that come from entirely different
distributions of other datasets, e.g., house numbers vs.\ different
scene images or object categories like cars and sofas.
To successfully avoid mispredictions on an unrelated unknown dataset,
the entropy of our model's predictions should be higher compared to the one obtained for
ID samples.
Although predictive entropy might not be the optimal choice
for uncertainty quantification \citep{hullermeier_2021}, we leave the exploration
of better measures, in particular for OOD detection, to future work and remark that
our main focus is on assessing whether the epistemic uncertainty estimate obtained with TDI
is meaningful or not.

\begin{table}
    \caption{PC + TDI (in bold) improves area under curve scores for \cref{fig:precision-ood} over PCs when measuring the OOD precision over all thresholds by more than 2$\times$ on every OOD dataset. TDI is competitive with MCD while being more efficient as it does not require the cost of performing multiple (here 100) stochastic forward passes.
    }
    \centering
    \begin{tabular}{r|rrr}
          AUC ($\uparrow$) & CIFAR & CINIC & LSUN \\
            \midrule
         PC & 29.3 & 29.9 & 30.3 \\
         PC + TDI & \textbf{64.6} & \textbf{66.1} & \textbf{81.8} \\
         PC + MCD & 68.5 & 70.0 & 84.9 \\
         \midrule
         MLP & 56.0 & 58.1 & 55.9 \\
         MLP + MCD & 80.8 & 95.9 & 80.5 \\
         LeNet & 17.1 & 15.6 & 2.6 \\
         LeNet + MCD & 43.4 & 92.9 & 29.5 \\
    \end{tabular}
    \label{tab:ood-auc-scores}
\end{table}

\begin{figure}
    \centering
    \input{./figs/diff_id_ood_acc.pgf}
    \caption{Difference between ID and OOD detection precision of PCs (dashed) and PCs + TDI (solid) for all OOD detection thresholds of \cref{fig:precision-ood}. PCs with TDI both outperform PCs in absolute terms and allow the model to adequately balance ID versus OOD data. In contrast, conventional PCs without TDI generally perform poorly and have their best point at an unintuitively low threshold.}
    \label{fig:precision-ood-diffs}
\end{figure}

Our introductory \cref{fig:precision-ood} has already shown that conventional PCs are bad at properly detecting OOD data while keeping a high precision on ID data, whereas PCs with TDI overcome this challenge. To also quantify the improvement introduced by TDI over all thresholds, we now show the area under the curve scores for \cref{fig:precision-ood} in \cref{tab:ood-auc-scores}, demonstrating that TDI improves all scenarios by more than two times.
For a comprehensive overview and as a point of reference, we also present results illustrating that MCD attains analogous improvements on both PCs and neural networks such as a Multilayer Perceptron (MLP) and an inherently overconfident LeNet. However, while MCD requires several stochastic forward passes (100 in this case), by exploiting PC semantics, TDI achieves similar performance with only a single forward pass.

In addition, we further highlight the precise tradeoff between the ID and OOD precision over all OOD decision thresholds in \cref{fig:precision-ood-diffs}. In other words, we quantify the precision with which a selected threshold on entropy correctly leads to rejection of unknown OOD data, while at the same time \emph{not} rejecting ID data in order to classify it correctly. Intuitively, the threshold should balance the latter two, as a very low threshold would simply reject all data, whereas a very high threshold would incorrectly accept any inputs. As visible, this is not the case for PCs, which have their largest margin between ID and OOD error at a very low OOD decision threshold, leading to a high ID error, e.g., 28.7\% ID error and 21.8\% LSUN OOD error at a threshold of 0.05 (cf.~\cref{fig:precision-ood}). On the contrary, TDI balances this shortcoming and allows for much higher OOD decision thresholds while keeping a lower ID error, e.g., 13.2\% ID error and 10.2\% LSUN OOD error at a reasonable mid-way threshold of 0.6 (cf.~\cref{fig:precision-ood}). A complementary view with predictive entropy and uncertainty in terms of the standard deviation as the square root of \cref{eq:var-rv-ratio} is provided in Appendix D.

\subsection{PCs with TDI are More Uncertain on Perturbed Samples}
\begin{figure}[t]
  \centering
  \input{./figs/rotations.pgf}
  \caption{Top panel: predictive entropy (left y-axis, solid) and accuracy
    (right y-axis, dashed) of a PC (blue circles) and PC + TDI (orange
    triangles) on a gradual distribution shift of increasing MNIST digit
    rotations from 0$^\circ$ to 90$^\circ$ (x-axis). TDI already captures the
    distribution shift at lower degrees of rotation and assigns a much larger
    predictive entropy to greater rotations than PCs, while retaining
    predictive accuracy. Bottom panel: complementary view of predictive
    uncertainty (standard deviation in \cref{eq:var-rv-ratio}) in PC + TDI.}
    \label{fig:rotating_mnist_digits}
\end{figure}
\begin{figure*}[t]
  \centering
  \input{./figs/corruptions.pgf}
  \caption{
  Predictive entropy (left y-axis) and accuracy (right y-axis) of a PC (blue
   circles) and PC + TDI (orange triangles) for increasingly corrupted SVHN data at five severity levels; here, altering brightness, introducing elastic transformation, simulating frost, and adding Gaussian noise. PCs with TDI detect the distribution shift by assigning higher predictive entropy with increasing severity, while at the same time being more robust in predictive accuracy against the corruption.}
    \label{fig:svhn_corruptions}
\end{figure*}
In addition to the abrupt distribution shift of the prior section, we now
inspect the behavior of PCs and PCs + TDI on a more gradual scale in the second
scenario. Here, we train on the original MNIST training set. At inference time, we evaluate the models' predictive entropy on rotated versions of the MNIST test set from 0$^\circ$ to 90$^\circ$ in steps of 5$^\circ$, to simulate a gradual increase in data perturbation. Once more, with the aim of an effective measure for the distribution shift, the model should assign higher predictive entropy and uncertainty with increasing data perturbation. We visualize this experiment in \cref{fig:rotating_mnist_digits}, demonstrating in the top panel that PCs with TDI (orange triangles) measure the data perturbation already at lower degrees of rotation and assign a higher entropy to larger rotations than PCs (blue circles). At the same time, TDI retains the same predictive accuracy. In the bottom panel of \cref{fig:rotating_mnist_digits}, we additionally highlight the measure of uncertainty as the standard deviation of \cref{eq:var-rv-ratio}, again confirming the expected increase of uncertainty with increasing data perturbation from an auxiliary viewpoint.


\subsection{PCs with TDI are More Robust to Data Corruptions}
As outlined in our third scenario, we investigate the case of natural and
synthetic data corruptions. We train on the SVHN training set and then evaluate the models on corrupted versions of the SVHN test set, with 15 different corruptions at five increasing levels of severity. Similar to the prior two scenarios, a successful detection entails that the models should be able to attribute a progressive increase in predictive entropy with increasing corruption severity. In \cref{fig:svhn_corruptions} we highlight the model's behavior on four such corruption types: brightness, elastic transformation, simulated frost, and Gaussian noise (see Appendix E for analogous evidence for all 15 corruption types). Matching the behavior of the perturbation scenario, in all shown corruption settings, a PC with TDI can associate an increase in corruption severity with higher predictive entropy. On top of that, TDI stays more robust at all severity levels of corruption than the PC by retaining higher predictive accuracy in the case of elastic transformation and Gaussian noise corruptions. This third scenario thus further verifies TDI's robustness against, and its ability to capture, distribution shift.


\subsection{Discussion}
Our empirical evidence across all three scenarios indicates that PC + TDI is in
fact more robust and provides model uncertainty estimates that allow detecting
data from various unknown and shifted distributions. TDI lets PCs \emph{``know what they don't know''}. Beyond this desideratum, the sampling-free uncertainty of TDI entails several advantages over MCD in neural networks, opening up various additional prospects.

\textbf{Prospects:}
On the one hand, TDI alleviates the computational burden of MCD, getting rid of the compromise between estimation quality and number of forward passes.
This can be particularly beneficial to other circuits such as regression or logistic circuits~\citep{liang2019logistic-circuits} tailored to predictive tasks.
Moreover, the tractable computation in a single forward pass in turn paves the way for uncertainty estimates to be directly involved in training processes. Such a signal does not only help with robustness but can also improve
active or continual learning,
in which PCs are largely yet to be explored. On the other hand, the clear semantics of PCs allow for the prospective inclusion of prior knowledge about uncertainty through the explicit (co-)variance terms at leaf nodes (recall \cref{sec:tdi:derivation:leaf-nodes}), which is unavailable to neural networks.

\textbf{Limitations:}
Whereas TDI has removed the computational burden of MCD, the necessity to select a dropout chance $p$, as a hyperparameter, remains untouched. In our various experiments, and prior works in neural networks, a common low value seems to suffice, but it is an additional consideration to be taken into account for training and inference. On the empirical side, further investigation of TDI should be extended to arbitrary structures,
involving the propagation of the covariance as introduced in~\cref{sec:tdi:derivation:covariance}. In a similar spirit, although we have already experimentally investigated three distinct scenarios, the empirical performance of TDI for other density estimation tasks remains to be explored.

\section{Conclusion}
In the spirit of recent works for neural networks, we have highlighted that the
generative model family of PCs suffers from overconfidence and is thus unable to
effectively separate ID from OOD data. As a remedy to this challenge, we have
drawn inspiration from the well-known MCD and introduced a novel probabilistic
inference method capable of providing tractable uncertainty estimates: tractable
dropout inference.
We obtain such sampling-free, single computation pass estimates by deriving a closed-form solution through variance propagation. Our empirical evidence confirms that TDI provides improved robustness and comes with the ability to detect distribution changes in three key scenarios: dataset change, data perturbation, and data corruption. The computationally cheap nature and potential to include prior knowledge in TDI pave the way for various future work, such as including uncertainty in training.


\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work was supported by the Federal Ministry of Education and Research (BMBF) Competence Center for AI and Labour (``kompAKI'', FKZ 02L19C150) and the project ``safeFBDC - Financial Big
Data Cluster'' (FKZ: 01MK21002K), funded by the German Federal Ministry for
Economic Affairs and Energy as part of the GAIA-x initiative. It benefited from the Hessian
Ministry of Higher Education, Research, Science and the Arts (HMWK; projects
``The Third Wave of AI'' and ``The Adaptive Mind''), and the Hessian research
priority programme LOEWE within the project ``WhiteBox''.
\end{acknowledgements}


% References
\bibliography{ventola_118.bib}

\end{document}
