% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                        % version; also before submission to
                        % see how the non-anonymous paper
                        % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                             % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                              % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{mathtools}
\usepackage{xcolor}
\usepackage{float}
\usepackage[labelformat=simple]{subcaption}
\usepackage[font={small}]{caption}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage{centernot}
\usepackage{makecell}
%\usepackage{enumitem}
\usepackage[makeroom]{cancel}
\usepackage{yhmath}
\usepackage{import}
\usepackage{multirow}
\usepackage{bm}
\usepackage{wrapfig}

\usepackage{tikz}
\tikzset{state/.style = {shape=circle,draw,thick,minimum size=3.0em}}
\tikzset{dstate/.style = {shape=circle,draw,thick,double,minimum size=3.0em}}
\tikzset{point/.style = {circle, draw, thick, inner sep=0.05cm,fill,node contents={}}}
%\tikzset{state/.style = {shape=circle,draw,minimum size=3.0em}}
%\tikzset{dstate/.style = {shape=circle,draw,double,minimum size=3.0em}}
%\tikzset{point/.style = {circle, draw, inner sep=0.05cm,fill,node contents={}}}
\usetikzlibrary{shapes,decorations,calc,fit,positioning}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b}: typesets "b sep a", i.e., the two mandatory arguments in
% reversed order around the optional separator (default "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% makecell settings: \thead cells use alignment "bc" and a bold font, and both
% \thead and regular makecell cells get 4pt of extra vertical room via \Gape.
\renewcommand\theadalign{bc}
\renewcommand\theadfont{\bfseries}
\renewcommand\theadgape{\Gape[4pt]}
\renewcommand\cellgape{\Gape[4pt]}
% Compound operators "+=" and "-=", each spaced as a single relation.
\newcommand{\pluseq}{\mathrel{{+}{=}}}
\newcommand{\minuseq}{\mathrel{{-}{=}}}

% Subfigure counter is printed as a parenthesized lowercase letter, e.g., "(a)".
\renewcommand\thesubfigure{(\alph{subfigure})}

% Theorem-like environments. No shared-counter argument is given, so each
% environment is numbered by its own independent counter.
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{heuristic}{Heuristic}
\newtheorem{proposition}{Proposition}
\newtheorem{axiom}{Axiom}
\newtheorem{notation}{Notation}
\newtheorem{example}{Example}
\newtheorem{conjecture}{Conjecture}
\newtheorem{assumption}{Assumption}

% Author annotations: typeset the argument in a per-author color.
\newcommand{\yizuo}[1]{{\color{blue}#1}}% comments by Yizuo
\newcommand{\adnan}[1]{{\color{red}#1}}% comments by Adnan
% \commented and \shrink swallow their argument: the enclosed text stays in
% the source but produces no output.
\newcommand{\commented}[1]{}
\newcommand{\shrink}[1]{}

% \Sum: a summation sign whose superscript is a small "=" lowered by 5pt.
\newcommand\Sum{\sum^{\raisebox{-5pt}{$\scriptscriptstyle =$}}}
%\newcommand\Sum{\overset{=}{\sum}}
% \eql{a}{b}: "a=b" with the space around "=" tightened by negative thin spaces.
\newcommand\eql[2]{{#1\!=\!#2}}
% \e: a bold "e".
\newcommand\e{{\bf e}}

% \Pa{X}: typewriter "Pa" applied to X, i.e., Pa(X).
\newcommand{\Pa}[1]{{{\tt Pa}(#1)}}
% \fPa{X}{G}: typewriter "fPar" with subscript G applied to X (note that the
% SECOND argument is the subscript and the FIRST is the function argument).
\newcommand{\fPa}[2]{{{{\tt fPar}_{#2}}(#1)}}
% \CUP: a union symbol with a dot above it, padded by medium spaces.
\newcommand\CUP{{\:\dot{\cup}\:}}

% \bset{X} / \aset{X}: X with a \smile / \frown symbol stacked above it.
\newcommand{\bset}[1]{\overset{\smile}{#1}}
\newcommand{\aset}[1]{\overset{\frown}{#1}}

% Calligraphic letter shorthands. \JT and \TT expand to the same
% calligraphic T.
% NOTE(review): \AA and \SS are also LaTeX kernel text commands; \def
% replaces them without any warning.
\def\FF{{\cal F}}
\def\GG{{\cal G}}
\def\HH{{\cal H}}
\def\BB{{\cal B}}
\def\PP{{\cal P}}
\def\EE{{\cal E}}
\def\KK{{\cal K}}
\def\SS{{\cal S}}
\def\MM{{\cal M}}
\def\JT{{\cal {T}}}
\def\AA{{\cal A}}
\def\CC{{\cal C}}
\def\TT{{\cal T}}
\def\NN{{\cal N}}
% \Sij: \S with subscript "ij" and a star superscript. \S is expanded at use
% time, so it resolves to whatever \S means then (this file later redefines
% \S as a bold S).
\def\Sij{{\S_{ij}^{\star}}}

% Algorithm names set in small caps.
\newcommand\VE{{\sc VE}}
\newcommand\VEC{{\sc VEC}}

% Bold letter shorthands: uppercase for sets of variables, lowercase for
% their instantiations.
% NOTE(review): \P, \S, \c, \u and \v are standard LaTeX commands (paragraph
% and section signs; cedilla, breve and caron accents). These \def lines
% silently replace them for the whole document, so any text relying on the
% originals (e.g., accented names in the bibliography) would be affected;
% verify.
\def\X{{\mathbf X}}
\def\Y{{\mathbf Y}}
\def\Z{{\mathbf Z}}
\def\U{{\mathbf U}}
\def\W{{\mathbf W}}
\def\V{{\mathbf V}}
\def\P{{\mathbf P}}
\def\S{{\mathbf S}}
\def\C{{\mathbf C}}
\def\R{{\mathbf R}}
\def\x{{\mathbf x}}
\def\y{{\mathbf y}}
\def\z{{\mathbf z}}
\def\w{{\mathbf w}}
\def\u{{\mathbf u}}
\def\v{{\mathbf v}}
\def\p{{\mathbf p}}
\def\s{{\mathbf s}}
\def\c{{\mathbf c}}
% Calligraphic Q.
\def\QQ{{\mathcal Q}}
% Bold keywords "and" / "or", each followed by a built-in trailing space.
\def\And{{\textbf{and} }}
\def\Or{{\textbf{or} }}

% Bold T.
\newcommand\T{{\mathbf T}} 

% \indep: two overlapping \perp symbols; \dep: the same symbol negated
% with \not.
\newcommand{\indep}{\!\perp\!\!\!\perp}
\newcommand{\dep}{\not\!\perp\!\!\!\perp}
% Aliases for entailment / derivation symbols and their negations.
\newcommand{\entail}{\vDash}
\newcommand{\nentail}{\nvDash}
\newcommand{\derive}{\vdash}
\newcommand{\nderive}{\nvdash}
% \map: a right arrow.
\newcommand{\map}{\rightarrow}
% \proj: a summation sign with "=" stacked above it.
\newcommand{\proj}{\overset{=}{\sum}}

% Named symbols typeset in typewriter font, unless noted otherwise below.
\def\true{{\tt True}}
\def\false{{\tt False}}
% \func prints "mech" (the macro name and the printed text differ).
\def\func{{\tt mech}}
\def\fcon{{\tt fcon}}
\def\embed{{\tt embed}}
\def\sep{{\tt sep}}
% \mes: calligraphic M (the same glyph that \MM produces).
\def\mes{{\cal M}}
\def\vars{{\tt vars}}
\def\cls{{\tt cls}}
\def\upath{{\tt upath}}
\def\desc{{\tt Desc}}
\def\Anc{{\tt Anc}}
\def\var{{\tt var}}
\def\comp{{\tt COMPOSE}}
% \avars and \bvars carry no font switch: they print in the surrounding font.
\def\avars{{avars}}
\def\bvars{{bvars}}
% Text snippets; \ecai and \heu end with a built-in trailing space.
\def\ecai{{top-down thinning strategy }}
\def\heu{{heuristic thinning strategy }}
\def\con{X-connect}
\def\fvars{{\tt fvars}}

% Numeric constants.
% NOTE(review): presumably scale factors for figures; their use is not
% visible in this part of the file.
\newcommand\rthinning{0.5}
\newcommand\rtree{0.6}


% Aliases for the sectioning commands, so the heading level used throughout
% the paper can be changed in one place.
\newcommand\mysection{\section}
\newcommand\mysubsection{\subsection}

\title{On the Definition and Computation of Causal Treewidth}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{\href{mailto:<yizuo.chen@ucla.edu>?Subject=Your UAI 2022 paper}{Yizuo Chen}}
\author{\href{mailto:<darwiche@cs.ucla.edu>?Subject=Your UAI 2022 paper}{Adnan Darwiche}}
%\author[1,2]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil{%
Computer Science Department\\
University of California\\
Los Angeles, USA
}
%\affil[2]{%
%Second Affiliation\\
%Address\\
%…
%}
%\affil[3]{%
%Another Affiliation\\
%Address\\
%…
%}


\begin{document}
\maketitle

\begin{abstract}
Causal treewidth is a recently introduced notion allowing one to speed up Bayesian network inference and to bound its complexity 
in the presence of functional dependencies (causal mechanisms) whose identities are unknown. 
Causal treewidth is no greater than treewidth and can be bounded even when treewidth is unbounded. 
The utility of causal treewidth has been illustrated recently in the context of causal inference and 
model-based supervised learning. However, the current definition of causal treewidth is descriptive rather
than prescriptive, therefore limiting its full exploitation in a practical setting. We provide an extensive
study of causal treewidth in this paper which moves us closer to realizing the full computational potential of this notion both
theoretically and practically.
\end{abstract}

\mysection{Introduction}
\label{sec:intro}

Treewidth is one of the most influential notions for parameterizing the complexity of probabilistic
inference. This notion originated in the graph theory literature and can be viewed as a measure
of graph connectivity~\citep{jal/RobertsonS86}. It has also been used to parameterize the complexity of many
algorithmic tasks that transcend probabilistic inference; see, e.g.,~\citep{wg/Bodlaender06,daglib/0016622}.
For Bayesian networks, the time and space complexity of computing marginals is bounded
by \(O(n \cdot \exp(w))\) where \(n\) is the number of nodes in the network and \(w\) is its
treewidth. For example, tree-structured networks have a treewidth~\(\leq 1\) so treewidth 
allows us to show that inference on such networks can be done in linear time and space.

Treewidth captures the structural aspects of a model and is independent of its parameters.
Hence, one can use treewidth to provide guarantees on the complexity
of inference without needing to know the model parameters. In the first few decades of
research on Bayesian network inference, the perception was that high treewidth is a 
barrier since all influential algorithms at that time, particularly the jointree and variable
elimination algorithms~\citep{JT_ALG,zhangJAIR96a,dechterUAI96}, had a complexity which was 
also lower bounded exponentially by treewidth. Later developments showed that exploiting the parametric structure
of Bayesian networks can lead to tractable inference in some situations where the
treewidth can be very high; see,~e.g.,~\citep{LarkinD03,ijcai/ChaviraD05,ijar/ChaviraDJ06,ai/ChaviraD08}. 
The parametric structure exploited was particularly in the form of 
context-specific independence~\citep{uai/BoutilierFGK96} and logical constraints (i.e., parameters in $\{0,1\}$).\footnote{Among the
most effective approaches for exploiting parametric structure are the ones based on
compiling Bayesian networks into tractable circuits~\citep{DarwicheJACM03}. These approaches allow one to conduct inference
in time linear in the circuit size while yielding circuits whose size is not necessarily
exponential in treewidth---see~\citep{NeSyChapter} for a recent survey on circuit representations
and~\citep{ijcai/AgrawalPM21} for a recent empirical evaluation in which methods based on circuits ranked at the forefront in terms of efficiency.}

More recently, a new and more abstract type of parametric structure has been identified and exploited computationally:
functional dependencies, also known as causal mechanisms, whose identities are {\em unknown}~\citep{DarwicheECAI20b}. 
In a Bayesian network, a node is functionally determined by its parents if fixing the state of these parents also fixes the 
state of the node (that is, the node distribution is deterministic given any state of its parents). We often know that a node 
is functionally determined by its parents but without knowing the identity of the underlying function. This is prominent, for example,
in causal inference where one typically has a causal graph in which every internal node is assumed to be
functionally determined by its parents yet without knowing the specific functions that relate nodes to 
their parents~\citep{pearl00b}.
Classical techniques for exploiting parametric structure
are not applicable in this case since these methods require knowledge of the specific model parameters which imply
knowledge of the specific functions that determine the values of internal nodes. 
This also arises when learning the parameters of a Bayesian network from data where we may have background
knowledge to the effect that some nodes are functionally determined by their parents but without
knowing the specific functions as we are trying to learn them; see, e.g.,~\citep{pgm/ChenCD20}. 
Interestingly enough, a recent finding showed that one can 
exploit unknown causal mechanisms computationally, leading to potentially exponential reduction in complexity~\citep{DarwicheECAI20b}. 
This finding was based on two new theorems and cast in the context of model-based supervised learning.
It particularly took the 
form of an algorithm that compiles the {\em structure} of a Bayesian network into a tractable circuit whose size is 
not necessarily exponential in treewidth. This approach managed to efficiently compile circuits for networks 
with treewidth over \(100\) without needing to know the network parameters, only that some nodes are
functionally determined by their parents. More recently, this finding was cast in the context of causal 
inference while hinting that it can lead to a new parameter
for bounding complexity that was called {\em causal treewidth}~\citep{causalityAC}.

Treewidth is classically defined for an undirected graph but it can be extended to directed acyclic graphs (DAGs)
by computing the treewidth of the {\em moralized} DAG. This is an undirected graph obtained from the DAG 
by connecting every pair of parents by an edge and then removing the directionality of edges; see, e.g.,~\citep[Ch~9]{Darwiche09}. 
Causal treewidth applies only to DAGs in which some nodes are declared as being {\em functional.} 
If no nodes are functional, then the causal treewidth reduces to treewidth. 
While~\citep{causalityAC} suggested this more refined notion of causal treewidth, it did not provide
an operational definition of causal treewidth and therefore it did not specify a method for computing it.
Moreover, while~\citep{DarwicheECAI20b} showed that  inference can be sped up,
exponentially in some cases, by exploiting unknown causal mechanisms, it did not 
fully exploit the two new theorems that enabled these techniques.

Our goal in this paper is to first review the two key theorems in~\citep{DarwicheECAI20b}  that enabled the
computational exploitation of unknown causal mechanisms, and to then use them as a basis for formally
defining the notion of causal treewidth and how it can be computed. In the process of doing so,
we will prove some results about the algorithmic techniques proposed in~\citep{DarwicheECAI20b},
showing that some are optimal while others are not. In other words, we will show that the algorithmic 
techniques proposed in~\citep{DarwicheECAI20b} do not fully exploit the two enabling theorems identified in that work.
Hence, the main contribution of this work is that it brings us closer, both theoretically and practically,
towards the full exploitation of unknown causal mechanisms during inference. 
At a more cognitive level, our contribution may provide further hints as to why causal knowledge is so central to 
human reasoning~\citep{pearl18} as we provide a formal account of how causal knowledge, even in this 
abstract form, can be quite useful computationally.

We start next with some further motivation, technical preliminaries 
and a review of the key results in~\citep{DarwicheECAI20b}. We then study two key ingredients
which are needed to formally define causal treewidth: jointree thinning and mechanism replication.
We finally define causal treewidth and present some experimental results that shed more light on
this notion and its underlying ingredients. Proofs of all results can be found in the appendix. 

\mysection{Motivation and Preliminaries}
\label{sec:prelim}

\begin{figure}[tb]
\centering
\begin{minipage}[c]{.45\linewidth}
\scriptsize
\begin{tabular}{|c|c|c|c|}
\hline
$A$ & $B$ & $C$ & $f_C(ABC)$ \\
\hline
t & t & t & 0.7 \\
t & t & f & 0.3 \\
t & f & t & 0.1 \\
t & f & f & 0.9 \\
f & t & t & 0.4 \\
f & t & f & 0.6 \\
f & f & t & 0.5 \\
f & f & f & 0.5 \\
\hline
\end{tabular}
\caption*{(a) CPT for \(C\) \label{tab:cpt-not-mech}}
\label{tab:cpts1}
\end{minipage} %
\quad\quad
\begin{minipage}[c]{.45\linewidth}
\scriptsize
\begin{tabular}{|c|c|c|c|}
\hline
$A$ & $B$ & $C$ & $f_C(ABC)$ \\
\hline
t & t & t & 0 \\
t & t & f & 1 \\
t & f & t & 1 \\
t & f & f & 0 \\
f & t & t & 1 \\
f & t & f & 0 \\
f & f & t & 0 \\
f & f & f & 1 \\
\hline
\end{tabular}
\caption*{(b) Mechanism for \(C\) \label{tab:cpt-is-mech}}
\label{tab:cpts2}
\end{minipage}
\caption{Two CPTs for variable \(C\) with parents \(A,B\).
The second CPT represents a mechanism for variable \(C\).}
\label{tab:cpts}
\end{figure}

Variables are discrete and denoted by uppercase letters (e.g., \(X\)) and their values are denoted by lowercase letters (e.g., \(x\)). 
Sets of variables are denoted by boldface, uppercase letters (e.g., \(\X\)) and their
instantiations are denoted by boldface, lowercase letters (e.g., \(\x\)). 
A {\em factor} \(f(\X)\) is a mapping from instantiations \(\x\) to non-negative numbers.
%For a binary variable \(X\), we will use \(x\) and \(\n x\) to denote \(\eql(X,1)\) and \(\eql(X,0)\), respectively.
A Bayesian network is a DAG \(G\) together with one conditional probability table (CPT) for
each node \(X\) and its parents \(\P\) in the DAG. A CPT specifies a conditional distribution \(\Pr(X|\P)\)
and will be represented by a factor \(f(X\P)\) where \(f(x\p) = \Pr(x|\p)\) (hence, \(\sum_x f(x\p) = 1\)).
To indicate that factor \(f(X\P)\) is a CPT for variable \(X\), we will usually notate it as \(f(X,\P)\) or \(f_X(X\P)\).
Of particular interest are CPTs (factors) that specify functions, also referred to as mechanisms. 

\begin{definition}
\label{def:fcpt}
A factor \(f(X,\P)\) is a \underline{mechanism} for  $X$ (or \(X\)-mechanism)
iff \(f(x,\p) \in \{0,1\}\) and \(\sum_x f(x,\p)=1\). 
\end{definition}
A mechanism for variable \(X\) represents a function whose inputs are parents \(\P\) and whose output is \(X\).
Figure~\ref{tab:cpts} depicts two factors over binary variables $\{A,B,C\}$. 
The factor in Figure~\ref{tab:cpts}a is a CPT for variable $C$ but is not a mechanism.
The one in Figure~\ref{tab:cpts}b is also a CPT for \(C\) but is a mechanism which corresponds to the function $C = A \oplus B$.
	
\begin{figure}[tb]
\begin{equation*}
\vcenter{\hbox{\import{./tikz_figs/}{fig15.tex}}}
\qquad\qquad
\begin{aligned}
A &= \bar{U}_1
\\
B &= U_1 \cdot A
\\
C &= U_2 \oplus A 
\end{aligned}
\end{equation*}
\caption{SCM with endogenous variables $A,B,C$ and exogenous variables \(U_1, U_2\).
All variables are binary.
The mechanisms for endogenous  variables are specified by structural equations.}
\label{fig:scm}
\end{figure}

The use of mechanisms is ubiquitous in causality~\citep{pearl00b}. 
In this context, root nodes in the DAG are called {\em exogenous} and internal nodes are called {\em endogenous.} 
A common class of models known as functional Bayesian networks or Structural Causal Models (SCMs)
assume that the CPTs of all endogenous variables are mechanisms.
Figure~\ref{fig:scm} depicts an example SCM where the mechanisms for endogenous variables (\(A,B,C\)) are specified using
structural equations as is commonly done. For this model to be complete, one also needs the CPTs for exogenous variables (\(U_1,U_2\))
which specify the distributions \(\Pr(U_1)\) and \(\Pr(U_2)\), the only source of uncertainty in the model.

A classical setup in causal inference is to only have the graph of an SCM while assuming that the mechanisms (structural equations)
are not known. In Figure~\ref{fig:scm}, this would amount to assuming that each endogenous variable (\(A,B,C\)) is a function of its parents,
yet without knowing what these functions are. For example, we may not know whether the function for variable \(C\) is \(C = U_2 \oplus A\) or
\(C = U_2 + \bar{A}\) or \(C = U_2 \cdot A\) or something else (there are \(16\) possible mechanisms for a binary variable with two binary parents).
This situation may also arise in non-causality contexts where the assumption of unknown mechanisms can be viewed as
background knowledge; see, e.g.,~\citep{pgm/ChenCD20}.

In these situations, one typically has data in addition to the graph of a Bayesian network and the goal is to perform inference
based on this available information; for example, by first estimating model parameters as suggested in~\citep{nips/Zaffalon,causalityAC,pgm/ChenCD20}.
This requires inference algorithms whose complexity is independent of the model parameters.
Until relatively recently, the best complexity one could attain in this case is exponential in the graph treewidth.
This complexity has been improved exponentially though due to the results in~\citep{DarwicheECAI20b}
and one goal of our work is to improve and further formalize these recent advances using the notion of causal treewidth.\footnote{A reviewer
suggested using the term {\em functional treewidth} instead of {\em causal treewidth.} Our choice for the latter term
is motivated by the emphasis we wish to place on exploiting ``unknown'' functions which are prevalent in causal inference,
in contrast to the more common and informed exploitation of ``known'' functions.}


\mysection{Computing Marginals}

There are two operations on factors, multiplication and sum-out, which allow us to define the computational problem whose complexity we wish 
to bound using causal treewidth.  
The {\em product} of factors \(f(\X)\) and \(g(\Y)\) is another factor \(h(\Z)\),
where \(\Z = \X \cup \Y\) and \(h(\z) = f(\x)g(\y)\) for the unique instantiations \(\x\) and \(\y\) that are compatible with instantiation~\(\z\). 
{\em Summing-out} variables \(\Y \subseteq \X\) from factor \(f(\X)\) yields another factor \(g(\Z)\),
where \(\Z= \X \setminus \Y\) and \(g(\z) = \sum_\y f(\y\z)\). 
We will use \(\sum_\Y f\) to denote the resulting factor~\(g\). 
We will also use \(\Sum_\Z f\) to denote summing out all variables from factor \(f\) except for variables \(\Z\).
That is, for a factor \(f(\X)\), we will write  \(\Sum_\Z f\) to mean \(\sum_\Y f\) where \(\Y = \X \setminus \Z\). 

\begin{wrapfigure}[5]{r}{0.06\textwidth}
\centering
\vspace{-6mm}
\hspace{-5mm}\includegraphics[width=0.08\textwidth]{figs/BN.png}
\caption*{}
\label{}
\end{wrapfigure}

The {\em joint distribution} of a Bayesian network is the product of its CPTs.
The network on the right has CPTs \(f_A(A)\), \(f_B(AB)\), \(f_C(AC)\), \(f_D(BCD)\) and \(f_E(CE)\).
Its joint distribution is \(\Pr(ABCDE) = f_A f_B f_C f_D f_E\).
We can now compute the marginal over any variables by summing out all other variables from the joint distribution.
For example, the marginal over variable \(D\) is the factor 
\(\Pr(D) = \sum_{ABCE} f_A f_B f_C f_D f_E = \Sum_{D} f_A f_B f_C f_D f_E \).
It is this computation of marginals that we will be bounding using causal treewidth.
We are particularly interested in computing marginals over {\em families,} where
a family is a variable and its parents, since these marginals form the basis of parameter 
estimation using algorithms such as gradient descent and EM; see, e.g.,~\cite[Ch 17]{Darwiche09}.

\begin{definition}\label{def:marginals}
Consider a DAG \(G\) with nodes \(X_1, \ldots, X_n\) and let \(\P_i\) be the parents of \(X_i\).
Given a set of factors \(f(X_i\P_i)\) for \(i=1,\ldots,n\), the \underline{marginals problem}
is to compute the factor \(\Sum_{\mathbf F} \prod_{i=1}^n  f(X_i\P_i)\) for each family \(\mathbf F\).
\shrink{
\footnote{To
	compute posterior marginals for evidence \(\e: \{\eql {X_j} {x_j^\star}\}_j\) we
	add an auxiliary child \(C_j\) for each variable \(X_j\) with the same values as \(X_j\) 
	and factor \(f(C_j,X_j)\) where \(f(c_j,x_j) = 1\) when \(c_j = x_j = x_j^\star\) and \(f(c_j,x_j) = 0\) otherwise. 
	This yields joint marginals \(\Pr(X_i,\e)\) which give posteriors when normalized.}
%Adding these auxiliary children does not change the causal treewidth of the DAG.}
}
\end{definition}
A factor \(f(X_i\P_i)\) will be called a {\em family factor.}  The marginals problem
does not place any restrictions on family factors so it is quite general. When these factors are CPTs,
the marginals problem corresponds to the computation of marginals in a Bayesian network.

As mentioned earlier, if the Bayesian network has \(n\) nodes and treewidth \(w\),
marginals can be computed in \(O(n \cdot \exp(w))\) time and space. The simplest proof
of this result is based on the algorithm of variable elimination (\VE) which applies more
generally to the problem in Definition~\ref{def:marginals}~\citep{zhangJAIR96a,dechterUAI96}.
\VE\  is based on two theorems. The first allows us to sum out variables in any order.

\begin{theorem}\label{thm:ve0}
\(\sum_{\X\Y} f = \sum_\X \sum_\Y f = \sum_\Y \sum_\X f\).
\end{theorem}

The second theorem allows us to pull out factors from sums.
\begin{theorem}\label{thm:ve1}
If variables \(\X\) appear in factor \(f\) but not in factor \(g\), then \(\sum_\X f \cdot g = g \sum_\X f\).
\end{theorem}

Consider the factor \(\sum_{ABDE} f(ACE) g(BCD)\). A direct computation of this factor
multiplies factors \(f\) and \(g\) to yield the factor \(h(ABCDE)\) and then sums out variables \(ABDE\) from \(h\). 
Using Theorem~\ref{thm:ve0}, we can arrange the above sum into \(\sum_{AE} \sum_{BD} f(ACE) g(BCD)\).
Using Theorem~\ref{thm:ve1}, we can arrange it further into \(\sum_{AE} f(ACE) \sum_{BD} g(BCD)\).
This is more efficient to compute as the largest factor constructed in the process will be over
\(3\) instead of \(5\) variables.

Suppose we eliminate variables according to order \(\pi\) when computing a marginal and
let \(w+1\) be the largest number of variables appearing in a factor constructed in the process. 
The time and space complexity of \VE\ can then be bounded by \(O(n \cdot \exp(w))\) 
where \(n\) is the number of variables. The number \(w\) is called the {\em width} of order \(\pi\).
If the DAG has treewidth \(w\) then there must exist an elimination order of width \(w\). Moreover,
no elimination order can have a width less than \(w\); see~\cite[Ch~6~\&~9]{Darwiche09} for 
a detailed exposition of these concepts and results. 

\mysection{Exploiting Unknown Mechanisms}

Two new theorems were added to \VE\ by~\cite{DarwicheECAI20b} which enabled the exploitation of unknown
causal mechanisms. In the following three results, 
we will use \(\FF\), \(\GG\), \(\HH\) to denote sets of factors, where each set is interpreted 
as a product of its factors. For example, the set of factors \(\FF\) will be interpreted as the
factor \(\prod_{f \in \FF} f\).

\begin{theorem}[\citep{DarwicheECAI20b}]\label{thm:ve2}
Let \(f\) be a mechanism for variable \(X\). If  \(f \in \GG\) and \(f \in \HH\), then
\(\GG \cdot \HH = \GG \sum_X \HH\). 
\end{theorem}
According to this result, if a mechanism for \(X\) appears in both parts of a product, then
variable \(X\) can be summed out from one part without changing the value of the product.

\begin{corollary}[\citep{DarwicheECAI20b}]\label{coro:ve2}
If \(f\) is a mechanism for  \(X\), \(f \in \GG\) and \(f \in \HH\), then
\(\sum_X \GG \cdot \HH = \left(\sum_X \GG\right) \left(\sum_X \HH\right)\). 
\end{corollary}
That is, if a mechanism for \(X\) appears in both parts of a product,
we can sum out variable \(X\) from the product by independently summing it out from each part.
%This is a remarkable addition to the algorithm of variable elimination which has been under study for a few decades now.
Corollary~\ref{coro:ve2} may appear unusable as it is predicated on
multiple occurrences of a mechanism whereas the factors of a Bayesian network
contain a single mechanism for each variable. This is where the second theorem comes in:
{\em replicating} (i.e., duplicating) mechanisms in a product does not change the product value. 

\begin{theorem}[\citep{DarwicheECAI20b}]\label{thm:ve3}
For mechanism \(f\), if \(f \in \GG\), then  \(f \cdot \GG=\GG\).
\end{theorem}

Consider the factor \(\alpha = \sum_X f(XY) g(XZ) h(XW)\). \VE\ has to multiply factors \(f\), \(g\) and \(h\) 
before summing out variable \(X\), therefore constructing a factor over four variables \(XYZW\).
However, if factor \(f\) is a mechanism for variable \(X\), then we can replicate it by Theorem~\ref{thm:ve3}:
\(\alpha = \sum_X f(XY) g(XZ) f(XY) h(XW)\). Corollary~\ref{coro:ve2} then gives
\(\alpha = \left(\sum_X f(XY) g(XZ)\right) \left(\sum_X f(XY) h(XW)\right)\). Hence, we can now compute factor \(\alpha\)
without having to construct any factor over more than three variables. Moreover, we were able to do
this without needing to know the function represented by factor \(f(XY)\): we only needed to
know that this factor represents a function from \(Y\) to \(X\).
As shown in~\citep{DarwicheECAI20b},
this technique can lead to exponential savings that are attained without needing to know the identity of mechanisms
which is a major departure from earlier techniques.

\begin{algorithm}[tb]
\small
\caption{Complete Replication}
\label{alg:replicate}
\alglanguage{pseudocode}
\begin{algorithmic}[1]
\Procedure{replicate}{DAG $G$, Functional nodes $\Gamma$ in $G$}
\State \(\Sigma \gets\) multi-set of family factors of \(G\)
\For{each node \(X\) in \(\Gamma\) (bottom-up traversal)}
\If{\(X\) is a leaf} continue
\EndIf
\State \(n \gets \) number of \(X\)-feeding factors in \(\Sigma\)
\State \(\Sigma \gets \Sigma \cup \{\mbox{$n-1$ copies of the family factor for $X$}\}\)
\EndFor
\State \Return \(\Sigma\)
\EndProcedure
\end{algorithmic}
\end{algorithm}

As the above example shows, the exploitation of unknown mechanisms requires their
replication (duplication). A specific replication strategy was 
mentioned briefly and informally in~\citep{DarwicheECAI20b} and referred to as a ``heuristic.'' 
We shall call it the {\em complete replication strategy}  for a reason that will become apparent later.
This strategy is described formally in Algorithm~\ref{alg:replicate} and uses the following definition.

\begin{definition} \label{def:tail}
A family factor $f(X,\P)$ is said to be \underline{\(Y\)-feeding} iff \(Y \in \P\).
\end{definition}

Algorithm~\ref{alg:replicate} works with a {\em multi-set} of factors \(\Sigma\) instead of a {\em set} since
\(\Sigma\) may contain multiple copies of the same factor. It starts with \(\Sigma\) 
containing all family factors and traverses the DAG \(G\) bottom up. 
When visiting a functional node \(X\), it adds replicas of the mechanism for \(X\) to \(\Sigma\). 
Algorithm~\ref{alg:replicate} returns what is called a {\em replication} of family factors. 

\begin{definition}
A \underline{replication} of factors \(\FF\) is a multi-set \(\FF' \supseteq \FF\) obtained 
by replicating some of the mechanisms in \(\FF\).
\end{definition}

Consider the DAG in Figure~\ref{fig:dag-rep} where nodes \(B\) and \(C\) are functional.
Calling Algorithm~\ref{alg:replicate} on this DAG and these functional nodes returns the 
following replication
\(f_A(A)\), \(f_B(AB)\),  \(f_B(AB)\),  \(f_B(AB)\), \(f_C(BC)\), \(f_C(BC)\), \(f_D(BCD)\), \(f_E(CE)\),
which contains three replicas of the mechanism for \(B\) and two replicas of the 
mechanism for \(C\).

Even though a replication is technically a multi-set, we will simply refer to it as a set for convenience. 
We will study (complete) mechanism replication extensively later. 

\begin{figure*}[tb]
\centering
\begin{subfigure}[b]{0.12\textwidth}
         \centering
\import{./tikz_figs/}{fig16.tex}
         \caption{DAG}
         \label{fig:dag-rep}
     \end{subfigure}
     \hfill
\begin{subfigure}[b]{0.21\textwidth}
         \centering
\includegraphics[width=\textwidth]{figs/JT-s1.png}
         \caption{jointree for DAG factors}
         \label{fig:JT-s1}
     \end{subfigure}
     \hfill
\begin{subfigure}[b]{0.21\textwidth}
         \centering
\includegraphics[width=\textwidth]{figs/JT-s2.png} 
         \caption{separators and clusters}
         \label{fig:JT-s2}
     \end{subfigure}
     \hfill
\begin{subfigure}[b]{0.28\textwidth}
         \centering
\includegraphics[width=\textwidth]{figs/JT-sr.png}
         \caption{jointree for a replication}
         \label{fig:JT-sr}
     \end{subfigure}
\caption{A DAG with a jointree for its family factors (b,c) and a jointree for a replication of these factors (d).}
\label{fig:jointrees}
\end{figure*}

A popular mechanization of \VE\ is based on the notion of a {\em jointree.} We will review jointrees next
as we shall use them to mechanize the exploitation of Theorems~\ref{thm:ve2} and~\ref{thm:ve3}
and to  formally define the notion of causal treewidth.

\begin{definition}\label{def:jointree}
A \underline{jointree} for factors $\FF$ is a tree in which every leaf node \(i\) is assigned a non-empty set of 
factors \(\FF_i\) where the sets \(\{\FF_i\}_i\) form a partition of factors \(\FF\).%
\footnote{Standard jointrees allow factors to be assigned to any node. Assigning factors to leaves, even one factor 
per leaf, does not preclude jointrees with optimal width; see~\citep[Ch~9]{Darwiche09}.}
\end{definition}
When a factor appears in \(\FF_i\), we will say that leaf node~\(i\) {\em hosts} the factor. 
We will use \(\vars(i)\) to denote the variables of factors \(\FF_i\).
For a jointree edge \((i,j)\), we will use \(\vars(i,j)\) to denote the union of \(\vars(k)\) for every leaf node \(k\)
on the \(i\)-side of the edge.
Figure~\ref{fig:JT-s1} depicts a jointree for the family factors of the DAG in Figure~\ref{fig:dag-rep}.
Each leaf node of this jointree hosts exactly one factor.

A jointree induces edge and node labels as follows.
\begin{definition}\label{def:jointree-labels}
The \underline{separator} \(\S_{ij}\) of jointree edge \((i,j)\) is defined as \(\vars(i,j)\cap\vars(j,i)\).
If node \(i\) is a leaf, its \underline{cluster} \(\C_i\) is defined as \(\vars(i)\), otherwise as \(\bigcup_{j} \S_{ij}\). 
The \underline{width} of a jointree is the size of its largest cluster minus one.
\end{definition}
Figure~\ref{fig:JT-s2} depicts the separators and clusters for the jointree in Figure~\ref{fig:JT-s1}.
The width of this jointree is \(2\) since its largest cluster has \(3\) variables.

Jointrees play at least two key roles. First, their structure provides a specific recipe for when
to multiply factors and when to sum out variables when applying \VE. Second, their separators and clusters define 
the variables of factors
constructed by \VE\ so the sizes of these separators and clusters can be used to 
precisely determine the complexity of \VE. We explain both roles next, starting with the following
theorem which shows how a jointree can be used to direct \VE\ towards the computation of marginals
over separators.

\begin{theorem}\label{thm:jt-marginals}
Consider a jointree for factors \(f_1, \ldots, f_n\). Define the \underline{message}
from jointree node \(i\) to its neighbor \(j\) as:
\[
\mes_{ij} =
\left\{
\begin{array}{lr}
\Sum_{\S_{ij}} \FF_i & \mbox{for leaf node \(i\)} \\
\Sum_{\S_{ij}} \prod_{k \neq j} \mes_{ki} & \mbox{for internal node \(i\)}
\end{array}
\right.
\]
For all jointree edges \((i,j)\), \(\mes_{ij} \mes_{ji} = \Sum_{\S_{ij}} f_1, \ldots, f_n\).\footnote{To
compute the marginal over the family of variable \(X\), choose a leaf node \(i\) in
the jointree which hosts the family factor for \(X\) and multiply this factor by message \(\mes_{ji}\)
where \(j\) is the single neighbor of \(i\); see~\citep[Ch~7]{Darwiche09}.}
\shrink{
\footnote{If \(f_1, \ldots, f_n\) are the family factors of a DAG \(G\) and if each jointree leaf is assigned exactly one factor, 
then every non-leaf in \(G\) will appear in some separator. To ensure that
every variable appears in some separator, add an auxiliary child \(C_i\) for each leaf \(X_i\) in \(G\)
with factor \(g_i(C_i,X_i)\)  such that \(g_i(c_i,x_i)=1\). Any jointree 
for factors \(f_1, \ldots, f_n\) can be easily turned into a jointree of 
the same width for factors \(f_1, \ldots, f_n,g_1,\ldots,g_m = f_1, \ldots, f_n\).}
}
\end{theorem}

Each message corresponds to a factor over some separator in the jointree. 
Hence, separators determine the space complexity of the message-passing
algorithm of Theorem~\ref{thm:jt-marginals}.
A message \(\mes_{ij}\) can be computed in \(O(\exp(|\C_i|))\) time and space 
%assuming node \(i\) has a bounded number of neighbors and 
given messages \(\mes_{ki}\) for \(k \neq j\).
Since \(|\C_i| \leq w+1\), where \(w\) is the jointree width, all messages can be computed
in \(O(n \cdot \exp(w))\) time and space where \(n\) is the number of jointree factors. 
Given an elimination order of width \(w\), one can always construct a jointree of width \(\leq w\)%
%and a bounded number of neighbors
; see~\citep[Ch~9]{Darwiche09}. Hence, the mechanization
of \VE\ using jointrees preserves the treewidth complexity bound. 

In our context, jointrees play a third key role as they provide a direct method for 
exploiting Theorems~\ref{thm:ve2} and~\ref{thm:ve3} as shown in \citep{DarwicheECAI20b}.
Instead of computing a jointree for the original set of factors \(\FF\), one computes a jointree
for a replication \(\FF' \supseteq \FF\) as licensed by Theorem~\ref{thm:ve3}; see Figure~\ref{fig:JT-sr}.
One can then remove variables from separators and clusters in the expanded jointree based on Theorem~\ref{thm:ve2} 
while preserving the soundness of the message passing algorithm. This reduces the jointree width and 
can lead to an exponential reduction in complexity. As in~\citep{causalityAC},
we refer to the process of removing variables from separators and clusters as the 
process of {\em thinning a jointree}.\footnote{The term ``thin jointree'' was used earlier in the context 
of approximate inference~\citep{nips/BachJ01}.}
We will show in the next section that the thinning procedure in~\citep{DarwicheECAI20b} is not complete 
as it can miss opportunities that are licensed by Theorem~\ref{thm:ve2}. 
We will also provide a complete thinning procedure (with respect to Theorem~\ref{thm:ve2}) 
which paves the way for the formal definition of causal treewidth.

\mysection{Thinning Jointrees}
\label{sec:ctw}

Suppose we have a replication \(\FF' \supseteq \FF\) of some factors~\(\FF\).
Given a jointree for the replication \(\FF'\), we will next define the notion of a {\em jointree thinning} and show that
it is optimal (i.e., cannot be improved using Theorem~\ref{thm:ve2}). The replication \(\FF'\) may not be optimal though. 
Constructing optimal replications will be discussed in the next section.	

\begin{definition}
\label{def:embody}
A jointree node \(i\) is said to be \underline{$X$-connected} to a factor \(f\) iff \(i\) hosts \(f\) or
\(X\) appears in every separator on the path between \(i\) and some leaf node \(j\) that hosts factor~\(f\).
\end{definition}

\begin{definition}
\label{def:thinned-jointree-formal}
A \underline{jointree thinning} maps every edge \((i,j)\) in the jointree to a set of variables \(\S^\star_{ij} \subseteq \S_{ij}\), called a
\underline{thinned separator}, and satisfies two properties. First,  for each functional variable \(X \in \S^\star_{ij}\), we have:
\begin{enumerate}
\item[(a)] Node $i$ is not $X$-connected to any \(X\)-mechanism on the $i$-side of edge \((i,j)\), 
or node $j$ is not $X$-connected to any \(X\)-mechanism on the $j$-side of the edge.
\item[(b)] If node \(i\) is not a leaf, then \(X \in \S^\star_{ik}\) for some \(k \neq j\).
\item[(c)] If node \(j\) is not a leaf, then \(X \in \S^\star_{jk}\) for some \(k \neq i\).
\end{enumerate}
Second, no other mapping from edges to supersets of \(\S^\star_{ij}\) satisfies the above property.
\end{definition}

The separators \(\S_{ij}\) of a jointree are determined by the locations of factors (the leaf nodes they are hosted at).
Hence, the separators of a jointree are unique. However, a thinned separator \(\Sij\) depends on both the locations 
of factors and other thinned separators. Hence, a jointree may have multiple thinnings. We define next
the quality of a jointree thinning.

\begin{definition}\label{def:thinning width}
A jointree thinning induces a \underline{thinned cluster} \(\C^\star_i\) for each jointree node \(i\):
If \(i\) is a leaf, \(\C^\star_i = \vars(\FF_i)\); otherwise, \(\C^\star_i = \bigcup_{j} \S^\star_{ij}\).
The \underline{width} of a jointree thinning is the size of its largest thinned cluster minus one.
\end{definition}
A jointree thinning leads to the notion of a causal jointree.

\begin{definition}\label{def:causal jointree}
A \underline{causal jointree} is a jointree in which edges are annotated with thinned separators and
nodes are annotated with thinned clusters. The \underline{causal width} of a jointree is the smallest
width attained by any of its causal jointrees. 
\end{definition}

The width of a jointree can be determined by examining its cluster sizes.
However, determining the causal width of a jointree is more involved as, in principle, it requires 
examining all thinnings of the jointree (causal jointrees). 

\begin{theorem}\label{thm:widths}
The width of a jointree thinning and the causal width of a jointree are no greater than the jointree width.
\end{theorem}

\begin{figure}[tb]
\centering
\import{./tikz_figs/}{fig9_11.tex}
\caption{A Bayesian network with functional nodes $B$ and $C$.}
\label{fig:complete-replicate}
\end{figure}

\begin{figure}[tb]
\begin{subfigure}[t]{0.495\linewidth}
\centering
%\import{./tikz_figs/}{fig9_9.tex}
\includegraphics[width=\linewidth]{./figs/JT_1.png}
\caption{jointree thinning of width \(2\)}
\label{fig:exthinning2}
\end{subfigure}
\begin{subfigure}[t]{0.495\linewidth}
\centering
%\import{./tikz_figs/}{fig9_10.tex}
\includegraphics[width=\linewidth]{./figs/JT_2.png}
\caption{jointree thinning of width \(3\)}
\label{fig:exthinning3}
\end{subfigure}
\caption{Two jointree thinnings. Each edge \((i,j)\) is marked by separator $\S_{ij}$.
Red variables are not in the thinned separator $\Sij$.}
\label{fig:jointree-thinning}
\end{figure}

Figure~\ref{fig:complete-replicate} depicts a Bayesian network with two functional nodes (\(B,C\)) and
nine factors \(\FF = f_A, f_B, \ldots, f_I\). Consider now the replication \(\FF' \supseteq \FF\) which results from 
duplicating mechanisms \(f_B\) and \(f_C\) once (that is, \(\FF'\) has \(11\) factors). 
Figure~\ref{fig:jointree-thinning} depicts a jointree for the replication \(\FF'\) and two of its thinnings
according to Definition~\ref{def:thinned-jointree-formal}. 
The one in Figure~\ref{fig:exthinning2} has width~\(2\). The one in Figure~\ref{fig:exthinning3} has width~\(3\).

Using thinned separators as given by Definition~\ref{def:thinned-jointree-formal} 
preserves the correctness of the message passing algorithm. 

\begin{theorem}
\label{thm:sound}
Theorem~\ref{thm:jt-marginals} continues to hold if we use thinned separators \(\Sij\) (as given by Definition~\ref{def:thinned-jointree-formal}) 
instead of classical separators \(\S_{ij}\) (as given by Definition~\ref{def:jointree-labels}).
\end{theorem}

The following result shows that we cannot improve Definition~\ref{def:thinned-jointree-formal} of jointree thinnings 
based only on Theorem~\ref{thm:ve2}.
\begin{theorem}\label{thm:completeness}
Consider a jointree thinning (Definition~\ref{def:thinned-jointree-formal}). If we remove any functional
variable from a thinned separator, then Theorem~\ref{thm:ve2} will no longer be sufficient to prove
the soundness of the message-passing algorithm (Theorem~\ref{thm:jt-marginals}).
\end{theorem}

We next provide a characterization of jointree thinnings, which is more suitable for verifying
whether the removal of variables from classical separators leads to a valid thinning.

\begin{theorem}\label{thm:characterize-thinning}
A mapping from each jointree edge \((i,j)\) to variable set \(\S^\star_{ij}\) is a jointree
thinning according to Definition~\ref{def:thinned-jointree-formal} iff
(1)~for each non-functional variable \(X\), \(X \in \S^\star_{ij}\) iff \(X \in \vars(i,j) \cap \vars(j,i)\);
(2)~for each functional variable \(X\):
(a)~if \(X \in \vars(i)\) for a leaf node~\(i\), then \(i\) is \(X\)-connected to exactly one mechanism for \(X\); 
(b)~if \(X \in \S^\star_{ij}\) for some non-leaf \(i\), then \(X \in \S^\star_{ik}\) for some \(k \neq j\).
\end{theorem}

Definition~\ref{def:thinned-jointree-formal} tells us what a thinning is but it does not tell us how to obtain one.
We next provide a set of {\em thinning rules} that will generate every thinning admitted by
Definition~\ref{def:thinned-jointree-formal}.

\begin{theorem}
\label{thm:equivalence1}
We can obtain a jointree thinning by starting with \(\S^\star_{ij} = \S_{ij}\) and then removing variables from \(\S^\star_{ij}\) according
to the following rules, until no rules can be applied. Remove functional variable \(X\) from \(\S^\star_{ij}\) if either
\begin{enumerate}
\item[(a)] Node $i$ is $X$-connected to some \(X\)-mechanism on the $i$-side of edge~\((i,j)\)
and node $j$ is $X$-connected to some \(X\)-mechanism on the $j$-side of the edge; or
\item[(b)] $X \notin \S^\star_{ki}$ for all $k \neq j$ when node \(i\) is not a leaf; or 
\item[(c)] $X \notin \S^\star_{jk}$ for all $k \neq i$ when node $j$ is not a leaf. 
\end{enumerate}
\end{theorem}

We will use $R_a(i,j,X)$ to mean that Rule~(a) is applicable to variable \(X\) and edge \((i,j)\)
and call it a {\em rule application}. Similarly for $R_b(i,j,X)$ and $R_c(i,j,X)$. 
A jointree thinning  can now be specified using a sequence of rule applications.
The thinning in Figure~\ref{fig:exthinning3} corresponds to
$R_a(6,15,C)$, $R_c(4,6,C)$, $R_c(3,4,C)$, $R_c(2,3,C)$, $R_a(6,15,B)$.
The one in Figure~\ref{fig:exthinning2} corresponds to
$R_a(6,15,C)$, $R_c(4,6,C)$, $R_c(3,4,C)$, $R_c(2,3,C)$, $R_a(13,15,B)$, $R_c(11,13,B)$, $R_c(10,11,B)$, $R_c(8,10,B)$.

\begin{definition}
\label{def:thin-sequence}
A \underline{thinning sequence} is a list of rule applications $R^1,\dots,R^n$ where each rule is valid when it is applied
and no rules are applicable after the sequence terminates.   
\end{definition}
Theorem~\ref{thm:equivalence1} says that the thinning rules are sound. 
The next result says they are complete (with respect to Definition~\ref{def:thinned-jointree-formal}).

\begin{theorem}
\label{thm:equivalence2}
Every causal jointree can be obtained using some thinning sequence. 
\end{theorem}

Two distinct thinning sequences may yield the same jointree thinning since the order of applying rules may not matter in some cases.
The following result suggests a restriction on thinning sequences that does not compromise their ability to discover every possible jointree thinning.

\begin{theorem}
\label{thm:thinning2}
Every jointree thinning can be obtained by a thinning sequence in which all applications of Rule~(a) come before the applications of Rules~(b,c).
\end{theorem}

That is, we can first exhaust all applications of Rule~(a) and then apply Rules~(b,c). 
In fact, once we exhaust all applications of Rule~(a), applying Rules~(b,c) becomes deterministic. 
In other words, the jointree thinning obtained by a thinning sequence is fully determined by its applications of Rule~(a).


Thinning sequences mechanize the thinning process but finding an optimal thinning sequence remains a
computationally challenging task given the large number of such sequences (even under the above restriction).
Hence, one needs either sophisticated search algorithms or a heuristic to decide which thinning rule
to apply and when. One heuristic that we found effective is to prefer $R_a(i,j,X)$ with the largest \(\Sij\),
followed by \(X\) that is contained in the fewest neighboring separators, followed by 
minimizing the number of \(X\)-connected \(X\)-mechanisms on either side of edge~\((i,j)\). 


\begin{figure}[tb]
\includegraphics[width=\linewidth]{figs/fjt_updated.png}
\caption{Comparing the thinning rules in Theorem~\ref{thm:equivalence1} with the ones in~\citep{DarwicheECAI20b}.
Functional nodes are restricted to be internal (non-root) nodes. The average time for applying the new thinning rules to
networks with \(150\) variables and \(100\%\) functional (hardest configuration) is \(6.86\) sec, with a min/max time of \(0.8/28.8\) sec.
\label{fig:exp1}}
\end{figure}

\citet{DarwicheECAI20b} proposed three thinning rules that apply only to binary jointrees in which each node has one or 
three neighbors~\citep{DBLP:conf/uai/Shenoy96}.
The rules are not complete though as they can miss thinnings admitted by Definition~\ref{def:thinned-jointree-formal}.
As in the rules we defined above, one starts by setting thinned separators \(\Sij\) to classical separators \(\S_{ij}\) and
then tries to remove variables from \(\Sij\) using the rules. 
However, these rules can only be applied when visiting the jointree nodes in a particular order. A leaf node \(h\) is identified first and
then nodes are visited based on their distance from \(h\), where the closer nodes are visited first. 
Suppose we are visiting a non-leaf node \(i\). Let \(p\) be its neighbor that is closest to leaf \(h\) and let  \(c_1\) and \(c_2\) be 
its two other neighbors. The first two rules require the following conditions:
\(X \in \S^\star_{ic_1}\), \(X \in \S^\star_{ic_2}\), an \(X\)-mechanism is hosted on the \(c_1\)-side of edge \((c_1,i)\) and 
an \(X\)-mechanism is hosted on the \(c_2\)-side of edge \((c_2,i)\). 
If we further have \(X \in \S^\star_{ip}\), the first rule licenses the removal of \(X\) from either \(\S^\star_{ic_1}\) or \(\S^\star_{ic_2}\).
If \(X \not \in \S^\star_{ip}\), the second rule  licenses the removal of \(X\) from both \(\S^\star_{ic_1}\) and \(\S^\star_{ic_2}\).
The final rule applies to the single neighbor \(r\) of leaf \(h\), allowing
us to remove variable \(X\) from \(\S_{hr}^\star\) when an \(X\)-mechanism is hosted at leaf \(h\) and also 
at some other leaf in the jointree. The first rule involves a choice which is made using a heuristic
described in~\citep{DarwicheECAI20b}. 
%One can show that the above rules follow from the ones  in Theorem~\ref{thm:equivalence1}.

Consider now the Bayesian network in Figure~\ref{fig:complete-replicate} 
and its thinned jointree in Figure~\ref{fig:exthinning2} which has width \(2\). The best thinning that can be obtained
by the rules in \citep{DarwicheECAI20b} has width \(3\), regardless of which leaf node \(h\) we choose and
regardless of what choices we make when applying the first rule. Figure~\ref{fig:exp1} depicts a comparison
between these rules and the ones in Theorem~\ref{thm:equivalence1} on random (binary) jointrees,
for the factors of complete replications generated by Algorithm~\ref{alg:replicate}. The plots in this figure vary the number of
Bayesian network nodes from \(10\) to \(150\) and consider different percentages of functional nodes (\(25\), \(50\),
\(75\) and \(100\)) which are restricted to be non-root nodes.\footnote{Our method for generating a Bayesian network with 
nodes \(X_1, \ldots, X_n\) assumes that each node has at most five parents. We visit nodes \(X_i\) from \(i=1\) to \(i=n\).
When visiting node \(X_i\), we randomly choose a number from \(\{0,\ldots,\min(5,i-1)\}\) to represent the
number of parents for \(X_i\) and then randomly choose that many parents from \(X_1, \ldots, X_{i-1}\).}
They report the mean of maximal cluster size (width+1) over \(10\) jointrees for each data point.
The plots are for the cluster sizes of (1) a classical jointree (blue), (2) a causal jointree obtained by the incomplete rules (red) and 
(3) a causal jointree obtained by the proposed rules (yellow). 
Four patterns are clear: more thinning takes place as we increase the number of
functional nodes; the proposed thinning rules are much more effective; the gap between the two sets of
rules grows as we increase the number of Bayesian network nodes and the number of functional nodes;
the exploitation of unknown mechanisms can lead to significant reduction in inference complexity.

\mysection{Mechanism Replication}
\label{sec:replication}

The definition of thinning that we developed in the previous section was with respect to a particular replication \(\FF\)
and a particular jointree for the factors in \(\FF\).
Some replications are better than others in that they lead to causal jointrees of smaller width. We formalize this next.

\begin{definition}\label{def:replication-width}
The \underline{width} of a replication \(\FF\) is defined as the minimum width attained by any causal jointree for \(\FF\).
\end{definition}
Given a replication \(\FF\), we need to examine two search spaces before we can determine its width. First, we must choose a jointree for the factors
in \(\FF\). Second, we must choose a particular thinning of the jointree. Hence, determining the width of a replication is not a straightforward task.
Moreover, the width of a replication is not the only measure of its quality
as we need to also consider its size. This is a critical issue that was not discussed in~\citep{DarwicheECAI20b,causalityAC} and 
that we need to explore carefully before we are ready to provide the formal definition of causal treewidth.

The size of a replication is the number of factors it contains (replicas are counted individually).
To highlight the importance of a replication size, consider two replications \(\FF_1\) and \(\FF_2\) with respective sizes \(n_1\) and \(n_2\).
Suppose now that replication \(\FF_1\) has width \(w_1\) and replication \(\FF_2\) has width \(w_2\).
This means that there exists a causal jointree for replication \(\FF_1\) of width \(w_1\) and no other causal jointree can have a smaller width
(and similarly for replication \(\FF_2\)). If we use these optimal causal jointrees, inference using these replications can be
done in \(O(n_1 \cdot \exp(w_1))\) and \(O(n_2 \cdot \exp(w_2))\) time and space, respectively. One may be tempted to choose the
replication with smaller width since complexity is exponential in width but linear in size. However, the size of a replication may also be
exponential as we show next. In fact, the key result of this section is that the replication strategy proposed in~\citep{DarwicheECAI20b},
shown in Algorithm~\ref{alg:replicate}, satisfies two interesting properties. First, it is optimal: no other replication strategy will have a smaller width. Second, it can
lead to replications of exponential size. We will in fact provide a bound on the size of replications produced by this 
strategy and suggest how it can be improved to avoid a blow up in replication size.

We start with the following result which shows that excessive replication can never hurt width.

\begin{theorem}\label{thm:replication-size}
\label{lem:replication}
Consider two replications $\FF_1$ and $\FF_2$ of some factors \(\FF\) where \(\FF \subseteq \FF_1 \subseteq \FF_2\).
If the width of replication \(\FF_1\) is \(w\), then the width of replication \(\FF_2\) is \(\leq w\).
\end{theorem}

We next identify a class of replications that possess some significant properties.
\begin{definition}\label{def:complete-replication-strategy}
A replication \(\FF\) is  \underline{complete} iff it satisfies the following property for each functional variable \(X\) and its mechanism \(f_X\).
If \(n\) is the number of \(X\)-feeding factors in \(\FF\) and \(n>0\), 
then \(\FF\) contains \(n\) replicas of mechanism \(f_X\). Otherwise, \(\FF\) contains only one replica of \(f_X\).
\end{definition}

The first property of complete replications is uniqueness. 

\begin{theorem}
\label{thm:unique-replica}
The family factors of any DAG have a unique complete replication.
Moreover, the replications generated by Algorithm~\ref{alg:replicate} are complete.
\end{theorem}

The second property of complete replications is optimality. 

\begin{theorem}\label{thm:replication}
Let \(\FF\) and \(\FF'\) be two replications of the family factors of a DAG.
If replication \(\FF\) is complete, then its width is no greater than the width of replication \(\FF'\).
\end{theorem}

\begin{figure}[tb]
\centering
\import{./tikz_figs/}{fig5.tex}
\caption{A DAG with an exponential complete replication.}
\label{fig:chain-func}
\end{figure}

The third property of complete replications is that their size can be exponential.
Consider the family of DAGs in Figure~\ref{fig:chain-func} which have \(3n+2\) nodes for \(n \geq 1\).
Algorithm~\ref{alg:replicate}, which generates complete replications, starts with a set \(\Sigma\) containing the \(3n+2\) family factors.
It then visits functional nodes bottom-up and replicates their mechanisms. One can easily show that after visiting functional node \(X_i\),
the set \(\Sigma\) will contain \(2^i\) replicas of the mechanism for \(X_i\).

We next provide a bound on the number of factors in a complete replication which suggests a method for controlling 
the potential blow up in its size.

\begin{definition}
A \underline{functional chain} of length \(k\) in a DAG  is a sequence of functional nodes \(n_1, \ldots, n_k\) where node \(n_i\) is a parent of node \(n_{i+1}\) for \(i=1,\ldots,k-1\).
\end{definition}

\begin{theorem}
\label{thm:replication1}
Consider a DAG with $n$ nodes. Let $c$ be the largest number of children for any node and  let $k$ be the length of the longest functional chain.
The (unique) complete replication of this DAG will contain at most $nc^k$ factors.
\end{theorem}

This bound immediately provides a method for controlling the number of replicas in a complete replication. 
If we treat variable $X_{n/2}$ in Figure~\ref{fig:chain-func} as a non-functional variable, 
the length of the longest functional chain will be cut in half. Hence, by selectively ignoring some functional variables
we can bound the size of functional chains and therefore ensure that Algorithm~\ref{alg:replicate} will produce 
replications with size that is polynomial in the number of DAG nodes.

We are now ready to define causal treewidth formally. 

\begin{definition}
Consider a DAG \(G\) with \(n\) nodes, some of which are declared as functional. 
The \underline{causal treewidth} of DAG \(G\) is the smallest width attained by any replication \(\FF\) for \(G\) 
where the size of \(\FF\) is polynomial in \(n\).
\end{definition}

For the class of DAGs with bounded functional chains, one can use the complete replication \(\FF\) to determine the causal treewidth
of the DAG. That is, determining causal treewidth becomes a matter of finding an optimal causal jointree for the factors in \(\FF\).
The situation is more intricate for DAGs with unbounded functional chains. The complete replication cannot be used 
in this case and one must search among replications of polynomial size. It remains
to be seen whether the space of replications to be explored can be restricted to subsets of the complete replication as suggested earlier.
This is a subject of future work. 
We note here that~\citet{causalityAC} identified a family of DAGs with \(O(n^2)\) nodes, bounded depth and
treewidth \(n+1\), while constructing thinned jointrees of width \(2\) for the family, assuming all internal nodes are functional. 
This is an example where the treewidth is unbounded while the causal treewidth is bounded, showing that causal
treewidth dominates treewidth.

The appendix contains an experiment that reveals the importance of replication strategies and how
such strategies interact with jointree construction methods.  
The experiment exhibited a number of patterns.
First, the causal width was always smaller than the width, and quite substantially smaller, even when using random replications.
Next, complete replications always produced a smaller causal width compared to random replications, particularly when the
number of functional nodes is largest (100\%).
Finally, increasing the size of a random replication almost always correlated with decreasing the causal width but up to a certain
point after which increasing the size of a replication did not help. 


\mysection{Conclusion}
\label{sec:conclusion}

We provided a formal definition of the notion of causal treewidth, which dominates the classical and influential notion of treewidth.
We also studied the three ingredients needed to define causal treewidth: mechanism replication, jointree construction and jointree thinning
which yields causal jointrees. 
On the first front, we presented a number of results about a replication strategy that we called complete replication, showing that it 
is optimal while providing a bound on the size of replications it produces and suggesting a technique for controlling their size. 
On the second front, we highlighted the relevance (and irrelevance)
of classical jointree construction methods to the construction of jointrees for replications. On the third front, we provided a complete characterization
of causal jointrees and provided three thinning rules that are sound and complete for generating causal jointrees. We also proved some  
properties of these rules which can be of practical significance. We finally presented some experimental results to shed further light on the 
developments in this paper, which also showed that causal jointrees can lead to exponential improvements in the 
complexity of inference in comparison to jointrees.

\begin{acknowledgements} 
We wish to thank Yunqiu Han for useful feedback. This work has been partially supported by NSF grant
\#ISS-1910317 and ONR grant \#N00014-18-1-2561.
\end{acknowledgements}

%\bibliography{uai2022-template}

\bibliography{bib/adnan,bib/references,bib/references2,bib/refs,bib/refsnn}


\end{document}