%%% Begin UAI-22 formatting
% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised
%                                     % version; also before submission to
%                                     % see how the non-anonymous paper
%                                     % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% Use the postscript times font!
\usepackage{times}
\usepackage{soul}
\usepackage{url}
\usepackage{hyperref}
\hypersetup{
    colorlinks,
    linkcolor={black},
    citecolor={blue!50!black},
    urlcolor={black}
}
    % \hypersetup{
    %     colorlinks=true,
    %     urlcolor=blue,
    %     urlbordercolor=blue,
    %     linkcolor=blue,
    %     linkbordercolor=blue,
    %     filecolor=magenta,
    %     pdfborderstyle={/S/U/W 1},
    % }
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{amsfonts}
\usepackage{booktabs}
% \usepackage{subfigure}
\usepackage{amssymb}
\urlstyle{same}
%%% End UAI-22 formatting

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM PACKAGES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{xspace} % package being used for \newcommand to remove extra space
                    %     when a command is invoked without an argument list
\usepackage{textcase}
\usepackage[toc, nopostdot]{glossaries}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{enumitem}
\usepackage{refcount}
\usepackage[titlenumbered,ruled, linesnumbered]{algorithm2e}
\usepackage{mathrsfs} %for \mathscr
\usepackage[font=footnotesize,labelfont=bf]{caption}
\usepackage[font=footnotesize,labelfont=bf]{subcaption}
\usepackage{xcolor}
    \definecolor{darkgreen}{rgb}{0.0, 0.2, 0.13}
    \definecolor{cadmiumgreen}{rgb}{0.0, 0.42, 0.24}
    \definecolor{byzantium}{rgb}{0.44, 0.16, 0.39}
    \definecolor{darkelectricblue}{rgb}{0.33, 0.41, 0.47}
    \definecolor{battleshipgrey}{rgb}{0.52, 0.52, 0.51}
    \definecolor{warmblack}{rgb}{0.0, 0.26, 0.26}
\usepackage{newfloat}
\usepackage{chngcntr}
% \usepackage{graphicx}
\usepackage{wrapfig}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM COMMANDS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Declare an additional float environment, `tablefigure', whose captions are
% labelled "Table" (provided by the newfloat package).
\DeclareFloatingEnvironment[name=Table]{tablefigure}

% Alias the internals of `tablefigure' to those of the standard `table' float:
%   \c@tablefigure     -> the counter, so both float kinds share one numbering;
%   \ftype@tablefigure -> the float type, so both kinds are treated as a single
%                         set of floats and appear in the order they occur.
\makeatletter
\let\c@tablefigure\c@table
\let\ftype@tablefigure\ftype@table
\makeatother

% Operators typeset upright; the starred form places limits underneath in
% display style.
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
% A "math hyphen": character code "2D declared as a math character.
\mathchardef\mhyphen="2D

% algorithm2e configuration: comment style and the Input/Output keyword labels.
% NOTE(review): \SetCommentSty below refers to \commentstyle, whose only visible
% definition is the commented-out line that follows. Presumably it is defined
% in _cmds.tex -- confirm, otherwise algorithm comments may fail with an
% undefined control sequence.
% \newcommand\commentstyle[1]{\textcolor{cadmiumgreen}{#1}}
\SetCommentSty{commentstyle}
\SetKwInOut{Input}{input}
\SetKwInOut{Output}{output}

% Theorem-like environments (amsthm). theorem, corollary, lemma and proposition
% share the `theorem' counter; definition has its own counter. Both counters are
% numbered within subsubsection.
\newtheorem{theorem}{Theorem}[subsubsection]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}{Definition}[subsubsection]

% Project-specific macros live in _cmds.tex (contents not visible from here).
\input{_cmds}
% Typeset the link text of glossary references (\gls, \Gls, ...) in bold.
\renewcommand*{\glstextformat}{\textbf}






%%% for supplemental

\usepackage{enumitem}
    % Let itemize lists nest up to nine levels deep: a bullet at the outermost
    % level and a centred dot at every deeper level. The per-level labels are
    % registered first, then itemize is re-declared with the new maximum depth.
    \setlistdepth{9}
    \setlist[itemize,1]{label=$\bullet$}
    \setlist[itemize,2,3,4,5,6,7,8,9]{label=$\cdot$}
    \renewlist{itemize}{itemize}{9}






%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Number headings and list them in the table of contents down to level 3
% (subsubsection).
\setcounter{secnumdepth}{3} %May be changed to 1 or 2 if section numbers are desired.
\setcounter{tocdepth}{3}

% Title of the supplement; the \vspace adjustments fine-tune the vertical
% position of the two title lines.
\title{\vspace{-4pt}Value-Based Abstraction Functions for Abstraction Sampling\\ \vspace{6pt}(Extended Background)}


% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Each author name is a mailto: hyperlink with a pre-filled subject line.
% NOTE(review): the affiliation below is declared as \affil[1], but none of the
% \author entries carries a [1] marker -- confirm against the UAI class
% template that the affiliation is still attached to every author.
\author{\href{mailto:<pezeshkb@uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Bobak Pezeshki}{}}
\author{\href{mailto:<kkask@uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Kalev Kask}{}}
\author{\href{mailto:<ihler@ics.uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Alexander Ihler}{}}
\author{\href{mailto:<dechter@ics.uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Rina Dechter}{}}
% Add affiliations after the authors
\affil[1]{%
    University of California, Irvine
}




% Presumably the glossary, abbreviation, notation and algorithm entries printed
% later in the document -- confirm against _gls.tex (not visible from here).
\input{_gls}



\begin{document}
    \onecolumn
    \setlength{\abovedisplayskip}{3pt}
    \setlength{\belowdisplayskip}{3pt}

    \maketitle

    \begin{abstract}
        \noindent For revised supplemental materials, please visit \url{https://ics.uci.edu/~dechter/publications.html}.  In addition to providing a glossary of terms, abbreviations, and notation, this document aims to provide readers with background on topics that are foundational to the concepts that are discussed in the main paper. The most up-to-date version of this document - as well as other supplemental materials - can be found on the \href{https://www.ics.uci.edu/~dechter/publications.html}{Dechter Lab publications page}.
    \end{abstract}

    \tableofcontents



    % Front matter: print the four glossaries (main, abrv, nt, algs), each
    % starting on a fresh page. `title' sets the printed heading, `toctitle'
    % the roman-numbered entry shown in the table of contents, and `sort=use'
    % lists the notation entries in order of first use.
    % NOTE(review): \printnoidxglossary needs \makenoidxglossaries in the
    % preamble; presumably that is issued in _gls.tex -- confirm.
    % \glsaddallunused 
    \setglossarystyle{altlist}
\clearpage
    \printnoidxglossary[title=Glossary, toctitle=i.\ \ \ \  Glossary]
\clearpage
    \printnoidxglossary[type=abrv, title=Abbreviations, toctitle=ii.\ \ \  Abbreviations]
\clearpage
    \printnoidxglossary[type=nt, sort=use, title=Notation, toctitle=iii.\ \  Notation]
\clearpage
    \printnoidxglossary[type=algs, title=Algorithms, toctitle=iv.\ \  Algorithms]


    


\clearpage

    \section{Background: Graphical Models}
        \Glspl{gls:graphical-model}, such as Bayesian or Markov networks~\cite{pearl88,darwiche-book,DBLP:series/synthesis/2013Dechter}, are mathematical tools for modeling complex systems, each composed of a set of variables with defined domains and functions defined over subsets of the variables.  The functions capture local dependencies among the subset of variables they are defined on; this subset is known as the function's \gls{gls:scope}. The functions of a graphical model often represent a factorization of a global function over all the variables.  An assignment to all of the variables (referred to as a \gls{gls:full-configuration}) represents a possible state of the modeled system. 
        %(A formal definition is upcoming in Sub-\seclink{sec:discrete-graphical-models-formulation}).

        Graphical models are constructed not only to model a system, but also to provide a means of efficiently answering specific queries of interest via exploitation of the model's structure.  Some common computational tasks are 
        \begin{itemize}
            \item   determination of the
                    \gls{gls:partition-function}: a normalization constant necessary for computing probabilistic quantities
            \item   determination of the
                    \textbf{MPE} (\gls{gls:most-probable-explanation}): the most probable full configuration given a \gls{gls:partial-configuration} (assignments to a subset of the variables) known as observation or evidence. Additionally, the associated likelihood corresponding to the MPE, known as the \textbf{MAP} (\gls{gls:maximum-a-posteriori}) value, can also be queried in kind
            % \item   determination of the
            %         marginal probability of the joint configurations of a subset of variables
            \item   determination of the
                    \textbf{MMAP} (\gls{gls:marginal-maximum-a-posteriori}) configuration: 
                    the configuration of a target subset of variables that maximizes their marginal likelihood
        \end{itemize}
         (More details about common graphical model queries to be provided in \seclink{sec:well-known-graphical-model-tasks}).

        \subsection{Discrete Graphical Models} \label{sec:discrete-graphical-models-formulation}
            Considering the discrete space, a discrete graphical model can be defined as a 3-tuple $\MDef$, where
            \begin{itemize}
                \item   % $\mathbf{X} \! = \! \{X_i \! : i \! \in \! V\}$
                        % is a set of indexed nodes, each representing a corresponding variable from an ordered set $V$
                        $\mathbf{X}$ is a set of variables over which the model is defined
                \item   $\mathbf{D} \! = \! \{D_{X} \! : X \! \in \! \mathbf{X}\}$
                        is a set of finite domains, one for each $X \in \mathbf{X}$, defining the possible values each $X$ can be assigned
                \item   Each $f_{\alpha} \in   \mathbf{F}$
                        (sometimes denoted $f \in   \mathbf{F}$ for simplicity) is  a real-valued function defined over a subset of the model's variables $\alpha \subseteq \mathbf{X}$, known as the function's \gls{gls:scope}, for which the function defines local interactions.  More concretely, if we let $D_{\alpha}$ denote the Cartesian product of the domains of the variables in $\alpha$, then
                        $f_{\alpha} \! : D_{\alpha} \! \rightarrow \mathbb{R}_{\geq 0}$. These functions can be expressed as tables for which there is a real-valued output associated with every possible input $d_{\alpha} \in D_{\alpha}$ (i.e.,\ every possible joint assignment - or \gls{gls:configuration} - to all of the variables in $\alpha$).
            \end{itemize}

        \subsection{Graphical Model Notation}
            Capital letters ($X$) represent variables and small letters ($x$) represent their values.
            Boldfaced capital letters ({\bf X}) denote a collection of variables,
            $|{\bf X}|$ its cardinality, 
            $D_{\X}$ their joint domains, 
            and $\xx$ a particular realization in that joint domain.
            Abusing notation, operations denoted $\bigoplus_{\X}$ (e.g.,\ $\sum_{\X}$) imply\dots
            \begin{align}
                \begin{split}
                    \bigoplus_{\X}
                                    &\iff \bigoplus_{\xx \in D_{\X}}\\
                                    &\iff \bigoplus_{x_{1} \in D_{X_{1}}} \bigoplus_{x_{2} \in D_{X_{2}}} \cdots \bigoplus_{x_{|\X|} \in D_{X_{|\X|}}}
                \end{split}
            \end{align}
            Furthermore, given a function $f_{\alpha}$ with scope $\alpha$, a superset of variables $\beta$ such that $\alpha \subseteq \beta$, a particular configuration $\textbf{b}$ of $\beta$, and $\textbf{a} := \{ X \leftarrow x \mid X \leftarrow x \in \textbf{b} \textnormal{ and } X \in \alpha \}$ (i.e.,\ the subset of assignments in $\textbf{b}$ corresponding to variables in $\alpha$),
            \begin{align}
                f_{\alpha}(\textbf{b}) \implies f_{\alpha}(\textbf{a})
            \end{align}

        \subsection{Primal Graph} \label{sec:primal-graph}
            A \gls{gls:primal-graph} $\mathcal{G} \! = \! \langle \mathbf{X,E} \rangle$ of a graphical model $\mathcal{M}$ associates each variable of $\M$ with a corresponding node in a one-to-one fashion such that arcs $e \! \in \! \mathbf{E}$ connect nodes whose variables appear in the scope of the same local function.  To simplify, we abuse notation by using the same symbols to refer to primal graph nodes as their corresponding variables in $\mathcal{M}$.  (For those familiar, note that the primal graph corresponds to a Markov Random Field graph representation of the model). The primal graph is a useful tool for graphical model algorithms' exploitation of the model's local structure.
        
        \subsection{Simple Example} \label{sec:simple-example}
            Consider a simple model that relates temperature and humidity to the chance of rain, and temperature and elevation to the chance of different oxygen levels. Let us choose binary variables $\mathbf{X}=\{T, H, R, E, O\}$ to represent these different levels and construct a corresponding graphical model $\MDef$ where 
            \begin{itemize}
                \item   $T$ has corresponding domain 
                        $D_{T} = \set{low, high}$ representing high and low temperature
                \item   $H$ has corresponding domain 
                        $D_{H} = \set{low, high}$ representing high and low humidity
                \item   $R$ has corresponding domain
                        $D_{R} = \set{no, yes}$ representing the presence or absence of rain
                \item   $E$ has corresponding domain
                        $D_{E} = \set{low, high}$ representing the high or low elevation
                \item   $O$ has corresponding domain
                        $D_{O} = \set{low, high}$ representing the high or low oxygen levels
            \end{itemize}
            and having five functions
            \begin{itemize}
                \item   $f_{T}(T)$ 
                        representing the marginal probability of the temperature being low or high, $p(T)$
                \item   $f_{H}(H)$ 
                        representing the marginal probability of the humidity being low or high, $p(H)$
                \item   $f_{E}(E)$ 
                        representing the marginal probability of the elevation being low or high, $p(E)$
                \item   $f_{T,H,R}(T,H,R)$ 
                        representing the conditional probability of rain given levels of humidity and temperature, $p(R \,|\, T, H)$
                \item   $f_{T,E,O}(T,E,O)$ 
                        representing the conditional probability of high vs. low oxygen concentrations given the temperature and elevation levels, $p(O \,|\, T, E)$
            \end{itemize}
            defined by the following tables, respectively:

            %%%% p(T)
            \begin{tabular}{c||c}
                $T$  &  $p(T)$   \\
                \hline
                $low$   &  $0.60$  \\
                $high$  &  $0.40$  
            \end{tabular}
            \quad
            \begin{tabular}{c||c}
                $H$  &  $p(H)$   \\
                \hline
                $low$   &  $0.75$  \\
                $high$  &  $0.25$
            \end{tabular}
            \quad
            \begin{tabular}{c||c}
                $E$  &  $p(E)$   \\
                \hline
                $low$   &  $0.80$  \\
                $high$  &  $0.20$
            \end{tabular}

            \begin{tabular}{c|c|c||c}
                $T$     &  $H$     &  $R$    &  $p(R\,|\,T,H)$   \\
                \hline
                $low$   &  $low$   &  $no$   &  $0.90$ \\
                $low$   &  $low$   &  $yes$  &  $0.10$ \\
                $low$   &  $high$  &  $no$   &  $0.20$ \\
                $low$   &  $high$  &  $yes$  &  $0.80$ \\
                $high$  &  $low$   &  $no$   &  $0.95$ \\
                $high$  &  $low$   &  $yes$  &  $0.05$ \\
                $high$  &  $high$  &  $no$   &  $0.60$ \\
                $high$  &  $high$  &  $yes$  &  $0.40$
            \end{tabular}
            \quad
            \begin{tabular}{c|c|c||c}
                $T$     &  $E$     &  $O$     &  $p(O\,|\,T,E)$   \\
                \hline
                $low$   &  $low$   &  $low$   &  $0.30$ \\
                $low$   &  $low$   &  $high$  &  $0.70$ \\
                $low$   &  $high$  &  $low$   &  $0.75$ \\
                $low$   &  $high$  &  $high$  &  $0.25$ \\
                $high$  &  $low$   &  $low$   &  $0.60$ \\
                $high$  &  $low$   &  $high$  &  $0.40$ \\
                $high$  &  $high$  &  $low$   &  $0.80$ \\
                $high$  &  $high$  &  $high$  &  $0.20$
            \end{tabular}
            
            and where we make independence assumptions allowing the joint distribution $p(T,H,R,E,O)$ to factorize into the probability functions represented by the model such that:
            \begin{align}
                \begin{split}
                    p(T, H, E, R, O) 
                        &= p(T) \cdot p(H \,|\, T) \cdot p(E \,|\, T,H) \cdot p(R \,|\, T,H,E) \cdot p(O \,|\, T,H,E,R)\\
                        &= p(T) \cdot p(H)  \cdot p(E) \cdot p(R \,|\, T,H)\cdot p(O \,|\, T, E)\\
                        &= f_{T}(T) \cdot f_{H}(H)  \cdot f_{E}(E) \cdot f_{T,H,R}(T,H,R) \cdot f_{T,E,O}(T,E,O)\\
                        &= \prod_{f_{\alpha} \in \mathbf{F}} f_{\alpha}(\alpha) 
                \end{split} \label{eq:simple-example-joint-as-prod-of-funcs}
            \end{align}

            with primal graph:
            \vspace{-18pt}
            \begin{figure}[h]
                \centering
                \includegraphics[scale=0.6]{./_attachments/images/weather_primal_graph.pdf}
                \caption{Primal graph of the example model described above.}
                \label{fig:example-primal-graph}
            \end{figure}            
            
            You can see that the graph consists of nodes representing $T$, $H$, $R$, $E$, and $O$ and has edges between each pair of $\{T, H, R\}$ since each pair appears together in at least one $f_{\alpha} \in \mathbf{F}$ 
            %- namely, $\forall X',X'' \in \mathbf{X} \; s.t. \; X' \neq X'', \exists f_{\alpha} \in \mathbf{F} \; s.t. \; X' \in \alpha \textnormal{ and } X'' \in \alpha$
            , and similarly between each pair of $\{T, E, O\}$.

            With the model and primal graph defined, we can then use a variety of algorithms over graphical models to efficiently answer queries about the model.  One such query could be to find the probability corresponding to the mode of our modeled distribution - namely to find the probability associated with the most likely full configuration.  Throughout the next several sections, we will describe various queries and computational schemes commonly used with graphical models starting next by describing the general framework of variable elimination, which we will use to show one way to compute our example query.

        

    \subsection{Pseudo Tree} \label{sec:pseudo-tree}
        Given a variable ordering, a directed tree called a \semph{pseudo tree} $\T \! = \! (\mathbf{X,E})$ can be constructed relative to a graphical model $\mathcal{M}$.  Each node of the pseudo tree corresponds one-to-one with a variable in $\mathcal{M}$. As before, to simplify, we abuse notation by using the same symbols to refer to pseudo tree nodes as their corresponding variables in $\mathcal{M}$. The tree will be structured such that the nodes follow the provided variable ordering - namely, each node is a descendant of only nodes that come before it in the provided ordering - and such that any branching in the pseudo tree corresponds to existing conditional independences in $\M$ - namely, sibling branches are conditionally independent of each other given assignments to their ancestor variables in the pseudo tree. 

        \begin{wrapfigure}{r}{0.5\linewidth}
            \centering
            \vspace{-30pt}
            \includegraphics[scale=0.6]{./_attachments/images/example-pseudo-tree-threo.pdf}
            \caption{An example pseudo tree for the model described in  \seclink{sec:simple-example} based on ordering $T,H,R,E,O$.  Here the dummy root node is explicitly shown, however it is typically hidden for simplicity.}
            \label{fig:ex-pseudo-tree-threo}
        \end{wrapfigure}      
        
        A pseudo tree \PT can be constructed by the following steps:
        \begin{enumerate}
            \item   
                create a dummy root $X_{0}$
            \item
                add the first variable in the ordering $X_{i=1}$ as the child of the dummy root
            \item \label{pseudo-tree-loop-step}
                for the next variable in the ordering $X_{i+1}$:
                \begin{enumerate}[label=\roman*.]
                    \item \label{pseudo-tree-choose-parent-step}
                        choose an existing variable $X_{p}$
                        %, $p \in {0, ..., i}$ 
                        in the partially constructed pseudo tree $\T'$ such that, given assignments to $X_{p}$ and its ancestors, $X_{i+1}$ is conditionally independent of all existing descendants of $X_{p}$ and of existing descendants of its ancestors.
                    \item
                        add $X_{i+1}$ to $\T'$ as the child of $X_{p}$
                \end{enumerate}
            \item
                repeat \hyperref[pseudo-tree-loop-step]{step \ref*{pseudo-tree-loop-step}} until all variables in the ordering (and thus $\M$) have been added to the tree
        \end{enumerate}

        Note that edges in the primal graph of the model either exist in the pseudo tree as directed edges or would exist as back arcs, but never cross arcs.  
        
        \paragraph{Increasing Pseudo Tree Branching.}  In order to capture the maximal number of conditional independences given an ordering, \hyperref[pseudo-tree-loop-step]{step \ref*{pseudo-tree-loop-step}.\ref*{pseudo-tree-choose-parent-step}} can be altered to choose the earliest variable in the ordering that satisfies the said condition, thus leading to earlier branching in the tree.

        \paragraph{Pseudo Tree Uses.} One use of the pseudo tree is as a schematic of bucket elimination.  More specifically, message passing from [mini] bucket elimination with an elimination ordering that is the reverse of the pseudo tree's variable ordering $o$ will follow a path from the leaves to the root of $\T$.  \todo{EXAMPLE}.  Furthermore, the pseudo tree can act as a blueprint for constructing search space graphs of $\M$ as will be described in the next section.  In combination, given an ordering $o$ and corresponding search space and bucket elimination, the messages from the bucket elimination can act as heuristics guiding the search at each level.


    \subsection{Some Well-Known and Important Graphical Model Tasks} \label{sec:well-known-graphical-model-tasks}
        There are a plethora of queries that a graphical model can lend itself to answering.
        %, such as computing marginal or conditional probabilities of a subset of variables \bobak{REFERENCE} or determining causal relationships \bobak{REFERENCE}.  
        Here we will describe four traditionally important tasks in particular: determination of the
        \begin{itemize}
            \item   \Gls{gls:partition-function} \textbf{(Z)}
            \item   \Gls{gls:maximum-a-posteriori} \textbf{(MAP)}
            \item   \Gls{gls:most-probable-explanation} \textbf{(MPE)}
            \item   \Gls{gls:marginal-maximum-a-posteriori} \textbf{(MMAP)}
        \end{itemize}


        \subsubsection{Task Formalizations} \label{sec:task-formalizations}

            Definition \ref{def:pgm-common-tasks} below provides the formalization of these tasks respectively.

            \begin{definition}[Z, MAP, MPE, and MMAP]  Given a graphical model $\M \! = \! (\X,\D,\F),$ \hfill\\ \label{def:pgm-common-tasks}
            \begin{align}
                Z &= \sum_{\X} \prod_{\F} f(\xx); \label{def:z}\\
                MAP &= \max_{\X} \prod_{\F} f(\xx); \label{def:map}\\  
                MPE &= \argmax_{\X} \prod_{\F} f(\xx); \label{def:mpe}\\
                MMAP &= \max_{\Q \subset \X} \sum_{\SSS = \X \setminus \Q} \prod_{\F} f(\qq \cup \ssss) \label{def:mmap}
            \end{align}
            \end{definition} 

            The \gls{gls:partition-function}, \textbf{Z}, is a mathematical quantity that characterizes the distribution among a system's possible states.  It is often used as a normalization constant for computing probabilities.  \gls{abrv:MPE} is a full configuration that maximizes the value of the model defined as the product of all of its functions, and \gls{abrv:MAP} outputs that value.  From a probabilistic model standpoint, this corresponds to finding the assignment to the variables that is most likely under the model, and the corresponding likelihood value, respectively.  Given evidence (i.e.,\ a given assignment to a subset of the variables), the MPE (constrained to be consistent with the evidence) corresponds to finding the assignment to the rest of the variables that makes the evidence most likely to occur (thus the name ``most probable explanation'').  \Gls{abrv:MMAP} is similar to MAP with the exception that the model value is now defined with respect to the marginalization of a subset of the variables denoted as the ``sum'' or $\SSS$ variables, and so the maximization is with respect to the remaining set denoted as the ``query'' or $\Q$ variables.  Although not commonly referred to, and thus omitted here, there can also be a corresponding MMPE task.

        \subsubsection{Difficulty}
            Summation tasks such as computing the partition function require consideration of the entire state-space (exponential in the number of variables) to compute accurately and are generally \#P-hard.  Since summation operations commute freely, when variable elimination algorithms are used for these tasks (with the variable ordering corresponding to the order in which variables are summed over in Equation~\ref{def:z}), they can be used with any variable ordering.  Pure homogeneous optimization tasks such as MAP and MPE inference, whose solutions can be confirmed in polynomial time, are easier but still NP-hard to compute. Since optimization operators can freely commute with others of their own kind (e.g.,\ max operators can commute with each other, or min operators can commute with each other), variable elimination for these tasks can also make use of any variable ordering.  Mixed inference tasks, however, are often more difficult to compute as they involve operators that do not commute.  In the example of MMAP (Equation~\ref{def:mmap}), the summation operations must be computed before maximization, which restricts the variable orderings that can be used.  In practice, this \gls{gls:constrained-ordering} can lead to inference over graphs of much greater widths (see \seclink{sec:induced-width} for more details), and thus to tasks that are more difficult to compute.

            This hierarchy of difficulties is summarized in \figlink{fig:task-difficulties}.

            \begin{figure}[h]
                \centering
                \includegraphics[scale=0.4]{./_attachments/images/pgm-task-difficutlies.png}
                \caption{Hierarchy of difficulties for three classes of graphical model inference tasks.}
                \label{fig:task-difficulties}
            \end{figure}


    \section{Background: Fundamental Schemes}

        In order to be able to solve important and difficult tasks in the discrete graphical model framework, efficient algorithms are necessary.  Several foundational schemes serve as the backbone of a myriad of graphical model tasks.  Next we outline these schemes and their properties.

        \subsection{Search}
            We will start by describing fundamental search schemes used for solving tasks formulated as probabilistic graphical models.
            
            \subsubsection{OR Search Spaces}
                A graphical model can be cast into a search space in order to explore different configurations of the model.  \figlink{fig:ex-or-search-threo} shows a classical search space (also known as an ``OR'' search space) of the model described in \seclink{sec:simple-example} adhering to a search order that explores possible assignments to variable T, then H, then R, then E, then O.  (For simplicity, we abbreviate the domain values $low$ and $no$ as $0$, and $high$ and $yes$ as $1$).
    
                \begin{figure}[h]
                    \centering
                    \includegraphics[scale=0.37]{./_attachments/images/example-or-search-space-threo.pdf}
                    \captionsetup{width=.95\linewidth}
                    \caption{Example classical OR search space for the discrete graphical model described in \seclink{sec:simple-example} adhering to a search order that explores possible assignments to variable T, then H, then R, then E, then O.  For simplicity, we abbreviate domain values of $low$ or $no$ instead with the value $[0]$, and $high$ or $yes$ with $[1]$.}
                    \label{fig:ex-or-search-threo}
                \end{figure}
    
                As we follow a path down the tree, each successive level corresponds to an assignment to the next variable in the ordering.  Thus a path from the dummy root to a leaf corresponds to a \gls{gls:full-configuration}.  Given that the search tree was built for our model that is meant to capture a factorized global function (in this case a factorized probability distribution), the search tree is constructed so that the arc into a node $n$ associated with variable $X$ has a cost $c(n)$ equal to the product of functions $f_{\alpha} \in \F$ such that the path to $n$ fully instantiates all $X' \in \alpha$ and such that $X \in \alpha$ \citep{DBLP:journals/ai/DechterM07}.  In other words, $c(n)$ is equal to the product of functions $f_{\alpha} \in \F$ such that the variable represented by $n$ is in $f_{\alpha}$'s scope and that the path to $n$ captures an assignment to every other variable in its scope. (If no such functions exist, the arc is vacuously assigned a value of $1.00$.)
    
            \subsubsection{AND/OR Search Space}
                Often, assignments to earlier variables in the search ordering result in conditional independences between sub trees of later layers.  For example, given our model from \seclink{sec:simple-example}, conditioning on variable $T$ (i.e.,\ giving an assignment to $T$) causes $E$ and $O$ to become independent of $H$ and $R$.  In the OR search space we can see this phenomenon by noticing that the edge costs into, and the sub trees under, nodes of $E$ under the same assignment of $T$ but different assignments to $H$ and $R$ are duplicates.  (\figlink{fig:ex-conditional-independence-or-search-threo} shows this more explicitly).  
    
                \begin{figure}[h]
                    \centering
                    \includegraphics[scale=0.4]{./_attachments/images/example-conditional-independence-or-search-space-threo.pdf}
                    \captionsetup{width=.6\linewidth}
                    \caption{Conditional independence of $E$ and $O$ from $H$ and $R$ given assignment $T=0$ is shown in the search space from \figlink{fig:ex-or-search-threo}.  Notice that each distinct assignment to $H$ and $R$ leads to equivalent sub trees of $E$ and $O$ (each highlighted in a different color for easier comparison).}
                    \label{fig:ex-conditional-independence-or-search-threo}
                \end{figure}
                
                We can take advantage of such conditional independences to construct a more compact search space that we will call an AND/OR search space.  Since such conditional independences are inherently captured by pseudo trees (\seclink{sec:pseudo-tree}), we can use pseudo trees to guide the construction of AND/OR search spaces.
    
                \begin{figure}[h]
                    \centering
                    \includegraphics[scale=0.4]{./_attachments/images/example-and-or-search-space-threo.pdf}
                    \captionsetup{width=.8\linewidth}
                    \caption{Example of a more compact AND/OR search space for the discrete graphical model described in \seclink{sec:simple-example} guided by the pseudo tree from \seclink{sec:pseudo-tree}.  For simplicity, we abbreviate domain values of $low$ or $no$ instead with the value $[0]$, and $high$ or $yes$ with $[1]$.}
                    \label{fig:ex-and-or-search-threo}
                \end{figure}
    
                \figlink{fig:ex-and-or-search-threo} shows the AND/OR search space that results from using the pseudo tree from \figlink{fig:ex-pseudo-tree-threo} as a guide.  Within AND/OR search spaces, nodes corresponding to assignments to variables (these are AND nodes; the yellow rectangles) are directly associated with a parent node corresponding to their respective variable (OR nodes; the blue circles). Branching in the guiding pseudo tree capturing conditional independences is also seen as branching in the AND/OR search space.  In the example provided, we see a branching under $T$ in the guiding pseudo tree that captures the conditional independence of $H$ and $R$ from $E$ and $O$ given assignment to $T$.  In the corresponding AND/OR search space, under each assignment of $T$ (namely under each AND child node of $T$) we see a branching leading to distinct sub trees---one for $H$ and $R$, and one for $E$ and $O$. Through capturing such decomposition, the search space can be greatly compacted.
    
                It is important to note that a path from root to leaf in an AND/OR search space does not necessarily capture a full configuration.  For example, the path from root to leaf highlighted in \figlink{fig:ex-partial-config-and-or-search-threo} captures a partial configuration corresponding only to assignments $T=0,H=1,R=0$, omitting assignments to $E$ and $O$.
    
                \begin{figure}[t!]
                    \begin{subfigure}{\textwidth}
                        \centering
                        \includegraphics[scale=0.4]{./_attachments/images/example-partial-configuration-and-or-search-space-threo.pdf}
                        \captionsetup{width=0.65\linewidth}
                        \caption{Paths from root to leaves in AND/OR search spaces do not necessarily correspond to full configurations. In the example shown here, the highlighted path captures a partial configuration with assignments $T=0,H=1,R=0$, but omits assignments to $E$ and $O$.}
                        \label{fig:ex-partial-config-and-or-search-threo}
                    \end{subfigure}

                    \begin{subfigure}{\textwidth}
                        \centering
                        \includegraphics[scale=0.4]{./_attachments/images/example-full-configuration-and-or-search-space-threo.pdf}
                        \captionsetup{width=.65\linewidth}
                        \caption{To capture a full configuration in an AND/OR search space, we must capture \textit{all} variables that branch off from the paths being extended, leading to a subtree of the full search space that includes all variables of the model.}
                        \label{fig:ex-full-config-and-or-search-threo}
                    \end{subfigure}

                    \captionsetup{width=0.8\linewidth}
                    \caption{Figure (a) shows an example AND/OR search space where a path corresponds only to a partial configuration of the variables in the model.  Figure (b) shows how a full configuration can be captured.}
                    
                \end{figure}
    
                In contrast, note that a full configuration (such as the one captured in \figlink{fig:ex-full-config-and-or-search-threo} for $T=0,H=1,R=0$, $E=0$, $O=1$) consists of a \emph{sub tree} such that, wherever a variable branching occurs under an AND node along a path from the root towards the leaves, \emph{all} children OR nodes are included in the final subtree.  The cost of a full configuration in an AND/OR subtree can be computed by applying a related combination operation (often multiplication) to the cost of each arc traversed.
    
                Finally, we can see that OR search spaces can be thought of as an AND/OR search space guided by a pseudo tree with no branching variables (also known as a chain pseudo tree) with the search space omitting explicit OR nodes (which are instead implicitly captured by the various levels in the search space).  For example, the search space from \figlink{fig:ex-or-search-threo} can be explicitly represented as the AND/OR search space shown in \figlink{fig:ex-or-search-as-and-or-threo}.
    
                \begin{figure}[h]
                    \centering
                    \includegraphics[scale=0.4]{./_attachments/images/example-or-as-and-or-search-space-threo.pdf}
                    \captionsetup{width=.8\linewidth}
                    \caption{The OR search space from \figlink{fig:ex-or-search-threo} expressed explicitly as an AND/OR search space.}
                    \label{fig:ex-or-search-as-and-or-threo}
                \end{figure}


            \subsubsection{Search Space Notation}
                The symbol $n$ is used to generally represent search nodes.  (Depending on context, $n$ may represent either AND or OR nodes).  $n_{X}$ specifically refers to an AND node in an AND/OR search tree $T$ associated with variable $X$. $Y_{n_X}\!$ represents the specific OR node associated with variable $Y$ that is the child of $n_{X}$. The notation of OR nodes may seem counterintuitive at first as they resemble the notation for variables of a graphical model.  However, this is because OR nodes of AND/OR trees representing graphical models in fact do represent the variables of the model which explains the choice for its notation.  $path(n)$ is the configuration of the variables along the path from the root of $T$ to $n$ according to assignments corresponding to that path.  For example, in \figlink{fig:ex-partial-config-and-or-search-threo}, if we let $n$ be the leaf node of the highlighted path, $path(n) = \set{T=0, H=1, R=0}$. 
                $varpath(n)$ is the set of variables that $path(n)$ provides a configuration for.  
                %In our example, $varpath(n) = \set{A,C}$.  
                With this notation, we can express the cost of the arc to an AND node $n_{X}$ as:
                \begin{align}
                    c(n_{X}) = \prod_{f_{\alpha} | \alpha \subseteq varpath(n_{X}) \tn{ and } X \in \alpha} f_{\alpha}(path(n_{X}))
                \end{align}
                $g(n)$ is the cost of $path(n)$ according to the combination operation defined for the model.  For example, $g(n)$ for the same leaf node $n$ in \figlink{fig:ex-partial-config-and-or-search-threo} assuming a product combination operation would be $g(n) = (0.6)\cdot (0.25)\cdot (0.20)$. $ch(n)$ denotes the children of node $n$.  Note that for AND/OR nodes, children of AND nodes are OR nodes, and vice versa.  $anc(n)$ are all the ancestors of $n$.  In AND/OR trees, $br(n_{X})$ is the set of ancestor AND nodes $n_{Y}$ on the path to $n_{X}$ such that $Y$ is a branching variable in \PT.


            \subsubsection{Important Quantities in AND/OR Search Spaces}
                To use AND/OR search spaces effectively for solving tasks, there are several quantities that become important to compute and understand.  We will describe these next.

                \paragraph{$\bs{Z(n)}$.} \label{sec:partition-function-of-a-node}  
                    \hfill \\
                    As we saw in \seclink{sec:task-formalizations}, we use $Z$ to denote the partition function of a model---namely the sum of costs across all configurations of the model. Each configuration's cost is calculated as the product of the model's functions based on the assignments corresponding to that particular configuration.  In the context of AND/OR search, given infinite resources $Z$ could be computed by systematic search enumerating all full configurations and summing their costs (see \figlink{fig:ex-full-config-and-or-search-threo} for an example full configuration).

                    Applying a similar concept, we introduce the quantity $Z(n)$.  Semantically, $Z(n)$ represents the partition function of an imaginary model that could be represented by the subtree rooted at $n$. Thus $Z(n)$ is  equal to the sum of the costs of all partial configurations rooted at $n$ due to only functions that contribute to the arc values under $n$.  For an AND node $n_{X}$ with children OR nodes $Y_{n_{X}} \in ch(n_{X})$, $Z(n_{X})$ can be computed by
                    \begin{equation} \label{eq:and-or-z-prod}
                        Z(n_{X}) = \prod_{Y_{n_{X}} \in ch(n_{X})} Z(Y_{n_X})
                    \end{equation}
                    such that for OR nodes $Y_{n_{X}}$, $Z(Y_{n_X})$ is computed by
                    \begin{equation} \label{eq:and-or-z-sum}
                        Z(Y_{n_X}) = \sum_{n_Y \in ch(Y_{n_X})}  c(n_Y) \cdot Z(n_Y)
                    \end{equation}
                    with $Z(n_{X}) = 1$ vacuously, in the case it has no children.

                    \begin{figure}[h]
                        \centering
                        \includegraphics[scale=0.4]{./_attachments/images/example-Z-of-n-and-or-search-space-threo.pdf}
                        \captionsetup{width=.8\linewidth}
                        \caption{The subtree contributing to $Z(n_{T=0})$ is highlighted above.  Using \eqlink{eq:and-or-z-prod} and \eqlink{eq:and-or-z-sum}, $Z(n_{T=0}) = 1.0$.}
                        \label{fig:ex-z-of-n-in-and-or-threo}
                    \end{figure}
                    
                    Note that given $n_{\varnothing}$ as the dummy root node of AND/OR tree $T$, $Z(n_{\varnothing}) = Z$ of the underlying model \M. We denote estimation of $Z(n)$ as $\hat{Z}(n)$.  Heuristic estimates of $Z(n)$ are denoted as $h(n)$.

        
                \paragraph{$\bs{R(n)}$.} \label{sec:ancestor-branching-mass}
                     On the path from the root of an AND/OR tree $T$ to some node $n_{X}$, there may be an intermediate node $n_{Y}$ associated with branching variable $Y$ in the guiding pseudo tree \PT. (For example in \figlink{fig:ancestor-branching-mass}, on the path to the highlighted node $n_{A=0,C=1}$, node $n_{A=0}$ is traversed where $A$ is a branching variable in \figlink{fig:psuedo-tree-with-ancestor-branching-mass}).  When this happens, the remaining variables of the model are split between different branches.  (For example in the same \figlink{fig:ancestor-branching-mass}, notice the left branch under the node $n_{A=0}$ contains variable $B$ but not $C$ or $D$ and that the right branch contains $C$ and $D$ but not $B$).  Thus, the $Z(n)$ of any node down one of the branches will necessarily miss the cost from the configurations of the variables included in the other branch(es). $R(n_{X})$, or the \textit{ancestor branching mass}, captures the $Z(n)$ for all variables that had branched off of the path to $n_{X}$. (For example in the same \figlink{fig:ancestor-branching-mass}, the green box shows the portion corresponding to the $R(n_{A=0,C=1})=Z(B_{n_{A=0}})$).
                     %(That same boxed portion would also be the ancestor branching mass for the sibling node of the red node, and also for any of their children).
        
                     More formally, let $br(n_{X})$ be the set of ancestor nodes $n_{Y}$ on the path to $n_{X}$ such that $Y$ is a branching variable in \PT. Let $W_{n_{Y}}$ be the child OR node of $n_{Y}$ that is also on the path to $n_{X}$.  We define $R(n_{X})$ as: 
                     %$R(n_{X}) =   \prod_{n_{Y} \in br(n_{X})} \frac{Z(n_{Y})}{ Z(W_{n_{Y}})}$.
                     \begin{align}
                         \label{eq4}
                         R(n_{X}) =   \prod_{n_{Y} \in br(n_{X})} \frac{Z(n_{Y})}{ Z(W_{n_{Y}})}
                     \end{align}
                     We denote approximations to $R(n)$ as $r(n)$.
        
                    % \begin{wrapfigure}{r}{0.65\linewidth}
                    \begin{figure}[t!]
                    	\centering
                    	\begin{subfigure}{0.35\linewidth}
                    	\centering
                    	       \includegraphics[width=0.75\linewidth]{UAI-24/_attachments/images/pseudotree.png}
                                \caption{}
                                \label{fig:primal-graph-and-pseudo-tree}
                    	\end{subfigure}
                            \begin{subfigure}{0.95\linewidth}
                    	\centering
                                \includegraphics[width=0.5\linewidth]{UAI-24/_attachments/images/AncestorBranchingMass_withArcCosts.pdf}
                                \caption{}
                                \label{fig:ancestor-branching-mass}
                            \end{subfigure}
                            \captionsetup{width=.8\linewidth}
                    	\caption{A full AND/OR tree representing 16 possible solutions guided by the pseudo tree shown above. Boxed in green is the ancestor branching subtree for the path $\rightarrow \!\! (A \!\! = \!\! 0) \!\! \rightarrow  \!\! (C \!\! = \!\! 1)$. \vspace{-4pt}}
                                \label{fig:psuedo-tree-with-ancestor-branching-mass}
                    % \end{wrapfigure}
                    \end{figure}

                    
                \paragraph{$\bs{Q(n)}$.} \label{sec:q-of-a-node}
                    We can now concisely define a quantity $Q(n)$ as the contribution to $Z$ from all full configurations consistent with $path(n)$. In other words, $Q(n)$ is the likelihood of the configuration $path(n)$ based on the distribution defined by \M, with $P(path(n)) = \frac{Q(n)}{Z}$.  $Q(n)$ can be computed simply as: 
                    %Q(n) = g(n)  \! \cdot \!  R(n)  \! \cdot \!  Z(n)$.
                    \begin{align}
                        Q(n) = g(n)  \! \cdot \!  R(n)  \! \cdot \!  Z(n)
                    \end{align}
                        
                     \textbf{Example.} In \figlink{fig:ancestor-branching-mass}, consider the path from the root to the red node $n_{A= 0,C=1}$. Following $n_{A=0}$ to our node, we see OR node $B_{n_{A=0}}$ that branches off of the path.
                     So, 
                     %$Q(n_{A=0,C=1}) = g(n_{A=0,C=1}) \! \cdot \! R(n_{A=0,C=1}) \! \cdot \! Z(n_{A=0,C=1}) = g(n_{A=0,C=1}) \mul Z(n_{A=0,B}) \! \cdot \! Z(n_{A=0,C=1})$.
                     % \begin{alignat}{3}
                     % \begin{split}
                     %    Q(n_{A=0,C=1}) &= g(n_{A=0,C=1}) \! \cdot \! R(n_{A=0,C=1}) \! \cdot \! Z(n_{A=0,C=1}) \\
                     %    &= g(n_{A=0,C=1}) \mul \;\; Z(B_{n_{A=0}})\;\; \! \cdot \! Z(n_{A=0,C=1}) \\
                     %    &= \;\;\; (10 \mul 5) \;\;\; \mul \;\; ( 1 \mul 1 + 4 \mul 1)\;\; \! \mul \! ( 2 \mul 1 + 3 \mul 1) 
                     % \end{split}
                     % \end{alignat}
                     \begin{alignat*}{7}
                        Q(n_{A=0,C=1}) &= g(n_{A=0,C=1}) &\;\mul{}\;& R(n_{A=0,C=1}) &\;\mul{}\;& Z(n_{A=0,C=1}) \\
                        &= g(n_{A=0,C=1}) &\;\mul{}\;&  Z(B_{n_{A=0}}) &\;\mul{}\;& Z(n_{A=0,C=1}) \\
                        &= (10 \mul 5) &\;\mul{}\;& ( 1 \mul 1 + 4 \mul 1) &\;\mul{}\;& ( 2 \mul 1 + 3 \mul 1) 
                     \end{alignat*}

                

            % \subsubsection{Solving Tasks Using AND/OR Search}
            %     Search allows us to enumerate configurations of a model, which can in turn be used to solve useful tasks.  Next we will describe how AND/OR search can be used to solve useful tasks.

            %     \paragraph{Solving the Partition Function.}
            %         As described in \seclink{sec:task-formalizations}, the partition function is defined as:
            %         $Z = \sum_{\X} \prod_{\F} f(\xx)$.
            %         In the context of classical OR search, this would correspond to enumerating and summing the cost of all paths from root to leaves, thus capturing the cost of all full configurations.  However, when using AND/OR search guided by a pseudo tree with branching variables, paths in the tree only correspond to partial configurations.
            
        \subsection{Inference}
            Next we will outline key inference schemes used with probabilistic graphical models relevant to this work.
        
            \subsubsection{Variable Elimination}
                Many probabilistic graphical model queries can be solved by an inference framework known as \gls{gls:variable-elimination} (\textbf{VE}). Variable elimination involves an ordered computational processing of the variables of a model, at each step removing a processed variable from subsequent computations (thus called an \textit{elimination} step).  Each elimination step corresponds to a step of  inference, transferring the effects of the eliminated variables over to the remaining variables (in practice, done by creating a newly inferred function over the remaining variables).
    
                As an example, consider the query to find the mode of the distribution defined by our simple example above (\seclink{sec:simple-example}).  Formalizing this query, we want to solve the task:
                \begin{align}
                    \max_{T,H,E,R,O} p(t,h,e,r,o)
                \end{align}
                which, based on \eqlink{eq:simple-example-joint-as-prod-of-funcs}, in terms of our model is equivalent to
                \begin{align}
                    \max_{T,H,E,R,O} p(t,h,e,r,o) 
                        = \max_{T,H,E,R,O} \textcolor{blue}{f_{T}(t) \cdot f_{H}(h)  \cdot f_{E}(e) \cdot f_{T,H,R}(t,h,r)\cdot f_{T,E,O}(t,e,o)}
                \end{align}
                (The blue coloring is simply to help keep note of where the functions are in the expression).
                
                Using variable elimination to solve this query, we would first need an \gls{gls:elimination-order}---an order in which to process and eliminate variables while performing inference.  Suppose an elimination order $o_{elim} = [R,O,E,H,T]$.  Then, given this ordering, we express our query as
                \begin{align}
                    \max_{T}(\; \max_{H}(\; \max_{E}(\; \max_{O}(\; \max_{R}(\; \textcolor{blue}{f_{T}(t) \cdot f_{H}(h)  \cdot f_{E}(e) \cdot f_{T,H,R}(t,h,r)\cdot f_{T,E,O}(t,e,o)} \;)\;)\;)\;)\;)
                \end{align}
                where the query can then be solved inside-to-out, variable-by-variable, via computations indicated by the parentheses.  The result from each step can be interpreted as the inference performed over its corresponding variable.
    
                One power of variable elimination is its ability to simplify computation leveraging mathematical properties of the query.  Note that in our example some of the model's functions are not dependent on the variable being immediately maximized over and so can be factored out of the respective maximization.  Doing so recursively, we can rewrite our query with the same ordering instead as
                \begin{align}
                    \max_{T}(\; \textcolor{blue}{f_{T}(t)} \cdot 
                        \max_{E}(\; \textcolor{blue}{f_{E}(e)} \cdot 
                            \max_{O}(\; \textcolor{blue}{f_{T,E,O}(t,e,o)}
                            \;)\;) \cdot
                        \max_{H}(\; \textcolor{blue}{f_{H}(h)} \cdot 
                            \max_{R}(\; \textcolor{blue}{f_{T,H,R}(t,h,r)}
                    \;)\;)\;)
                \end{align}
                This decomposition reduces the size of the terms being maximized over, thus reducing complexity of the computations.
    
            \subsubsection{Bucket Elimination}
                \Gls{gls:bucket-elimination} \citep{dechter99}, or \textbf{BE}, is a variable elimination scheme that can be adapted for a myriad of graphical model tasks including those described in \seclink{sec:well-known-graphical-model-tasks}. 
        
                Bucket elimination is a message passing scheme that performs variable elimination according to a given elimination order by processing data structures called \glspl{gls:bucket} one-by-one, each bucket corresponding to a variable in the ordering. 
                When reaching a variable $X_{i}$ in the ordering, all unprocessed functions that contain $X_{i}$ in their scope are placed in  bucket $B_{i}$ (this includes the model's original functions as well as messages generated during the bucket elimination process).  As shown in \eqlink{eq:general-bucket-function-generation}, the bucket is then processed by applying an elimination operation (generalized as $\bigoplus$) over $X_{i}$ to the combination of the bucket functions (generalized as $\bigotimes$) resulting in a \gls{gls:bucket-message} - a new function over the remaining variables present in the scope of the processed functions - denoted $\lambda_{i \rightarrow j}$, or $\lambda_{i}$ for short. 
                \begin{align} 
                    \label{eq:general-bucket-function-generation}
                    \lambda_{i \rightarrow j} = \bigoplus_{X_{i}} \bigotimes_{f_{\alpha} \in B_{i}} f_{\alpha}(\alpha)
                \end{align}
                In the context of computing the \gls{gls:partition-function}, this corresponds to marginalizing $X_{i}$ from the product of the functions
                \begin{align}
                    \label{eq:Z-bucket-function-generation}
                    \lambda_{i \rightarrow j} = \sum_{X_{i}} \prod_{f_{\alpha} \in B_{i}} f_{\alpha}(\alpha)
                \end{align}
                or in the context of computing the \gls{abrv:MAP}, maximizing the product of the functions over $X_{i}$
                        \begin{align}
                    \label{eq:MAP-bucket-function-generation}
                    \lambda_{i \rightarrow j} = \max_{X_{i}} \prod_{f_{\alpha} \in B_{i}} f_{\alpha}(\alpha)
                \end{align}
                
                The $i$ in $\lambda_{i \rightarrow j}$ refers to the bucket that generated the message. $j$ indicates the bucket this message will be sent to; namely the next variable in the elimination ordering that is also found in the scope of the message.
                
                The processed buckets and messages can then be used to compute the result of the underlying query (e.g., in the case of computing the partition function or MAP, the result is simply the combination of the finally remaining messages after processing of the last bucket). \figlink{fig:bucket-elimination-example} shows a schematic of bucket elimination on a graphical model with variables indexed from $A$ to $G$ and with a unary function with respect to variable $A$ and pair-wise functions over the pairs of variables connected by an edge in the underlying primal graph (\figlink{fig:bucket-elimination-example:primal-graph}), namely: 
                $F =
                    \{
                        f_{A}(A)$, $f_{A,B}(A,B)$, $f_{A,D}(A,D)$, $f_{A,G}(A,G)$,  $f_{B,C}(B,C)$, $f_{B,D}(B,D)$, $f_{B,E}(B,E)$, $f_{B,F}(B,F)$, $f_{C,D}(C,D)$, $f_{C,E}(C,E)$, $f_{F,G}(F,G) 
                    \}$.
        
                \begin{figure}[t!]
                        \centering
                        \begin{subfigure}[b]{0.3\textwidth}
                            \centering
                            \includegraphics[scale=0.7]{./_attachments/images/DBE/primalgraph-2}
                            \caption{
                                Example primal graph of a graphical model with 7 variables and model functions 
                                % $F = \{
                                %     f(A)$, $f(A,B)$, $f(A,D)$, $f(A,G)$,  $f(B,C)$, $f(B,D)$, $f(B,E)$, $f(B,F)$, $f(C,D)$, $f(C,E)$, $f(F,G)
                                % \}$.
                                $F =
                                    \{
                                        f_{A}(A)$, $f_{A,B}(A,B)$, $f_{A,D}(A,D)$, $f_{A,G}(A,G)$,  $f_{B,C}(B,C)$, $f_{B,D}(B,D)$, $f_{B,E}(B,E)$, $f_{B,F}(B,F)$, $f_{C,D}(C,D)$, $f_{C,E}(C,E)$, $f_{F,G}(F,G) 
                                    \}$.
                            }
                            \label{fig:bucket-elimination-example:primal-graph}
                        \end{subfigure}
                        \hspace{18pt}
                        \begin{subfigure}[b]{0.6\textwidth}   
                            \centering 
                            \includegraphics[scale=0.35]{./_attachments/images/DBE_bucket_elimination_schematic.pdf}
                            \caption{Bucket elimination schematic following an elimination order $o_{elim} = [D,E,G,C,F,B,A]$.} 
                            \label{fig:bucket-elimination-example:schematic}
                        \end{subfigure}
                        \caption[ ]
                        {\small (a) A primal graph
                            of a graphical model with 7 variables.  
                            % (b) Illustration of bucket elimination with an ordering G F D E C B A.
                            (b) Illustration of \emph{BE} with an ordering A B C E D F G.
                            % \yasaman{Can you please check if the red notes are correct?}\sakshi{node C should send message $\lambda_{C \rightarrow B}(A,B)$ instead of $\lambda_{C \rightarrow B}(B)$}
                            } 
                        \label{fig:bucket-elimination-example}
                    \end{figure}
                
                Bucket elimination can be viewed as a 1-iteration message-passing algorithm along its \gls{gls:bucket-tree} (\figlink{fig:bucket-elimination-example:schematic}). The nodes of the tree are the different buckets. Each bucket of a variable contains a set of the model's functions depending on the given order of processing. There is an arc from bucket
                % \sakshi{YAsaman, to be consistent with notation : $B_{p}$ to a parent bucket $B_{a}$?} 
                $B_{X}$ to bucket $B_{Y}$, if the function created at bucket $B_{X}$ is placed in bucket $B_{Y}$.
                
                In summary, bucket elimination uses the variable elimination paradigm and dynamic programming to break a computational task into smaller subproblems, computing the result by processing buckets and sending resulting messages according to a provided elimination order.
                
                \paragraph{Complexity.}
                    Both the time and space complexity of bucket elimination are exponential in the \gls{gls:induced-width} of the model, which can be computed as a graph parameter based on the provided ordering and the underlying primal graph \citep{dechter-book-2ndEd}.  (More on the induced width in \seclink{sec:induced-width}).  In the context of bucket messages, the induced width is equal to the cardinality of the scope of the bucket message with the largest scope.  Given that its complexity is exponential in the induced width, bucket elimination becomes impractical if the induced width is large, and thus approximation schemes have been developed to address this \citep{dechter99c,liu11-bounding-partition-function}.
    
    
            \subsubsection{Induced Width (w*)} \label{sec:induced-width}
                The difficulty of answering a query using variable elimination can depend heavily on the elimination order being used, with some elimination orderings leading to efficient factorization, whereas others may not and instead result in large computations.  We can capture this complexity graphically.
                
                As elimination computations are performed variable-by-variable pursuant to the ordering provided, the corresponding solutions (which themselves may be a function over some remaining variables) can be viewed as inducing new edges onto the underlying primal graph in the same way the  model's native functions did originally (see \seclink{sec:primal-graph} for details).  These new edges correspond to newly inferred relationships between variables not directly connected in the original graph. When adding all of the newly induced edges to the primal graph (namely the new edges resulting from generated messages from the processing of all the variables), we end up with a new graph called the \semph{induced primal graph}, or \semph{induced graph} for short.  The complexity of exact variable elimination algorithms is with respect to the tree-width of the resulting induced graph---namely with respect to a quantity known as the \semph{induced width} (or \semph{w*}), which is one less than the largest clique-size of the induced graph.
    
                \begin{definition}[Induced Width (w*)]
                    The induced width of a graphical model $\M$ with respect to elimination order $o_{elim}$ with induced primal graph $\G'$ is
                    \begin{align}
                        w^{*} = \max_{c \in clq(\G')} |c|-1,
                    \end{align}
                    where $clq(\G')$ is the set of all cliques in $\G'$ and $|c|$ denotes the clique-size of clique $c$.
                \end{definition} 


        

 % A {\bf primal graph} $\mathcal{G} \! = \! (\mathbf{X,E})$ of a graphical model $\mathcal{M}$ associates each variable of $\M$ with a corresponding node in a one-to-one fashion such that arcs $e \! \in \! \mathbf{E}$ connect nodes whose variables appear in the scope of the same local function. To simplify, we abuse notation by using the same name to refer to the primal graph node and corresponding variable of $\mathcal{M}$.  










%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    
    % \subsection{AND/OR Search Space for Mixed Inference} 
    %     A pseudo tree of a graphical model can be used to construct a corresponding directed weighted state space graph that can be used for search. In the next sections, we will describe two types of such graphs.

    %     \subsubsection{Weighted OR Search Spaces}
    %         In the traditional \semph{OR} search space, which is constructed relative to a pseudo tree that is a chain, paths from the dummy root to the leaves represent full configurations of the model (assignments to each variables).  As such, nodes in the path at each depth represent an assignment to the corresponding variable in the pseudo tree.  Thus, each horizontal level of the search space corresponds to the respective variable in the pseudo tree, with the nodes across that level in the OR graph representing assignments to its corresponding pseudo tree variable conditioned on assignments to previous variables (each combination captured by a different paths down the graph).

    %         In the simplest case, given a pseudo tree $\T$ an OR search space can be constructed as a tree in the following way:
    %         \begin{enumerate}
    %             \item
    %                 add a dummy root node (corresponding to the dummy assignment of the dummy variable rooting $\T$)
    %             \item
    %                 for each next variable $X_{i+1}$ in $\T$:
    %             \begin{enumerate}[label=\roman*.]
    %                 \item
    %                     for each node $n$ in level $i$ of the partially constructed OR tree $T'$: 
    %                     \begin{enumerate}[label=\alph*)]
    %                         \item
    %                             add a node $n'$ for each possible assignment of variable $X_{i+1}$ in the next level $i+1$ of $T'$
    %                         \item
    %                             add an edge $n \rightarrow n'$
    %                     \end{enumerate}
    %             \end{enumerate}
    %         \end{enumerate}
    
    %         For a model $\MDef$ representing a global function $\mathcal{F} = \bigoplus_{f \in \F} f$, edge weights in its corresponding OR graph can be assigned in the following way:
    %         \begin{enumerate}
    %             \item
    %                 the weight of each edge from the dummy root to a node corresponding to a possible assignment of the first variable $X_{i=1}$ is 1.0
    %             \item
    %                 for each
    %         \end{enumerate}
    %         An OR search tree for the model described in \seclink{sec:simple-example} is shown in Figure \todo{Make figure} using ordering \todo{$o = []$}.

        
    %     A more compact AND/OR search space can also be constructed by capturing conditional independencies in the model,
    %     thus facilitating more effective algorithms~\cite{DBLP:journals/ai/DechterM07}. An AND/OR search space is defined relative to a \emph{pseudo tree} of a primal graph, which can capture conditional independencies. A \textbf{pseudo tree} $\mathcal{T} \! = \! (\mathbf{V,E'})$ of a primal graph $\mathcal{G} \! = \! (\mathbf{V,E})$ is a directed rooted tree that spans $\mathcal{G}$ such that every arc of $\mathcal{G}$ not in $\mathbf{E'}$ is a back-arc in $\mathcal{T}$ connecting a node to one of its ancestors (\figlink{fig:graph-to-tree}(a),(b)).
    %     %A variable is a {\bf branching variable} if it has multiple children in $\mathcal{T}$.
    %     %The arcs in $E'$ may not all be included in $E$ . 
    %     For mixed inference problems where a subset of variables is to be maximized (MAP variables) and the remaining variables (SUM variables) are marginalized, the pseudo tree must be constrained such that the MAP variables precede the SUM variables in the variable ordering~\cite{lee16-exact-to-anytime-MMAP, marinescu18-jair-AO-search-mmap}.
        
        
    %     \begin{figure}[!htb]
    %         \centering
    %         \includegraphics[scale=0.25]{./_attachments/images/graph-to-tree.png}
    %         \caption{A full AND/OR tree representing all 16 solutions.}
    %         \label{graph-to-tree}
    %         \label{fig:graph-to-tree}
    %     \end{figure}
        
    % %     \begin{figure*}[!htb]
    % %         \centering
    % % 		\includegraphics[scale=0.25]{./_attachments/images/AncestorBranchingMass.pdf}
    % % 		\label{fig:ancestor-branching-mass}
    % % 		\caption{Boxed in green is the ancestor branching subtree for the path $\rightarrow \!\! (B \!\! = \!\! 0) \!\! \rightarrow  \!\! (C \!\! = \!\! 1)$.}
    % %     \end{figure*}

    
    %     Given a
    %     pseudo tree $\mathcal{T}$ of a primal graph $\mathcal{G}$, the AND/OR search tree
    %     $T_{\mathcal{T}}$ guided by $\mathcal{T}$ has alternating levels of OR nodes
    %     corresponding to variables, and AND nodes corresponding to
    %     assignments from their domains, with edge costs extracted from
    %     the original functions~\cite{DBLP:journals/ai/DechterM07}. %(By this logic, we can think of the nodes of an OR tree as AND nodes).
    %     %Let $n$ be an AND node in $T_{\tau}$, also denoted $n_X$ if $X$ is the last variable of its partial configuration.
    %     Each arc into an AND node $n$ %(or the arc from its OR parent to the AND node)
    %     has a cost $c(n)$ defined to be the product of all factors $f_{\alpha}$ in $\mathcal{M}$ that are instantiated at $n$ but not before.
    %     % \textcolor{red}{Moved to section "Value of A Node": (see \figlink{fig-simple}(c)).}
        
    %         A \textbf{solution tree} is a subtree %$\soltree$
    %         of $T_{\mathcal{T}}$ satisfying: (1) it contains the root of
    %         $T_{\mathcal{T}}$; (2) if an OR node is in the solution tree,
    %      exactly one of its AND child nodes is in the solution tree; (3) if an AND node is in the solution tree, then all of its OR children are in the solution tree.
    %      %Finally, its leaves are leaves of $T_{\mathcal{T}}$ 
    %      \cite{DBLP:journals/ai/DechterM07}. 
         
         
         
         

    
    % \clearpage
    % \section{Background: Computational Protein Design}
    %     \subsection{Overview}
    %         \bobak{TODO}
    %     \subsection{Re-Design}
    %         \bobak{TODO}
    %     \subsection{Optimizing Affinity}
    %         \bobak{TODO}
    %         \subsubsection{GMEC Optimization}
    %             \bobak{TODO}
    %         \subsubsection{\Kstar{} Optimization}
    %             \bobak{TODO}

        
\clearpage

    \bibliography{ref}




\end{document}