\documentclass{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} typesets its two mandatory arguments in reverse order
% around the optional separator (default "-"), i.e. "b sep a".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Instructions for Authors: Title in Title Case}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
% \author[1]{Harry~Q.~Bovik}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
% \affil[1]{%
%     Computer Science Dept.\\
%     Cranberry University\\
%     Pittsburgh, Pennsylvania, USA
% }
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
\author{\href{mailto:<pezeshkb@uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Bobak Pezeshki}{}}
\author{\href{mailto:<kkask@uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Kalev Kask}{}}
\author{\href{mailto:<ihler@ics.uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Alexander Ihler}{}}
\author{\href{mailto:<dechter@ics.uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Rina Dechter}{}}
% Add affiliations after the authors
\affil[1]{%
    University of California, Irvine
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM PACKAGES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{caption}
\usepackage{subcaption}
\usepackage{float}
\usepackage{xspace} % package being used for \newcommand to remove extra space
                    %     when a command is invoked without an argument list
\usepackage{textcase}
\usepackage[toc, nopostdot]{glossaries}
% \usepackage{amsmath}
\usepackage{amsthm, amssymb}
\usepackage{mathtools}
\usepackage{enumitem}
\usepackage{refcount}
\usepackage[leftmargin=6pt, vskip=3pt-\parskip]{quoting}
\usepackage[titlenumbered,ruled, linesnumbered]{algorithm2e}
\usepackage{mathrsfs} %for \mathscr
\usepackage[font=smaller,labelfont=bf]{caption}
% \usepackage[font=small,labelfont=bf]{subcaption}
% \usepackage[labelfont=bf]{caption}
% \usepackage[labelfont=bf]{subcaption}
\usepackage{xcolor}
    \definecolor{darkgreen}{rgb}{0.0, 0.2, 0.13}
    \definecolor{cadmiumgreen}{rgb}{0.0, 0.42, 0.24}
    \definecolor{byzantium}{rgb}{0.44, 0.16, 0.39}
    \definecolor{darkelectricblue}{rgb}{0.33, 0.41, 0.47}
    \definecolor{battleshipgrey}{rgb}{0.52, 0.52, 0.51}
    \definecolor{warmblack}{rgb}{0.0, 0.26, 0.26}
\usepackage{newfloat}
\usepackage{chngcntr}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM COMMANDS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Create a new float environment called plotfigure with its own counter;
% its captions are labelled "Plot".
\DeclareFloatingEnvironment[name=Plot]{plotfigure} 

% Create a new float environment called tablefigure; its captions are
% labelled "Table".
\DeclareFloatingEnvironment[name=Table]{tablefigure} 

% Make tablefigure share the counter of the standard table float, so that
% both are numbered in a single sequence.
\makeatletter\let\c@tablefigure\c@table\makeatother 

% Give tablefigure the same float type as table, so that table and
% tablefigure floats are treated as one set of floats (and therefore placed
% in the order in which they appear in the document).
\makeatletter\let\ftype@tablefigure\ftype@table\makeatother 

% Upright "argmin"/"argmax" operators. The starred form of
% \DeclareMathOperator places subscripts underneath the operator in display
% style (like \lim).
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
% \mhyphen: character code "2D (the hyphen) as a math character, for use in
% hyphenated identifiers inside math mode.
\mathchardef\mhyphen="2D % Define a "math hyphen"

% algorithm2e configuration.
% \SetCommentSty selects, by name, the macro used to typeset algorithm
% comments. The local definition of \commentstyle below is commented out.
% NOTE(review): presumably \commentstyle is provided by cmds.tex (\input
% further down in this preamble) -- confirm, otherwise restore this line.
% \newcommand\commentstyle[1]{\textcolor{cadmiumgreen}{#1}}
\SetCommentSty{commentstyle}
% Define the \Input{...} and \Output{...} header fields of an algorithm,
% printed with the labels "input" and "output".
\SetKwInOut{Input}{input}
\SetKwInOut{Output}{output}

% Theorem style "break": the heading is followed by a line break, so the
% statement itself starts on a new line.
\newtheoremstyle{break}
  {\topsep}{\topsep}% space above / space below the environment
  {\itshape}{}% body font (italic) / indentation (none)
  {\bfseries}{}% heading font (bold) / punctuation after the heading (none)
  {\newline}{}% line break after the heading / default heading specification
\theoremstyle{break}
% Theorem, corollary, lemma and proposition share a single counter that is
% numbered within sections.
% \newtheorem{theorem}{Theorem}[subsubsection]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
% Definitions use their own counter, also numbered within sections.
% \newtheorem{definition}{Definition}[subsubsection]
\newtheorem{definition}{Definition}[section]

% Load the project's macro definitions from cmds.tex (not part of this file).
\input{cmds}
% Typeset the link text of glossary entries in bold.
\renewcommand*{\glstextformat}{\textbf}

% Redefine \quote (the opening of the quote environment) as a list whose
% right margin equals its left margin and which adds no \topsep spacing.
\renewcommand{\quote}{\list{}{\rightmargin=\leftmargin\topsep=0pt}\item\relax}







%%% for supplemental

% Allow itemize lists to nest up to nine levels deep: level 1 is labelled
% with a bullet, levels 2-9 with a centered dot.
% (enumitem is also loaded earlier in this preamble; this second load passes
% no options.)
% NOTE(review): the enumitem documentation's examples issue \renewlist before
% the per-level \setlist calls -- confirm that the labels set below still
% apply after the \renewlist at the end of this block.
\usepackage{enumitem}
    \setlistdepth{9}
    \setlist[itemize,1]{label=$\bullet$}
    \setlist[itemize,2]{label=$\cdot$}
    \setlist[itemize,3]{label=$\cdot$}
    \setlist[itemize,4]{label=$\cdot$}
    \setlist[itemize,5]{label=$\cdot$}
    \setlist[itemize,6]{label=$\cdot$}
    \setlist[itemize,7]{label=$\cdot$}
    \setlist[itemize,8]{label=$\cdot$}
    \setlist[itemize,9]{label=$\cdot$}
    \renewlist{itemize}{itemize}{9}






%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Number sectioning units down to depth 3 and list the same depth in the
% table of contents (depth 3 is \subsubsection in the standard classes --
% presumably also in uai2024; confirm against the class file).
\setcounter{secnumdepth}{3} % lower this value to number fewer heading levels
\setcounter{tocdepth}{3}

\title{Abstraction Sampling with Heuristic-Based, HR-Based,\\ and Proposal-Based Abstraction Functions}


\input{gls}


\begin{document}
    % \onecolumn
    \setlength{\abovedisplayskip}{3pt}
    \setlength{\belowdisplayskip}{3pt}

    \maketitle
    
    \begin{abstract}
        \vspace{-12pt}
        Monte Carlo methods are powerful tools for solving problems involving complex probability distributions. Despite their versatility, these methods often suffer from computational inefficiencies, especially when dealing with rare events. As such, importance sampling emerged as a prominent technique for alleviating these challenges. Recently, a new scheme called Abstraction Sampling was developed that incorporates stratification into importance sampling over graphical models, helping to improve estimates further. However, existing work on Abstraction Sampling has explored only a limited set of abstraction functions guiding the stratification. This study expands this set by introducing three new classes of abstraction functions combined with seven distinct partitioning schemes, resulting in twenty-one new abstraction functions. These are motivated by theory and intuition from both the search and sampling domains. An extensive empirical analysis on over 400 benchmarks compares these new abstraction functions, highlighting several well-performing candidates.
    \end{abstract}

    % \vfill\eject
    % \tableofcontents
    
    % \clearpage
    \section{Introduction} \label{sec:introduction}

        The partition function ($Z$) is an important quantity in probabilistic graphical model inference, and is often estimated using Monte Carlo methods such as Importance Sampling (IS) \citep{Rubinstein_2007,liu2015probabilistic,DBLP:journals/ai/GogateD11}. Recently a framework called Abstraction Sampling (AS) \citep{DBLP:conf/uai/BrokaDIK18} was introduced, inspired by prior works \citep{knuth75,Chen92}, that extends IS by enabling samples to represent multiple configurations.  AS uses concepts from Stratified Sampling \citep{Rubinstein_2007,rizzo_2007} and compact search  \citep{DBLP:journals/ai/DechterM07} to build a sampled subtree called a probe.  The construction progresses level-by-level according to a variable ordering where, at each level, an \textit{abstraction function} groups nodes into \textit{abstract states} and then a representative node from each group is picked and reweighted 
        %(according to a proposal distribution) 
        to extend the paths in the probe.
        
        Using what are referred to as context-based abstraction functions, \citet{DBLP:conf/uai/BrokaDIK18} showed competitive performance by AS against IS, Weighted Mini-Bucket IS (wMBIS) \citep{liu2015probabilistic}, and IJGP-SampleSearch (IJGP-ss) \citep{DBLP:journals/ai/GogateD11}. \citet{kask20-scaling-up-as} further extended AS scalability introducing AS algorithm AOAS that more efficiently applied AS to AND/OR search spaces, and showed its superior performance using the same context-based abstraction functions against previous versions of abstraction sampling (and thus implicitly also against IS, wMBIS, and IJGP-ss) and also against state-of-the-art scheme Dynamic Importance Sampling \citep{lou2019interleave}.
        
        However, a shortcoming in the development of abstraction sampling has been the lack of more versatile and effective abstraction functions.  \citet{hsiao23-gnn-dynamic-as} approached this challenge by using graph neural networks for learning abstraction functions.  However, such methodology has the drawback of requiring learning on problems before use.  In this work we present:
                
        \begin{itemize}
        
            \item
                A new Value-Based abstraction framework for grouping nodes according to values on a positive scale
                
            \item
                Twenty-one value-based abstraction functions constructed by combining:
                
                \begin{itemize}
                
                    \item
                        Three distinct abstraction value functions, each inspired by paradigms from search and sampling.
                        
                    \item
                        Seven schemes for partitioning nodes according to their assigned values.
                        
                \end{itemize}
                        
            \item
                An extensive empirical evaluation on over 400 problems comparing the above-mentioned schemes against: each other, the previously vetted relCB and randCB abstraction functions \cite{DBLP:conf/uai/BrokaDIK18,kask20-scaling-up-as} (and thus implicitly against IS, wMBIS, and IJGP-ss), and a purely randomized abstraction scheme.
        \end{itemize}

    In conclusion, we identify three new abstraction functions that show significantly better performance than any previous scheme, and also explore trends in their hyperparameterization.


    \section{General Background} \label{sec:background}


%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%

        \paragraph{Graphical Models.}
            
            % \begin{figure}[]
            %     \centering
            % 	\includegraphics[scale=0.25]{images/AncestorBranchingMass.pdf}
            % 	\vspace{-6pt}\caption{Ancestor branching mass of an AND node.}
            % 	\label{fig-ancestor-branching-mass}
            % \end{figure}
    
            % % \begin{comment}
            % \begin{figure}[]
            %     \centering
            % 	\includegraphics[scale=0.25]{images/ProperAbstractionGroups.pdf}
            % 	\vspace{-6pt}\caption{Scope of proper abstractions.}
            % 	\label{fig-proper-abstraction-groups}
            % \end{figure}
            % % \end{comment}
            
            A {\bf graphical model}, such as a Bayesian or Markov network \cite{pearl88,darwiche-book,DBLP:series/synthesis/2013Dechter}, can be defined by  a 3-tuple
            $\mathcal{M} \! = \! (\mathbf{X,D,F})$, where
            $\mathbf{X} \! = \! \{X_i \! : i \! \in \! \I\}$
            is a set of variables indexed by a set $\I$,
            and $\mathbf{D} \! = \! \{D_i \! : i \! \in \! \I\}$
            is the set of finite domains of values for each $X_i$. Each function $f_{\alpha} \in   \mathbf{F}$ is defined over a subset of the variables
            called its scope, $X_{\alpha}$, %\subseteq X$, also  denoted $scope(f_{\alpha})$
            where  $\alpha \subseteq \I$ are  the indices of  variables in its scope  and $D_{\alpha}$ denotes  the Cartesian product of their domains, so that %
            % Namely,
            $f_{\alpha} \! : D_{\alpha} \! \rightarrow \mathbb{R}^{\geq 0}$. The {\bf primal graph} $\mathcal{G} \! = \! (\mathbf{V,E})$ of $\mathcal{M}$ associates each variable with a node ($\mathbf{V} \! = \! \mathbf{X}$), while arcs $e \! \in \! \mathbf{E}$ connect nodes whose variables appear in the scope of the same function.
            %We define $scope(F) = \{\alpha | f_{\alpha} \in F \}$.
            %  $\mathbf{F} = \{f_{\alpha} : \alpha \in scopes(F)\}$ is a set of discrete functions, where $\alpha \subseteq V $ and
            %$X_\alpha \subseteq X$ is the scope of $f_\alpha$.
            %Graphical models can be used to represent a global function, often a probability distribution on $\mathbf{X}$,
            $\mathcal{M}$ defines a global function, often a factorized probability distribution on $\mathbf{X}$,
            $P(\mathbf{X}) = \frac{1}{Z} \prod_{\alpha}f_\alpha(X_\alpha)$, where 
            $
            Z = \sum_X \prod_{\alpha}
            f_\alpha(X_\alpha)
            $, known as the partition function, is a normalization factor.

            
        \paragraph{Search Spaces of Graphical Models.} 
            A graphical model can be transformed into a weighted state space graph.
            In an OR search space, which is constructed layer-by-layer relative to a variable ordering, paths from the root to the leaves represent full configurations---or assignments to all variables---where each successive level corresponds to an assignment of the next variable in the ordering.
            A graphical model can also be transformed  into a more compact AND/OR search space  by capturing its conditional independencies, % in the  model,
            thus facilitating more effective algorithms \cite{DBLP:journals/ai/DechterM07}.
            
            An AND/OR search space is defined relative to a \emph{pseudo tree} of a primal graph.	A {\bf pseudo tree} $\mathcal{T} \! = \! (\mathbf{V,E'})$ of a  primal graph $\mathcal{G} \! = \! (\mathbf{V,E})$ is a directed rooted tree that spans $\mathcal{G}$ such that every arc of $\mathcal{G}$ not in $\mathbf{E'}$ is a back-arc in ${\cal T}$ connecting a node to one of its ancestors (Figure \ref{fig-simple}(a),(b)).  A variable is a {\bf branching variable} if it has multiple children in $\mathcal{T}$.
            %The arcs in $E'$ may not all be included in $E$ .  
            
            
            
            
            \begin{figure}[!htb]
            	\centering
            	\begin{subfigure}{0.9\linewidth}
            	\centering
            	       \includegraphics[width=0.8\linewidth]{./_attachments/images/pseudotree.png}
            	\end{subfigure}
                    \begin{subfigure}{0.9\linewidth}
            	\centering
                        \includegraphics[width=0.8\linewidth]{./_attachments/images/AncestorBranchingMass.png}
                    \end{subfigure}
            	\vspace{-6pt}\caption{A full AND/OR tree representing 16 possible solutions guided by the pseudo tree shown above. Boxed in green is the ancestor branching subtree for the path $\rightarrow \!\! (A \!\! = \!\! 0) \!\! \rightarrow  \!\! (C \!\! = \!\! 1)$.}
                        \label{fig-ancestor-branching-mass}
            \end{figure}

            
            Given a
            pseudo tree $\mc{T}$ of a primal graph $\mathcal{G}$, the \emph{AND/OR search tree}
            $T_{\mc{T}}$ guided by $\mc{T}$ has alternating levels of OR nodes
            corresponding to variables, and AND nodes corresponding to
            an assignment from  its domain  with edge costs extracted from
            the original functions \cite{DBLP:journals/ai/DechterM07}. %(By this logic, we can think of the nodes of an OR tree as AND nodes).  
            Let $n$ be an AND node in $T_{\mc{T}}$, also denoted $n_X$ if $X$ is the last variable of its partial configuration.
            Each arc into an AND node $n$ %(or the arc from its OR parent to the AND node)
            has a cost $c(n)$ defined to be the product of all factors $f_{\alpha}$ in $\mathcal{M}$ that are instantiated at $n$ but not before.
            % \textcolor{red}{Moved to section "Value of A Node": (see Figure \ref{fig-simple}(c)).}                                            
            \paragraph{Notation.}
                When not otherwise stated, capital letters ($X$) represent variables and small letters ($x$) represent their values.  (An exception is when using $n$, which we use to represent search nodes).  Boldfaced letters represent a collection. For example,
                boldfaced capital letters ({\bf X}) denote a collection of variables,
                $|{\bf X}|$ its cardinality, 
                $D_{\X}$ their joint domains (i.e., all possible configurations of \X), 
                and bolded $\xx$ a particular realization in that joint domain (i.e., a particular configuration of \X).

                In the context of search, $path(n)$ is the partial configuration corresponding to assignments to variables according to the path from the root of $T_{\mc{T}}$ to $n$, and $g(n)$ is the cost of $path(n)$.
            

                \paragraph{$\bs{Z(n)}$.} \label{sec:partition-function-of-a-node}
                    We define $Z(n)$ recursively as: 
                    \begin{equation} \label{eq:and-or-z-prod}
                        Z(n_X) = \prod_{Y \in ch_{\cal T}(X)} Z(Y_{n_X})
                    \end{equation}
                    where
                    \begin{equation}
                        Z(Y_{n_X}) = \sum_{n_Y \in ch_Y(n_X)}  c(n_Y) \cdot Z(n_Y)
                    \label{eq2}
                    \end{equation}
                    and where $ch()$ denotes child variables either in the pseudo-tree or the search tree itself (depending on the context). Here, $ch_Y(n_X)$ are specifically the child AND nodes of $Y$ descended from AND node $n_X$.  Thus, $Z(r)$ equals the partition function $Z$ of the underlying full model (see Figure \ref{fig-simple}c). We denote sampling estimations of $Z(n)$ as $\hat{Z}(n)$.  Heuristic estimates of functions $Z(n)$ are denoted as $h(n)$.
    
                \paragraph{$\bs{R(n)}$.} \label{sec:ancestor-branching-mass}
                     On the path from the root of $T_{\mc{T}}$ to some $n_{X}$, there may be an intermediate node $n_{B}$ such that its associated variable $B$ is a branching variable in \PT.  Whenever this happens, the remaining variables of the model are split between the branches, and thus no single branch captures all the remaining variables. $R(n)$, or the \textbf{ancestor branching mass}, captures the $Z(n_{Y})$ for all $Y$ that branch off of the path to $X$. In Figure \ref{fig-ancestor-branching-mass}, the dotted green box shows the portion of the search space corresponding to the $R(n)$ for the node highlighted in red.

                     More formally, let $branchings(n_{X})$ be the set of nodes $n_{B}$ on the path to $n_{X}$ such that $B$ is a branching variable in \PT. Let $W$ be the child of $B$ that is on the path to $X$.  We define $R(n_{X})$ as:
                     \begin{align}
                         \label{eq4}
                         R(n_{X}) =   \prod_{n_{B} \in branchings(n_{X})} \frac{Z(n_{B})}{ Z(W_{n_{B}})}
                     \end{align}


                     \textbf{Example.} In Figure \ref{fig-ancestor-branching-mass}, consider the path from the root to the node $n \! = \! (A \!\! = \!\! 0,C \!\! = \!\! 1)$ marked in red. Following under $(A=0)$ to our node, we see there is a node of variable $B$ that branches off of the path.
                     Thus, $Q(n_{A=0,C=1})$
                     \begin{align}
                        &= g(n_{A=0,C=1}) \! \cdot \! R(n_{A=0,C=1}) \! \cdot \! Z(n_{A=0,C=1}) \\
                        &= g(n_{A=0,C=1}) \mul \;\; Z(B_{n_{A=0}})\;\; \! \cdot \! Z(n_{A=0,C=1}) 
                     \end{align}

                     We denote approximations to $R(n)$ as $r(n)$.
                
                \paragraph{$\bs{Q(n)}$.} \label{sec:q-of-a-node}
    
                    Putting it all together, we can now concisely define a quantity $Q(n)$ as the contribution to $Z$ from all full configurations consistent with $path(n)$. In other words, $Q(n)$ is the unnormalized probability of the configuration $path(n)$ based on the distribution defined by \M, with $P(path(n)) = \frac{Q(n)}{Z}$.  $Q(n)$ can be computed simply as:
                    \begin{align}
                        Q(n) = g(n)  \! \cdot \!  R(n)  \! \cdot \!  Z(n)
                    \end{align}
                    We denote approximations to $Q(n)$ as $q(n)$.
                
             
             
        \paragraph{Stratified Importance Sampling.} 
            Abstraction Sampling builds on Importance Sampling and Stratified Sampling. {\em Importance Sampling} (IS) is  a Monte Carlo scheme for approximating likelihood queries over graphical models.
            %\cite{Rubinstein_2007,DBLP:journals/ai/GogateD11,liu2015probabilistic}.
            {\em Stratified Sampling} is a variance reduction technique for sampling a search space by first dividing it into disjoint strata. This can be used with importance sampling to further reduce variance.
            In {\em Stratified Importance Sampling}, we first divide the sample space into $k$ strata of equal area under the distribution $q$, then choose re-weighted representatives from each stratum. %, and uses these representatives to form an estimator over the entire model. 
            In order to maximize reduction in variance, the variance between strata should be maximized (see \cite{rizzo_2007}).
            
            
            \newcommand{\soltree}{\hat{x}_M}
            \newcommand{\parttree}{\bar x}
            
            
    

%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5


    \section{Abstraction Sampling}\label{sec:abstraction-sampling}

        {\em Abstraction Sampling} (AS) algorithms \cite{DBLP:conf/uai/BrokaDIK18} apply concepts of Stratified Importance Sampling to sampling over probabilistic graphical models.  
        %An abstraction event in Abstraction Sampling is analogous to sampling representatives from strata in stratified importance sampling and reweighing to account for the rest of the members that were not chosen.  
        Guided by an abstraction
        function $a(.)$ that dictates how nodes of a variable should be partitioned into \textbf{abstract states} (analogous to strata in stratified sampling), Abstraction Sampling iteratively expands a search tree variable by variable, uses $a(.)$ to group nodes into abstract states, and uses an importance-sampling-like process to select an individual from each abstract state and reweight it to account for the other members that were in its abstract state.  The chosen nodes can then be expanded, and this results in the generation of a  subtree of the full search tree $T_{\mc{T}}$ (called a {\bf probe}) as a sample.
            
        \paragraph{AOAS.}
            Taking Abstraction Sampling further, \cite{kask20-scaling-up-as} introduced algorithm AOAS (\textbf{A}ND/\textbf{O}R \textbf{A}bstraction \textbf{S}ampling) that applied abstraction sampling to AND/OR search spaces and significantly improved performance over the previous version. AOAS used a proposal distribution $p(n) \propto q(n) = w(n) \! \cdot \! g(n) \! \cdot \! h(n) \! \cdot \! r(n)$ (see Figure \ref{fig:proposal}), where $g(n)$ is computed exactly, $Z(n)$ is approximated by $h(n)$, $R(n)$ is estimated by $r(n)$, and a weight $w(n)$ is applied to account for the nodes abstracted into the path to $n$. An overview of the algorithm can be seen in Algorithm \ref{alg:aoas-overview}, and a more detailed version and a sample trace taken from \cite{kask20-scaling-up-as} can be found in the Supplemental Materials.

        
        \begin{algorithm}[t!]
              \vspace{-6pt}\caption{AOAS Overview}
                \label{alg:aoas-overview}
        
            \begin{enumerate}
                \item \textbf{Initialization:}
                    Begin with a dummy root node $r$.
                \item \textbf{Probe Generation:}
                    Proceeding in a DFS manner according to a pseudo tree $\PT$...
                    \begin{enumerate}
                        \item \textbf{Expansion:} \label{alg:aoas-overview:expansion}
                            Generate children nodes $n$ corresponding to the next variable in the DFS ordering of $\PT$. Inherit $w(n)$ from parents and assign appropriate $g(n), h(n), \tn{and } r(n)$ values.
                        \item \textbf{Abstraction:} \label{alg:aoas-overview:abstraction}
                            \begin{enumerate}
                                \item \textbf{Form Abstract States:}
                                    Using $a(.)$, partition newly expanded nodes into abstract states.
                                \item \textbf{Select Representative:}
                                    Using the proposal $p(.)$ defined, select a representative from each abstract state and reweigh it such that $w(n) \leftarrow \frac{w(n)}{p(n)}$
                            \end{enumerate}
                        \item \textbf{Backtrack:} \label{alg:aoas-overview:backtracking}
                            After reaching a leaf in $\PT$, recursively backtrack until reaching nodes of the next unexplored branch of $\PT$. While backtracking, update parent node $\hat{Z}(n')$ estimates based on children's $w(n), g(n),$ and $\hat{Z}(n)$ values.
                        \item \textbf{Repeat:}
                            Repeat steps \ref{alg:aoas-overview:expansion}-\ref{alg:aoas-overview:backtracking} until having backtracked all the way to the root node.
                    \end{enumerate}
                \item \textbf{Return:}
                    $\hat{Z} = w(r)\,\hat{Z}(r)$ for the root node $r$.
            \end{enumerate}
        \end{algorithm}



        \begin{figure}[!htb]
            \centering
            \includegraphics[width=0.8\linewidth]{./_attachments/images/proposal.png}
            \vspace{-6pt}\caption{The $q(n)$ visualized to show it estimating the mass of nodes previously abstracted (via $w(n)$), the ancestor branching mass (via $r(n)$), the current path cost (via $g(n)$), and the subtree mass (via $h(n)$).}
                    \label{fig:proposal}
        \end{figure}

        
        \paragraph{Existing Abstraction Functions.} \label{sec:abstraction-sampling:existing-abstraction-functions}

            \cite{DBLP:conf/uai/BrokaDIK18} designed abstractions based on assignments to a variable's context $C(X)$, where $C(X)$ is a subset of its ancestors in a pseudo-tree $\cal T$ whose assignment uniquely determines the AND/OR subtree below it, i.e., its $Z(n)$ \cite{DBLP:journals/ai/DechterM07}.  Thus, abstractions 
            based on a subset of the context aim to group nodes based on having similar $Z(n)$ values. However, the number of possible assignments to the context, $|\D_{C(X)}|$, is exponential in the size of the context, making the full context infeasible to use in its entirety. Thus, \cite{DBLP:conf/uai/BrokaDIK18,kask20-scaling-up-as} utilize relaxed context-based (\textbf{RelCB}) and randomized context-based (\textbf{RandCB}) abstractions as in \cite{DBLP:conf/uai/BrokaDIK18}.  
            
            RelCB is parametrized by a level $j$, selecting the closest $j \! - \! 1$ variables from a variable's context (ie. its {\em relaxed context}) plus itself. It abstracts nodes of the same domain value that also share the same assignment to the relaxed context. Assuming domain size of $k$, this yields at most $k^j$ abstract states at each level. 
            
            The randomized scheme, RandCB, considers the entire context but is parameterized by $nAbs$, which bounds the number of abstract states per level into which nodes can be placed.  Each of the $nAbs$ abstract states is randomly associated with a set of possible full context assignments defining the nodes that will belong to that state.






    \section{Value-Based Abstractions} \label{sec:ordered-value-based-abstraction-functions}

        We introduce a new framework for abstractions which we call Value-Based Abstractions.  Value-based abstraction functions consist of two parts: (1) a value function $\mu: n \rightarrow \mathbb{R}$ that assigns a real value on a positive scale to nodes $n$ that are to be abstracted, and (2) a partitioning scheme that then abstracts nodes based on $\mu(n)$.  

        \begin{algorithm}[t]
            \caption{$a_{\tn{\textit{value-ordered}}}$}
            \label{alg:general-ordered-value-based-abstraction-function}
            \begin{footnotesize}
                \SetInd{0.25em}{0.55em}
                \DontPrintSemicolon 
            \Input{A set of nodes $\bs{n}$ to be partitioned into abstract states; an abstraction value function $\mu(.)$; a sorting algorithm $SORT(.)$ that sorts $\bs{n}$ according to $\mu(.)$ and sort order $o$; a parameter $nAbs$ bounding the number of abstract states; a partitioning function $\Psi(.)$ that partitions the sorted nodes into abstract states maintaining their order}
            \Output{Nodes $\bs{n}$ partitioned into abstract states $\bs{A} = \setst{\bs{A_{i}}}{i \leq nAbs}$ such that sort order $o$ of $\mu(n)$ is maintained across all $\bs{A_{i}}$.
            %as defined in Definition \ref{def:value-ordered-abstraction-function}.
            }
            
            \Begin{
                $\bs{n^{*}} \leftarrow SORT(\bs{n},\mu,o)$\\
                \uIf{$|\bs{n^{*}}| \leq nAbs$}{
                    $\bs{A} = \setst{\set{n}}{n \in \bs{n^{*}}}$\\
                }
                \uElse{
                    $\bs{A} = \Psi(\bs{n^{*}}, nAbs)$
                }
                \Return $\bs{A}$       
            }
            \end{footnotesize}
        \end{algorithm}
        
        We provide a general scheme for a value-based abstraction function that maintains an ordering according to $\mu(n)$ in Algorithm \ref{alg:general-ordered-value-based-abstraction-function}. Assuming that evaluating the value function $\mu(.)$ is not the dominating cost, the complexity is determined by the complexity of the sorting method and of the partitioning scheme.

        Next we present three value-based abstraction classes, each defining a unique $\mu$, and then seven ordered value partitioning schemes that, in conjunction with a provided $\mu(.)$, can be used with Algorithm \ref{alg:general-ordered-value-based-abstraction-function} to define a unique value-ordered abstraction function.
        

        \subsection{Value-Based Abstraction Classes} \label{sec:value-based-abstraction-classes}
    
            We introduce three Value-Based Abstraction Classes, each characterized by a unique value function $\mu(.)$ that signifies a notion of similarity between nodes.  
            
            % In this work we present three value-based abstraction classes: Heuristic-Based (HB), HR-Based (HRB), and Q-Based (QB) abstraction value-classes.  Each is motivated by theory in search or sampling discussed in Section \ref{sec:paradigms}, and each can be used with node partitioning schemes (Section \ref{sec:ordered-partitioning-schemes}), which together form a value-ordered abstraction function.
    
        
            \paragraph{Heuristic-Based Abstractions.} \label{sec:value-based-abstraction-classes:HB}
            
                % \begin{quote}
                %     $\mu(n) = h(n)$
                % \end{quote}
                
                Using the motivation of abstracting nodes with similar subtree $Z(n)$ intuited from previous work and concepts of graph search, Heuristic-Based (HB) abstractions use $\mu(n) = h(n)$, where $h(n)$ is a heuristic estimate of a node's $Z(n)$.  Unlike the partial (or hashed) contexts used by \cite{DBLP:conf/uai/BrokaDIK18}, heuristic estimates of $Z(n)$ can often provide \textit{quantitative} insight into potential similarities in $Z(n)$ values, and this is particularly true when using wMBE heuristics, which provide bounds.
    
                % In conjunction with the node partitioning schemes that will be presented in Section \ref{sec:ordered-partitioning-schemes}, the presented HB abstraction functions aim to form abstractions such that nodes with similar $Z(n)$ are grouped together.
    
    
            \paragraph{HR-Based Abstractions.} \label{sec:value-based-abstraction-classes:HRB}
    
                % \begin{quote}
                %     $\mu(n) = h(n)  \! \cdot \!  r(n)$
                % \end{quote}

                Consider the following definition of what can be considered an abstraction function that is ``exact'':
                \begin{definition}[Exact Abstraction Function]
                     An abstraction function $a(.)$ is exact for an abstraction sampling algorithm, AS, if use of $a(.)$ with AS always leads to AS estimates having zero variance and $\hat{Z} = Z$ for every AS probe.
                \end{definition}

                Recall that $h(n)$ is a heuristic estimate of \hyperref[sec:partition-function-of-a-node]{$Z(n)$} and $r(n)$ is an estimate of $n$'s \hyperref[sec:partition-function-of-a-node]{ancestor branching mass $R(n)$}. With AOAS we can say:
                \begin{theorem}[AOAS Exact Abstractions from $h(n)r(n)$ vs. $Z(n)R(n)$ Proportionality] \label{thm:aoas-proportionality-exact-proposal}
                      If an abstraction function $a(.)$ forms abstract states $\bs{A_{i}}$ such that $\forall n \in \bs{A_{i}}, \frac{h(n)r(n)}{Z(n)R(n)} = \; \propto_{i}$ for some $\propto_{i} \in \!\!  \mathbb{R}_{>0}$ when $Z(n)R(n) \in \!\!  \mathbb{R}_{>0}$, or $h(n)r(n) = 0$ otherwise, then it is an exact abstraction function for AOAS. (Proof in Supplemental Materials)
                \end{theorem}
    
                We see that similarity of $\frac{h(n)r(n)}{Z(n)R(n)}$ among nodes in the same abstract state can lead to reduction in variance.  Although this ratio is infeasible to compute, HR-Based (HRB) abstractions use $\mu(n) = h(n)r(n)$ as a surrogate for similarity of this ratio and group nodes accordingly.
    
    
            \paragraph{Q-Based Abstractions.} \label{sec:value-based-abstraction-classes:QB}
    
                % \begin{quote}
                %     $\mu(n) = w(n) \! \cdot \! g(n) \! \cdot \! h(n) \! \cdot \! r(n)$
                % \end{quote}
    
                On the other hand, \cite{rizzo_2007} showed the potential of variance reduction when minimizing variance within strata when forming strata that had equal mass under the proposal distribution.  Thus, in Q-Based Abstractions we use the quantity $\mu(n) = q(n) \! = \! w(n)g(n)h(n)r(n) \propto p(n)$ \cite{kask20-scaling-up-as}, where $p(n)$ is the proposal function, as the value function.
    
                In addition to serving as an un-normalized proposal function for a node $n$, $q(n)$ also estimates $n$'s
                contribution to the overall $Z$. Therefore, $q(n)$ estimates the impact of $n$ (and all of the nodes $n$ represents as the selected representative from previous abstractions) on the overall $Z$.
                %which could be a valuable quantity to base our choice of nodes on as discussed in Section \ref{sec:paradigms:combined}.
     


        \subsection{Ordered Partitioning Schemes} \label{sec:ordered-partitioning-schemes}

            We now present seven schemes, each defined by a unique sort order $o$ and partition strategy $\Psi$ combination.  Each scheme uses a different method to partition nodes into abstract states keeping the nodes in sort order according to $o$ and a provided abstraction value function $\mu(.)$. In addition to defining each scheme we also describe the motivation behind its creation and show the results on the running example below.
    
            \paragraph{Running Example} \label{sec:ordered-partitioning-schemes:running-example}  
                As we motivate and describe the various partitioning schemes, we will also provide an example of abstract states that would result from partitioning nodes with the following $\mu(n)$:
                \begin{align} \label{eq:running-partitioning-example}
                    % \set{
                        1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100
                    % }
                \end{align}
                into $nAbs=4$ abstract states as a running example.

            For brevity, we have omitted the algorithmic representation of each scheme; however, they are included in the Supplemental Materials for reference.
                
            \paragraph{\NoCaseChange{simpleVB}.} \label{sec:ordered-partitioning-schemes:simpleVB}
    
                The simpleVB (simple value-based) scheme follows the motivation of grouping nodes of similar value in the same abstract state by a simple 2-step method: 1) first, nodes are ordered by $\mu(n)$ (low to high), and 2) next the ordered nodes are partitioned into [approximately] equal cardinality abstract states.
    
                % \textit{Time Complexity:}
                %     Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{1.0, 1.1}, \smallset{1.2, 1.3}, \smallset{1.4, 1.5}, \smallset{10, 100}.
                Nodes are partitioned evenly, and through its simplicity this method aims to leverage speed allowing for abstractions to be formed quickly leveraging a greater number of drawn samples.
                %\footnotetext{\label{ftn:ordered-schemes-maintain-sort-order}Such that nodes maintain sort order $o$ across all abstract states.}
    
    
    
            \paragraph{\NoCaseChange{minVarVB}.} \label{sec:ordered-partitioning-schemes:minVarVB}
    
                The minVarVB scheme uses Ward's Minimum Variance Hierarchical Clustering (or Ward's Method, for short) \cite{ward1963} to group nodes into $nAbs$ abstract states so as to minimize variance within each abstract state with respect to the provided value function $\mu(.)$. Ward's Method is an agglomerative hierarchical clustering algorithm that creates a dendrogram by iteratively merging clusters. Ward's Method can be combined with Lance-Williams linear distance updates \cite{LanceWillaims1967-distanceUpdates} to increase efficiency.  We include more details on Ward's Method and Lance-Williams linear distance updates in the Supplemental Materials.

                % \textit{Time Complexity:\footnote{\label{ftn:time-complexity-assumes-constant-time-v}Assuming $\mu(n)$ is $\mathcal{O}(1)$ in both time and space.}}
                %     The choice of clusters to merge generally leads to having a $\mathcal{O}(|\bs{n^{*}}|^{3})$ time complexity due to the need to compare pair-wise distances between all clusters at each iteration.  However, in the case where nodes are distributed linearly in one dimension, only neighboring distances need to be considered at each iteration and can be made efficient by use of a priority queue, however since the Lance-Williams distance updates themselves take linear time, once per iteration, the reduced time complexity is still $\mathcal{O}(|\bs{n}|^{2})$.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     The space complexity is implementation dependent, with most time-efficient variants making use of a distance matrix leading to $\mathcal{O}(|\bs{n}|^{2})$ space complexity.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{1.0, 1.1, 1.2}, \smallset{1.3, 1.4, 1.5}, \smallset{10}, \smallset{100}.
                In contrast to simpleVB, minVarVB places considerable resources into computing abstractions, leading to fewer samples, but provably forms abstractions that minimize variance of $\mu(n)$ within the abstract states.
    
    
    
            \paragraph{\NoCaseChange{equalDistVB}.} \label{sec:ordered-partitioning-schemes:equalDistVB}
    
                equalDistVB is inspired by the goal of minVarVB and the simplicity of simpleVB.  The scheme works by greedily adding nodes in value order (low to high) into abstract state $\bs{A_{i}}$ until $\sum_{j=1}^{i} \sum_{n \in \bs{A_{j}}} \mu(n) \geq \frac{i \cdot \sum_{n' \in \bs{n}} \mu(n')}{nAbs}$,
                namely until the total sum of node values from $\bs{A_{1}},...,\bs{A_{i}}$ reaches or exceeds the $\frac{i}{nAbs}$ quantile.
                
                When paired with the QB abstraction class (see Section~\ref{sec:value-based-abstraction-classes:QB}), the equalDistVB scheme also attempts to partition nodes into abstract states of equal mass under the proposal.  This corresponds to the condition of Proposition~\ref{prop:rizzo-variance-reduction} for variance reduction in stratified importance sampling.
    
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100}, \smallset{}, \smallset{}, \smallset{}.
                Although this method aims to balance the previously explored intuitions without compromising the speed and efficiency of abstract state generation, the running example shows how it can yield undesirable results in the presence of certain distributions of node values.  In this example, the first quantile is only reached after all the nodes have been added to the first abstract state, leaving no nodes remaining to be partitioned into the subsequent abstract states.
    
    
    
            \paragraph{\NoCaseChange{equalDistVB2}.} \label{sec:ordered-partitioning-schemes:equalDistVB2}

                By simply reversing the sort order (high to low), equalDistVB2 uses the same partitioning strategy as equalDistVB while mitigating some of the overfilling of abstract states seen in the example shown above for equalDistVB.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{100}, \smallset{}, \smallset{}, \smallset{10, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0}.
                We see that equalDistVB2 can still be subject to over-packing of abstract states.  Next we present two more equalDistVB variants that continue to mitigate this artifact.
    
    
    
            \paragraph{\NoCaseChange{equalDistVB3}.} \label{sec:ordered-partitioning-schemes:equalDistVB3}
    
                % \begin{quote}
                %     $o = \tn{high to low}$\\
                %     $\Part{equalDistVB3}$ (Algorithm \ref{alg:psi-equalDistVB3})
                % \end{quote}
    
        %         \begin{algorithm}[t!]
        %             \vspace{-6pt}\caption{$\Part{equalDistVB3}$}
        %             \label{alg:psi-equalDistVB3}
        %             \begin{footnotesize}
        %                 \SetInd{0.25em}{0.55em}
        %                 \DontPrintSemicolon 
        %             \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
        %             \Output{
    				% With 
    				% %
    				% $Z(\bs{A_{1,...,i}}) = (\sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} Z(n')$,
    				% %
    				% $n_{\bs{A_{i}}}^{\tn{last}}$ be the last node in $\bs{A_{i}}$, 
    				% %
    				% and 
    				% %
    				% $P_{i} = \frac{ i \cdot \sum_{n \in \bs{n^{*}}}\mu(n)}{nAbs}$,
    				% %
        %             $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
        %             $(\; Z(\bs{A_{1,...,i}}) \geq P_{i} \;)$
        %             $\land$ \\ $(\; (\, \bs{|A_{i}}|=1 \,) \lor (\, Z(\bs{A_{1,...,i}}) - Z(n_{\bs{A_{i}}}^{\tn{last}}) < P_{i} \,) \;)$ }
                    
        %             \Begin{
        %                 $j \leftarrow 1$\\
        %                 \ForEach{$i \leftarrow 1,...,nAbs$}{
        %                     $\bs{A_{i}} = \set{n^{*}_{{j}}}$\\
        %                     $j \leftarrow j+1$\\
        %                     \While{$Z(\bs{A_{1,...,i}}) < P_{i}$}{
        %                         $\bs{A_{i}} \leftarrow A_{i} \cup \set{n^{*}_{{j}}}$\\
        %                         $j \leftarrow j+1$
        %                     }
        %                 }
        %                 $\bs{A} = \cup_{i = 1}^{nAbs} \bs{A_{i}}$\\
        %                 \Return $\bs{A}$       
        %             }
        %             \end{footnotesize}
        %         \end{algorithm}
    
                In order to lessen over-packing and ensure abstract states are not left empty, equalDistVB3 modifies equalDistVB2 so that, upon moving on to each new abstract state, at least one node is added to that state by default before the scheme proceeds as equalDistVB2 does.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{100}, \smallset{10}, \smallset{1.5}, \smallset{1.4, 1.3, 1.2, 1.1, 1.0}.
                Still highly efficient, equalDistVB3 manages to ensure that the provided $nAbs$ granularity is honored, allowing users better control of the search vs. sampling interpolation possible with Abstraction Sampling.
    
    
    
            \paragraph{\NoCaseChange{equalDistVB4}.} \label{sec:ordered-partitioning-schemes:equalDistVB4}
    
        %         \begin{quote}
        %             $o = \tn{high to low}$\\
        %             $\Part{equalDistVB4}$ (Algorithm \ref{alg:psi-equalDistVB4})
        %         \end{quote}
    
        %         \begin{algorithm}[t!]
        %             \vspace{-6pt}\caption{$\Part{equalDistVB4}$}
        %             \label{alg:psi-equalDistVB4}
        %             \begin{footnotesize}
        %                 \SetInd{0.25em}{0.55em}
        %                 \DontPrintSemicolon 
        %             \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
        %             \Output{
    				% With 
    				% %
    				% $Z(\bs{A_{1,...,i}}) = (\sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} Z(n')$,
    				% %
    				% $n_{\bs{A_{i}}}^{\tn{last}}$ be the last node in $\bs{A_{i}}$, 
    				% %
    				% and 
    				% %
    				% $L_{i} = \frac{Z(\bs{n^{*}})-Z(\bs{A_{1,...,i-1}})}{nAbs-i+1}$,
    				% %
        %             $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
        %             $(\; Z(\bs{A_{i}}) \geq L_{i} \;)$
        %             $\land$ \\ $(\; (\, \bs{|A_{i}}|=1 \,) \lor (\, Z(\bs{A_{i}}) - Z(n_{\bs{A_{i}}}^{\tn{last}}) < L_{i} \,) \;)$ }
                    
        %             \Begin{
        %                 $j \leftarrow 1$\\
        %                 \ForEach{$i \leftarrow 1,...,nAbs$}{
        %                     $\bs{A_{i}} = \set{}$\\
        %                     \While{$Z(\bs{A_{i}}) < L_{i}$}{
        %                         $\bs{A_{i}} \leftarrow A_{i} \cup \set{n^{*}_{{j}}}$\\
        %                         $j \leftarrow j+1$
        %                     }
        %                 }
        %                 $\bs{A} = \cup_{i = 1}^{nAbs} \bs{A_{i}}$\\
        %                 \Return $\bs{A}$       
        %             }
        %             \end{footnotesize}
        %         \end{algorithm}
    
                The final variant of the equalDist schemes, equalDistVB4, attempts to perform a more even partitioning than the previous variants by recomputing quantiles. Each time the algorithm progresses to processing a new abstract state, the remaining nodes and abstract states are used to compute new quantiles, which are then used to guide filling of the current abstract state.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $L_{i}$ at each iteration can also be done in constant time.  Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{100}, \smallset{10}, \smallset{1.5, 1.4, 1.3}, \smallset{1.2, 1.1, 1.0}.
                Still highly efficient, equalDistVB4 manages to spread nodes with smaller values across abstract states more evenly than the previous versions.
    
    
            \paragraph{\NoCaseChange{randVB}.} \label{sec:ordered-partitioning-schemes:randVB}
    
                It can be beneficial to rely on randomness to ensure a diverse sampling of abstractions.  randVB does this by sampling $nAbs\!-\!1$ partition points uniformly at random and without replacement from between the sorted nodes $\bs{n^{*}}$, and then partitions the nodes accordingly. As a result, abstract states are formed such that nodes are still grouped according to $\mu(.)$, but the size of those groups varies.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    ex1: \smallset{100, 10}, \smallset{1.5}, \smallset{1.4, 1.3, 1.2}, \smallset{1.1, 1.0};
                    ex2: \smallset{100}, \smallset{10, 1.5, 1.4, 1.3}, \smallset{1.2, 1.1}, \smallset{1.0};
                    ...etc.

            \paragraph{Complexity.} Assuming $\mu(.)$ is $\mathcal{O}(1)$ and nodes are pre-sorted, these partitioning schemes all have both time and space complexity of $\mathcal{O}(|\bs{n}|)$, with the exception of minVarVB, which requires $\mathcal{O}(|\bs{n}|^{2})$.





    \section{Empirical Evaluation} \label{sec:empirical-evaluation}

        %%%%%%%%%%%%%%%%%%% AS Algorithms Tested

       
        \paragraph{Setup Overview.}
            All combinations of the three new Value-Based Abstraction Classes: Heuristic-Based \textbf{HB}, HR-Based \textbf{HRB}, and Q-Based \textbf{QB}; with each of the seven Ordered Partitioning Schemes: \textbf{simple}, \textbf{minVar}, \textbf{equalDist1-4}, and \textbf{rand}, were tested, resulting in a total of twenty-one abstraction functions.  For comparison, the formerly evaluated context-based abstraction functions of randCB and relCB were also used. (We abbreviate the context-based class of functions as \textbf{CTX}).  In addition, a pure randomized abstraction function, simpleRand, was also included. (We abbreviate the purely randomized class as \textbf{RAND}).  With the exception of RelCB, each abstraction function accepts a hyper parameter called $nAbs$ which bounds the number of abstract states at any level. RelCB instead uses an $nContext$ parameter that limits the number of context variables used in assigning abstract states.  To facilitate comparison, we report RelCB's $nContext$ parameter instead as an equivalent $nAbs$ parameter assuming a domain size of $2$.  (For example, if RelCB was run using $nContext = 6$, in results we report it as RelCB with $nAbs = 2^{6}$). All of the abstraction functions were tested using the AOAS algorithm \citep{kask20-scaling-up-as}.  All algorithms were implemented in C++. All experiments were run on a 2.66 GHz processor and allotted 8 GB of memory.
        
        
        
        %%%%%%%%%%%%%%%%%%% Heuristic Description
        \paragraph{Heuristics.}
            To inform the sampling proposal, Weighted Mini-Bucket Elimination (wMBE) \cite{DBLP:journals/jacm/DechterR03,DBLP:conf/icml/LiuI11} is used as a heuristic.  The i-bound (\textbf{iB}) parameter controls the strength of wMBE, where higher i-bounds generally lead to stronger heuristics and, thus, better proposals at the expense of higher computation and memory. We standardize our experiments by using the same i-bound when comparing across algorithms. 
        
        
        
        
        %%%%%%%%%%%%%%%%%%% Benchmark Description
        \paragraph{Benchmarks.}
            
            In line with previous work on Abstraction Sampling, we perform experiments on the same set of over 480 problems from five well known benchmarks: DBN, Grids, Linkage-Type4, Pedigree, and Promedas used by \citet{kask20-scaling-up-as}. 
            
            We refer to problem instances with known $Z$ values as ``Exact''.  Larger problems for which exact solutions are not known are called ``LARGE''.  For LARGE problems, estimates from 100~hr of context-based abstraction sampling (obtained from \citet{kask20-scaling-up-as}) are used as the true $Z$.  When experimenting on Exact problems, algorithms use a small i-bound of 5 (thus weakening heuristic estimates) and are given a short time limit of 300~sec in order to increase difficulty.  For LARGE problems, an i-bound of 10 and a time limit of 1200~sec are used.

            For both brevity and preciseness, we focus on results from the Exact problem instances. 
            % thus here excluding the Linkage-Type4 benchmark whose problems do not have known solutions.  
            Results for LARGE problems can be found in the Supplemental Materials and their trends generally agree with those from the Exact problems.

            
            
            \begin{centering}
            \begin{tabular}{lrrrrr}
              \toprule
              Benchmark &   N &   $|\mathbf{X}|$ &     k &          w* &        d \\ 
              \midrule
                    DBN &  66 &      67 &          2 &      29 &      30 \\ 
                  Grids &   8 &     250 &          2 &      22 &      49 \\ 
               Pedigree &  25 &     690 &          5 &      25 &      89 \\ 
               Promedas &  65 &     612 &          2 &      21 &      62 \\ 
              \bottomrule
            \end{tabular}
            \vspace{-6pt}\captionof{table}{
                \textbf{Exact Benchmark Statistics}. Average benchmark statistics for Exact problems (with known $Z$ values). \textbf{N}: number of instances, \textbf{\tabs{X}}: average number of variables per instance, \textbf{k}: average of problems' largest domain sizes, \textbf{w\super{*}}: average induced tree-width, \textbf{d}: average pseudo-tree depth. 
                \label{tbl:small-benchmark-statistics}
            }
            \end{centering}

           \begin{centering}
            \begin{tabular}{lrrrrr}
              \toprule
              Benchmark &   N &   $|\mathbf{X}|$ &        k &          w* &        d \\ 
              \midrule
                        DBN &   48 &     216.0 &        2.0 &     78.0 &    78.0\\
                      Grids &   19 &    3432.0 &        2.0 &    117.0 &   220.0\\
              Linkage-Type4 &   82 &    6550.0 &        5.0 &     45.0 &   761.0\\
                   Promedas &  173 &    1194.0 &        2.0 &     72.0 &   114.0\\
              \bottomrule
            \end{tabular}
            \vspace{-6pt}\captionof{table}{
                \textbf{LARGE Benchmark Statistics}. Average benchmark statistics for LARGE problems (with estimated $Z$ values). \textbf{N}: number of instances, \textbf{\tabs{X}}: average number of variables per instance, \textbf{k}: average of problems' largest domain sizes, \textbf{w\super{*}}: average induced tree-width, \textbf{d}: average pseudo-tree depth. 
                \label{tbl:large-benchmark-statistics}
            }
            \end{centering}
        
        
        %%%%%%%%%%%%%%%%%%% Performance Measure
        
        \paragraph{Performance Measure.}
            To evaluate the performance of the various algorithms, we define error as:    
            $\mathit{Error} = |\log_{10} \hat{Z} - \log_{10} Z^{*}|$,
            where $\hat{Z}$ is the estimate obtained and $Z^{*}$ is the true $Z$ value.



        \subsection{Results} \label{sec:empirical-evaluation:results}
        
            % \subsubsection{Aggregated Results Tables}
            
                \paragraph{Summary Comparison.}

                    To examine the potential of the different methods, we tested each algorithm with a range of $nAbs \in \set{1, 4, 16, 64, 256, 512, 1024, 2048}$. For each $nAbs$ and each benchmark, we calculated the average error across the Exact problems of the benchmark and identified the $nAbs$ that resulted in the lowest average error. In Table~\ref{tbl:summary-aggregations} we show this lowest average error and the corresponding $nAbs$ for each algorithm and benchmark, and highlight the schemes that performed well across all benchmarks.  If an algorithm was unable to produce a positive Monte Carlo $Z$ estimate for a problem (denoted as ``Fail''), the wMBE heuristic bound was used as its $Z$ estimate and the error was computed accordingly.
    
                    % Tables \ref{tbl:DBN_aggregation}-\ref{tbl:Promedas_aggregation} show aggregated performance of the various Value-Based Abstraction Classes with the various Partitioning Schemes on problems of DBN, Grids, Linkage-Type4, and Promedas benchmarks.

                    \begin{tablefigure*}[!htb]
                        \centering     %%% not \center
                        \begin{subtablefigure}{0.99\linewidth}
                            \includegraphics[width=0.98\linewidth]{./_attachments/Results/ALL-SMALL-aggregations-i-5-t-300.pdf}
                            \caption{}
                            \label{tbl:small-aggregations}
                        \end{subtablefigure}
                        \begin{subtablefigure}{0.99\linewidth}
                            \includegraphics[width=0.98\linewidth]{./_attachments/Results/QB-CTX-RAND-LARGE-aggregations-i-10-t-1200.pdf}
                            \caption{}
                            \label{tbl:large-qb-aggregations}
                        \end{subtablefigure}
                        \vspace{-6pt}\caption{\textbf{Summary Comparison}. For each table, displayed are the Abstraction Class (\textit{Class}), Partitioning Scheme (\textit{Scheme}), bound on the number of abstract states per level (\textit{nAbs}), number of problems for which a positive solution could not be estimated (\textit{Fail}), and average $\log_{10} Z$ error (\textit{Avg. Error}) across Exact problems of the given benchmark.  Color bars visualize the magnitude of the values. Overall best performing algorithms are highlighted.  (a) shows results on Exact problems. (b) shows results on LARGE problems.}
                        \label{tbl:summary-aggregations}
                    \end{tablefigure*}
    

                \paragraph{Comparison using 100 Samples.} \label{sec:empirical-evaluation:results:aggregation-tables:set-number-of-samples}

                    \begin{tablefigure}[!htb]
                        \centering
                        \includegraphics[width=0.99\linewidth]{./_attachments/Results/ALL-SMALL-iB-5-nAbs-256-nR-100-QB-CB-RAND.pdf}
                        \vspace{-6pt}\caption{\textbf{100-Sample Comparison}. For abstraction granularity of $nAbs=256$, aggregated statistics (as described in Table \ref{tbl:AggregationTables}) for Exact problems of each benchmark with each algorithm allotted 100 samples.}
                        \label{tbl:results:ALL-SMALL-iB-5-nAbs-256-nR-100-QB-CB-RAND}
                    \end{tablefigure}
        
                    We also wanted to assess the quality of abstraction functions in an implementation-agnostic manner, and irrespective of resulting probe-sizes or speed of processing abstractions.  
                    %However, as detailed in Section \ref{sec:ordered-partitioning-schemes}, some schemes may exhibit variations in execution time, and implementation differences can contribute to this variability. 
                    % And as discussed in Section \ref{sec:empirical-evaluation:results:abstraction-speed-plot}, probe sizes can also vary. 
                    %Probe sizes can also vary between use of different abstraction functions.
                    %To circumvent these artifacts, 
                    Thus, we also conducted experiments using a one-hundred sample termination condition (denoted \textbf{m-100}) rather than a time constraint. Table~\ref{tbl:results:ALL-SMALL-iB-5-nAbs-256-nR-100-QB-CB-RAND} shows these results on Exact problems of each benchmark for the better performing QB algorithms using an abstraction granularity of $nAbs=256$.  We use $nAbs=256$ because (1) it is an intermediate granularity and (2) all schemes were able to produce 100 samples in a reasonable amount of time.  We again highlight the overall best performing schemes.


            \paragraph{Varying \NoCaseChange{nAbs}.}

                \begin{tablefigure}[!htb]
                    \centering
                    \includegraphics[width=0.99\linewidth]{./_attachments/Results/varying-nAbs-SMALL-i-5-t-300-best-QB.pdf}
                    \vspace{-6pt}\caption{\textbf{Varying nAbs}. Average error when using $nAbs \in \set{4, 64, 1024}$ for minVarQB, equalDistQB3, equalDistQB4, the CTX based algorithms, and RAND, each with iB-5 and time limit of 300 sec.}
                    \label{tbl:varying-nAbs-SMALL-i-5-t-300-best-QB}
                \end{tablefigure}

                \begin{plotfigure}[!htb]
                    \centering
                    \includegraphics[width=0.99\linewidth]{./_attachments/Results/error-vs-nAbs-plot-minVarQB-iB-5}
                    \vspace{-6pt}\caption{Varying $nAbs$ for minVarQB. Plotted is the average error on Exact problems using iB-5 and time limit of 300 sec for each benchmark and for various abstraction granularities (in log2).}
                    \label{plt:results:error-vs-nAbs-plot-minVarQB-iB-5}
                \end{plotfigure}

                \begin{plotfigure}[!htb]
                    \centering
                    \includegraphics[width=0.99\linewidth]{./_attachments/Results/error-vs-nAbs-plot-equalDistQB4-iB-5}
                    \vspace{-6pt}\caption{Varying $nAbs$ for equalDistQB4. Plotted is the average error on Exact problems using iB-5 and time limit of 300 sec for each benchmark for various abstraction granularities (in log2).}
                    \label{plt:results:error-vs-nAbs-plot-equalDistQB4-iB-5}
                \end{plotfigure}
                
                In order to observe the effect of changing $nAbs$, Table \ref{tbl:varying-nAbs-SMALL-i-5-t-300-best-QB} shows average error for different $nAbs \in \set{4, 64, 1024}$ for Exact problems of each benchmark.  Here we focus on only the better performing abstraction functions of QB: minVarQB, equalDistQB3, equalDistQB4; the well performing purely randomized scheme: RAND; and also show the context-based schemes (CTX) for comparison. In Plots \ref{plt:results:error-vs-nAbs-plot-minVarQB-iB-5} and \ref{plt:results:error-vs-nAbs-plot-equalDistQB4-iB-5}, we also show average error across a wider array of $nAbs$ for minVarQB and equalDistQB4, respectively, the latter also acting as a representative for the profile of the plots of equalDistQB3 and RAND.



            \paragraph{Time Series Plot.}

                Plots~\ref{plt:results:grid20x20.f15-time-series}--\ref{plt:results:or_chain_209.fg-time-series} show time-series $Z$ estimates for the better performing QB algorithms, the purely randomized scheme, and context-based schemes (CTX) on a representative Grids problem and a representative Promedas problem.  Each algorithm was plotted with the $nAbs$ that resulted in the lowest average error for the problem's respective benchmark.  Each plot line is labeled with the scheme, the $nAbs$ used, and the final $Error$ of its estimate.

                \begin{plotfigure}[!htb]
                    \centering
                    \includegraphics[width=0.99\linewidth]{./_attachments/Results/grid20x20.f15-time-series.png}
                    \vspace{-16pt}\caption{Z estimates from various algorithms versus time on Grids problem grid20x20.f15  using $iB=5$. The dashed black line shows the true Z value.}
                    \label{plt:results:grid20x20.f15-time-series}
                \end{plotfigure}

                \begin{plotfigure}[!htb]
                    \centering
                    \includegraphics[width=0.99\linewidth]{./_attachments/Results/or_chain_209.fg-time-series.png}
                    \vspace{-16pt}\caption{Z estimates from various algorithms versus time on Promedas problem or\us chain\us 209.fg  using $iB=5$. The dashed black line shows the true Z value.}
                    \label{plt:results:or_chain_209.fg-time-series}
                \end{plotfigure}

                 



        \subsection{Analysis} \label{sec:empirical-evaluation:analysis}

            \paragraph{Performance Comparison with Context-Based Schemes.}

                The $\Delta CTX |Error|$ column in Tables~\ref{tbl:DBN_aggregation}--\ref{tbl:Promedas_aggregation} explicitly shows the aggregated performance of the various schemes against the best parameterized context-based scheme (green negative values in these columns indicate better performance than the best context-based scheme for the respective benchmark).  For the HB and HRB classes, we see that there always exist partitioning schemes that can outperform the best context-based scheme.  For the HB class, the ``simple'' partitioning scheme tends to perform best, whereas for the HRB class it seems to be more benchmark dependent.  With the exception of the ``simple'', ``equalDist'', and ``rand'' partitioning schemes, the QB schemes tend to consistently perform as well as or outperform the context-based schemes.  The purely randomized scheme also consistently outperforms the context-based schemes with the exception of Promedas.  Results using iB-10 on the large problems (i.e., without exact known $Z$ values, using a reference $Z$ value obtained from \cite{kask20-scaling-up-as} using 100hr of context-based abstraction sampling) generally agree, with the exception of Promedas, where the context-based randCB outperforms the HB and HRB schemes (results in the Supplemental).


            \paragraph{Performance Comparison with Purely Randomized Abstractions.}
                The $\Delta RAND |Error|$ column in Tables~\ref{tbl:DBN_aggregation}--\ref{tbl:Promedas_aggregation} explicitly shows the aggregated performance of the various schemes against the purely randomized scheme (green negative values in these columns indicate better performance than the purely randomized scheme for the respective benchmark).  The QB class using equalDist3 and equalDist4 partitioning strategies consistently did as well as or better than the purely randomized scheme, with no other scheme doing as well as or better than these schemes consistently.


            \paragraph{Abstraction Quality of the QB Schemes.}
                When drawing an equal number of samples with the same abstraction granularity of $nAbs=256$ and aggregating over Grids problems, we note that, as before, QB with equalDist3 and equalDist4, as well as the purely randomized scheme, show relative performance similar to that previously observed.  However, we see that QB with minVar, which showed only slightly worse performance using a time limit, is the best performer when considering a set number of samples.  (Results omitted for brevity: across all benchmarks it is often, and overall, the best performer, though there are cases where one or more of the other schemes do as well or slightly better.)  This can, in part, explain the success of the equalDist3 and equalDist4 schemes, which attempt to emulate the minVar scheme using faster greedy strategies.



            \paragraph{Anytime Behavior.}
                From Plots~\ref{plt:results:grid20x20.f15-time-series}--\ref{plt:results:or_chain_209.fg-time-series} we can see that the abstraction sampling algorithms gradually improve their estimates as time progresses.  We also notice that estimates tend to be underestimates that increase over time, which we have noticed is often, though not always, true.
                


            % \subsubsection{The Effect of \NoCaseChange{iB}}



            \paragraph{Choice of Abstraction Granularity.}
                Plot~\ref{plt:results:error-vs-nAbs-plot-equalDistQB4-iB-5} highlights that, for the well performing equalDistQB4 scheme, the greater the granularity of abstractions allotted (namely, the greater the $nAbs$), generally the better it will perform.  This result generally holds true for the better performing abstraction functions (with a caveat for minVar, presumably because when the granularity becomes too large its abstractions take excessively long, leaving little time to draw samples).


            % \subsubsection{Best Abstraction Parameterization}
            %     In summary, we suggest use of equalDistQB4 with as high of i-bound as possible, and then with the finest granularity possible (ie. greatest possible $nAbs$), that will allow for samples to be drawn given time and memory considerations.




    \section{Conclusion} \label{sec:conlcusion}

        \paragraph{Summary.}

        \paragraph{Suggested Extensions.}


        
\clearpage
    % \bibliographystyle{named}
    \bibliography{ref}




\end{document}