%%%% ijcai24.tex

\typeout{IJCAI--24 Instructions for Authors}

% These are the instructions for authors for IJCAI-24.

\documentclass{article}
\pdfpagewidth=8.5in
\pdfpageheight=11in

% The file ijcai24.sty is a copy from ijcai22.sty
% The file ijcai22.sty is NOT the same as previous years'
\usepackage{ijcai24}

% Use the postscript times font!
\usepackage{times}
\usepackage{soul}
\usepackage{url}
\usepackage[hidelinks]{hyperref}
\usepackage[utf8]{inputenc}
\usepackage[small]{caption}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}
% \usepackage{algorithm}
% \usepackage{algorithmic}
\usepackage[switch]{lineno}

% Comment out this line in the camera-ready submission
\linenumbers

\urlstyle{same}

% the following package is optional:
%\usepackage{latexsym}

% See https://www.overleaf.com/learn/latex/theorems_and_proofs
% for a nice explanation of how to define new theorems, but keep
% in mind that the amsthm package is already included in this
% template and that you must *not* alter the styling.
% \newtheorem{example}{Example}
% \newtheorem{theorem}{Theorem}

% Following comment is from ijcai97-submit.tex:
% The preparation of these files was supported by Schlumberger Palo Alto
% Research, AT\&T Bell Laboratories, and Morgan Kaufmann Publishers.
% Shirley Jowell, of Morgan Kaufmann Publishers, and Peter F.
% Patel-Schneider, of AT\&T Bell Laboratories collaborated on their
% preparation.

% These instructions can be modified and used in other conferences as long
% as credit to the authors and supporting agencies is retained, this notice
% is not changed, and further modification or reuse is not restricted.
% Neither Shirley Jowell nor Peter F. Patel-Schneider can be listed as
% contacts for providing assistance without their prior permission.

% To use for other conferences, change references to files and the
% conference appropriate and use other authors, contacts, publishers, and
% organizations.
% Also change the deadline and address for returning papers and the length and
% page charge instructions.
% Put where the files are available in the appropriate places.


% PDF Info Is REQUIRED.

% Please leave this \pdfinfo block untouched both for the submission and
% Camera Ready Copy. Do not include Title and Author information in the pdfinfo section
\pdfinfo{
/TemplateVersion (IJCAI.2024.0)
}

\title{IJCAI--24 Formatting Instructions}


% % Single author syntax
% \author{
%     Author Name
%     \affiliations
%     Affiliation
%     \emails
%     email@example.com
% }

% Multiple author syntax (remove the single-author syntax above and the \iffalse ... \fi here)
% \iffalse
% \author{
% Bobak Pezeshki$^1$
% \and
% Kalev Kask$^2$\and
% Alexander Ihler$^{2,3}$\And
% Rina Dechter$^4$\\
% \affiliations
% $^1$University of California, Irvine
% \emails
% \{pezeshkb, kkask, ihler, dechter\}@uci.edu,}
% \fi

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM PACKAGES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{caption}
\usepackage{subcaption}
\usepackage{float}
\usepackage{xspace} % package being used for \newcommand to remove extra space
                    %     when a command is invoked without an argument list
\usepackage{textcase}
\usepackage[toc, nopostdot]{glossaries}
% \usepackage{amsmath}
\usepackage{amsthm, amssymb}
\usepackage{mathtools}
\usepackage{enumitem}
\usepackage{refcount}
\usepackage[leftmargin=6pt, vskip=3pt-\parskip]{quoting}
\usepackage[titlenumbered,ruled, linesnumbered]{algorithm2e}
\usepackage{mathrsfs} %for \mathscr
% \usepackage[font=small,labelfont=bf]{caption}
% \usepackage[font=small,labelfont=bf]{subcaption}
% \usepackage[labelfont=bf]{caption}
\usepackage[labelfont=bf]{subcaption}
\usepackage{xcolor}
    \definecolor{darkgreen}{rgb}{0.0, 0.2, 0.13}
    \definecolor{cadmiumgreen}{rgb}{0.0, 0.42, 0.24}
    \definecolor{byzantium}{rgb}{0.44, 0.16, 0.39}
    \definecolor{darkelectricblue}{rgb}{0.33, 0.41, 0.47}
    \definecolor{battleshipgrey}{rgb}{0.52, 0.52, 0.51}
    \definecolor{warmblack}{rgb}{0.0, 0.26, 0.26}
\usepackage{newfloat}
\usepackage{chngcntr}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM COMMANDS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Create a new float environment called plotfigure with its own counter;
% its captions are labelled "Plot".
\DeclareFloatingEnvironment[name=Plot]{plotfigure} 

% Create a new float environment called tablefigure whose captions are
% labelled "Table".
\DeclareFloatingEnvironment[name=Table]{tablefigure} 

% Alias the tablefigure counter to the table counter so that table and
% tablefigure floats share a single numbering sequence.
\makeatletter\let\c@tablefigure\c@table\makeatother 

% Give tablefigure the same float type as table, so that both are treated as
% the same class of floats (and thus placed in the order in which they appear).
\makeatletter\let\ftype@tablefigure\ftype@table\makeatother 

% Upright operator names; the starred form places subscripts underneath the
% operator in display math (like \max and \min).
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\mathchardef\mhyphen="2D % Define a "math hyphen" (character "2D) usable inside math mode

% algorithm2e configuration
% NOTE(review): the \commentstyle definition below is commented out, yet
% \SetCommentSty{commentstyle} refers to it; presumably it is defined
% elsewhere (e.g. in cmds.tex) -- confirm before using algorithm comments.
% \newcommand\commentstyle[1]{\textcolor{cadmiumgreen}{#1}}
\SetCommentSty{commentstyle}
% Declare the "Input"/"Output" header keywords, printed as "input"/"output".
\SetKwInOut{Input}{input}
\SetKwInOut{Output}{output}

% "break" theorem style. amsthm's \newtheoremstyle takes, after the name:
% space above, space below, body font, indent, head font, punctuation after
% the head, space after the head, and a head specification. Here the body is
% italic, the head is bold, and \newline after the head starts the statement
% on its own line.
\newtheoremstyle{break}
  {\topsep}{\topsep}%
  {\itshape}{}%
  {\bfseries}{}%
  {\newline}{}%
\theoremstyle{break}
% Theorem-like environments are numbered within each subsubsection;
% corollary, lemma, and proposition share the theorem counter.
\newtheorem{theorem}{Theorem}[subsubsection]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
% Definitions use their own counter, also numbered within subsubsections.
\newtheorem{definition}{Definition}[subsubsection]

% Load the project macro file cmds.tex (its contents are not visible here).
\input{cmds}
% Typeset glossary link text in bold.
\renewcommand*{\glstextformat}{\textbf}

% Redefine the start of the quote environment as a list whose right margin
% equals its left margin and which adds no extra vertical space (\topsep=0pt).
\renewcommand{\quote}{\list{}{\rightmargin=\leftmargin\topsep=0pt}\item\relax}







%%% for supplemental

% Allow itemize lists to nest up to 9 levels deep (used by the supplemental
% material). Level 1 uses a bullet; levels 2--9 use a centered dot.
\usepackage{enumitem}
    \setlistdepth{9}
    \setlist[itemize,1]{label=$\bullet$}
    \setlist[itemize,2]{label=$\cdot$}
    \setlist[itemize,3]{label=$\cdot$}
    \setlist[itemize,4]{label=$\cdot$}
    \setlist[itemize,5]{label=$\cdot$}
    \setlist[itemize,6]{label=$\cdot$}
    \setlist[itemize,7]{label=$\cdot$}
    \setlist[itemize,8]{label=$\cdot$}
    \setlist[itemize,9]{label=$\cdot$}
    % Re-declare itemize with a maximum depth of 9.
    \renewlist{itemize}{itemize}{9}






%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\setcounter{secnumdepth}{3} %May be changed to 1 or 2 if section numbers are desired.
\setcounter{tocdepth}{3}

\title{Abstraction Sampling with Heuristic-Based, HR-Based,\\ and Proposal-Based Abstraction Functions}


\input{gls}


\begin{document}
    % \onecolumn
    \setlength{\abovedisplayskip}{3pt}
    \setlength{\belowdisplayskip}{3pt}

    \maketitle
    
    \begin{abstract}
        Monte Carlo methods have proven to be powerful tools for solving a wide range of computational problems, including those involving complex probability distributions. Despite their versatility, these methods often suffer from computational inefficiencies, especially when dealing with rare events. As such, importance sampling emerged as a prominent technique for alleviating these challenges. Recently, a new scheme called Abstraction Sampling was developed that incorporates stratification into importance sampling over graphical models, helping to improve estimates further. Nevertheless, work on Abstraction Sampling to date has explored only a handful of abstraction functions that guide the stratification. This work expands the set of general abstraction functions for AND/OR Abstraction Sampling by introducing three new classes of abstraction functions combined with seven distinct partitioning schemes, resulting in a total of twenty-one new abstraction functions. These are motivated by theory and intuition, which we expand upon, and are evaluated through an extensive empirical analysis on over 400 benchmarks.
    \end{abstract}

    % \vfill\eject
    % \tableofcontents
    
    % \clearpage
    \section{Introduction} \label{sec:introduction}


        
    \section{General Background} \label{sec:background}


%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%

        \subsection{Graphical Models}
            
            % \begin{figure}[]
            %     \centering
            % 	\includegraphics[scale=0.25]{images/AncestorBranchingMass.pdf}
            % 	\caption{Ancestor branching mass of an AND node.}
            % 	\label{fig-ancestor-branching-mass}
            % \end{figure}
    
            % % \begin{comment}
            % \begin{figure}[]
            %     \centering
            % 	\includegraphics[scale=0.25]{images/ProperAbstractionGroups.pdf}
            % 	\caption{Scope of proper abstractions.}
            % 	\label{fig-proper-abstraction-groups}
            % \end{figure}
            % % \end{comment}
            
            A {\bf graphical model}, such as a Bayesian or Markov network \cite{pearl88,darwiche-book,DBLP:series/synthesis/2013Dechter}, can be defined by  a 3-tuple
            $\mathcal{M} \! = \! (\mathbf{X,D,F})$, where
            $\mathbf{X} \! = \! \{X_i \! : i \! \in \! \I\}$
            is a set of variables indexed by a set $\I$,
            and $\mathbf{D} \! = \! \{D_i \! : i \! \in \! \I\}$
            is the set of finite domains of values for each $X_i$. Each function $f_{\alpha} \in   \mathbf{F}$ is defined over a subset of the variables
            called its scope, $X_{\alpha}$, %\subseteq X$, also  denoted $scope(f_{\alpha})$
            where  $\alpha \subseteq \I$ are  the indices of  variables in its scope  and $D_{\alpha}$ denotes  the Cartesian product of their domains, so that %
            % Namely,
            $f_{\alpha} \! : D_{\alpha} \! \rightarrow \mathbb{R}^{\geq 0}$. The {\bf primal graph} $\mathcal{G} \! = \! (\mathbf{V,E})$ of $\mathcal{M}$ associates each variable with a node ($\mathbf{V} \! = \! \mathbf{X}$), while arcs $e \! \in \! \mathbf{E}$ connect nodes whose variables appear in the scope of the same function.
            %We define $scope(F) = \{\alpha | f_{\alpha} \in F \}$.
            %  $\mathbf{F} = \{f_{\alpha} : \alpha \in scopes(F)\}$ is a set of discrete functions, where $\alpha \subseteq V $ and
            %$X_\alpha \subseteq X$ is the scope of $f_\alpha$.
            %Graphical models can be used to represent a global function, often a probability distribution on $\mathbf{X}$,
            $\mathcal{M}$ defines a global function, often a factorized probability distribution on $\mathbf{X}$,
            $P(\mathbf{X}) = \frac{1}{Z} \prod_{\alpha}f_\alpha(X_\alpha)$, where 
            $
            Z = \sum_X \prod_{\alpha}
            f_\alpha(X_\alpha)
            $, known as the partition function, is a normalization factor.

            
        \subsubsection{Search Spaces of Graphical Models} 
            A graphical model can be transformed into a weighted state space graph.
            In an OR search space, which is constructed layer-by-layer relative to a variable ordering, paths from the root to the leaves represent full configurations---or assignments to all variables---where each successive level corresponds to an assignment of the next variable in the ordering.
            A graphical model can also be transformed  into a more compact AND/OR search space  by capturing its conditional independencies, % in the  model,
            thus facilitating more effective algorithms \cite{DBLP:journals/ai/DechterM07}.
            
            An AND/OR search space is defined relative to a \emph{pseudo tree} of a primal graph.	A {\bf pseudo tree} $\mathcal{T} \! = \! (\mathbf{V,E'})$ of a  primal graph $\mathcal{G} \! = \! (\mathbf{V,E})$ is a directed rooted tree that spans $\mathcal{G}$ such that every arc of $\mathcal{G}$ not in $\mathbf{E'}$ is a back-arc in ${\cal T}$ connecting a node to one of its ancestors (Figure \ref{fig-simple}(a),(b)).  A variable is a {\bf branching variable} if it has multiple children in $\mathcal{T}$.
            %The arcs in $E'$ may not all be included in $E$ .  
            
            
            
            
            \begin{figure}[!htb]
            	\centering
            	\begin{subfigure}{0.9\linewidth}
            	\centering
            	       \includegraphics[width=0.8\linewidth]{./_attachments/images/pseudotree.png}
            	\end{subfigure}
                    \begin{subfigure}{0.9\linewidth}
            	\centering
                        \includegraphics[width=0.8\linewidth]{./_attachments/images/AncestorBranchingMass.png}
                    \end{subfigure}
            	\caption{A full AND/OR tree representing 16 possible solutions guided by the pseudo tree shown above. Boxed in green is the ancestor branching sub tree for the path $\rightarrow \!\! (A \!\! = \!\! 0) \!\! \rightarrow  \!\! (C \!\! = \!\! 1)$.}
                        \label{fig-ancestor-branching-mass}
            \end{figure}

            
            Given a
            pseudo tree $\mc{T}$ of a primal graph $\mathcal{G}$, the \emph{AND/OR search tree}
            $T_{\mc{T}}$ guided by $\mc{T}$ has alternating levels of OR nodes
            corresponding to variables, and AND nodes corresponding to
            an assignment from  its domain  with edge costs extracted from
            the original functions \cite{DBLP:journals/ai/DechterM07}. %(By this logic, we can think of the nodes of an OR tree as AND nodes).  
            Let $n$ be an AND node in $T_{\mc{T}}$, also denoted $n_X$ if $X$ is the last variable of its partial configuration.
            Each arc into an AND node $n$ %(or the arc from its OR parent to the AND node)
            has a cost $c(n)$ defined to be the product of all factors $f_{\alpha}$ in $\mathcal{M}$ that are instantiated at $n$ but not before.
            % \textcolor{red}{Moved to section "Value of A Node": (see Figure \ref{fig-simple}(c)).}                                            
            \subsection{Notation}
                When not otherwise stated, capital letters ($X$) represent variables and small letters ($x$) represent their values.  (An exception is when using $n$, which we use to represent search nodes).  Boldfaced letters represent a collection. For example,
                boldfaced capital letters ({\bf X}) denote a collection of variables,
                $|{\bf X}|$ its cardinality, 
                $D_{\X}$ their joint domain (i.e., all possible configurations of \X), 
                and bolded $\xx$ a particular realization in that joint domain (i.e., a particular configuration of \X).

                In the context of search, $path(n)$ is the partial configuration corresponding to assignments to variables according to the path from the root of $T_{\mc{T}}$ to $n$, and $g(n)$ is the cost of $path(n)$.
            

                \subsubsection{$Z(n)$} \label{sec:partition-function-of-a-node}
                    We define $Z(n)$ recursively as: 
                    \begin{equation} \label{eq:and-or-z-prod}
                        Z(n_X) = \prod_{Y \in ch_{\cal T}(X)} Z(Y_{n_X})
                    \end{equation}
                    where
                    \begin{equation}
                        Z(Y_{n_X}) = \sum_{n_Y \in ch_Y(n_X)}  c(n_Y) \cdot Z(n_Y)
                    \label{eq2}
                    \end{equation}
                    and where $ch()$ denotes child variables either in the pseudo-tree or the search tree itself (depending on the context). Here, $ch_Y(n_X)$ are specifically the child AND nodes of $Y$ descended from AND node $n_X$.  Thus, $Z(r)$ equals the partition function $Z$ of the underlying full model (see Figure \ref{fig-simple}c). We denote sampling estimations of $Z(n)$ as $\hat{Z}(n)$.  Heuristic estimates of functions $Z(n)$ are denoted as $h(n)$.
    
                \subsubsection{$R(n)$} \label{sec:ancestor-branching-mass}
                     On the path from the root of $T_{\mc{T}}$ to some $n_{X}$, there may be an intermediate node $n_{B}$ such that its associated variable $B$ is a branching variable in \PT.  Whenever this happens, the remaining variables of the model are split between the branches, and thus no single branch captures all the remaining variables. $R(n)$, or the \textbf{ancestor branching mass}, captures the $Z(n_{Y})$ for all $Y$ that branch off of the path to $X$. In Figure \ref{fig-ancestor-branching-mass}, the dotted green box shows the portion of the search space corresponding to the $R(n)$ for the node highlighted in red.

                     More formally, let $branchings(n_{X})$ be the set of nodes $n_{B}$ on the path to $n_{X}$ such that $B$ is a branching variable in \PT. For each such $n_{B}$, let $W$ be the child of $B$ that is on the path to $X$.  We define $R(n_{X})$ as:
                     \begin{align}
                         \label{eq4}
                         R(n_{X}) =   \prod_{n_{B} \in branchings(n_{X})} \frac{Z(n_{B})}{ Z(W_{n_{B}})}
                     \end{align}


                     \textbf{Example.} In Figure \ref{fig-ancestor-branching-mass}, consider the path from the root to the node $n \! = \! (A \!\! = \!\! 0,C \!\! = \!\! 1)$ marked in red. Following under $(A=0)$ to our node, we see there is a node of variable $B$ that branches off of the path.
                     Thus, $Q(n_{A=0,C=1})$
                     \begin{align}
                        &= g(n_{A=0,C=1}) \! \cdot \! R(n_{A=0,C=1}) \! \cdot \! Z(n_{A=0,C=1}) \\
                        &= g(n_{A=0,C=1}) \! \cdot \! Z(B_{n_{A=0}}) \! \cdot \! Z(n_{A=0,C=1})
                     \end{align}

                     We denote approximations to $R(n)$ as $r(n)$.
                
                \subsubsection{$Q(n)$} \label{sec:q-of-a-node}
    
                    Putting it all together, we can now concisely define a quantity $Q(n)$ as the contribution to $Z$ from all full configurations consistent with $path(n)$. In other words, $Q(n)$ is the unnormalized probability of the configuration $path(n)$ based on the distribution defined by \M, with $P(path(n)) = \frac{Q(n)}{Z}$.  $Q(n)$ can be computed simply as:
                    \begin{align}
                        Q(n) = g(n)  \! \cdot \!  R(n)  \! \cdot \!  Z(n)
                    \end{align}
                    We denote approximations to $Q(n)$ as $q(n)$.
                
             
             
        \subsection{Stratified Importance Sampling}
            Abstraction Sampling builds on Importance Sampling and Stratified Sampling. {\em Importance Sampling} (IS) is  a Monte Carlo scheme for approximating likelihood queries over graphical models.
            %\cite{Rubinstein_2007,DBLP:journals/ai/GogateD11,liu2015probabilistic}.
            {\em Stratified Sampling} is a variance reduction technique for sampling a search space by first dividing it into disjoint strata. This can be used with importance sampling to further reduce variance.
            In {\em Stratified Importance Sampling}, we first divide the sample space into $k$ strata of equal area under the distribution $q$, then choose re-weighted representatives from each stratum. %, and uses these representatives to form an estimator over the entire model. 
            In order to maximize reduction in variance, the variance between strata should be maximized (see \cite{rizzo_2007}).
            
            
            \newcommand{\soltree}{\hat{x}_M}
            \newcommand{\parttree}{\bar x}
            
            
    

%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5


    \section{Abstraction Sampling}\label{sec:abstraction-sampling}

        {\em Abstraction Sampling} (AS) algorithms \cite{DBLP:conf/uai/BrokaDIK18} apply concepts of Stratified Importance Sampling to sampling over probabilistic graphical models.  
        %An abstraction event in Abstraction Sampling is analogous to sampling representatives from strata in stratified importance sampling and reweighing to account for the rest of the members that were not chosen.  
        Guided by an abstraction
        function $a(.)$ that dictates how nodes of a variable should be partitioned into \textbf{abstract states} (analogous to strata in stratified sampling), Abstraction Sampling iteratively expands a search tree variable by variable, uses $a(.)$ to group nodes into abstract states, and uses an importance-sampling-like process to select an individual from each abstract state and reweight it to account for the other members that were in its abstract state.  The chosen nodes can then be expanded, and this results in the generation of a  sub tree of the full search tree $T_{\mc{T}}$ (called a {\bf probe}) as a sample.
            
        \subsection{AOAS}
            Taking Abstraction Sampling further, \cite{kask20-scaling-up-as} introduced the AOAS algorithm (\textbf{A}ND/\textbf{O}R \textbf{A}bstraction \textbf{S}ampling), which applied abstraction sampling to AND/OR search spaces and significantly improved performance over the previous version. AOAS used a proposal distribution $p(n) \propto q(n) = w(n) \! \cdot \! g(n) \! \cdot \! h(n) \! \cdot \! r(n)$ (see Figure \ref{fig:proposal}), where $g(n)$ is computed exactly, $Z(n)$ is approximated by $h(n)$, $R(n)$ is estimated by $r(n)$, and a weight $w(n)$ is applied to account for the nodes abstracted into the path to $n$. An overview of the algorithm can be seen in Algorithm \ref{alg:aoas-overview}; a more detailed version and a sample trace taken from \cite{kask20-scaling-up-as} can be found in the Supplemental Materials.

        
        \begin{algorithm}[t!]
              \caption{AOAS Overview}
                \label{alg:aoas-overview}
        
            \begin{enumerate}
                \item \textbf{Initialization:}
                    Begin with a dummy root node $r$.
                \item \textbf{Probe Generation:}
                    Proceeding in a DFS manner according to a pseudo tree $\PT$...
                    \begin{enumerate}
                        \item \textbf{Expansion:} \label{alg:aoas-overview:expansion}
                            Generate children nodes $n$ corresponding to the next variable in the DFS ordering of $\PT$. Inherit $w(n)$ from parents and assign appropriate $g(n), h(n), \tn{and } r(n)$ values.
                        \item \textbf{Abstraction:} \label{alg:aoas-overview:abstraction}
                            \begin{enumerate}
                                \item \textbf{Form Abstract States:}
                                    Using $a(.)$, partition newly expanded nodes into abstract states.
                                \item \textbf{Select Representative:}
                                    Using the proposal $p(.)$ defined, select a representative from each abstract state and reweight it such that $w(n) \leftarrow \frac{w(n)}{p(n)}$.
                            \end{enumerate}
                        \item \textbf{Backtrack:} \label{alg:aoas-overview:backtracking}
                            After reaching a leaf in $\PT$, recursively backtrack until reaching nodes of the next unexplored branch of $\PT$. While backtracking, update parent node $\hat{Z}(n')$ estimates based on children's $w(n), g(n),$ and $\hat{Z}(n)$ values.
                        \item \textbf{Repeat:}
                            Repeat steps \ref{alg:aoas-overview:expansion}--\ref{alg:aoas-overview:backtracking} until having backtracked all the way to the root node.
                    \end{enumerate}
                \item \textbf{Return:}
                    $\hat{Z} = w(r)\,\hat{Z}(r)$ for the root node $r$.
            \end{enumerate}
        \end{algorithm}



        \begin{figure}[!htb]
            \centering
            \includegraphics[width=0.8\linewidth]{./_attachments/images/proposal.png}
            \caption{The $q(n)$ visualized to show it estimating the mass of nodes previously abstracted (via $w(n)$), the ancestor branching mass (via $r(n)$), the current path cost (via $g(n)$), and the sub tree mass (via $h(n)$).}
                    \label{fig:proposal}
        \end{figure}

        
        \subsection{Existing Abstraction Functions} \label{sec:abstraction-sampling:existing-abstraction-functions}

            \cite{DBLP:conf/uai/BrokaDIK18} designed abstractions based on assignments to a variable's context $C(X)$, where $C(X)$ is a subset of its ancestors in a pseudo-tree $\cal T$ whose assignment uniquely determines the AND/OR subtree below it, i.e., its $Z(n)$ \cite{DBLP:journals/ai/DechterM07}.  Thus, abstractions 
            based on a subset of the context aim to group nodes based on having similar $Z(n)$ values. However, the number of possible assignments to the context, $|\D_{C(X)}|$, is exponential in the context size, making the full context infeasible to use in its entirety. Thus, \cite{DBLP:conf/uai/BrokaDIK18,kask20-scaling-up-as} utilize relaxed context-based (\textbf{RelCB}) and randomized context-based (\textbf{RandCB}) abstractions as in \cite{DBLP:conf/uai/BrokaDIK18}.  
            
            RelCB is parameterized by a level $j$, selecting the closest $j \! - \! 1$ variables from a variable's context (i.e., its {\em relaxed context}) plus itself. It abstracts nodes of the same domain value that also share the same assignment to the relaxed context. Assuming a domain size of $k$, this yields at most $k^j$ abstract states at each level. 
            
            The randomized scheme, RandCB, considers the entire context, but is parameterized by a parameter $\mathit{nAbs}$ constraining the number of abstract states per level into which nodes can be placed.  Each of the $\mathit{nAbs}$ abstract states is randomly associated with a set of possible full context assignments defining the nodes that will belong to that state.



    \section{Paradigms Intuiting Abstraction Strategies} \label{sec:paradigms}

        Next we review concepts from search and sampling that offer paradigms from which we draw ideas for abstraction functions.

        \subsection{Search Paradigms} \label{sec:paradigms:search}

            In [tree] search, one can merge nodes that have the same value to produce a more efficient graph search \cite{DBLP:journals/jair/MateescuDM08}. Abstraction functions by \cite{DBLP:conf/uai/BrokaDIK18} focused on this paradigm and approached it by using the concept of a node's context---the assignments to the smallest subset of a node's ancestor variables that dictates its value.  Due to the potentially large context size for variables, and consequently the exponentially large number of combinations of assignments to the context, the full context of variables could not be used in most cases.  \cite{DBLP:conf/uai/BrokaDIK18} resolved this by creating two context-based abstraction functions that were relaxed to allow nodes with different contexts to be grouped in the same abstract state.  However, sharing the same partial context does not necessarily imply the same, nor even similar, node values. Our new Heuristic-Based abstractions (Section \ref{sec:value-based-abstraction-classes:HB}) aim to provide more accurate abstractions based on the same idea.


        \subsection{Sampling Paradigms} \label{sec:paradigms:sampling}

            % From \cite{DBLP:conf/uai/BrokaDIK18}, we learn that when abstraction sampling over a classical OR search space, we can say:
            % \begin{theorem}[Exact Proposal] \label{thm:old-exact-proposal}
            %     If the proposal function $p$ in AS uses an exact heuristic $h(n)=Z(n)$, then $\hat{Z}$ has zero variance (single probe is exact), for any abstraction.
            % \end{theorem}

            Consider computing the expectation $\mathbb{E}_{p^{*}}[f(x)] = \sum_{x} f(x) p^{*}(x)$ of a positive-valued function $f(x)$ under a distribution $p^{*}(.)$ over a variable $X$ that is difficult to sample from but easy to evaluate.  Using a proposal distribution $p(.)$ that is easy to sample from, and noticing the equivalence of the target quantity with $\sum_{x} \frac{f(x)p^{*}(x)}{p(x)} p(x)$, we can estimate the quantity by importance sampling, drawing $m$ samples to estimate the equivalent quantity $\mathbb{E}_{p}[f(x)\frac{p^{*}(x)}{p(x)}] \approx \frac{1}{m}\sum_{j=1}^{m} f(x^{(j)}) \frac{p^{*}(x^{(j)})}{p(x^{(j)})}, x^{(j)} \!\! \stackrel{\text{iid}}{\sim} \!\! p$.  It is well known that importance sampling achieves zero variance when 1) $p(x)=0 \implies p^{*}(x)=0$, and 2) otherwise $p(x)$ is proportional to $p^{*}(x)f(x)$ \cite{KahnAndMarshall1953-variance-reduction,mcbook}.  
            
            \begin{lemma}[Importance Sampling Exact Proposal Based on Proportionality with Target Distribution]
                Given a distribution $p^{*}(.)$ over a variable $X$ that is easy to evaluate, and given a positive-valued function $f(x)$, importance sampling to estimate $\mathbb{E}_{p^{*}}[f(x)]$ achieves zero variance when using a proposal function $p(.)$ such that 1) $p(x)=0 \implies p^{*}(x)f(x)=0$, and 2) $p(x) \propto p^{*}(x)f(x)$ otherwise.
            \end{lemma}

            Note that we can also use importance sampling simply to estimate a sum: $\sum_{x} f(x) = \sum_{x} \frac{f(x)}{p(x)} p(x) = \mathbb{E}_{p}[ \frac{f(x)}{p(x)}] \approx \frac{1}{m}\sum_{j=1}^{m} \frac{f(x^{(j)})}{p(x^{(j)})}, x^{(j)} \!\! \stackrel{\text{iid}}{\sim} \!\! p$.  The partition function over a graphical model, $Z = \sum_{\xx} \F(\xx)$ with $\F(\xx)=\prod_{f\in \F} f(\xx)$, has the form of this task.
            
            In fact, expanding an AND/OR search tree level-by-level, the partition function $Z$ with respect to the nodes $n$ at any variable $X$ can be written as $Z = \sum_{n} g(n) Z(n) R(n)$.  Thus, using a proposal $p(.)$ to perform importance sampling at any level we could instead estimate 
            \begin{align}
                Z &= \sum_{n} g(n) Z(n) R(n) = \sum_{n} \frac{g(n) Z(n) R(n)}{p(n)} p(n)
                \\
                % &= \mathbb{E}_{p}[g(n) Z(n) R(n)\frac{1}{p(n)}]
                % \\
                &\approx \frac{1}{m}\sum_{j=1}^{m} \frac{g(n^{(j)}) Z(n^{(j)}) R(n^{(j)})}{p(n^{(j)})},
                % \\
                % &= \frac{1}{m}\sum_{j=1}^{m} w(n^{(j)}) g(n^{(j)}) Z(n^{(j)}) R(n^{(j)}),\\
                % &\hspace{3cm}w(n^{(j)}) = \frac{1}{p(n^{(j)})}, 
                n^{(j)} \! \stackrel{\text{iid}}{\sim} \! p
            \end{align}
            Thus, sampling at any level would also allow for zero variance / exact computation if similarly $p(n) \propto g(n) Z(n) R(n)$.
            
            Note that in Abstraction Sampling each abstract state involves a node selection procedure analogous to importance sampling and that AOAS uses a proposal $p(n) \propto g(n) h(n) r(n)$.  The path cost $g(n)$ can always be evaluated exactly.  Then, assuming that $h(n)=0 \implies Z(n)=0$ and $r(n)=0 \implies R(n)=0$, it naturally follows that by designing each abstract state $\bs{A_{i}}$ such that $\forall n \in \bs{A_{i}}, h(n)r(n) = \alpha\, Z(n) R(n)$, for some constant $\alpha$, we similarly achieve zero variance. 


            \begin{definition}[Abstraction Function $h(n)r(n)$ vs. $Z(n)R(n)$ Proportionality]
                An abstraction function $a(n)$ maintains $h(n)r(n)$ vs. $Z(n)R(n)$ proportionality if, for every abstract state $A_i$ formed by $a(n)$, $\forall n \in A_i, h(n)r(n) = \alpha \, Z(n)R(n)$, for some constant $\alpha$ specific to $A_i$.
            \end{definition}

            \begin{definition}[Exact Abstraction Function]
                 An abstraction function $a(.)$ is exact for an abstraction sampling algorithm, AS, if use of $a(.)$ with AS always leads to AS estimates having zero variance and $\hat{Z} = Z$ for every AS probe.
            \end{definition}
            
            Thus, we can say:
            \begin{theorem}[AOAS Exact Abstractions from $h(n)r(n)$ vs. $Z(n)R(n)$ Proportionality] \label{thm:aoas-proportionality-exact-proposal}
                  If an abstraction function $a(.)$ maintains $h(n)r(n)$ vs. $Z(n)R(n)$ Proportionality, then it is an exact abstraction function for AOAS. (Proof in Supplemental Materials)
            \end{theorem}



            Normally we neither have access to the proportionality constant 
            %$\frac{h(n)r(n)}{Z(n)R(n)}$,
            $\alpha$
            nor even know whether nodes have the same 
            %$\frac{h(n)r(n)}{Z(n)R(n)}$,
            $\alpha$.  However, one idea is to use the magnitude of $h(n)r(n)$ itself as a heuristic for similarities in $\alpha$.  This drives the intuition for the new HR-Based abstractions (Section \ref{sec:value-based-abstraction-classes:HRB}).

            Also from a sampling perspective, \cite{rizzo_2007} showed the following about stratified importance sampling when sampling from equal area strata under the proposal:
    
            \begin{proposition}[Stratified Importance Sampling Variance Reduction] \label{prop:rizzo-variance-reduction}
                Suppose that $M = mk$ is the number of replicates for an importance sampling estimator $\hat{\theta^{I}}$, and $\hat{\theta^{SI}}$ is a stratified importance sampling estimator, with estimates $\hat{\theta_{j}}$ for $\theta_{j}$ on the individual strata, each with $m$ replicates.  If $Var(\hat{\theta^{I}}) = \sigma^{2} / M$ and $Var(\hat{\theta_{j}}) = \sigma^{2}_{j} / m$, $j = 1, ..., k$, then
                \begin{align}
                    \sigma^{2} - k \sum^{k}_{j=1} \sigma^{2}_{j} \geq 0,
                \end{align}
                with equality if and only if $\theta_{1}=...=\theta_{k}$.  Hence stratification never increases variance, and there exists a stratification that reduces the variance except when [the proposal function] $g(x)$ is constant.
            \end{proposition}
    
            Two takeaways from this proposition are that 1) we can achieve variance reduction with respect to importance sampling (analogous to Abstraction Sampling with all nodes placed into a single abstract state) by stratifying into equal area strata under the proposal, and 2) reducing the variance $\sigma^{2}_{j}$ of each stratum leads to greater variance reduction.  These help drive the intuition for the new Q-Based (proposal-based) abstraction class presented in Section \ref{sec:value-based-abstraction-classes:QB}, as well as motivate several of the abstraction schemes presented in Section \ref{sec:ordered-partitioning-schemes}.


        \subsection{Combined Paradigms}\label{sec:paradigms:combined}
            With a helpful heuristic, in both the search and sampling domains there are notions of potential benefit by spending more time in optimistic areas of the search/sampling space.  In heuristic search, this corresponds to proceeding in an order that prioritizes expansion of nodes believed to have higher values first.  And similarly in sampling, among many schemes it is beneficial to spend the most effort sampling high impact events. We use these perspectives to motivate the new abstraction schemes that will be described in Section \ref{sec:ordered-partitioning-schemes}.


    \section{Value-Based Abstraction Classes} \label{sec:value-based-abstraction-classes}

        We introduce three new classes of abstraction functions that each define a unique notion of similarity between nodes based on a unique value measurement on a positive scale.  These value measurements, which we refer to as \textbf{\textit{abstraction values}} denoted $\mu(.)$, are used as a measure of similarity by abstraction functions to abstract similar nodes together.  The three classes we present are: Heuristic-Based (HB), HR-Based (HRB), and Q-Based (QB) abstraction classes.  Each is motivated by theory in search or sampling discussed in Section \ref{sec:paradigms}, and each can be used with the various abstraction schemes to come in Section \ref{sec:ordered-partitioning-schemes}; a class and a scheme together form an abstraction function.

    
        \subsection{Heuristic-Based Abstractions} \label{sec:value-based-abstraction-classes:HB}
        
            \begin{quote}
                $\mu(n) = h(n)$
            \end{quote}
            
            Using the motivation of abstracting nodes with similar subtree $Z(n)$ intuited from the search domain, we propose associating an abstraction value $\mu(.)$ to each node based on the heuristic estimate $h(n)$ of a node's $Z(n)$.  Unlike the use of partial (or hashed) contexts as was used by \cite{DBLP:conf/uai/BrokaDIK18}, heuristic estimates of $Z(n)$ can often provide \textit{quantitative} insight into potential similarities in $Z(n)$ values, and this is particularly true when using wMBE heuristics, which provide bounds.

            In conjunction with the schemes that will be presented in Section \ref{sec:ordered-partitioning-schemes}, HB algorithms aim to form abstractions such that nodes with similar $Z(n)$ are grouped together.


        \subsection{HR-Based Abstractions} \label{sec:value-based-abstraction-classes:HRB}

            \begin{quote}
                $\mu(n) = h(n)  \! \cdot \!  r(n)$
            \end{quote}

            As demonstrated in Theorem \ref{thm:aoas-proportionality-exact-proposal}, similarity of $\alpha = \frac{h(n)r(n)}{Z(n)R(n)}$ among nodes in the same abstract state can lead to reduction in variance.  Although this ratio is infeasible to compute, we can use $\mu(n) = h(n)r(n)$ as a surrogate for $\alpha$ and group nodes accordingly.
            
            In conjunction with the schemes that will be presented in Section \ref{sec:ordered-partitioning-schemes}, HRB algorithms aim to form abstractions such that nodes with similar $\alpha = \frac{h(n)r(n)}{Z(n)R(n)}$ are grouped together in hopes of driving down variance within the abstract states.


        \subsection{Q-Based Abstractions} \label{sec:value-based-abstraction-classes:QB}

            \begin{quote}
                $\mu(n) = w(n) \! \cdot \! g(n) \! \cdot \! h(n) \! \cdot \! r(n)$
            \end{quote}

            On the other hand, \cite{rizzo_2007} showed the potential for variance reduction by minimizing variance within strata when stratifying based on the proposal distribution.  Thus, in Q-Based Abstractions we use the quantity $q(n) = w(n)g(n)h(n)r(n) \propto p(n)$ \cite{kask20-scaling-up-as} as the value function.

            In addition to serving as an unnormalized proposal function for a node $n$, $q(n)$ also estimates $n$'s
            contribution to the overall $Z$. Therefore, $q(n)$ estimates the impact of $n$ (and all of the nodes $n$ represents as the selected representative from previous abstractions) on $Z$, which could be a valuable quantity to base our choice of nodes on as discussed in Section \ref{sec:paradigms:combined}.
            

                








            
    
    
    
    
    
        % \subsection{Proposal Based Abstractions} \label{sec:q-based-abstractions}
        %     As a main algorithmic contribution of this work, we introduce a new class of abstractions based on proposal estimates.  We will motivate this new class of abstractions with theory, and then describe three schemes that were developed as a result.
            
        %     \subsubsection{Motivation} \label{sec:q-based-abstractions:motivation}
    
                
    
                
                 
        %     \subsubsection{\NoCaseChange{simpleQB}} \label{sec:q-based-abstractions:SimpleQB}
    
    
                
    
        %     \subsubsection{\NoCaseChange{simpleQB}} \label{sec:q-based-abstractions:SimpleQB}
    
    
    
    
        %         \begin{itemize}
        %             \item
        %                 \textbf{randQB}: nodes are ordered by $q$ and then partitioned into $nAbs$ abstract states of random sizes
    
        %             \item
        %                 \textbf{minVarQB}: nodes are partitioned into $nAbs$ abstract states using Ward's Minimum Variance hierarchical clustering
        %             \item
        %                 \textbf{equalDistQB}: nodes are ordered by $q$ (from \emph{low $q$ to high $q$}) and placed into abstract states based on $nAbs$ quantiles (with respect to the total $q$ of nodes already assigned abstract states).  If a quantile has not been reached or surpassed, the next node in the ordering is added into the current abstract state.  If the current quantile is surpassed, the next abstract state and quantile are instead considered.
        %             \item
        %                 \textbf{equalDistQB2}: nodes are ordered by $q$ (from \emph{high $q$ to low $q$}) and placed into abstract states based on $nAbs$ quantiles (with respect to the total $q$ of nodes already assigned abstract states).  If a quantile has not been reached or surpassed, the next node in the ordering is added into the current abstract state.  If the current quantile is surpassed, the next abstract state and quantile are instead considered.
        %             \item
        %                 \textbf{equalDistQB3}: nodes are ordered by $q$ (from \emph{high $q$ to low $q$}) and placed into abstract states based on $nAbs$ quantiles (with respect to the total $q$ of nodes already assigned abstract states) with the caveat that at least one node is added to each abstract state.  If a quantile has not been reached or surpassed, the next node in the ordering is added into the current abstract state.  If the current quantile is surpassed, the next abstract state and quantile are instead considered and the next node in the ordering added to that abstract state by default.
        %             \item
        %                 \textbf{equalDistQB4}: nodes are ordered by $q$ (from \emph{high $q$ to low $q$}) and placed into the current abstract state until the accumulation on $q$ of the added nodes is equal to or exceeds the $1/nAbs_{remaining}$ quantile considering the total $q$ of nodes in the current abstract state and nodes yet to be assigned.
        %         \end{itemize}
            
        %     \subsubsection{MinVarQB} \label{sec:q-based-abstractions:MinVarQB}
        %     \subsubsection{EqualDistQB} \label{sec:q-based-abstractions:EqualDistQB}
        %         From the performance of the previous two abstraction functions an their analysis as seen in Section \ref{sec:empirical-evaluation}, it was observed that the more computationally intensive MinVarQB abstraction function was producing probes with better estimates where as SimpleQB was producing better overall estimates (presumably due to its speed enabling many more probes).  Thus, a third heuristic based abstraction, EqualDistQB, was created inspired by the simplicity and speed of SimpleQB and, in a greedy way, also attempting to minimize variance of the heuristic values of the probes in the resulting abstract states.
    
        %         EqualDistQB works by 
                
        %         The variance reduction proven for stratified importance sampling by \cite{rizzo_2007} assumed that each stratified layer had an equal mass under the proposal distribution \todo{double check that it is the proposal and not true}.  Thus, to approach emulation of such a scenario at each level of abstraction, we can sort nodes
    
    
    
    
    \section{Ordered Value-Based Abstraction Functions} \label{sec:ordered-value-based-abstraction-functions}

        Consider the following condition for abstract states with respect to an abstraction value function $\mu(.)$ and sort order $o$:

        \begin{definition}[Value Ordered Abstractions]
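            Abstractions $\bs{A}$ of nodes $\bs{n}$ are considered value ordered with respect to abstraction value function $\mu(.)$ and sort order $o$ if, for all $A_{i},A_{i'} \in \bs{A}$ with $i < i'$, every $n \in A_{i}$ and every $n' \in A_{i'}$ satisfy $\mu(n) \stackrel{o}{<} \mu(n')$.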
            
        \end{definition}
        
        We first define a new class of abstraction functions, Ordered Value-Based Abstraction Functions:

        \begin{definition}[Ordered Value-Based Abstraction Function]
            An abstraction function $a(.)$ is an ordered value-based abstraction function if it partitions a set of nodes $\bs{n}$ into a set $\bs{A}$ of at most $nAbs$ abstract states $A_{i}$ such that 
            \begin{align}
            \begin{split}
                &1) \bigcup_{A_{i} \in \bs{A}} \! A_{i} \; = \; \bs{n},\\ 
                &2) \hspace{3pt}\forall A_{i},A_{i'} \in \bs{A}, i \neq i', \; A_{i} \cap A_{i'} = \emptyset, \\
                &3) \hspace{3pt}\forall A_{i},A_{i'} \in \bs{A}, i < i', \\
                &\hspace{2.5cm}\forall n \in A_{i}, \forall n' \in A_{i'}, \mu(n) \stackrel{o}{<} \mu(n')
            \end{split}
            \end{align}
            
            % nodes remain sorted across all abstract states according to a provided value function $\mu(.)$ and sort-ordering $o$.
            
            
            % \vspace{-6pt}
            % \begin{itemize}
            %     \item 
            %         takes as input: A set of nodes $\bs{n}$ to be partitioned into abstract states; an abstraction value function $\mu(.)$; a sorting algorithm $SORT(.)$ that sorts $\bs{n}$ according to $\mu(.)$ and sort order $o$; a parameter $nAbs$ bounding the number of abstract states; a partitioning function $\Psi(.)$ that partitions the sorted nodes into abstract states maintaining their order
            %     \item 
            %         outputs: Nodes $\bs{n}$ partitioned into abstract states $\bs{A} = \setst{\bs{A_{i}}}{i<=nAbs}$ such that sort order $o$ of $\mu(n)$ is maintained across all $\bs{A_{i}}$
            % \end{itemize}
        \end{definition}

        \begin{algorithm}[t]
            \caption{$a_{\tn{\textit{ordered value}}}$}
            \label{alg:general-ordered-value-based-abstraction-function}
            \begin{footnotesize}
                \SetInd{0.25em}{0.55em}
                \DontPrintSemicolon 
            \Input{A set of nodes $\bs{n}$ to be partitioned into abstract states; an abstraction value function $\mu(.)$; a sorting algorithm $SORT(.)$ that sorts $\bs{n}$ according to $\mu(.)$ and sort order $o$; a parameter $nAbs$ bounding the number of abstract states; a partitioning function $\Psi(.)$ that partitions the sorted nodes into abstract states maintaining their order}
            \Output{Nodes $\bs{n}$ partitioned into abstract states $\bs{A} = \setst{\bs{A_{i}}}{i \leq nAbs}$ such that sort order $o$ of $\mu(n)$ is maintained across all $\bs{A_{i}}$.}
            
            \Begin{
                \uIf{$|\bs{n}| \leq nAbs$}{
                    $\bs{A} = \setst{\set{n}}{n \in \bs{n}}$\\
                }
                % \uIf{$|\bs{n}| <= m$}{
                %     \tcp{Each node is its own abstract state}
                %     $\bs{A} = \set{}$\\
                %     $nAbs' \leftarrow |\bs{n}|$\\
                %     \ForEach{$i \in \set{1,...,nAbs'}$}{
                %         $\bs{A_{i}} = \set{n_{i}}$\\
                %         $\bs{A} \leftarrow \bs{A} \cup \set{\bs{A_{i}}}$
                %     }
                % }
                \uElse{
                    $\bs{n^{*}} \leftarrow SORT(\bs{n},\mu,o)$\\
                    $\bs{A} = \Psi(\bs{n^{*}}, \mu)$
                }
                \Return $\bs{A}$       
            }
            \end{footnotesize}
        \end{algorithm}

        We provide a generic example of an ordered value-based abstraction function in Algorithm \ref{alg:general-ordered-value-based-abstraction-function}.
        
        % \semph{Complexity of $a_{\tn{\textit{ordered value}}}$.}\hfill\\
        %     $\mathcal{O}(\; \mathcal{O}(SORT(\bs{n},v,o)) + \mathcal{O}(\Psi(\bs{n^{*}}, v) \;)$
            
        Assuming evaluation of the value function $\mu(.)$ is not the dominating cost, the complexity is determined by either the sorting method's complexity or the partitioning complexity, whichever is greater.

        Next we present seven ordered value partitioning schemes that, in conjunction with a provided $\mu(.)$, can be used with Algorithm \ref{alg:general-ordered-value-based-abstraction-function} to define a unique ordered value-based abstraction function.


    \section{Ordered Partitioning Schemes} \label{sec:ordered-partitioning-schemes}

        We now present seven schemes, each defined by a unique sort order $o$ and partition strategy $\Psi$ combination.  Each scheme uses a different method to partition nodes into abstract states keeping the nodes in sort order according to $o$. With a provided value function $\mu(.)$, each scheme can be used to form an ordered value abstraction function.  In addition to defining each scheme, we also describe the motivation behind its creation.

        \paragraph{Running Example} \label{sec:ordered-partitioning-schemes:running-example}  As we motivate and describe the schemes, we will also provide an example of abstract states that would result from partitioning the following nodes:
        \begin{align} \label{eq:running-partitioning-example}
            \set{1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100}
        \end{align}
        into $nAbs=4$ abstract states.
            
        \subsubsection{\NoCaseChange{simpleVB}} \label{sec:ordered-partitioning-schemes:simpleVB}
            \begin{quote}
                $o = \tn{low to high}$\\
                $\Psi = \Part{simpleVB}$ (Algorithm \ref{alg:psi-simpleVB})
            \end{quote}
            
            \begin{algorithm}[t!]
                \caption{$\Part{simpleVB}$}
                \label{alg:psi-simpleVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
                \Output{$\bs{n^{*}}$ partitioned into abstract states\footnotemark{} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that $\forall \bs{A_{i}},\bs{A_{j}} \in \bs{A}, |\bs{A_{i}}|-|\bs{A_{j}}| \leq 1$}
                
                \Begin{
                    $baseCardinality \leftarrow \floor{\frac{|\bs{n^{*}}|}{nAbs}}$\\
                    $extras \leftarrow |\bs{n^{*}}| \bmod nAbs$\\
                    $j_{begin}=1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        \uIf{$extras > 0$}{
                            $j_{end} \leftarrow j_{begin} + baseCardinality$\\
                            $extras \leftarrow extras - 1$
                        }
                        \uElse{
                            $j_{end} \leftarrow j_{begin} + baseCardinality - 1$
                        }
                        $\bs{A_{i}} = \set{n^{*}_{{j_{begin}}}, ..., n^{*}_{{j_{end}}}}$\\
                        $j_{begin} \leftarrow j_{end}+1$
                    }
                    $\bs{A} = \bigcup_{i = 1}^{nAbs} \set{\bs{A_{i}}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm} \footnotetext{\label{ftn:ordered-schemes-maintain-sort-order}Such that nodes maintain sort order $o$ across all abstract states.}

            The simpleVB (simple value-based) scheme follows the motivation of grouping nodes of similar value in the same abstract state by a simple 2-step method: 1) first, nodes are ordered by their abstraction value $\mu(n)$ (low to high), and 2) next the ordered nodes are partitioned into [approximately] equal cardinality abstract states.

            \semph{Time Complexity.}\hfill\\
                Partitioning is achieved via one pass through $\bs{n^{*}}$, leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{1.0, 1.1}, \set{1.2, 1.3}, \set{1.4, 1.5}, \set{10, 100}$
                
            Through its simplicity, this method aims to leverage speed, allowing for abstractions to be formed much more quickly and leading to a greater number of samples.



        \subsubsection{\NoCaseChange{minVarVB}} \label{sec:ordered-partitioning-schemes:minVarVB}

            \begin{quote}
                $o = \tn{low to high}$\\
                $\Psi = \Part{minVarVB}$ (Algorithm \ref{alg:psi-minVarVB})
            \end{quote}
            
            \begin{algorithm}[t!]
                \caption{$\Part{minVarVB}$}
                \label{alg:psi-minVarVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
                \Output{$\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ satisfying $\min \sum_{\bs{A_{i}} \in \bs{A}} Var(\bs{A_{i}}, \mu)$}
                
                \Begin{
                    $\bs{A} = WardsMethod(\bs{n^{*}}, \mu, \tn{Euclidean distance})$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            As mentioned in Section \ref{sec:paradigms:sampling}, Proposition \ref{prop:rizzo-variance-reduction}, \cite{rizzo_2007} showed that in stratified importance sampling minimizing variance of the estimates within individual strata can lead to a reduction in overall variance.

            The minVarVB scheme was designed based on this intuition.  The scheme uses Ward's Minimum Variance Hierarchical Clustering (or Ward's Method, for short) \cite{ward1963} to group nodes into $nAbs$ abstract states so as to minimize variance within each abstract state with respect to the provided value function $\mu(.)$.

            Ward's Minimum Variance Hierarchical Clustering is an agglomerative hierarchical clustering algorithm designed to create a dendrogram by iteratively merging clusters. The primary objective is to minimize the total within-cluster variance. Ward's method works as outlined in Algorithm \ref{alg:wards-method}.
                        
            \begin{algorithm}[t!]
              \caption{Ward's Method}
                \label{alg:wards-method}
              \begin{enumerate}
                \item \textbf{Initialization:} Treat each data point as an individual cluster. Assign each cluster a label or identifier.
                
                \item \textbf{Compute Pairwise Distances:} Calculate the pairwise distances between all clusters. Various distance metrics can be used, such as Euclidean distance.
                
                \item \textbf{Cluster Merging Iteration:} 
                  \begin{enumerate}
                    \item Identify the pair of clusters $\bs{C_{i}}$ and $\bs{C_{j}}$ that, when merged into a new cluster $\bs{C_{ij}}$, results in the smallest increase in the overall within-cluster variance. This is determined using the formula:
                      \[ \Delta Var = Var(\bs{C_{ij}}) - (Var(\bs{C_{i}}) + Var(\bs{C_{j}})) \]
                      where \(Var(\bs{C_{ij}})\) is the variance of the merged cluster, and \(Var(\bs{C_{i}})\) and \(Var(\bs{C_{j}})\) are the variances of clusters $\bs{C_{i}}$ and $\bs{C_{j}}$, respectively.
                    \item Update distance measures between the newly merged cluster and all other clusters.
                  \end{enumerate}
                
                \item \textbf{Repeat:} Repeat steps 2-3 until the desired number of clusters is achieved.
              \end{enumerate}
            \end{algorithm}
                        
            Ward's Method can be combined with Lance-Williams linear distance updates \cite{LanceWillaims1967-distanceUpdates} to increase efficiency. Lance-Williams linear distance updates, in the context of agglomerative clustering, refer to the formula used to calculate the distance between clusters as they are merged during the hierarchical clustering process. The general form of Lance-Williams distance updates can be expressed as follows:
            \begin{align}
                d_{(ij)k} = \alpha_{i} d_{ik} + \alpha_{j} d_{jk} + \alpha d_{ij} + \gamma |d_{ik} - d_{jk}|
            \end{align}
            where:
            \begin{itemize}
                \vspace{-6pt}
                \item [\tiny$\bullet$]
                    $d_{ij}$, $d_{ik}$, and $d_{jk}$ are the pair-wise distances between clusters $\bs{C_{i}}$, $\bs{C_{j}}$, and $\bs{C_{k}}$
                \item [\tiny$\bullet$]
                    $d_{(ij)k}$ is the distance between the newly merged cluster $\bs{C_{i}} \cup \bs{C_{j}}$ and cluster $\bs{C_{k}}$
                \item [\tiny$\bullet$]
                    $\alpha_i, \alpha_j, \alpha, \text{ and } \gamma$ are coefficients that depend on the linkage criterion used
            \end{itemize}
            
            In the case of Ward's method, the coefficients are specific to the minimization of within-cluster variance and are calculated as follows:
            \begin{align}
            \begin{split}
                \alpha_i &= \frac{|\bs{C_{i}}| + |\bs{C_{k}}|}{|\bs{C_{i}}| + |\bs{C_{j}}| + |\bs{C_{k}}|} \\
                \alpha_j &= \frac{|\bs{C_{j}}| + |\bs{C_{k}}|}{|\bs{C_{i}}| + |\bs{C_{j}}| + |\bs{C_{k}}|} \\
                \alpha &= -\frac{|\bs{C_{k}}|}{|\bs{C_{i}}| + |\bs{C_{j}}| + |\bs{C_{k}}|} \\
                \gamma &= 0
            \end{split}
            \end{align}
            (The inclusion of \(\gamma\) provides additional flexibility in the more general case, adjusting the distance updates based on the specific clustering criterion being used).

            \semph{Time Complexity.\footnote{\label{ftn:time-complexity-assumes-constant-time-v}Assuming $\mu(n)$ is $\mathcal{O}(1)$ in both time and space.}}\hfill\\
                The choice of clusters to merge generally leads to an $\mathcal{O}(|\bs{n^{*}}|^{3})$ time complexity due to the need to compare pair-wise distances between all clusters at each iteration.  However, in the case where nodes are distributed linearly in one dimension, only neighboring distances need to be considered at each iteration, and the search for the closest pair can be made efficient by use of a priority queue.  Nevertheless, since the Lance-Williams distance updates themselves take linear time once per iteration, the reduced time complexity is still $\mathcal{O}(|\bs{n^{*}}|^{2})$.
                
            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                The space complexity is implementation dependent, with most time-efficient variants making use of a distance matrix leading to $\mathcal{O}(|\bs{n^{*}}|^{2})$ space complexity.

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{1.0, 1.1, 1.2}, \set{1.3, 1.4, 1.5}, \set{10}, \set{100}$

            In contrast to simpleVB, minVarVB places considerable resources into computing abstractions, leading to fewer samples, but with potentially better estimates with an appropriate value function $\mu(.)$. 



        \subsubsection{\NoCaseChange{equalDistVB}} \label{sec:ordered-partitioning-schemes:equalDistVB}

            \begin{quote}
                $o = \tn{low to high}$\\
                $\Psi = \Part{equalDistVB}$ (Algorithm \ref{alg:psi-equalDistVB})
            \end{quote}
            
            \begin{algorithm}[t!]
                \caption{$\Part{equalDistVB}$}
                \label{alg:psi-equalDistVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
                \Output{
				With 
				%
				$Z(\bs{A_{1,...,i}}) = \sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} \mu(n')$,
				%
				$n_{\bs{A_{i}}}^{\tn{last}}$ being the last node in $\bs{A_{i}}$, 
				%
				and 
				%
				$P_{i} = \frac{ i \cdot \sum_{n \in \bs{n^{*}}}\mu(n)}{nAbs}$,
				%
                $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
                $(\; Z(\bs{A_{1,...,i}}) \geq P_{i} \;)$
                $\land$ \\ $(\; (\, \bs{A_{i}}=\set{} \,) \lor (\, Z(\bs{A_{1,...,i}}) - \mu(n_{\bs{A_{i}}}^{\tn{last}}) < P_{i} \,) \;)$ }
                
                \Begin{
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        $\bs{A_{i}} = \set{}$\\
                        \While{$Z(\bs{A_{1,...,i}}) < P_{i}$}{
                            $\bs{A_{i}} \leftarrow \bs{A_{i}} \cup \set{n^{*}_{{j}}}$\\
                            $j \leftarrow j + 1$
                        }
                    }
                    $\bs{A} = \bigcup_{i = 1}^{nAbs} \set{\bs{A_{i}}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            As discussed in Section \ref{sec:paradigms:combined}, there is intuition in wanting to focus on high impact regions of the search/sampling space.  Allowing the provided value function $\mu(.)$ to serve as a heuristic of nodes that are part of these high impact spaces, equalDistVB attempts to balance this intuition with the notion of variance reduction from minVarVB in attempts to group fewer predicted high impact nodes together in abstract states and allowing for the predicted lower impact nodes to be part of larger abstract states.  Also inspired by the simplicity of simpleVB, the scheme works by greedily adding nodes in value order (low to high) into abstract state $\bs{A_{i}}$ until the total sum of node values from $\bs{A_{1}},...,\bs{A_{i}}$ reaches or exceeds the $\frac{i}{nAbs}$ quantile.
            
            When paired with the QB abstraction class (see Section \ref{sec:value-based-abstraction-classes:QB}), the equalDistVB scheme also attempts to partition nodes into abstract states of equal mass under the proposal.  This corresponds to the condition in Proposition \ref{prop:rizzo-variance-reduction} for stratified importance sampling variance reduction.

            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $\bs{n^{*}}$, leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100}, \set{}, \set{}, \set{}$
                
            Although this method hopes to find a balance between the intuitions previously explored without compromising speed and efficiency of abstract state generation, from the running example we can see how this method can yield undesirable results in the presence of certain distributions of node values.  In this example, the first quantile is only reached after all the nodes have been added to the first abstract state, leaving no nodes remaining to be partitioned into the subsequent abstract states. 



        \subsubsection{\NoCaseChange{equalDistVB2}} \label{sec:ordered-partitioning-schemes:equalDistVB2}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Psi = \Part{equalDistVB}$ (Algorithm \ref{alg:psi-equalDistVB})
            \end{quote}

            By simply reversing the sort order, equalDistVB2 is able to use the same partitioning strategy $\Part{equalDistVB}$ associated with equalDistVB while mitigating some of the overfilling of abstract states.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $Z(A_{1...i})$ can be updated progressively in constant time, and thus the computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $\bs{n^{*}}$, leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required, i.e., $\mathcal{O}(|\bs{n^{*}}|)$.

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100}, \set{}, \set{}, \set{10, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0}$

            We see that equalDistVB2 can still be subject to overpacking of abstract states.  Next, we present two more equalDistVB variants that continue to mitigate this artifact.



        \subsubsection{\NoCaseChange{equalDistVB3}} \label{sec:ordered-partitioning-schemes:equalDistVB3}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Part{equalDistVB3}$ (Algorithm \ref{alg:psi-equalDistVB3})
            \end{quote}

            \begin{algorithm}[t!]
                \caption{$\Part{equalDistVB3}$}
                \label{alg:psi-equalDistVB3}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
                \Output{
				With 
				%
				$Z(\bs{A_{1,...,i}}) = \sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} Z(n')$,
				%
				$n_{\bs{A_{i}}}^{\tn{last}}$ be the last node in $\bs{A_{i}}$, 
				%
				and 
				%
				$P_{i} = \frac{ i \cdot \sum_{n \in \bs{n^{*}}}\mu(n)}{nAbs}$,
				%
                $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
                $(\; Z(\bs{A_{1,...,i}}) \geq P_{i} \;)$
                $\land$ \\ $(\; (\, |\bs{A_{i}}|=1 \,) \lor (\, Z(\bs{A_{1,...,i}}) - Z(n_{\bs{A_{i}}}^{\tn{last}}) < P_{i} \,) \;)$ }
                
                \Begin{
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        $\bs{A_{i}} = \set{n^{*}_{{j}}}$\\
                        $j \leftarrow j+1$\\
                        \While{$Z(\bs{A_{1,...,i}}) < P_{i}$}{
                            $\bs{A_{i}} \leftarrow \bs{A_{i}} \cup \set{n^{*}_{{j}}}$\\
                            $j \leftarrow j+1$
                        }
                    }
                    $\bs{A} = \cup_{i = 1}^{nAbs} \bs{A_{i}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            In order to lessen overpacking and ensure abstract states are not left empty, equalDistVB3 modifies equalDistVB2 so that, after each abstract state has been processed, the next abstract state is always assigned at least one node.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $Z(A_{1...i})$ can be updated progressively in constant time, and thus the computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $\bs{n^{*}}$, leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required, i.e., $\mathcal{O}(|\bs{n^{*}}|)$.

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100}, \set{10}, \set{1.5}, \set{1.4, 1.3, 1.2, 1.1, 1.0}$

            Still highly efficient, equalDistVB3 manages to ensure that the provided $nAbs$ granularity is honored, allowing users better control of the search vs. sampling interpolation possible with Abstraction Sampling.



        \subsubsection{\NoCaseChange{equalDistVB4}} \label{sec:ordered-partitioning-schemes:equalDistVB4}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Part{equalDistVB4}$ (Algorithm \ref{alg:psi-equalDistVB4})
            \end{quote}

            \begin{algorithm}[t!]
                \caption{$\Part{equalDistVB4}$}
                \label{alg:psi-equalDistVB4}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
                \Output{
				With 
				%
				$Z(\bs{A_{1,...,i}}) = \sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} Z(n')$,
				%
				$n_{\bs{A_{i}}}^{\tn{last}}$ be the last node in $\bs{A_{i}}$, 
				%
				and 
				%
				$L_{i} = \frac{Z(\bs{n^{*}})-Z(\bs{A_{1,...,i-1}})}{nAbs-i+1}$,
				%
                $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
                $(\; Z(\bs{A_{i}}) \geq L_{i} \;)$
                $\land$ \\ $(\; (\, |\bs{A_{i}}|=1 \,) \lor (\, Z(\bs{A_{i}}) - Z(n_{\bs{A_{i}}}^{\tn{last}}) < L_{i} \,) \;)$ }
                
                \Begin{
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        $\bs{A_{i}} = \set{}$\\
                        \While{$Z(\bs{A_{i}}) < L_{i}$}{
                            $\bs{A_{i}} \leftarrow \bs{A_{i}} \cup \set{n^{*}_{{j}}}$\\
                            $j \leftarrow j+1$
                        }
                    }
                    $\bs{A} = \cup_{i = 1}^{nAbs} \bs{A_{i}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            The final variant of the equalDist schemes, equalDistVB4, attempts to perform a more even partitioning than the previous variants by recomputing quantiles. Each time the algorithm progresses to a new abstract state, the remaining nodes and abstract states are used to compute new quantiles, which are then used to guide the filling of the current abstract state in the same way as before.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $Z(A_{1...i})$ can be updated progressively in constant time, and thus the computation of $L_{i}$ at each iteration can also be done in constant time.  Partitioning is achieved via one pass through $\bs{n^{*}}$, leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required, i.e., $\mathcal{O}(|\bs{n^{*}}|)$.

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100}, \set{10}, \set{1.5, 1.4, 1.3}, \set{1.2, 1.1, 1.0}$

            Still highly efficient, equalDistVB4 manages to ensure that the provided $nAbs$ granularity is honored, allowing users better control of the search vs. sampling interpolation possible with Abstraction Sampling.


        \subsubsection{\NoCaseChange{randVB}} \label{sec:ordered-partitioning-schemes:randVB}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Part{randVB}$ (Algorithm \ref{alg:psi-randVB})
            \end{quote}

            \begin{algorithm}[t!]
                \caption{$\Part{randVB}$}
                \label{alg:psi-randVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(.)$}
                \Output{$\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ }
                
                \Begin{
                    $\bs{s} \sim Unif(\setst{\bs{M} \subseteq \set{1,...,|\bs{n^{*}}|-1}}{|\bs{M}|=nAbs-1})$\\
                    $\bs{s^{*}_{}} \leftarrow SORT(\bs{s})$\\
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs\!-\!1$}{
                        $\bs{A_{i}} = \set{n^{*}_{j},...,n^{*}_{s^{*}_{i}}}$\\
                        $j \leftarrow s^{*}_{i}+1$
                    }
                    $\bs{A_{nAbs}} = \set{n^{*}_{j},...,n^{*}_{|n^{*}|}}$\\
                    $\bs{A} = \cup_{i = 1}^{nAbs} \bs{A_{i}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            If the quality of $\mu(.)$ as a measure of similarity is unknown or poor, it could instead be beneficial to rely on randomness to ensure a diverse sampling of abstractions.  randVB does this by sampling $nAbs\!-\!1$ partition points between the sorted nodes $\bs{n^{*}}$ uniformly at random and without replacement, and then partitions the nodes accordingly. As a result, abstract states are formed such that nodes are still grouped according to $\mu(.)$, but the size of those groups varies.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required, i.e., $\mathcal{O}(|\bs{n^{*}}|)$.

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100, 10}, \set{1.5}, \set{1.4, 1.3, 1.2}, \set{1.1, 1.0}$;\\
                $\set{100}, \set{10, 1.5, 1.4, 1.3}, \set{1.2, 1.1}, \set{1.0}$;\\
                ...etc.







    \section{Empirical Evaluation} \label{sec:empirical-evaluation}

        %%%%%%%%%%%%%%%%%%% AS Algorithms Tested
        
        \paragraph{Setup.} All combinations of each of the three new Value-Based Abstraction Classes (i.e., Heuristic-Based, HR-Based, and Q-Based; Section \ref{sec:value-based-abstraction-classes}) with each of the seven Ordered Partitioning Schemes (i.e., simpleVB, minVarVB, equalDistVB1--4, and randVB; Section \ref{sec:ordered-partitioning-schemes}) were tested, for a total of twenty-one abstraction functions.  For comparison, the formerly introduced abstraction functions randCB and relCB were also used (see Section \ref{sec:abstraction-sampling:existing-abstraction-functions}).  In addition, a pure randomized abstraction function, simpleRand, was also included.  With the exception of RelCB, each accepts a parameter called $nAbs$ which bounds the number of abstract states at any level. RelCB uses a parameter that limits the number of context variables to use in assigning nodes to abstract states.  For ease of comparison, we report RelCB's parameter as an $nAbs$ parameter assuming a domain size of $2$.  (For example, if a result shows RelCB with $nAbs=64$, this means RelCB was parameterized to use a relaxed context size of $nContext = \log_{2}(64) = 6$.)  All of the abstraction functions were used with the AOAS framework.  All algorithms were implemented in C++. All experiments were run for 300 seconds on a 2.66 GHz processor with 8 GB of memory.
        
        
        
        %%%%%%%%%%%%%%%%%%% Heuristic Description
        \paragraph{Heuristics.}  To inform the sampling proposal, Weighted Mini-Bucket Elimination (wMBE) \cite{DBLP:journals/jacm/DechterR03,DBLP:conf/icml/LiuI11} is used as a heuristic.  The i-bound (iB) parameter controls the strength of wMBE, where higher i-bounds generally lead to stronger heuristics and, thus, better proposals at the expense of higher computation and memory. We standardize our experiments by using the same i-bound when comparing across algorithms. 
        
        
        
        
        %%%%%%%%%%%%%%%%%%% Benchmark Description
        
        \paragraph{Benchmarks.} In line with previous work on Abstraction Sampling, we perform experiments on the same group of over 480 problems from five well known benchmarks: DBN, Grids, Linkage-Type4, Pedigree, and Promedas. For brevity, we show detailed aggregated statistics on only large problem instances, thus excluding Pedigree, whose problems were all small and results relatively uniform across all algorithms.  Average statistics for the benchmarks can be found in Figure \ref{fig:AggregationTables}, and statistics on individual problems can be found in the supplemental materials.
        
        
        %%%%%%%%%%%%%%%%%%% Performance Measure
        
        \paragraph{Performance Measure.} To evaluate the performance of the various algorithms, we calculate error as $|Error| = |\log_{10} \hat{Z} - \log_{10} Z^{*}|$, where $Z$ is the partition function, $\log_{10} \hat{Z}$ is the $\log_{10}$ of the experimentally obtained $Z$ estimate, and $\log_{10} Z^{*}$ is the reference $\log_{10} Z$ value. When the exact $Z$ value is unknown, the reference is an empirical estimate based on an average over $100\times1$hr of abstraction sampling, as collected by \cite{kask20-scaling-up-as} (who also verified that 98\% of the estimates fell within the 95\% probabilistic bounds determined by the bound-providing sampling algorithm Dynamic Importance Sampling \cite{lou2019interleave}).



        \subsection{Results} \label{sec:empirical-evaluation:results}

            \subsubsection{Aggregated Result}

                Results from experiments run on problems of the same benchmark set were aggregated and reported in Tables \ref{tbl:DBN_aggregation}--\ref{tbl:Promedas_aggregation}. (Please see the table caption for what each statistic represents.)  The $nAbs$ granularity to report was picked by averaging the simple, minVar, equalDist (all were considered together), and rand performances across all classes of abstractions and choosing the best for each. RandCB (CTX-rand), RelCB (CTX-rel), and the pure randomized scheme (RAND-rand) were each given their best $nAbs$ for each benchmark.  To help the reader identify AS algorithm configurations that performed particularly well, rows are bolded for algorithms that both (A) solved a relatively large number of problems and (B) achieved a low average absolute error.  Single bolded values highlight cases where algorithms may have good results consistent with (A) or (B), but not both. (Extended results with different $nAbs$ and iB can be found in the supplemental materials.)

                \subsubsection{Comparing Value Classes and Partitioning Schemes}

                    \begin{tablefigure*}[!htb]
                        \centering     %%% not \center
                        \begin{subfigure}[DBN]{0.49\linewidth}
                            \centering     %%% not \center
                            \includegraphics[width=0.8\linewidth]{./_attachments/Results/DBN-aggregations-LARGE-i-10-t-300.pdf}
                            \caption{}
                            \label{tbl:DBN_aggregation}
                        \end{subfigure}
                        \begin{subfigure}[Grids]{0.49\linewidth}
                            \centering     %%% not \center
                            \includegraphics[width=0.8\linewidth]{./_attachments/Results/Grids-aggregations-LARGE-i-10-t-300.pdf}
                            \caption{}
                            \label{tbl:Grids_aggregation}
                        \end{subfigure}
                        \begin{subfigure}[Linkage-Type4]{0.49\linewidth}
                            \centering     %%% not \center
                            \includegraphics[width=0.8\linewidth]{./_attachments/Results/Linkage-Type4-aggregations-LARGE-i-10-t-300.pdf}
                            \caption{}
                            \label{tbl:Linkage-Type4_aggregation}
                        \end{subfigure}
                        \begin{subfigure}[Promedas]{0.49\linewidth}
                            \centering     %%% not \center
                            \includegraphics[width=0.8\linewidth]{./_attachments/Results/Promedas-aggregations-LARGE-i-10-t-300.pdf}
                            \caption{}
                            \label{tbl:Promedas_aggregation}
                        \end{subfigure}
                        \caption{Aggregated statistics. Displayed are the Abstraction Class (\textit{Class}), Partitioning Scheme (\textit{Scheme}), bound on the number of abstract states (\textit{nAbs}), number of problems for which only a zero solution could be found (\textit{Fail}), average $\log_{10}Z$ error (\textit{|Error|}), average number of samples (\textit{Samples}), and average size of probes (\textit{Size}). Color bars visualize the magnitude of the values. Red \textit{Fail} cells indicate an algorithm's inability to solve relatively many problems. Bold indicates best performances. Lines in bold indicate the best performing algorithms. Each benchmark also displays the total number of problems within it (N), average number of variables (\textbar X\textbar), max domain size (k), tree width ranges (w*), and AND/OR search tree depth ranges (d).}
                        \label{fig:AggregationTables}
                    \end{tablefigure*}


            \subsubsection{The Effect of \NoCaseChange{iB}}

                In order to understand more about the effect of iB on abstraction quality and results, in Figure \ref{plot:results:DBN-algs-all-i-5-10-nabs-4} we show aggregate results for the QB algorithms vs. the former CB algorithms and the fully randomized scheme.  (For other benchmarks and $nAbs$, please see the Supplemental Materials).

                \begin{plotfigure}[!htb]
                    \centering
                    \caption{Observing the effect of iB on average results over the Large DBN benchmark with $nAbs=4$. Displayed are the Abstraction Class (\textit{Class}), Partitioning Scheme (\textit{Scheme}), i-bound (\textit{iB}), number of problems for which only a zero solution could be found (\textit{Fail}), average $\log_{10}Z$ error (\textit{|Error|}), average number of samples (\textit{Samples}), and average size of probes (\textit{Size}). Color bars visualize the magnitude of the values. Red \textit{Fail} cells indicate an algorithm's inability to solve relatively many problems. Bold indicates best performances. Lines in bold indicate the best performing algorithms. Each benchmark also displays the total number of problems within it (N), average number of variables (\textbar X\textbar), max domain size (k), tree width ranges (w*), and AND/OR search tree depth ranges (d).}
                    \includegraphics[width=1.0\linewidth]{./_attachments/Results/DBN-algs-all-i-5-10-nabs-4.pdf}
                    \label{plot:results:DBN-algs-all-i-5-10-nabs-4}
                \end{plotfigure}




            \subsubsection{Abstraction Speed}

                In order to understand more about the speed of each scheme at performing abstractions, in Figure \ref{plot:results:Promedas-nAbs-2048_samples-vs-probe-size} we plot the number of samples that were able to be drawn with respect to the average probe size for each algorithm on individual problems of the Promedas benchmark.  (For other benchmarks and $nAbs$, please see the Supplemental Materials).

                \begin{plotfigure}[!htb]
                    \centering
                    \caption{For the given abstraction granularity and benchmark, the number of samples (in log10) relative to the probe size (in log10) using iB-10.}
                    \includegraphics[width=1.0\linewidth]{./_attachments/Results/Promedas-nAbs-2048_samples-vs-probe-size.pdf}
                    \label{plot:results:Promedas-nAbs-2048_samples-vs-probe-size}
                \end{plotfigure}

            \subsubsection{Probe Size}

                As seen by using the different Ordered Partitioning Schemes on our running  abstraction example in Section \ref{sec:ordered-partitioning-schemes}, even given the same granularity, some schemes may not use all of the allotted abstract states per level, or once-extended paths may eventually end up dead due to abstractions in later iterations. These can lead to reduced probe sizes.  In order to observe how the various schemes perform in this respect, we plotted Probe Size with respect to the number of variables of a problem for each problem of the Promedas benchmark.  (For other benchmarks and $nAbs$, please see the Supplemental Materials).
            
                \begin{plotfigure}[!htb]
                    \centering
                    \caption{For the given abstraction granularity and benchmark, the size of the probe (in log10) relative to the number of problem variables (in log10) using iB-10.}
                    \includegraphics[width=1.00\linewidth]{./_attachments/Results/Promedas-nAbs-2048_probe-size-vs-nVars}
                    \label{plot:results:Promedas-nAbs-2048_probe-size-vs-nVars}
                \end{plotfigure}

        \subsection{Analysis} \label{sec:empirical-evaluation:analysis}




    \section{Conclusion} \label{sec:analysis}


        
\clearpage
    \bibliographystyle{named}
    \bibliography{ref}




\end{document}