\documentclass{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Instructions for Authors: Title in Title Case}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
% \author[1]{Harry~Q.~Bovik}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
% \affil[1]{%
%     Computer Science Dept.\\
%     Cranberry University\\
%     Pittsburgh, Pennsylvania, USA
% }
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
\author{\href{mailto:<pezeshkb@uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Bobak Pezeshki}{}}
\author{\href{mailto:<kkask@uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Kalev Kask}{}}
\author{\href{mailto:<ihler@ics.uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Alexander Ihler}{}}
\author{\href{mailto:<dechter@ics.uci.edu>?Subject=Abstraction Sampling - UAI 2024}{Rina Dechter}{}}
% Add affiliations after the authors
\affil[1]{%
    University of California, Irvine
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM PACKAGES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{caption}
\usepackage{subcaption}
\usepackage{float}
\usepackage{xspace} % package being used for \newcommand to remove extra space
                    %     when a command is invoked without an argument list
\usepackage{textcase}
\usepackage[toc, nopostdot]{glossaries}
% \usepackage{amsmath}
\usepackage{amsthm, amssymb}
\usepackage{mathtools}
\usepackage{enumitem}
\usepackage{refcount}
\usepackage[leftmargin=6pt, vskip=3pt-\parskip]{quoting}
\usepackage[titlenumbered,ruled, linesnumbered]{algorithm2e}
\usepackage{mathrsfs} %for \mathscr
\usepackage[font=smaller,labelfont=bf]{caption}
% \usepackage[font=small,labelfont=bf]{subcaption}
% \usepackage[labelfont=bf]{caption}
% \usepackage[labelfont=bf]{subcaption}
\usepackage{xcolor}
    \definecolor{darkgreen}{rgb}{0.0, 0.2, 0.13}
    \definecolor{cadmiumgreen}{rgb}{0.0, 0.42, 0.24}
    \definecolor{byzantium}{rgb}{0.44, 0.16, 0.39}
    \definecolor{darkelectricblue}{rgb}{0.33, 0.41, 0.47}
    \definecolor{battleshipgrey}{rgb}{0.52, 0.52, 0.51}
    \definecolor{warmblack}{rgb}{0.0, 0.26, 0.26}
\usepackage{newfloat}
\usepackage{chngcntr}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM COMMANDS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%create new float environment called plotfigure with its own counter
\DeclareFloatingEnvironment[name=Plot]{plotfigure} 

%create new float environment called tablefigure with its own counter
\DeclareFloatingEnvironment[name=Table]{tablefigure} 

%set the floats table and tablefigure to use the same counters
\makeatletter\let\c@tablefigure\c@table\makeatother 

%consider the floats table and tablefigure as the same set of floats (so location in document will be in order in which they appear)
\makeatletter\let\ftype@tablefigure\ftype@table\makeatother 

\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\mathchardef\mhyphen="2D % Define a "math hyphen"

% algorithm2e
% \newcommand\commentstyle[1]{\textcolor{cadmiumgreen}{#1}}
\SetCommentSty{commentstyle}
\SetKwInOut{Input}{input}
\SetKwInOut{Output}{output}

\newtheoremstyle{break}
  {\topsep}{\topsep}%
  {\itshape}{}%
  {\bfseries}{}%
  {\newline}{}%
\theoremstyle{break}
% \newtheorem{theorem}{Theorem}[subsubsection]
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
% \newtheorem{definition}{Definition}[subsubsection]
\newtheorem{definition}{Definition}[section]

\input{cmds}
\renewcommand*{\glstextformat}{\textbf}

\renewcommand{\quote}{\list{}{\rightmargin=\leftmargin\topsep=0pt}\item\relax}







%%% for supplemental

\usepackage{enumitem}
    \setlistdepth{9}
    \setlist[itemize,1]{label=$\bullet$}
    \setlist[itemize,2]{label=$\cdot$}
    \setlist[itemize,3]{label=$\cdot$}
    \setlist[itemize,4]{label=$\cdot$}
    \setlist[itemize,5]{label=$\cdot$}
    \setlist[itemize,6]{label=$\cdot$}
    \setlist[itemize,7]{label=$\cdot$}
    \setlist[itemize,8]{label=$\cdot$}
    \setlist[itemize,9]{label=$\cdot$}
    \renewlist{itemize}{itemize}{9}






%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\setcounter{secnumdepth}{3} %May be changed to 1 or 2 if section numbers are desired.
\setcounter{tocdepth}{3}

\title{Abstraction Sampling with Heuristic-Based, HR-Based,\\ and Proposal-Based Abstraction Functions}


\input{gls}


\begin{document}
    % \onecolumn
    \setlength{\abovedisplayskip}{3pt}
    \setlength{\belowdisplayskip}{3pt}

    \maketitle
    
    \begin{abstract}
        \vspace{-12pt}
        Monte Carlo methods are powerful tools for solving problems involving complex probability distributions. Despite their versatility, these methods often suffer from computational inefficiencies, especially when dealing with rare events. As such, importance sampling emerged as a prominent technique for alleviating these challenges. Recently, a new scheme called Abstraction Sampling was developed that incorporated stratification to importance sampling over graphical models to improve estimates further. However, existing work has only explored a limited set of abstraction functions that guide  stratification. This study expands this set by introducing three new classes of abstraction functions combined with seven distinct partitioning schemes, resulting in twenty-one new abstraction functions. These are motivated by theory and intuition from both search and sampling domains. An extensive empirical analysis on over 400 problems compares these new abstraction functions highlighting several well-performing candidates. 
        \vspace{-6pt}
    \end{abstract}

    % \vfill\eject
    % \tableofcontents
    
    % \clearpage
    \section{Introduction} \label{sec:introduction}
    \vspace{-4pt}
        The partition function ($Z$) is an important quantity in probabilistic graphical model inference, and is often estimated using Monte Carlo methods such as Importance Sampling (IS) \citep{Rubinstein_2007,liu2015probabilistic,DBLP:journals/ai/GogateD11}. Recently a framework called Abstraction Sampling (AS) \citep{DBLP:conf/uai/BrokaDIK18} was introduced, inspired by the prior works of \citet{knuth75,Chen92}, that extends IS by enabling samples to represent multiple configurations.  AS uses concepts from Stratified Sampling \citep{Rubinstein_2007,rizzo_2007} and compact search  \citep{DBLP:journals/ai/DechterM07} to build a sampled subtree called a probe.  The construction progresses level-by-level according to a variable ordering where, at each level, an \textit{abstraction function} groups nodes into \textit{abstract states} and then a representative node from each group is picked and reweighted 
        %(according to a proposal distribution) 
        to extend the paths in the probe.
        
        Using what are referred to as context-based abstraction functions, \citet{DBLP:conf/uai/BrokaDIK18} showed competitive performance by AS against IS, Weighted Mini-Bucket IS (wMBIS) \citep{liu2015probabilistic}, and IJGP-SampleSearch (IJGP-ss) \citep{DBLP:journals/ai/GogateD11}. \citet{kask20-scaling-up-as} further extended AS scalability introducing AS algorithm AOAS that more efficiently applied AS to AND/OR search spaces, and showed even better performance
        % its superior performance using the same context-based abstraction functions against previous versions of Abstraction Sampling
        % (and thus implicitly also against IS, wMBIS, and IJGP-ss) 
        % and 
        also comparing to state-of-the-art scheme Dynamic Importance Sampling \citep{lou2019interleave}.
        
        However, Abstraction Sampling development has lacked exploration of more effective abstraction functions.  \citet{hsiao23-gnn-dynamic-as} proposed use of graph neural networks to learn abstraction functions.  However, this has the drawback of requiring learning on problems before use. Future work will explore adjusting the abstraction scheme and its $nabs$ hyperparameter to the problem instance through learning. 

        
       %\rina{ 
       \vspace{-4pt}
       \paragraph{Contributions.} We provide a detailed study of new abstraction function schemes for Abstraction Sampling algorithms. We present a new type of abstraction defined by a real-valued function aimed at capturing relevant similarity features between nodes.  Three classes of this new framework are presented and augmented by seven partitioning strategies.  An extensive empirical evaluation is performed on over 400 problems comparing the above-mentioned schemes against: each other, the previously vetted relCB and randCB abstraction functions \citep{DBLP:conf/uai/BrokaDIK18,kask20-scaling-up-as} (and thus implicitly against IS, wMBIS, and IJGP-ss), and a purely randomized abstraction scheme.
       %}

                \shrink{
        \begin{itemize}
        \vspace{-6pt}
            \item
                A new Value-Based abstraction framework for grouping nodes according to values on a positive scale
                
            \item
                Twenty-one value-based abstraction functions constructed by combining:
                
                \begin{itemize}
                
                    \item
                        Three distinct abstraction value functions, each inspired by paradigms from search and sampling.
                        
                    \item
                        Seven schemes for partitioning nodes according to their assigned values.
                        
                \end{itemize}
                        
            \item
                An extensive empirical evaluation on over 400 problems comparing the above-mentioned schemes against: each other, the previously vetted relCB and randCB abstraction functions \citep{DBLP:conf/uai/BrokaDIK18,kask20-scaling-up-as} (and thus implicitly against IS, wMBIS, and IJGP-ss), and a purely randomized abstraction scheme.
        \end{itemize}
                \vspace{-6pt}
        }

    %\rina{The results paragraph should be more elaborated and more informative. Can be done later.}
   Our experiments identify three abstraction functions in particular: \textit{equalDistQB3}, \textit{equalDistQB4}, and \textit{simpleRand} that perform significantly better than any previous scheme, and show that they tend to perform best with greater numbers of abstract states. These results indicate a significant improvement of one of the most competitive sampling schemes yielding a substantial computational advance for one of the most challenging tasks in probabilistic inference.


    \section{General Background} \label{sec:background}
    \vspace{-4pt}

%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%

        \paragraph{Graphical Models.}
            
            % \begin{figure}[]
            %     \centering
            % 	\includegraphics[scale=0.25]{images/AncestorBranchingMass.pdf}
            % 	\vspace{-6pt}\caption{Ancestor branching mass of an AND node.}
            % 	\label{fig-ancestor-branching-mass}
            % \end{figure}
    
            % % \begin{comment}
            % \begin{figure}[]
            %     \centering
            % 	\includegraphics[scale=0.25]{images/ProperAbstractionGroups.pdf}
            % 	\vspace{-6pt}\caption{Scope of proper abstractions.}
            % 	\label{fig-proper-abstraction-groups}
            % \end{figure}
            % % \end{comment}
            
            A \textit{graphical model}, such as a Bayesian or Markov network \citep{pearl88,darwiche-book,DBLP:series/synthesis/2013Dechter}, can be defined by  a 3-tuple
            $\mathcal{M} \! = \! (\mathbf{X,D,F})$, where
            $\mathbf{X}$
            is a set of variables,
            $\mathbf{D}$
            is the set of variable domains, and $\mathbf{F}$ is a set of functions such that each function $f_{\alpha} \in \mathbf{F}$ is defined over $\alpha \subseteq \bs{X}$
            called its scope. A \textit{Primal graph} $\mathcal{G} \! = \! (\mathbf{V,E})$ of $\mathcal{M}$ associates each variable with a node ($\mathbf{V} \! = \! \mathbf{X}$), while arcs $e \! \in \! \mathbf{E}$ connect nodes whose variables appear in the scope of the same function.
            $\mathcal{M}$ defines a global function, often a factorized probability distribution on $\mathbf{X}$,
            $P(\mathbf{X}) = \frac{1}{Z} \prod_{\alpha}f_\alpha(X_\alpha)$, where 
            $
            Z = \sum_X \prod_{\alpha}
            f_\alpha(X_\alpha)
            $, known as the partition function, is a normalization factor.

        \vspace{-4pt}
        \paragraph{Search Spaces of Graphical Models.} 
            % A graphical model can be transformed into a weighted state space graph.
            % In an OR search space, which is constructed layer-by-layer relative to a variable ordering, paths from the root to the leaves represent \textbf{full configurations} - or assignments to all variables - where each successive level corresponds to an assignment of the next variable in the ordering.
            
            A graphical model can
            % also 
            be transformed  into a compact AND/OR search space that leverages conditional independencies and facilitates use of efficient search algorithms \citep{DBLP:journals/ai/DechterM07}. Given a primal graph $\mathcal{G}$, an AND/OR search space is defined relative to a \textit{pseudo tree} $\mathcal{T} \! = \! (\mathbf{V,E'})$ that is a directed rooted tree that spans $\mathcal{G}$ according to a variable ordering and captures conditional independences encoded in the model.  $\mathcal{T}$ is constructed such that every arc of $\mathcal{G}$ not in $\mathbf{E'}$ is a back-arc in ${\cal T}$ 
            % connecting a node to one of its ancestors
            (Figure \ref{fig:primal-graph-and-pseudo-tree}).  A variable is a \textit{branching variable} if it has multiple children in $\mathcal{T}$.
            %The arcs in $E'$ may not all be included in $E$ .  
            
            
            
            
            \begin{figure}[!htb]
            \vspace{-2pt}
            	\centering
            	\begin{subfigure}{0.9\linewidth}
            	\centering
            	       \includegraphics[width=0.75\linewidth]{UAI-24/_attachments/images/pseudotree.png}
                        \vspace{-6pt}\caption{}
                        \label{fig:primal-graph-and-pseudo-tree}
            	\end{subfigure}
                    \begin{subfigure}{0.9\linewidth}
            	\centering
                        \includegraphics[width=0.75\linewidth]{UAI-24/_attachments/images/AncestorBranchingMass.pdf}
                        \vspace{-6pt}\caption{}
                        \label{fig:ancestor-branching-mass}
                    \end{subfigure}
            	\vspace{-6pt}\caption{A full AND/OR tree representing 16 possible solutions guided by the pseudo tree shown above. Boxed in green is the ancestor branching subtree for the path $\rightarrow \!\! (A \!\! = \!\! 0) \!\! \rightarrow  \!\! (C \!\! = \!\! 1)$. \vspace{-4pt}}
                        \label{fig:psuedo-tree-with-ancestor-branching-mass}
            \end{figure}

            
            Given a
            pseudo tree $\mc{T}$, an \textit{AND/OR search tree}
            $T$ guided by $\mc{T}$ has alternating levels of OR nodes
            corresponding to variables and AND nodes corresponding to
            possible assignments. 
            % to the variables, 
            %with edge costs extracted from
            %the original functions \citep{DBLP:journals/ai/DechterM07} such that %(By this logic, we can think of the nodes of an OR tree as AND nodes).  
            % Let $n$ be an AND node in $T_{\tau}$, also denoted $n_X$ if $X$ is the last variable of its partial configuration.
            The arc into an AND node $n_{X}$ associated with variable $X$ %(or the arc from its OR parent to the AND node)
            has a cost $c(n_{X})$ equal to the product of functions $f_{\alpha} \in \F$ such that the path to $n_{X}$ fully instantiates all $X' \in \alpha$ and such that $X \in \alpha$ \citep{DBLP:journals/ai/DechterM07}.
            % \textcolor{red}{Moved to section "Value of A Node": (see Figure \ref{fig-simple}(c)).}  

        \vspace{-6pt}
        \paragraph{Notation.}
            Capital letters ($X$) represent variables and small letters ($x$) their values.  Boldfaced letters represent a collection. Boldfaced capital letters ({\bf X}) denote a collection of variables,
            $|{\bf X}|$ its cardinality, 
            $D_{\X}$ their joint domains (possible configurations of \X), 
            and bolded $\xx$ a particular realization in that joint domain (i.e., a particular configuration of \X).

            In the context of search, $n$ is used to represent search nodes.  $n_{X}$ specifically refers to an AND node in $T$ associated with variable $X$, and $Y_{n_X}\!$ the OR node associated with variable $Y$ that is the child of $n_{X}$. $path(n)$ is the configuration of the variables along the path from the root of $T$ to $n$ according to assignments corresponding to that path. $g(n)$ is the cost of $path(n)$. $ch(n)$ denotes the children of node $n$.
            % $ch_{\PT}(X)$ denotes the children of $X$ in pseudo tree \PT.
            
        \vspace{-6pt}
        \paragraph{$\bs{Z(n)}$.} \label{sec:partition-function-of-a-node}  
            % In the context of AND/OR search, let $ch_Y(n_X)$ refer to the children AND nodes of variable $Y$ that are descended from the AND node $n_X$.
            We define $Z(n_X) \! = \! \prod_{Y_{n_{X}} \in ch(n_{X})} Z(Y_{n_X})$
            % \begin{equation} \label{eq:and-or-z-prod}
            %     Z(n_X) = \prod_{Y_{n_{X}} \in ch(n_{X})} Z(Y_{n_X})
            % \end{equation}
            where $Z(Y_{n_X}) = \sum_{n_Y \in ch(Y_{n_X})}  c(n_Y) \cdot Z(n_Y)$.
            % \begin{equation}
            %     Z(Y_{n_X}) = \sum_{n_Y \in ch(Y_{n_X})}  c(n_Y) \cdot Z(n_Y)
            % \label{eq2}
            % \end{equation}
            With $n_{\varnothing}$ as a dummy root node of $T$, $Z(n_{\varnothing}) = Z$ of the underlying model \M. We denote estimation of $Z(n)$ as $\hat{Z}(n)$.  Heuristic estimates of $Z(n)$ are denoted as $h(n)$.

        \vspace{-6pt}
        \paragraph{$\bs{R(n)}$.} \label{sec:ancestor-branching-mass}
             On the path from the root of $T$ to some $n_{X}$, there may be an intermediate node $n_{Y}$ associated with branching variable $Y$ in \PT ($A$ is a branching variable in Figure \ref{fig:psuedo-tree-with-ancestor-branching-mass}).  When this happens, the remaining variables of the model are split between branches. $R(n_{X})$, or the \textit{ancestor branching mass}, captures the $Z(n)$ for all variables that branch off of the path to $n_{X}$. In Figure \ref{fig:ancestor-branching-mass}, the green box shows the portion corresponding to the $R(n)$ for the red node.
             %(That same boxed portion would also be the ancestor branching mass for the sibling node of the red node, and also for any of their children).

             More formally, let $br(n_{X})$ be the set of nodes $n_{Y}$ on the path to $n_{X}$ such that $Y$ is a branching variable in \PT. Let $W_{n_{Y}}$ be the child OR node of $n_{Y}$ that is also on the path to $n_{X}$.  We define $R(n_{X})$ as: $R(n_{X}) =   \prod_{n_{Y} \in br(n_{X})} \frac{Z(n_{Y})}{ Z(W_{n_{Y}})}$.
             % \begin{align}
             %     \label{eq4}
             %     R(n_{X}) =   \prod_{n_{Y} \in br(n_{X})} \frac{Z(n_{Y})}{ Z(W_{n_{Y}})}
             % \end{align}
             We denote approximations to $R(n)$ as $r(n)$.

        \vspace{-6pt}
        \paragraph{$\bs{Q(n)}$.} \label{sec:q-of-a-node}

            We can now concisely define a quantity $Q(n)$ as the contribution to $Z$ from all full configurations consistent with $path(n)$. In other words, $Q(n)$ is the unnormalized probability of the configuration $path(n)$ based on the distribution defined by \M, with $P(path(n)) = \frac{Q(n)}{Z}$.  $Q(n)$ can be computed simply as: $Q(n) = g(n)  \! \cdot \!  R(n)  \! \cdot \!  Z(n)$.
            % \begin{align}
            %     Q(n) = g(n)  \! \cdot \!  R(n)  \! \cdot \!  Z(n)
            % \end{align}
                
             \textbf{Example.} In Figure \ref{fig:ancestor-branching-mass}, consider the path from the root to the red node $n_{A= 0,C=1}$. Following $n_{A=0}$ to our node, we see OR node $B_{n_{A=0}}$ that branches off of the path.
             So, $Q(n_{A=0,C=1}) = g(n_{A=0,C=1}) \! \cdot \! R(n_{A=0,C=1}) \! \cdot \! Z(n_{A=0,C=1}) = g(n_{A=0,C=1}) \mul Z(B_{n_{A=0}}) \! \cdot \! Z(n_{A=0,C=1})$.
             % \begin{align}
             %    &= g(n_{A=0,C=1}) \! \cdot \! R(n_{A=0,C=1}) \! \cdot \! Z(n_{A=0,C=1}) \\
             %    &= g(n_{A=0,C=1}) \mul \;\; Z(n_{A=0,B})\;\; \! \cdot \! Z(n_{A=0,C=1}) 
             % \end{align}
             
        \vspace{-4pt}
        \paragraph{Stratified Importance Sampling.} 
            Abstraction Sampling builds on Importance Sampling and Stratified Sampling. {\em Importance Sampling} is  a Monte Carlo scheme for approximating likelihood queries.
            %\citep{Rubinstein_2007,DBLP:journals/ai/GogateD11,liu2015probabilistic}.
            {\em Stratified Sampling} is a variance reduction technique for sampling a search space by first dividing it into disjoint strata. 
            % The two can be merged to further reduce variance.
            In {\em Stratified Importance Sampling}, the sample space is first divided into $k$ strata,
            %of equal area under the distribution $p$, 
            then representatives from each stratum are chosen and re-weighted. %, and uses these representatives to form an estimator over the entire model. 
            To reduce overall variance, the sum of variances within the strata should be minimized \citep{rizzo_2007}.
            
            
            \newcommand{\soltree}{\hat{x}_M}
            \newcommand{\parttree}{\bar x}
            
            
    

%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5
%%%%%%%%%%%%%%%%%%%%%%%%%5


    \section{Abstraction Sampling}\label{sec:abstraction-sampling}
    \vspace{-4pt}
        {\em Abstraction Sampling} (AS) algorithms \citep{DBLP:conf/uai/BrokaDIK18} apply concepts of Stratified Importance Sampling to sampling over probabilistic graphical models. 
        %An abstraction event in Abstraction Sampling is analogous to sampling representatives from strata in stratified importance sampling and reweighing to account for the rest of the members that were not chosen.  
        AS is guided by an abstraction
        function $a(\cdot)$ that dictates how nodes are partitioned into \textbf{abstract states} (analogous to strata in stratified sampling). Abstraction Sampling iteratively expands a search tree along a variable ordering %variable by variable, 
       using $a(\cdot)$ to group nodes into abstract states level by level. It uses an importance-sampling-like process to select a  representative from each abstract state and reweights it to account for the members that were removed.  The selected nodes are expanded and the process iterates.
       This process yields
       %leading to the generation of 
       a sampled subtree of the full search tree $T$ (called a {\bf probe}) as a sample.

        \vspace{-4pt}
        \paragraph{AOAS.}
            Taking Abstraction Sampling further, \citet{kask20-scaling-up-as} introduced algorithm AOAS
            %(\textbf{A}nd/\textbf{O}R \textbf{A}bstraction \textbf{S}ampling) 
            that more effectively applied Abstraction Sampling to AND/OR search spaces and significantly improved the performance of AS. AOAS uses a proposal function $p(n) \propto q(n) = w(n) g(n)  h(n)  r(n)$ where $g(n)$ is the cost of the path to $n$, $h(n)$ is the heuristic estimate of $Z(n)$, $r(n)$ is the estimate of $R(n)$, and $w(n)$ is a weight that accounts for the nodes abstracted into the path to $n$ (see Figure \ref{fig:proposal}). 
            Algorithm \ref{alg:aoas-overview} provides a high level description.  A more detailed version and an example from \citet{kask20-scaling-up-as} are in the Supplemental.

        
        \begin{algorithm}[t!]
                \caption{AOAS Overview}
                \label{alg:aoas-overview}
        
            \begin{enumerate}
                \vspace{2pt}
                \item \textbf{Initialization:}
                    Begin with a dummy root node $r$.
                \item \textbf{Probe Generation:}
                    Proceeding in a DFS manner according to a pseudo tree $\PT$...
                    \begin{enumerate}
                        \item \textbf{Expansion:} \label{alg:aoas-overview:expansion}
                            Generate children nodes $n$ corresponding to the next variable in the DFS ordering of $\PT$. Inherit $w(n)$ from parents and assign appropriate $g(n), h(n), \tn{and } r(n)$ values.
                        \item \textbf{Abstraction:} \label{alg:aoas-overview:abstraction}
                            \begin{enumerate}
                                \item \textbf{Form Abstract States:}
                                    Using $a(\cdot)$, partition newly expanded nodes into abstract states.
                                \item \textbf{Select Representative:}
                                    Using proposal $p(n) \propto q(n)$, stochastically select a representative from each abstract state and reweigh it such that $w(n) \leftarrow \frac{w(n)}{p(n)}$.
                            \end{enumerate}
                        \item \textbf{Backtrack:} \label{alg:aoas-overview:backtracking}
                            After reaching a leaf in $\PT$, recursively backtrack until reaching nodes of the next unexplored branch of $\PT$. While backtracking, update parent node $\hat{Z}(n')$ estimates based on its children's $w(n), g(n),$ and $\hat{Z}(n)$ values.
                        \item \textbf{Repeat:}
                            Repeat steps \ref{alg:aoas-overview:expansion}-\ref{alg:aoas-overview:backtracking} until having backtracked all the way to the root node.
                    \end{enumerate}
                \item \textbf{Return:}
                    $\hat{Z} = w(r)\,\hat{Z}(r)$ for the root node $r$.
            \end{enumerate}
        \end{algorithm}



        \begin{figure}[!htb]
            \centering
            \includegraphics[width=0.75\linewidth]{UAI-24/_attachments/images/proposal.png}
            \vspace{-6pt}\caption{The $q(n)$ visualized to show it estimating nodes previously abstracted (via $w(n)$), the ancestor branching mass (via $r(n)$), current path cost (via $g(n)$), and subtree mass (via $h(n)$).
            \vspace{-8pt}
            }
                    \label{fig:proposal}
        \end{figure}


         The choice of abstraction function is a key element of an AS scheme, which was only partially explored so far. The main focus of this work is to expand %upon the set of abstraction functions 
         and identify abstraction functions that lead to significantly better AS performance.
         
         % A key hyper-parameter used is $nAbs$ that bounds the number of abstract states at each level, and thus bounds the size of each probe. If each search node is placed in its own abstract state, the result will be a pure search algorithm, but for that $nabs$ would be exponential \cite{}.
        

        \vspace{-4pt}        
        \paragraph{Current state-of-the-art Abstraction Functions.} \label{sec:abstraction-sampling:existing-abstraction-functions}
       \citet{DBLP:conf/uai/BrokaDIK18} designed abstractions based on assignments to a 
       variable's context $C(X)$, a subset of its ancestors in the pseudo-tree $\cal T$ whose assignments or configurations uniquely determine the AND/OR subtree below it \citep{DBLP:journals/ai/DechterM07}. 
       % and thus its $Z(n)$.
       %Therefore abstracting nodes together that have the same context configuration ensures that they have the same $Z(n)$.
       However, the number of configurations to a context %$|\D_{C(X)}|$, 
       is exponential in the context's size.
       %In the {\relax context}  approach abstractions were employed.
       %was explored which is to select a subset of the context variables hoping that this will group nodes having similar $Z(n)$.
       %yielding too many abstract states, unless the induced-width is boundsd.  and is infeasible to use if the induced-width of the graph is high. Thus, 
      Thus, earlier work \citep{DBLP:conf/uai/BrokaDIK18,kask20-scaling-up-as} 
      used \textit{relaxed} context-based (\textbf{RelCB}) and \textit{randomized} context-based (\textbf{RandCB}) abstractions that control the number of abstract states.  RelCB is controlled by a {\em level} parameter $j$  %parameterized by a level $j$, 
      %selecting the closest $j \! - \! 1$ variables from a variable's context (ie. its {\em relaxed context}) plus itself. 
      grouping nodes having the same configuration over  $j \! - \! 1$ context variables (the relaxed context) in an abstract state. With a domain size of $k$, this yields at most $k^j$ abstract states at each level.  The randomized scheme, RandCB, considers the entire context, but bounds the number of abstract states per level based on an $nAbs$ parameter, using a randomized hashing scheme to associate each of the $nAbs$ abstract states with a subset of possible full context assignments.

      % \rina{Bobak: you have to mention the notion of "granularity and to say that $nabs$ denoted the granularity.}
     % \rina{In the next section we introduced a class of abstraction functions which we call "value-based".}






    \section{Value-Based Abstractions} \label{sec:ordered-value-based-abstraction-functions}
    \vspace{-4pt}

        We now introduce a new class
        %framework for 
        of abstractions which we call Value-Based Abstractions. They are defined by  
        %Value-based abstraction functions consist of two parts: 
        (1) a positive real-valued function $\mu: D_{\X} \rightarrow \mathbb{R}^{+}$,
        where $D_{\X}$ is a set of partial configurations for the variables \X,
        %that assigns a real positive number to each node $n$,
        and by (2) a partitioning scheme $\psi$ that assigns nodes in $N$ to ordered abstract states based on their $\mu$ values in an order-consistent manner, as defined next. 
        
        \begin{definition}[Value Ordered Partitioning] Given $nAbs$ and a function $\mu: D_{\X} \rightarrow \mathbb{R}^{+}$, a partitioning function $\psi_{\mu}: D_{\X}  \rightarrow \{ A_1,A_2,\dots,A_{nAbs} \}$
        is order consistent with $\mu$ relative to the $nAbs$ allowed abstract states if
        ($n_1 \in  A_i$ and $n_2 \in A_j$ where $i<j$) $\Rightarrow$ $\mu(n_1) \leq \mu(n_2)$.
        \vspace{-6pt}
        \end{definition}

        We will categorize our value-based abstractions by specifying different types of $\mu$ and $\psi$ functions.

        % \rina{bobak, I thing algorithm2 should be removed and the whole discussion simplified. You can talk about how to generate the abstract cases for the specific cases. I comment it}
        %Algorithm \ref{alg:general-ordered-value-based-abstraction-function} provides a general value-based abstraction scheme that maintains an ordering of nodes according to $\mu(n)$. Assuming the value function $\mu(\cdot)$ is not dominating, the complexity is determined by the complexity of the partitioning function used.

        %\rina{removed.}
        %We next present three value-based abstraction classes, each based on a unique $\mu$.  Subsequently, we will  present seven ordered partitioning schemes that, in conjunction with a $\mu$, are used %with Algorithm \ref{alg:general-ordered-value-based-abstraction-function} 
        %to define a unique value-ordered abstraction function.
        

        \subsection{Value-Based Abstraction Classes} \label{sec:value-based-abstraction-classes}
        \vspace{-4pt}
        
            We introduce three Value-Based Abstraction types, each characterized by a unique value function $\mu$ that signifies a notion of similarity between nodes.   We will subsequently provide partitioning functions that, together with $\mu$, will yield a set of abstraction functions.
            
            % In this work we present three value-based abstraction classes: Heuristic-Based (HB), HR-Based (HRB), and Q-Based (QB) abstraction value-classes.  Each is motivated by theory in search or sampling discussed in Section \ref{sec:paradigms}, and each can be used with node partitioning schemes (Section \ref{sec:ordered-partitioning-schemes}), which together form a value-ordered abstraction function.

            % The three types of guiding value functions we will use as a basis for abstraction function are 1) Heuristic-based, 
            % % 2) Heuristic and Ancestral Branching-based, 
            % 2) Heuristic and Ancestral Branching-based (or HR-based), 
            % and 3) Q-based value functions. 
            % %when Q is the proposal function. W

            \vspace{-4pt}
            \paragraph{1. Heuristic-Based Abstractions.} \label{sec:value-based-abstraction-classes:HB}

            
                % \begin{quote}
                %     $\mu(n) = h(n)$
                % \end{quote}
                
                %Using the motivation of abstracting nodes with similar subtree $Z(n)$ intuited from previous work and concepts of graph search,
                Heuristic-Based (HB) abstractions use $\mu(n) = h(n)$, where $h(n)$ is a heuristic estimate of $Z(n)$.  Unlike the partial or hashed contexts used by \citet{DBLP:conf/uai/BrokaDIK18}, heuristic estimates of $Z(n)$ can often provide refined \textit{quantitative} insight into potential similarities of $Z(n)$ values. In particular, this intuition holds when using wMBE heuristics, which provide bounds on $Z(n)$. 
    
                % In conjunction with the node partitioning schemes that will be presented in Section \ref{sec:ordered-partitioning-schemes}, the presented HB abstraction functions aim to form abstractions such that nodes with similar $Z(n)$ are grouped together.

    
            \vspace{-4pt}
            \paragraph{2. Heuristic and Ancestral Branching-Based Abstractions.} \label{sec:value-based-abstraction-classes:HRB}

    \shrink{
                % \begin{quote}
                %     $\mu(n) = h(n)  \! \cdot \!  r(n)$
                % \end{quote}

                Consider the following definition of "exact" abstraction functions:
                \begin{definition}[Exact Abstraction Function]
                     An abstraction function $a(\cdot)$ is exact for an Abstraction Sampling algorithm, AS, if use of $a(\cdot)$ with AS always leads to AS estimates having zero variance and $\hat{Z} = Z$ for every AS probe.
                \end{definition}

                }

                Recall that
                %$h(n)$ is a heuristic estimate of \hyperref[sec:partition-function-of-a-node]{$Z(n)$} and 
                $r(n)$ is an estimate of $n$'s \hyperref[sec:partition-function-of-a-node]{ancestor branching mass $R(n)$}. We can show that:
                 \begin{theorem}[AOAS Exact Abstractions] \label{thm:aoas-proportionality-exact-proposal}
              %  \begin{theorem}[AOAS Exact Abstractions from $h(n)r(n)$ vs. $Z(n)R(n)$ %Proportionality] \label{thm:aoas-proportionality-exact-proposal}
                      If an abstraction function $a(\cdot)$ forms abstract states $\bs{A_{i}}$ such that 
                      for each $\bs{A_{i}}$ there exists a constant $c_i > 0$ with
                      $\frac{h(n)r(n)}{Z(n)R(n)} = c_i$ for all $n \in \bs{A_{i}}$
                      %for some $\propto_{i} \in %\!\!  \mathbb{R}_{>0}$ 
                      %$\forall n \in \bs{A_{i}}, 
                     % \frac{h(n)r(n)}{Z(n)R(n)} = \; \propto_{i}$ for some $\propto_{i} \in %\!\!  \mathbb{R}_{>0}$ 
                      whenever $Z(n)R(n) > 0$, or $h(n)r(n) = 0$ otherwise,
                     % $Z(n)R(n) \in \!\!  \mathbb{R}_{>0}$, or $h(n)r(n) = 0$ otherwise, then 
                     then AOAS is exact
                     %it is an exact abstraction function for AOAS. 
                     (Proof in Supplemental).%Materials).
            \vspace{-6pt}
                \end{theorem}
    
                This observation suggests use of $hr(n) = \frac{h(n)r(n)}{Z(n)R(n)}$  as a similarity measure. Thus, placing nodes having close $hr$ values in the same abstract state can lead to a reduction in variance.  Clearly, this $hr$ ratio is hard to compute, but it inspires related expressions such as what we call HR-Based (HRB) abstractions, which use $\mu(n) = h(n)r(n)$.
                %as a surrogate for similarity of this ratio and group nodes accordingly.
    

            \vspace{-4pt}
            \paragraph{3. Q-Based Abstractions.} \label{sec:value-based-abstraction-classes:QB}
    
                % \begin{quote}
                %     $\mu(n) = w(n) \! \cdot \! g(n) \! \cdot \! h(n) \! \cdot \! r(n)$
                % \end{quote}
    
                Another intuition for generating abstractions comes from Statistics.  In his work on stratified Importance Sampling, \citet{rizzo_2007} showed the potential of overall variance reduction by aiming to find strata (abstraction states) that minimize the variance within strata %when forming strata 
                that have equal mass under the proposal distribution.  Thus, since our proposal $p$ is proportional to $w(n)g(n)h(n)r(n)$, we use  $\mu(n) = q(n) \! = \! w(n)g(n)h(n)r(n) $.  We call this type of abstraction Q-based (QB).
    
                % \rina{remove (unclear) In addition to serving as an un-normalized proposal function, $q(n)$ also estimates $n$'s
                % contribution to the overall $Z$. Therefore, $q(n)$ estimates the impact of $n$ (and all previously abstracted nodes that $n$ represents) on the overall $Z$.}
                %which could be a valuable quantity to base our choice of nodes on as discussed in Section \ref{sec:paradigms:combined}.
     


        \subsection{Ordered Partitioning Schemes} \label{sec:ordered-partitioning-schemes}
        \vspace{-4pt}
        
           Next we describe and motivate seven partitioning schemes to be used with $\mu$ for creation of value-based abstraction functions. For brevity, we have omitted the algorithmic representations of the partitioning schemes; they can be found in the Supplemental.
                
            
            %We now present seven distinct schemes of partitioning nodes into abstract states such that nodes are sorted according to a provided abstraction value function $\mu(\cdot)$. In addition to defining each scheme we also describe the motivation behind its creation and show the results on a running example we will use presented below.

            \vspace{-4pt}
            \paragraph{Running Example.} \label{sec:ordered-partitioning-schemes:running-example} 
            We will use the following running example to illustrate the various partitioning schemes.
            
               % As we motivate and describe the various partitioning schemes, we will also provide examples of the abstract states that would result from partitioning nodes with the following $\mu(n)$:
               Assume we have eight nodes with the following $\mu(n)$:
                \begin{align} \label{eq:running-partitioning-example}
                    % \set{
                        1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100
                    % }
                \end{align}
               and want to partition the nodes into $nAbs=4$ abstract states.  As we explore each partitioning scheme, we also demonstrate how the scheme would partition these nodes. 

            \vspace{-4pt}
            \paragraph{\NoCaseChange{1. SimpleVB}.} \label{sec:ordered-partitioning-schemes:simpleVB}
    
                The simpleVB (simple value-based) scheme groups nodes having similar $\mu(n)$ in the same state by a simple 2-step process: 
                1) nodes are ordered by $\mu(n)$ (low to high), and 2) the ordered nodes are partitioned into [approximately] equal cardinality abstract states.
    
                % \textit{Time Complexity:}
                %     Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{1.0, 1.1}, \smallset{1.2, 1.3}, \smallset{1.4, 1.5}, \smallset{10, 100}.
               % Nodes are partitioned evenly, and through its simplicity 
                This method aims to leverage speed 
                % allowing for abstractions to be formed quickly 
                 yet still roughly groups nodes with similar $\mu(n)$ together.
                %\rina{complexity?}
                %\footnotetext{\label{ftn:ordered-schemes-maintain-sort-order}Such that nodes maintain sort order $o$ across all abstract states.}
    
    
            \vspace{-4pt}
            \paragraph{\NoCaseChange{2. minVarVB.}} \label{sec:ordered-partitioning-schemes:minVarVB}
    
                The minVarVB %scheme 
                uses Ward's Minimum Variance Hierarchical Clustering, also known as Ward's Method \citep{ward1963}, to cluster nodes into $nAbs$ abstract states. The objective is to minimize total within variance of $\mu(\cdot)$ across all abstract states.  Ward's Method is an agglomerative hierarchical clustering algorithm that creates a dendrogram by iteratively merging clusters. Ward's Method can be combined with Lance-Williams linear distance updates \citep{LanceWillaims1967-distanceUpdates} to increase efficiency.
                % We include more details on Ward's Method and Lance-Williams linear distance updates in the Supplemental Materials.

                % \textit{Time Complexity:\footnote{\label{ftn:time-complexity-assumes-constant-time-v}Assuming $\mu(n)$ is $\mathcal{O}(1)$ in both time and space.}}
                %     The choice of clusters to merge generally leads to having a $\mathcal{O}(|\bs{n^{*}}|^{3})$ time complexity due to the need to compare pair-wise distances between all clusters at each iteration.  However, in the case where nodes are distributed linearly in one dimension, only neighboring distances need to be considered at each iteration and can be made efficient by use of a priority queue, however since the Lance-Williams distance updates themselves take linear time, once per iteration, the reduced time complexity is still $\mathcal{O}(|\bs{n}|^{2})$.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     The space complexity is implementation dependent, with most time-efficient variants making use of a distance matrix leading to $\mathcal{O}(|\bs{n}|^{2})$ space complexity.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{1.0, 1.1, 1.2}, \smallset{1.3, 1.4, 1.5}, \smallset{10}, \smallset{100}.
                In contrast to simpleVB, minVarVB places considerable resources into computing abstractions, potentially leading to fewer probes, but provably forms abstractions that minimize the total within variance of $\mu(n)$ among the abstract states.
    
    
            \vspace{-4pt}
            \paragraph{\NoCaseChange{3. equalDistVB}.} \label{sec:ordered-partitioning-schemes:equalDistVB}
    
               % equalDistVB 
                Building upon the ideas of both
                %is inspired by the goal of the 
                minVarVB and the simplicity of simpleVB, this scheme greedily adds nodes in order of $\mu$ (low to high) into an abstract state $\bs{A_{i}}$ until $\sum_{j=1}^{i} \sum_{n \in \bs{A_{j}}} \mu(n) \geq \frac{i \cdot \sum_{n' \in \bs{n}} \mu(n')}{nAbs}$,
                namely until the total sum of node values from $\bs{A_{1}},...,\bs{A_{i}}$ reaches or exceeds the $\frac{i}{nAbs}$ quantile. When paired with the QB type, 
                %the equalDistVB schemes also attempts to
                it aims to partition nodes into equal mass states under the proposal, motivated by \citet{rizzo_2007}.
                
                %This in corresponds to the condition in \citet{rizzo_2007}'s proposition for stratified importance sampling variance reduction.
    
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                %\rina{For our running example we can get: }
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100}, \smallset{}, \smallset{}, \smallset{}.
                Although this method aims to find a balance between the intuitions previously explored while maintaining speed, the running example shows how it can yield potentially undesirable partitionings in the presence of certain distributions of node values.  In this example, the first quantile is only reached after all the nodes have been added to the first abstract state, leaving no nodes remaining to be partitioned into the subsequent abstract states. 
    
    
            \vspace{-4pt}
            \paragraph{\NoCaseChange{4. equalDistVB2}.} \label{sec:ordered-partitioning-schemes:equalDistVB2}

                equalDistVB2 uses the same partitioning strategy as equalDistVB but reverses the sort order (high to low), aiming to mitigate some of the overfilling of abstract states shown above for equalDistVB.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{100}, \smallset{}, \smallset{}, \smallset{10, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0}.
                We see that equalDistVB2 can still yield overpacking of abstract states.  
                The next two variants aim to mitigate this issue.
    
    
            \vspace{-4pt}
            \paragraph{\NoCaseChange{5. equalDistVB3}.} \label{sec:ordered-partitioning-schemes:equalDistVB3}
    
                % \begin{quote}
                %     $o = \tn{high to low}$\\
                %     $\Part{equalDistVB3}$ (Algorithm \ref{alg:psi-equalDistVB3})
                % \end{quote}
    
        %         \begin{algorithm}[t!]
        %             \vspace{-6pt}\caption{$\Part{equalDistVB3}$}
        %             \label{alg:psi-equalDistVB3}
        %             \begin{footnotesize}
        %                 \SetInd{0.25em}{0.55em}
        %                 \DontPrintSemicolon 
        %             \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(\cdot)$}
        %             \Output{
    				% With 
    				% %
    				% $Z(\bs{A_{1,...,i}}) = (\sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} Z(n')$,
    				% %
    				% $n_{\bs{A_{i}}}^{\tn{last}}$ be the last node in $\bs{A_{i}}$, 
    				% %
    				% and 
    				% %
    				% $P_{i} = \frac{ i \cdot \sum_{n \in \bs{n^{*}}}\mu(n)}{nAbs}$,
    				% %
        %             $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
        %             $(\; Z(\bs{A_{1,...,i}}) \geq P_{i} \;)$
        %             $\land$ \\ $(\; (\, \bs{|A_{i}}|=1 \,) \lor (\, Z(\bs{A_{1,...,i}}) - Z(n_{\bs{A_{i}}}^{\tn{last}}) < P_{i} \,) \;)$ }
                    
        %             \Begin{
        %                 $j \leftarrow 1$\\
        %                 \ForEach{$i \leftarrow 1,...,nAbs$}{
        %                     $\bs{A_{i}} = \set{n^{*}_{{j}}}$\\
        %                     $j \leftarrow j+1$\\
        %                     \While{$Z(\bs{A_{1,...,i}}) < P_{i}$}{
        %                         $\bs{A_{i}} \leftarrow A_{i} \cup \set{n^{*}_{{j}}}$\\
        %                         $j \leftarrow j+1$
        %                     }
        %                 }
        %                 $\bs{A} = \cup_{i = 1}^{nAbs} \bs{A_{i}}$\\
        %                 \Return $\bs{A}$       
        %             }
        %             \end{footnotesize}
        %         \end{algorithm}
    
                In order to lessen overpacking and ensure abstract states are not left empty, equalDistVB3 modifies equalDistVB2 so that each new abstract state is first assigned at least one node by default, after which filling proceeds as in the previous equalDistVB schemes.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{100}, \smallset{10}, \smallset{1.5}, \smallset{1.4, 1.3, 1.2, 1.1, 1.0}.
                Still highly efficient, equalDistVB3 manages to ensure that the provided $nAbs$ granularity is honored, allowing users better control of the search vs. sampling interpolation possible with Abstraction Sampling.
    
    
            \vspace{-4pt}
            \paragraph{\NoCaseChange{6. equalDistVB4}.} \label{sec:ordered-partitioning-schemes:equalDistVB4}
    
        %         \begin{quote}
        %             $o = \tn{high to low}$\\
        %             $\Part{equalDistVB4}$ (Algorithm \ref{alg:psi-equalDistVB4})
        %         \end{quote}
    
        %         \begin{algorithm}[t!]
        %             \vspace{-6pt}\caption{$\Part{equalDistVB4}$}
        %             \label{alg:psi-equalDistVB4}
        %             \begin{footnotesize}
        %                 \SetInd{0.25em}{0.55em}
        %                 \DontPrintSemicolon 
        %             \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $\mu(\cdot)$}
        %             \Output{
    				% With 
    				% %
    				% $Z(\bs{A_{1,...,i}}) = (\sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} Z(n')$,
    				% %
    				% $n_{\bs{A_{i}}}^{\tn{last}}$ be the last node in $\bs{A_{i}}$, 
    				% %
    				% and 
    				% %
    				% $L_{i} = \frac{Z(\bs{n^{*}})-Z(\bs{A_{1,...,i-1}})}{nAbs-i+1}$,
    				% %
        %             $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
        %             $(\; Z(\bs{A_{i}}) \geq L_{i} \;)$
        %             $\land$ \\ $(\; (\, \bs{|A_{i}}|=1 \,) \lor (\, Z(\bs{A_{i}}) - Z(n_{\bs{A_{i}}}^{\tn{last}}) < L_{i} \,) \;)$ }
                    
        %             \Begin{
        %                 $j \leftarrow 1$\\
        %                 \ForEach{$i \leftarrow 1,...,nAbs$}{
        %                     $\bs{A_{i}} = \set{}$\\
        %                     \While{$Z(\bs{A_{i}}) < L_{i}$}{
        %                         $\bs{A_{i}} \leftarrow A_{i} \cup \set{n^{*}_{{j}}}$\\
        %                         $j \leftarrow j+1$
        %                     }
        %                 }
        %                 $\bs{A} = \cup_{i = 1}^{nAbs} \bs{A_{i}}$\\
        %                 \Return $\bs{A}$       
        %             }
        %             \end{footnotesize}
        %         \end{algorithm}
    
                The final variant of the equalDist schemes, equalDistVB4, aims for more even partitioning %than the previous variants 
                by recomputing quantiles. Each time the algorithm is about to process a new abstract state, the remaining nodes and abstract states are used to compute new quantiles which are then used to guide filling of the current abstract state.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $Z(A_{1...i})$ can be updated progressively in constant time, and thus computation of $L_{i}$ at each iteration can also be done in constant time.  Partitioning is achieved via one pass through $|\bs{n}|$ leading to $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    \smallset{100}, \smallset{10}, \smallset{1.5, 1.4, 1.3}, \smallset{1.2, 1.1, 1.0}.
                Still highly efficient, equalDistVB4 manages to spread nodes with smaller values across abstract states more evenly.

    
            \vspace{-4pt}
            \paragraph{\NoCaseChange{7. randVB}.} \label{sec:ordered-partitioning-schemes:randVB}
    
                It can be beneficial to rely on randomness to ensure a diverse sampling of abstractions.  randVB does this by sampling $nAbs\!-\!1$ partition points uniformly at random and without replacement from between nodes sorted according to $\mu(\cdot)$, and then partitions the nodes accordingly. As a result, abstract states are formed such that nodes are still grouped according to $\mu(\cdot)$, but the size of those groups varies.
                
                % \textit{Time Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     $\mathcal{O}(|\bs{n}|)$ time complexity.
                % \textit{Space Complexity:\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}
                %     No more than linear space is required.  $\mathcal{O}(|\bs{n}|)$.
                \textit{\hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}:}
                    ex1: \smallset{100, 10}, \smallset{1.5}, \smallset{1.4, 1.3, 1.2}, \smallset{1.1, 1.0};
                    ex2: \smallset{100}, \smallset{10, 1.5, 1.4, 1.3}, \smallset{1.2, 1.1}, \smallset{1.0};
                    ...etc.

            \vspace{-4pt}
            \paragraph{Complexity.} Assuming $\mu(\cdot)$ is $\mathcal{O}(1)$, all above-mentioned partitioning schemes have time complexity $\mathcal{O}(|\bs{n}| \log |\bs{n}|)$ and space complexity of $\mathcal{O}(|\bs{n}|)$, with the exception of minVarVB, which requires $\mathcal{O}(|\bs{n}|^{2})$ for both.





    \section{Empirical Evaluation} \label{sec:empirical-evaluation}

        %%%%%%%%%%%%%%%%%%% AS Algorithms Tested

       
        \paragraph{Overview.}
        \vspace{-4pt}
        
            All combinations of the three new Value-Based Abstraction Classes: Heuristic-Based \textbf{HB}, HR-Based \textbf{HRB}, and Q-Based \textbf{QB}; with each of the seven Ordered Partitioning Schemes: \textbf{simple}, \textbf{minVar}, \textbf{equalDist1--4}, and \textbf{rand}, were tested, resulting in a total of twenty-one abstraction functions.  For comparison, the formerly evaluated context-based (\textbf{CTX}) abstraction functions, RandCB and RelCB, were also used.  In addition, a pure randomized abstraction function, \textbf{RAND}, was also included.  With the exception of RelCB, each abstraction function accepts a hyperparameter called $nAbs$ which bounds the number of abstract states at any level. RelCB instead uses an $nContext$ parameter that limits the number of context variables used in assigning abstract states.  To facilitate comparison, we report RelCB's $nContext$ parameter instead as an equivalent $nAbs$ parameter assuming a domain size of $2$.  (For example, if RelCB was run using $nContext = 6$, in results we report it as RelCB with $nAbs = 2^{6}$.) All of the abstraction functions were tested using the AOAS algorithm \citep{kask20-scaling-up-as}.  All algorithms were implemented in C++. All experiments were run on a 2.66 GHz processor and allotted 8 GB of memory.
        
        
        
        %%%%%%%%%%%%%%%%%%% Heuristic Description
        \vspace{-4pt}
        \paragraph{Heuristics.}
            To inform the sampling proposal, Weighted Mini-Bucket Elimination (wMBE) \citep{DBLP:journals/jacm/DechterR03,DBLP:conf/icml/LiuI11} is used as a heuristic.  The i-bound (\textbf{iB}) parameter controls the strength of wMBE, where higher i-bounds generally lead to stronger heuristics and, thus, better proposals at the expense of higher computation and memory. We standardize our experiments by using the same i-bound when comparing across algorithms. 
        
        
        
        
        %%%%%%%%%%%%%%%%%%% Benchmark Description
        \vspace{-4pt}
        \paragraph{Benchmarks.}
            
            In line with previous work on Abstraction Sampling, we perform experiments on the same set of over 400 problems from five benchmarks: DBN, Grids, Linkage-Type4, Pedigree, and Promedas used by \citet{kask20-scaling-up-as}. 
            
            We refer to problem instances with known $Z$ values as ``Exact''.  Larger problems without exact solutions are called ``LARGE''.  For LARGE problems, estimates from 100\,hr of context-based Abstraction Sampling (obtained from \citet{kask20-scaling-up-as}) are used as the true $Z$.  When experimenting on Exact problems, algorithms use a small i-bound of 5 (weakening heuristic estimates) and are given a short time limit of 300\,sec in order to increase difficulty.  For LARGE problems, an i-bound of 10 and a time limit of 1200\,sec are used.

            For both brevity and preciseness, we focus on results from the Exact problem instances. 
            % thus here excluding the Linkage-Type4 benchmark whose problems do not have known solutions.  
            Results for LARGE problems can be found in the Supplemental Materials and their trends generally agree with those from the Exact problems.

            
            
            \begin{centering}
            \begin{table}[H]
                \centering
            \caption{
                \textbf{Exact Benchmark Statistics}. Average benchmark statistics for Exact problems. \textbf{N}: number of instances, \textbf{\tabs{X}}: average number of variables, \textbf{k}: average of problems' largest domain sizes, \textbf{w\super{*}}: average induced tree-width, \textbf{d}: average \PT depth. 
                \label{tbl:small-benchmark-statistics}
            }
            \vspace{-8pt}
            \begin{tabular}{lrrrrr}
              \toprule
              Benchmark &   N &   |\textbf{X}| &     k &          w* &        d \\ 
              \midrule
                    DBN &  66 &      67 &          2 &      29 &      30 \\ 
                  Grids &   8 &     250 &          2 &      22 &      49 \\ 
               Pedigree &  25 &     690 &          5 &      25 &      89 \\ 
               Promedas &  65 &     612 &          2 &      21 &      62 \\ 
              \bottomrule
            \end{tabular}
            \end{table}
            \vspace{-8pt}
            \end{centering}

           \begin{centering}
           \begin{table}[H]
               \centering
            \caption{
                \textbf{LARGE Benchmark Statistics}. Average benchmark statistics for LARGE problems. \textbf{N}: number of instances, \textbf{\tabs{X}}: average number of variables, \textbf{k}: average of problems' largest domain sizes, \textbf{w\super{*}}: average induced tree-width, \textbf{d}: average \PT depth. 
                \label{tbl:large-benchmark-statistics}
            }
            \vspace{-6pt}
            \begin{tabular}{lrrrrr}
              \toprule
              Benchmark &   N &   |\textbf{X}| &        k &          w* &        d \\ 
              \midrule
                        DBN &   48 &     216 &        2 &     78 &    78\\
                      Grids &   19 &    3432 &        2 &    117 &   220\\
              Linkage-Type4 &   82 &    6550 &        5 &     45 &   761\\
                   Promedas &  173 &    1194 &        2 &     72 &   114\\
              \bottomrule
            \end{tabular}
           \end{table}
            \vspace{-8pt}
            \end{centering}
        
        
        %%%%%%%%%%%%%%%%%%% Performance Measure
        \vspace{-4pt}
        \paragraph{Performance Measure.}
            To evaluate the performance of the various algorithms, we define the error as:    
            $Error = |\log_{10} \hat{Z} - \log_{10} Z^{*}|$,
            where $\hat{Z}$ is the estimate obtained and $Z^{*}$ is the exact $Z$ value.



        \subsection{Results} \label{sec:empirical-evaluation:results}
        \vspace{-4pt}
        
            % \subsubsection{Aggregated Results Tables}
            
                % \vspace{-4pt}
                \paragraph{Summary Comparison.}
                    To examine the potential of the different methods, we tested each algorithm with a range of $nAbs \in \set{1, 4, 16, 64, 256, 512, 1024, 2048}$. For each $nAbs$ and  benchmark, we calculated the average error across problems of the benchmark and identified the $nAbs$ that resulted in the lowest average error. In Table \ref{tbl:small-aggregations} we focus only on Exact problems and show this lowest average error and corresponding $nAbs$ for each algorithm,
                    %and benchmark, 
                    highlighting schemes that performed well across all benchmarks.  Table \ref{tbl:large-qb-aggregations} shows the corresponding results for LARGE problems on the better performing QB abstraction class as well as the CTX and RAND classes for comparison.  If an algorithm was unable to produce a positive Monte Carlo $Z$ estimate for a problem (denoted as ``Fail''), the wMBE heuristic bound was used as its $Z$ estimate and the error was computed accordingly.
    
                    % Tables \ref{tbl:DBN_aggregation}-\ref{tbl:Promedas_aggregation} show aggregated performance of the various Value-Based Abstraction Classes with the various Partitioning Schemes on problems of DBN, Grids, Linkage-Type4, and Promedas benchmarks.

                    \begin{tablefigure*}[!htb]
                        \centering     %%% not \center
                        \begin{subtablefigure}{0.9\linewidth}
                            \includegraphics[width=0.98\linewidth]{UAI-24/_attachments/Results/ALL-SMALL-aggregations-i-5-t-300.pdf}
                            \caption{}
                            \label{tbl:small-aggregations}
                        \end{subtablefigure}
                        \begin{subtablefigure}{0.9\linewidth}
                            \includegraphics[width=0.98\linewidth]{UAI-24/_attachments/Results/QB-CTX-RAND-LARGE-aggregations-i-10-t-1200.pdf}
                            \caption{}
                            \label{tbl:large-qb-aggregations}
                        \end{subtablefigure}
                        \vspace{-10pt}\caption{\textbf{Summary Comparison}. For each table, displayed are the Abstraction Class (\textit{Class}), Partitioning Scheme (\textit{Scheme}), bound on the number of abstract states per level (\textit{nAbs}), number of problems for which a positive solution could not be estimated (\textit{Fail}), and average $\log_{10} Z$ error (\textit{Avg. Error}) across problems of the given benchmark.  Color bars visualize the magnitude of the values. Overall best performing algorithms are highlighted.  (a) shows results on Exact problems. (b) shows results on LARGE problems.
                \vspace{-4pt}
                        }
                        \label{tbl:summary-aggregations}
                    \end{tablefigure*}

    
                \vspace{-4pt}
                \paragraph{Comparison using 100 Samples.} \label{sec:empirical-evaluation:results:aggregation-tables:set-number-of-samples}

                    \begin{tablefigure}[!htb]
                        \centering
                        \includegraphics[width=0.99\linewidth]{UAI-24/_attachments/Results/ALL-SMALL-iB-5-nAbs-256-nR-100-QB-CB-RAND.pdf}
                        \vspace{-6pt}\caption{\textbf{100-Sample Comparison}. For abstraction granularity of $nAbs=256$, aggregated statistics (as described in Table \ref{tbl:summary-aggregations}) for Exact problems of each benchmark with each algorithm allotted 100 samples.
                        \vspace{-4pt}}
                        \label{tbl:results:ALL-SMALL-iB-5-nAbs-256-nR-100-QB-CB-RAND}
                    \end{tablefigure}
        
                    To assess the quality of abstraction functions in an implementation-agnostic manner and irrespective of resulting probe-sizes or speed of processing abstractions,
                    %However, as detailed in Section \ref{sec:ordered-partitioning-schemes}, some schemes may exhibit variations in execution time, and implementation differences can contribute to this variability. 
                    % And as discussed in Section \ref{sec:empirical-evaluation:results:abstraction-speed-plot}, probe sizes can also vary. 
                    %Probe sizes can also vary between use of different abstraction functions.
                    %To circumvent these artifacts, 
                    we conducted experiments using a one-hundred sample termination condition (\textbf{m-100}). 
                    % rather than a time constraint. 
                    Table \ref{tbl:results:ALL-SMALL-iB-5-nAbs-256-nR-100-QB-CB-RAND} shows these results on Exact problems of each benchmark for the better performing QB algorithms with $nAbs=256$.  We use $nAbs=256$ as (1) it is an intermediate granularity and (2) all schemes produced 100 samples in a reasonable amount of time.  We highlight the best performing schemes.

            \vspace{-4pt}
            \paragraph{Varying \NoCaseChange{nAbs}.}

                \begin{tablefigure}[!htb]
                    \centering
                    \includegraphics[width=0.99\linewidth]{UAI-24/_attachments/Results/varying-nAbs-SMALL-i-5-t-300-best-QB.pdf}
                    \vspace{-6pt}\caption{\textbf{Varying nAbs}. Average error when using $nAbs \in \set{4, 64, 1024}$ for minVarQB, equalDistQB3, equalDistQB4, the CTX based algorithms, and RAND, each with iB-5 and time limit of 300 sec.
                \vspace{-12pt}
                }
                    \label{tbl:varying-nAbs-SMALL-i-5-t-300-best-QB}
                \end{tablefigure}

                \begin{plotfigure}[!htb]
                \vspace{-14pt}
                    \centering
                    \includegraphics[width=0.99\linewidth]{UAI-24/_attachments/Results/error-vs-nAbs-plot-equalDistQB4-iB-5}
                    \vspace{-8pt}\caption{\textbf{Varying $\bs{nAbs}$ for equalDistQB4}. Plotted is the average error on Exact problems using iB-5 and time limit of 300 sec for each benchmark for various abstraction granularities (in log2).
                \vspace{-4pt}
                }
                    \label{plt:results:error-vs-nAbs-plot-equalDistQB4-iB-5}
                \end{plotfigure}

                \begin{plotfigure}[H]
                \vspace{-14pt}
                    \centering
                    \includegraphics[width=0.99\linewidth]{UAI-24/_attachments/Results/error-vs-nAbs-plot-minVarQB-iB-5}
                    \vspace{-8pt}\caption{\textbf{Varying $\bs{nAbs}$ for minVarQB}. Plotted is the average error on Exact problems using iB-5 and time limit of 300 sec for each benchmark and for various abstraction granularities (in log2).
                \vspace{-24pt}
                }
                    \label{plt:results:error-vs-nAbs-plot-minVarQB-iB-5}
                \end{plotfigure}
                
                To observe the effect of changing $nAbs$, Table \ref{tbl:varying-nAbs-SMALL-i-5-t-300-best-QB} shows average error for different {\small $nAbs \in \smallset{4, 64, 1024}$} for Exact problems of each benchmark.  We focus on the better performing variants of QB: minVarQB, equalDistQB3, equalDistQB4; the purely randomized scheme: RAND; and the context-based schemes (CTX) for comparison. In Plots \ref{plt:results:error-vs-nAbs-plot-minVarQB-iB-5} and \ref{plt:results:error-vs-nAbs-plot-equalDistQB4-iB-5}, we also show average error across a wider array of $nAbs$ for minVarQB and equalDistQB4, respectively, the latter also acting as a representative for the profile of the plots of equalDistQB3 and RAND.



            \vspace{-4pt}
            \paragraph{Time Series Plot.}

                Plots \ref{plt:results:grid20x20.f15-time-series} and \ref{plt:results:or_chain_209.fg-time-series} show time-series $Z$ estimates for the better performing QB algorithms, the purely randomized scheme, and context-based schemes (CTX) on a representative Grids and representative Promedas problem.  Each algorithm was plotted with the $nAbs$ that resulted in the lowest average error for the problem's respective benchmark.  Each plot line is labeled with the scheme, $nAbs$ used, and the final $Error$ of its estimate.

                \begin{plotfigure}[!htb]
                    \centering
                    \includegraphics[width=0.99\linewidth]{UAI-24/_attachments/Results/grid20x20.f15-time-series.png}
                    \vspace{-8pt}\caption{$Z$ estimates from various algorithms versus time on Grids problem grid20x20.f15  using $iB=5$. The dashed black line shows the true $Z$ value.
                \vspace{-12pt}
                }
                    \label{plt:results:grid20x20.f15-time-series}
                \end{plotfigure}

                \begin{plotfigure}[t]
                    \centering
                    \includegraphics[width=0.99\linewidth]{UAI-24/_attachments/Results/or_chain_209.fg-time-series.png}
                    \vspace{-8pt}\caption{$Z$ estimates from various algorithms versus time on Promedas problem or\us chain\us 209.fg  using $iB=5$. The dashed black line shows the true $Z$ value.
                \vspace{-4pt}
                }
                    \label{plt:results:or_chain_209.fg-time-series}
                \end{plotfigure}

                 



        \subsection{Analysis} \label{sec:empirical-evaluation:analysis}
        \vspace{-4pt}

            % \vspace{-4pt}
            \paragraph{Performance Comparison with Context-Based Schemes.}

                Comparing errors of the HB and HRB classes to the CTX class for Exact problems in Table \ref{tbl:small-aggregations}, we see that there always exists a partitioning scheme that can outperform the best context-based scheme.  For the HB class, the \textit{simple} and \textit{rand} schemes perform best, whereas for the HRB class it seems to be more benchmark dependent.  The QB schemes with \textit{minVar}, \textit{equalDist3}, and \textit{equalDist4} outperform the CTX schemes across all benchmarks.  The purely randomized scheme (RAND) also consistently outperforms the CTX schemes.  Results from Table \ref{tbl:large-qb-aggregations} on LARGE problems agree, with the exception of the \textit{minVar} QB and RAND schemes, which fall slightly shy of randCB (CTX \textit{rand}) in performance on Promedas.

            \vspace{-4pt}
            \paragraph{Comparison with Purely Randomized Abstractions.}
                Table \ref{tbl:summary-aggregations} shows that RAND is a particularly well-performing scheme across all benchmarks.  However, the QB class using the \textit{equalDist3} and \textit{equalDist4} strategies is consistently comparable to or better than the purely randomized scheme, and no other scheme does as well.

            \vspace{-4pt}
            \paragraph{Comparison with Non-Abstraction Sampling Schemes.}
                In prior work by \citet{DBLP:conf/uai/BrokaDIK18} and \citet{kask20-scaling-up-as}, Abstraction Sampling using CTX-based abstractions was shown to be competitive against several powerful schemes such as Importance Sampling (IS), Weighted Mini-Bucket Importance Sampling (wMBIS) \citep{liu2015probabilistic}, IJGP-SampleSearch (IJGP-ss) \citep{DBLP:journals/ai/GogateD11}, and Dynamic Importance Sampling \citep{lou2019interleave}.  Thus, showing superior performance against CTX-based schemes implicitly indicates competitiveness against the above-mentioned non-Abstraction Sampling schemes.

            \vspace{-4pt}
            \paragraph{Abstraction Quality of the QB Schemes.}
                When drawing an equal number of samples with the same abstraction granularity of $nAbs=256$, QB with \textit{equalDist3} and \textit{equalDist4} and RAND perform well across all benchmarks, as was also seen when using a time limit.  However, a key difference we see is that QB with \textit{minVar}, which had shown only slightly worse performance using a time limit, is now the best performing scheme when normalizing the number of samples drawn.  This can, in part, explain the success of the QB \textit{equalDist3} and \textit{equalDist4} schemes, which attempt to emulate the QB \textit{minVar} scheme while using faster greedy strategies.

            \vspace{-4pt}
            \paragraph{Anytime Behavior.}
                From Plots \ref{plt:results:grid20x20.f15-time-series} and \ref{plt:results:or_chain_209.fg-time-series} we see that the Abstraction Sampling algorithms continue to improve their estimates as time progresses.  We also notice that estimates are often underestimates that increase over time.
                


            % \subsubsection{The Effect of \NoCaseChange{iB}

            \vspace{-4pt}
            \paragraph{Choice of Abstraction Granularity.}
                From Table \ref{tbl:varying-nAbs-SMALL-i-5-t-300-best-QB} we see that, for the well performing QB \textit{equalDist3} and \textit{equalDist4} schemes and for the RAND scheme, there is a trend that greater $nAbs$ (corresponding to a greater allotment of abstract states) improves performance to a point and then has little effect.  Plot \ref{plt:results:error-vs-nAbs-plot-equalDistQB4-iB-5} further supports this for QB with \textit{equalDist4}, for which plots of QB \textit{equalDist3} and RAND have similar profiles (omitted for brevity).  However, in Plot \ref{plt:results:error-vs-nAbs-plot-minVarQB-iB-5} and Table \ref{tbl:varying-nAbs-SMALL-i-5-t-300-best-QB} we see that for \textit{minVar} error begins to increase when $nAbs$ becomes too high.  This makes sense due to the higher computational cost of forming \textit{minVar} abstractions (which uses Ward's minimum variance hierarchical clustering), leaving less time for probe generation.

            \vspace{-4pt}
            \paragraph{Summary of Results.}
               Our experiments show that the QB schemes with \textit{equalDist3} or \textit{equalDist4} and the RAND scheme were significantly better than the earlier context-based abstractions (Figure \ref{fig:results:performance-matrix}).  We also observe that those best performing schemes tend to improve  as the abstraction scheme's granularity $nAbs$ increases up to a point, past which we see little difference in performance.  Thus, our study suggests that 
               %given an i-bound, we suggest use of one of 
               these three abstraction schemes should be the first choice when using AOAS, with a large $nAbs$ that is bounded to allow single  probe generation.

                \begin{figure}[t]
                    \centering
                    \includegraphics[width=0.50\linewidth]{UAI-24/_attachments/Results/performance-matrix.pdf}
                    \vspace{-8pt}\caption{\textbf{Performance Matrix}. Relative average performance of the value-based schemes vs. existing state-of-the-art context-based abstractions.  Values $> 1.00$ indicate superior performance.
                \vspace{-10pt}
                }
                    \label{fig:results:performance-matrix}
                \end{figure}




    \section{Conclusion} \label{sec:conlcusion}
    \vspace{-4pt}
            
        The paper provides a wide exploration of abstraction functions in the context of AND/OR Abstraction Sampling. We presented a value-based abstraction framework introducing three classes: HB, QB, and HRB, each defined by real-valued functions that aim to capture informative elements from search and sampling to improve AOAS performance. Each class was augmented with several partitioning schemes to form twenty-one new abstraction functions. We also presented an abstraction function RAND that places nodes into abstract states completely at random.
        An extensive empirical evaluation of all these abstraction functions was conducted on over 400 benchmark problems.  Our results show that two of the new QB-based schemes (\textit{equalDistQB3} and \textit{equalDistQB4}) and the RAND scheme exhibit superior performance consistently across all benchmarks. In particular, we obtained significantly improved performance relative to the current state of the art.
        Based on this study and on earlier findings, we believe that AOAS is one of the best schemes for estimating the partition function to date.
        Future work will explore adjusting the abstraction scheme and its $nAbs$ hyper-parameter to problem instances through learning. 
        %We observed a trend that allotting these abstraction functions a high number of abstract states helps them perform best. 


\shrink{
        %\paragraph{Summary.}
            We advance Abstraction Sampling by presenting a new abstraction framework, Value-Based Abstractions, which abstracts nodes based on values along a positive scale.  We introduce three Value-Based Abstraction Classes
            %- HB, QB, and HRB - 
            each defined by a unique value function motivated by paradigms in search and sampling. We also introduce seven Ordered Partitioning Schemes 
            %- \textit{simple}, \textit{equalDist}, \textit{equalDist2}, \textit{equalDist3}, \textit{equalDist4}, and \textit{rand} - 
            that partition nodes into abstract states by their values while maintaining value order across abstract states.  We also present a purely randomized abstraction function
            % - RAND - 
            that places nodes into abstract states completely at random.  We evaluated the schemes with the AOAS algorithm on over 400 standard benchmark problems characterizing their performance.  Three schemes, QB with \textit{equalDist3}, QB with \textit{equalDist4}, and RAND, were identified as exceptionally strong performers, demonstrating significantly better performance than the previous state-of-the-art.  We observed a trend that allotting these abstraction functions a high number of abstract states helps them perform best. 
}


\shrink{
\rina{I suggest to remove the following paragraph. It is  vague. If I would say anything is about how to automate abstraction function selection and its granularity using learning-based scheme.
}


        \paragraph{Suggested Extensions.}
            An extensive, but fruitful, followup to this work would be to identify properties of problem instances that play key roles in the quality of Abstraction Sampling estimates, and explore how these properties can be used to inform choice of abstraction function and hyper-parameters.  Additionally, a deep dive into the way different abstraction functions affect probe construction, and how probe structure and characteristics influence or inform the quality of estimates, would also be valuable.
}
        
\clearpage
    % \bibliographystyle{named}
    \bibliography{ref}




\end{document}