% %% Begin supplemental formatting
% \documentclass{article}
% \usepackage{natbib}
% \usepackage[margin=1in,footskip=0.25in]{geometry}
% \usepackage[utf8]{inputenc}
% \usepackage{graphicx}
% \usepackage{xcolor}
%     \definecolor{darkgreen}{rgb}{0.0, 0.2, 0.13}
%     \definecolor{cadmiumgreen}{rgb}{0.0, 0.42, 0.24}
%     \definecolor{byzantium}{rgb}{0.44, 0.16, 0.39}
% \usepackage{amsmath}
% \usepackage{mathrsfs}
% \usepackage[font=small]{caption}
% \usepackage[font=small]{subcaption}
% \usepackage{booktabs}
% \usepackage{multirow}
% \usepackage{enumitem}
% \usepackage{times}
% \usepackage{hyperref}
%     \hypersetup{
%         colorlinks=true,
%         urlcolor=blue,
%         urlbordercolor=blue,
%         linkcolor=blue,
%         linkbordercolor=blue,
%         filecolor=magenta,
%         pdfborderstyle={/S/U/W 1},
%     }
% \usepackage{mathrsfs}
% \usepackage[titlenumbered,ruled, linesnumbered]{algorithm2e}
%     \newcommand\commentstyle[1]{\textcolor{cadmiumgreen}{#1}}
%     \SetCommentSty{commentstyle}
%     \SetKwInOut{Input}{input}
%     \SetKwInOut{Output}{output}
% \usepackage{enumitem}
%     \setlistdepth{9}
%     \setlist[itemize,1]{label=$\bullet$}
%     \setlist[itemize,2]{label=$\cdot$}
%     \setlist[itemize,3]{label=$\cdot$}
%     \setlist[itemize,4]{label=$\cdot$}
%     \setlist[itemize,5]{label=$\cdot$}
%     \setlist[itemize,6]{label=$\cdot$}
%     \setlist[itemize,7]{label=$\cdot$}
%     \setlist[itemize,8]{label=$\cdot$}
%     \setlist[itemize,9]{label=$\cdot$}
%     \renewlist{itemize}{itemize}{9}
% \usepackage{algorithm2e}


%% end supplemental formatting


% %% Begin AAAI-22 formatting
% \def\year{2022}\relax
% %File: formatting-instructions-latex-2022.tex
% %release 2022.1
% \documentclass[letterpaper]{article} % DO NOT CHANGE THIS
% \usepackage{aaai22}  % DO NOT CHANGE THIS
% \usepackage{times}  % DO NOT CHANGE THIS
% \usepackage{helvet}  % DO NOT CHANGE THIS
% \usepackage{courier}  % DO NOT CHANGE THIS
% \usepackage[hyphens]{url}  % DO NOT CHANGE THIS
% \usepackage{graphicx} % DO NOT CHANGE THIS
% \urlstyle{rm} % DO NOT CHANGE THIS
% \def\UrlFont{\rm}  % DO NOT CHANGE THIS
% \usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
% \usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
% \DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
% \frenchspacing  % DO NOT CHANGE THIS
% \setlength{\pdfpagewidth}{8.5in}  % DO NOT CHANGE THIS
% \setlength{\pdfpageheight}{11in}  % DO NOT CHANGE THIS
% %% End AAAI-22 formatting





%%% Begin UAI-22 formatting
\documentclass{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
%                                     % version; also before submission to
%                                     % see how the non-anonymous paper
%                                     % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% Use the postscript times font!
\usepackage{times}
\usepackage{soul}
\usepackage{url}
\usepackage{hyperref}
    % \hypersetup{
    %     colorlinks=true,
    %     urlcolor=blue,
    %     urlbordercolor=blue,
    %     linkcolor=blue,
    %     linkbordercolor=blue,
    %     filecolor=magenta,
    %     pdfborderstyle={/S/U/W 1},
    % }
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{amsfonts}
\usepackage{booktabs}
% \usepackage{subfigure}
\usepackage{amssymb}
\urlstyle{same}
%%% End UAI-22 formatting

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM PACKAGES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{float}
\usepackage{xspace} % package being used for \newcommand to remove extra space
                    %     when a command is invoked without an argument list
\usepackage{textcase}
\usepackage[toc, nopostdot]{glossaries}
% \usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{enumitem}
\usepackage{refcount}
\usepackage[leftmargin=6pt, vskip=3pt-\parskip]{quoting}
\usepackage[titlenumbered,ruled, linesnumbered]{algorithm2e}
\usepackage{mathrsfs} %for \mathscr
% \usepackage[font=small,labelfont=bf]{caption}
% \usepackage[font=small,labelfont=bf]{subcaption}
\usepackage[labelfont=bf]{caption}
\usepackage[labelfont=bf]{subcaption}
\usepackage{xcolor}
    \definecolor{darkgreen}{rgb}{0.0, 0.2, 0.13}
    \definecolor{cadmiumgreen}{rgb}{0.0, 0.42, 0.24}
    \definecolor{byzantium}{rgb}{0.44, 0.16, 0.39}
    \definecolor{darkelectricblue}{rgb}{0.33, 0.41, 0.47}
    \definecolor{battleshipgrey}{rgb}{0.52, 0.52, 0.51}
    \definecolor{warmblack}{rgb}{0.0, 0.26, 0.26}
\usepackage{newfloat}
\usepackage{chngcntr}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CUSTOM COMMANDS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Declare a new float environment `plotfigure` (via newfloat); its captions are
% labelled "Plot" and it gets its own counter.
\DeclareFloatingEnvironment[name=Plot]{plotfigure} 

% Declare a new float environment `tablefigure` whose captions are labelled
% "Table".  Its counter and float type are aliased to those of `table` below.
\DeclareFloatingEnvironment[name=Table]{tablefigure} 

% Make `tablefigure` share the counter of `table`, so both kinds of float are
% numbered in a single "Table N" sequence.
\makeatletter\let\c@tablefigure\c@table\makeatother 

% Give `tablefigure` the same float type as `table`, so LaTeX treats both as one
% set of floats and keeps them in the order in which they appear in the source.
\makeatletter\let\ftype@tablefigure\ftype@table\makeatother 

% Upright operator names; the starred form places sub/superscripts as limits
% (underneath the operator) in display style, as \min and \max do.
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
% "Math hyphen": an ordinary math symbol taken from character slot "2D (hex) of
% math family 0, for use in hyphenated identifiers inside math mode.
\mathchardef\mhyphen="2D

% algorithm2e configuration.
% NOTE(review): the definition of \commentstyle below is commented out, so this
% file does not itself define the macro named in \SetCommentSty.  Presumably it
% is supplied by cmds.tex (input further down) -- TODO confirm; otherwise
% algorithm comments are silently left unstyled.
% \newcommand\commentstyle[1]{\textcolor{cadmiumgreen}{#1}}
% Typeset algorithm comments with the macro \commentstyle.
\SetCommentSty{commentstyle}
% Declare the \Input{...} and \Output{...} header fields, labelled "input" and
% "output" respectively.
\SetKwInOut{Input}{input}
\SetKwInOut{Output}{output}

% Theorem style "break": bold head on its own line, italic body underneath.
% \newtheoremstyle takes the style name plus eight positional arguments,
% annotated pairwise below.
\newtheoremstyle{break}
  {\topsep}{\topsep}% space above, space below
  {\itshape}{}% body font, indent amount (empty)
  {\bfseries}{}% head font, punctuation after head (empty)
  {\newline}{}% space after head (here: a line break), head spec (empty)
\theoremstyle{break}
% theorem, corollary, lemma and proposition share one counter, which is
% numbered within (and reset by) each subsubsection.
\newtheorem{theorem}{Theorem}[subsubsection]
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
% Definitions use a separate counter, also numbered within each subsubsection.
\newtheorem{definition}{Definition}[subsubsection]

% Read cmds.tex.  Its contents are not visible here; presumably it defines the
% project macros used in the body (e.g. \todo, \bobak, \bs) -- TODO confirm.
\input{cmds}
% Set the text of glossary links in bold.
\renewcommand*{\glstextformat}{\textbf}

% Redefine \quote (the opening code of the `quote` environment) as a bare list
% whose right margin equals its left margin and whose \topsep is zero, i.e. an
% indented block with no extra vertical space above or below it.
\renewcommand{\quote}{\list{}{\rightmargin=\leftmargin\topsep=0pt}\item\relax}







%%% for supplemental

% Allow `itemize` lists to nest up to nine levels deep: level 1 is marked with
% a bullet and levels 2-9 with a centered dot.  \renewlist then re-declares
% `itemize` with a maximum depth of 9.
% NOTE(review): enumitem is already loaded earlier in this preamble, so the
% \usepackage below is redundant (loading it twice without options is harmless).
\usepackage{enumitem}
    \setlistdepth{9}
    \setlist[itemize,1]{label=$\bullet$}
    \setlist[itemize,2]{label=$\cdot$}
    \setlist[itemize,3]{label=$\cdot$}
    \setlist[itemize,4]{label=$\cdot$}
    \setlist[itemize,5]{label=$\cdot$}
    \setlist[itemize,6]{label=$\cdot$}
    \setlist[itemize,7]{label=$\cdot$}
    \setlist[itemize,8]{label=$\cdot$}
    \setlist[itemize,9]{label=$\cdot$}
    \renewlist{itemize}{itemize}{9}






%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Number sectioning units down to level 3 and list the same depth in the table
% of contents.  Level 3 is presumably \subsubsection, as in the standard article
% class -- TODO confirm for uai2023.  The theorem counters in this file are
% numbered within the subsubsection counter, so lowering secnumdepth would
% affect theorem numbering as well.
\setcounter{secnumdepth}{3}
\setcounter{tocdepth}{3}

\title{Abstraction Sampling Meeting Updates}


% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% One \author entry per author; the optional argument is the affiliation index
% matching an \affil[...] entry below.  Each name links to a mailto: URL with a
% pre-filled subject line.
% Fix: the addresses for Kalev Kask and Alexander Ihler were swapped
% (ihler@... was attached to Kask and kkask@... to Ihler).
\author[1]{\href{mailto:<pezeshkb@uci.edu>?Subject=Abstraction Sampling}{Bobak Pezeshki}{}}
% \author[2]{\href{mailto:<radu.marinescu@ie.ibm.com>?Subject=Abstraction Sampling}{Radu Marinescu}{}}
\author[1]{\href{mailto:<kkask@uci.edu>?Subject=Abstraction Sampling}{Kalev Kask}{}}
\author[1]{\href{mailto:<ihler@ics.uci.edu>?Subject=Abstraction Sampling}{Alexander Ihler}{}}
\author[1]{\href{mailto:<dechter@ics.uci.edu>?Subject=Abstraction Sampling}{Rina Dechter}{}}
% Add affiliations after the authors
\affil[1]{%
    University of California, Irvine
}
% \affil[2]{%
%     IBM Research
% }


\input{gls}

\begin{document}
    % \onecolumn
    \setlength{\abovedisplayskip}{3pt}
    \setlength{\belowdisplayskip}{3pt}

    \maketitle
    
    \begin{abstract}
        Monte Carlo methods have proven to be powerful tools for solving a wide range of computational problems, including those involving complex probability distributions. Despite their versatility, these methods often suffer from computational inefficiencies, especially when dealing with rare events. As such, importance sampling emerged as a prominent technique for alleviating these challenges. Recently, a new scheme called Abstraction Sampling was developed that incorporates stratification into importance sampling, helping to improve estimates further. Nevertheless, work on Abstraction Sampling to date has explored only a handful of abstraction functions that guide the stratification.\\
        
        This work expands the set of general abstraction functions for AND/OR Abstraction Sampling by introducing three new classes of abstraction functions, one based on using heuristic estimates of node values, another based on heuristic estimates combined with side-branch estimates, and the last using the un-normalized abstraction sampling proposal. Each of these new classes can be augmented by seven distinct schemes resulting in a total of twenty-one new abstraction functions. We provide a thorough review of the schemes, explain their motivation as backed by theory and intuition, and provide an extensive empirical analysis on their performance. We also expand general understanding and analysis of Abstraction Sampling by providing additional theoretical properties and analysis, many of which can be instrumental for development of more sophisticated abstraction functions and Abstraction Sampling schemes in the future.
    \end{abstract}

    % \begin{abstract}
    %     \noindent This supplemental material is purposed to provide readers with extended background on topics that are foundational to \bobak{add ref to this paper}, provide deeper explanations for the presented topics, as well as to provide extended results.  As this document is lengthy, we suggest using the table of contents to help guide you to section(s) of the document relevant to your needs. The most up-to-date version of this supplemental as well as other supplemental materials can be found on the \href{https://www.ics.uci.edu/~dechter/publications.html}{Dechter Lab publications page}.
    % \end{abstract}

    \vfill\eject
    \tableofcontents
  



% \clearpage

%     \setglossarystyle{altlist}
    
% \clearpage
%     \printnoidxglossary[title=Glossary, toctitle=i.\ \ \ \  Glossary]
    
% \clearpage
%     \printnoidxglossary[type=abrv, title=Abbreviations, toctitle=ii.\ \ \  Abbreviations]
    
% \clearpage
%     \printnoidxglossary[type=nt, sort=def, title=Notation, toctitle=iii.\ \  Notation]


    % \clearpage
    % \section{Possible ideas}
    
    %     \begin{itemize}
    %         \item new abs fxns more consistent in terms of effective granularity (less pruning)
    %         \item analysis: probe sizes
    %     \end{itemize}

    %     \begin{theorem}[Exact Abstraction]
    %         When the abstraction function satisfies that $a(n) = a(n') \implies Z(n) = Z(n')$, then $\hat{Z}$ is exact (ie. $\hat{Z} = Z$) with one probe if h satisfies $a(n) = a(n') \implies h(n) = h(n')$.
    %     \end{theorem}

    %     \todo{caveat: does not apply to AND/OR}
    %     \todo{caveat: depends on proposal (no need to state)}
    
    \clearpage
    \section{Introduction} \label{sec:introduction}


        
    \section{General Background} \label{sec:background}



    \section{Abstraction Sampling}\label{sec:abstraction-sampling}
    
        \subsection{General Algorithm} \label{sec:abstraction-sampling:general-algorithm}
        
        \subsection{Existing Abstraction Functions} \label{sec:abstraction-sampling:existing-abstraction-functions}



    \section{Paradigms Intuiting Abstraction Strategies} \label{sec:paradigms}

        \subsection{Search Paradigms} \label{sec:paradigms:search}

            \cite{DBLP:conf/uai/BrokaDIK18} made the observation that in [tree] search, one can merge nodes that have the same value to produce a more efficient graph search, and intuited the potential benefits of abstractions that group nodes with similar subtree values.

            Abstraction functions by \cite{DBLP:conf/uai/BrokaDIK18} focused on this paradigm and approached it by using the concept of a node's context---the assignments to the smallest subset of a node's ancestor variables that dictates its value.  Due to the potentially large context size for variables, and consequently the exponentially high number of combinations of assignments to the context, the full context of variables could not be used in most cases.  \cite{DBLP:conf/uai/BrokaDIK18} resolved this by creating two context-based abstraction functions that were relaxed to allow nodes with different contexts to be grouped in the same abstract state.  A key observation we make is that, in the general setting, sharing the same partial context does not necessarily imply the same, nor even similar, node values.

            Our new heuristic-based abstraction functions hope to address this as will be presented in Section \ref{sec:value-based-abstraction-classes}.


        \subsection{Sampling Paradigms} \label{sec:paradigms:sampling}

            From \cite{DBLP:conf/uai/BrokaDIK18} we learn that
            \begin{theorem}[Exact Proposal] \label{thm:old-exact-proposal}
                If the proposal function $p$ in AS uses an exact heuristic $h(n)=Z(n)$, then $\hat{Z}$ has zero variance (single probe is exact), for any abstraction.
            \end{theorem}

            % However, based on insight from importance sampling, Theorem \ref{thm:old-exact-proposal} can be relaxed based on a notion of proportionality:

            % \begin{definition}[Heuristic Proportionality Value of a Node]
            %     Let the \gls{gls:heuristic-proportionality-value-of-a-node} $n$ with a heuristic value of $h(n)$ and exact subproblem value of $v^{*}(n)$ be defined as
            %     \begin{align}
            %         \hpropn{h}{n} = \frac{h(n)}{v^{*}(n)}
            %     \end{align}
            % \end{definition}

            % and we can subsequently say:

            However, with $p(x)$ as the proposal distribution and $p^{*}(x)$ the true distribution over $X$, and $f(x)$ a non-zero value function associated with $X$, it is well known that importance sampling achieves zero variance when 1) $p(x)=0 \implies p^{*}(x)=0$, and 2) otherwise $p(x)$ is proportional to $p^{*}(x)f(x)$.
            \todo{\href{https://artowen.su.domains/mc/Ch-var-is.pdf\#page=5}{where's the primary source?}}.  
            
            \begin{lemma}[Importance Sampling Exact Proposal Based on Proportionality with Target Distribution]
                Given $p^{*}(x)$ as the true distribution over $X$, and a value function $f(x)$, importance sampling achieves zero variance when using a proposal function $p(x)$ such that $p(x)=0 \implies p^{*}(x)f(x)=0$ and $p(x) \propto p^{*}(x)f(x)$, otherwise.
                \bobak{do i need to explicitly talk about the pathological case where $\forall x \in X, p^{*}(x)f(x) = 0$?}
            \end{lemma}
            
            Note that in abstraction sampling each abstract state involves a node selection procedure analogous to importance sampling.  With the use case of computing the partition function given the previously described proposal, and assuming $h(n)=0 \implies Z(n)=0$, it naturally follows that by designing each abstract state $\bs{A_{i}}$ such that $\forall n \in \bs{A_{i}}, p(n) = k\, p^{*}(n)$, for some constant $k$ (we omit $f(x)$ as $f(x)=1$ for the task of computing the partition function), we similarly achieve zero variance.  Thus, we generalize Theorem \ref{thm:old-exact-proposal} to

            \begin{theorem}[ORAS Exact Abstractions from $h(n)$ vs. $Z(n)$ Proportionality] \label{thm:oras-proportionality-exact-proposal}
                An abstraction function $a(n)$ has $h(n)$ vs. $Z(n)$ proportionality if, for every abstract state $A_i$ formed by $a(n)$, $\forall n \in A_i, h(n) = k \, Z(n)$, for some constant $k$ specific to $A_i$.  In such a case, $a$ is exact for ORAS with ORAS estimates having zero variance and $\hat{Z} = Z$ for each probe. 
            \end{theorem}

            \begin{proof}
                 We know that if we were to use exhaustive search, we would arrive at the true $Z$ value.  We use a proof by induction that assumes that after each abstraction step we will compute the rest of the probe exactly using exhaustive search.  Thus, if abstractions are performed layer by layer down from the root, after each abstraction we know that $Z(n')$ will be computed exactly for the selected node $n'$.
                 
                 We denote the estimate that would be generated by a probe constructed after $t$ time steps as $\hat{Z}^{(t)}(PROBE)$.  (As we will describe, each time step will correspond to an abstraction step).  As a base case, $\hat{Z}^{(t=0)}(PROBE) = Z$ since all values will be computed exactly via exhaustive search.  In the inductive step, we will show that after each time step $t$, if instead of using exhaustive search immediately, we first perform an abstraction on the current level of the probe, the resulting estimate of the newly abstracted probe $\hat{Z}^{(t+1)}(PROBE)$ will remain unchanged.  Namely, we will show that
                 \[
                    \hat{Z}^{(t)}(PROBE) - \hat{Z}^{(t+1)}(PROBE) = 0
                 \]
                 This shows that the abstractions maintain exactness of the probe's estimate.
                 
                 Starting from the left hand side
                 \[
                    LHS = \hat{Z}^{(t)}(PROBE) - \hat{Z}^{(t+1)}(PROBE)
                \]
                
                We note the difference in the overall probe estimates during abstraction sampling is due to the change in the probe estimate that results from each individual abstraction step (namely selection and reweighting of a representative node $n'$ from an abstract state $A_i$).  Thus for our time steps, we will focus on the difference in value resulting from a single arbitrary abstraction step.
                \[
                    = \sum_{n \in A_i} w^{(t)}(n)   g(n) Z(n) - w^{(t+1)}(n') g(n') Z(n') 
                \]
                Above, the left term shows the contribution to the partition function due to nodes of abstract state $A_i$ (still assuming we will perform exhaustive search below each one), and the right term is the contribution of a selected node $n'$ after abstraction (note the adjustment to the selected node's weight).
                
                Using the fact that $w^{(t+1)}(n') = \frac{w^{(t)}(n')}{p(n')}$ (from the importance weight modification), we now get
                \[
                    = \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) - \frac{w^{(t)}(n')}{p(n')} g(n') Z(n')
                \]
                (Note that $p(n')$ cannot be zero, otherwise $n'$ would not have been selected).
                
                Noting that $p(n') = \frac{w^{(t)}(n') g(n') h(n')}{\sum_{n \in A_i}  w^{(t)}(n) g(n) h(n)}$ and substituting, we get
                \begin{align*}
                    = \sum_{n \in A_i}&  w^{(t)}(n) g(n) Z(n) \\
                    &- w^{(t)}(n') g(n') Z(n')\frac{\sum_{n \in A_i}  w^{(t)}(n) g(n) h(n)}{w^{(t)}(n') g(n') h(n')}
                \end{align*}
                \[
                    = \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) - \frac{Z(n')}{h(n')}  \sum_{n \in A_i}  w^{(t)}(n) g(n) h(n)
                \]
                
                Now, per our assumption, $\forall n \in A_i$, let $h(n) = k \, Z(n)$, where $k$ is the proportionality constant by which $h(n)$ differs from $Z(n)$.  Then
                \[
                    = \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) - \frac{Z(n')}{k \, Z(n')}  \sum_{n \in A_i}  w^{(t)}(n) g(n)\,k\,Z(n)
                \]
                \[
                    = \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) - \frac{k}{k}  \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n)
                \]
                \[
                    = \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) - \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n)
                \]
                \[
                    = 0 = RHS \qedhere
                \]
            \end{proof}
            
            \begin{theorem}[AOAS Exact Abstractions from $h(n)r(n)$ vs. $Z(n)R(n)$ Proportionality] \label{thm:aoas-proportionality-exact-proposal}
                An abstraction function $a(n)$ has $h(n)r(n)$ vs. $Z(n)R(n)$ proportionality if, for every abstract state $A_i$ formed by $a(n)$, $\forall n \in A_i, h(n)r(n) = k \, Z(n)R(n)$, for some constant $k$ specific to $A_i$.  In such a case, $a$ is exact for AOAS with AOAS estimates having zero variance and $\hat{Z} = Z$ for each probe.
            \end{theorem}

            \begin{proof}
                 We know that if we were to use exhaustive search, we would arrive at the true $Z$ value.  We use a proof by induction that assumes that after each abstraction step we will compute the rest of the probe exactly using exhaustive search.  Thus, if abstractions are performed layer by layer down from the root, after each abstraction we know that $Z(n')$ will be computed exactly for the selected node $n'$.  We also assume that $R(n)$ for every node will be computed exactly.  This assumption holds true before we perform any abstractions (as everything is computed exactly via exhaustive search) and continues to hold if we can show that, after each abstraction step, the resulting estimate remains unchanged (and thus remains exact).
                 
                 We denote the estimate that would be generated by a probe constructed after $t$ time steps as $\hat{Z}^{(t)}(PROBE)$.  (As we will describe, each time step will correspond to an abstraction step).  As a base case, $\hat{Z}^{(t=0)}(PROBE) = Z$ since all values will be computed exactly via exhaustive search.  In the inductive step, we will show that after each time step $t$, if instead of using exhaustive search immediately, we first perform an abstraction on the current level of the probe, the resulting estimate of the newly abstracted probe $\hat{Z}^{(t+1)}(PROBE)$ will remain unchanged.  Namely, we will show that
                 \[
                    \hat{Z}^{(t)}(PROBE) - \hat{Z}^{(t+1)}(PROBE) = 0
                 \]
                 This shows that the abstractions maintain exactness of the probe's estimate.
                 
                 Starting from the left hand side
                 \[
                    LHS = \hat{Z}^{(t)}(PROBE) - \hat{Z}^{(t+1)}(PROBE)
                \]
                
                We note the difference in the overall probe estimates during abstraction sampling is due to the change in the probe estimate that results from each individual abstraction step (namely due to the selection and reweighting of a representative node $n'$ from an abstract state $A_i$).  Thus for our time steps, we will focus on the difference in value resulting from a single arbitrary abstraction step.
                \[
                    = \sum_{n \in A_i} w^{(t)}(n) g(n) Z(n) R(n) - w^{(t+1)}(n') g(n') Z(n') R(n')
                \]
                Above, the left term shows the contribution to the partition function due to nodes of abstract state $A_i$ (still assuming we will perform exhaustive search below each one), and the right term is the contribution of a selected node $n'$ after abstraction (note the adjustment to the selected node's weight).
                
                Using the fact that $w^{(t+1)}(n') = \frac{w^{(t)}(n')}{p(n')}$ (from the importance weight modification), we now get
                \[
                    = \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n) - \frac{w^{(t)}(n')}{p(n')} g(n') Z(n') R(n')
                \]
                (Note that $p(n')$ cannot be zero, otherwise $n'$ would not have been selected).
                
                Noting that $p(n') = \frac{w^{(t)}(n') g(n') h(n') r(n')}{\sum_{n \in A_i}  w^{(t)}(n) g(n) h(n) r(n)}$ and substituting, we get
                \begin{flalign*}
                    \hspace{4pt}
                    = &\sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n) \\
                    %
                    &- w^{(t)}(n') g(n') Z(n') R(n')
                    \frac{\sum_{n \in A_i}  w^{(t)}(n) g(n) h(n) r(n)}{w^{(t)}(n') g(n') h(n') r(n')}\\
                    %
                    = &\sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n)\\
                    %
                    &- \frac{Z(n')R(n')}{h(n')r(n')}  \sum_{n \in A_i}  w^{(t)}(n) g(n) h(n) r(n)
                \end{flalign*}
                
                Now, per our assumption, $\forall n \in A_i$, let $h(n)r(n) = k \, Z(n)R(n)$, where $k$ is the proportionality constant by which $h(n)r(n)$ differs from $Z(n)R(n)$.  Then
                \begin{flalign*}
                    = &\sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n) \\
                    %
                    &- \frac{Z(n')R(n')}{k \, Z(n')R(n')}  \sum_{n \in A_i}  w^{(t)}(n) g(n)\,k\,Z(n)R(n)\\
                    %
                    = &\sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n) \\
                    %
                    &- \frac{k}{k}  \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n) \\
                    % 
                    = &\sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n) - \sum_{n \in A_i}  w^{(t)}(n) g(n) Z(n) R(n)
                \end{flalign*}
                \vspace{4pt}
                \[
                    = 0 = RHS \qedhere
                \]
            \end{proof}


            Of course being able to determine $\frac{h(n)r(n)}{Z(n)R(n)}$, or even whether nodes have the same $\frac{h(n)r(n)}{Z(n)R(n)}$, is hard.  However one idea is to use the magnitude of $h(n)r(n)$ itself as a heuristic for similarities in $k$.  This drives the intuition for the new hR-based abstractions that will be described in Section \ref{sec:value-based-abstraction-classes}.

            Also from the sampling perspective, \cite{rizzo_2007} showed that when stratifying into equal area strata under the proposal target value distribution,
    
            \begin{proposition}[Stratified Importance Sampling Variance Reduction] \label{prop:rizzo-variance-reduction}
                Suppose that $M = mk$ is the number of replicates for an importance sampling estimator $\hat{\theta^{I}}$, and $\hat{\theta^{SI}}$ is a stratified importance sampling estimator, with estimates $\hat{\theta_{j}}$ for $\theta_{j}$ on the individual strata, each with $m$ replicates.  If $\operatorname{Var}(\hat{\theta^{I}}) = \sigma^{2} / M$ and $\operatorname{Var}(\hat{\theta_{j}}) = \sigma^{2}_{j} / m$, $j = 1, \dots, k$, then
                \begin{align}
                    \sigma^{2} - k \sum^{k}_{j=1} \sigma^{2}_{j} \geq 0,
                \end{align}
                with equality if and only if $\theta_{1}=\dots=\theta_{k}$.  Hence stratification never increases variance, and there exists a stratification that reduces the variance except when [the proposal function] $g(x)$ is constant.
            \end{proposition}
    
            Two takeaways from this proposition are that 1) we can achieve variance reduction with respect to importance sampling (analogous to abstraction sampling with all nodes placed into a single abstract state) by stratifying into equal area strata under the proposal, and 2) reducing the variance of each stratum $\sigma^{2}_{j}$ leads to greater variance reduction.  These help drive the intuition for the new Proposal-Based abstraction class presented in Section \ref{sec:value-based-abstraction-classes}, as well as motivate several of the abstraction schemes presented in Section \todo{reference section}.


        \subsection{Combined Paradigms}\label{sec:paradigms:combined}

            With a helpful heuristic, in both the search and sampling domains there are notions of potential benefit by spending more time in optimistic areas of the search/sampling space.  In heuristic search, this corresponds to associating a value with each node defining the belief in that node's worth, and then, among the frontier nodes being considered for expansion, proceeding in an order that prioritizes nodes with high values first.  \todo{And similarly in sampling, among many schemes it is beneficial to spend the most effort sampling high impact events}. We adapt these perspectives to drive strategies for forming abstract states, giving rise to seven new abstraction schemes that will be described in Section \ref{sec:ordered-partitioning-schemes}.


    \section{Value-Based Abstraction Classes} \label{sec:value-based-abstraction-classes}

        We introduce three new classes of abstraction functions that each define a unique notion of similarity between nodes based on value measurements on a positive scale.  These measurement values, which we refer to as \textbf{\textit{abstraction values}}, are used as a measure of similarity to abstract nodes together.  The three classes we present are the Heuristic-Based (HB), hr-Based (HRB), and Q-Based (QB) abstraction classes.  Each is motivated by theory in search or sampling (as alluded to above; Section \ref{sec:paradigms}) and each can be used with various abstraction schemes that will be presented in the subsequent section (Section \ref{sec:ordered-partitioning-schemes}).

        

        % We begin by reviewing existing paradigms for designing good abstraction functions, discuss an additional paradigm, then introduce the two new classes of abstraction functions and their motivation, and finally general algorithms that define the variants for each, including their motivation. 

    
        \subsection{Heuristic-Based Abstractions} \label{sec:value-based-abstraction-classes:HB}
        
            \begin{quote}
                $v(n) = h(n)$
            \end{quote}
            
            Using the motivation of abstracting nodes with similar subtree $Z(n)$ intuited from the search domain, we propose associating an abstraction value to each node based on the heuristic estimate $h(n)$ of a node's $Z(n)$.  Unlike the use of partial (or hashed) contexts as was used by \cite{DBLP:conf/uai/BrokaDIK18}, heuristic estimates of $Z(n)$ can often provide \textit{quantitative} insight into potential similarities in $Z(n)$ values, and this is particularly true when using wMBE heuristics, which provide bounds.

            In conjunction with the schemes that will be presented in Section \ref{sec:ordered-partitioning-schemes}, HB algorithms aim to form abstractions such that nodes with similar $Z(n)$ are grouped together.








        \subsection{HR-Based Abstractions} \label{sec:value-based-abstraction-classes:HRB}

            \begin{quote}
                $v(n) = h(n)r(n)$
            \end{quote}






        \subsection{Q-Based Abstractions} \label{sec:value-based-abstraction-classes:QB}

            \begin{quote}
                $v(n) = q(n) = w(n)g(n)h(n)r(n)$
            \end{quote}

            On the other hand, \cite{rizzo_2007} showed the potential of stratification when partitioning based on the proposal.  Furthermore, 
            

                








            
    
    
    
    
    
        % \subsection{Proposal Based Abstractions} \label{sec:q-based-abstractions}
        %     As a main algorithmic contribution of this work, we introduce a new class of abstractions based on proposal estimates.  We will motivate this new class of abstractions with theory, and then describe three schemes that were developed as a result.
            
        %     \subsubsection{Motivation} \label{sec:q-based-abstractions:motivation}
    
                
    
                
                 
        %     \subsubsection{\NoCaseChange{simpleQB}} \label{sec:q-based-abstractions:SimpleQB}
    
    
                
    
        %     \subsubsection{\NoCaseChange{simpleQB}} \label{sec:q-based-abstractions:SimpleQB}
    
    
    
    
        %         \begin{itemize}
        %             \item
        %                 \textbf{randQB}: nodes are ordered by $q$ and then partitioned into $nAbs$ abstract states of random sizes
    
        %             \item
        %                 \textbf{minVarQB}: nodes are partitioned into $nAbs$ abstract states using Ward's Minimum Variance hierarchical clustering
        %             \item
        %                 \textbf{equalDistQB}: nodes are ordered by $q$ (from \emph{low $q$ to high $q$}) and placed into abstract states based on $nAbs$ quantiles (with respect to the total $q$ of nodes already assigned abstract states).  If a quantile has not been reached or surpassed, the next node in the ordering is added into the current abstract state.  If the current quantile is surpassed, the next abstract state and quantile are instead considered.
        %             \item
        %                 \textbf{equalDistQB2}: nodes are ordered by $q$ (from \emph{high $q$ to low $q$}) and placed into abstract states based on $nAbs$ quantiles (with respect to the total $q$ of nodes already assigned abstract states).  If a quantile has not been reached or surpassed, the next node in the ordering is added into the current abstract state.  If the current quantile is surpassed, the next abstract state and quantile are instead considered.
        %             \item
        %                 \textbf{equalDistQB3}: nodes are ordered by $q$ (from \emph{high $q$ to low $q$}) and placed into abstract states based on $nAbs$ quantiles (with respect to the total $q$ of nodes already assigned abstract states) with the caveat that at least one node is added to each abstract state.  If a quantile has not been reached or surpassed, the next node in the ordering is added into the current abstract state.  If the current quantile is surpassed, the next abstract state and quantile are instead considered and the next node in the ordering added to that abstract state by default.
        %             \item
        %                 \textbf{equalDistQB4}: nodes are ordered by $q$ (from \emph{high $q$ to low $q$}) and placed into the current abstract state until the accumulation on $q$ of the added nodes is equal to or exceeds the $1/nAbs_{remaining}$ quantile considering the total $q$ of nodes in the current abstract state and nodes yet to be assigned.
        %         \end{itemize}
            
        %     \subsubsection{MinVarQB} \label{sec:q-based-abstractions:MinVarQB}
        %     \subsubsection{EqualDistQB} \label{sec:q-based-abstractions:EqualDistQB}
        %         From the performance of the previous two abstraction functions an their analysis as seen in Section \ref{sec:empirical-evaluation}, it was observed that the more computationally intensive MinVarQB abstraction function was producing probes with better estimates where as SimpleQB was producing better overall estimates (presumably due to its speed enabling many more probes).  Thus, a third heuristic based abstraction, EqualDistQB, was created inspired by the simplicity and speed of SimpleQB and, in a greedy way, also attempting to minimize variance of the heuristic values of the probes in the resulting abstract states.
    
        %         EqualDistQB works by 
                
        %         The variance reduction proven for stratified importance sampling by \cite{rizzo_2007} assumed that each stratified layer had an equal mass under the proposal distribution \todo{double check that it is the proposal and not true}.  Thus, to approach emulation of such a scenario at each level of abstraction, we can sort nodes
    
    
    
    
    \section{Ordered Value-Based Abstraction Functions} \label{sec:ordered-value-based-abstraction-functions}

        We first define a new class of abstraction functions, Ordered Value-Based Abstraction Functions:

        \begin{definition}[Ordered Value-Based Abstraction Function]
            An \textit{ordered value-based abstraction function} is an abstraction function $a(.)$ that partitions a set of nodes $\bs{n}$ into at most $nAbs$ abstract states such that nodes remain sorted across all abstract states according to a provided value function.
            % \vspace{-6pt}
            % \begin{itemize}
            %     \item 
            %         takes as input: A set of nodes $\bs{n}$ to be partitioned into abstract states; an abstraction value function $v(.)$; a sorting algorithm $SORT(.)$ that sorts $\bs{n}$ according to $v(.)$ and sort order $o$; a parameter $nAbs$ bounding the number of abstract states; a partitioning function $\Psi(.)$ that partitions the sorted nodes into abstract states maintaining their order
            %     \item 
            %         outputs: Nodes $\bs{n}$ partitioned into abstract states $\bs{A} = \setst{\bs{A_{i}}}{i<=nAbs}$ such that sort order $o$ of $v(n)$ is maintained across all $\bs{A_{i}}$
            % \end{itemize}
        \end{definition}

        \begin{algorithm}[t]
            \caption{$a_{\tn{\textit{ordered value}}}$}
            \label{alg:general-ordered-value-based-abstraction-function}
            \begin{footnotesize}
                \SetInd{0.25em}{0.55em}
                \DontPrintSemicolon 
            \Input{A set of nodes $\bs{n}$ to be partitioned into abstract states; an abstraction value function $v(.)$; a sorting algorithm $SORT(.)$ that sorts $\bs{n}$ according to $v(.)$ and sort order $o$; a parameter $nAbs$ bounding the number of abstract states; a partitioning function $\Psi(.)$ that partitions the sorted nodes into abstract states maintaining their order}
            \Output{Nodes $\bs{n}$ partitioned into abstract states $\bs{A} = \setst{\bs{A_{i}}}{i \leq nAbs}$ such that sort order $o$ of $v(n)$ is maintained across all $\bs{A_{i}}$.}
            
            \Begin{
                \uIf{$|\bs{n}| \leq nAbs$}{
                    $\bs{A} = \setst{\set{n}}{n \in \bs{n}}$\\
                }
                % \uIf{$|\bs{n}| <= m$}{
                %     \tcp{Each node is its own abstract state}
                %     $\bs{A} = \set{}$\\
                %     $nAbs' \leftarrow |\bs{n}|$\\
                %     \ForEach{$i \in \set{1,...,nAbs'}$}{
                %         $\bs{A_{i}} = \set{n_{i}}$\\
                %         $\bs{A} \leftarrow \bs{A} \cup \set{\bs{A_{i}}}$
                %     }
                % }
                \uElse{
                    $\bs{n^{*}} \leftarrow SORT(\bs{n},v,o)$\\
                    $\bs{A} = \Psi(\bs{n^{*}}, v)$
                }
                \Return $\bs{A}$       
            }
            \end{footnotesize}
        \end{algorithm}

        We provide an example of an ordered value-based abstraction function in Algorithm \ref{alg:general-ordered-value-based-abstraction-function}.
        
        \semph{Complexity of $a_{\tn{\textit{ordered value}}}$.}\hfill\\
            $\mathcal{O}(\; \mathcal{O}(SORT(\bs{n},v,o)) + \mathcal{O}(\Psi(\bs{n^{*}}, v)) \;)$
            
        Thus, assuming the value function $v(.)$ is not dominating, the complexity is either dominated by the sorting method's complexity or the partitioning complexity.

        Next we present seven ordered value partitioning schemes that, in conjunction with a provided $v$, can be used with Algorithm \ref{alg:general-ordered-value-based-abstraction-function} to define a unique ordered value-based abstraction function.


    \section{Ordered Partitioning Schemes} \label{sec:ordered-partitioning-schemes}

        We now present seven schemes, each defined by a unique sort order $o$ and partition strategy $\Psi$ combination.  Each scheme uses a different method to partition nodes into abstract states keeping the nodes in sort order according to $o$. With a provided value function $v(.)$, each scheme can be used to form an ordered value abstraction function.  In addition to defining each scheme, we also describe the motivation behind its creation.

        \paragraph{Running Example} \label{sec:ordered-partitioning-schemes:running-example}  As we motivate and describe the schemes, we will also provide an example of the abstract states that would result from partitioning the nodes:
        \begin{align} \label{eq:running-partitioning-example}
            \set{1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100}
        \end{align}
        into $nAbs=4$ abstract states.
            
        \subsubsection{\NoCaseChange{simpleVB}} \label{sec:ordered-partitioning-schemes:simpleVB}
            \begin{quote}
                $o = \tn{low to high}$\\
                $\Part{simpleVB}$ (Algorithm \ref{alg:psi-simpleVB})
            \end{quote}
            
            \begin{algorithm}[t!]
                \caption{$\Part{simpleVB}$}
                \label{alg:psi-simpleVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $v(.)$}
                \Output{$\bs{n^{*}}$ partitioned into abstract states\footnotemark{} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that $\forall \bs{A_{i}},\bs{A_{j}} \in \bs{A}, |\bs{A_{i}}|-|\bs{A_{j}}| \leq 1$}
                
                \Begin{
                    $baseCardinality \leftarrow \floor{\frac{|\bs{n^{*}}|}{nAbs}}$\\
                    $extras \leftarrow |\bs{n^{*}}| \bmod nAbs$\\
                    $j_{begin}=1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        \uIf{$extras > 0$}{
                            $j_{end} \leftarrow j_{begin} + baseCardinality$\\
                            $extras \leftarrow extras - 1$
                        }
                        \uElse{
                            $j_{end} \leftarrow j_{begin} + baseCardinality - 1$
                        }
                        $\bs{A_{i}} = \set{n^{*}_{{j_{begin}}}, ..., n^{*}_{{j_{end}}}}$\\
                        $j_{begin} \leftarrow j_{end}+1$
                    }
                    $\bs{A} = \set{\bs{A_{1}},...,\bs{A_{nAbs}}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm} \footnotetext{\label{ftn:ordered-schemes-maintain-sort-order}Such that nodes maintain sort order $o$ across all abstract states.}

            The simpleVB (simple value-based) scheme follows the motivation of grouping nodes of similar value in the same abstract state by a simple 2-step method: 1) first, nodes are ordered by their abstraction value $v(n)$ (low to high), and 2) next the ordered nodes are partitioned into [approximately] equal cardinality abstract states.

            \semph{Time Complexity.}\hfill\\
                Partitioning is achieved via one pass through $|\bs{n^{*}}|$ leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{1.0, 1.1}, \set{1.2, 1.3}, \set{1.4, 1.5}, \set{10, 100}$
                
            Through its simplicity, this method aims to leverage speed, allowing abstractions to be formed much more quickly and leading to a greater number of samples.



        \subsubsection{\NoCaseChange{minVarVB}} \label{sec:ordered-partitioning-schemes:minVarVB}

            \begin{quote}
                $o = \tn{low to high}$\\
                $\Psi = \Part{minVarVB}$ (Algorithm \ref{alg:psi-minVarVB})
            \end{quote}
            
            \begin{algorithm}[t!]
                \caption{$\Part{minVarVB}$}
                \label{alg:psi-minVarVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $v(.)$}
                \Output{$\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ satisfying $\min \sum_{\bs{A_{i}} \in \bs{A}} Var(\bs{A_{i}}, v)$}
                
                \Begin{
                    $\bs{A} = WardsMethod(\bs{n^{*}}, nAbs, v, \tn{Euclidean distance})$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            As mentioned in Section \ref{sec:paradigms:sampling}, Proposition \ref{prop:rizzo-variance-reduction}, \cite{rizzo_2007} showed that in stratified importance sampling minimizing variance of the estimates within individual strata can lead to a reduction in overall variance.

            The minVarVB scheme was designed based on this intuition.  The scheme uses Ward's Minimum Variance Hierarchical Clustering (or Ward's Method, for short) \cite{ward1963} to group nodes into $nAbs$ abstract states so as to minimize variance within each abstract state with respect to the provided value function $v(.)$.

            Ward's Minimum Variance Hierarchical Clustering is an agglomerative hierarchical clustering algorithm designed to create a dendrogram by iteratively merging clusters. The primary objective is to minimize the total within-cluster variance. Ward's method works as outlined in Algorithm \ref{alg:wards-method}.
                        
            \begin{algorithm}[t!]
              \caption{Ward's Method}
                \label{alg:wards-method}
              \begin{enumerate}
                \item \textbf{Initialization:} Treat each data point as an individual cluster. Assign each cluster a label or identifier.
                
                \item \textbf{Compute Pairwise Distances:} Calculate the pairwise distances between all clusters. Various distance metrics can be used, such as Euclidean distance.
                
                \item \textbf{Cluster Merging Iteration:} 
                  \begin{enumerate}
                    \item Identify the pair of clusters $\bs{C_{i}}$ and $\bs{C_{j}}$ that, when merged into a new cluster $\bs{C_{ij}}$, results in the smallest increase in the overall within-cluster variance. This is determined using the formula:
                      \[ \Delta Var = Var(\bs{C_{ij}}) - (Var(\bs{C_{i}}) + Var(\bs{C_{j}})) \]
                      where \(Var(\bs{C_{ij}})\) is the variance of the merged cluster, and \(Var(\bs{C_{i}})\) and \(Var(\bs{C_{j}})\) are the variances of clusters $\bs{C_{i}}$ and $\bs{C_{j}}$, respectively.
                    \item Update distance measures between the newly merged cluster and all other clusters.
                  \end{enumerate}
                
                \item \textbf{Repeat:} Repeat steps 2-3 until the desired number of clusters is achieved.
              \end{enumerate}
            \end{algorithm}
                        
            The algorithm progresses to minimize within-cluster variance ensuring that the clusters formed are compact and internally as close as possible to being homogeneous with respect to the metric used.
            
            For the proposed value-based abstraction classes used in this work, node values are single dimensional, and thus we can use Ward's Method combined with Lance-Williams linear distance updates \todo{cite} to increase efficiency.
            
            Lance-Williams linear distance updates, in the context of Ward's Method, refer to the formula used to calculate the distance between clusters as they are merged during the hierarchical clustering process. The general form of Lance-Williams distance updates can be expressed as follows:
            \begin{align}
                d_{(ij)k} = \alpha_{i} d_{ik} + \alpha_{j} d_{jk} + \beta d_{ij} + \gamma |d_{ik} - d_{jk}|
            \end{align}
            where:
            \begin{itemize}
                \vspace{-6pt}
                \item [\tiny$\bullet$]
                    $d_{ij}$, $d_{ik}$, and $d_{jk}$ are the pair-wise distances between clusters $\bs{C_{i}}$, $\bs{C_{j}}$, and $\bs{C_{k}}$
                \item [\tiny$\bullet$]
                    $d_{(ij)k}$ is the distance between the newly merged cluster $\bs{C_{i}} \cup \bs{C_{j}}$ and cluster $\bs{C_{k}}$
                \item [\tiny$\bullet$]
                    $\alpha_i, \alpha_j, \beta, \text{ and } \gamma$ are coefficients that depend on the linkage criterion used
            \end{itemize}
            
            In the case of Ward's method, the coefficients are specific to the minimization of within-cluster variance and are calculated as follows:
            \begin{align}
            \begin{split}
                \alpha_i &= \frac{|\bs{C_{i}}| + |\bs{C_{k}}|}{|\bs{C_{i}}| + |\bs{C_{j}}| + |\bs{C_{k}}|} \\
                \alpha_j &= \frac{|\bs{C_{j}}| + |\bs{C_{k}}|}{|\bs{C_{i}}| + |\bs{C_{j}}| + |\bs{C_{k}}|} \\
                \beta &= -\frac{|\bs{C_{k}}|}{|\bs{C_{i}}| + |\bs{C_{j}}| + |\bs{C_{k}}|} \\
                \gamma &= 0
            \end{split}
            \end{align}
            (The inclusion of \(\gamma\) provides additional flexibility in the more general case, adjusting the distance updates based on the specific clustering criterion being used).

            \semph{Time Complexity.\footnote{\label{ftn:time-complexity-assumes-constant-time-v}Assuming v(n) is $\mathcal{O}(1)$ in both time and space.}}\hfill\\
                The choice of clusters to merge generally leads to an $\mathcal{O}(|\bs{n^{*}}|^{3})$ time complexity due to the need to compare pair-wise distances between all clusters at each iteration.  However, in the case where nodes are distributed linearly in one dimension, only neighboring distances need to be considered at each iteration, and selecting the next merge can be made efficient by use of a priority queue.  Nevertheless, since the Lance-Williams distance updates themselves take linear time once per iteration, the reduced time complexity is still $\mathcal{O}(|\bs{n^{*}}|^{2})$.
                
            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                The space complexity is implementation dependent, with most time-efficient variants making use of a distance matrix leading to $\mathcal{O}(|\bs{n^{*}}|^{2})$ space complexity.

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{1.0, 1.1, 1.2}, \set{1.3, 1.4, 1.5}, \set{10}, \set{100}$

            In contrast to simpleVB, minVarVB places considerable resources into computing abstractions, leading to fewer samples, but with potentially better estimates with an appropriate value function $v(.)$. 



        \subsubsection{\NoCaseChange{equalDistVB}} \label{sec:ordered-partitioning-schemes:equalDistVB}

            \begin{quote}
                $o = \tn{low to high}$\\
                $\Part{equalDistVB}$ (Algorithm \ref{alg:psi-equalDistVB})
            \end{quote}
            
            \begin{algorithm}[t!]
                \caption{$\Part{equalDistVB}$}
                \label{alg:psi-equalDistVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $v(.)$}
                \Output{
				With 
				%
				$v(\bs{A_{1,...,i}}) = \sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} v(n')$,
				%
				$n_{\bs{A_{i}}}^{\tn{last}}$ being the last node in $\bs{A_{i}}$, 
				%
				and 
				%
				$P_{i} = \frac{ i \cdot \sum_{n \in \bs{n^{*}}}v(n)}{nAbs}$,
				%
                $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
                $(\; v(\bs{A_{1,...,i}}) \geq P_{i} \;)$
                $\land$ \\ $(\; (\, \bs{A_{i}}=\set{} \,) \lor (\, v(\bs{A_{1,...,i}}) - v(n_{\bs{A_{i}}}^{\tn{last}}) < P_{i} \,) \;)$ }
                
                \Begin{
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        $\bs{A_{i}} = \set{}$\\
                        \While{$v(\bs{A_{1,...,i}}) < P_{i}$}{
                            $\bs{A_{i}} \leftarrow \bs{A_{i}} \cup \set{n^{*}_{{j}}}$\\
                            $j \leftarrow j + 1$
                        }
                    }
                    $\bs{A} = \set{\bs{A_{1}},...,\bs{A_{nAbs}}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            As discussed in Section \ref{sec:paradigms:combined}, there is intuition in wanting to focus on high impact regions of the search/sampling space.  Allowing the provided value function $v(.)$ to serve as a heuristic of nodes that are part of these high impact spaces, equalDistVB attempts to balance this intuition with the notion of variance reduction from minVarVB in attempts to group fewer predicted high impact nodes together in abstract states and allowing for the predicted lower impact nodes to be part of larger abstract states.  Also inspired by the simplicity of simpleVB, the scheme works by greedily adding nodes in value order (low to high) into abstract state $\bs{A_{i}}$ until the total sum of node values from $\bs{A_{1}},...,\bs{A_{i}}$ reaches or exceeds the $\frac{i}{nAbs}$ quantile.
            
            When paired with the QB abstraction class (see Section \ref{sec:value-based-abstraction-classes:QB}), the equalDistVB scheme also attempts to partition nodes into abstract states of equal mass under the proposal.  This corresponds to the condition of Proposition \ref{prop:rizzo-variance-reduction} for stratified importance sampling variance reduction.

            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $v(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n^{*}}|$ leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 10, 100}, \set{}, \set{}, \set{}$
                
            Although this method hopes to find a balance between the intuitions previously explored without compromising the speed and efficiency of abstract state generation, from the running example we can see how this method can yield undesirable results in the presence of certain distributions of node values.  In this example, the first quantile is only reached after all the nodes have been added to the first abstract state, leaving no nodes remaining to be partitioned into the subsequent abstract states.



        \subsubsection{\NoCaseChange{equalDistVB2}} \label{sec:ordered-partitioning-schemes:equalDistVB2}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Part{equalDistVB}$ (Algorithm \ref{alg:psi-equalDistVB})
            \end{quote}

            By simply reversing the sort order, equalDistVB2 is able to use the same partitioning strategy $\Part{equalDistVB}$ associated with equalDistVB while mitigating some of the overfilling of abstract states.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $v(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n^{*}}|$ leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100}, \set{}, \set{}, \set{10, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0}$

            We see that equalDistVB2 can still be subject to over packing of abstract states.  Next we present two more equalDistVB variants that continue to mitigate this artifact.



        \subsubsection{\NoCaseChange{equalDistVB3}} \label{sec:ordered-partitioning-schemes:equalDistVB3}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Part{equalDistVB3}$ (Algorithm \ref{alg:psi-equalDistVB3})
            \end{quote}

            \begin{algorithm}[t!]
                \caption{$\Part{equalDistVB3}$}
                \label{alg:psi-equalDistVB3}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $v(.)$}
                \Output{
				With 
				%
				$v(\bs{A_{1,...,i}}) = \sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} v(n')$,
				%
				$n_{\bs{A_{i}}}^{\tn{last}}$ being the last node in $\bs{A_{i}}$, 
				%
				and 
				%
				$P_{i} = \frac{ i \cdot \sum_{n \in \bs{n^{*}}}v(n)}{nAbs}$,
				%
                $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
                $(\; v(\bs{A_{1,...,i}}) \geq P_{i} \;)$
                $\land$ \\ $(\; (\, |\bs{A_{i}}|=1 \,) \lor (\, v(\bs{A_{1,...,i}}) - v(n_{\bs{A_{i}}}^{\tn{last}}) < P_{i} \,) \;)$ }
                
                \Begin{
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        $\bs{A_{i}} = \set{n^{*}_{{j}}}$\\
                        $j \leftarrow j+1$\\
                        \While{$v(\bs{A_{1,...,i}}) < P_{i}$}{
                            $\bs{A_{i}} \leftarrow \bs{A_{i}} \cup \set{n^{*}_{{j}}}$\\
                            $j \leftarrow j+1$
                        }
                    }
                    $\bs{A} = \set{\bs{A_{1}},...,\bs{A_{nAbs}}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            In order to lessen over packing and ensure abstract states are not left empty, equalDistVB3 modifies equalDistVB2 so that, after each abstract state is processed, the next state is by default assigned at least a single node.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $v(A_{1...i})$ can be updated progressively in constant time, and thus computation of $P_{i}$ at each iteration can also be done in constant time. Partitioning is achieved via one pass through $|\bs{n^{*}}|$ leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100}, \set{10}, \set{1.5}, \set{1.4, 1.3, 1.2, 1.1, 1.0}$

            Still highly efficient, equalDistVB3 manages to ensure that the provided $nAbs$ granularity is honored, allowing users better control of the search vs. sampling interpolation possible with abstraction sampling.



        \subsubsection{\NoCaseChange{equalDistVB4}} \label{sec:ordered-partitioning-schemes:equalDistVB4}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Part{equalDistVB4}$ (Algorithm \ref{alg:psi-equalDistVB4})
            \end{quote}

            \begin{algorithm}[t!]
                \caption{$\Part{equalDistVB4}$}
                \label{alg:psi-equalDistVB4}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $v(.)$}
                \Output{
				With 
				%
				$v(\bs{A_{1,...,i}}) = \sum_{j=1}^{i} \sum_{n' \in \bs{A_{j}}} v(n')$,
				%
				$n_{\bs{A_{i}}}^{\tn{last}}$ being the last node in $\bs{A_{i}}$, 
				%
				and 
				%
				$L_{i} = \frac{v(\bs{n^{*}})-v(\bs{A_{1,...,i-1}})}{nAbs-i+1}$,
				%
                $\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ such that for $i=1,...,nAbs$ in order,
                $(\; v(\bs{A_{i}}) \geq L_{i} \;)$
                $\land$ \\ $(\; (\, |\bs{A_{i}}|=1 \,) \lor (\, v(\bs{A_{i}}) - v(n_{\bs{A_{i}}}^{\tn{last}}) < L_{i} \,) \;)$ }
                
                \Begin{
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs$}{
                        $\bs{A_{i}} = \set{}$\\
                        \While{$v(\bs{A_{i}}) < L_{i}$}{
                            $\bs{A_{i}} \leftarrow \bs{A_{i}} \cup \set{n^{*}_{{j}}}$\\
                            $j \leftarrow j+1$
                        }
                    }
                    $\bs{A} = \set{\bs{A_{1}},...,\bs{A_{nAbs}}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            The final variant of the equalDist schemes, equalDistVB4, attempts to perform a more even partitioning than the previous variants by recomputing quantiles. Each time the algorithm progresses to processing a new abstract state, the remaining nodes and abstract states are used to compute new quantiles, which are then used to guide filling of the current abstract state in the same way as before.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $v(A_{1...i})$ can be updated progressively in constant time, and thus computation of $L_{i}$ at each iteration can also be done in constant time.  Partitioning is achieved via one pass through $|\bs{n^{*}}|$ leading to $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100}, \set{10}, \set{1.5, 1.4, 1.3}, \set{1.2, 1.1, 1.0}$

            Still highly efficient, equalDistVB4 manages to ensure that the provided $nAbs$ granularity is honored, allowing users better control of the search vs.\ sampling interpolation possible with abstraction sampling.


        \subsubsection{\NoCaseChange{randVB}} \label{sec:ordered-partitioning-schemes:randVB}

            \begin{quote}
                $o = \tn{high to low}$\\
                $\Part{randVB}$ (Algorithm \ref{alg:psi-randVB})
            \end{quote}

            \begin{algorithm}[t!]
                \caption{$\Part{randVB}$}
                \label{alg:psi-randVB}
                \begin{footnotesize}
                    \SetInd{0.25em}{0.55em}
                    \DontPrintSemicolon 
                \Input{A set of ordered nodes $\bs{n^{*}}$ to be partitioned into $nAbs$ abstract states; a value function $v(.)$}
                \Output{$\bs{n^{*}}$ partitioned into abstract states\super{\ref{ftn:ordered-schemes-maintain-sort-order}} $\bs{A} = \setst{\bs{A_{i}}}{i \in \set{1,...,nAbs}}$ }
                
                \Begin{
                    $\bs{s} \sim Unif(\setst{\bs{M} \subseteq \set{1,...,|\bs{n^{*}}|-1}}{|\bs{M}|=nAbs-1})$\\
                    $\bs{s^{*}_{}} \leftarrow SORT(\bs{s})$\\
                    $j \leftarrow 1$\\
                    \ForEach{$i \leftarrow 1,...,nAbs\!-\!1$}{
                        $\bs{A_{i}} = \set{n^{*}_{j},...,n^{*}_{s^{*}_{i}}}$\\
                        $j \leftarrow s^{*}_{i}+1$
                    }
                    $\bs{A_{nAbs}} = \set{n^{*}_{j},...,n^{*}_{|\bs{n^{*}}|}}$\\
                    $\bs{A} = \set{\bs{A_{1}},...,\bs{A_{nAbs}}}$\\
                    \Return $\bs{A}$       
                }
                \end{footnotesize}
            \end{algorithm}

            If the quality of $v(.)$ as a measure of similarity is unknown or poor, it could instead be beneficial to rely on randomness to ensure a diverse sampling of abstractions.  randVB does this by sampling $nAbs\!-\!1$ partition points between the sorted nodes $\bs{n^{*}}$ uniformly at random and without replacement, and then partitioning the nodes accordingly. As a result, abstract states are formed such that nodes are still grouped according to $v(.)$, but the sizes of those groups vary.
            
            \semph{Time Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                $\mathcal{O}(|\bs{n^{*}}|)$ time complexity.

            \semph{Space Complexity.\super{\ref{ftn:time-complexity-assumes-constant-time-v}}}\hfill\\
                No more than linear space is required.  $\mathcal{O}(|\bs{n^{*}}|)$

            \semph{Result on \hyperref[sec:ordered-partitioning-schemes:running-example]{Running Example}.}\hfill\\
                $\set{100, 10}, \set{1.5}, \set{1.4, 1.3, 1.2}, \set{1.1, 1.0}$;\\
                $\set{100}, \set{10, 1.5, 1.4, 1.3}, \set{1.2, 1.1}, \set{1.0}$;\\
                ...etc.







    \section{Empirical Evaluation} \label{sec:empirical-evaluation}

        \subsection{Results} \label{sec:empirical-evaluation:results}

        \subsection{Analysis} \label{sec:empirical-evaluation:analysis}


    % \section{Increasing Understanding of Abstraction Sampling Probes}\label{sec:understanding-abstraction-sampling-probes}

    %     As the main theoretical contribution of this work, we provide theory and understanding of probe structure and how different probe generation can lead to better or worse estimates.
        
    %     \subsection{Probe Size} \label{sec:understanding-abstraction-sampling-probes:probe-size}
    %         As will be highlighted in Section \ref{sec:empirical-evaluation}, even when abstractions are set to have the same granularity they can result in different probe structures, which can also cause probes that vary greatly in the number of nodes generated during their construction, and ultimately their final size.  This has implications on the configurations captured by the probes as well as the computation (ie. time) necessary to generate them.

    %         In the corresponding section of the supplemental, you can find detailed examples of how, even for the same granularity, probe generation can vary leading to different probe sizes.  Below, we provide a theoretical analysis of the potential variation and then we follow with the practical implications of this understanding.

    %             \begin{theorem}[Minimum size of an AOAS probe based on a chain pseudo tree]
    %             \hfill \\
    %             Consider a chain pseudo tree \PT based on the ordered variable set $\X = \set{X_{1}, ..., X_{N}}$ and an abstraction function with granularity of $nAbs$ per variable.
                
    %             If $|D_{\X}| > nAbs$, then let the ordered set
    %             \begin{align*}
    %                 \bs{Y} = \set{X_{y}, ..., X_{N}}
    %             \end{align*}
    %             s.t. $|D_{\bs{Y}}| \geq nAbs$, yet $|D_{\bs{Y} \setminus \set{X_{y}}}| < nAbs$.  Otherwise, let
    %             $\bs{Y} = \X$.
                
    %             Then the smallest probe that can be generated will have size
    %             \begin{align}
    %                 \begin{split}
    %                     min(\{|\widetilde{T}^{(m)}|\}_{m=1}^{\infty}) =& 
    %                           \;
    %                            |\X| - |\bs{Y}| \\
    %                         &+ |T_{\bs{Y}}| \\
    %                         &- \sum_{X_{N}, ..., X_{y}} prune(X_{i})
    %                 \end{split},
    %             \end{align}
    %             \begin{align}
    %                 prune(X_{i}) =
    %                 \begin{cases}
    %                     max(0,|D_{\bs{Y}}| - nAbs), &X_{i}=X_{N} \\
    %                     \floor{prune(X_{i+1})/D_{X_{i+1}}}, &otherwise
    %                 \end{cases}
    %             \end{align}
                
                
    %             \begin{proof}[Proof by construction]
    %                 \hfill \\
    %                 \begin{quoting}
    %                     \textbf{Case 1: ($|D_{\X}| \leq nAbs$})
    %                     \begin{quoting}
    %                         Since even the fully expanded search tree does not produce levels that have $nAbs$ number of nodes, the full search tree will be expanded for every probe and
    %                         \begin{align}
    %                             |\widetilde{T}^{(m)}| &= T_{\X}
    %                         \end{align}
    %                         According to the case criterion and definition of the ordered variable set $\bs{Y}$, $\bs{Y} = \X$ and thus $(|\X| - |\bs{Y}|) = 0$ and $|D_{\bs{Y}}| \leq nAbs$. The latter further implies that $max(0,|D_{\bs{Y}}| - nAbs)=0$ which in turn implies that $\sum_{X_{N}, ..., X_{y}} prune(X_{i}) = 0$. Thus,
    %                         \begin{align}
    %                             \begin{split}
    %                                 |\widetilde{T}^{(m)}| &= T_{\X}\\
    %                                                       &= |\X| - |\bs{Y}| \\
    %                                                             &\;\;\;\;+ |T_{\bs{Y}}| \\
    %                                                             &\;\;\;\;- \sum_{X_{N}, ..., X_{y}} prune(X_{i})
    %                             \end{split}
    %                         \end{align}
    %                     \end{quoting}

    %                     \textbf{Case 2: ($|D_{\X}| > nAbs$})
    %                     \begin{quoting}
    %                         Then $\bs{Y} = \set{X_{y}, ..., X_{N}}$ s.t. $|D_{\bs{Y}}| \geq nAbs$ and $|D_{\bs{Y} \setminus \set{X_{y}}}| < nAbs$.\\

    %                         We start with the observation that the final level of the probe must have $nAbs$ leaves since, at some level the probe will expand to at least $nAbs$ leaves since $|D_{\X}| > nAbs$ and so, at that level and subsequent levels, abstractions will group newly expanded nodes to $nAbs$ abstractions.  We also make the observation that the smallest such sub tree contains $nAbs$ nodes \textit{only} at the final level (with all other levels having fewer than $nAbs$ nodes).  Assuming such a probe can be constructed, these observations imply that there exists some level $y$ where expansions from level $y$ to $N$ result in $nAbs$ or more leaves, but where expansions from level $y$ to level $N-1$ still produce fewer than $nAbs$ leaves.  With these observations, we can see that the smallest probe would then be one that consists of a single path for levels $1$ through $y-1$ and then results in $nAbs$ leaves at level $N$.

    %                         However, assuming expansions from level $y$ to level $N$ can produce more than $nAbs$ leaves, there can be many probes that consist of a path from levels $1$ through $y-1$ and $nAbs$ leaves at level $N$.  However, following similar logic as before, the smallest such probe will be one with the smallest number of nodes at higher levels, the smallest being achieved when the final $nAbs$ nodes of level $N$ extend from the fewest nodes of level $N-1$, and those nodes of level $N-1$ extend from the fewest nodes of level $N-2$, and so on.  
                            
    %                         Formally, this means that every level until level $y$ contributes a single node to the probe; in other words $|\X|-|\bs{Y}|$ nodes.  To express the size of the remaining sub tree consisting of levels $y$ through $N$, we can first consider the fully expanded sub tree $T_{\bs{Y}}$ and then recursively remove nodes from bottom up so that there are $nAbs$ nodes in the last level and the nodes are grouped so that they extend from the fewest nodes of the previous level, and recursively repeat the procedures with the nodes of the previous level all the way up to level $y$.  This results in $|T_{\bs{Y}}| - \sum_{X_{N}, ..., X_{y}} prune(X_{i})$ nodes for levels $y$ to $N$.

    %                         Thus, the final size of such a minimum size probe $\widetilde{T}^{(m)}_{min}$ will be
    %                         \begin{align}
    %                             \begin{split}
    %                                 |\widetilde{T}^{(m)}_{min}| =& 
    %                                       \;
    %                                        |\X| - |\bs{Y}| \\
    %                                     &+ |T_{\bs{Y}}| \\
    %                                     &- \sum_{X_{N}, ..., X_{y}} prune(X_{i})
    %                             \end{split}
    %                         \end{align}

    %                         Finally, we are left with the task of proving that such a probe can be constructed by AOAS.  Indeed this is so.  Such a probe could occur if, at every level that abstractions occur, the abstraction functions group nodes such that it is possible for the chosen $nAbs$ representatives to extended from as few parent nodes as possible for $nAbs$ number of nodes.  Not every abstraction function can satisfy this criterion, however, there can always be some abstraction function that can (ex. an abstraction function that places $nAbs-1$ of the nodes mentioned above into a singleton abstract state, and the rest of the nodes into the last).
    %                     \end{quoting}

    %                     And given that \textbf{Case 1} and \textbf{Case 2} explore all the possible cases, we prove our claim.
    %                 \end{quoting}
    %             \end{proof}
    %         \end{theorem}

    %         \todo{corollary with simple complexity for min probe size}

    %         \todo{theorem max probe size}
    %         \todo{corollary with simple complexity for max probe size}
    %         \todo{corollary at any point, probe size will be bounded by the max probe size}


    \section{Conclusion} \label{sec:analysis}


        
\clearpage

    \bibliography{ref}




\end{document}