% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\newcommand{\shrink}[1]{}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{NeuroBE: Escalating Neural Network Approximations of Bucket Elimination (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%\author{Sakshi Agarwal}
%\author[]{Kalev Kask}
%\author[]{Alex Ihler}
%\author{Rina Dechter}

\author{\href{mailto:<sakshia1@uci.edu>?Subject=Your UAI 2022 paper}{Sakshi Agarwal}}
\author[]{\href{mailto:<kkask@uci.edu>?Subject=Your UAI 2022 paper}{Kalev Kask}}
\author[]{\href{mailto:<ihler@ics.uci.edu>?Subject=Your UAI 2022 paper}{Alex Ihler}}
\author{\href{mailto:<dechter@ics.uci.edu>?Subject=Your UAI 2022 paper}{Rina Dechter}}

%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    %\\
    University of California Irvine
    %Pittsburgh, Pennsylvania, USA
}
  
  \begin{document}
\maketitle

\subsection{Estimating the pseudo-dimension of a NN:}
In our work, we use NN architectures with ReLU activation functions. To construct a NN with $L$ layers and $h$ hidden units per layer to model a specific local bucket message $\mu^*$, we pick the rule $h=b*w$, where $w$ is the bucket's width and $b$ is a constant. With this choice, the number of parameters in the NN is:
\begin{equation}
|\theta| = (L-1)*b^2*w^2+ b*w^2 + (L+1)*b*w + 1
\end{equation}

We make use of the lower bound of pseudo-dimension for NNs with ReLU activation functions from the work in \citet{JMLR:v20:17-612} to get: 

\begin{equation}
\rho = |\theta|*L\log(|\theta|/L)
\end{equation}

By substituting Eq. 1 in Eq. 2 and ignoring all linear terms in $w$  we get that $\rho$ can be dominated by:
%We simplify the dependency of $\rho_c$ on $|\theta_c|$ by taking the term quadratic in width $w_c$ from eq 7 to give the following: 
\[
\rightarrow \rho \propto (L*b*w)^2\log(b*w)
\]

\subsection{Estimating error in partition function:}
\begin{theorem*}

Let $B_c$ be a bucket in a bucket chain along an ordering $d$; let $B_c$ contain the original functions as $\phi_c$ and $\mu_{c+1}$ as the message passed to it from the previous bucket; let $\lambda_c$ be the (global) exact message generated in  $B_c$, $\mu^*_c$ be the local exact message in $B_c$ and $\mu_c = APP(\mu^*_c)$ its  approximation  (e.g., by a trained neural network). Let $E_c = \log\mu^*_c - \log\mu_c$ 
and $\epsilon_c = \max_{B_c}|E_c|$. Then, 

\[
\log \lambda_c- \log \mu_c \leq  \sum_{k=0}^{n-c} \epsilon_{c+k}
\]
In particular, since $\lambda_1= Z$, the partition function and $\mu_1= \hat Z$, the estimate to the partition function,
%\begin{equation}
% \logZ-\log\mu_1  \leq E_1 +  \sum_{k=0}^{n-2} \epsilon_{2+k}
%\label{eqerror0}
%\end{equation}
%or 
\begin{equation}
 \log Z-\log \hat Z  \leq  \sum_{k=0}^{n-1} \epsilon_{1+k}
\label{eq:errorb}
\end{equation}
%If $max_x|\epsilon_c(x)| \leq \epsilon$ for some  $\epsilon \geq 0$, then,

%\[
%\logZ-\log\mu_1 \leq  n*\epsilon
%\]

%where $n$ is the number of variables.
\end{theorem*}


\begin{proof} We will next derive the recursion, starting at the first processed bucket $B_n$ and going down in order. Remember throughout that
$\log \mu^*_{n-i} = \log \sum_{X_{n-i}} (e^{\log \phi_{n-i} + \log \mu_{n-i+1}})$

{\bf For $B_n$} $\lambda_n = \mu^*_n$, therefore
\[
\log \lambda_n- \log \mu_n = \log \mu^*_n - \log \mu_n = E_n
\]

{\bf For $B_{n-1}$}, by definition
\[
\log \lambda_{n-1} - \log \mu_{n-1} = \log \sum_{X_{n-1}}e^{ \log \phi_{n-1} + \log\lambda_n} - \log \mu_{n-1} 
\]
%\[
%=  \log\sum_{X_{n-1}}e^{\log\phi_{n-1} + \log\mu^*_n} - \log\mu_{n-1}
%\]
%\[
%= \log\sum_{X_{n-1}}e^{[(\log\phi_{n-1}+ \log\mu_n) + (\log\mu^*_n- \log\mu_{n})]} - log\mu_{n-1}
%\]
Substituting $\log\lambda_n$ from $B_n$
\[
= \log\sum_{X_{n-1}}e^{[(\log\phi_{n-1}+ \log\mu_n) + E_n]} - \log\mu_{n-1}
\]
\[
= \log[\sum_{X_{n-1}}e^{(\log\phi_{n-1}+ \log\mu_n)}e^{E_n}] - \log\mu_{n-1}
\]

If $\max_{scope(\mu^*_n)}|E_n| = \epsilon_n$, then, 


\[
\leq \log[e^{\epsilon_n}\sum_{X_{n-1}}e^{(\log\phi_{n-1}+ \log\mu_n)}] - \log\mu_{n-1}
\]


\[
\leq \epsilon_n + \log\sum_{X_{n-1}}e^{(\log\phi_{n-1}+ \log\mu_n)} - \log\mu_{n-1}
\]

%\[
%= \sum_{X_{n-1}} loge^{\log\phi_{n-1} + \log\mu_n} -log\mu_{n-1} + \sum_{X_{n-1}} log(e^{\log\phi_{n-1} + (\mu^*_n- \mu_{n})}
%\]

Since $\log\sum_{X_{n-1}} e^{ \log\phi_{n-1} + \log\mu_n} = \log\mu^*_{n-1}$ we get

\begin{equation}
\log\lambda_{n-1} - \log\mu_{n-1} \leq \epsilon_n +  \log\mu^*_{n-1} - \log\mu_{n-1}
\label{eq10_}
\end{equation}
or equivalently,

\begin{equation}
\log\lambda_{n-1} - \log\mu_{n-1} \leq \epsilon_n + E_{n-1} 
\end{equation}

{\bf Moving to $B_{n-2}$}, by definition:
\begin{equation}
\log\lambda_{n-2} - \log\mu_{n-2} = \log\sum_{X_{n-2}} e^{\log\phi_{n-2} + \log\lambda_{n-1}} - \log\mu_{n-2}
\end{equation}
Substituting  $\log\lambda_{n-1}$ from Eq. (\ref{eq10_}) 
 we get
%(Substituting $E_{j} = \mu^*_j - \mu_j$, and %remembering that $\mu^*_{n-2} = \sum_{X_{n-2}} %\phi_{n-2} \mu_{n-1}$), we get
\[
\log\lambda_{n-2} - \log\mu_{n-2} 
\]

\[
\leq  \log\sum_{X_{n-2}} e^{\log\phi_{n-2} + [ \log\mu_{n-1} +\epsilon_n + E_{n-1}]}  - \log\mu_{n-2}
\]

\[
\leq  \log\sum_{X_{n-2}} e^{\log\phi_{n-2} + \log\mu_{n-1}} e^{\epsilon_n + E_{n-1}}  - \log\mu_{n-2}
\]
Taking $\max_{scope(\mu^*_{n-1})}|E_{n-1}| = \epsilon_{n-1}$,

\[
\leq  \log[e^{\epsilon_n + \epsilon_{n-1}} \sum_{X_{n-2}} e^{\log\phi_{n-2} + \log\mu_{n-1}}]   - \log\mu_{n-2}
\]
\[
\leq \epsilon_n +\epsilon_{n-1} + \log \sum_{X_{n-2}} e^{\log\phi_{n-2} + \log\mu_{n-1}}   - \log\mu_{n-2}
\]
\[
\leq \epsilon_n +\epsilon_{n-1} + \log\mu^*_{n-2}   - \log\mu_{n-2}
\]
%\[
%= E_{n-2} + \sum_{X_{n-2}} \phi_{n-2}  [E_{n-1} + \sum_{X_{n-1}} %\phi_{n-1} E_n ]
%\]
%\[
%= E_{n-2} + \sum_{X_{n-2}} \phi_{n-2}  E_{n-1} + \sum_{X_{n-2}} \phi_{n-2} \sum_{X_{n-1}} %\phi_{n-1} E_n
%\]
yielding,
\begin{equation}
\log\lambda_{n-2} - \log\mu_{n-2} \leq
E_{n-2}+ \epsilon_{n-1} + \epsilon_n
\label{eq11}
\end{equation}


{\bf Moving to bucket $B_{n-3}$}, by definition
\[
\log\lambda_{n-3} - \log\mu_{n-3} = \log\sum_{X_{n-3}} e^{\log\phi_{n-3} +  \log\lambda_{n-2}} - \log\mu_{n-3}
\]
Substituting for $\lambda_{n-2}$ from Eq. (\ref{eq11}) we get with some algebra
\[
\log\lambda_{n-3} - \log\mu_{n-3}
\]
\[
\leq \log\sum_{X_{n-3}} e^{\log\phi_{n-3} + [ \log\mu_{n-2} +  E_{n-2} + \epsilon_{n-1} + \epsilon_n]} - \log\mu_{n-3}
\]
%\[
%=E_{n-3}  +
%\sum_{X_{n-3}} \phi_{n-3} [ E_{n-2} + \sum_{X_{n-2}} \phi_{n-2}  %E_{n-1} + \sum_{X_{n-2} X_{n-1}} \phi_{n-2} \cdot \phi_{n-1} E_n
%\]
yielding
\[
\log\lambda_{n-3} - \log\mu_{n-3} \leq E_{n-3}  + \epsilon_{n-2} + \epsilon_{n-1} + \epsilon_n
\]
and so on. Clearly the  emerging expression for bucket $B_c$ is

\begin{equation}
 \log\lambda_c - \log\mu_c \leq E_c +  \epsilon_{c+1} + \epsilon_{c+2} + \dots + \epsilon_n
 \end{equation}
 
  or,
\begin{equation}
 \log\lambda_c-\log\mu_c \leq E_{c}+ \sum_{k=0}^{n-c-1} \epsilon_{c+1+k}
\label{eqerror}
\end{equation}

The general transition from $n-i$ to $n-i-1$ can be easily followed to complete the inductive proof.
\shrink{
It can be proved by induction. We have already shown the base case and the second case above. Assume the expression is correct for  $j =n...i$ we will show it for $j=i-1$

By definition
\[
\lambda_{i-1} - \mu_{i-1} = \sum_{X_{i-1}} \phi_{i-1} \cdot \lambda_{i} - \mu_{i-1}
\]
From equation (\ref{eqerror}) substituting for $\lambda_i$  we get
\[
= \sum_{X_{i-1}} \phi_{i-1} [E_i + \sum_{k= 0}^{n-i-1} 
\sum_{X_i..X_{i+k} } (\phi_i \cdot \phi_{i+1} ...\cdot \phi_{i+k}) E_{i+k+1}]  -\mu_{i-1}
\]

}
Assuming that we control the derivation of $\mu_c$ for each $B_c$ to ensure that $E_c = \log\mu^*_c - \log\mu_c \leq \epsilon_c$, and writing $\epsilon = \max_{k} \epsilon_k$, we get by substituting in Eq. (\ref{eqerror}) that
\begin{equation}
 \log \lambda_c- \log \mu_c  \leq  \epsilon_c + \sum_{k=0}^{n-c-1} \epsilon_{c+1+k} \leq (n-c+1)*\epsilon
\label{eqerror1}
\end{equation}


%Taking $G.E[i]$ as the global error of bucket $i$ and $E[i]$ as its local error, the following recursion holds true : 
%\[
%G.E[i] \leq E[i] + G.E[i+1]
%\]


\end{proof}

% NOTE: necessary when ptmx or no mathfont class option is given
%\providecommand{\upGamma}{\Gamma}
%\providecommand{\uppi}{\pi}

\subsection{Miscellaneous experiments on analyzing error}

Calculating $\epsilon$ from Theorem 1 is hard because it involves computing the local bucket error $E$ over all configurations in the scope of the bucket. Therefore, we calculate the maximum over a sampled test set (lines 18--19 of Algorithm 3) % estimates the local bucket error bound$\epsilon_c$ as 
as $\hat \epsilon$. % and $\hat \epsilon^{avg}$ respectively. 
Additionally, we also calculate the average local bucket error, $\hat \epsilon^{avg}$ over the same test set. 
%\fromrina{S: you need to give more information here. how do you compute the error and why is it an estimate? aren't you just computing the max over the log ratio?} 

To bound the global error of the approximated partition function from Eq. \ref{eq:errorb}, we sum over all the estimated bucket error bounds, $\hat\epsilon$. % summing over all bucket error bounds, $\hat \epsilon$,  bounds the global error of the estimated partition function. 
Clearly, the bound is very loose. We therefore also 
use the average local bucket error, $\hat \epsilon^{avg}$ to give us some additional information on the global error empirically: 

\begin{equation}
\hat E_1 \leq  \sum_{k=0}^{n-1} \hat\epsilon^{avg}_{1+k}
\label{eq:errorb_avg}
\end{equation}


\begin{figure*}[tb]
%\vspace{.3in}
%\begin{subfigure}[b]{\linewidth}
    \includegraphics[width=\linewidth]{figures/error-analysis.png}
  %  \caption{DBN}
 % \end{subfigure}
%\vspace{.3in}
%\includegraphics[width=\linewidth]{figures/graph.png}
\shrink{
\begin{subfigure}[b]{\linewidth}
\includegraphics[width=\linewidth]{figures/graph3.png}
    \caption{\smaller Empirical global error vs local error bound for 2 sample sizes. %across buckets
    }
  \end{subfigure}
  
 % \begin{subfigure}[b]{0.45\linewidth}
 %  \includegraphics[width=\linewidth]{figures/graph1.png}
 %   \caption{\smaller Local  error bound %across buckets
 %   }
 % \end{subfigure}
 % \begin{subfigure}[b]{0.45\linewidth}
 %  \includegraphics[width=\linewidth]{figures/graph2.png}
 %   \caption{\smaller Global  error bound %Eq. \ref{}
 %   }
 % \end{subfigure}
}
\caption[]
{\small 
Statistics of local bucket errors compared with the global error over 5 runs for 4 grid-hard instances having $w=55$ with $i$-bound${}=20$, where $h=w$ and the number of buckets trained is $\#NB=308$, for two different scales of sample sizes. %\fromrina{what is the i-bound?} 
{\em test wmse} is the w.m.s.e.\ of the learned NN over the test set; {\em local bucket error} is the average L1 error for $\log\lambda$ approximations over all buckets; {\em estimated bounds} is the bound obtained in Eq.~\ref{eq:errorb}; {\em empirical error} is the average global error over 5 runs. 
%Two bar graphs showing the local and global error bounds for 4 grid-hard instances for 2 configurations of average \#samples when \textit{NeuroBE} is run once. 
}
\label{fig:error-analysis}
\end{figure*}


{\bf Relationship between local and global errors empirically. %local and global 
}

Figure \ref{fig:error-analysis}a) depicts the empirical global errors against the local error bound for 4 grid instances over 2 sample configurations=\{60k,120k\}. Specifically, %we show  
the local error bound shown is the maximum over the estimated local bucket errors ($\hat\epsilon^{avg}$ from Eq. \ref{eq:errorb_avg}) across all buckets and the (empirical) global error is the error in the partition function estimate. As expected, we see a somewhat linear relationship between the global error and the local error bound. We also see that higher samples drive the local and global errors towards the lower-left of the plot and vice-versa. 

{\bf Impact of sample size on %local and global 
error bounds.} %\fromrina{this part is very unclear. We should consider removing. you have to talk about what you are computing and what is in the graph. I could understand better the previous table.} 
Figure \ref{fig:error-analysis} also depicts the impact of sample size on the estimated local and global error bounds (Eq. \ref{eq:errorb_avg}). %\fromrina{as derived in  xxx }. 
Specifically, %we show  
the local error bound shown is the maximum over the estimated local bucket errors ($\hat\epsilon^{avg}$ from Eq. \ref{eq:errorb_avg}) across all buckets. %in Fig. \ref{fig:error-analysis}a) and the estimate to the global error bound (or the sum of the averaged local error across buckets (Sec. 4)) in Fig. \ref{fig:error-analysis}b).
As expected, we see that increasing the training sample size makes the two bounds tighter. For the 4 grid instances (f10, f5, f2, f15),  we also observed that the empirical global error in the partition function estimate for the two sample configurations \{(37.21, 7.94), (18.8, 5.9), (10.28, 3.05), (41.3, 27.5)\} is in proportion to the global error bound from Fig. \ref{fig:error-analysis}b).  %\fromrina{Sakshi, I am not sure what are the 2 bounds here. Either you talk about it fully, or not at all.}





%\bibliography{uai2022-template}

% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}

\bibliography{agarwal_339}


\end{document}
