% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz,ifthen} % nice language for creating drawings and diagrams
\usepackage{amsmath,amssymb,amsthm, eufrak}
\usepackage{bm}
\usepackage{cleveref}
\usepackage[mathcal]{eucal}
\usepackage{subfig}
\usepackage{graphicx}
\usepackage{xr}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} typesets "b sep a": the third argument comes first, then
% the optional separator (default "-"), then the second argument.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
%% ------------- Maths -----------------
% Expectation, typeset as a blackboard-bold E with operator spacing.
\DeclareMathOperator{\E}{\mathbb{E}}
% Starred form: sub-/superscripts are set as limits in display style.
% The thin space separates "arg" and "min" instead of running them together.
\DeclareMathOperator*{\argmin}{arg\,min}
% Hyperbolic secant, typeset upright like the built-in \sin, \cosh, ...
\DeclareMathOperator{\sech}{sech}
%--- Numbers ---
\newcommand{\Int}{\mathbb{Z}}
\newcommand{\IntPositive}{\mathbb{Z}_+^*}
\newcommand{\Real}{\mathbb{R}}
\newcommand{\RealPositive}{\mathbb{R}_+}
% The original (misspelled) name is kept so that existing uses still compile.
\newcommand{\RealNonnenative}{\mathbb{R}_{\geq 0}}
% Correctly spelled alias; \providecommand avoids a clash if it is already
% defined elsewhere.
\providecommand{\RealNonnegative}{\RealNonnenative}
\newcommand{\bigO}{\mathcal{O}}

%-- Marginals
% Notation for exact (P) and approximate (\tilde{P}) marginals.
% \marginals{A}: P with subscript A and no argument; the caller appends "(...)".
\newcommand{\marginals}[1]{P_{#1}}
% \marginalsApprox{i}: \tilde{P}_i(x_i); \marginalsApproxPW{i}{j}: \tilde{P}_{ij}(x_i,x_j).
\newcommand{\marginalsApprox}[1]{{\tilde{P}_{#1}(x_{#1})}}
\newcommand{\marginalsApproxPW}[2]{\tilde{P}_{#1 #2}({x_{#1},x_{#2}})}
% \marginalsSingle{i}: P_i(x_i).
\newcommand{\marginalsSingle}[1]{P_{#1}(x_{#1})}
% "Short" variants drop the subscript on P and keep only the argument x_i.
\newcommand{\marginalsShort}[1]{P(x_{#1})}
\newcommand{\marginalsShortApprox}[1]{\tilde{P}(x_{#1})}
% As \marginalsShortApprox, with a \circ superscript (the paper uses \circ to mark BP fixed points).
\newcommand{\marginalsShortApproxFp}[1]{\tilde{P}^{\circ}(x_{#1})}
% Calligraphic P; judging by the name, a set of marginals.
\newcommand{\setofmarg}{\mathcal{P}}


%% -------- Graphical Model -----------------
%--- Graph ---
% Symbols for the graph G = (X, E).
\newcommand{\graph}{\mathcal{G}}
% Node set; note that \Joint below expands to the same symbol \mathbf{X}.
\newcommand{\setofnodes}{\mathbf{X}}
% Cardinality of the node set, |X|.
\newcommand{\lenSetOfNodes}{\lvert \mathbf{X} \rvert}
\newcommand{\setofedges}{\mathbf{E}}
% State space (alphabet) of a variable.
\newcommand{\alphabet}{\mathcal{X}}
% \RV{i}: random variable X_i.
\newcommand{\RV}[1]{X_{#1}}
\newcommand{\Joint}{\mathbf{X}}
\newcommand{\edge}[2]{(#1,#2)}			% Undirected Edge
% Two neighbourhood notations exist: \nbh{i} gives \mathcal{N}(i) and
% \neighbors{i} gives N(i). NOTE(review): the text defines only
% \mathcal{N}(i); confirm whether both are intended.
\newcommand{\nbh}[1]{\mathcal{N}(#1)}
\newcommand{\neighbors}[1]{N({#1})}
% \degree{i}: d_i.
\newcommand{\degree}[1]{d_{#1}}
%---PGM ---
% \PGM[sub]: calligraphic U with an optional subscript (empty by default).
\newcommand{\PGM}[1][]{\mathcal{U}_{#1}}
% Pairwise potentials.
% \potPW{i}{j}: \Phi_{ij}(x_i,x_j); \pairwiseShort drops the subscript;
% \pairwiseSBP{i}{j}{s} uses the third argument as the subscript instead.
\newcommand{\potPW}[2]{\Phi_{#1 #2}({x_{#1},x_{#2}})}
\newcommand{\pairwiseShort}[2]{\Phi(x_{#1},x_{#2})}
\newcommand{\pairwiseSBP}[3]{\Phi_{#3}(x_{#1},x_{#2})}
% Singleton (local) potentials, with the same three variants as above.
\newcommand{\potLocal}[1]{\Phi_{#1}(x_{#1})}
\newcommand{\localShort}[1]{\Phi(x_{#1})}
\newcommand{\localSBP}[2]{\Phi_{#2}(x_{#1})}

% Set of potentials, typeset as \Psi (the leading space in the body is
% ignored in math mode).
\newcommand{\setOfPot}{ \Psi}
% Ising parameters: \coupling{i}{j} = J_{ij}, \field{i} = \theta_i.
\newcommand{\coupling}[2]{J_{#1#2}}
\newcommand{\field}[1]{\theta_{#1}}
% Means m_i; the superscripts \circ, \oplus, * or a caller-supplied one
% distinguish the variants named in the macro names.
\newcommand{\mean}[1]{m_{#1}}
\newcommand{\meanMinLocal}[1]{m_{#1}^{\circ}}
\newcommand{\meanMinLocalNeg}[1]{{m}_{#1}^{\oplus}}
\newcommand{\meanMinGlobal}[1]{m_{#1}^{*}}
\newcommand{\meanMinLocalSpecific}[2]{m_{#1}^{#2}}
% Correlations \chi_{ij}, optionally with a caller-supplied superscript.
\newcommand{\correlation}[2]{\chi_{#1#2}}
\newcommand{\correlationMinLocalSpecific}[3]{\chi_{#1#2}^{#3}}

%% -------- Belief Propagation -----------------
% Upright labels "BP" and "BP" with an upright subscript "D"; \BPD contains a
% subscript and therefore requires math mode.
\newcommand{\BP}{\text{BP}}
\newcommand{\BPD}{\text{BP}_{\text{D}}}
% \msg[iter]{i}{j}{val}: message \mu with subscript "i j" and superscript
% "(iter)". With an empty fourth argument the message is written as a function
% of x_j; otherwise it is written as evaluated at X_j = val.
% Note: if the optional argument is omitted, the superscript is an empty "()".
\newcommand{\msg}[4][]{\ifthenelse{\equal{#4}{}} {\mu^{(#1)}_{#2 #3}(x_{#3})} {\mu^{(#1)}_{#2 #3}(\RV{#3}=#4)}}
% \msgShort: like \msg, but the superscript is #1 without parentheses and a
% non-empty fourth argument is printed verbatim as the argument.
\newcommand{\msgShort}[4][]{\ifthenelse{\equal{#4}{}} {\mu^{#1}_{#2 #3}(x_{#3})} {\mu^{#1}_{#2 #3}(#4)}}
% \setOfMsg[iter]: bold \mu with superscript "(iter)".
\newcommand{\setOfMsg}[1][]{\bm{\mu}^{(#1)}}
% \fpMsg{i}{j}{arg}: message with a \circ superscript and arguments (x_j, arg).
\newcommand{\fpMsg}[3]{\mu^{\circ}_{#1 #2}(x_{#2}, #3)}
% \msgRatio[opt]{i}{j}: barred message \bar{\mu}_{ij}; the optional argument
% is accepted but not used in the output.
\newcommand{\msgRatio}[3][]{\bar{\mu}_{#2 #3}}



%% -------- Evaluation Metrics  -----------
% \spin previously expanded to \stateSpace, which is not defined anywhere in
% this preamble and would raise "Undefined control sequence" on first use.
% It now points to \alphabet, the state-space symbol used in the text.
% NOTE(review): confirm that \alphabet is the intended symbol.
\newcommand{\spin}{\alphabet}
% \magnetization[i]: \langle m_i \rangle; the subscript is empty by default.
\newcommand{\magnetization}[1][]{\langle m_{#1} \rangle}
% \mse accepts an optional argument for backward compatibility but ignores it.
\newcommand{\mse}[1][]{\text{MSE}}
\newcommand{\mseb}{\text{MSE}_{{B}}}

%--- Cost-Functions ---
% Free energies (F), average energies (U), entropies (S) and partition
% functions (Z). Convention: only the base letter is calligraphic; subscripts
% such as B (Bethe) and MF (mean field) stay outside \mathcal so that they are
% typeset identically across all macros.
\newcommand{\FG}{\mathcal{F}}
\newcommand{\UG}{\mathcal{U}}
\newcommand{\SG}{\mathcal{S}}
\newcommand{\UBdiff}{\Delta \mathcal{U}_B}
\newcommand{\FB}{\mathcal{F}_B}
%\newcommand{\FB}[1][]
%{\ifthenelse{\equal{#1}{}}
%	{\mathcal{F}_B} {\mathcal{F}_{B,all}}}
\newcommand{\FBMinGlobal}{\mathcal{F}_B^*}
\newcommand{\FBMinLocal}{\mathcal{F}_B^{\circ}}
% Was \mathcal{{F}_B}^{\oplus}: the subscript sat inside \mathcal and the
% superscript was attached to the whole group instead of being stacked above
% the subscript as in \FBMinLocal.
\newcommand{\FBMinLocalNeg}{\mathcal{F}_B^{\oplus}}
\newcommand{\FBMinLocalSpecific}[1]{\mathcal{F}_B^{#1}}
\newcommand{\FBdiff}{\Delta \mathcal{F}_B}
\newcommand{\FMF}{\mathcal{F}_{MF}}
\newcommand{\FMFdiff}{\Delta \mathcal{F}_{MF}}
\newcommand{\mutinfB}{I_B}
\newcommand{\UB}{\mathcal{U}_{{B}}}
\newcommand{\SB}{\mathcal{S}_{{B}}}
% Accent applied to the calligraphic letter, subscript kept outside
% (was \mathcal{\tilde{F}_B}).
\newcommand{\FBApprox}{\tilde{\mathcal{F}}_B}
% Was \mathcal{Z_B} / \mathcal{Z_{MF}}, which also set the subscripts in the
% calligraphic alphabet, unlike \FB and \FMF.
\newcommand{\PartitionBethe}{\mathcal{Z}_B}
\newcommand{\PartitionMF}{\mathcal{Z}_{MF}}
%\newcommand{\PartitionGibbs}{\mathcal{Z}}
\newcommand{\Partition}{\mathcal{Z}}
%\newcommand{\PartitionApprox}{\mathcal{{Z}_B}}
% Local polytope, marginal polytope and Bethe box.
\newcommand{\polytopeLocal}{\mathbb{L}}
\newcommand{\polytopeMarginal}{\mathbb{M}}
\newcommand{\Bethebox}{\mathbb{B}}
% Optimal pairwise pseudo-marginal parameter \xi_{ij}^* (indices i, j are
% hard-coded).
\newcommand{\xiopt}{\xi_{ij}^{\ast}}


%% ---------- Fixed Point Indexing --------------------
% Index symbols used to label iterations and fixed points.
\newcommand{\init}{\text{init}} %initial messages
% \iteration{n}: passes its argument through inside a brace group.
\newcommand{\iteration}[1]{{#1}}
\newcommand{\fpI}{k}
% \stableFP and \stableOne intentionally expand to the same letter "s".
\newcommand{\stableFP}{s}
\newcommand{\stableOne}{s}
\newcommand{\stableTwo}{t}
\newcommand{\minFP}{(m)}
% These used \vm, which is not defined in this preamble and would fail on
% first use; \bm (from the loaded bm package, as in \setOfMsg) is used instead.
% NOTE(review): confirm that bold is the intended style for these sets.
\newcommand{\setOfStable}{\bm{S}}
\newcommand{\setOfMin}{\bm{M}}

%% ----------- Theorem Environments ---------------------
% Theorem-like environments. None shares a counter with another, so each is
% numbered independently; the commented-out [section] would number within
% sections. They are declared before \theoremstyle{definition} and therefore
% use whatever style is active at this point (amsthm's default "plain",
% unless the class changes it).
\newtheorem{claim}{Claim}%[section]
\newtheorem{thm}{Theorem}%[section]
\newtheorem{lemma}{Lemma}%[section]
\newtheorem{con}{Conjecture}
\newtheorem{cor}{Corollary}
% NOTE(review): the heading is printed as "Properties" (plural); confirm this is intended.
\newtheorem{prop}{Properties}
\newtheorem{propos}{Proposition}

% From here on, new environments use the "definition" style.
\theoremstyle{definition}
% Starred form: unnumbered "Example" environment.
\newtheorem*{ex}{Example}

\externaldocument{leisenberger_186-supp}

\title{Fixing the Bethe Approximation: \\ How Structural Modifications in a Graph Improve Belief Propagation}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:Harald Leisenberger <harald.leisenberger@tugraz.at>?Subject=Your UAI 2022 paper}{Harald~Leisenberger}{}}
\author[1]{Franz Pernkopf}
\author[1]{Christian Knoll}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Signal Processing and Speech Communication Laboratory\\
	Graz University of Technology\\
	Austria
}
%\affil[2]{%
    %Second Affiliation\\
    %Address\\
    %…
%}
%\affil[3]{%
    %Another Affiliation\\
    %Address\\
    %…
  %}
  
\begin{document}
\maketitle

\begin{abstract}
 Belief propagation is an iterative method for inference in probabilistic graphical models. 
 Its well-known relationship to a classical concept from statistical physics, the Bethe free energy, puts it on a solid theoretical foundation. 
 If belief propagation fails to approximate the marginals, then this is often due to a failure of the Bethe approximation. 
 In this work, we show how modifications in a graphical model can be a great remedy for fixing the Bethe approximation.
 Specifically, we analyze how the removal of edges influences and improves belief propagation, and demonstrate that this positive effect is particularly distinct for dense graphs.
\end{abstract}

\section{INTRODUCTION} \label{sec:introduction}
Message passing algorithms are an effective method for approximate inference in probabilistic graphical models~\citep{koller2009graphical}.
Although they often perform well in practice, there are only few guarantees about their theoretical behavior.
%They often give accurate results in practice; however, this is not guaranteed and one can hardly make statements about their general performance. 
%This opened up many possibilities for better understanding their practical behavior and improving their methodology.
A remarkable milestone was the discovery of a direct connection between message passing algorithms and concepts of statistical mechanics~\citep{yedidia2001bethe}, perhaps the most famous being the relationship between belief propagation (BP)~\citep{pearl1988reasoning} and the so-called Bethe free energy~\citep{bethe1935superlattices, peierls1936superlattices}.
%that has its origins in the theory of magnetism

%It is well-known that fixed points of BP are in a one-to-one correspondence to stationary points of the Bethe free energy.
One favorable property of BP is its exactness on trees. 
On loopy graphs, however, it frequently suffers from two major issues:
first, it may fail to converge to a fixed point and thus to find reasonable estimates of the marginals. 
%the iterative message passing process may fail to converge to a fixed point and thus to approximate reasonable estimates of the marginals. 
%The first is an issue of approximation: the iterative message passing process may fail to converge to a fixed point and thus to approximate reasonable estimates of the marginals. 
Second, the fixed points themselves may induce bad estimates of the marginals, in which case even convergence would not help.
%to lead to acceptable results.
%The second is an issue of estimation: some or all BP fixed points may induce bad estimates of the marginals and even convergence might not lead to acceptable results.

To solve the first issue, various techniques have been developed that can improve the convergence behavior of BP; e.g., one can damp the message updates~\citep{murphy1999empirical} or utilize elaborate scheduling schemes~\citep{elidan2006residual, sutton2007dynamic, knoll2015scheduling, aksenov2020relaxed}. 
Moreover, it depends on the properties of the graphical model~\citep{tatikonda2002gibbs, ihler2005message, mooij2007sufficient} and on the initialization of the messages~\citep{koehler2019fast, knoll2021selfguided, leisenberger2021lyapunov} if and to which fixed point BP converges.

The second issue might be even harder to overcome, as it is
%considered to be even more serious than the first one: it 
inherently linked to a failure of the Bethe approximation~\citep{weller2014understanding}. 
The detrimental influence of loops can make the Bethe free energy non-convex and cause its local minima -- and thus BP fixed points -- to be far away from the exact marginals. 
Enhanced variants of free energy approximations~\citep{yedidia2005constructing} or loop corrections~\citep{mooij2007loopcorrections} are prudent alternatives, that improve the accuracy but also increase the complexity.

In this work, we follow a different path: we aim to improve the approximation quality of the Bethe free energy itself.
To address this problem, we modify the structure of the graphical model and show how this transforms the Bethe free energy in a way that moves its local minima closer to the exact marginals.
In particular, we analyze the effect of removing individual edges from the graph.
This loop-breaking approach enforces a `reconvexification' of the Bethe free energy and therefore improves not only the accuracy of fixed points, but also the convergence behavior of BP.

We make a series of interesting theoretical contributions that arise from analyzing the behavior of the Bethe free energy on a `small scale'. 
More precisely, we introduce a measure for the discrepancy between two different representations of the Bethe free energy, each induced by a different graphical model, and then utilize this tool to relate variations in the Bethe free energy to the characteristics of the model and the behavior of BP. Theoretically and experimentally, we address the following questions: (i) How does edge removal influence the estimated marginals? (ii) How does edge removal influence the estimated partition function? (iii) Which and how many edges -- if at all --  should we remove?

The structure of this paper is as follows: Sec.~\ref{sec:background} summarizes all relevant background on graphical models, BP, and the Bethe approximation. Sec.~\ref{sec:theoretical} contains a detailed theoretical analysis and the main results. We experimentally validate our findings in Sec.~\ref{sec:experiments} and conclude the paper in Sec.~\ref{sec:conclusion}.

 
\section{BACKGROUND} \label{sec:background}

This introductory section provides a compact overview of the topics that we deal with: probabilistic graphical models, belief propagation, and the Bethe approximation.
We further introduce the Ising model and discuss related work.

\subsection{PROBABILISTIC GRAPHICAL MODELS} \label{subsec:pgms}
We consider an undirected graph $\graph = (\setofnodes,\setofedges)$, where $\setofnodes = \{1,\dots,N\}$ is a set of nodes and $\setofedges \subseteq \{(i,j): i,j \in \setofnodes\}$ is a set of edges.
An edge connects two nodes $i$ and $j$ if $(i,j) \in \setofedges$.
Note that we assume all edges to be undirected and hence $(i,j) = (j,i)$ for all pairs of connected nodes; specifically we do not count edges twice.
Furthermore, $\nbh{i}$ denotes the neighborhood of node $i$ (i.e., the set of nodes that are connected to $i$) and $d_i \coloneqq \lvert \nbh{i} \rvert$ denotes the degree of $i$.

Let $X_1, \dots, X_N$ be random variables (RVs) with state spaces $\alphabet_1, \dots, \alphabet_N$.
A probabilistic graphical model (PGM) represents a joint probability distribution $\marginals{\setofnodes}(\bm{x})$ over the RVs, where each node represents a RV\footnote{Due to the one-to-one correspondence between variables $X_i$ and nodes $i$, we shall not rigorously distinguish between them; e.g., we often write $P_i(x_i)$ instead of $P_{X_i}(x_i)$.} and edges indicate statistical dependencies between RVs.
Formally, a PGM is a pair $(\graph,\bm{\Phi})$ that associates a set of potential functions (or potentials) $\bm{\Phi} = \{\Phi_{1}(\bm{x_1}), \ldots, \Phi_{K}(\bm{x_K})\}$ with the graph $\graph$ that are defined over joint realizations of subsets $\bm{X_1},\dots,\bm{X_K} \subseteq \setofnodes$.
We shall focus on the class of binary pairwise models\footnote{A wide range of models can be equivalently transformed into binary pairwise models, although this may increase the state space considerably~\citep{weiss2000correctness, eaton2013modelreduction}.} that satisfy the following two assumptions: first, each RV has two states, i.e., $\alphabet_i = \alphabet = \{+1,-1\}$.
Second, the potentials are defined over either one (singleton potentials $\Phi_i(x_i)$) or two (pairwise potentials $\Phi_{ij}(x_i,x_j)$) RVs.
Then the joint distribution factorizes as 
\begin{equation} \label{eq:joint_distribution}
	\marginals{\setofnodes}(\bm{x})  =  \frac{1}{\Partition} \prod_{\edge{i}{j} \in \setofedges} \potPW{i}{j}\prod_{i=1}^N \potLocal{i},
\end{equation}
where $\Partition$
%= \sum\limits_{\bm{x} \in \alphabet^N} \marginals{\setofnodes}(\bm{x})$ 
is the normalization constant or partition function. 
%that normalizes $\marginals{\setofnodes}(\bm{x})$.

We consider the Ising model, whose potentials have the form $\Phi_{ij}(x_i,x_j) = \exp(J_{ij} x_i x_j)$ and $\Phi_{i}(x_i) = \exp(\theta_i x_i)$ with $J_{ij} \in \mathbb{R}$ being the coupling strength of edge $(i,j)$ and $\theta_i \in \mathbb{R}$ being the local field of node $i$. We further call an edge $(i,j)$ \emph{attractive} if $J_{ij} > 0$, and \emph{repulsive} if $J_{ij} < 0$.
Then~\eqref{eq:joint_distribution} takes the exponential form
\begin{equation} \label{eq:Ising_model}
	\marginals{\setofnodes}(\bm{x})  = \frac{1}{\Partition} \exp(-E(\bm{x}))
\end{equation}
with $E(\bm{x}) \coloneqq - \sum_{\edge{i}{j} \in \setofedges} J_{ij} x_i x_j - \sum_{i=1}^N \theta_i x_i$ being the state energy.\footnote{This parameterization often facilitates the theoretical analysis as it associates a unique parameter vector -- consisting of all $J_{ij}$ and $\theta_i$ -- with each distribution (a so-called minimal representation,~\citet{wainwright2008exponential}).}

Finally, following the terminology in~\citet{knoll2021selfguided}, we specify three types of models: \emph{unidirectional} models contain only attractive edges and all variables are biased towards the same state (i.e., either $\theta_i \leq 0$ for all $i$ or $\theta_i \geq 0$ for all $i$); \emph{attractive} models contain only attractive edges, but there may be local fields that differ in sign; \emph{general} models may contain both attractive and repulsive edges. 
Note that, by definition, attractive models include unidirectional models, while general models include both unidirectional and attractive models.

\subsection{BELIEF PROPAGATION} \label{subsec:bp}
In this work, we consider two problems: first, the computation of marginal distributions where our specific interest lies in singleton marginals $P_{X_i}(x_i)$;
%of subsets $\bm{Y} \subseteq \bm{X}$ according to
%\begin{align}
	%\marginals{\bm{Y}}(\bm{y}) = \sum_{x_i : \, X_i \in \, \setofnodes \backslash \bm{Y}} \marginals{\setofnodes}(\bm{x}),
%\end{align}
%where our specific interest lies in singleton
%and pairwise
and second, the computation of the partition function.\footnote{Actually, these two problems are closely related as marginals can be expressed as a ratio of sub-partition functions~\citep{weller2014b}.}
It is well known that an exact computation of these quantities is intractable~\citep{valiant1979permanent,cooper1990computational} and even the approximation of the marginals to a certain precision is NP-hard~\citep{dagum1993approximate}.

Belief propagation (BP) approximates the marginals by iteratively exchanging local statistical information between nodes in the form of `messages'.
This process is governed by the recursive message update equations
\begin{equation} \label{eq:message_update}
	\mu_{ij}^{(n+1)}(x_j) \propto \sum \limits_{x_i \in \alphabet}  \potPW{i}{j}\potLocal{i} \!\!\! \prod \limits_{k \in \nbh{i} \setminus \{j\}} \!\!\! \mu_{ki}^{(n)}(x_i),
\end{equation}
where the superscript $(n)$ refers to the current iteration and the subscript $ij$ refers to the direction in which a message is sent (e.g., node $i$ sends $\mu_{ij}$ to node $j$).
In principle, one can approximate the singleton marginals at any iteration by multiplying all incoming messages with the local potential:
\begin{align} \label{eq:singleton_marginals}
	\tilde{P}_{i}^{(n)}(x_i) = \frac{1}{Z_i} \potLocal{i} \hspace{-0.1cm} \prod_{k \in \nbh{i}} \hspace{-0.1cm} \mu_{ki}^{(n)}(x_i),
\end{align}
where $Z_i$ is a normalization constant.
%similarly, pairwise marginals are estimated according to
%\begin{align} \label{eq:pairwise_marginals}
	%\tilde{P}_{ij}^{(n)}(x_i,x_j) = \frac{1}{Z_{ij}} \potPW{i}{j} \Big( \hspace{-0.3cm} \prod_{k \in \nbh{i} \setminus j} \hspace{-0.3cm} \potLocal{i} \mu_{ki}^{(n)}(x_i) \Big) \hspace{-0.1cm} \Big( \hspace{-0.2cm} \prod_{l \in \nbh{j} \setminus i} \hspace{-0.3cm} \potLocal{j} \mu_{lj}^{(n)}(x_j) \Big).
%\end{align}
%Here the $Z_i$ and $Z_{ij}$ are normalization constants.
Generally, however, marginal estimates are considered to be more accurate when they are obtained from a BP fixed point~\citep{murphy1999empirical, knoll2017fixed};
more precisely, BP has converged to a fixed point $\mu_{ij}^{\circ}$, whenever an update of the form~\eqref{eq:message_update} does not alter the message values anymore (that is $\mu_{ij}^{(n+1)} = \mu_{ij}^{(n)}$ for all $(i,j)$). 


\subsection{BETHE APPROXIMATION} \label{subsec:bethe}

It is often useful to formulate marginal inference in terms of a variational problem. For this purpose, we consider some trial distribution $Q_{\setofnodes}(\bm{x})$ and define the Gibbs free energy as
\begin{equation} \label{eq:Gibbs_free_energy}
 \FG(Q_{\setofnodes}) = \UG(Q_{\setofnodes})  - \SG(Q_{\setofnodes})
\end{equation}
with $\UG = \mathbb{E}_{Q}\, [E(\setofnodes)]$ being the average energy of the model and $\SG = - \hspace{-0.2cm}\sum\limits_{\bm{x} \in \alphabet^N} Q_{\setofnodes}(\bm{x}) \log Q_{\setofnodes}(\bm{x})$ being the entropy of $Q_{\setofnodes}(\bm{x})$.
Let us further define the marginal polytope $\polytopeMarginal$ as the set of all valid probability distributions over $\setofnodes$ (i.e., that satisfy all global and local marginalization and normalization constraints).
If one minimizes $\FG$ over $\polytopeMarginal$, then one recovers the true distribution $P_{\setofnodes}(\bm{x})$ with the negative log-partition function $- \log(\Partition)$ as the functional value at the global minimum, i.e., $- \log(\Partition) = \min_{\polytopeMarginal} \FG(Q_{\setofnodes}) = \FG(P_{\setofnodes})$.\footnote{For more details on variational inference in graphical models and the marginal polytope, we refer the reader to~\citet{wainwright2008exponential, mezard2009information}.}

Two aspects, however, render the minimization of the Gibbs free energy intractable: first, the definition of the marginal polytope by exponentially many constraints; second, the evaluation of the entropy that requires summing over exponentially many terms.
The Bethe free energy approximation addresses these two issues as follows: first, it relaxes the marginal polytope $\polytopeMarginal$ to the local polytope $\polytopeLocal$ that involves only local marginalization and normalization constraints of the pairwise and singleton `pseudo-marginals' $\tilde{P}_{ij}$ and $\tilde{P}_i$:
\begin{align} \label{local_polytope}
  \begin{split}
  \polytopeLocal = \{\tilde{P}_{ij}, \tilde{P}_i : \, & \sum\limits_{x_j} \tilde{P}_{ij}(x_i,x_j) = \tilde{P}_i(x_i), \\ & \sum\limits_{x_i,x_j} \tilde{P}_{ij}(x_i,x_j) = 1, \sum\limits_{x_i} \tilde{P}_i(x_i) = 1; \\ & (i,j) \in \setofedges, i \in \setofnodes \}.
  \end{split}
\end{align}
%Note that since all constraints are linear, $\polytopeLocal$ is indeed an outer bound to $\polytopeMarginal$.
Second, it replaces the entropy $\SG$ by an accordingly weighted sum of entropy contributions from edges and nodes.
%In the case of pairwise models, two types of clusters -- consisting of either one or two (connected) variables -- are taken into account.
More concretely, the Bethe free energy is defined as $\FB = \UB - \SB$ where the Bethe average energy is
\begin{align} \label{eq:Bethe_energy}
 \begin{split}
 \UB = & -\sum_{(i,j) \in \setofedges} \sum_{x_i, x_j \in \alphabet} \marginalsApproxPW{i}{j} \log \potPW{i}{j}  \\ & - \sum_{i=1}^{N} \sum_{x_i \in \alphabet} \marginalsApprox{i} \log \potLocal{i},
 \end{split}
\end{align}
and the Bethe entropy is
\begin{align} \label{eq:Bethe_entropy}
 \begin{split}
 \SB = & -\sum_{(i,j) \in \setofedges} \sum_{x_i, x_j \in \alphabet} \marginalsApproxPW{i}{j} \log \marginalsApproxPW{i}{j} \\ & + \sum_{i=1}^{N} (d_i - 1) \sum_{x_i \in \alphabet} \marginalsApprox{i} \log \marginalsApprox{i}.
 \end{split}
\end{align}
While $\UB$ equals the true average energy $\UG$ in the exact marginals, $\SB$ is only an approximation to the true entropy $\SG$ -- unless the graph is a tree~\citep{yedidia2005constructing}.
To obtain locally consistent approximations to the exact marginals, one usually aims to minimize $\FB$ over $\polytopeLocal$. Also, the so-called Bethe partition function $\PartitionBethe$, which is implicitly defined by $-\log(\PartitionBethe) = \min\limits_{\polytopeLocal} \FB$, provides an estimate of $\Partition$.

Binary variables allow for a particularly simple description of the local polytope. Following the notation of~\citet{welling2001belief, weller2013bethebounds}, we define the pseudo-marginal distribution of $X_i$ by $\tilde{P}_i(X_i = +1) = q_i$ (implying $\tilde{P}_i(X_i = -1) = 1 - q_i$) and, for any pair of connected nodes, we denote the joint pseudo-marginal probability $\tilde{P}_{ij}(X_i = +1, X_j = +1)$ by $\xi_{ij}$.
Then the local marginalization and normalization constraints induce a full specification of the joint probability table between $X_i$ and $X_j$ in terms of the three parameters $q_i, q_j, \xi_{ij}$ (Tab.~\ref{tab:prob_table}).

\begin{table}[h] 
\centering 
\caption{Variational joint probability table for two binary variables $X_i$ and $X_j$.} \label{tab:prob_table}
\begin{tabular}{ c||c|c||c } 
 $\tilde{P}_{ij}(X_i, X_j)$ & $X_j = +1$ & $X_j = -1$ & \\ \hline \hline
 $X_i = +1$ & $\xi_{ij}$ & $q_i - \xi_{ij}$ & $q_i$ \\ \hline
 $X_i = -1$ & $q_j - \xi_{ij}$ & $1 + \xi_{ij} - q_i - q_j$ & $1-q_i$ \\ \hline \hline
 & $q_j$ & $1 - q_j$ &  \\
\end{tabular}
\end{table}

If we assume that all probabilities are strictly positive, then $\xi_{ij}$ is bounded by
\begin{equation} \label{eq:xi_bounds}
 \max(0,q_i + q_j - 1) < \xi_{ij} < \min(q_i,q_j).
\end{equation}
Inserting singleton and pairwise pseudo-marginals from Table~\ref{tab:prob_table} together with the Ising potentials from Sec.~\ref{subsec:pgms} into~\eqref{eq:Bethe_energy} and~\eqref{eq:Bethe_entropy}, the Bethe free energy becomes
\begin{align} \label{eq:Bethe_reparameterized}
 \begin{split}
 \FB = \; & - \sum_{(i,j) \in \setofedges} \, (1+  2 \; (2 \, \xi_{ij} - q_i - q_j)) \, J_{ij} \\ & + \, \, \sum_{i=1}^{N} (1 - 2 q_i) \, \theta_i \, \\ &  - \sum_{(i,j) \in \setofedges} \mathcal{S}_{ij} \; + \; \sum_{i=1}^{N} (d_i - 1) \, \mathcal{S}_{i},
 \end{split}
\end{align}
where the pairwise entropies are
\begin{align} \label{eq:Bethe_entropy_PW}
 \begin{split}
 \hspace{-0.15cm} \mathcal{S}_{ij} = & -\xi_{ij} \log \xi_{ij} \\ & - (1+\xi_{ij}-q_i-q_j) \log (1+\xi_{ij}-q_i-q_j) \\ & - (q_i - \xi_{ij}) \log (q_i - \xi_{ij}) \\ & - (q_j - \xi_{ij}) \log (q_j - \xi_{ij})
   \end{split}
\end{align}
and the local entropies are
\begin{align} \label{eq:Bethe_entropy_Loc}
 \begin{split}
 \mathcal{S}_{i} = -q_i \log q_i - (1-q_i) \log (1-q_i).
   \end{split}
\end{align}

 Then, with $(\bm{q}; \bm{\xi})$ being the vector that contains all $q_i$ and $\xi_{ij}$, the local polytope takes the simplified form
 \begin{align} \label{eq:local_polytope_reparameterized}
 \begin{split}
\hspace{-0.5cm} \polytopeLocal & = \{(\bm{q}; \bm{\xi}) \in \mathbb{R}^{\lvert \setofnodes \rvert + \lvert \setofedges \rvert}: 0 < \; q_i  < 1, i \in \setofnodes; \\ & \hspace{-0.2cm} \max(0, q_i + q_j - 1) < \; \xi_{ij}  < \min(q_i,q_j), (i,j) \in \setofedges \}.
\end{split}
\end{align}
%The usual aim is to minimize $\FB$ over $\polytopeLocal$ and obtain estimates for the marginals and partition function.
To further facilitate the task of minimizing $\FB$ over $\polytopeLocal$, \citet{welling2001belief} have derived necessary conditions for points $(\bm{q}; \bm{\xi})$ of $\polytopeLocal$ to be located at local minima of $\FB$. 
%One can even further reduce the dimensionality of the domain to be optimized over by setting $\frac{\partial}{\partial \xi_{ij}} \FB \stackrel{!}{=} 0$ and solving for $\xi_{ij}$ explicitly~\citep{welling2001belief}.
By setting the partial derivative $\frac{\partial}{\partial \xi_{ij}} \FB$ for an arbitrary edge to zero, they proved that the resulting quadratic equation
\begin{align} 
 & \hspace{-0.2cm} \alpha_{ij} \xi_{ij}^2 - (1 + \alpha_{ij} (q_i + q_j)) \xi_{ij} + (1+\alpha_{ij}) q_i q_j = 0, \label{eq: xi_opt_equation} \\
 & \hspace{-0.2cm} \text{where} \quad  \alpha_{ij} = e^{4 J_{ij}} -1, \label{eq:alpha}
\end{align}
%\begin{equation} \label{eq:alpha}
 %\text{where} \, \, \, \alpha_{ij} = e^{4 J_{ij}} -1,
%\end{equation}
has a unique valid (i.e., inside the bounds~\eqref{eq:xi_bounds}) solution 
\begin{align} \label{eq:xi_optimal}
 \begin{split}
   \xiopt(q_i,q_j) = &  \frac{1}{2\alpha_{ij}} \Big( (1 + \alpha_{ij}(q_i + q_j)) \\ & \hspace{-1.6cm} - \sqrt{(1 + \alpha_{ij}(q_i + q_j))^2 - 4 \alpha_{ij}(1+\alpha_{ij}) q_i q_j  } \, \, \Big).
 \end{split}
\end{align}
%that directly depends on $q_i$ and $q_j$.
%The resulting quadratic equation has the unique valid (i.e., inside the bounds~\eqref{eq:xi_bounds}) solution 
%This solution imposes a necessary condition on $\xi_{ij}$ to be located at a stationary point of $\FB$.
This means that for each edge $(i,j)$, the only $\xi_{ij}^{\ast}(q_i,q_j)$, that can be located at a stationary point of $\FB$, depends directly on $q_i$ and $q_j$ and may therefore be inserted in the definition of $\FB$~\eqref{eq:Bethe_reparameterized}. This is advantageous for two reasons: first, it considerably reduces the number of independent variables that are involved in optimizing $\FB$ (i.e., from $\lvert \setofnodes \rvert + \lvert \setofedges \rvert$ to $\lvert \setofnodes \rvert$);
%involved in the optimization problem 
%considerably: instead of minimizing the $(\lvert \setofnodes \rvert + \lvert \setofedges \rvert)$ - dimensional function $\FB(\bm{q}; \bm{\xi})$ over $\polytopeLocal$, we can instead minimize the $\lvert \setofnodes \rvert$ - dimensional function $\FB(\bm{q})$ (with all $\xi_{ij}^{\ast}$ already inserted) over the \emph{Bethe box}
second, it simplifies the shape of the domain, as $\FB$ is now defined on a box-constrained domain, the \emph{Bethe box}
 \begin{align} \label{eq:Bethe_box}
 \begin{split}
\Bethebox = \{\bm{q} \in \mathbb{R}^{\lvert \setofnodes \rvert}: 0 < \; & q_i  < 1, i \in \setofnodes\}.
\end{split}
\end{align}
In this work, we do always refer to the Bethe free energy by $\FB$, be it defined over the local polytope or the Bethe box.


\subsection{RELATED WORK} \label{subsec:related}
\textbf{Belief propagation and the Bethe free energy.} Since the seminal work of~\citet{yedidia2001bethe}, it is well known that fixed points of BP correspond one-to-one to stationary points of the Bethe free energy; moreover, stable fixed points of BP must always be associated to local minima of the Bethe free energy~\citep{heskes2003stable}.\footnote{On the other hand, there may exist minima of the Bethe free energy that are related to unstable BP fixed points~\citep{mooij2005bethe, knoll2017fixed}.}
Consequently, one can try to overcome the convergence issue of BP by minimizing the Bethe free energy directly. 
To solve the problem, \citet{welling2001belief, shin2012complexity} have derived gradient-based algorithms;~\citet{yuille2002CCCP, heskes2006convexity} have proposed provably convergent double-loop algorithms.

\textbf{Variational free energy approximations.} \citet{yedidia2005constructing} have shown that BP is only a special case of a general class of message passing algorithms, the generalized belief propagation (GBP). 
Likewise, fixed points of these algorithms correspond to stationary points of the so-called Kikuchi free energies that try to approximate the true entropy by a sum over entropy contributions from larger node clusters~\citep{kikuchi1951cooperative, pelizzola2005cluster}. 
In practice,  many of these methods can be prohibitively slow and may suffer in the same way as BP from non-convexity of the particular free energy approximation; i.e., they may -- if at all -- converge to suboptimal minima. 
This inspired various researchers to design free energy approximations that are convex~\citep{wainwright2005logpartition, globerson2007b}, some of which are related to convergent message passing algorithms~\citep{kolmogorov2006convergent, globerson2007a, hazan2008convergent, meltzer2009convergent, jancsary2011convergent}.

\textbf{Theoretical work on the Bethe approximation.} The Bethe approximation proves often to be superior to other methods in terms of a tradeoff between efficiency and accuracy~\citep{meshi2009convexifying}. 
Its theoretical properties have therefore been intensely studied:~\citet{heskes2004uniqueness, pakzad2005kikuchi} derived conditions for the convexity of the Bethe free energy.
\citet{chertkov2006loopseries} formulated the so-called loop series expansion that directly relates the Bethe partition function to the true partition function.
Others have found interesting connections between the Bethe approximation and classical graph theory~\citep{watanabe2009graphzeta, vontobel2013graphcovers}. 
Moreover,~\citet{weller2014a} derived an FPTAS
\footnote{Fully polynomial-time approximation scheme.} 
to approximate the Bethe partition function in attractive models.

\textbf{Graphical model approximation.} Another line of research, that is in some sense complementary to variational inference, tries to approximate the graphical model itself. 
The classical Chow-Liu algorithm~\citep{chowliu1968tree} finds a spanning tree such that the Kullback-Leibler (KL) divergence between the original distribution and the induced tree distribution is minimal. 
Furthermore, two different techniques have been applied to reduce the complexity of exact inference in a graphical model: first, the `annihilation' of small probabilities that are below a certain threshold~\citep{jensen1990approximations}; second, the deletion of one or more edges from the model (not necessarily until a spanning tree is reached).
Due to its empirical success, the second method deserves special attention:~\citet{kjaerulff1994reduction} carefully selected edges whose removal decreases the treewidth of a graph.
\citet{vanengelen1997arcremoval} studied how the removal of edges in a directed graph influences the KL divergence.
\citet{choi2006edgedeletion} showed that a particular class of GBP, the so-called join graph propagation~\citep{dechter2002joingraph}, can be equivalently cast in terms of a procedure that consecutively deletes and recovers edges.
In the past, these methods were primarily applied to perform exact inference in the approximated model.
%\footnote{The most commonly used algorithm for performing exact inference in PGMs is the so-called junction tree algorithm~\citep{lauritzen1988junctiontree}.} 
For large graphs, this often remains a hard computational challenge.

\section{THEORETICAL ANALYSIS} \label{sec:theoretical}

We shall now devote our attention to the central topic of this work: how removing edges from a graphical model influences the behavior of BP.
While the accuracy of the exact marginals degrades if one approximates a model by a sparser one~\citep{vanengelen1997arcremoval}, one might expect a similar behavior for the marginals estimated by BP.
%While~\citet{vanengelen1997arcremoval} reported an increase of the marginal error that results from exact inference in approximated models, one may intuitively expect a similar behavior for the BP-estimated marginals; i.e., that the more edges we remove, the worse becomes the approximation accuracy with respect to the exact marginals. 
We show that the opposite is the case: sparsifying the graph often significantly improves the marginal accuracy of BP.
The quality of the estimated partition function, however, tends to degrade by deviating from the original model.

In this section, we explain the second of these phenomena theoretically.
We further analyze the role of an 'optimal' edge to be removed and relate this problem to the Bethe free energy. 
In particular, we prove an inherent relationship between global error measures on the Bethe free energy and the coupling strength of the edges. 
Our detailed analysis of the Bethe free energy on a 'small scale' extends the work of~\citet{welling2001belief, weller2013bethebounds,weller2014understanding} and leads to better understanding of BP and the Bethe approximation in general.

\subsection{PROBLEM SPECIFICATION}
We briefly clarify the problem to be considered. 
Let $(\graph,\bm{\Phi})$ be a PGM and let $(\graph',\bm{\Phi}')$ be a second PGM that is obtained by removing a set of edges $\tilde{\setofedges}$ (and the associated pairwise potentials) from the original model.\footnote{Without loss of generality, we assume that the removal of $\tilde{\setofedges}$ does not make the graph disconnected (otherwise, individual connected components can be treated separately).}
Let $\setofmarg \coloneqq \{P_i: i \in \setofnodes\}$ be the set of exact (singleton) marginals on $(\graph,\bm{\Phi})$ and $\Partition$ be the partition function. 
Assume that we run BP on both models and obtain pseudo-marginals $\tilde{\setofmarg} \coloneqq \{\tilde{P}_i: i \in \setofnodes\}$ on $(\graph,\bm{\Phi})$ resp. $\tilde{\setofmarg}' \coloneqq \{\tilde{P}'_i: i \in \setofnodes\}$ on $(\graph',\bm{\Phi}')$, together with partition function estimates $\tilde{\Partition}$ resp. $\tilde{\Partition}'$.\footnote{Note that $\tilde{P}'_{\setofnodes}$ and $\tilde{Z}'$ are approximations to the exact marginals and partition function in the new model $(\graph',\bm{\Phi}')$.} 
Then we are interested in comparing the following quantities: first, the $l^1$-errors $\lvert \lvert \setofmarg_{\setofnodes} - \tilde{\setofmarg}_{\setofnodes} \rvert \rvert_{l^1}$ and $\lvert \lvert \setofmarg_{\setofnodes} - \tilde{\setofmarg}'_{\setofnodes} \rvert \rvert_{l^1}$; and second, the absolute errors $\lvert \log \Partition - \log\tilde{\Partition} \rvert$ and $\lvert \log \Partition - \log \tilde{\Partition}' \rvert$.

Ideally, we would like to remove a (possibly empty) set of edges, such that the induced errors $\lvert \lvert P_{\setofnodes} - \tilde{P}'_{\setofnodes} \rvert \rvert_{l^1}$ and $\lvert \log \Partition - \log \tilde{\Partition}' \rvert$ become minimal over all subsets $\tilde{\setofedges} \subseteq \setofedges$.
That is, we want to find the model for which BP best approximates the marginals and the partition function of the original model.
Since such a comparison requires access to these exact quantities, finding such an edge set is of course an intractable problem.
Still, it remains a crucial question whether and to what extent the removal of edges has a positive impact on the estimates. 
To identify edges to be deleted, we need to define an objective that contains information about the discrepancy between different graphical models. 
Note that a global comparison via the KL divergence and its generalizations~\citep{minka2005divergence} is prohibitive as this would involve a summation over exponentially many terms. 
Likewise, it is intractable to compare between different representations of the Gibbs free energy~\eqref{eq:Gibbs_free_energy}.
To relax the problem, we focus on the analysis of local discrepancies between two models.
The Bethe free energy~\eqref{eq:Bethe_reparameterized} provides an ideal tool to explicitly measure these local differences.

%that is simple enough to allows for an explicite formula that represents a measure of discrepancy between two different representations of the Bethe free energy.

%\subsection{REFORMULATING THE BETHE APPROXIMATION FOR BINARY VARIABLES}

%In (Section 3 and 4) , we will investigate the properties of the Bethe function over the local polytope $\polytopeLocal$ and the Bethe box $\Bethebox$.
%We therefore formally distinguish between both geometrical objects. \ \\

\subsection{THE BETHE ENERGY DIFFERENCE} \label{subsec:bethe_energy_difference}
Our main idea to make model comparison tractable lies in comparing between two different representations of the Bethe free energy.
We formalize this concept as follows:
assume for now that we remove a single edge $(i,j)$ from a model $(\graph,\bm{\Phi})$ and let $(\graph^{\, \setminus(i,j)},\bm{\Phi}^{\, \setminus(i,j)})$ denote the resulting model.
Let further $\FB$ resp. $\FB^{\, \setminus(i,j)}$ be the representations of the Bethe free energy that are associated with $(\graph,\bm{\Phi})$ resp. $(\graph^{\, \setminus(i,j)},\bm{\Phi}^{\, \setminus(i,j)})$.
Specifically, $\FB^{\, \setminus(i,j)}$ does not contain the pairwise energy and entropy contributions from edge $(i,j)$, while the local entropy contributions from nodes $i$ and $j$ are counted once less than in the definition of $\FB$~\eqref{eq:Bethe_reparameterized}.
Then we define the \emph{Bethe free energy difference} $\FBdiff^{\, (i,j)}$ as the difference between $\FB$ and $\FB^{\, \setminus(i,j)}$, i.e., 
\begin{align} 
 \begin{split} \label{eq:Bethe_energy_difference}
 \FBdiff^{\, (i,j)} & \coloneqq \FB - \FB^{\, \setminus (i,j)} \\ & = \overbrace{-\, (1+  2 \; (2 \, \xi_{ij} - q_i - q_j)) \, J_{ij}}^{\coloneqq \,  \UBdiff^{\, (i,j)}} \\ & \, \, \, \, \, \, \, +  \overbrace{\mathcal{S}_{i} + \mathcal{S}_{j} - \mathcal{S}_{ij}}^{\coloneqq \, \mutinfB^{\, (i,j)}}, %= \\ & = 
 %\UBdiff^{\, (i,j)} + \mutinfB^{\, (i,j)}
 \end{split}
\end{align}
where $\UBdiff^{\, (i,j)}$ is the difference in the Bethe average energy and $\mutinfB^{\, (i,j)}$ is the mutual information between $X_i$ and $X_j$.

Depending on whether we consider $\FB$ on the local polytope $\polytopeLocal$~\eqref{eq:local_polytope_reparameterized} or the Bethe box $\Bethebox$~\eqref{eq:Bethe_box}, $\FBdiff^{\, (i,j)}$ is defined on slices of these objects, that is either on the sliced local polytope
\begin{align} \label{eq:sliced_local_polytope}
 \begin{split}
 \hspace{-0.4cm} \polytopeLocal^{\, (i,j)} & \coloneqq \{(q_i,q_j ; \xi_{ij}) \in \mathbb{R}^{3}: 0 < \, \, q_i,  q_j  < 1; \\ \hspace{-0.2cm} & \max(0, q_i + q_j - 1) < \; \xi_{ij}  < \min(q_i,q_j)\}
 \end{split}
\end{align}
or the sliced Bethe box
\begin{equation} \label{eq:sliced_Bethe_box} 
 \hspace{-1.955cm} \Bethebox^{(i,j)} \coloneqq \{(q_i,q_j) \in \mathbb{R}^{2}: 0 < q_i , q_j  < 1\}.
\end{equation}

It only depends on three resp. two variables and may therefore be considered as a function that contains variational information about local changes in a model when removing an edge. 
Moreover, it entails an effective way of measuring the local discrepancy between two graphical models, e.g., by computing an arbitrary norm of $\FBdiff^{\, (i,j)}$ on $\polytopeLocal^{(i,j)}$ or $\Bethebox^{(i,j)}$.
In this work, we consider $L^p$-norms as the most natural choice and analyze the special cases of $p=\infty$ and $p=2$ in Sec.~\ref{subsec:main_results} (Theorem~\ref{thm:Bethe_infinity_norm}, Corollary~\ref{cor:Bethe_opt_edge_Linf}, and Theorem~\ref{thm:Bethe_opt_edge_L2}).

%To measure the error between different representations of the Bethe free energy, we consider $L^p$ norms as the most natural choice; in light of our discussion, we thus aim to compute the norm of the Bethe energy difference.
%Although we cannot evaluate the involved integrals of the form $\iint\limits_{\Bethebox^{(i,j)}}\, \lvert \FBdiff^{(i,j)} (q_i,q_j) \rvert^p \, d q_i \, d q_j$ analytically, the low dimensionality of the integrand allows for an efficient computation by numerical methods.
%For the important special case that $p = \infty$, we present the following result:

%Note that we can rewrite the Bethe free energy~\eqref{eq:Bethe_reparameterized} more compactly as a sum of an 'empty graph contribution' $\FB^{\, \circ}$ (that only takes node contributions into account) plus contributions from all edges in form of Bethe energy differences:
%\begin{equation} \label{eq:Bethe_rewritten}
 %\FB = \overbrace{\sum_{i=1}^{n} \Big( (1 - 2 q_i) \, \theta_i - \mathcal{S}_{i} \Big)}^{\coloneqq \FB^{\, \circ}} + \sum_{(i,j) \in \setofedges} \FBdiff^{\, (i,j)}.
%\end{equation}
%This formulation will be useful for certain parts of our theoretical analysis in (Section 3, reference) and when we present a generalized form of the Bethe energy difference in (Section 4, reference).
In principle, one can generalize the above idea to compare between models that result from removing multiple edges $\tilde{\setofedges}$ in one step, as the associated Bethe free energy difference $\FBdiff^{\, \tilde{\setofedges}}$ is then simply the sum over energy differences $\FBdiff^{\, (i,j)}$ for all $(i,j)$ in $\tilde{\setofedges}$. 
However, this increases both the number of variables to be integrated over and the number of edge sets to be taken into account for removal.   
To facilitate the theoretical and experimental analysis of edge removal, we shall therefore focus on removing edges one by one.

To make statements about the global effects of removing individual edges on the Bethe free energy and BP, we must carefully analyze the functional behavior of the Bethe free energy difference and its components on a small scale. In the following, we derive a series of auxiliary theorems where we consider the mathematical properties of $\xiopt$ from~\eqref{eq:xi_optimal} (Lemmas~\ref{lm:xi_optimal_center} and~\ref{lm:xi_optimal_limits}), the Bethe mutual information $\mutinfB^{\, (i,j)}$ (Lemma~\ref{lm:mut_inf_properties}), and the Bethe energy difference $\FBdiff^{\, (i,j)}$ (Lemmas~\ref{lm:derivatives_FB},~\ref{lm:stationary}, and~\ref{lm:FBdiff_negative}) on their joint domain, the sliced Bethe box $\Bethebox^{(i,j)}$. These results -- besides being interesting in themselves -- are rather of a technical nature and will help us in proving our main results in Sec.~\ref{subsec:main_results}. All proofs for Secs.~\ref{subsec:bethe_energy_difference} and~\ref{subsec:main_results} are contained in Appendix A.


First, we compute values of $\xiopt$ in the center point of the sliced Bethe box:
%Specifically, we analyze the extremal and limit behavior of $\FBdiff^{\, (i,j)}$ over its domain (Lemma~\ref{lm:xi_optimal_center}) and at the boundary (Lemma bla).
%Furthermore, we characterize regions of negativity of this function.

%First of all, we find a simplified expression of $\xiopt$ from~\eqref{eq:xi_optimal} in the center point $(0.5,0.5)$ of the sliced Bethe box $\Bethebox^{\, (i,j)}$:

\begin{lemma} \label{lm:xi_optimal_center}
Let $(i,j)$ be an edge.
In the center point $(0.5,0.5)$ of the sliced Bethe box $\Bethebox^{(i,j)}$, the unique $\xiopt$ that can be located at a stationary point of $\FB$ has the form
\begin{equation} \label{eq:xiopt_center}
 \xiopt(0.5,0.5) = \frac{\sigma(2 \, J_{ij})}{2}.
\end{equation}
\end{lemma}

We will also have to analyze the behavior of $\xiopt$ if $q_i$ and $q_j$ approach the boundary $\partial \Bethebox^{(i,j)}$\footnote{That is, the four line segments connecting the points $(0,0) - (0,1)$, $(0,0) - (1,0)$, $(0,1) - (1,1)$, and $(1,0) - (1,1)$.} of the sliced Bethe box:

%Next, we compute the limit values of $\xiopt$ when $q_i$ and $q_j$ approach the boundary of the feasible space:

\begin{lemma} \label{lm:xi_optimal_limits}
Let $(i,j)$ be an edge and let $k \in [0,1]$.
The limits of $\xiopt$ at the boundary $\partial \Bethebox^{(i,j)}$ of the sliced Bethe box are
\begin{align}
 \lim_{\substack{q_i \to 0 \\ q_j \to k }} \xiopt(q_i,q_j) & = 0 = \lim_{\substack{q_i \to k \\ q_j \to 0 }} \xiopt(q_i,q_j), \label{eq:xi_limit_0} \\
 \lim_{\substack{q_i \to 1 \\ q_j \to k }} \xiopt(q_i,q_j) & = k = \lim_{\substack{q_i \to k \\ q_j \to 1 }} \xiopt(q_i,q_j). \label{eq:xi_limit_1}
\end{align}
\end{lemma}

Moreover, we shall prepare bounds and compute the boundary limits at $\partial \Bethebox^{(i,j)}$ of the mutual information $\mutinfB^{\, (i,j)}$~\eqref{eq:Bethe_energy_difference}:

\begin{lemma} \label{lm:mut_inf_properties}
Let $(i,j)$ be an edge.
\begin{enumerate}
 \item [(a)] In the interior of the sliced Bethe box $\Bethebox^{(i,j)}$, the mutual information $\mutinfB^{\, (i,j)}$ is bounded by
 \begin{align}
  \begin{split} 
  0 \, \, & < \, \, 8 (\xiopt - q_i q_j)^2 \, \, \\ & < \, \, \mutinfB^{\, (i,j)}(q_i,q_j) \, \, \\ & \leq  \frac{(\xiopt - q_i q_j)^2}{q_i(1-q_i)q_j(1-q_j)} \, .
  \end{split}
 \end{align}
 \item [(b)] The limit of $\mutinfB^{\, (i,j)}$ at the boundary $\partial \Bethebox^{(i,j)}$ is
 \begin{equation}
  \lim_{(q_i,q_j) \to \partial \Bethebox^{(i,j)}} \mutinfB^{\, (i,j)} (q_i,q_j) = 0.
 \end{equation}
\end{enumerate}
\end{lemma}

Next, we compute first-order and second-order derivatives of $\FBdiff^{\, (i,j)}$ on $\Bethebox^{(i,j)}$. The proof utilizes results from~\citet{welling2001belief,weller2013bethebounds} (Appendix B).



\begin{lemma} \label{lm:derivatives_FB}
Let $(i,j)$ be an edge.
\begin{enumerate}
 \item [(a)] The first-order derivatives of $\FBdiff^{\, (i,j)}$ on $\Bethebox^{(i,j)}$ are
 \begin{align} \label{eq:Bethe_diff_first_derivative}
\begin{split}
 \hspace{-1cm} \frac{\partial}{\partial q_{i}} \FBdiff^{\, (i,j)} \! \!
 %\frac{\partial}{\partial q_{i}} (\FB - \FB^{\setminus (i,j)}) = \frac{\partial}{\partial q_{i}} \FB - %\frac{\partial}{\partial q_{i}} \FB^{\setminus (i,j)} = \\ & 
 = 2 J_{ij} \! + \! \log \Big( \frac{(1-q_i) (q_i - \xi_{i j}^{\ast})}{q_i (1 + \xi_{i j}^{\ast} - q_i - q_j)} \Big).
 \end{split}
\end{align}
 \item [(b)] The second-order derivatives of $\FBdiff^{\, (i,j)}$ on $\Bethebox^{(i,j)}$ are
 \begin{align}
 \hspace{-2cm} \frac{\partial^2}{\partial q_i^2} \, \FBdiff^{\, (i,j)} & = \frac{q_j (1-q_j)}{T_{ij}}  -\frac{1}{q_i(1-q_i)}, \label{eq:Hessian_q_i} \\
 \hspace{-2cm} \frac{\partial^2}{\partial q_i \partial q_j} \, \FBdiff^{\, (i,j)} & = \frac{\partial^2}{\partial q_j \partial q_i} \, \FBdiff^{\, (i,j)} = \frac{q_i q_j - \xiopt}{T_{ij}}, \label{eq:Hessian_q_i_q_j} \\
 \hspace{-2cm} \frac{\partial^2}{\partial q_j^2} \, \FBdiff^{\, (i,j)} & = \frac{q_i (1-q_i)}{T_{ij}} - \frac{1}{q_j(1-q_j)}, \label{eq:Hessian_q_j}
\end{align}
where $T_{ij} \coloneqq q_i q_j (1-q_i) (1-q_j) - (\xiopt - q_i q_j)^2$.
\end{enumerate}
\end{lemma}




The following result formulates a useful property of the Bethe free energy difference on the sliced Bethe box:
\begin{lemma} \label{lm:stationary}
 Let $(i,j)$ be an edge. $\FBdiff^{\, (i,j)}$ has precisely one stationary point on $\Bethebox^{(i,j)}$, which is
 %This point is 
 $(\bar{q_i},\bar{q_j}) = (0.5,0.5)$ and is neither a maximum nor a minimum (i.e., a saddle point).

\end{lemma}

Lemma~\ref{lm:stationary} implies that $\FBdiff^{(i,j)}$ can possess neither a maximum nor a minimum in the interior of $\Bethebox^{(i,j)}$.
This implies that the supremum and infimum of $\FBdiff^{(i,j)}$ must lie at the boundary.
Finally, we characterize regions of $\Bethebox^{(i,j)}$ on which $\FBdiff^{\, (i,j)}$ contributes always negatively to the Bethe free energy $\FB$:

\begin{lemma} \label{lm:FBdiff_negative}
Let $(i,j)$ be an edge.
 \begin{enumerate}
  \item [(a)] For an attractive edge, $\FBdiff^{\, (i,j)}(q_i,q_j)$ is negative if either both $q_i, q_j < 0.5$ or both $q_i, q_j > 0.5$.
  \item [(b)] For a repulsive edge, $\FBdiff^{\, (i,j)}(q_i,q_j)$ is negative if either $q_i < 0.5$ and $q_j > 0.5$ or $q_i > 0.5$ and $q_j < 0.5$.
 \end{enumerate}
\end{lemma}

\subsection{MAIN RESULTS} \label{subsec:main_results}

After having prepared the technical framework in Sec.~\ref{subsec:bethe_energy_difference}, we now proceed by presenting our main results.
First, we directly relate the Bethe free energy difference to the local properties of the graphical model (Theorem~\ref{thm:Bethe_infinity_norm}).
Then, we address the problem of a `Bethe-optimal' edge to be deleted (Corollary~\ref{cor:Bethe_opt_edge_Linf} and Theorem~\ref{thm:Bethe_opt_edge_L2}).
Finally, we conclude about the approximation quality of BP regarding the estimated partition function if edges are deleted (Theorems~\ref{thm:error_Bethe_partition} and~\ref{thm:Z_error_unidirectional}).
%Now that we have prepared the technical framework, we can proceed with our theoretical analysis. 
%has been established, we put our analysis on the next level.
%More concretely, we compute stationary points of the Bethe energy difference function and subsequently derive sharp upper and lower bounds on it.
%From this, we can then draw conclusions about the effects of deleting a graph's edges on the Bethe approximation and belief propagation.
%The previous results will prove as useful technical tools when we characterize local and global properties of the Bethe energy difference function.
%This, in turn, will allow us to relate the 'small-scale' behavior of the Bethe function to the performance of belief propagation on the graphical model.

%This property of the Bethe energy difference turns out to be a surprisingly useful tool to establish a direct connection between two substantially distinct quantities: the induced error on the Bethe approximation when changing the structure of a graphical model and the strength of the couplings.

\begin{thm} \label{thm:Bethe_infinity_norm}
Let $(i,j)$ be an arbitrary edge.
Then the $L^{\infty}$-norm of the Bethe free energy difference is
\begin{equation} \label{eq:Bethe_infinity_norm}
 \lvert \lvert \FBdiff^{\, (i,j)} \rvert\rvert_{L^\infty} = \lvert J_{ij} \rvert,
\end{equation}
with
\begin{align}
\hspace{-0.4cm} - \lvert J_{ij} \rvert & = \inf_{\substack{(q_i,q_j) \in \Bethebox^{(i,j)}}} \FBdiff^{\, (i,j)}(q_i,q_j), \label{eq:Bethe_infimum} \\
%\end{equation}
%and
%\begin{equation} 
 \lvert J_{ij} \rvert & = \sup_{\substack{(q_i,q_j) \in \Bethebox^{(i,j)}}} \FBdiff^{\, (i,j)}(q_i,q_j). \label{eq:Bethe_supremum}
\end{align}
The infimum and supremum are not attained by $\FBdiff^{\, (i,j)}$ but exist only as limits at the boundary of $\Bethebox^{(i,j)}$.
In particular, we have for an attractive edge
\begin{align} \label{eq:sup_inf_attractive}
 \begin{split}
  -J_{ij} & =  \lim_{\substack{q_i \to 0 \\ q_j \to 0 }} \FBdiff^{\, (i,j)}(q_i,q_j) =  \lim_{\substack{q_i \to 1 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j), \\
  J_{ij} & = \lim_{\substack{q_i \to 0 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j) =  \lim_{\substack{q_i \to 1 \\ q_j \to 0 }} \FBdiff^{\, (i,j)}(q_i,q_j),
 \end{split}
\end{align}
and, conversely, for a repulsive edge
\begin{align} \label{eq:sup_inf_repulsive}
 \begin{split}
  -J_{ij} & = \lim_{\substack{q_i \to 0 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j) =  \lim_{\substack{q_i \to 1 \\ q_j \to 0 }} \FBdiff^{\, (i,j)}(q_i,q_j), \\
  J_{ij} & = \lim_{\substack{q_i \to 0 \\ q_j \to 0 }} \FBdiff^{\, (i,j)}(q_i,q_j) =  \lim_{\substack{q_i \to 1 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j).
 \end{split}
\end{align}
\end{thm}

%Theorem~\ref{thm:Bethe_infinity_norm} entails a series of interesting consequences. 
Theorem~\ref{thm:Bethe_infinity_norm}
%establishes a direct relationship between the structural properties of the graphical model and a global error measure on the Bethe energy difference.
%More specifically, it 
reveals a monotonic dependence between the strength of the couplings and absolute changes in the Bethe free energy that are caused by local modifications in the graphical structure. 
In terms of edge deletion, this implies that the 'Bethe-optimal' choice of an edge to be removed from the graph is the one with the weakest coupling strength:

\begin{cor} \label{cor:Bethe_opt_edge_Linf}
Suppose we aim to remove an edge from the graphical model such that the induced maximum error in the Bethe free energy is minimal. Then this '$L^{\infty}$-Bethe-optimal' edge is the one with the lowest absolute coupling strength:
\begin{equation}
 \argmin_{(i,j) \in \setofedges} \, \lvert \lvert \FBdiff^{(i,j)} \rvert \rvert_{L^{\infty}} = \, \argmin_{(i,j) \in \setofedges} \, \lvert J_{ij} \rvert
\end{equation}
\end{cor}

An analogous property holds for the $L^2$-error of $\FB$ on $\polytopeLocal$: 
%The proof, however, is only valid for the local polytope $\polytopeLocal$, which is a somewhat weaker property than the one in Theorem~\ref{thm:Bethe_infinity_norm}):

\begin{thm} \label{thm:Bethe_opt_edge_L2}
Suppose we aim to remove an edge from the graphical model such that the induced mean squared error in the Bethe free energy on the local polytope $\polytopeLocal$ is minimal. Then this '$L^2$-Bethe-optimal' edge is the one with the lowest absolute coupling strength:
\begin{equation}
 \argmin_{(i,j) \in \setofedges} \, \lvert \lvert \FBdiff^{(i,j)} \rvert \rvert_{L^{2}} = \, \argmin_{(i,j) \in \setofedges} \, \lvert J_{ij} \rvert
\end{equation}
\end{thm}

Next, we conclude about the quantitative change in the Bethe partition function $\PartitionBethe$ if an edge is removed:
\begin{thm} \label{thm:error_Bethe_partition}
Let $\PartitionBethe$ be the Bethe partition function associated with some graphical model, i.e., the quantity that satisfies $-\log(\PartitionBethe) = \min_{\, \Bethebox} \FB$.
Suppose we remove an (attractive or repulsive) edge from the graph. Let $\FB^{\setminus (i,j)}$ be the representation of the Bethe free energy associated with the new model, together with the new Bethe partition function $\PartitionBethe^{\setminus (i,j)}$ that is implicitly defined by $-\log(\PartitionBethe^{\setminus (i,j)}) = \min_{\, \Bethebox} \FB^{\setminus (i,j)}$.
Then the following error estimate holds:
\begin{equation}
 \bigg\vert \log \Big( \frac{\PartitionBethe}{\PartitionBethe^{\setminus (i,j)}} \Big) \bigg\vert < \lvert J_{ij} \rvert
\end{equation}
\end{thm}

%Third, we obtain a lower bound on the global Bethe minimum in Ising models:
%\begin{cor}
 %The global minimum of the Bethe function is strictly lower bounded by
 %\begin{equation} \label{eq:Bethe_lower_bound}
  %\min\limits_{\bm{q} \in \Bethebox} \FB > - \sum_{(i,j) \in \setofedges} \lvert J_{ij} \rvert - \sum_{i=1}^{n} \, \lvert \theta_i \rvert \, - n \log(2).
 %\end{equation}
%\end{cor}
%The above bound can indeed never be saturated and will be rather loose in general. 
%For certain types of models, however, it can be significantly improved: 
%\textcolor{red}{Example: unidirectional, attractive}.

Finally, we conclude about the quality of the estimated partition function if edges are removed.
%(Theorem \ref{thm:Z_error_unidirectional}) and the marginals (Theorem \ref{thm:marg_error_homogenous}) if edges are removed or the associated potentials are altered. 
We consider unidirectional models (Sec.~\ref{subsec:pgms}) that allow for a precise statement:

\begin{thm} \label{thm:Z_error_unidirectional}
 Consider a unidirectional model, i.e., where all edges are attractive and all variables are biased towards the same state. Let $\Partition$ resp. $\PartitionBethe$ be the associated partition resp. Bethe partition function. Suppose we remove an arbitrary edge from the graph and let $\PartitionBethe^{\setminus (i,j)}$ be the Bethe partition function associated with the new model. Then the quality of the estimated partition function degrades, i.e.,
 \begin{equation} \label{eq:Z_minus_ZB}
  \lvert \Partition - \PartitionBethe\rvert \, < \, \lvert \Partition - \PartitionBethe^{\setminus (i,j)} \rvert. 
 \end{equation}

\end{thm}

Theorem~\ref{thm:Z_error_unidirectional} does not formally extend to models that contain both positive and negative local fields.
Generally, however, the error between the true and the BP-estimated partition function tends to increase, the more edges we remove. 

This negative result is contrasted by the positive effect of edge removal on the estimated marginals. While existing theoretical bounds on the marginal errors are often loose and typically hard to compute~\citep{wainwright2003tree, tagamase2006bounds, ihler2007accuracy, mooij2008bounds}, %it is difficult even for restricted model classes to provide any theoretical guarantees
we validate and explain our statement in Sec.~\ref{sec:experiments}.

%Finally, we prove an interesting theorem that -- for a special case -- formulates a theoretical explanation on the beneficial effects of edge removal on the performance of BP:

%\begin{thm} \label{thm:marg_error_homogenous}
 %For an attractive model with vanishing local potentials (i.e., $\theta_i = 0$), removing edges can only improve the marginal accuracy that is induced by BP fixed points.
%\end{thm}
%The above Theorem is of course not valid in the general case. 
%Moreover, we will observe in Sec.~\ref{sec:experiments} that the marginal error tends to decrease for many models if we remove edges from the graph.
%Recall that the sliced Bethe box is Bisomorphic to a submanifold of $\polytopeLocal^{(i,j)}$ that consists of all points $(q_i,q_j,\xiopt)$ with $\xiopt(q_i,q_j)$ from~\eqref{eq:xi_optimal} being the unique positive number that satisfies the condition $\frac{\partial}{\partial \xi_{ij}} \FB \stackrel{!}{=} 0$.
%Therefore, every stationary point of $\FBdiff$ on $\polytopeLocal^{(i,j)}$ must also be a stationary point of $\FBdiff$ on $\Bethebox^{(i,j)}$ (as usual, with $\xiopt$ already inserted into the definition $\FBdiff$).

%\section{A GENERALIZED BETHE ENERGY DIFFERENCE}


\section{EXPERIMENTS} \label{sec:experiments}
We now demonstrate empirically how removing edges can have an astonishingly positive impact on the approximation accuracy of BP. We perform a range of experiments on a fully connected graph on $10$ vertices.\footnote{This allows for a computation of the exact marginals via the junction tree algorithm~\citep{lauritzen1988junctiontree} and enables us to compare the approximated marginals to them.} Further experiments including a $5 \times 5$ grid graph are contained in Appendix C.

We consider both attractive and general models (Sec.~\ref{subsec:pgms}). In Sec.~\ref{subsec:attractive_models}, we focus on attractive models and sample $J_{ij}$ uniformly from $[0,\hat{J}]$ for $\hat{J} \in \{0.1, 0.2, \dots, 2\}$. In Sec.~\ref{subsec:general_models}, we focus on general models and sample $J_{ij}$ uniformly from $[-\hat{J},\hat{J}]$ for $\hat{J} \in \{0.1, 0.2, \dots, 2\}$. For both settings, we create two scenarios: first, models with weak local fields (each $\theta_i$ is sampled uniformly from $[-0.2,0.2]$); second, models with strong local fields (each $\theta_i$ is sampled uniformly from $[-0.5,0.5]$). For each configuration, we create 200 models.

For each individual model, we remove edges one by one until we reach a spanning tree. 
We do not remove edges whose deletion makes the graph disconnected.\footnote{This procedure corresponds to the so-called reverse-delete algorithm~\citep{kruskal1956spanning} that constructs a maximum spanning tree with respect to a given criterion.}
We compare two criteria for selecting the next edge to be removed: first,
%we remove edges based on 
the Bethe-optimal criterion (Corollary~\ref{cor:Bethe_opt_edge_Linf}, Theorem~\ref{thm:Bethe_opt_edge_L2}); second, we remove edges that induce the lowest mutual information between two connected variables in the original model. 
More precisely: assume that we have already removed edge set $\tilde{\setofedges}$ from a model; then the next edge $(i,j)$ to be removed is the one that minimizes either of the following criteria:
\begin{align*}
 \texttt{BETHE-OPT}: & \quad \argmin\limits_{(i,j) \in \setofedges \setminus \tilde{\setofedges}} \lvert J_{ij} \rvert  \\
 \texttt{CHOW-LIU}: & \quad \argmin\limits_{(i,j) \in \setofedges \setminus \tilde{\setofedges}} I(X_i; X_j) 
\end{align*}
Note that by applying the second criterion, we end up in a Chow-Liu tree~\citep{chowliu1968tree}, i.e., the spanning tree with the lowest KL divergence from the original model.\footnote{We cannot generally apply the second criterion, as the computation of the mutual information between two variables requires knowledge of the related exact singleton and pairwise marginals.}

For each intermediate model during the edge deletion process, we run BP 100 times with random message initialization to approximate the marginals. For each run, we perform at most 1000 iterations. If BP has not converged, we estimate the marginals from the final iteration. We utilize a randomized message scheduling to achieve better convergence~\citep{elidan2006residual}. For the error evaluation, we compute the $l^1$-distance between the exact and estimated marginals. The results for each model are averaged over the $100$ runs. Finally, the results are averaged over all $200$ models, each based on a different configuration of the potentials. 

\subsection{ATTRACTIVE MODELS} \label{subsec:attractive_models}
For weak couplings,
%small values of the pairwise couplings, 
BP finds accurate marginal estimates in the original model. 
If the strength of the couplings increases, this favorable property suddenly disappears at some critical threshold and BP fails to approximate the marginals for larger values of $\hat{J}$ (Fig.~\ref{fig:completegraph10_attractive}).
This behavior is not due to worse convergence properties of BP, but results from inaccurate BP fixed points and thus inaccurate minima of the Bethe free energy~\citep{weller2014understanding}. 
While the Bethe free energy is convex for weaker couplings and possesses a unique global minimum, this minimum becomes an (unstable) saddle point if $\hat{J}$ increases and cannot be reached by BP any longer~\citep{heskes2003stable, mooij2005bethe, knoll2017stability}.
For even larger couplings, the landscape of the Bethe free energy becomes increasingly complex and the (possibly many\footnote{The Bethe free energy may theoretically possess exponentially many local minima~\citep{watanabe2009graphzeta, knoll2019accurate}.}) Bethe minima approach the boundary of the domain, thus moving away from the exact marginals.
%The higher the couplings increase, the more complex becomes the landscape of the Bethe free energy, and the closer the (possibly many\footnote{The Bethe free energy may theoretically possess exponentially many local minima~\citep{watanabe2009graphzeta, knoll2019accurate}.}) Bethe minima approach the boundary of the domain, thus moving away from the exact marginals.

If we remove edges from the graph, the marginal accuracy of BP in the new model is often much better than in the original model. 
This can be explained by a `reconvexification' of the Bethe free energy that turns unstable saddle points or maxima back into stable minima and allows BP to converge to accurate fixed points. 
The question of how many edges we should actually remove is a difficult one. 
In Fig.~\ref{fig:completegraph10_attractive}, we observe that there appears to be a `channel' that defines an optimal number of edges to be removed.
The stronger the couplings become, the more preferable it is to rely on tree approximations, while BP outperforms the edge removal techniques for regimes with lower coupling strength (Fig.~\ref{fig:completegraph10_attractive_treeslice}).
For stronger local potentials, the channel becomes narrower and edge removal loses some of its benefit (although the results are mostly superior in comparison to the original model).
Also, in Fig.~\ref{fig:completegraph10_attractive} we observe that the $\texttt{BETHE-OPT}$ criterion performs slightly better than the $\texttt{CHOW-LIU}$ criterion, with an increasing advantage for stronger local potentials.

\subsection{GENERAL MODELS} \label{subsec:general_models}
The situation for general models is similar to that in the attractive case. We can observe certain differences though (Fig.~\ref{fig:completegraph10_general}):
%There are a few differences though, as we can observe in Fig.~\ref{fig:completegraph10_general}:
first, the critical threshold of the couplings, beyond which the Bethe free energy becomes non-convex, is higher than for attractive models. 
Second, for models with strong local potentials, edge removal based on the $\texttt{BETHE-OPT}$ criterion improves the marginal accuracy only slightly.
%edge removal based on the $\texttt{BETHE-OPT}$ criterion does only provide a slight advantage if the local potentials become stronger.
Interestingly, the Chow-Liu tree induces strikingly accurate Bethe minima for all models (Fig.~\ref{fig:completegraph10_general_treeslice}).
As in the attractive case, we observe that the problem becomes more difficult if both the pairwise and the local potentials become stronger at the same time.

\section{CONCLUSION} \label{sec:conclusion}

We have proposed to approximate a graphical model as a `preprocessing step' for approximate inference. We focused on the removal of single edges and showed that this can have a beneficial impact on the behavior of belief propagation.

We have exploited the relationship between belief propagation and the Bethe free energy
%and provided a detailed theoretical analysis 
to explain the success of such an approach. 
Subsequently, we have validated our findings in an experimental study. 
Most importantly, our analysis contributes to an improved understanding of belief propagation and the Bethe approximation in general.

We are convinced that our observations will inspire the development of further sophisticated methods that try to approximate a graphical model and improve the behavior of message passing algorithms. We believe that one logical extension lies in the modification of the local potentials to compensate for the lost information caused by edge removal.

\begin{figure}[t]
\centering \hspace*{-5pt}
%\subfloat[][]{\label{attractive_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_attractive_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_attractive_theta_m02_p02_weakest_edge} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_attractive_theta_m02_p02_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_attractive_theta_m02_p02_Chow_Liu} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_attractive_theta_m02_p02_mutual_information_l1_error_sing_all_runs}}
%\caption{Attractive models, $\theta_i \in [-0.2,0.2]$. (a) $\texttt{BETHE-OPT}$ criterion vs. (b) $\texttt{CHOW LIU}$ criterion}
%\label{attractive_theta_m02_p02}
%\end{figure}
\vspace{-0.2cm}
%\begin{figure}[h]
%\centering \hspace*{-5pt}
%\subfloat[][]{\label{attractive_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_attractive_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_attractive_theta_m05_p05_weakest_edge} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_attractive_theta_m05_p05_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_attractive_theta_m05_p05_Chow_Liu} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_attractive_theta_m05_p05_mutual_information_l1_error_sing_all_runs}}
\caption{Attractive models. First row: $\theta_i \in [-0.2,0.2]$; second row: $\theta_i \in [-0.5,0.5]$. (a) + (c): $\texttt{BETHE-OPT}$ criterion; (b) + (d): $\texttt{CHOW-LIU}$ criterion.}
\vspace{-0.25cm}
\label{fig:completegraph10_attractive}
\end{figure}

\begin{figure}[t]
\centering
%\subfloat[][]{\label{general_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_general_theta_m02_p02_random_l1_error_sing_all_runs}}
\includegraphics[width=1\linewidth]{figures_UAI2022/sliceplot_completegraph10_attractive_l1_error_sing_marg_tree_slice}
\caption{Attractive models. Tree approximations with respect to BETHE-OPT (blue) and CHOW-LIU (green) vs. BP in the original model (red). Left-hand side: $\theta_i \in [-0.2,0.2]$; right-hand side: $\theta_i \in [-0.5,0.5]$.}
\label{fig:completegraph10_attractive_treeslice}
\end{figure}

\begin{figure}[t]
\centering \hspace*{-5pt}
%\subfloat[][]{\label{general_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_general_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_general_theta_m02_p02_weakest_edge} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_general_theta_m02_p02_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_general_theta_m02_p02_Chow_Liu} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_general_theta_m02_p02_mutual_information_l1_error_sing_all_runs}}
%\caption{General models, $\theta_i \in [-0.2,0.2]$. (a) $\texttt{BETHE-OPT}$ criterion vs. (b) $\texttt{CHOW LIU}$ criterion}
%\label{general_theta_m02_p02}
%\end{figure}
\vspace{-0.2cm}
%\begin{figure}[h]
%\centering \hspace*{-5pt}
%\subfloat[][]{\label{general_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_general_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_general_theta_m05_p05_weakest_edge} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_general_theta_m05_p05_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{completegraph10_general_theta_m05_p05_Chow_Liu} \includegraphics[width=0.5\linewidth]{figures_UAI2022/heatmap_completegraph10_general_theta_m05_p05_mutual_information_l1_error_sing_all_runs}}
\caption{General models. First row: $\theta_i \in [-0.2,0.2]$; second row: $\theta_i \in [-0.5,0.5]$. (a) + (c): $\texttt{BETHE-OPT}$ criterion; (b) + (d): $\texttt{CHOW-LIU}$ criterion.}
\vspace{-0.25cm}
\label{fig:completegraph10_general}
\end{figure}

\begin{figure}[t]
\centering
%\subfloat[][]{\label{general_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_general_theta_m02_p02_random_l1_error_sing_all_runs}}
\includegraphics[width=1\linewidth]{figures_UAI2022/sliceplot_completegraph10_general_l1_error_sing_marg_tree_slice}
\caption{General models. Tree approximations with respect to BETHE-OPT (blue) and CHOW-LIU (green) vs. BP in the original model (red). Left-hand side: $\theta_i \in [-0.2,0.2]$; right-hand side: $\theta_i \in [-0.5,0.5]$.}
\label{fig:completegraph10_general_treeslice}
\end{figure}

%\section{Back Matter}
%There are a some final, special sections that come at the back of the paper, in the following order:
%\begin{itemize}
  %\item Author Contributions
  %\item Acknowledgements
  %\item References
%\end{itemize}
%They all use an unnumbered \verb|\subsubsection|.

%For the first two special environments are provided.
%(These sections are automatically removed for the anonymous submission version of your paper.)
%The third is the ‘References’ section.
%(See below.)

%(This ‘Back Matter’ section itself should not be included in your paper.)

%\begin{contributions} % will be removed in pdf for initial submission,
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    %Briefly list author contributions.
    %This is a nice way of making clear who did what and to give proper credit.

    %H.~Q.~Bovik conceived the idea and wrote the paper.
    %Coauthor One created the code.
    %Coauthor Two created the figures.
%\end{contributions}

\begin{acknowledgements}
This work was supported by the Graz University of Technology LEAD project ``Dependable Internet of Things in Adverse Environments''. We further acknowledge support from the Wireless Lab, Huawei Technologies Sweden.
\end{acknowledgements}

\newpage

\def\bibfont{\small}
\bibliography{uai2022-template}

\appendix
% NOTE: necessary when ptmx or no mathfont class option is given


\end{document}
