% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz,ifthen} % nice language for creating drawings and diagrams
\usepackage{amsmath,amssymb,amsthm, eufrak}
\usepackage{bm}
\usepackage{cleveref}
\usepackage[mathcal]{eucal}
\usepackage{subfig}
\usepackage{graphicx}
\usepackage{xr}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Template example: prints #3, then the optional separator #1 (default "-"),
% then #2, i.e. the two mandatory arguments in swapped order.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
%% ------------- Maths -----------------
% Math operators: blackboard-bold E, "argmin" (starred form, so sub-/superscripts
% are placed as limits in display style), and "sech".
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator{\sech}{sech}
%--- Numbers ---
% Blackboard-bold number sets with sub-/superscript decorations.
\newcommand{\Int}{\mathbb{Z}}
\newcommand{\IntPositive}{\mathbb{Z}_+^*}
\newcommand{\Real}{\mathbb{R}}
\newcommand{\RealPositive}{\mathbb{R}_+}
% NOTE(review): the macro name is misspelled ("Nonnenative"); it is left unchanged
% because other files may already use it.
\newcommand{\RealNonnenative}{\mathbb{R}_{\geq 0}}
% Calligraphic O.
\newcommand{\bigO}{\mathcal{O}}

%-- Marginals
% Naming scheme used below: "Approx" variants carry a tilde, "Short" variants
% drop the subscript on P, "PW" takes two indices, "Fp" adds a \circ superscript.
% \marginals{i}: P_i without an argument list.
\newcommand{\marginals}[1]{P_{#1}}
% \marginalsApprox{i}: \tilde{P}_i(x_i); the extra brace pair keeps it one math atom.
\newcommand{\marginalsApprox}[1]{{\tilde{P}_{#1}(x_{#1})}}
% \marginalsApproxPW{i}{j}: \tilde{P}_{ij}(x_i,x_j).
\newcommand{\marginalsApproxPW}[2]{\tilde{P}_{#1 #2}({x_{#1},x_{#2}})}
% \marginalsSingle{i}: P_i(x_i).
\newcommand{\marginalsSingle}[1]{P_{#1}(x_{#1})}
% Index-free forms: P(x_i), \tilde{P}(x_i) and \tilde{P}^{\circ}(x_i).
\newcommand{\marginalsShort}[1]{P(x_{#1})}
\newcommand{\marginalsShortApprox}[1]{\tilde{P}(x_{#1})}
\newcommand{\marginalsShortApproxFp}[1]{\tilde{P}^{\circ}(x_{#1})}
% Calligraphic P.
\newcommand{\setofmarg}{\mathcal{P}}


%% -------- Graphical Model -----------------
%--- Graph ---
% Calligraphic G.
\newcommand{\graph}{\mathcal{G}}
% Bold X; note that \Joint below expands to the same symbol.
\newcommand{\setofnodes}{\mathbf{X}}
% |X| with proper left/right vertical delimiters.
\newcommand{\lenSetOfNodes}{\lvert \mathbf{X} \rvert}
% Bold E.
\newcommand{\setofedges}{\mathbf{E}}
% Calligraphic X.
\newcommand{\alphabet}{\mathcal{X}}
% \RV{i}: X_i.
\newcommand{\RV}[1]{X_{#1}}
\newcommand{\Joint}{\mathbf{X}}
% \edge{i}{j}: the pair (i,j).
\newcommand{\edge}[2]{(#1,#2)}			% Undirected Edge
% Two notations with an argument in parentheses: calligraphic \mathcal{N}(i)
% (\nbh) and italic N(i) (\neighbors).
\newcommand{\nbh}[1]{\mathcal{N}(#1)}
\newcommand{\neighbors}[1]{N({#1})}
% \degree{i}: d_i.
\newcommand{\degree}[1]{d_{#1}}
%---PGM ---
% \PGM[<sub>]: calligraphic U with an optional subscript (empty by default).
\newcommand{\PGM}[1][]{\mathcal{U}_{#1}}
% Potentials over (x_i,x_j): \potPW is indexed by both arguments, \pairwiseShort
% has no index, \pairwiseSBP takes its index from a third argument.
\newcommand{\potPW}[2]{\Phi_{#1 #2}({x_{#1},x_{#2}})}
\newcommand{\pairwiseShort}[2]{\Phi(x_{#1},x_{#2})}
\newcommand{\pairwiseSBP}[3]{\Phi_{#3}(x_{#1},x_{#2})}
% Potentials over a single x_i, with the same three indexing variants.
\newcommand{\potLocal}[1]{\Phi_{#1}(x_{#1})}
\newcommand{\localShort}[1]{\Phi(x_{#1})}
\newcommand{\localSBP}[2]{\Phi_{#2}(x_{#1})}

% Capital Psi (the leading space in the body is ignored in math mode).
\newcommand{\setOfPot}{ \Psi}
% \coupling{i}{j}: J_{ij};  \field{i}: \theta_i;  \mean{i}: m_i.
\newcommand{\coupling}[2]{J_{#1#2}}
\newcommand{\field}[1]{\theta_{#1}}
\newcommand{\mean}[1]{m_{#1}}
% m_i with a superscript marker: \circ, \oplus, * or a caller-supplied #2.
\newcommand{\meanMinLocal}[1]{m_{#1}^{\circ}}
\newcommand{\meanMinLocalNeg}[1]{{m}_{#1}^{\oplus}}
\newcommand{\meanMinGlobal}[1]{m_{#1}^{*}}
\newcommand{\meanMinLocalSpecific}[2]{m_{#1}^{#2}}
% \correlation{i}{j}: \chi_{ij}; the "Specific" variant adds superscript #3.
\newcommand{\correlation}[2]{\chi_{#1#2}}
\newcommand{\correlationMinLocalSpecific}[3]{\chi_{#1#2}^{#3}}

%% -------- Belief Propagation -----------------
% Upright text labels "BP" and "BP" with an upright subscript "D".
\newcommand{\BP}{\text{BP}}
\newcommand{\BPD}{\text{BP}_{\text{D}}}
% \msg[<n>]{i}{j}{<val>}: \mu_{ij} with superscript (<n>). If <val> is empty the
% argument list is (x_j), otherwise (X_j = <val>). The emptiness test uses
% \ifthenelse/\equal from the ifthen package loaded above.
\newcommand{\msg}[4][]{\ifthenelse{\equal{#4}{}} {\mu^{(#1)}_{#2 #3}(x_{#3})} {\mu^{(#1)}_{#2 #3}(\RV{#3}=#4)}}
% Like \msg, but the superscript is set without parentheses and a non-empty
% <val> is printed as-is inside the argument list.
\newcommand{\msgShort}[4][]{\ifthenelse{\equal{#4}{}} {\mu^{#1}_{#2 #3}(x_{#3})} {\mu^{#1}_{#2 #3}(#4)}}
% \setOfMsg[<n>]: bold \mu with superscript (<n>).
\newcommand{\setOfMsg}[1][]{\bm{\mu}^{(#1)}}
% \fpMsg{i}{j}{<arg>}: \mu^{\circ}_{ij}(x_j, <arg>).
\newcommand{\fpMsg}[3]{\mu^{\circ}_{#1 #2}(x_{#2}, #3)}
% \msgRatio[<opt>]{i}{j}: \bar{\mu}_{ij}; the optional argument is accepted but
% does not appear in the output.
\newcommand{\msgRatio}[3][]{\bar{\mu}_{#2 #3}}



%% -------- Evaluation Metrics  -----------
% NOTE(review): \stateSpace is not defined in the visible part of this preamble;
% \spin will raise "Undefined control sequence" when used unless the class or
% another file defines it -- confirm.
\newcommand{\spin}{\stateSpace}
% \magnetization[<sub>]: \langle m_<sub> \rangle (subscript empty by default).
\newcommand{\magnetization}[1][]{\langle m_{#1} \rangle}
% Upright label "MSE"; the optional argument is accepted but not used in the output.
\newcommand{\mse}[1][]{\text{MSE}}
% Upright label "MSE" with an italic subscript B.
\newcommand{\mseb}{\text{MSE}_{{B}}}

%--- Cost-Functions ---
% Calligraphic base symbols F, U and S.
\newcommand{\FG}{\mathcal{F}}
\newcommand{\UG}{\mathcal{U}}
\newcommand{\SG}{\mathcal{S}}
% \Delta followed by calligraphic U with subscript B.
\newcommand{\UBdiff}{\Delta \mathcal{U}_B}
% Calligraphic F with subscript B; the subscript stays in the normal math font.
\newcommand{\FB}{\mathcal{F}_B}
% Disabled earlier variant of \FB that switched on an optional argument
% (kept commented out).
%\newcommand{\FB}[1][]
%{\ifthenelse{\equal{#1}{}}
%	{\mathcal{F}_B} {\mathcal{F}_{B,all}}}
% \mathcal{F}_B decorated with a superscript marker (*, \circ, \oplus, or a
% caller-supplied #1). All four variants share the form \mathcal{F}_B^{<marker>},
% so the sub- and superscript are both attached to the calligraphic F and the
% subscript B stays in the normal math font.
% \FBMinLocalNeg previously wrapped "{F}_B" inside \mathcal{...}, which set the
% B in calligraphic and placed the superscript after the whole group.
\newcommand{\FBMinGlobal}{\mathcal{F}_B^*}
\newcommand{\FBMinLocal}{\mathcal{F}_B^{\circ}}
\newcommand{\FBMinLocalNeg}{\mathcal{F}_B^{\oplus}}
\newcommand{\FBMinLocalSpecific}[1]{\mathcal{F}_B^{#1}}
% \Delta followed by \mathcal{F}_B.
\newcommand{\FBdiff}{\Delta \mathcal{F}_B}
% Calligraphic F with subscript "MF", and its \Delta-prefixed variant.
\newcommand{\FMF}{\mathcal{F}_{MF}}
\newcommand{\FMFdiff}{\Delta \mathcal{F}_{MF}}
% Italic I with subscript B.
\newcommand{\mutinfB}{I_B}
% Calligraphic U and S with subscript B.
\newcommand{\UB}{\mathcal{U}_{{B}}}
\newcommand{\SB}{\mathcal{S}_{{B}}}
% NOTE(review): in the next three macros the subscript sits inside the
% \mathcal{...} argument, so it is set in the calligraphic font as well, unlike
% \mathcal{F}_B used by \FB. Confirm whether this is intended.
\newcommand{\FBApprox}{\mathcal{\tilde{F}_B}}
\newcommand{\PartitionBethe}{\mathcal{Z_B}}
\newcommand{\PartitionMF}{\mathcal{Z_{MF}}}
%\newcommand{\PartitionGibbs}{\mathcal{Z}}
% Calligraphic Z.
\newcommand{\Partition}{\mathcal{Z}}
%\newcommand{\PartitionApprox}{\mathcal{{Z}_B}}
% Blackboard-bold L, M and B.
\newcommand{\polytopeLocal}{\mathbb{L}}
\newcommand{\polytopeMarginal}{\mathbb{M}}
\newcommand{\Bethebox}{\mathbb{B}}
% \xi_{ij}^{\ast}; the indices i, j are hard-coded in the macro body.
\newcommand{\xiopt}{\xi_{ij}^{\ast}}


%% ---------- Fixed Point Indexing --------------------
% Upright text label "init".
\newcommand{\init}{\text{init}} %initial messages
% \iteration{n}: prints its argument unchanged, wrapped in a brace group.
\newcommand{\iteration}[1]{{#1}}
% Single-letter index symbols; \stableFP and \stableOne both expand to "s".
\newcommand{\fpI}{k}
\newcommand{\stableFP}{s}
\newcommand{\stableOne}{s}
\newcommand{\stableTwo}{t}
% The literal text "(m)", including the parentheses.
\newcommand{\minFP}{(m)}
% NOTE(review): \vm is not defined in the visible part of this preamble; the two
% macros below will raise "Undefined control sequence" when used unless the
% class or another file defines \vm -- confirm (possibly \bm was intended).
\newcommand{\setOfStable}{\vm{S}}
\newcommand{\setOfMin}{\vm{M}}

%% ----------- Theorem Environments ---------------------
% Theorem-like environments. Each one is declared without a shared-counter
% argument, so every environment is numbered by its own counter; the per-section
% numbering option is commented out.
\newtheorem{claim}{Claim}%[section]
\newtheorem{thm}{Theorem}%[section]
\newtheorem{lemma}{Lemma}%[section]
\newtheorem{con}{Conjecture}
\newtheorem{cor}{Corollary}
% Note: "prop" prints the heading "Properties"; "propos" prints "Proposition".
\newtheorem{prop}{Properties}
\newtheorem{propos}{Proposition}

% Environments declared from here on use the amsthm "definition" style.
\theoremstyle{definition}
% Starred form: "ex" is unnumbered.
\newtheorem*{ex}{Example}

\externaldocument{leisenberger_186}

\title{Fixing the Bethe Approximation: \\ How Structural Modifications in a Graph Improve Belief Propagation \\ -- Supplementary Material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:Harald Leisenberger <harald.leisenberger@tugraz.at>?Subject=Your UAI 2022 paper}{Harald~Leisenberger}{}}
\author[1]{Franz Pernkopf}
\author[1]{Christian Knoll}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Signal Processing and Speech Communication Laboratory\\
	Graz University of Technology\\
	Austria
}
%\affil[2]{%
    %Second Affiliation\\
    %Address\\
    %…
%}
%\affil[3]{%
    %Another Affiliation\\
    %Address\\
    %…
  %}
  
\begin{document}
\onecolumn
\maketitle

\section*{APPENDIX}

\subsection*{APPENDIX A -- PROOFS}

\begin{proof} [\textbf{PROOF OF LEMMA~\ref{lm:xi_optimal_center}:}] \leavevmode

 Inserting $q_i = 0.5$ and $q_j = 0.5$ in~\eqref{eq:xi_optimal} yields
 \begin{align*}
  \xiopt(0.5,0.5) & = \frac{1}{2 \alpha_{ij}} \Big( (1+\alpha_{ij}) - \sqrt{(1+\alpha_{ij})^2  - \alpha_{ij} (1+\alpha_{ij})} \, \, \Big) \\ 
  & = \frac{1}{2 \alpha_{ij}} \Big( (1+\alpha_{ij}) - \sqrt{(1+\alpha_{ij})} \, \, \Big) \\
  & \hspace{-0.1cm} \stackrel{\eqref{eq:alpha}}{=} \frac{1}{2 \big( e^{4 J_{ij}} - 1 \big)} \big( e^{4 J_{ij}} - \sqrt{e^{4 J_{ij}}} \, \big) \\
  & = \frac{1}{2} \Big( \frac{e^{2 J_{ij}}-1}{e^{2 J_{ij}} - e^{-2 J_{ij}}} \Big) \\
  & = \frac{1}{4} \big( \tanh(J_{ij}) + 1 \big) \\
  & = \frac{\sigma(2 \, J_{ij})}{2}.
 \end{align*}
\end{proof}

\begin{proof} [\textbf{PROOF OF LEMMA~\ref{lm:xi_optimal_limits}:}] \leavevmode

 Note first that one can interchange the role of $q_i$ and $q_j$ in the definition~\eqref{eq:xi_optimal} of $\xiopt$ due to symmetry.
 It is therefore sufficient to consider only the left hand side equalities in~\eqref{eq:xi_limit_0} and~\eqref{eq:xi_limit_1}.
 Let first $q_i \to 0$ and $q_j \to k$.
 From~\eqref{eq:xi_bounds} we know that $\xiopt$ is bounded from above by $\min(q_i,q_j)$. As also $\xiopt > 0$, the first equality in~\eqref{eq:xi_limit_0} follows by continuity.
 Let now $q_i \to 1$ and $q_j \to k$. Then the limit of both the lower bound and the upper bound in~\eqref{eq:xi_bounds} equals $k$.
 Consequently, $\xiopt$ must tend to $k$ as well, which yields the first equality in~\eqref{eq:xi_limit_1}.
\end{proof}

\begin{proof} [\textbf{PROOF OF LEMMA~\ref{lm:mut_inf_properties}:}] \leavevmode

 \begin{enumerate}
 \item [(a)]
 We apply Lemma~\ref{lm:Dragomir_bounds} of~\citet{dragomir2000relentropy} to the special case of binary random variables.
 For the upper bound, we substitute the matching combinations of singleton and pairwise probabilities from table~\eqref{tab:prob_table} into the right-hand formula from~\eqref{eq:Dragomir_bounds}:
 \begin{align*}
  \mutinfB^{\, (i,j)}(q_i,q_j) \, \leq \, \,  \frac{(\xiopt)^2}{q_i q_j} + \frac{(q_i - \xiopt)^2 }{q_i (1-q_j)} + \frac{(q_j - \xiopt)^2}{(1-q_i) q_j} + \frac{(1 + \xiopt - q_i - q_j)^2}{(1-q_i)(1 - q_j)} -1,
 \end{align*}
 and after a few simple algebraic manipulations we directly arrive at the desired result.
 Analogously, we derive the lower bound by inserting the corresponding probabilities into the expression to the left of $\mutinfB^{(i,j)}$ in~\eqref{eq:Dragomir_bounds}:
 \begin{align*}
  \mutinfB^{\, (i,j)}(q_i,q_j) \, \geq \, \, \frac{1}{2} \, \big( & \, \lvert \, \xiopt - q_i q_j \, \rvert \, + \lvert \, (1 + \xiopt - q_i - q_j) - (1-q_i) (1-q_j) \, \rvert \, \\ & + \lvert \, (q_i - \xiopt) - q_i (1 - q_j) \, \rvert + \lvert \, (q_j - \xiopt) - (1 - q_i) q_j \, \rvert \, \big)^2
 \end{align*}
 Depending on whether we have an attractive or a repulsive edge, we either make use of result (a) or (b) in Lemma~\ref{lm:Weller_bounds_xi} and get rid of the absolute value symbols, after which -- in both cases -- the last expression simplifies to
 \begin{equation*}
  \frac{1}{2} \,  \big(4 (\xiopt - q_i q_j) \big)^2 = 8 (\xiopt - q_i q_j)^2.
 \end{equation*}
 \item [(b)]
 According to~\eqref{eq:sliced_Bethe_box}, the boundary of the sliced Bethe box $\Bethebox^{(i,j)}$ is the union of four line segments
 \begin{align*}
 \partial \Bethebox^{(i,j)} = & \{(q_i,q_j) \in \mathbb{R}^2 \, \vert \, q_i = 0, 0 \leq q_j \leq 1 \}  \, \, \cup \\ & \{(q_i,q_j) \in \mathbb{R}^2 \, \vert \, q_i = 1, 0 \leq q_j \leq 1 \} \, \, \cup \\ & \{(q_i,q_j) \in \mathbb{R}^2 \, \vert \, 0 \leq q_i \leq 1, q_j = 0 \}  \, \, \cup \\ & \{(q_i,q_j) \in \mathbb{R}^2 \, \vert \, 0 \leq q_i \leq 1, q_j = 1 \}.
 \end{align*}
 Without loss of generality, we focus on the case where $q_i \to 1$ and $q_j \to k$ for some $k \in [0,1]$. The remaining cases can be treated similarly. We further note that it is sufficient to prove the equality
 \begin{equation} \label{eq:independence_at_boundary}
  \lim_{\substack{q_i \to 1 \\ q_j \to k }} \marginalsApproxPW{i}{j} = \lim_{\substack{q_i \to 1 \\ q_j \to k }} \marginalsApprox{i} \marginalsApprox{j}
 \end{equation}
 for all possible realizations $x_i,x_j \in \{+1,-1\}$ of $X_i,X_j$, as this implies statistical independence of $X_i$ and $X_j$ at the boundary and consequently their mutual information must equal zero in the limit. For checking equality~\eqref{eq:independence_at_boundary}, we utilize Lemma~\ref{lm:xi_optimal_limits}. E.g., for $x_i = +1, x_j = -1$ we get
 \begin{align*}
  \lim_{\substack{q_i \to 1 \\ q_j \to k }} \tilde{P}_{ij}(+1,-1) & \stackrel{\eqref{tab:prob_table}}{=} \lim_{\substack{q_i \to 1 \\ q_j \to k }} (q_i - \xiopt(q_i,q_j)) \\ 
  & \stackrel{\eqref{eq:xi_limit_1}}{=} 1 - k \\ & = \lim_{\substack{q_i \to 1 \\ q_j \to k }} q_i (1-q_j) \\ & \stackrel{\eqref{tab:prob_table}}{=} \lim_{\substack{q_i \to 1 \\ q_j \to k }} \tilde{P}_{i}(+1) \tilde{P}_{j}(-1).
 \end{align*}
 Analogous calculations for the three other possible combinations of $x_i,x_j$ validate the statement.
 \end{enumerate}
\end{proof}

\begin{proof} [\textbf{PROOF OF LEMMA~\ref{lm:derivatives_FB}:}] \leavevmode
 \begin{enumerate}
 \item [(a)] We utilize Lemma~\ref{lm:Welling_derivatives_FB} of \citet{welling2001belief}. Then the first-order derivative of $\FB^{\, \setminus (i,j)}$ on $\Bethebox^{\setminus (i,j)}$ is
 \begin{align} 
 \begin{split} \label{eq:Bethe_prime_first_derivative}
  \frac{\partial}{\partial q_{i}} \FB^{\, \setminus (i,j)} & =  -2 \, \theta_i + 2 \hspace{-0.3cm} \sum\limits_{k \in \nbh{i} \setminus j} \hspace{-0.3cm} J_{ik} + \log \Big( \frac{(1-q_i)^{d_i-2}}{q_i^{d_i-2}} \hspace{-0.4cm} \prod\limits_{k \in \nbh{i} \setminus j} \frac{q_i - \xi_{ik}^{\ast} }{1 + \xi_{ik}^{\ast} - q_i - q_k}  \Big).
 \end{split}
\end{align}
 Consequently, we obtain
 %\citet{welling2001belief} and~\citet{shin2012complexity} have derived gradient-based algorithms to minimize $\FB$ over $\Bethebox$.
 %To this end, they have computed the partial derivatives of $\FB$ on $\Bethebox$ as
 %\begin{align}
 %\begin{split} \label{eq:Bethe_first_derivative}
  %\frac{\partial}{\partial q_{i}} \FB = & -2 \, \theta_i + 2 \sum\limits_{j \in \nbh{i}} J_{ij} + \\ & + \log \Big( \frac{(1-q_i)^{d_i-1}} {q_i^{d_i-1}} \prod\limits_{j \in \nbh{i}} \frac{q_i - \xiopt }{1 + \xiopt - q_i - q_j}  \Big).
  %\end{split}
 %\end{align}
 %and -- though by different means -- ensured to stay inside the Bethe box after each gradient step.
 %Due to the potential non-convexity of the Bethe function, the approximation quality in local or even global minima may be suboptimal though. \\
 
 %To continue our analysis, we must take the first-order and second-order derivatives of the Bethe function and its components into account.
%Recall formula~\eqref{eq:Bethe_first_derivative} for the partial derivatives of the Bethe function with respect to $q_i$ and suppose we remove an arbitrary edge $(i,j)$ from the associated graphical model.
%Obviously, the number of neighbors of nodes $i$ and $j$ decreases by one, that is $d_i' = d_i - 1$ and $d_j' = d_j - 1$.
%Let $\FB^{\setminus (i,j)}$ be the Bethe function that is defined on the so modified graph.
%Then the derivative of $\FB^{\setminus (i,j)}$ on the sliced Bethe box $\Bethebox^{\, \setminus (i,j)}$ is
%\begin{align} 
 %\begin{split} \label{eq:Bethe_prime_first_derivative}
  %\hspace{-0.119cm} \frac{\partial}{\partial q_{i}} \FB^{\, (i,j)} = & -2 \, \theta_i + 2 \hspace{-0.3cm} \sum\limits_{k \in \nbh{i} \setminus j} \hspace{-0.3cm} J_{ij} + \\ & + \log \Big( \frac{(1-q_i)^{d_i-2}}{q_i^{d_i-2}} \hspace{-0.4cm} \prod\limits_{k \in \nbh{i} \setminus j} \frac{q_i - \xiopt }{1 + \xiopt - q_i - q_j}  \Big).
 %\end{split}
%\end{align}
%By definition of the Bethe energy difference function as $\FBdiff^{\, (i,j)} = \FB - \FB^{\setminus (i,j)}$, we can now easily compute the partial derivative of $\FBdiff^{\, (i,j)}$ on the sliced Bethe box $\Bethebox^{\, (i,j)}$ with respect to $q_i$ as 
\begin{align} \label{eq:Bethe_diff_first_derivative}
\begin{split}
\frac{\partial}{\partial q_{i}} \FBdiff^{\, (i,j)}
 & \, =  \frac{\partial}{\partial q_{i}} (\FB - \FB^{\setminus (i,j)}) \\ & = \, \frac{\partial}{\partial q_{i}} \FB - \frac{\partial}{\partial q_{i}} \FB^{\setminus (i,j)} \\ 
 & = \, 2 J_{ij} \! + \! \log \Big( \frac{(1-q_i) (q_i - \xi_{i j}^{\ast})}{q_i (1 + \xi_{i j}^{\ast} - q_i - q_j)} \Big).
 \end{split}
\end{align}
 \item [(b)] We can apply Theorem~\ref{thm:Weller_second_derivatives} from~\citet{weller2013bethebounds}. Note, however, that we must also take the second derivatives of the node entropies $\mathcal{S}_i$ and $\mathcal{S}_j$ into account which are computed as
\begin{align}
 \frac{\partial^2}{\partial q_i^2} \, \mathcal{S}_i & = - \, \frac{1}{q_i(1-q_i)} , \\
 \frac{\partial^2}{\partial q_j^2} \, \mathcal{S}_j & = - \, \frac{1}{q_j(1-q_j)} ,
\end{align}
and are zero for the cross derivatives. By definition, we have $\FBdiff^{\, (i,j)} = \UBdiff^{(i,j)} + \mutinfB^{\, (i,j)} = \UBdiff^{(i,j)} + \mathcal{S}_i + \mathcal{S}_j - \mathcal{S}_{ij}$. Note that $f_{ij}$ from Theorem~\ref{thm:Weller_second_derivatives} in~\citet{weller2013bethebounds} corresponds precisely to $\UBdiff^{(i,j)}-\mathcal{S}_{ij}$. If we put these observations together, the statement follows immediately.
 
 %\citet{weller2013bethebounds} have considered second order partial derivatives of the Bethe function and computed all entries of the Hessian matrix $\nabla^2 \FB$.
%As an interim result, they have computed second derivatives of edge specific Bethe terms that correspond to the difference of the energy contribution and the pairwise entropy contribution inherent to an edge, i.e., of $\UBdiff^{\, (i,j)} - \, \mathcal{S}_{ij}$ (Lemma~\ref{thm:Weller_second_derivatives} in the appendix).
%In our definition (reference) of the Bethe energy difference $\FBdiff^{\, (i,j)}$, we must also take the second derivatives of the node entropies $\mathcal{S}_i$ and $\mathcal{S}_j$ into account that are easily computed as
%\begin{align}
 %\frac{\partial^2}{\partial q_i^2} \, \mathcal{S}_i & = - \, \frac{1}{q_i(1-q_i)} , \\
 %\frac{\partial^2}{\partial q_j^2} \, \mathcal{S}_j & = - \, \frac{1}{q_j(1-q_j)} ,
%\end{align}
%and vanish for the cross derivatives.
%Putting these observations together, we compute the second derivatives of $\FBdiff^{\, (i,j)} = \UBdiff^{(i,j)} + \mutinfB^{\, (i,j)} = \UBdiff^{(i,j)} + \mathcal{S}_i + \mathcal{S}_j - \mathcal{S}_{ij}$ as

 \end{enumerate}
\end{proof}


\begin{proof} [\textbf{PROOF OF LEMMA~\ref{lm:stationary}:}] \leavevmode

The proof consists of two parts: in the first part, we show that $(0.5,0.5)$ is the only stationary point of $\FBdiff^{(i,j)}$ on the sliced Bethe box.
In the second part, we show that the Hessian matrix $\nabla^2 \, \FBdiff^{(i,j)}$ evaluated in $(0.5,0.5)$ is indefinite. \\

\underline{Part 1:} Setting the gradient $\nabla \, \FBdiff^{(i,j)}$ with its components given by~\eqref{eq:Bethe_diff_first_derivative} to zero, leads to the following nonlinear equation system:
\begin{align*}
 2 J_{ij} + \log \Big( \frac{q_i - \xiopt - q_i^2 + q_i \xiopt}{q_i + q_i \xiopt - q_i^2 - q_i q_j} \Big) & =  0  \\
 2 J_{ij} + \log \Big( \frac{q_j - \xiopt - q_j^2 + q_j \xiopt}{q_j + q_j \xiopt - q_j^2 - q_i q_j} \Big) & =  0
\end{align*}
which is equivalent to
\begin{align}
	e^{2J_{ij}}(q_i - \xiopt - q_i^2 + q_i \xiopt) & =  q_i + q_i \xiopt - q_i^2 - q_i q_j  \label{eq:eqsystem_first} \\
	e^{2J_{ij}}(q_j - \xiopt - q_j^2 + q_j \xiopt) & =  q_j + q_j \xiopt - q_j^2 - q_i q_j. \label{eq:eqsystem_second}
\end{align}
By subtracting~\eqref{eq:eqsystem_second} from~\eqref{eq:eqsystem_first}, we can reduce the above system to one equation
\begin{align} 
 \begin{split} \label{eq:eqsystem_single}
	& (e^{2J_{ij}} - 1)(q_j + q_i)(q_j - q_i)  + \, (-e^{2J_{ij}} \xiopt - e^{2J_{ij}} + \xiopt + 1)(q_j - q_i) = 0.
 \end{split}
\end{align}
Note that~\eqref{eq:eqsystem_single} might possess additional solutions that do not solve the original system~\eqref{eq:eqsystem_first} +~\eqref{eq:eqsystem_second}; each solution of~\eqref{eq:eqsystem_first} +~\eqref{eq:eqsystem_second}, however, does also solve~\eqref{eq:eqsystem_single}.
Obviously all feasible pairs $(q_i,q_j)$ such that $q_i = q_j$ solve equation~\eqref{eq:eqsystem_single}; which of them also solve~\eqref{eq:eqsystem_first} +~\eqref{eq:eqsystem_second} is examined below.
Now assume that there exists a feasible solution to~\eqref{eq:eqsystem_single} with unequal components.
If we divide~\eqref{eq:eqsystem_single} by $(q_j - q_i)$ and simplify, we end up with the contradictory result
\begin{equation} \label{eq:eqsystem_simplified}
	\underbrace{(1 + \xiopt - q_i - q_j)}_{= \, \tilde{P}_{ij}(X_i = -1, X_j = -1) \, > \, 0}\underbrace{(1 - e^{2J_{ij}})}_{\neq 0} = 0.
\end{equation}

Consequently, the only candidates for stationary points of $\FBdiff^{(i,j)}$ on $\Bethebox^{(i,j)}$ are those with equal components.
This allows us to substitute $q_i$ for $q_j$ in either of the two equations~\eqref{eq:eqsystem_first} +~\eqref{eq:eqsystem_second} and directly solve for $q_i$.
In other words, we must solve
\begin{equation*}
 e^{2J_{ij}} (q_i - \xiopt - q_i^2 + q_i \xiopt) \, - \, (q_i + q_i \xiopt - 2 q_i^2)  = 0 ,
 \end{equation*}
 or equivalently,
\begin{equation*}
 \xiopt = \frac{(1- e^{2J_{ij}})q_i +  (e^{2J_{ij}} - 2)q_i^2}{- e^{2J_{ij}} - (1 - e^{2J_{ij}}) q_i},
\end{equation*}
where we can replace $\xiopt$ by formula~\eqref{eq:xi_optimal} (with $q_j = q_i$):
\begin{align*}
(1 + 2\alpha_{ij}q_i) - \sqrt{1 + 4 \alpha_{ij} q_i (1 - q_i) } = \, \, 2\alpha_{ij} \underbrace{\frac{(1- e^{2J_{ij}})q_i +  (e^{2J_{ij}} - 2)q_i^2}{- e^{2J_{ij}} - (1 - e^{2J_{ij}}) q_i}}_{\coloneqq A}
\end{align*}
After isolating the radical and squaring both sides of the equation we get
\begin{equation*}
(1 + \alpha_{ij}) q_i^2 - (1 + 2 \alpha_{ij} q_i) A + \alpha_{ij} A^2 = 0,
\end{equation*}
which can be further simplified to
\begin{equation*}
\frac{e^{2J_{ij}} (e^{2J_{ij}} - 1) (2 q_i - 1) (q_i - 1)^2}{(q_i + e^{2J_{ij}} - q_i e^{2J_{ij}})^2} = 0.
\end{equation*}
Finally, we multiply both sides by $\frac{(q_i + e^{2J_{ij}} - q_i e^{2J_{ij}})^2}{\underbrace{e^{2J_{ij}}}_{\neq 0} \underbrace{(e^{2J_{ij}} - 1)}_{\neq 0} \underbrace{(q_i - 1)^2}_{\neq 0}}$
and end up with
\begin{equation*}
2 q_i - 1 = 0 \quad \Rightarrow \quad q_i = 0.5.
\end{equation*}
We conclude that $(\bar{q_i},\bar{q_j}) = (0.5,0.5)$ is the only stationary point of $\FBdiff^{(i,j)}$.
\\

\underline{Part 2:} To compute the Hessian matrix $\nabla^2 \, \FBdiff^{(i,j)}$, we use the second partial derivatives in~\eqref{eq:Hessian_q_i}--\eqref{eq:Hessian_q_j}.
Lemma~\ref{lm:xi_optimal_center} provides us with a simple expression of $\xiopt$ evaluated in $(0.5,0.5)$.
Furthermore, we make use of the relations
\begin{equation} \label{eq:cosh_sigmoid}
 \sigma(2x) (1 - \sigma(2x)) \, = \, \frac{1}{2} \sigma'(2x) \, = \, \frac{1}{4 \cosh^2 (x)}
\end{equation}
and
\begin{equation} \label{eq:tanh_sigmoid}
 (1 - 2 \sigma(2x)) \, = \, - \tanh(x).
\end{equation}
Observe that
\begin{align} \label{eq:denominator_Hessian}
\begin{split} 
& q_i q_j (1-q_i) (1-q_j) - (\xiopt - q_i q_j)^2 \\ = \, \, & \frac{1}{16} - \Big( \frac{1}{2} \sigma(2 \, J_{ij}) - \frac{1}{4} \Big) ^2 \\ = \, \, & \frac{1}{4} \sigma (2 \, J_{ij}) \big( 1 -  \sigma (2 \, J_{ij}) \big) \stackrel{\eqref{eq:cosh_sigmoid}}{=} \frac{1}{16 \cosh^2 (J_{ij})}.
\end{split}
\end{align}
Then the Hessian evaluated in $(0.5,0.5)$ is
\begin{align*}
 & \nabla^2 \, \FBdiff^{\, (i,j)}(0.5,0.5) \\  \hspace{-1.5cm} = \, \,  & \left(
 \begin{array}{ll}
 \frac{1}{\sigma (2 \, J_{ij}) \big( 1 -  \sigma (2 \, J_{ij}) \big)} - 4 & \frac{1 - 2 \sigma(2 \, J_{ij})}{\sigma (2 \, J_{ij}) \big( 1 -  \sigma (2 \, J_{ij}) \big)} \\
 \frac{1 - 2 \sigma(2 \, J_{ij})}{\sigma (2 \, J_{ij}) \big( 1 -  \sigma (2 \, J_{ij}) \big)} & \frac{1}{\sigma (2 \, J_{ij}) \big( 1 -  \sigma (2 \, J_{ij}) \big)} - 4 \\
 \end{array}\right) \\ \hspace{-1.5cm} \stackrel{\eqref{eq:tanh_sigmoid},\eqref{eq:denominator_Hessian}}{=} \, \, & 4 \left(
 \begin{array}{ll}
 \cosh^2(J_{ij}) - 1 & - \cosh^2(J_{ij}) \tanh(J_{ij}) \\
 - \cosh^2(J_{ij}) \tanh(J_{ij}) & \cosh^2(J_{ij}) - 1 \\
 \end{array}\right) \\ = \, \, & 4 \sinh(J_{ij}) \left(
 \begin{array}{ll}
 \sinh(J_{ij}) & - \cosh(J_{ij}) \\
 - \cosh(J_{ij}) & \sinh(J_{ij}) \\
 \end{array}\right).
\end{align*}
Finally, we compute the leading principal minors of the Hessian, i.e., the determinants of all upper left square submatrices. The first leading principal minor is
\begin{equation}
 \lvert 4 \sinh^2(J_{ij}) \rvert \, > \, 0.
\end{equation}
The second leading principal minor -- and thus the determinant of the entire Hessian matrix -- is
\begin{align}
 \begin{split}
 & 16 \sinh^2(J_{ij}) \underbrace{\big( \sinh^2(J_{ij}) - \cosh^2(J_{ij}) \big)}_{= \, -1} \\ = \, \, & -16 \sinh^2(J_{ij})  \, < \, 0.
 \end{split}
 \end{align}
Since the determinant of the $2 \times 2$ Hessian is negative, its two eigenvalues have opposite signs, and it follows that the Hessian evaluated in $(0.5,0.5)$ is indefinite.
\end{proof}

\begin{proof} [\textbf{PROOF OF LEMMA~\ref{lm:FBdiff_negative}:}] \leavevmode

 We only prove the statement for attractive edges as the statement for repulsive edges can be proven analogously. Moreover, we only consider the scenario that both $q_i,q_j$ are $>0.5$ (due to symmetry, the reverse scenario can be treated analogously). Let $\Bethebox^{(i,j)}_{>0.5} \subseteq \Bethebox^{(i,j)}$ be the orthant of $\Bethebox^{(i,j)}$ that consists of all points $(q_i,q_j)$ with $0.5<q_i,q_j<1$. We know from Lemma~\ref{lm:stationary} that $\FBdiff^{\, (i,j)}$ has no stationary point in $\Bethebox^{(i,j)}_{>0.5}$ and is thus bounded from above and below by the limit values of $\FBdiff^{\, (i,j)}$ at the boundary of $\Bethebox^{(i,j)}_{>0.5}$. If we can prove that all limit values of $\FBdiff^{\, (i,j)}$ at the boundary of $\Bethebox^{(i,j)}_{>0.5}$ are at most zero, then the statement follows immediately (as $\FBdiff^{\, (i,j)}$ is continuous in the interior of $\Bethebox^{(i,j)}$).
 
 The boundary of $\Bethebox^{(i,j)}_{>0.5}$ consists of four line segments.
 Let us first consider the line segments that connect $(0.5,1)$ to $(1,1)$ and $(1,0.5)$ to $(1,1)$. In the proof of Theorem~\ref{thm:Bethe_infinity_norm} it is shown\footnote{Note that these results follow independently from the current statement and can therefore be utilized, although Theorem~\ref{thm:Bethe_infinity_norm} appears chronologically later in this work than Lemma~\ref{lm:FBdiff_negative}.} that
 \begin{align*}
  & \lim_{\substack{q_i \to 0.5 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j) = 0, \\
  & \lim_{\substack{q_i \to 1 \\ q_j \to 0.5 }} \FBdiff^{\, (i,j)}(q_i,q_j) = 0, \\
  & \lim_{\substack{q_i \to 1 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j) = -J_{ij},
 \end{align*}
and that $\FBdiff^{\, (i,j)}(q_i,q_j)$ is monotonically decreasing from $(0.5,1)$ to $(1,1)$ and from $(1,0.5)$ to $(1,1)$. Consequently the statement for these two line segments is correct.

Now consider the two remaining line segments, i.e., that connect $(0.5,0.5)$ to $(0.5,1)$ and $(0.5,0.5)$ to $(1,0.5)$. Due to symmetry, we can again focus on the first case. Let us explicitly compute the value of $\FBdiff^{\, (i,j)}(q_i,q_j)$ in $(0.5,0.5)$. For that purpose, we must first evaluate the pairwise entropy $\mathcal{S}_{ij}$ in $(0.5,0.5)$, where we utilize Lemma~\ref{lm:xi_optimal_center}:

\begin{align*}
 \mathcal{S}_{ij}(0.5,0.5)  = & -2 \, \frac{\sigma(2 \, J_{ij})}{2} \log \Big( \frac{\sigma(2 \, J_{ij})}{2} \Big) - 2 \Big( 0.5 - \frac{\sigma(2 \, J_{ij})}{2} \Big) \log \Big( 0.5 - \frac{\sigma(2 \, J_{ij})}{2} \Big) \\
 = & \, \, \log(2) - \sigma(2 \, J_{ij}) \log \big( \sigma(2 \, J_{ij}) \big) - \big( 1 - \sigma(2 \, J_{ij}) \big) \log \big( 1 - \sigma(2 \, J_{ij}) \big).
\end{align*}
Then we obtain
\begin{align*}
 \FBdiff^{\, (i,j)}(0.5,0.5) \, = & \, \, (-1- 2 (2\xiopt(0.5,0.5) - 0.5 - 0.5))J_{ij} + \overbrace{\mathcal{S}_{i}(0.5)}^{=\log(2)} + \overbrace{\mathcal{S}_{j}(0.5)}^{=\log(2)} - \, \, \mathcal{S}_{ij}(0.5,0.5) \\
  = & \, \, - J_{ij} + 2J_{ij} (1 - \sigma(2 J_{ij}))+ \log(2) + \sigma(2 J_{ij}) \log(\frac{\sigma(2 J_{ij})}{1-\sigma(2 J_{ij})}) + \log(1-\sigma(2 J_{ij})) \\
  = & \, \, J_{ij} + \log(2) + \log(1-\sigma(2 J_{ij})) \\
  = & \, \, \log(2 e^{J_{ij}} (1 - \sigma(2 J_{ij}))) \\
  = & \, \, \log(\sech(J_{ij})) < 0.
\end{align*}
If we can finally show that $\FBdiff^{\, (i,j)}$ is monotonically increasing from $(0.5,0.5)$ to $(0.5,1)$, then the statement of the Lemma follows. By an analogous calculation as in the proof of Lemma~\ref{lm:stationary}, we can conclude that the (one-dimensional) function $\FBdiff^{\, (i,j)}(0.5,q_j)$ has no stationary point for $q_j \in (0.5,1)$ and must therefore be monotonically increasing (as $\FBdiff^{\, (i,j)}(0.5,0.5) < 0$ and $\lim\limits_{\substack{q_i \to 0.5 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j) = 0$).
 \end{proof}
\vspace{0.5cm}


\begin{proof} [\textbf{PROOF OF THEOREM~\ref{thm:Bethe_infinity_norm}:}] \leavevmode

 By Lemma~\ref{lm:stationary}, the Bethe energy difference function does not have any local optima and hence both its greatest lower bound and least upper bound must be located at the boundary of the sliced Bethe box.
 As in the proof of Lemma~\ref{lm:mut_inf_properties} (b), we denote this boundary by $\partial \Bethebox^{(i,j)}$.
 Let further $\overline{\Bethebox^{(i,j)}} = \Bethebox^{(i,j)} \, \dot\cup \, \, \partial \Bethebox^{(i,j)}$ be the closure of $\Bethebox^{(i,j)}$ and
 \begin{equation} \label{eq:Bethe_diff_continuation}
  \overline{\FBdiff^{\, (i,j)}} (q_i,q_j) \coloneqq \lim_{(q_i,q_j) \to \overline{\Bethebox^{(i,j)}}} \FBdiff^{\, (i,j)} (q_i,q_j)
 \end{equation}
be the continuous extension of $\FBdiff^{\, (i,j)}$ to $\overline{\Bethebox^{(i,j)}}$.
Without loss of generality, we assume $(i,j)$ to be an attractive edge (the calculations for a repulsive edge can be done analogously).
We utilize Lemma~\ref{lm:xi_optimal_limits} and Lemma~\ref{lm:mut_inf_properties} (b) to compute the limit values of $\FBdiff^{\, (i,j)}(q_i,q_j)$ at the four corner points of $\overline{\Bethebox^{(i,j)}}$:
\begin{align*}
 & \lim_{\substack{q_i \to 0 \\ q_j \to 0 }} \FBdiff^{\, (i,j)}(q_i,q_j) \\ = & \lim_{\substack{q_i \to 0 \\ q_j \to 0 }}  \UBdiff^{\, (i,j)} (q_i,q_j) + \overbrace{\lim_{\substack{q_i \to 0 \\ q_j \to 0 }} \mutinfB^{\, (i,j)} (q_i,q_j)}^{= \, 0} \\
 = & \lim_{\substack{q_i \to 0 \\ q_j \to 0}} -\, \big( 1+  2 \; (2 \, \xi_{ij} - q_i - q_j) \big) \, J_{ij} \\
 = & -J_{ij} \, - \lim_{\substack{q_i \to 0 \\ q_j \to 0}} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 0} - \overbrace{q_i}^{\to \, 0} - \overbrace{q_j}^{\to \, 0}) \\
 = & -J_{ij},
\end{align*}
\begin{align*}
 & \lim_{\substack{q_i \to 0 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j) \\ = & -J_{ij} \, - \lim_{\substack{q_i \to 0 \\ q_j \to 1 }} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 0} - \overbrace{q_i}^{\to \, 0} - \overbrace{q_j}^{\to \, 1}) \\
 = & \, \, J_{ij},
\end{align*}
\begin{align*}
 & \lim_{\substack{q_i \to 1 \\ q_j \to 0}} \FBdiff^{\, (i,j)}(q_i,q_j) \\ = & -J_{ij} \, - \lim_{\substack{q_i \to 1 \\ q_j \to 0}} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 0} - \overbrace{q_i}^{\to \, 1} - \overbrace{q_j}^{\to \, 0}) \\
 = & \, \, J_{ij},
\end{align*}
\begin{align*}
 & \lim_{\substack{q_i \to 1 \\ q_j \to 1 }} \FBdiff^{\, (i,j)}(q_i,q_j) \\ = & -J_{ij} \, - \lim_{\substack{q_i \to 1 \\ q_j \to 1 }} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 2} - \overbrace{q_i}^{\to \, 1} - \overbrace{q_j}^{\to \, 1}) \\
 = & -J_{ij}.
\end{align*}
For the remainder of the proof, we observe that $\overline{\FBdiff^{\, (i,j)}}$ exhibits monotonic behavior between the corner points of $\overline{\Bethebox^{(i,j)}}$.
On the one hand, it is monotonically increasing over the two line segments that connect $(0,0)$ to $(0,1)$ and $(0,0)$ to $(1,0)$:
\begin{align*}
 \overline{\FBdiff^{\, (i,j)}}(0,k) & = \lim_{\substack{q_i \to 0 \\ q_j \to k}} \FBdiff^{\, (i,j)}(q_i,q_j) \\
 & = -J_{ij} \, - \lim_{\substack{q_i \to 0 \\ q_j \to k }} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 0} - \overbrace{q_i}^{\to \, 0} - \overbrace{q_j}^{\to \, k}) \\
 & = -J_{ij} + 2 k J_{ij} \\ & = \, \, J_{ij} \, (2k - 1)
\end{align*}
and
\begin{align*}
 \overline{\FBdiff^{\, (i,j)}}(k,0) & = -J_{ij} \, - \lim_{\substack{q_i \to k \\ q_j \to 0 }} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 0} - \overbrace{q_i}^{\to \, k} - \overbrace{q_j}^{\to \, 0}) \\
 & = -J_{ij} + 2 k J_{ij} \\ & = \, \, J_{ij} \, (2k - 1),
\end{align*}
with both expressions being monotonically increasing if $k$ increases in $[0,1]$.
On the other hand, it is monotonically decreasing over the two line segments that connect $(0,1)$ to $(1,1)$ and $(1,0)$ to $(1,1)$:
\begin{align*}
 \overline{\FBdiff^{\, (i,j)}}(k,1) & = -J_{ij} \, - \lim_{\substack{q_i \to k \\ q_j \to 1 }} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 2k} - \overbrace{q_i}^{\to \, k} - \overbrace{q_j}^{\to \, 1}) \\
 & = \, \, J_{ij} - 2 k J_{ij} \\ & = \, \, J_{ij} (1 - 2k) 
\end{align*}
and
\begin{align*}
 \overline{\FBdiff^{\, (i,j)}}(1,k) & = -J_{ij} \, - \lim_{\substack{q_i \to 1 \\ q_j \to k}} 2 J_{ij} \, (\overbrace{2 \xi_{ij}}^{\to \, 2k} - \overbrace{q_i}^{\to \, 1} - \overbrace{q_j}^{\to \, k}) \\
 & = \, \, J_{ij} - 2 k J_{ij} \\ & = \, \, J_{ij} (1 - 2k) 
\end{align*}
with both expressions being monotonically decreasing if $k$ increases in $[0,1]$.
By this, we conclude that 
\begin{align*}
 \min_{\substack{(q_i,q_j) \in \overline{\Bethebox^{(i,j)}}}} \overline{\FBdiff^{\, (i,j)}} (q_i,q_j) = -J_{ij}
\end{align*}
and
\begin{align*}
 \max_{\substack{(q_i,q_j) \in \overline{\Bethebox^{(i,j)}}}} \overline{\FBdiff^{\, (i,j)}} (q_i,q_j) = J_{ij}.
\end{align*}
According to the definition~\eqref{eq:Bethe_diff_continuation} of $\overline{\FBdiff^{\, (i,j)}}$ and in consequence of our previous observations, the statements~\eqref{eq:Bethe_infimum},~\eqref{eq:Bethe_supremum}, and hence~\eqref{eq:Bethe_infinity_norm} follow immediately.
\end{proof}

\begin{proof} [\textbf{PROOF OF COROLLARY~\ref{cor:Bethe_opt_edge_Linf}:}] \leavevmode

 This is an immediate consequence of Theorem~\ref{thm:Bethe_infinity_norm}.
\end{proof}

\vspace{0.5cm}

\begin{proof} [\textbf{PROOF OF THEOREM~\ref{thm:Bethe_opt_edge_L2}:}] \leavevmode

 We prove that the derivative of $\lvert \lvert \FBdiff^{(i,j)} \rvert \rvert_{L^{2}}^2$ with respect to $J_{ij}$ is larger than $0$ if $J_{ij}> 0$, and smaller than $0$ if $J_{ij}< 0$. Recall that we assume $\FB$ to be defined on the local polytope instead of the Bethe box (this also implies that $\FBdiff^{\, (i,j)}$ is defined on the sliced local polytope $\polytopeLocal^{\, (i,j)}$~\eqref{eq:sliced_local_polytope}). We compute
 \begin{align}
  & \frac{\partial}{\partial J_{ij}} \lvert \lvert \FBdiff^{(i,j)} \rvert \rvert_{L^{2}}^2 \\ 
  = & \, \, \frac{\partial}{\partial J_{ij}} \iiint\limits_{\polytopeLocal^{\, (i,j)}} (\FBdiff^{\, (i,j)})^2 d \xi_{ij} d q_i d q_j \\
  = &  \iiint\limits_{\polytopeLocal^{\, (i,j)}} \frac{\partial}{\partial J_{ij}} (\FBdiff^{\, (i,j)})^2 d \xi_{ij} d q_i d q_j \\
  = & \iiint\limits_{\polytopeLocal^{\, (i,j)}} 2 \, \cdot \FBdiff^{\, (i,j)} \cdot \frac{\partial}{\partial J_{ij}} \FBdiff^{\, (i,j)} d \xi_{ij} d q_i d q_j \label{eq:integral_L2_Bethediff}
 \end{align}
 Now observe that $\frac{\partial}{\partial J_{ij}} \FBdiff^{\, (i,j)} = \frac{\partial}{\partial J_{ij}} (\UBdiff^{\, (i,j)} + \mutinfB^{\, (i,j)})$ (this is just the definition of $\FBdiff^{\, (i,j)}$~\eqref{eq:Bethe_energy_difference}), and that the mutual information on the sliced local polytope is independent of $J_{ij}$. Consequently $\frac{\partial}{\partial J_{ij}} \mutinfB^{\, (i,j)} = 0$. We continue with~\eqref{eq:integral_L2_Bethediff} and split the integral:
 \begin{align*}
  \hspace{-0.2cm} & 2 \iiint\limits_{\polytopeLocal^{\, (i,j)}} \, (\UBdiff^{\, (i,j)} + \mutinfB^{\, (i,j)}) \cdot \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)} d \xi_{ij} d q_i d q_j \\
 \hspace{-0.2cm} = & \, \, 2 \underbrace{\iiint\limits_{\polytopeLocal^{\, (i,j)}} \, \UBdiff^{\, (i,j)} \cdot \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)} d \xi_{ij} d q_i d q_j}_{(I.)}
  \, + \, \,  2 \underbrace{\iiint\limits_{\polytopeLocal^{\, (i,j)}} \, \mutinfB^{\, (i,j)} \cdot \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)} d \xi_{ij} d q_i d q_j}_{(II.)} 
 \end{align*}
 The integral $(I.)$ can be simplified, as
 \begin{align*}
  & \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)}  =  -1 - 2\, (2\xi_{ij} - q_i - q_j) = \frac{1}{J_{ij}} \UBdiff^{\, (i,j)}
 \end{align*}
 and therefore
  \begin{align*}
  (I.) = J_{ij} \iiint\limits_{\polytopeLocal^{\, (i,j)}} (-1 - 2(2\xi_{ij} - q_i - q_j))^2 d \xi_{ij} d q_i d q_j,
 \end{align*}
 which is larger than $0$ for $J_{ij}>0$ and smaller than $0$ for $J_{ij} < 0$. 
 
 It remains to prove that the integral $(II.)$ is equal to zero. It is sufficient to consider the integral over the halfspace of the sliced local polytope where $q_i < q_j$ (due to symmetry, the integral over the other halfspace where $q_i > q_j$ is equal to the first).
 %Note that we integrate $q_i$ and $q_j$ from $0$ to $1$ and $\xi_{ij}$ from $\max(0,q_i + q_j - 1)$ to $ \min(q_i,q_j)$~\eqref{eq:xi_bounds}. 
 We can split this halfspace into four `sub-polytopes':
 \begin{align}
 \begin{split}
  \polytopeLocal^{\, (i,j)}_1: \, \, \{& (q_i,q_j,\xi_{ij}) \in \polytopeLocal^{\, (i,j)}: \\ & 0 < q_j \leq \frac{1}{2}, 0 < q_i \leq q_j, \, 0 < \xi_{ij} < q_i \} \\
  \polytopeLocal^{\, (i,j)}_2: \, \, \{& (q_i,q_j,\xi_{ij}) \in \polytopeLocal^{\, (i,j)}: \\ & \frac{1}{2} \leq q_j < 1, 0 < q_i \leq 1-q_j, \,  0 < \xi_{ij} < q_i \} \\
  \polytopeLocal^{\, (i,j)}_3: \, \, \{& (q_i,q_j,\xi_{ij}) \in \polytopeLocal^{\, (i,j)}: \\ & \frac{1}{2} \leq q_j < 1, 1 - q_j \leq q_i \leq \frac{1}{2}, \\ 
  & q_i + q_j -1  < \xi_{ij} < q_i \} \\
  \polytopeLocal^{\, (i,j)}_4: \, \, \{& (q_i,q_j,\xi_{ij}) \in \polytopeLocal^{\, (i,j)}: \\ & \frac{1}{2} \leq q_j < 1, \frac{1}{2} \leq q_i \leq q_j, \\ &  q_i + q_j -1  < \xi_{ij} < q_i \}
  \end{split}
 \end{align}
We show that, for each point $(q_i^{(1)},q_j^{(1)},\xi_{ij}^{(1)})$ in $\polytopeLocal^{\, (i,j)}_1$ we can find a point $(q_i^{(2)},q_j^{(2)},\xi_{ij}^{(2)})$ in $\polytopeLocal^{\, (i,j)}_2$ such that the value of the integrand $\mutinfB^{\, (i,j)} \cdot \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)}$ of $(II.)$ in $(q_i^{(2)},q_j^{(2)},\xi_{ij}^{(2)})$ is precisely the negative of the value that $\mutinfB^{\, (i,j)} \cdot \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)}$ takes in $(q_i^{(1)},q_j^{(1)},\xi_{ij}^{(1)})$. To this end, we map $(q_i^{(1)},q_j^{(1)},\xi_{ij}^{(1)})$ from $\polytopeLocal^{\, (i,j)}_1$ to $(q_i^{(1)},1 - q_j^{(1)},q_i^{(1)} - \xi_{ij}^{(1)})$ in $\polytopeLocal^{\, (i,j)}_2$.

Then one can check that
\begin{align*}
 & \UBdiff^{\, (i,j)}(q_i^{(1)},1 - q_j^{(1)},q_i^{(1)} - \xi_{ij}^{(1)}) \\ = - &  \, \UBdiff^{\, (i,j)}(q_i^{(1)},q_j^{(1)},\xi_{ij}^{(1)})
\end{align*}
and
\begin{align*}
 & \mutinfB^{\, (i,j)}(q_i^{(1)},1 - q_j^{(1)},q_i^{(1)} - \xi_{ij}^{(1)}) \\ = & \,  \mutinfB^{\, (i,j)}(q_i^{(1)},q_j^{(1)},\xi_{ij}^{(1)}),
\end{align*}
and consequently
\begin{align*}
& \mutinfB^{\, (i,j)} \cdot \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)}(q_i^{(1)},1 - q_j^{(1)},q_i^{(1)} - \xi_{ij}^{(1)}) \\
= - & \mutinfB^{\, (i,j)} \cdot \frac{\partial}{\partial J_{ij}} \UBdiff^{\, (i,j)}(q_i^{(1)},q_j^{(1)},\xi_{ij}^{(1)}).
\end{align*}
As the mapping defined in this way is bijective and volume-preserving (it is affine with a Jacobian determinant of absolute value $1$), we conclude that the integral $(II.)$ over $\polytopeLocal^{\, (i,j)}_2$ is exactly the negative of the integral $(II.)$ over $\polytopeLocal^{\, (i,j)}_1$. Similarly, we can define a bijective mapping between $\polytopeLocal^{\, (i,j)}_3$ and $\polytopeLocal^{\, (i,j)}_4$ with the same result. We conclude that the integral $(II.)$ over the entire sliced local polytope is equal to zero, which completes the proof.
\end{proof}

\begin{proof} [\textbf{PROOF OF THEOREM~\ref{thm:error_Bethe_partition}:}] \leavevmode

 If $\PartitionBethe = \PartitionBethe^{\setminus (i,j)}$, the statement is trivial and we will therefore exclude this scenario.
 According to Theorem~\ref{thm:Bethe_infinity_norm}, each edge contributes less than $\lvert J_{ij} \rvert$ in absolute value to the `overall' Bethe function, so removing an edge can only shift the individual function values by less than $\lvert J_{ij} \rvert$.
 This means that for all $\bm{q} \in \Bethebox$,
 \begin{equation} \label{eq:J_ij_estimate}
  \lvert \FB(\bm{q}) - \FB^{\setminus (i,j)}(\bm{q}) \rvert = \lvert \FBdiff^{\, (i,j)}(q_i,q_j) \rvert < \lvert J_{ij} \rvert.
 \end{equation}
In particular, this is also valid for the global minimizer $\bm{q_{1}}^{\ast}$ of $\FB$ as well as for the global minimizer $\bm{q_{2}}^{\ast}$ of $\FB^{\setminus (i,j)}$.
We now distinguish between two cases: \\
\underline{Case 1:} $\PartitionBethe > \PartitionBethe^{\setminus (i,j)}$, or equivalently, $\FB(\bm{q_{1}}^{\ast}) < \FB^{\setminus (i,j)}(\bm{q_{2}}^{\ast})$. Then
\begin{align*}
 0 < \FB^{\setminus (i,j)}(\bm{q_{2}}^{\ast}) - \FB(\bm{q_{1}}^{\ast}) \leq \FB^{\setminus (i,j)}(\bm{q_{1}}^{\ast}) - \FB(\bm{q_{1}}^{\ast}) \stackrel{\eqref{eq:J_ij_estimate}}{<} \lvert J_{ij} \rvert .
\end{align*}
\underline{Case 2:} $\PartitionBethe < \PartitionBethe^{\setminus (i,j)}$, or equivalently, $\FB(\bm{q_{1}}^{\ast}) > \FB^{\setminus (i,j)}(\bm{q_{2}}^{\ast})$. Then
\begin{align*}
 0 < \FB(\bm{q_{1}}^{\ast}) - \FB^{\setminus (i,j)}(\bm{q_{2}}^{\ast}) \leq \FB(\bm{q_{2}}^{\ast}) - \FB^{\setminus (i,j)}(\bm{q_{2}}^{\ast}) \stackrel{\eqref{eq:J_ij_estimate}}{<} \lvert J_{ij} \rvert .
\end{align*}
Together, we finally obtain
\begin{align*}
 \lvert J_{ij} \rvert & > \lvert \FB(\bm{q_{1}}^{\ast}) - \FB^{\setminus (i,j)}(\bm{q_{2}}^{\ast}) \rvert \\ & = \lvert \min\limits_{\bm{q} \in \Bethebox} \FB - \min\limits_{\bm{q} \in \Bethebox} \FB^{\setminus (i,j)} \rvert \\
 & = \lvert \log(\PartitionBethe) -\log(\PartitionBethe^{\setminus (i,j)}) \rvert \\ & = \bigg\vert \log \Big( \frac{\PartitionBethe}{\PartitionBethe^{\setminus (i,j)}} \Big) \bigg\vert.
\end{align*}
\end{proof}

\begin{proof} [\textbf{PROOF OF THEOREM~\ref{thm:Z_error_unidirectional}:}] \leavevmode
 
 Without loss of generality, we assume that all $\theta_i$ are larger than $0.5$. 
 From Lemma~\ref{lm:Knoll_Bethe_unidirectional}~\citep{knoll2021selfguided} we know that the global minimum $-\log(\PartitionBethe)$ of $\FB$ must be located in a point of $\Bethebox$ where all components $q_i$ are larger than $0.5$.
 If we remove an edge, we obtain again a unidirectional model that satisfies the same property. 
 By Lemma~\ref{lm:FBdiff_negative}, we then know that the global minimum of $\FB$ in the new model must be larger than the global minimum in the old model, i.e., 
 \begin{equation} \label{eq:ZB_unidirectional}
  -\log(\PartitionBethe) < -\log(\PartitionBethe^{\, \setminus (i,j)})
 \end{equation}
 (as the removed energy difference function $\FBdiff^{\, (i,j)}$ makes in the original model a negative contribution to the energy of all points $\bm{q} \in \Bethebox$ with $q_i,q_j > 0.5$ -- in particular, to the global minimum of the new model).
 Finally, we apply Theorem~\ref{thm:Ruozzi_lower_bound} from~\citet{ruozzi2012bethepartition}, which says that in attractive models the Bethe partition function is always a lower bound on the true partition function, i.e., $-\log(\Partition) \leq - \log(\PartitionBethe)$. Together with~\eqref{eq:ZB_unidirectional}, we conclude
 \begin{equation*}
  \lvert -\log(\Partition) + \log(\PartitionBethe)  \rvert < \lvert -\log(\Partition)  + \log(\PartitionBethe^{\, \setminus (i,j)}) \rvert,
 \end{equation*}
 which is equivalent to the statement.
\end{proof}


 %\begin{proof}
 %To derive the bound~\eqref{eq:Bethe_lower_bound}, we utilize the rewritten form~\eqref{eq:Bethe_rewritten} of the Bethe free energy:
 %\begin{align*}
  %\FB(q_i,q_j) & = \sum_{i=1}^{n} \overbrace{(1 - 2 q_i) \, \theta_i}^{> \, \lvert \theta_i \rvert} - \sum_{i=1}^{n} \hspace{-0.15cm} \overbrace{\mathcal{S}_{i}}^{\leq \, \log(2)} \hspace{-0.15cm} + \hspace{-0.15cm} \sum_{(i,j) \in \setofedges} \hspace{-0.1cm} \overbrace{\FBdiff^{\, (i,j)}}^{> \, \, -\lvert J_{ij} \rvert} \\
  %& > - \sum_{i=1}^{n} \, \lvert \theta_i \rvert \, - n \log(2) - \sum_{(i,j) \in \setofedges} \lvert J_{ij} \rvert
 %\end{align*}
%\end{proof}

\subsection*{APPENDIX B -- RESULTS FROM RELATED WORK} \label{app:appendix_B}

\begin{lemma} \label{lm:Dragomir_bounds}
 Let $X, Y$ be discrete random variables with alphabets $\mathcal{X}, \mathcal{Y}$. Their mutual information $I(X;Y)$ is bounded by
 \begin{align} 
 \begin{split}\label{eq:Dragomir_bounds}
  \frac{1}{2} \big( \sum\limits_{(x,y) \in \mathcal{X} \times \mathcal{Y}} \lvert p_{X Y}(x,y) - p_X(x) p_Y(y) \rvert \, \big)^2 \, \, \leq \, \, I(X;Y) \, \, \leq \sum\limits_{(x,y) \in \mathcal{X} \times \mathcal{Y}} \frac{p_{X Y}^2(x,y)}{p_X(x) p_Y(y)} \, - \, 1.
 \end{split}
 \end{align}
\end{lemma}

\begin{proof}
 Corollary 1 and 2 in~\citet{dragomir2000relentropy}.
\end{proof}

\begin{lemma} \label{lm:Weller_bounds_xi} \leavevmode
 \begin{enumerate}
  \item [(a)] If $(i,j)$ is an attractive edge (i.e., $J_{ij} > 0$), then $\xiopt > q_i q_j$.
  \item [(b)] If $(i,j)$ is a repulsive edge (i.e., $J_{ij} < 0$), then $\xiopt < q_i q_j$.
 \end{enumerate}
\end{lemma}

\begin{proof}
 Lemma 2 in~\citet{weller2013bethebounds}.
\end{proof}

\begin{lemma} \label{lm:Welling_derivatives_FB}
The first-order derivatives of the Bethe free energy $\FB$ on the Bethe box $\Bethebox$ are
\begin{align}
 \begin{split} \label{eq:Bethe_first_derivative}
  \frac{\partial}{\partial q_{i}} \FB =  -2 \, \theta_i + 2 \sum\limits_{j \in \nbh{i}} J_{ij}  + \log \Big( \frac{(1-q_i)^{d_i-1}} {q_i^{d_i-1}} \prod\limits_{j \in \nbh{i}} \frac{q_i - \xiopt }{1 + \xiopt - q_i - q_j}  \Big).
 \end{split}
\end{align}
\end{lemma}

\begin{proof}
 This is an intermediate result in~\citet{welling2001belief}.
\end{proof}

\begin{thm} \label{thm:Weller_second_derivatives}
 The second partial derivatives of edge specific Bethe terms of the form
 \begin{equation}
  f_{ij}(q_i,q_j) = - \, (1+  2 \; (2 \, \xi_{ij} - q_i - q_j)) \, J_{ij} -  \mathcal{S}_{ij}
 \end{equation}
 are calculated as
\begin{align}
 \hspace{-0.3cm} \frac{\partial^2}{\partial q_i^2} & f_{ij} =  \frac{q_j (1-q_j)}{T_{ij}}, \\
 \hspace{-0.3cm} \frac{\partial^2}{\partial q_i \, \partial q_j} f_{ij} = \frac{\partial^2}{\partial q_j \, \partial q_i} & f_{ij} = \frac{q_i q_j - \xiopt}{T_{ij}}, \, \, \, \text{and} \\
 \hspace{-0.3cm} \frac{\partial^2}{\partial q_j^2} & f_{ij} = \frac{q_i (1-q_i)}{T_{ij}},
\end{align}
where $T_{ij} \coloneqq q_i q_j (1-q_i) (1-q_j) - (\xiopt - q_i q_j)^2$.
\end{thm}

\begin{proof}
 Theorem 10 in~\citet{weller2013bethebounds}.
\end{proof}

\begin{lemma} \label{lm:Knoll_Bethe_unidirectional}
 Consider a unidirectional model and assume without loss of generality that $\theta_i > 0.5$. Then, in the global minimum $\bm{q}^{\ast}$ of $\FB$ all components $q_i$ are $>0.5$.
\end{lemma}


\begin{proof}
 Lemma 2 in~\citet{knoll2021selfguided}.
\end{proof}

\begin{thm} \label{thm:Ruozzi_lower_bound}
 For an attractive model with binary variables, the Bethe partition function is always a lower bound on the true partition function. That is,
 \begin{equation*}
  \PartitionBethe \leq \Partition,
 \end{equation*}
 or equivalently,
 \begin{equation*}
  \min\limits_{\polytopeMarginal} \FG = -\log (\Partition) \leq -\log (\PartitionBethe) = \min\limits_{\polytopeLocal} \FB.
 \end{equation*}
\end{thm}

\begin{proof}
 Theorem 4.1 in~\citet{ruozzi2012bethepartition}.
\end{proof}



\subsection*{APPENDIX C -- FURTHER EXPERIMENTS}
Here we present supplementary experiments that we have performed on a (non-toroidal) $5\times5$-grid graph. The experimental setting is the same as in Sec.~\ref{sec:experiments}. Again, we observe the beneficial effects of edge removal on the approximated marginals. In this sparser model, the threshold beyond which the Bethe free energy becomes non-convex and the accuracy of BP degrades is larger than for the fully connected graph considered in Sec.~\ref{sec:experiments}. Beyond that threshold, edge removal is particularly effective if the local potentials are weak; but also for models with strong local potentials the results are mostly superior to the marginal accuracy of BP in the original model. For attractive models with strong local potentials, we observe again the presence of a `channel' that specifies an optimal intermediate model state.
% NOTE: necessary when ptmx or no mathfont class option is given
%\providecommand{\upGamma}{\Gamma}
%\providecommand{\uppi}{\pi}

\begin{figure}[htb]
\centering \hspace*{-5pt}
%\subfloat[][]{\label{attractive_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_attractive_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_attractive_theta_m02_p02_weakest_edge} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_attractive_theta_m02_p02_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_attractive_theta_m02_p02_Chow_Liu} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_attractive_theta_m02_p02_mutual_information_l1_error_sing_all_runs}}
%\caption{Attractive models, $\theta_i \in [-0.2,0.2]$. (a) $\texttt{BETHE-OPT}$ criterion vs. (b) $\texttt{CHOW LIU}$ criterion}
%\label{attractive_theta_m02_p02}
%\end{figure}

%\begin{figure}[h]
%\centering \hspace*{-5pt}
%\subfloat[][]{\label{attractive_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_attractive_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_attractive_theta_m05_p05_weakest_edge} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_attractive_theta_m05_p05_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_attractive_theta_m05_p05_Chow_Liu} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_attractive_theta_m05_p05_mutual_information_l1_error_sing_all_runs}}
\caption{Attractive models ($5\times5$-grid graph). First row: $\theta_i \in [-0.2,0.2]$; second row: $\theta_i \in [-0.5,0.5]$. (a) + (c): $\texttt{BETHE-OPT}$ criterion; (b) + (d): $\texttt{CHOW LIU}$ criterion.}
\label{fig:gridgraph5x5_attractive}
\end{figure}

\begin{figure}[htb]
\centering \hspace*{-5pt}
%\subfloat[][]{\label{general_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_general_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_general_theta_m02_p02_weakest_edge} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_general_theta_m02_p02_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_general_theta_m02_p02_Chow_Liu} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_general_theta_m02_p02_mutual_information_l1_error_sing_all_runs}}
%\caption{General models, $\theta_i \in [-0.2,0.2]$. (a) $\texttt{BETHE-OPT}$ criterion vs. (b) $\texttt{CHOW LIU}$ criterion}
%\label{general_theta_m02_p02}
%\end{figure}

%\begin{figure}[h]
%\centering \hspace*{-5pt}
%\subfloat[][]{\label{general_theta_m02_p02_random} \includegraphics[width=0.4\linewidth]{heatmap_completegraph10_general_theta_m02_p02_random_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_general_theta_m05_p05_weakest_edge} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_general_theta_m05_p05_weakest_edge_l1_error_sing_all_runs}}
\subfloat[][]{\label{gridgraph5x5_general_theta_m05_p05_Chow_Liu} \includegraphics[width=0.25\linewidth]{figures_UAI2022/heatmap_gridgraph5x5_general_theta_m05_p05_mutual_information_l1_error_sing_all_runs}}
\caption{General models ($5\times5$-grid graph). First row: $\theta_i \in [-0.2,0.2]$; second row: $\theta_i \in [-0.5,0.5]$. (a) + (c): $\texttt{BETHE-OPT}$ criterion; (b) + (d): $\texttt{CHOW LIU}$ criterion.}
\label{gridgraph5x5_general}
\end{figure}

\def\bibfont{\small}
\bibliography{uai2022-template}

\end{document}
