\documentclass[accepted]{uai2023} %


\newif\ifbiblatex
    \biblatexfalse %
    
\newif\ifvfull
    \vfulltrue %
    
    \newif\ifvfullred %
    \vfullredtrue

\usepackage[american]{babel}

\ifbiblatex\else
    \usepackage{natbib} %
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
    \fi
\usepackage{booktabs} %
\usepackage{algorithm}
\usepackage{algorithmic}

        
% \vfull{<text>}: typesets <text> only when \ifvfull is true. When \ifvfullred
% is also true the text is set in red!75!black; the inner brace group keeps
% that colour change local to the argument.
\newcommand\vfull[1]{{\ifvfullred\color{red!75!black}\fi\ifvfull#1\fi}}
% vfullenv: environment counterpart of \vfull (same colour rule, \ifvfull
% opened in the begin-code and closed by the \fi in the end-code).
% NOTE(review): while skipping a false conditional TeX does not expand macros,
% so with \vfullfalse the \fi hidden in the end-code is not seen and skipping
% would run on to the next literal \fi in the input -- confirm before building
% with \vfullfalse.
\newenvironment{vfullenv}{\begingroup\ifvfullred\color{red!75!black}\fi\ifvfull}{\fi\endgroup}

\usepackage{microtype}
\relax %
    \newif\ifmarginprooflinks
    	\marginprooflinksfalse

\relax %
    \usepackage[dvipsnames]{xcolor}
    
    \usepackage{mathtools}
    \usepackage{amssymb}
		\DeclareMathSymbol{\shortminus}{\mathbin}{AMSa}{"39}
    
    \usepackage{bbm}
    \usepackage{faktor}
    \usepackage{booktabs}
    \usepackage{graphicx}
    \usepackage{scalerel}
    \usepackage{enumitem}
    \usepackage{wrapfig}
    \usepackage{nicefrac}\let\nf\nicefrac


    \usepackage{hyperref} %
        \hypersetup{colorlinks=true, linkcolor=blue!75!black, urlcolor=magenta, citecolor=green!50!black}
    

\relax %
    % Keep the original \H (a text accent command in LaTeX) as \Horig and free
    % the name so that \H can be declared as a math operator below.
    \let\Horig\H \let\H\relax %
	% Upright "H"; used in this file as the entropy symbol, e.g. \H(\mu).
	\DeclareMathOperator{\H}{\mathrm{H}} %
	% Upright "I" operator -- presumably (mutual) information; TODO confirm.
	\DeclareMathOperator{\I}{\mathrm{I}} %
	% Expectation symbol; the starred declaration lets sub/superscripts be
	% set as limits in display style.
	\DeclareMathOperator*{\Ex}{\mathbb{E}} %
	% The same symbol scaled by a factor of 1.5.
	\DeclareMathOperator*{\EX}{\scalebox{1.5}{$\mathbb{E}$}} %
    
	% Support of a distribution, and arg min with limits-style subscripts.
	\DeclareMathOperator{\supp}{\mathrm{supp}} %
	\DeclareMathOperator*{\argmin}{arg\,min} %
    
    % \infdivx{a}{b}: parenthesised "a || b"; the middle double bar scales
    % with the delimiters (\delimsize). Supports the usual mathtools
    % starred / [size] variants.
    \DeclarePairedDelimiterX{\infdivx}[2]{(}{)}{%
        #1\;\delimsize\|\;#2} %
	% \thickD: an "I" kerned 8mu back into a "D", giving a thick-stemmed D.
	\newcommand{\thickD}{I\mkern-8muD}  %
	% \kldiv{a}{b}: \thickD followed by \infdivx{a}{b} (named for the KL
	% divergence -- confirm against the main text).
	\newcommand{\kldiv}{\thickD\infdivx} %
	% \tto: two right arrows overlapped by a -15mu space (double-headed look).
	\newcommand{\tto}{\rightarrow\mathrel{\mspace{-15mu}}\rightarrow} %
    
    % \Rext: blackboard R with an overline (presumably the extended reals).
    \newcommand{\Rext}{\mskip1mu\overline{\mskip-1mu\mathbb R\!}\,}
    % These two are \def'd with the literal character 0 as delimiter text, so
    % they must be written exactly as \Rextge0 and \Rge0 at the use site.
    \def\Rextge0{\Rext_{\scriptscriptstyle \ge 0}} %
    \def\Rge0{\mathbb{R}_{\scriptscriptstyle \ge 0}} %
    % \mat{x}: bold upright x (used for vectors such as \mat u, \mat v).
    \newcommand{\mat}[1]{\mathbf{#1}} %

    % \norm{x} = ||x|| and \pqty{x} = (x); starred forms auto-size.
    \DeclarePairedDelimiter{\norm}{\Vert}{\Vert}
    \DeclarePairedDelimiter{\pqty}{\lparen}{\rparen}

	\makeatletter
	% \subalign{<rows>}: a vertically centred, two-column alignment typeset in
	% \scriptstyle (right-aligned column, then left-aligned column), for
	% multi-line material under an operator. Rows are separated by \\ and
	% columns by &. Relies on amsmath internals (\Let@, \restore@math@cr,
	% \default@tag), hence the \makeatletter scope.
	\newcommand{\subalign}[1]{%
	  \vcenter{%
	    \Let@ \restore@math@cr \default@tag
	    \baselineskip\fontdimen10 \scriptfont\tw@
	    \advance\baselineskip\fontdimen12 \scriptfont\tw@
	    \lineskip\thr@@\fontdimen8 \scriptfont\thr@@
	    \lineskiplimit\lineskip
	    \ialign{\hfil$\m@th\scriptstyle##$&$\m@th\scriptstyle{}##$\hfil\crcr
	      #1\crcr
	    }%
	  }%
    	}
	\makeatother
	% \numberthis: advances the equation counter by one and tags the current
	% row with that number; lets a single row of a starred display (e.g.
	% align*) carry an equation number.
	\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}


\usepackage{tikz}
	\usetikzlibrary{positioning,fit,calc, decorations, arrows, shapes, shapes.geometric}
	\usetikzlibrary{cd}

	\tikzset{AmpRep/.style={ampersand replacement=\&}}
	\tikzset{center base/.style={baseline={([yshift=-.8ex]current bounding box.center)}}}
	\tikzset{paperfig/.style={center base,scale=0.9, every node/.style={transform shape}}}

	\tikzset{dpadded/.style={rounded corners=2, inner sep=0.7em, draw, outer sep=0.3em, fill={black!50}, fill opacity=0.08, text opacity=1}}
	\tikzset{dpad0/.style={outer sep=0.05em, inner sep=0.3em, draw=gray!75, rounded corners=4, fill=black!08, fill opacity=1, align=center}}
	\tikzset{dpadinline/.style={outer sep=0.05em, inner sep=2.5pt, rounded corners=2.5pt, draw=gray!75, fill=black!08, fill opacity=1, align=center, font=\small}}

 	\tikzset{dpad/.style args={#1}{every matrix/.append style={nodes={dpadded, #1}}}}
	\tikzset{light pad/.style={outer sep=0.2em, inner sep=0.5em, draw=gray!50}}

	\tikzset{arr/.style={draw, ->, thick, shorten <=3pt, shorten >=3pt}}
	\tikzset{arr0/.style={draw, ->, thick, shorten <=0pt, shorten >=0pt}}
	\tikzset{arr1/.style={draw, ->, thick, shorten <=1pt, shorten >=1pt}}
	\tikzset{arr2/.style={draw, ->, thick, shorten <=2pt, shorten >=2pt}}

	% \cmergearr[<style>]{A}{B}{C}{M}: hyperarc with two sources and one
	% target. Draws a headless path A -- M -- B, then an arrow from the
	% junction M to C. <style> is appended to the `arr' style on both paths.
	\newcommand\cmergearr[5][]{
		\draw[arr, #1, -] (#2) -- (#5) -- (#3);
		\draw[arr, #1, shorten <=0] (#5) -- (#4);
		}
	% \mergearr[<style>]{A}{B}{C}: as \cmergearr, with the junction placed at
	% the barycentre of A, B, C (weights 1, 1, 1.2) and named center-ABC.
	\newcommand\mergearr[4][]{
		\coordinate (center-#2#3#4) at (barycentric cs:#2=1,#3=1,#4=1.2);
		\cmergearr[#1]{#2}{#3}{#4}{center-#2#3#4}
		}
	% \cunmergearr[<style>]{A}{B}{C}{M}: hyperarc with one source and two
	% targets. Draws a headless segment A -- M, then arrows M -> B and M -> C.
	\newcommand\cunmergearr[5][]{
		\draw[arr, #1, -, shorten >=0] (#2) -- (#5);
		\draw[arr, #1, shorten <=0] (#5) -- (#3);
		\draw[arr, #1, shorten <=0] (#5) -- (#4);
		}
	% \unmergearr[<style>]{A}{B}{C}: as \cunmergearr, with the junction at the
	% barycentre of A, B, C (weights 1.2, 1, 1) and named center-ABC.
	\newcommand\unmergearr[4][]{
		\coordinate (center-#2#3#4) at (barycentric cs:#2=1.2,#3=1,#4=1);
		\cunmergearr[#1]{#2}{#3}{#4}{center-#2#3#4}
		}
		
	\newcommand\lab[1]{(#1)(lab-#1)}
	\tikzset{alternative/.style args={#1|#2|#3}{name=#1, circle, fill, inner sep=1pt,label={[name={lab-#1},gray!30!black, inner sep=1pt]#3:\scriptsize #2}} }
	\tikzset{tpt/.style args={#1|#2}{alternative={#1|#2|below}} }
	\tikzset{Dom/.style args={#1[#2] (#3) around #4}{dpadded, name=#3, label={[name={lab-#3},align=center,label distance=-1.9em, shading=axis, top color=white, bottom color=black!04,inner sep=0pt, #2]150:#1}, fit={ #4 }, inner sep=0.5em}}


\relax %
    % \nhphantom{<delim>}{<content>}: measures, in box 0, a kern of
    % -2\nulldelimiterspace followed by <delim> at the current \delimsize
    % (with the height of <content>), then moves left by 97% of that width.
    % Used below so that an outer delimiter nearly overlaps the inner one.
    \newcommand{\nhphantom}[2]{\sbox0{\kern-2%
    \nulldelimiterspace$\left.\delimsize#1\vphantom{#2}\right.$}\hspace{-.97\wd0}}
    \makeatletter
    \newsavebox{\abcmycontentbox}
    % \DeclareDoubleDelim{\cmd}{<outer left>}{<inner left>}{<inner right>}{<outer right>}
    % defines a one-argument mathtools paired delimiter \cmd whose content is
    % wrapped in doubled delimiters. The content is first saved in
    % \abcmycontentbox (pre-code); the outer pair (#2, #5) is supplied by
    % \DeclarePairedDelimiterXPP and the inner pair (#3, #4) is typeset in the
    % body, with \nhphantom pulling inner and outer delimiters together.
    \newcommand\DeclareDoubleDelim[5]{
    \DeclarePairedDelimiterXPP{#1}[1]%
        {%
            \sbox{\abcmycontentbox}{\ensuremath{##1}}%
        }{#2}{#5}{}%
        {%
            \nhphantom{#3}{\usebox\abcmycontentbox}%
            \hspace{1.2pt} \delimsize#3%
            \mathopen{}\usebox{\abcmycontentbox}\mathclose{}%
            \delimsize#4\hspace{1.2pt}%
            \nhphantom{#4}{\usebox\abcmycontentbox}%
        }%
    }
    \makeatother

\relax %
	\newcommand{\ssub}[1]{_{\!_{#1}\!}}
	
    \newcommand{\pdgunit}{\mathrlap{\mathit 1} \mspace{2.3mu}\mathit 1}
	
	\newcommand{\X}{\mathcal X}
	\newcommand{\V}{\mathcal V}
	\newcommand{\N}{\mathcal N}
	\newcommand{\Ed}{\mathcal E}
	\newcommand{\Ar}{\mathcal A}
	\newcommand{\ArST}{\hat\Ar}
	
    \newcommand{\balpha}{\boldsymbol\alpha}
    \newcommand{\bbeta}{\boldsymbol\beta}

	\newcommand{\bp}[1][L]{\mat{p}\ssub{#1}}
	\newcommand{\bP}[1][L]{\mat{P}\ssub{#1}}
	\def\p_#1{\mathbb P_{\!#1\mskip-2mu}}	

	% \ifsuba controls whether \Src / \Tgt print their subscript; it is
	% switched on here, and \subafalse turns the subscripts off.
	\newif\ifsuba \subatrue
	% \psimp: a copy of \p as it is defined at this point of the preamble.
	\let\psimp\p  %
	% \Src{a}, \Tgt{a}: the letters S and T with subscript a (source and
	% target of arc a, judging by their use); with \subafalse they print a
	% bare S / T.
	\newcommand\Src[1]{\ifsuba S\mskip-2mu\vphantom{|}_{{#1}} \else S \fi}
	\newcommand\Tgt[1]{\ifsuba T\mskip-3mu\vphantom{|}_{{#1}} \else T \fi}

    

	\DeclareMathAlphabet{\mathdcal}{U}{dutchcal}{m}{n}
	\DeclareMathAlphabet{\mathbdcal}{U}{dutchcal}{b}{n}
	\newcommand{\dg}[1]{\mathbdcal{#1}}
	\newcommand{\PDGof}[1]{{\dg M}_{#1}}
	\newcommand{\UPDGof}[1]{{\dg N}_{#1}}
	\newcommand\VFE{\mathit{V\mkern-4mu F\mkern-4.5mu E}}

	\newcommand\Inc{\mathit{Inc}}
	\newcommand{\IDef}[1]{\mathit{IDef}_{\!#1}}
	\newcommand\OInc{\mathit{O\mskip-2.5muI\mskip-3.5mun\mskip-1.7muc}} %
	\newcommand{\SInc}{\mathit{S\mskip-2.5muI\mskip-3.5mun\mskip-1.7muc}} %
	\newcommand{\ed}[3]{#2%
	 {\overset{\smash{\mskip-5mu\raisebox{-1pt}{$\scriptscriptstyle
	        #1$}}}{\rightarrow}} #3}

	\newcommand{\bundle}{\mathbin{+}}
	\DeclareDoubleDelim
		\SD\{\{\}\}
	\DeclareDoubleDelim
		\bbr[[]]
	% \aar{x}: x inside double angle brackets. Dispatch on stars:
	%   \aar{x}   -> \aar@plain{x}   (double-delimiter macro declared below)
	%   \aar*{x}  -> \aar@plain*{x}  (its starred, auto-sized form)
	%   \aar**{x} -> \aar@resize{x}  (content boxed, delimiters scaled by
	%                                 \scaleleftright[3.8ex] from \Bigg-size
	%                                 angle brackets)
	% NOTE(review): \aar@resize boxes its argument with \sbox, i.e. in text
	% mode -- presumably callers pass math-shifted content; confirm.
	\makeatletter
	\newsavebox{\aar@content}
	\newcommand\aar{\@ifstar\aar@one@star\aar@plain}
	\newcommand\aar@one@star{\@ifstar\aar@resize{\aar@plain*}}
	\newcommand\aar@resize[1]{\sbox{\aar@content}{#1}\scaleleftright[3.8ex]
		{\Biggl\langle\!\!\!\!\Biggl\langle}{\usebox{\aar@content}}
		{\Biggr\rangle\!\!\!\!\Biggr\rangle}}
	\DeclareDoubleDelim
		\aar@plain\langle\langle\rangle\rangle
	\makeatother


\relax %
    % \bmu: bold mu; \C: script C in the dutchcal alphabet (used in this file
    % for the set of clusters of a tree decomposition).
    \newcommand\bmu{\boldsymbol\mu}
    \newcommand\C{\mathdcal C}
    % Optimization keywords in sans serif, scaled to 98%. \minimize and
    % \maximize are \mathop's with \limits, so a subscript (the optimization
    % variables) is set underneath; \subjto is an ordinary brace group.
    \newcommand\minimize{\mathop{\scalebox{0.98}{$\mathsf{minimize}$}}\limits}
    \newcommand\maximize{\mathop{\scalebox{0.98}{$\mathsf{maximize}$}}\limits}
    \newcommand\subjto{{\scalebox{0.98}{$\mathsf{subject~to}$}}}
    
    
    % \CI: two overlapping \bot symbols as a binary operator (presumably the
    % conditional-independence symbol).
    \newcommand\CI{\mathbin{\bot\!\!\!\bot}}

\usepackage{amsthm,thmtools} %
	\usepackage[noabbrev,nameinlink,capitalize]{cleveref}
    \theoremstyle{plain}
    \newtheorem{theorem}{Theorem}
	\newtheorem{coro}{Corollary}[theorem]
    \newtheorem{prop}[theorem]{Proposition}
    \newtheorem{conj}[theorem]{Conjecture}
    \newtheorem{claim}{Claim}
	\declaretheorem[numberwithin=theorem,name=Claim]{iclaim}
    \newtheorem{remark}{Remark}
    \newtheorem{lemma}[theorem]{Lemma}
    \theoremstyle{definition}
    \declaretheorem[name=Definition, qed=$\square$]{defn}
    \declaretheorem[name=Example, qed=$\triangle$]{example}

	% Reference names for cleveref. cleveref only knows the standard
	% environment names (theorem, lemma, corollary, ...); the abbreviated
	% environments declared above need an explicit name, otherwise a \cref to
	% one of them prints "??". The capitalised \Cref forms are derived
	% automatically from these.
	\crefname{defn}{Definition}{Definitions}
	\crefname{prop}{Proposition}{Propositions}
	\crefname{coro}{Corollary}{Corollaries}
	\crefname{conj}{Conjecture}{Conjectures}
	\crefname{claim}{Claim}{Claims}
    \crefname{issue}{Issue}{Issues}

\relax %
	% Patch thmtools' \thmt@restatable so that the surrounding theorem
	% environment is opened (first patch) and closed (second patch) only when
	% \ifthmt@thisistheone is true. When the flag is false, only the stored
	% body is typeset, without the theorem heading. A \typeout message is
	% printed if either patch fails to apply.
	\usepackage{xpatch}
	\makeatletter
	\xpatchcmd{\thmt@restatable}%
	   {\csname #2\@xa\endcsname\ifx\@nx#1\@nx\else[{#1}]\fi}%
	   %
	   {\ifthmt@thisistheone\csname #2\@xa\endcsname\ifx\@nx#1\@nx\else[{#1}]\fi
	   \else\fi}
	   {}{\typeout{FIRST PATCH TO THM RESTATE FAILED}} %
	\xpatchcmd{\thmt@restatable}%
	   {\csname end#2\endcsname}
	   {\ifthmt@thisistheone\csname end#2\endcsname\else\fi}
	   {}{\typeout{FAILED SECOND THMT RESTATE PATCH}}

	% \recall{<name>}: restates a restatable result. Prints a bold
	% "\Cref{thmt@@<name>}." heading, then calls the starred restate macro
	% \<name>* in emphasised type, set off by \medskip / \smallskip.
	% (thmt@@<name> is presumably the label thmtools attaches to the original
	% statement -- confirm against the thmtools documentation.)
	\newcommand{\recall}[1]{\medskip\par\noindent{\bf \Cref{thmt@@#1}.} \begingroup\em \noindent
	   \expandafter\csname#1\endcsname* \endgroup\par\smallskip}

   	\setlength\marginparwidth{1.55cm}
	% \begin{linked}[<restatable options>]{<type>}{<name>} ... \end{linked}
	% States a result of theorem type <type> as a thmtools restatable whose
	% restate macro is named <type>:<name>, and labels it <type>:<name>.
	% <name> and <type> are also stored in \linkedproof and \linkedtype.
	% When \ifmarginprooflinks is true, a small bracketed "link to proof"
	% margin note is added, hyperlinked to the label proof:<name>.
	\newenvironment{linked}[3][]{%
		\def\linkedproof{#3}%
		\def\linkedtype{#2}%
		\ifmarginprooflinks
		\marginpar{%
			\vspace{1.5em}
			\centering%
			\hyperref[proof:\linkedproof]{%
			\color{blue!30!white}%
			\scaleleftright{$\Big[$}{\,\mbox{\footnotesize\centering\tt\begin{tabular}{@{}c@{}}
				link to\\[-0.15em]
				proof
			\end{tabular}}\,}{$\Big]$}}~
			}%
		\fi
        \restatable[#1]{#2}{#2:#3}\label{#2:#3}%
        }%
		{\endrestatable%
		}
	\makeatother
		% lproof: a proof environment that also steps the counter proofcntr
		% via \refstepcounter, so a \label placed inside it has a target.
		\newcounter{proofcntr}
		\newenvironment{lproof}{\begin{proof}\refstepcounter{proofcntr}}{\end{proof}}

        \usepackage{cancel} %
        \newcommand{\Cancel}[2][red]{{\color{#1}\cancel{\color{black}#2}}}


		\usepackage{tcolorbox}
		\tcbuselibrary{most}
        \colorlet{proofleft}{blue!20!black!40!white}
        \colorlet{proofmatt}{blue!20!black!05!white}
		\tcolorboxenvironment{lproof}{
			enhanced,
			parbox=false,
			boxrule=0pt,
			frame hidden,
			borderline west={4pt}{0pt}{proofleft},
			colback={proofmatt},
			sharp corners,
			breakable,
		}
        \tikzset{proofmatDom/.style args={#1[#2] (#3) around #4}{dpadded, name=#3, label={[name={lab-#3},align=center,label distance=-1.9em, shading=axis, top color=proofmatt, bottom color=black!04!proofmatt,inner sep=0pt, #2]150:#1}, fit={ #4 }, inner sep=0.5em}}
	\newcommand{\begthm}[3][]{\begin{#2}[{name=#1},restate=#3,label=#3]}

\relax %
    % \TODO[<note>]: a red, ragged-right paragraph showing <note> in
    % typewriter type between large angle brackets; <note> defaults to
    % INCOMPLETE. The macro ends the paragraph itself (\par).
    \newcommand{\TODO}[1][INCOMPLETE]{{\color{red}\raggedright\hangindent=0.5cm\rightskip=0.4cm$\smash{\Big\langle}$~\texttt{#1}~\raisebox{-0.3ex}{${\Big\rangle}$}\hspace{-1.5cm}\par}}
    
    \newcounter{daggerfootnote}
    % \daggerfootnote{<text>}: a footnote whose mark is symbol number 2 of
    % \fnsymbol (the dagger) rather than the next running footnote number.
    % The running footnote counter is left unchanged.
    \newcommand*{\daggerfootnote}[1]{%
        % Remember the running footnote number so it can be restored below.
        \setcounter{daggerfootnote}{\value{footnote}}%
        % Change the mark style inside a group: leaving the group restores
        % whatever \thefootnote was before, instead of forcing \arabic.
        \begingroup
            \renewcommand*{\thefootnote}{\fnsymbol{footnote}}%
            \footnote[2]{#1}%
        \endgroup
        \setcounter{footnote}{\value{daggerfootnote}}%
        }

\relax %

	\usepackage[american]{babel}
	\usepackage{csquotes}

    % Citation back end. The \ifbiblatex branch sets up biblatex/biber with
    % author-year citations; the \else branch provides \parencite and
    % \textcite on top of natbib so the body can use one set of commands.
    \ifbiblatex
	\usepackage[backend=biber, style=authoryear]{biblatex}
	\DeclareLanguageMapping{american}{american-apa}
	\addbibresource{refs.bib}

	% The formats and macro redefinitions below make the whole citation a
	% single hyperlink: the citation is printed through a format that wraps
	% it in \bibhyperref, while the inner bibhyperref format is aliased to
	% noformat so that links are not nested.
	\DeclareFieldFormat{citehyperref}{%
	  \DeclareFieldAlias{bibhyperref}{noformat}%
	  \bibhyperref{#1}}

	% As above for \textcite; additionally closes a pending parenthesis
	% (cbx:parens) inside the link.
	\DeclareFieldFormat{textcitehyperref}{%
	  \DeclareFieldAlias{bibhyperref}{noformat}%
	  \bibhyperref{%
	    #1%
	    \ifbool{cbx:parens}
	      {\bibcloseparen\global\boolfalse{cbx:parens}}
	      {}}}

	% Keep the original macros so the redefinitions can call them.
	\savebibmacro{cite}
	\savebibmacro{textcite}

	\renewbibmacro*{cite}{%
	  \printtext[citehyperref]{%
	    \restorebibmacro{cite}%
	    \usebibmacro{cite}}}

	% The full-link format is switched off (aliased to noformat) when the
	% first citation has a prenote or the last one has a postnote.
	\renewbibmacro*{textcite}{%
	  \ifboolexpr{
	    ( not test {\iffieldundef{prenote}} and
	      test {\ifnumequal{\value{citecount}}{1}} )
	    or
	    ( not test {\iffieldundef{postnote}} and
	      test {\ifnumequal{\value{citecount}}{\value{citetotal}}} )
	  }
	    {\DeclareFieldAlias{textcitehyperref}{noformat}}
	    {}%
	  \printtext[textcitehyperref]{%
	    \restorebibmacro{textcite}%
	    \usebibmacro{textcite}}}

	% \brakcite: citation printed inside square brackets, hyperlinked.
	\DeclareCiteCommand{\brakcite}
	  {\usebibmacro{prenote}}
	  {\usebibmacro{citeindex}%
	   \printtext[bibhyperref]{[\usebibmacro{cite}]}}
	  {\multicitedelim}
	  {\usebibmacro{postnote}}
      
     \else
     % natbib fallback: map the biblatex command names onto \citep / \citet.
     \gdef\parencite{\citep}
     \gdef\textcite{\citet}
     \makeatletter
     \makeatother
     \fi



\newcommand\discard[1]{}
\newcommand\zogamma{{\mathrlap{\raisebox{-0.1ex}{$\hat{\phantom{x}}$}}\gamma}}

\usepackage{xr}
\externaldocument{richardson_797}



\newcommand{\swap}[3][-]{#3#1#2} %

\title{Inference for Probabilistic Dependency Graphs (Supplementary material)}

\author[1]{Oliver~E.~Richardson}
\author[1]{Joseph~Y.~Halpern}
\author[1]{Christopher De Sa}
\affil[1]{%
    Department of Computer Science\\
    ~Cornell University\\
    ~Ithaca NY 14853
}
  
\begin{document}

\onecolumn
\maketitle
\appendix

% \begin{abstract}
%     Probabilistic dependency graphs (PDGs)
%     are a flexible class of probabilistic graphical models,
%     subsuming Bayesian Networks and Factor Graphs.
%     They can also capture inconsistent beliefs, and provide a way of measuring the degree of this inconsistency.
%     We present the first tractable inference algorithm for
%     PDGs with discrete variables,
%     making the asymptotic complexity of PDG inference similar to
%     that of the graphical models they generalize. 
%     The key components are:
%     (1) the observation that PDG inference can be reduced to convex optimization with exponential cone constraints, 
%     (2) a construction that allows us to express these problems compactly for PDGs of bounded treewidth, for which we needed to further develop the theory of PDGs, and
%     (3) an appeal to interior point methods that can solve such problems in polynomial time.
%     We verify the correctness and time complexity of our approach, 
%     and provide an implementation of it.
%     We then evaluate our implementation, and demonstrate that 
%     it outperforms
%     baseline approaches.
%     Our code is available at {\small\url{github.com/orichardson/pdg-infer-uai}}.
% \end{abstract}


% \input{inference-paper-body.tex}
% \Par{x}: upright "Par" applied to x, typeset as Par(x).
\def\Par#1{\mathrm{Par}(#1)}
% \Pash: the italic letters "VCP" with tightened spacing. In this file it is
% used with a cluster subscript, as in \Pash_C(c); its meaning is defined in
% the main paper.
\def\Pash{\mathit{V\mskip-5muC\mskip-3.5muP\!}}

%oli9: if not full version, need extra section
\ifvfull\else
\section{Full Descriptions of Clique Tree Convex Programs}
\label{appendix:prob-details}

\subsection{Finding a Clique Tree for \texorpdfstring{$\zogamma$}{gamma}-Inference, when \texorpdfstring{$\gamma$}{gamma} is Small}

%oli4: added
We now give the rest of the details for problem \eqref{prob:cluster-small-gamma},
which was described in \cref{sec:clique-tree-expcone}, and we claimed was correct in \cref{prop:cluster-small-gamma-correct}. 
Over vectors
$\mat v, \bmu \in \Rext^{\V\C}$ and
$\mat u \in \Rext^{\V\!\Ar}$,
the problem is formally given by:

{\allowdisplaybreaks%
\begin{align*}
    \minimize_{\bmu, \mat u, \mat v}\quad&
    % \limits_{{\subalign{
    %     \bmu &= \textstyle [\mu_C(c)]_{(C,c) \in \V\C} \\
    %     \mat u &=\textstyle [u_{a,s,t}]_{(a,s,t) \in \V\!\Ar}\\
    %     \mat v &=\textstyle [v_{C,c}]_{C \in \C\!,\,c \in \V(C)}\\[-5ex]
    % }}} &~~
    % \Vert t \Vert_1
    % \sum_{L}\beta_L\, | \mat u\ssub {\mskip 1muL}\mskip 1mu |
    % \sum_{a \in \Ar} (\beta_a - \alpha_a \gamma) \, \sum_{\mathrlap{\!\!\!\!\! s,t \in \V(\Src a\!, \Tgt a)}} u^a_{s,t} ~\quad +~  \sum_{\mathclap{w \in \V\!\X}} v_w
    \sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}} (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
    % + \gamma \sum_{C \in \C} \sum_{{c \in \V(C)}} v_{C,c}
    + \gamma \sum_{\mathclap{(C,c) \in \V\C}}  v_{C,c}
    \numberthis\label{prob:cluster-small-gamma}
    % \\[-0.2ex]
    - \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}
        % \beta_a\, 
        \alpha_a\gamma\,
        \mu_{C\!_a}\!(\Src a{=}s,\Tgt a {=} t)
        %oli6: made space to expand this
        % \log \p_a (t|s)
        \log \p_a (\Tgt a{=}t\mid s)
\\[0.2ex]
\subjto\quad&
    \forall C \in \C.~\mu_C \in \Delta\V(C), \\[-0.2ex]
    %%% exponential constraints
    &\forall a \in \Ar.~
        \big(\!- \! \mat u_a,\, \mu_{C\!_a}\!(\Src a,\mskip-2mu \Tgt a),\, \mu_{C\!_a}\!(\Src a) \p_a(\Tgt a | \Src a)\big) \in K_{\exp}^{\V a}, \\
    % \forall a \in \Ar.~
    %     &\big(-\mat u_a, \mu( \Tgt a,\Src a),\p_a(\Tgt a | \Src a)  \mu(\Src a) \big)
    %         \in K_{\exp}^{\V a}, \\[-0.1ex]
    %%% hard constraints
    &\forall (a,s,t) \in \V\!\Ar^0\!.~
    % \forall a \in \Ar. &~ \forall (s,t) \in \V(\Src a, \Tgt a)
    % \text{ s.t. } \p_a(t|s) \!=\! 0.~
    \mu_{C\!_a}\!(\Src a{=}\mskip2mus, \Tgt a{=}\mskip2mut) = 0, \\[-0.2ex]
    % & (-\mat v,  \mu,  \mat 1) \in K_{\exp}^{\V\!\X} \\
    %%% marginal constraints
    &\forall (C,D) \in \mathcal T.~~ \mu_{C}(C \cap D) = \mu_{D}(C \cap D),\\[-0.3ex]
    %%% ent tol constraints
    % \forall C & \in \C.~~
    %oli4: the next line is a little problematic. 
    % \forall C \in \C.& ~~(-\mat v_C, \mu_C, \mu_C(\Pash_C)) \in K_{\exp}^{\V(C)}
    %one solution is component wise;
    % \forall C \in \C.& ~
    % \forall c \in C.~~ (-\mat v_{C,c}, \mu_{C}(c), \mu_C(\Pash_C(c))) \in K_{\exp}
    %or more compactly:
    % \forall (C,c) \in \V &\C.~~ (-\mat v_{C,c},\, \mu_{C}(c),\, \mu_C(\Pash_C(c))) \in K_{\exp} \\
        % another is to define a vector for the last bit
    % \forall C \in \C.& ~
    % ~(-\mat v_C, \mu_C, [\,\mu_C(\Pash_C(c))\,]_{c \in \V(C)}) \in K_{\exp}^{\V(C)}
    &\big(-\mat v,\,\bmu,\, [\,\mu_{C}(\Pash_C(c))\,]_{(C,c)\in\V\C}\big) \in K_{\exp}^{\V\C}
    .
%%%%%%%%%%%%% DESTRUCTURED VERSION
% \minimize_{\bmu, \mat u, \mat v} & ~~
%     \sum_{\mathrlap{\!\!\!(a,s,t, \alpha,\beta) \in \langle\V\!{\Ar,\balpha,\bbeta}\rangle}} (\beta \!- \alpha \gamma) u_{a,s,t}
%     + \gamma \sum_{C \in \C} \sum_{{c \in \V(C)}} v_{C,c}
%     \\[-0.2ex]
%     &\!
%     - \sum_{\mathrlap{\!\!\!(a,S,s,T,t,\beta,p) \in \langle\smash{\V\!\ArST^+},\bbeta,\mathbb P\rangle}} \subafalse
%         \beta\, \mu_{C\!_a}\!(\Src a{=}s,\Tgt a {=} t) \log p (t|s)
%         \numberthis\label{prob:cluster-small-gamma}
% \\[0.2ex]
% \subjto&\quad
%     \forall C \in \C.~\mu_C \in \Delta\V(C), \\[-0.2ex]
%     %%% exponential constraints
%     \mathllap{\forall (a,S,T,p) \in \langle\ArST,\mathbb P\rangle}.~ 
%         \big(&\subafalse\!- \! \mat u_a,\, \mu_{C\!_a}\!(\Src a,\mskip-2mu \Tgt a),\, \mu_{C\!_a}\!(\Src a) p(\Tgt a | \Src a)\big) \in K_{\exp}^{\V a}, \\
%     %%% hard constraints
%     \forall (a,S,T,s,t) & \in \V\!\ArST^0\!.~\subafalse
%     \mu_{C\!_a}\!(\Src a{=}\mskip2mus, \Tgt a{=}\mskip2mut) = 0, \\[-0.2ex]
%     %%% marginal constraints
%     \forall (C,D) &\in \mathcal T.~~ \mu_{C}(C \cap D) = \mu_{D}(C \cap D),\\[-0.3ex]
%     %%% ent tol constraints
%     \forall C \in \C.& ~
%     ~(-\mat v_C, \mu_C, [\,\mu_C(\Pash_C(c))\,]_{c \in \V(C)}) \in K_{\exp}^{\V(C)}
%     .
\end{align*}}%
This is our most complex problem, but it is made up of building blocks
that we have discussed at length in the main paper: the objective comes from \eqref{eq:altscore}, the first and fourth lines of constraints restrict to calibrated clique trees as in \eqref{prob:cluster-inc}, the second and third lines of constraints are essentially present in \eqref{prob:joint-small-gamma},
and the final constraint comes from \eqref{eq:cluster-ent-decomp}.

\subsection{Finding a Clique Tree for $0^+$\!-Inference}
Next, we deal with the analogous construction of the clique tree optimization for $0^+$\!-inference.  
We begin with the straightforward adaption of the relevant prerequisites in \Cref{sec:empirical-limit}.
%
Suppose that $\boldsymbol\nu = \{\nu_C : C \in \C\}$ is a calibrated clique tree over the tree decomposition $(\C, \mathcal T)$ representing a distribution $\Pr_{\boldsymbol\nu} \in \bbr{\dg M}^*_0$, say obtained by solving \eqref{prob:cluster-inc}.
 % representing
%
For $C \in \C$, let $\Ar_C:= \{ a \in \Ar : C_a = C\}$ be the set of
edges assigned to cluster $C$, and let
% $$
% \boldsymbol\psi_C  := \prod_{\substack{L \in \Ed\\C_L = C}} \nu_C (\Tgt Lw | \Src Lw)^{\alpha\ssub L}
% $$
\[
    %oli4:
    % \boldsymbol\psi_C
    %oli4: also adding parens as requested + (changed the letter w to c, for consistency)
    %oli5: bigger vectorization
    % \mat k_C := \bigg[ \prod_{a \in \Ar_C} \nu_C (\Tgt a (c) | \Src a (c))^{\alpha\ssub L} \bigg]_{c \in \V(C)}
    \mat k := \bigg[ \prod_{\mathrlap{a \in \Ar_C}} \nu_C (\Tgt a (c) | \Src a (c))^{\alpha_a} \bigg]_{(C,c) \in \V \C} \in \Rext^{\V\C}
\]
%oli5
% be the analogue of \eqref{eq:cm-product} local to the cluster $C$.
be the analogue of \eqref{eq:cm-product} for a cluster tree.
Once again, consider
% $\mat u := [ u^C_c ]^{C \in \C}_{c \in \V(C)}$,
% $\mat u := [ u_{(C,c)} ]_{C \in \C,\,c \in \V(C)}$,
$\mat u := [ u_{(C,c)} ]_{(C,c) \in \V\C}$,
in the optimization problem
%
\begin{align*}
\minimize_{\bmu, \mat u} \quad&
    % \sum_{w \in \V\X} u_w
    \mat 1^{\sf T} \mat u
    % |\,\mat u\,|
    \numberthis\label{prob:cluster+idef}\\
\subjto\quad&
    \forall C \in \C.~\mu_C \in \Delta\V(C), \\[-0.2ex]
    %oli5: trying cleaner notation. This is option 0
    % \forall C \in \C.~&
    %     (-\mat u_C,\,  \mu_C,\,
    %         \mat k_C \! \odot
    %         [\,\mu_C(\Pash_C(c)) ]_{c \in \V\mskip-1mu C}
    %         ) \in K_{\exp}^{\V(C)}, \\[-0.2ex]
    %oli5: option 1: fully scalarized;
    % \forall (C,c) \in\,& \V\C.~
    %     (- u_{C,c},\,  \mu_{C,c},\,
    %         k_{C,c} \,\mu_C(\Pash_C(c)) 
    %         ) \in K_{\exp}, \\[-0.2ex]
    %oli5: option 2: fully vectorized
     &\big({-}\mat u,\,  \bmu,\,\, 
            \mat k \odot
            \big[\;\mu_C(\Pash_C(c))\;\big]_{(C,c) \in \V\C}
            \big) \in K_{\exp}^{\V\C}, \\[-0.2ex]
    % conditional marginal matching
    &\forall a \in \Ar.~~\mu_{C_{\!a}}\!(\Src a, \Tgt a) \nu_{C_{\!a}}\!(\Src a) = \mu_{C_{\!a}}\!(\Src a) \nu_{C_{\!a}}\!(\Src a, \Tgt a)\\
    % marginal constraints
    &\forall (C,D) \in \mathcal T.~~ \mu_{C}(C \cap D) = \mu_{D}(C \cap D).
\end{align*}
The biggest change is in the second constraint: 
%oli5:
% cluster $C$'s upper bound $\mat u_C$ now only accounts 
% the additional factor on the right means that
intuitively, the upper bounds at each cluster $C$ now only account
for the \emph{additional} entropy not already modeled by 
%oli5:
% its
$C$'s
ancestors.
By \cref{prop:cluster+idef-correct}, if given a proper PDG, the output $\bmu$ of this problem represents the unique distribution in the $0^+$\!-semantics.

\clearpage
\fi

\section{Proofs}

\textbf{\cref{prop:joint-inc-correct}.}
\textit{
If $(\mu, \mat u)$ is a solution to \eqref{prob:joint-inc}, then
$\mu \in \bbr{\dg M}_0^*$,
and
$%
    \sum_{(a,s,t) \in \V\!\Ar} \beta_a u_{a,s,t} = \aar{\dg M}_0$.}


%joe7: no need; your proof is careful enough.
%We'll do the first one particularly carefully.
\begin{lproof}
    \label{proof:joint-inc-correct}
    Suppose that $(\mu, \mat u)$ is a solution to \eqref{prob:joint-inc}.
    The exponential cone constraints ensure that, for every $(a, s,t) \in \V\!\Ar$, 
    $$
        u_{a,s,t} \ge \mu(s,t) \log \frac{\mu(s,t)}{\p_a(t|s)\mu(s)}
    $$
    where $\mu(s,t)$ and $\mu(s)$, as usual, are shorthand for $\mu(\Src a{=}s, \Tgt a{=}t)$ and $\mu(\Src a {=} s)$, respectively.
    
    Suppose, for contradiction, that one of these inequalities is strict at some index $(a',s',t') \in \V\!\Ar$ for which $\beta_{a'} > 0$.
    Explicitly, this means
    $$
        u_{a',s',t'} > \mu(s',t') \log \frac{\mu(s',t')}{\p_{a'}(t'|s')\mu(s')}.
    $$
    In that case, we can define a vector $\mat u' = [u'_{a,s,t}]_{(a,s,t)\in\V\!\Ar}$ which is identical to $\mat u$, except that at $(a',s',t')$, it is halfway between the two quantities described as different above.  More precisely:
    $$
        u'_{a',s',t'} = \frac12 u_{a',s',t'} + \frac12 \mu(s',t') \log \frac{\mu(s',t')}{\p_{a'}(t'|s')\mu(s')}.
    $$
    Note that $u'_{a',s',t'} < u_{a',s',t'}$,
    and also that, by construction, $(\mu, \mat u')$ also satisfies the constraints of \eqref{prob:joint-inc}.
    In more detail: for $(a', s', t')$ it doesn't violate the associated exponential cone constraint, as
    $$
        \left( \text{formally:} \quad
        u'_{a',s',t'} = \frac12 u_{a',s',t'} + \frac12 \mu(s',t')\log \frac{\mu(s',t')}{\p_{a'}(t'|s')\mu(s')}
        > 
        % \frac12 \mu(s_0,t_0) \log \frac{\mu(s,t)}{\p_a(t|s)\mu(s)} + \frac12 \mu(s_0,t_0) \log \frac{\mu(s,t)}{\p_a(t|s)\mu(s)}
        % =
        \mu(s',t') \log \frac{\mu(s',t')}{\p_{a'}(t'|s')\mu(s')}
        \right),
    $$
    and $\mat u'$ remains unchanged at the other indices, and so satisfies the constraints at those indices, because $\mat u$ does. 
    But now, because $u'_{a', s', t'} < u_{a',s',t'}$, and $\beta_{a'} >0$, we also have
    \[
        \sum_{(a,s,t) \in \V\!\Ar} \beta_a u_{a,s,t}
            > \sum_{(a,s,t) \in \V\!\Ar} \beta_a u'_{a,s,t}.
    \]
    Thus the objective value at $(\mu, \mat u')$ is strictly
    smaller than the one at $(\mu, \mat u)$, both of which are feasible points. 
    This contradicts the assumption that $(\mu, \mat u)$ is optimal. 
    We therefore conclude that none of these inequalities can be strict at points where $\beta_{a} > 0$. 
    This can be compactly written as: 
    \begin{align*}
        \forall (a,s,t) \in \V\!\Ar.\quad
        \beta_a u_{a,s,t} &= \beta_a \mu(s,t) \log \frac{\mu(s,t)}{\p_a(t|s)\mu(s)} \\
        \implies\qquad
        \sum_{(a,s,t) \in \V\!\Ar}\beta_a u_{a,s,t} 
            &= \sum_{(a,s,t) \in \V\!\Ar} \beta_a \mu(s,t) \log \frac{\mu(s,t)}{\p_a(t|s)\mu(s)} 
            %\\&
            = \OInc_{\dg M}(\mu).
    \end{align*}
    In other words, the objective of problem \eqref{prob:joint-inc} at
    $(\mu, \mat u)$ is equal to the observational incompatibility $\OInc_{\dg M}(\mu)$ of $\mu$ with $\dg M$. 
     % because $(\mu, \mat u)$ minimizes this quantity, 
    And, because $(\mu, \mat u)$ minimizes this value among all joint distributions, $\mu$ must be a minimum of $\OInc_{\dg M}$. 
    
    More formally: assume for contradiction that $\mu$ is not a minimizer of $\OInc_{\dg M}$. Then there would be some other distribution $\mu'$ for which $\OInc_{\dg M}(\mu') < \OInc_{\dg M}(\mu)$. 
    Let $\mat u'' := [ \mu'(s,t) \log \frac{\mu'(s,t)}{\p_a(t|s) \mu'(s)} ]_{(a,s,t) \in \V\!\Ar}$. Clearly $(\mu', \mat u'')$ satisfies the constraints of the problem, and moreover,
    \[
        \sum_{(a,s,t)\in \V\!\Ar} \beta_a u_{a,s,t} =
        \OInc_{\dg M}(\mu) >
        \OInc_{\dg M}(\mu') =            
        \sum_{(a,s,t)\in \V\!\Ar} \beta_a u''_{a,s,t},
    \]
    contradicting the assumption that $(\mu, \mat u)$ is optimal for problem \eqref{prob:joint-inc}. Thus, $\mu$ is a minimizer of $\OInc_{\dg M}$, and the objective value is $\inf_{\mu} \OInc_{\dg M}(\mu) = \aar{\dg M}_0$, as desired.  
\end{lproof}

\textbf{\cref{prop:joint-small-gamma-correct}.}\textit{
If $(\mu, \mat u, \mat v)$ is a solution to \eqref{prob:joint-small-gamma},
and $\bbeta \ge \gamma \balpha$,
then
$\mu$ is the unique element of
$\bbr{\dg M}^*_\gamma$, and $\aar{\dg M}_\gamma$
equals the objective of \eqref{prob:joint-small-gamma} evaluated at $(\mu, \mat u, \mat v)$.}

For convenience, we repeat problem \eqref{prob:joint-small-gamma}
(left) and an equivalent variant of it that we implement (right) below.

\begin{minipage}{0.49\linewidth}
\begin{align*}
\minimize_{\mu, \mat u, \mat v} & ~~
    % \Vert t \Vert_1
    % \sum_{L}\beta_L\, | \mat u\ssub {\mskip 1muL}\mskip 1mu |
    % \sum_{a \in \Ar} (\beta_a - \alpha_a \gamma) \, \sum_{\mathrlap{\!\!\!\!\! s,t \in \V(\Src a\!, \Tgt a)}} u^a_{s,t} ~\quad +~  \sum_{\mathclap{w \in \V\!\X}} v_w
    \sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}}
        (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
        \,+
        \gamma
        \sum_{\mathclap{w \in \V\!\X}} v_w
    \tag{\ref{prob:joint-small-gamma}}
\\[-0.2ex]
    &\qquad
    - \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}} 
        \alpha_a \gamma \, 
        \mu(\Src a{=}s,\Tgt a {=} t) \log \p_a (t|s)
\\[0.2ex]
\subjto&\quad \mu \in \Delta\V\!\X, 
        \quad ( -\mat v,  \mu,  \mat 1) \in K_{\exp}^{\V\!\X},
\\[-0.4ex]
    \forall a \in \Ar.~
        &\big(-\mat u_a, \mu( \Tgt a,\Src a),\p_a(\Tgt a | \Src a)  \mu(\Src a) \big)
            \in K_{\exp}^{\V a}, 
\\[-0.2ex]
    \forall (a,s,t) &\in \V\!\Ar^0\!.~
    % \forall a \in \Ar. &~ \forall (s,t) \in \V(\Src a, \Tgt a)
    % \text{ s.t. } \p_a(t|s) \!=\! 0.~
    \mu(\Src a{=}\mskip2mus, \Tgt a{=}\mskip2mut) = 0;
\end{align*}
\end{minipage}
%
~~\vrule~~
%
\begin{minipage}{0.49\linewidth}
\begin{align*}
\minimize_{\mu, \mat u, \mat v} & ~~
    \sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}}
        (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
        \,+
        \gamma
        \sum_{\mathclap{w \in \V\!\X}} v_w
        \tag{\ref*{prob:joint-small-gamma}b}\label{prob:joint-small-gamma-b}
\\[-0.2ex]
    &\qquad
    - \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}} 
        \beta_a \, 
        \mu(\Src a{=}s,\Tgt a {=} t) \log \p_a (t|s)
\\[0.2ex]
\subjto&\quad \mu \in \Delta\V\!\X, 
        \quad ( -\mat v,  \mu,  \mat 1) \in K_{\exp}^{\V\!\X},
\\[-0.4ex]
    \forall a \in \Ar.~
    &\big(-\mat u_a, \mu( \Tgt a,\Src a),
        \big[\,\mu(\Src a=s) \big]_{(s,t) \in \V a} \big)
        \in K_{\exp}^{\V a}, 
\\[-0.2ex]
    \forall (a,s,t) &\in \V\!\Ar^0\!.~
    % \forall a \in \Ar. &~ \forall (s,t) \in \V(\Src a, \Tgt a)
    % \text{ s.t. } \p_a(t|s) \!=\! 0.~
    \mu(\Src a{=}\mskip2mus, \Tgt a{=}\mskip2mut) = 0.
\end{align*}
\end{minipage}
\medskip

\begin{lproof}\label{proof:joint-small-gamma-correct}
    We start with the problem on the left, which is \eqref{prob:joint-small-gamma} from the main text.
    Suppose that $(\mu, \mat u, \mat v)$ is a solution to \eqref{prob:joint-small-gamma}.
    The exponential constraints ensure that
    \[
        \forall (a,s,t) \in \V\!\Ar.~
        u_{a,s,t} \ge \mu(s,t) \log \frac{\mu(t|s)}{\p_a(t|s)}
    \qquad\text{and}\qquad
        \forall w \in \V \X.~
        v_{w} \ge \mu(w) \log \mu(w).
    \]
    As in the previous proof, we claim that these must hold with equality (except possibly for $u_{a,s,t}$ at indices satisfying $\beta_a = \gamma \alpha_a$, when it doesn't matter). 
    This is because otherwise one could reduce the value of a component of $\mat u$ or $\mat v$ while still satisfying all of the constraints, to obtain a strictly smaller objective, contradicting the assumption that $(\mu, \mat u, \mat v)$ minimizes it. 
    
    Thus, $\mat v$ is a function of $\mu$, as is every value of $\mat u$ that affects the objective value of \eqref{prob:joint-small-gamma}, meaning that this objective value can be written as a function of $\mu$ alone: 
    \begin{align*}
    % \textit{(\ref{prob:joint-small-gamma}.obj)} =
        &\sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}}
            (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
        ~+ \gamma \sum_{\mathclap{w \in \V\!\X}} v_w
        ~- \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}
            \alpha_a\gamma \, \mu(s,t) \log \p_a (t|s) \\
    &=
        \sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}}
            (\beta_a \!- \alpha_a \gamma) \pqty*{\mu(s,t) \log \frac{\mu(t|s)}{\p_a(t|s)}}
        ~+~ \gamma \sum_{\mathclap{w \in \V\!\X}} \mu(w) \log \mu(w)
        ~-~ \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}
            \alpha_a\gamma \, \mu(s,t) \log \p_a (t|s) \\
    &=
        \sum_{a \in \Ar} (\beta_a \!- \alpha_a \gamma) \sum_{(s,t) \in \V a}
             \pqty*{\mu(s,t) \log \frac{\mu(t|s)}{\p_a(t|s)}}
        - \gamma \H(\mu)
        - \sum_{a \in \Ar} \alpha_a\gamma \, \sum_{(s,t) \in \V a}
             \mu(s,t) \log \p_a (t|s) \\
     &=
         \sum_{a \in \Ar} (\beta_a \!- \alpha_a \gamma)
          \sum_{(s,t) \in \V a}
             \mu(s,t) \pqty*{\log \frac{1}{\p_a(t|s)} - \log \frac{1}{\mu(t|s)}}
         - \gamma \H(\mu)
         - \sum_{a \in \Ar} \alpha_a\gamma \, \Ex_{\mu} [ \log \p_a (\Tgt a|\Src a) ] \\
    &= 
        \sum_{a \in \Ar} (\beta_a \!-\! \alpha_a \gamma)
           \Ex_{\mu}[ - \log \p_a(\Tgt a | \Src a)]
        - \sum_{a \in \Ar} (\beta_a \!-\! \alpha_a \gamma)
           \H_{\mu}(\Tgt a | \Src a)
        - \gamma \H(\mu)
        - \sum_{a \in \Ar} \alpha_a\gamma \, \Ex_{\mu} [ \log \p_a (\Tgt a|\Src a) ] \\
    &= 
        \sum_{a \in \Ar} \Big( - \alpha_a\gamma - (\beta_a \!-\! \alpha_a \gamma) \Big)
           \Ex_{\mu}[ \log \p_a(\Tgt a | \Src a)]
        + \sum_{a \in \Ar} (\alpha_a \gamma \!-\! \beta_a)
           \H_{\mu}(\Tgt a | \Src a)
        - \gamma \H(\mu) \\
    &= 
        -\sum_{a \in \Ar} \beta_a
           \Ex_{\mu}[ \log \p_a(\Tgt a | \Src a)]
        + \sum_{a \in \Ar} (\alpha_a \gamma \!-\! \beta_a)
           \H_{\mu}(\Tgt a | \Src a)
        - \gamma \H(\mu).
    \end{align*}
    (In the third step, we were able to convert $\V\!\Ar^+$ to $\V\!\Ar$ because, as usual when dealing with information-theoretic quantities, we interpret $0 \log \frac{1}0$ as equal to zero, which is its limit.) 
    
    The algebra, for the right side variant 
    % (\ref{prob:joint-small-gamma}b)
    \eqref{prob:joint-small-gamma-b}
    is slightly simpler. In this case the middle conic constraint is almost the same, except that $\p_a(t|s)$ has been replaced with $1$, and so it ensures that $u_{a,s,t} = \mu(s,t) \log \mu(t\mid s)$ (i.e., the same as before, but without the probability in the denominator). So,
    \begin{align*}
    % \textit{(\ref{prob:joint-small-gamma}b.obj)} =
        &\sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}}
            (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
        ~+ \gamma \sum_{\mathclap{w \in \V\!\X}} v_w
        ~- \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}
            \beta_a \, \mu(s,t) \log \p_a (t|s) \\
    &=
        \sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}}
            (\beta_a \!- \alpha_a \gamma) \mu(s,t) \log \mu(t|s)
        ~+~ \gamma \sum_{\mathclap{w \in \V\!\X}} \mu(w) \log \mu(w)
        ~-~ \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}
            \beta_a \, \mu(s,t) \log \p_a (t|s) \\
    &=
        \sum_{a \in \Ar} (\beta_a \!- \alpha_a \gamma) \sum_{(s,t) \mathrlap{\in \V a}}
             \mu(s,t) \log \mu(t|s)
        - \gamma \H(\mu)
        - \sum_{a \in \Ar} \beta_a \, \sum_{(s,t) \mathrlap{\in \V a}}
             \mu(s,t) \log \p_a (t|s) \\
        &= 
        % -
        \sum_{a \in \Ar}
         (\alpha_a \gamma \!-\! \beta_a)
         % (\beta_a  \!-\! \gamma \alpha_a)
           \H_{\mu}(\Tgt a | \Src a)
        - \gamma \H(\mu)
        -\sum_{a \in \Ar} \beta_a
           \Ex_{\mu}[ \log \p_a(\Tgt a | \Src a)].
    \end{align*}
    
    
    In either case, the objective value is equal to $\bbr{\dg M}_\gamma(\mu)$, by \eqref{eq:altscore}.
    % Since another distribution with lower would make $(\mu, \mat u, \mat v)$ 
    Because $(\mu, \mat u, \mat v)$ is optimal for this problem, we know that $\mu$ is a minimizer of $\bbr{\dg M}_{\gamma}(\mu)$, and that the objective value equals $\aar{\dg M}_\gamma$. 
\end{lproof}


\begin{lemma}\label{lem:hess-relent}
    The gradient and Hessian of the conditional relative entropy
    are given by
    \begin{align*}
        \Big[ \nabla_{\mu} \kldiv{\mu(X,Y)}{\mu(X) p(Y|X) } &\Big]_u
            = \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)} \\
        \Big[ \nabla^2_\mu \kldiv{\mu(X,Y)}{\mu(X)p(Y|X)}&\Big]_{u,v}
            = \frac{\mathbbm1[X\!u {=} X\!v \land Y\!u {=} Y\!v]}{\mu(Y\! u, X\! u)}
            - \frac{\mathbbm1[X\!v = X\!u]}{\mu(X\!u)}
        .
    \end{align*}
\end{lemma}
\begin{lproof} 
    \allowdisplaybreaks

    \def\pd/d#1[#2]{\,\frac{\partial}{\partial #1}\!\! \left[\vphantom{\Big|}#2\right]}
   
    Represent $\mu$ as a vector $[\mu_{w}]_{w \in \V\X}$.
    We will make repeated use of the following facts:
    \begin{align*}
        \pd/d\mu_u [\mu(X{=}x)]=
        \pd/d\mu_u [\mu(x)]
         &= \sum_w \pd/d\mu_u[\mu_w] \! \mathbbm1[X\!w{=}x]
            ~=~  \mathbbm1[ X\! u {=} x] ; \quad\text{and}\\
        % \pd/d \mu_u [\mu(Y{=}y|X{=}x)] = 
        \pd/d\mu_u [\mu(y|x)] &=  
            % \pd/d \mu_u [ \frac{\mu(X{=}x,Y{=}y)}{\mu(X{=}x)}]
            \pd/d\mu_u [ \frac{\mu(x,y)}{\mu(x)}] \\
        &= - \mu(x,y) \pd/d\mu_u[ \frac{1}{\mu(x)} ]
            + \frac1{\mu(x)} \pd/d\mu_u[ \mu(x,y) ] \\
        &= - \frac{\mu(x,y)}{\mu(x)^2} 
            \mathbbm1[ X\!u = x] + \frac{1}{\mu(x)} \mathbbm1[XY(u){=}xy] \\
        &= \frac{\mathbbm1[X\!u = x]}{\mu(x)}\Big( \mathbbm1[Y\!u=y] - \mu(y|x) \Big).
        % \implies\quad\nabla_\mu \mu(y|x) &= \frac{\mathbbm1_x}{\mu(X)}\Big( \mathbbm1_y - \mu(y|x) \Big).
    \end{align*}
    
    We now apply this to the (conditional) relative entropy:
    \begin{align*}
        &\pd/d\mu_u [ \kldiv{\mu(X,Y)}{\mu(X) p(Y|X) }] \\
            &= \pd/d\mu_u [ \sum_{w} \mu_w \log \frac{\mu(Y\! w | X\! w)}{ p(Y\! w | X\! w)} ] \\
            &= \sum_{w} \mathbbm1[u{=}w]  \log \frac{\mu(Y\! w | X\! w)}{  p(Y\! w | X\! w)}
                + \sum_{w} \mu_w  \pd/d\mu_u [  \log \frac{\mu(Y\! w | X\! w)}{  p(Y\! w | X\! w)} ] \\
            &=  \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)}
                + \sum_{w} \mu_w  
                \frac{  p(Y\! w | X\! w)}{\mu(Y\! w | X\! w)}
                \pd/d\mu_u [ \frac{\mu(Y\! w | X\! w)}{  p(Y\! w | X\! w)} ] \\
            &=  \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)}
                + \sum_{w} \mu_w  
                \frac{1}{\mu(Y\! w | X\! w)}
                \pd/d\mu_u [\mu(Y\! w | X\! w) ] \\
            &=  \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)}
                + \sum_{w} \mu_w  
                \frac{1}{\mu(Y\! w | X\! w)}
                 \frac{\mathbbm1[X\!u = X\!w]}{\mu(X\!w)}\Big( \mathbbm1[Y\!u=Y\!w] - \mu(Y\!w|X\!w) \Big)\\
            &=  \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)}
                + \sum_{w} \mu_w  \frac{\mathbbm1[X\!u{=}X\!w \land Y\!u{=}Y\!w]}
                    {\mu(X\!w, Y\!w)}
                - \sum_{w} \mu_w \frac{\mathbbm1[X\!u = X\!w]}{\mu(X\!w)}
                \\
            &=  \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)}
                + \frac{1}{\mu(X\!u, Y\!u)} \sum_w \mu_w  \mathbbm1[X\!u{=}X\!w \land Y\!u{=}Y\!w]
                - \frac{1}{\mu(X\!u)} \sum_w \mu_w \mathbbm1[X\!u = X\!w]
                \\
            &=  \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)}
                + \frac{\mu(X\!u, Y\!u)}{\mu(X\!u, Y\!u)} 
                - \frac{\mu(X\!u)}{\mu(X\!u)}   \\
            &= \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)}
    \end{align*}
    
    This allows us to compute the Hessian of the conditional relative entropy, whose  components are 
        % the second partial derivatives
    \begin{align*}
        \frac{\partial^2}{\partial \mu_u \partial \mu_v} \Big[ \kldiv{\mu(XY)}{\mu(X)p(Y|X)}\Big] 
        &= 
        \pd/d\mu_v[ \log \frac{\mu(Y\! u | X\! u)}{  p(Y\! u | X\! u)} ] \\
        &=
        \frac{ p(Y\! u | X\! u)}{\mu(Y\! u | X\! u)} \frac1{ p(Y\! u | X\! u)}
        \pd/d\mu_v[ \mu(Y\! u | X\! u) ] \\
        &= \frac{1}{\mu(Y\! u | X\! u)}
            \frac{\mathbbm1[X\!v{=}X\!u]}{\mu(X\!u)}\Big( \mathbbm1[Y\!v{=}Y\!u] - \mu(Y\!u|X\!u) \Big)\\
        &= \frac{\mathbbm1[X\!u {=} X\!v \land Y\!u {=} Y\!v]}{\mu(Y\! u, X\! u)}
            - \frac{\mathbbm1[X\!v = X\!u]}{\mu(X\!u)}
        % = - \frac{\mathbbm1[X\!v{=}X\!u]}{\mu(X\!u)}\Big(1 - \frac{\mathbbm1[Y\!v{=}Y\!u]}{\mu(Y\! u | X\! u)} \Big)
        .
    \end{align*}
    
    % By linearity, the Hessian $\mat H = \nabla^2 \OInc(\mu) $ of $\OInc(\mu)$ is then simply
    % \begin{align*}
    %     ( \mat H )_{u,v} = 
    %     \big[ \nabla^2 \OInc(\mu) \big]_{u,v}
    %         &= \sum_{a \in \Ar} \beta_a \left( \frac{\mathbbm1[\Src a u {=} \Src a v \land \Tgt a u {=} \Tgt a v]}{\mu(\Tgt a u, \Src a u)}
    %             - \frac{\mathbbm1[\Src a v = \Src a u]}{\mu(\Src a u)} \right).
    % \end{align*}
    % 
    % Consider a direction $\delta \in T_{\mu}\Delta\V\!\X$ (i.e., the tangent space to the space of distributions over $\X$, at $\mu$), which can be represented as
    % a vector $\delta = [\delta_w]_{w \in \V\!\X}$ such that $\sum_w \delta_w = 0$. 
    % 
    %it's psd. 
    % \begin{align*}
    %     \delta^{\sf T} \nabla^2 \OInc(\mu) \delta
    %         &= 
    %         \sum_u \sum_v \delta_u \delta_v
    %         \sum_{a \in \Ar} \beta_a \left( \frac{\mathbbm1[X\!u {=} X\!v \land Y\!u {=} Y\!v]}{\mu(Y\! u, X\! u)}
    %             - \frac{\mathbbm1[X\!v = X\!u]}{\mu(X\!u)} \right)
    % \end{align*}
    % \begin{align*}
    %     \big( \nabla^2 \OInc(\mu) \delta \big)_{u}
    %     &= \sum_v \delta_v
    %         \sum_{a \in \Ar} \beta_a \left( \frac{\mathbbm1[X\!u {=} X\!v \land Y\!u {=} Y\!v]}{\mu(Y\! u, X\! u)}
    %             - \frac{\mathbbm1[X\!v = X\!u]}{\mu(X\!u)} \right) \\
    %     &= \sum_{a \in \Ar} \beta_a 
    %         \left( \Big( \frac{1}{\mu(Y\! u, X\! u)}\sum_v \delta_v {\mathbbm1[X\!u {=} X\!v \land Y\!u {=} Y\!v]} \Big)
    %             - \Big( \frac{1}{\mu( X\! u)}\sum_v \delta_v {\mathbbm1[X\!v = X\!u]} \Big) \right) \\
    % \intertext{
    % We can view $\delta$ as a (signed) measure on $\V\!\X$, that has marginals in the usual sense. In particular, we can use the analogous notation $\delta(x)$ for $\sum_{w \in \V\!\X} \delta_w \mathbbm 1[X\!w=x]$.
    % indeed, if $\delta$ were non-negative and summed to 1, making it a probability measure, then $\delta(x)$ would be the probability $\delta(X{=}x)$ in the usual sense.  With this notation, the expression above becomes:}
    %     &= \sum_{a \in \Ar} \beta_a \left( 
    %         \frac{\delta(Y\! u, X\! u) }{\mu(Y\! u, X\! u)}
    %         -\frac{\delta(X\!u)}{\mu( X\! u)} \right) \\
    % \end{align*}
    
    % \begin{align*}
    %     % \frac{\partial \OInc_{\dg M}(\mu)}{\partial \mu_u}        
    %     \pd/d \mu_u [\OInc_{\dg M}]
    %     &= \pd/d \mu_u [ \sum_{a} \sum_w \mu_w 
    %         \log \frac{\mu(\Tgt a w | \Src a w)}{\p_a(\Tgt a w | \Src a w)} ] \\
    % \end{align*}
\end{lproof}


\begin{lemma}
    Let $p(Y|X)$ be a cpd,
    and suppose $\mu_0, \mu_1 \in \Delta \V(X,Y)$ are joint distributions that have different conditional marginals on $Y$ given $X$; that is, that
    % If there exist $x,y \in \V(X,Y)$, such that 
    there exist $(x,y) \in \V(X,Y)$ such that
    $
        % \nu(X,Y) \mu(X)  \ne \mu(X,Y) \nu(X).
        \mu_0(x,y) \mu_1(x)  \ne \mu_1(x,y) \mu_0(x).
    $
    % Then the function $f : [0, 1] \to \Delta \V(X,Y)$ given by    
    % For $t \in [0,1]$, define $\mu_t := (1-t) \mu_0 + t \mu_1$
    Then the conditional relative entropy 
    $
        \kldiv[\Big]{ \mu(X,Y) }{ \mu(X) p(Y|X) }
    $
    is strictly convex in $\mu$ along the line segment from $\mu_0$ to $\mu_1$. 
    More precisely, for $t \in [0,1]$, if we define
    $\mu_t := (1-t) \mu_0 + t\, \mu_1$, then 
    the function
    \[
    % t \mapsto (1-t) \mu_0 + t \mu_1
    t ~\mapsto~ \kldiv[\Big]{ \mu_t(X,Y) }
        {\mu_t(X) p(Y|X)}
        \qquad\text{is strictly convex. }
    \]
    \label{lem:seg-strictcvx}
\end{lemma}
\begin{lproof}
    We can only have non-strict convexity if the direction $\delta$ lies in the null-space of the Hessian matrix $\mat H(\mu)$ of the relative entropy. 
    By \cref{lem:hess-relent}, 
    \[
        \mat H_{(xy),(x'y')}
         = \frac{\mathbbm1[x {=} x' \land y {=} y']}{\mu(x,y)}
             - \frac{\mathbbm1[x {=} x']}{\mu(x)}.
    \]    
    
    \def\bdelta{{\boldsymbol\delta}}
    Consider a function $\delta : \V(X,Y) \to \mathbb R$ that is not identically zero, which can be viewed as a vector $\bdelta = [\delta(x,y)]_{(x,y) \in \V(X,Y)} \in \mathbb R^{\V(X,Y)}$. 
    We can also view $\delta$ as a (signed) measure on $\V(X,Y)$, that has marginals in the usual sense. In particular, we use the analogous notation 
    % $\delta(x)$ for $\sum_{w \in \V\!\X} \delta_w \mathbbm 1[X\!w=x]$.
    \[
        \delta(x) :=
            % \sum_{(x',y') \in \V(X,Y)} \delta(x,y) \mathbbm 1[x=x'] =
            \sum_{y \in \V Y} \delta(x,y).
    \]
    % Indeed, if $\delta$ were non-negative and summed to 1, making it a probability measure, then $\delta(x)$ would be the probability $\delta(X{=}x)$ in the usual
    We then compute
    \begin{align*}
        \big(\, \mat H(\mu)\, \bdelta\, \big)_{x,y} 
        &= \sum_{x', y'} \delta(x',y') \left( \frac{\mathbbm1[x {=} x' \land y {=} y']}{\mu(x,y)} - \frac{\mathbbm1[x {=} x']}{\mu(x)} \right) \\
        % &= \frac{\delta(x,y)}{\mu(x,y)} - \frac{\sum_{y'} \delta(x,y')}{\mu(x)} \\
        &= \frac{\delta(x,y)}{\mu(x,y)} - \frac{\delta(x)}{\mu(x)}.
    \end{align*}
    
    and also
    \begin{align*}
        \bdelta^{\sf T} \mat H(\mu) \,\bdelta 
            &= \sum_{x,y} \delta(x,y) (\, \mat H(\mu)\, \bdelta\, )_{x,y} \\
            &= \sum_{x,y} \delta(x,y) \left(
                \frac{\delta(x,y)}{\mu(x,y)} - \frac{\delta(x)}{\mu(x)} \right) \\
            &= \sum_{x,y} 
                \frac{\delta(x,y)^2}{\mu(x,y)} - \sum_{x} \frac{\delta(x)}{\mu(x)} \sum_y \delta(x,y) \\
            &= \sum_{x,y} \frac{\delta(x,y)^2}{\mu(x,y)} - \sum_{x} \frac{\delta(x)^2}{\mu(x)}  \\
            &= \sum_{x} \frac{\delta(x)^2}{\mu(x)} \left( \sum_y \frac{\delta(x,y)^2}{\delta(x)^2 \mu(y|x)} - 1 \right). \numberthis\label{line:beforabs}
    \end{align*}
    
    Now, consider another discrete measure $|\delta|$, whose value at each component is the absolute value of the value of $\delta$ at that component, i.e., $|\delta|(x,y) := |\delta(x,y)|$.
    By construction, $|\delta|$ is now an unnormalized probability measure: $|\delta| = k q(X,Y)$, where $k = \sum_{x,y}|\delta(x,y)| > 0$ and $q \in \Delta\V(X,Y)$. 

    Note also that $|\delta|(x)^2 = (\sum_{y} |\delta(x,y)|)^2 \ge (\sum_{y} \delta(x,y))^2$, and strictly so if there are $y,y'$ such that $\delta(x,y) < 0 < \delta(x,y')$.
    In other words, equality holds if and only if the vector $\bdelta_x = [\delta(x,y)]_{y \in \V Y}$ is either non-negative or non-positive, i.e., $\bdelta_x \ge 0$ or $\bdelta_x \le 0$. 
     Meanwhile, $|\delta|(x,y)^2 = \delta(x,y)^2$ is unchanged.
    Thus, for every $x \in \V X$, we have:
    \begin{align*}
        \sum_y \frac{\delta(x,y)^2}{\delta(x)^2 \mu(y|x)} - 1
        &\ge \sum_y \frac{|\delta|(x,y)^2}{|\delta|(x)^2 \mu(y|x)} - 1 \\
        &= \sum_y \frac{k^2 q(x,y)^2}{k^2 q(x)^2 \mu(y|x)} - 1 \\
        &= \sum_y \frac{ q(y|x)^2}{\mu(y|x)} - 1   \\
        &= \chi^2 \Big( q(Y|x) \Big\Vert  \mu(Y|x) \Big) \ge 0.
    \end{align*}
    The final line depicts the $\chi^2$ divergence between the distributions $q(Y|x)$ and $\mu(Y|x)$, both distributions over $Y$.  Since it is a divergence, this quantity is non-negative and equals zero if and only if $q(Y|x)=\mu(Y|x)$.
    
    Picking up where we left off, we have:
    \begin{align*}
        \bdelta^{\sf T} \mat H(\mu) \bdelta 
            &= \sum_{x} \frac{\delta(x)^2}{\mu(x)} \left( \sum_y \frac{\delta(x,y)^2}{\delta(x)^2 \mu(y|x)} - 1 \right) \\
            &\ge \sum_{x} \frac{\delta(x)^2}{\mu(x)} \left( \sum_y \frac{|\delta|(x,y)^2}{|\delta|(x)^2 \mu(y|x)} - 1 \right) \\
            &=
            % k^2 \sum_{x} \frac{q(x)^2}{\mu(x)} 
            \sum_{x} \frac{\delta(x)^2}{\mu(x)} 
            \chi^2 \Big( q(Y|x) \Big\Vert  \mu(Y|x) \Big) \ge 0.
    \end{align*}
    As a non-negatively weighted sum of non-negative numbers, this final quantity is non-negative, and equals zero if and only if, for each $x \in \V X$, we have either $q(Y|x) = \mu(Y|x)$, or $\delta(x) = 0$. 
    %
    Furthermore, if $\bdelta^{\sf T} \mat H(\mu) \bdelta = 0$, then \emph{both} inequalities hold with equality. Therefore, we know that if $\delta(x) \ne 0$, then $\bdelta_x \ge \mat 0$ or $\bdelta_x \le \mat 0$. 
    These two conditions are also sufficient to show that $\bdelta^{\sf T} \mat H(\mu) \bdelta = 0$.
    To summarize what we know so far: 
    % /, to summarize more compactly:
    \begin{align*}
        \bdelta^{\sf T} \mat H(\mu) \bdelta = 0
            \quad\iff\quad\forall x \in \V\!X.\quad
                % \text{either}&~~ \delta(x,Y) \ge 0 \text{~~and~~ $|\delta|(Y|x) = \mu(Y|x) $}\\
                \text{either}&~~  (\bdelta_x \ge \mat 0 \text{ or } \bdelta_x \le \mat 0) \text{~~and~~ $|\delta|(Y|x) = \mu(Y|x) $}\\
                \text{or }&~~ \delta(x) = 0.
    \end{align*}
    
    % But if $\delta(x) = 0$, we have 
    The second possibility, however, is somewhat of a fluke; we now return to the expression we had in \eqref{line:beforabs} before considering $|\delta|$.
    We've already shown that the contribution to the sum at each value of $x$ is non-negative, so if $\bdelta^{\sf T} \mat H(\mu) \bdelta$ is to equal zero, each summand which depends on $x$ must be zero as well.  
    So if $x$ is a value of $X$ for which $\delta(x) = 0$, then
    \begin{align*}
        0 = \frac{1}{\mu(x)} \left( \sum_y \frac{\delta(x,y)^2}{\mu(y|x)} - \delta(x)^2 \right) 
         = \frac{1}{\mu(x)} \sum_y \frac{\delta(x,y)^2}{\mu(y|x)}  
     % \implies \qquad 
         = \sum_y \frac{\delta(x,y)^2}{\mu(x,y)},
    \end{align*}
    which is only possible if $\delta(x,y) = 0$ for all $y$. 
    % We have now seen that the second case also implies that $\delta(x,Y) \ge 0$, so we can conclude simply that:
    This allows us to compute, more simply, that 
    % any direction $\bdelta$ in the null $
    \begin{align*}
        \bdelta^{\sf T} \mat H(\mu) \bdelta = 0
        \quad&\iff\quad 
            (\forall x .~  \bdelta_x \ge \mat 0 \text{ or } \bdelta_x \le \mat 0)
            \qquad\text{and}\qquad 
            \forall (x,y) \in \V(X,Y).~~
                \delta(x,y) \mu(x) = \delta(x) \mu(x,y)
    %             \\
    %     % \quad&\iff 
    %     %     \exists \delta(x) : 
    %     %     \delta(x,y) = \delta(x) q(y|x) 
        \end{align*}
    % $\bdelta$ is in the null space of $\mat H(\mu)$ if and only if 
    % it can be written as $\delta(x) \mu(y|x)$
    
    \medskip
    \hrule
    \medskip
    
    Finally, we are in a position to prove the lemma.  
    Suppose $\mu_0,\mu_1 \in \Delta\V(X,Y)$ and $(x^*,y^*) \in \V(X,Y)$ 
    are such that $\mu_0(x^*,y^*) \mu_1(x^*)  \ne \mu_1(x^*,y^*) \mu_0(x^*)$.
    So, the quantity
    \[
        \mathit{gap} := \mu_1(x^*,y^*) \mu_0(x^*) - \mu_0(x^*,y^*) \mu_1(x^*) 
        % \quad \ne 0.
        \quad\text{is nonzero}.
    \]
    Then for all $t \in (0,1)$ the intermediate point $\mu_t = (1-t)\, \mu_0 + t\, \mu_1$ must have different conditional marginals from both $\mu_0$ and $\mu_1$, as
    \begin{align*}
        &\mu_t(x^*\!,y^*)\mu_0(x^*) - \mu_0(x^*\!,y^*) \mu_t(x^*) \\
            &= \Cancel{(1-t)\mu_0(x^*\!,y^*)\mu_0(x^*)} + t \mu_1(x^*\!,y^*)\mu_0(x^*)
                -  \Cancel{(1-t)\mu_0(x^*\!,y^*) \mu_0(x^*)} - t\mu_0(x^*\!,y^*) \mu_1(x^*) \\
            &= t \pqty[\big] { \mu_1(x^*\!,y^*)\mu_0(x^*) -\mu_0(x^*\!,y^*) \mu_1(x^*)}
            \\
            &=~ t \cdot \mathit{gap}
            ~\ne~ 0,
    \end{align*}
    and analogously for $\mu_1$,
    \begin{align*}
        &\mu_t(x^*\!,y^*)\mu_1(x^*) - \mu_1(x^*\!,y^*) \mu_t(x^*) \\
            &= (1-t)\mu_0(x^*\!,y^*)\mu_1(x^*) + \Cancel{ t \mu_1(x^*\!,y^*)\mu_1(x^*) }
                -  (1-t)\mu_1(x^*\!,y^*) \mu_0(x^*) - \Cancel{ t\mu_1(x^*\!,y^*) \mu_1(x^*) } \\
            &= (1-t) (\mu_0(x^*,y^*)\mu_1(x^*) -\mu_1(x^*\!,y^*) \mu_0(x^*)) 
            \\
            &=~ -(1-t) \cdot \mathit{gap}
            ~\ne~ 0.
    \end{align*}
    
    Then for any direction $\delta := k(\mu_0 - \mu_1)$ parallel to the segment between $\mu_0$ and $\mu_1$ (intuitively a tangent vector at $\mu_t$, although this fact doesn't affect the computation), of nonzero length ($k\ne 0$), we have:
    \begin{align*}
        &
        %\frac1k \pqty[\Big]{
        \mu_t(x^*\!,y^*) \delta(x^*)  - \delta(x^*\!,y^*) \mu_t(x^*)
        % }
        \\
        &= k\; \mu_t(x^*\!,y^*) \pqty[\big]{\mu_0(x^*) - \mu_1(x^*)}  - k\; \pqty[\big]{\mu_0(x^*\!,y^*)-\mu_1(x^*\!,y^*)} \mu_t(x^*) \\
        % &= k\; \pqty[\Big]{(1-t) \mu_0(x^*\!,y^*)+ t\, \mu_1(x^*,y^*)}
        %     \pqty[\big]{\mu_0(x^*) - \mu_1(x^*)}  
        %     - k\; \pqty[\big]{\mu_0(x^*\!,y^*)-\mu_1(x^*\!,y^*)} \pqty[\Big]{ \mu_0(x^*)+ t\, \mu_1(x^*)} \\
        &= k \pqty[\Big]{\mu_t(x^*\!,y^*)\mu_0(x^*) - \mu_t(x^*\!,y^*)\mu_1(x^*)
            -\mu_0(x^*\!,y^*)\mu_t(x^*) + \mu_1(x^*\!,y^*)\mu_t(x^*)} \\
        &= k \pqty[\Big]{ \pqty[\big]{ \mu_t(x^*\!,y^*)\mu_0(x^*) -\mu_0(x^*\!,y^*)\mu_t(x^*)} + \pqty[\big]{\mu_1(x^*\!,y^*)\mu_t(x^*) - \mu_t(x^*\!,y^*)\mu_1(x^*)}} \\
        &= k \pqty[\big]{ + t \,\mathit{gap} + (1-t) \,\mathit{gap}} \\
        &= k \,\mathit{gap} \qquad \ne~ 0. 
    \end{align*}
    
    So at every $t$, directions parallel to the segment are not in the null space of $\mat H(\mu_t)$, meaning that
    $\bdelta^{\sf T} \mat H(\mu_t) \bdelta > 0$ and so our function is strictly convex along this segment.   
\end{lproof}

\textbf{\cref{prop:marginonly}.}
\textit{If $\dg M$ has arcs $\Ar$ and $\bbeta \ge 0$,
the minimizers of $\OInc_{\dg M}$ all have the same conditional
    marginals along $\Ar$.
That is, for all $\mu_1, \mu_2 \in \bbr{\dg M}_0^*$
and all $\ed aST \in \Ar$ 
with $\beta_a > 0$, we have
{\subafalse
$\mu_1(\Tgt a, \Src a)\mu_2(\Src a) = \mu_2(\Tgt a, \Src a) \mu_1(\Src a)$.%
}}
\begin{lproof}\label{proof:marginonly}
    For contradiction, suppose that $\mu_0, \mu_1 \in \bbr{\dg M}_0^*$, but 
    % for $a \in \Ar$ with $\beta > 0$, 
    % \def\pt 
    there is some $(\hat a, \hat s, \hat t) \in \V\!\Ar$ such that $\beta_{\hat a} > 0$ and
    \[
        \mu_0(\Tgt a{=}\hat t, \Src a{=}\hat s)\mu_1(\Src a{=}\hat s) \ne \mu_1(\Tgt a{=}\hat t, \Src a{=}\hat s) \mu_0(\Src a{=}\hat s).
    \]
    % For $\mu \in \Delta \V\!\X$, 
    % let $M_{XY}(\mu) := \mu(X,Y)$, and 
    For $t \in [0,1]$,
    let $\mu_t := (1-t) \mu_0 + t \, \mu_1$ as before.
    Then define
    \begin{align*}
        F(t) := \kldiv[\Big]{ \mu_t(\Src a, \Tgt a) }{  \mu_t(\Src a) \p_a(\Tgt a|\Src a) }.
    \end{align*}
    Since $\mu_0(\Src a, \Tgt a)$ and  $\mu_1(\Src a, \Tgt a)$ are joint distributions over two variables, with different conditional marginals, as above, \cref{lem:seg-strictcvx} applies, and so $F(t)$ is strictly convex. 
    
    % Because $\OInc$ is convex, the average 
    % $\mu_t = \frac12 \mu_1 + \frac12\mu_2$
    Let
    \[ \OInc_{\dg M \setminus \hat a}
        := \sum_{a \ne \hat a} \beta_a \kldiv{\mu(\Tgt a, \Src a)}{\p_a(\Tgt a|\Src a) \mu(\Src a)}
    \]
    be the observational incompatibility loss, but without the term corresponding to edge $\hat a$. 
    Since $\OInc_{\dg M \setminus \hat a}$ is convex in its argument, it is in particular convex along the segment from $\mu_0$ to $\mu_1$; that is, for $t \in [0,1]$, the function $t \mapsto \OInc_{\dg M \setminus \hat a}(\mu_t)$ is convex.
    Therefore, we know that the function
    \begin{align*}
    % $
        G(t) := 
        \OInc_{\dg M}(\mu_t) 
        =
        % t \mapsto
        \OInc_{\dg M \setminus \hat a}( \mu_t ) + \beta_{\hat a}\, F(t),
    % $
    \end{align*}
    is \emph{strictly} convex. 
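    But then, because $\mu_0$ and $\mu_1$ both minimize $\OInc_{\dg M}$ and hence $G(0) = G(1)$, strict convexity means that $\mu_{\nf12}$ satisfies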
    \[
        \OInc_{\dg M}( \mu_{\nicefrac12} ) < \OInc_{\dg M}( \mu_0 ),
    \]
    contradicting the premise that $\mu_0$ minimizes $\OInc_{\dg M}$ (i.e., $\mu_0 \in \bbr{\dg M}^*_0$). 
    Therefore, it must be the case that all distributions in $\bbr{\dg M}_0^*$ have the same conditional marginals, as promised.
\end{lproof}


\textbf{\cref{prop:idef-frozen}.}
\textit{If $\mu \in \bbr{\dg M}_0^*$\,,
then
\begin{equation}
    \underset{{\dg M}}{\SInc_{}}(\mskip-0.5mu\mu\mskip-1mu) \!=\!
        \sum_{\mathclap{ w \in \V\!\X } }
            \mskip-1mu
            \mu(\mskip-1.5mu w \mskip-1.5mu)
            \log \!  \bigg(\!
                \faktor{\mu(\mskip-1.5mu w\mskip-1.5mu )}{\,\prod_{\mathclap{a \in \Ar}} 
                \mskip-1mu
                \nu\big(\mskip-1mu\Tgt a \mskip-2.2mu(\mskip-2mu w \mskip-2mu) 
                    \mskip-0.5mu \big|  \Src a \mskip-2mu(\mskip-2mu w \mskip-2mu)\mskip-2mu\big)^
                {\!\alpha\ssub a}
                }\!\mskip-2mu
            \bigg)\mskip-2mu
        ,\!
        \label{eq:idef-alt-constr}
\end{equation}
where $\{ \nu(\Tgt a | \Src a ) \}_{a \in \Ar}$ are the
marginals along the arcs $\Ar$
shared by all distributions in $\bbr{\dg M}^*_0$\
(per \cref{prop:marginonly}),
and $\Src a \mskip-1mu(\mskip-1mu w \mskip-1mu), \Tgt a \mskip-1mu(\mskip-1mu w \mskip-1mu)$ are the values of variables $\Src a$ and $\Tgt a$ in $w$.}
\begin{lproof}\label{proof:idef-frozen}
    % By definition, we have
    This is mostly a simple algebraic manipulation. By definition:
    \begin{align*}
        \SInc_{\dg M}(\mu) &= - \H(\mu) + \sum_{a \in \Ar} \alpha_a \H_\mu(\Tgt a | \Src a) \\
        &= \Ex_\mu \left[ - \log \frac{1}{\mu} + \sum_{a \in \Ar} \alpha_a \log \frac{1}{\mu(\Tgt a|\Src a)} \right] \\
        &= \sum_{w \in \V\!\X} \mu(w) \left[ \log \mu(w) + \sum_{a \in \Ar} \log \frac{1}{\mu(\Tgt a(w)|\Src a(w))^{\alpha_a}} \right] \\
        &= \sum_{w \in \V\!\X} \mu(w) \log \pqty[\bigg]{ \faktor{\mu(w)~}{~\prod_{a \in \Ar}\mu(\Tgt a(w)|\Src a(w))^{\alpha_a}}} 
    \end{align*}
    But, by \cref{prop:marginonly}, if we restrict $\mu \in \bbr{\dg M}_0^*$, then the conditional marginals in the denominator do not depend on the particular choice of $\mu$; they're shared among all $\nu \in \bbr{\dg M}_0^*$. 
\end{lproof}

\textbf{\cref{prop:joint+idef-correct}.}
\textit{
    If $\nu \in \bbr{\dg M}_0^*$
    and $(\mu, \mat u)$ 
    solves the problem
    \begin{align*}
        \minimize_{\mu, \mat u} & \quad
            \smash{\mat 1^{\sf T} \mat u}
            \numberthis\label{prob:joint+idef}\\[-0.5ex]
        \subjto &\quad
            (-\mat u,  \mu, \mat k ) \in K_{\exp}^{\V\!\X},~~\quad \mu \in \Delta\V\!\X, \\[-0.4ex]
                \forall& \ed aST \subafalse \in \Ar.~~\mu(\Src a, \Tgt a)\, \nu(\Src a) = \mu(\Src a)\, \nu(\Src a, \Tgt a),
    \end{align*}
    then $\bbr{\dg M}^*_{0^+} = \{ \mu \}$
    and $\mat 1^{\sf T} \mat u = \SInc_{\dg M}(\mu)$.
}
\begin{lproof}\label{proof:joint+idef-correct}
    Suppose $(-\mat u, \mu, \mat k)$ is a solution to problem \eqref{prob:joint+idef}.
    The second constraint, by \cref{prop:marginonly}, ensures that $\mu \in \bbr{\dg M}_0^*$.
    Then,
    \begin{align*}
        (-\mat u, \mu, \mat k) \in K_{\exp}^{\V\!\X} 
        \quad\implies\quad 
            \forall w \in \V\!\X.~ u_w &\ge \mu(w) \log \frac{ \mu(w) } { k_w } \\
            &=  \mu(w) \log \pqty[\bigg]{ \faktor{\mu(w)~}{~\prod_{a \in \Ar}\mu(\Tgt a(w)|\Src a(w))^{\alpha_a}}}.
            % &= .
    \end{align*}
    The same logic as in the 
    \hyperref[proof:joint-inc-correct]{proofs}
    \hyperref[proof:joint-small-gamma-correct]{of} 
    \cref*{prop:joint-inc-correct,prop:joint-small-gamma-correct}
    shows that this inequality must be tight, or else
    $(-\mat u, \mu, \mat k)$ would not be optimal for \eqref{prob:joint+idef}.
    % (Concretely: we could reduce some component of $\mat u$ to get a smaller objective value but still satisfy all constraints). 
    So, $\mat u$ is a function of $\mu$.  Also, by \cref{prop:idef-frozen}, the problem objective satisfies
    \[
        \mat 1^{\sf T}\mat u = \sum_{w \in \V\!\X} u_w = \SInc_{\dg M}(\mu).
    % & \big[\text{by \cref{prop:idef-frozen}}\big]
    \]
    
    Finally, because $\mu$ is optimal, it must be the unique distribution
    $\bbr{\dg M}^*$ that, among those distributions that minimize $\OInc_{\dg M}$, also minimizes $\SInc_{\dg M}$, meaning $\mu = \bbr{\dg M}^*$.
\end{lproof}


\textbf{\cref{theorem:markov-property}.}\textit{
If\, $\dg M_1$ and $\dg M_2$ are PDGs over the sets of variables
$\X_1$ and $\X_2$, respectively, then $\X_1$ and $\X_2$ are
conditionally independent given $\X_1 \cap \X_2$ in every $\mu \in
\bbr{\dg M_1 \bundle \dg M_2}^*_\gamma$\,, for all $\gamma > 0$ and
$\gamma=0^+$. }

% for all $\gamma > 0$,
\[ 
    \text{Or symbolically: }\qquad\quad
    % \bbr{\dg M_1 \bundle \dg M_2}^*_\gamma
    \dg M_1 \bundle \dg M_2
        ~\models~
    \X_1 \mathbin{\bot\!\!\!\bot} \X_2 \mid \X_1 \cap \X_2. \] 
\begin{lproof}\label{proof:markov-property}
    Note that,
    save for the joint entropy, every summand of the scoring function $\bbr{\dg M_1 \bundle \dg M_2}_\gamma : \Delta(\V\!\X_1 \times \V\!\X_2) \to \Rext$ is a function of the conditional marginal of $\mu$ along some edge.
    In particular, those terms that correspond to edges of $\dg M_1$ can be computed from the marginal $\mu(\X_1)$, while those that correspond to edges of $\dg M_2$ can be computed from the marginal $\mu(\X_2)$.
    Therefore, there are functions $f$ and $g$ such that:
    \[
        \bbr{\dg M_1 \bundle \dg M_2}_\gamma(\mu) = f(\mu(\X_1)) + g(\mu(\X_2)) - \gamma \H(\mu).
    \]
    
    To make this next step extra clear, let $\mat X := \X_1 \setminus \X_2$ and
    $\mat Z := \X_2 \setminus \X_1$ be the variables unique to each PDG, and $\mat S:= \X_1 \cap \X_2$ be the set of variables they have in common, so that $(\mat X, \mat S, \mat Z)$ is a partition of all variables $\X_1 \cup \X_2$.
    Now, define a new distribution $\mu' \in \Delta(\V\!\X_1 \times \V\!\X_2)$ by
    \[
        % \mu'(\X_1, \X_2) := \mu(\X_1) \mu(\X_2 \mid \X_1 \cap \X_2).
        \bf
        \mu'( X,  S,  Z) 
            := \mu(S) \mu( Z \mid  S)\mu( X \mid  S)
            \qquad \Big(~
            = \mu( X,  S) \mu( Z \mid  S)
            = \mu( Z,  S) \mu( X \mid  S)~\Big).
    \]
    One can easily verify that $\mat X$ and $\mat Z$ are independent given $\mat S$ in $\mu'$ (by construction), and the alternate forms on the right make it easy to see that $\mu(\X_1) = \mu'(\X_1)$ and $\mu(\X_2) = \mu'(\X_2)$.
    Furthermore, for any distribution $\nu(\mat{X,S,Z})$, we can write
    % Then, we can compute
    \begin{align*}
        \H( \nu ) &=  \H_\nu(\mat{X,S,Z}) =
            \H_\nu(\mat{X,S}) + \H_\nu(\mat Z \mid \mat{X,S}) \\
            &= \H_\nu(\mat{X,S}) + \H_\nu(\mat Z \mid \mat{X,S}) - \H_\nu(\mat Z \mid \mat S) + \H_\nu(\mat Z \mid \mat S) \\
            &= \H_\nu(\mat X,\mat S) + \H_\nu(\mat Z \mid \mat S) - \I_\nu(\mat Z; \mat X| \mat S),
    \end{align*}
    where $\I_\nu(\bf X;Z|S)$, the conditional mutual information between $\mat X$ and $\mat Z$ given $\mat S$ (in $\nu$), is non-negative, and equal to zero if and only if $\mat X$ and $\mat Z$ are conditionally independent given $\mat S$ \parencite[see, for instance,][\S1]{mackay2003information}. 
    % In particular, it is zero for $\mu'$, so $\H(\mu') = \H()$
    So $\I_{\mu'}(\mat X; \mat Z| \mat S) = 0$, and
        $\H(\mu') = \H_{\mu'}(\mat X, \mat S) + \H_{\mu'}(\mat Z| \mat S)$. 
    Because $\mu$ and $\mu'$ share marginals on $\X_1$ and $\X_2$, while the terms $\H(\mat X, \mat S)$ and $\H(\mat Z|\mat S)$ depend only on these marginals, respectively, we also know that $\H_{\mu}(\mat X, \mat S) = \H_{\mu'}(\mat X, \mat S)$ and $\H_{\mu}(\mat Z | \mat S) = \H_{\mu'}(\mat Z| \mat S)$; thus we have
    \begin{align*}
        \H(\mu) &= \H_\mu(\mat X,\mat S) + \H_\mu(\mat Z \mid \mat S) - \I_\mu(\mat Z; \mat X| \mat S) \\
            &= \H(\mu') - \I_\mu(\mat Z; \mat X| \mat S).
    \end{align*}
    Therefore,
    \begin{align*}
        \bbr{\dg M_1 \bundle \dg M_2}_\gamma(\mu)
         &= f(\mu(\X_1)) + g(\mu(\X_2)) - \gamma \H(\mu) \\
         &= f(\mu'(\X_1)) + g(\mu'(\X_2)) - \gamma \H(\mu') + \gamma \I_\mu(\mat Z; \mat X| \mat S) \\
         &= \bbr{\dg M_1 \bundle \dg M_2}_\gamma(\mu') + \gamma \I_\mu(\mat Z; \mat X| \mat S).
    \end{align*}
    But conditional mutual information is non-negative, and by assumption, $\bbr{\dg M_1 \bundle \dg M_2}_\gamma(\mu)$ is minimal. Therefore, it must be the case that
    \[
        \I_\mu(\mat Z; \mat X| \mat S) = \I_\mu(\X_1; \X_2 \mid \X_1 \cap \X_2) = 0,
    \]
    showing that $\X_1$ and $\X_2$ are conditionally independent given the variables that they have in common. \\
    (The fact that $\I_\mu(\mat Z; \mat X| \mat S) = \I_\mu(\X_1; \X_2 \mid \X_1 \cap \X_2)$ is both easy to show and an instance of a well-known identity; see CIRV2 in Theorem 4.4.4 of \textcite{halpern2017reasoning}, for instance.)
\end{lproof}

\textbf{\cref{coro:can-use-cliquetree}.}\textit{
    If $\dg M$ is a PDG with arcs $\Ar$, 
    $(\C, \mathcal T)$ is a tree decomposition of $\Ar$,
    $\gamma > 0$, and
    $\mu \in \bbr{\dg M}^*_\gamma$, then there exists a clique tree
    $\bmu$ over $(\C, \mathcal T)$ such that $\Pr_{\bmu} = \mu$.
}
\begin{lproof}\label{proof:can-use-cliquetree}
    % In addition to \cref{prop:markov-property}, we also need a standard property of clique trees:
    % 
    % \begin{iclaim}\label{claim:can-use-cliquetree}
        % Every distribution $\mu$ 
        % satisfying the independencies
        % \[
        %     \forall C, C' \in \C.~~ C \bot\!\!\!\bot C' \mid C \cap C'
        % \]
        % can be described by a clique tree $\bmu$. That is, $\Pr_{\!\bmu} = \mu$. 
        % If $\mu$ is a distribution in which
        % $C$ and $C'$ are independent given $C \cap C'$ for all $C, C' \in \C$,
    % \end{iclaim}
    % 
    % At a high level, this must be true because such distribution can be represented as a factor graph with a factor for each cluster, and so algorithms such as belief propogation produce a clique tree that represents it.
    % We can also prove it directly, as follows.
    % Choose a root of the tree, and orient each edge away from the root. The edges of $\cal T$ so directed give a qualitative Bayesian Network whose variables are clusters. 
    % Moreover, $\mu$ satisfies the independencies of this BN, since, if $C_1$ is a non-descendent of $C_2$, then $C_1 \bot\!\!\!\bot $
    %
    % Consider the Markov structure over $\X$ that has an edge $(X,Y)$ iff there exists some clique $C \in \C$ that contains both $X$ and $Y$.     
    % \textcite[Theorem 4.3 of][]{koller2009probabilistic}
    
    The set of distributions that can be represented by a calibrated clique tree over $(\C,\cal T)$ is the same as the set of distributions that can be represented by a factor graph for which $(\C, \cal T)$ is a tree decomposition.
    One direction holds because any such product of factors can be ``calibrated'', via message passing algorithms such as belief propagation, to form a clique tree.
    The other direction holds because $\Pr_{\bmu}$ itself is a product of factors that decomposes over $(\C, \cal T)$. 

    
    % Suppose $\mu \in \Delta \V\!\X$ is a distribution that has the following independence property:
    % for every cluster $C \in \C$, and every 
    %     $\mat X,\mat Y \subset \X$ that lie on opposite sides of the clique
    % Then there exists a clique tree $\bmu$ over $(\C,\cal T)$ such that $\Pr_{\!\bmu} = \mu$.     
    % \TODO
    % \def\ued{{\boldsymbol-}}
    
    Alternatively, this is the same as the set of distributions that satisfy the independencies of the Markov Network obtained by connecting every pair of variables that share a cluster.
    More formally, this network is the graph $G := (\X, E := \{ (X{-}Y) :  \exists C \in \C.~\{X,Y\} \subseteq C\})$. 
    Also, $G$ happens to be chordal as well, which we prove at the end.
    % if $(X, Y) \in E$ and $(Y, Z) \in E$, then there exist $C, C' \in \C$ with $\{X, Y\} \subseteq C$ and $\{Y, Z\} \subseteq C'$, meaning 
    
    
    Using only the PDG Markov property (\cref{theorem:markov-property}), we now show that every independence described by $G$ also holds in every distribution $\mu \in \bbr{\dg M}^*_\gamma$.
    %
    Suppose that, 
    % $\I(\mat X; \mat Y|\mat Z)$ is an independence
    for sets of variables $\mat X, \mat Y, \mat Z \subseteq \X$,
    $\I(\mat X; \mat Y|\mat Z)$ is an independence
    described by $G$.
    %TODO: add explicit separation undirected graphs
    % such that $X$ and $Y$ are separated by $\mat Z$ according to $G$---
    This means \parencite[Defn 4.8]{koller2009probabilistic} that
    % every path in $G$ between any $X \in \mat X$ and $Y \in \mat Y$ intersects with $\mat Z$. 
    if $X \in \mat X$, $Y \in \mat Y$, and $\pi$ is a path in $G$ between them, then
    some node along $\pi$ lies in $\mat Z$.
    
    % Consider the graph $G' := G\setminus\mat Z$ induced by removing the variables $\mat Z$, as well as the edges connecting them to other variables. 
    % By assumption, the variables $\mat X$ and $\mat Y$ now belong to separate, disconnected components of $G'$.  
    % We claim that this picture lifts to the level of clusters, in the following sense.
    
    Let $\cal T'$ be the graph that results from removing each edge $(C{-}D) \in \cal T$ that satisfies $C \cap D \subseteq \mat Z$, which is a disjoint union  $\mathcal T' = \mathcal T_1 \sqcup \ldots \sqcup \mathcal T_n$ of subtrees that have no clusters in common. 
    To parallel this notation, let $\C_1, \ldots, \C_n$ be their respective vertex sets.
    Note that for every edge $e=(C{-}D)\in \cal T'$, there must by definition be some variable $U_e \in (C \cap D) \setminus \mat Z$. 
    
    We claim that no subtree $\mathcal T_i$ can have both a cluster $D_X$ containing a variable $X \in \mat X \setminus \mat Z$ and also a cluster $D_Y$ containing a variable $Y \in \mat Y \setminus \mat Z$.
    % For if it did,
    Suppose that it did. 
    Then the (unique) path in $\cal T$ between $D_X$ and $D_Y$, which we label
    \[
    \begin{tikzcd}[column sep=2em]
        % C_{X_0}=
        D_X=
        &D_0 \ar[r,-,"e_1"]&
        D_1 \ar[r,-,"e_2"]&
        % D_2\ar[r,-,"e_3"]&
          \cdots
        \ar[r,-,"e_{m-1}"]& D_{m-1}
        \ar[r,-,"e_m"] & D_m&
        =D_Y
        % =C_{Y_0}
    \end{tikzcd},
    \]
    would lie entirely within $\mathcal T_i \subseteq \mathcal T'$. This gives rise to 
    a corresponding path in $G$:
    \[\begin{tikzcd}[column sep=1em,row sep=1.5ex]
        X \ar[r,-] \ar[d,sloped,phantom,"\in"]
        & U_{e_1}\ar[r,-] \ar[d,sloped,phantom,"\in"]
        & U_{e_2}\ar[r,-] \ar[d,sloped,phantom,"\in"]
           &\cdots\ar[r,-]
        & U_{e_{m{-}1}} \ar[r,-] \ar[d,sloped,phantom,"\in"]
        & U_{e_m}\ar[r,-] \ar[d,sloped,phantom,"\in"]
        & Y \ar[d,sloped,phantom,"\in"]  
            \\
        D_0
        & D_0 \cap D_1
        & D_1 \cap D_2
        &
        & D_{m{-}2} \cap D_{m{-}1}
        & D_{m{-}1} \cap D_m
        & D_m
    \end{tikzcd}\quad,\]
    and moreover, this path is disjoint from $\mat Z$.
    This contradicts our assumption that every path in $G$ between a member of $\mat X$ and a member of $\mat Y$ must intersect with $\mat Z$, and so no subtree can have both a cluster containing a variable $X \in \mat X \setminus \mat Z$ and also one containing $Y \in \mat Y \setminus \mat Z$. 
    
    % assume this for now, but we will prove it at the end. 
    % Then, for each every $\mathcal T_i$
    % $ can either contain a variable $X \in \mat X \setminus Z$
    \def\CX{\C_{\mat X}}
    % \def\CNX{\C_{\mat{\bar{X}}}}
    \def\CNX{\C_{\mat{Y}}^+}
    We can now partition the clusters as $\C = \CX \sqcup \CNX$, where
    $\CX$ is the set of the clusters that belong to subtrees $\mathcal T_i$ with a cluster containing some $X \in \mat X \setminus \mat Z$, and
    $\CNX$ is its complement, which in particular contains those subtrees that have some $Y \in \mat Y \setminus \mat Z$.
    % $\CX$ 
    Or, more formally, we define
    \[
        \CX :=~ \bigcup_{\mathclap{\substack{i \in \{1,\ldots,n\}\\ (\cup\C_i) \cap (\mat X\setminus\mat Z) \ne \emptyset }}}\, \C_i
        \quad\qquad \text{and}\qquad
        \CNX :=~ \bigcup_{\mathclap{\substack{i \in \{1,\ldots,n\}\\ (\cup\C_i) \cap (\mat X\setminus\mat Z) = \emptyset }}}\, \C_i
        \quad.
    \]
    \def\XX{\X_{\mat X}}
    % \def\XNX{\X_{\mat{\bar{X}}}}
    \def\XNX{\X_{\mat Y}^+}
    Let $\XX := \cup \CX$ be the set of all variables appearing in the clusters $\CX$; symmetrically, define $\XNX := \cup \CNX$.
    
     
    We claim that $\XX \cap \XNX \subset \mat Z$. 
    Choose any variable $U \in \XX \cap \XNX$. 
    % Suppose, for contradiction, that it did contain some $Y \notin C_1 \cap C_2$. 
    From the definitions of $\XX$ and $\XNX$, this means $U$ is a member of some cluster $C \in \CX$, and also a member of a cluster $D \in \CNX$.
    Recall that the clusters of each disjoint subtree $\mathcal T_i$ either fall entirely within $\CX$ or entirely within $\CNX$ by construction.
    This means that $C$ and $D$, which are on opposite sides of the partition, must have come from distinct subtrees.
    % Since there is a unique path in $\cal T$ between any two clusters, and in particular, between $C$ and $D$, the
    So, some edge $e = (C'{-}D') \in \mathcal T$ along the (unique) path from $C$ to $D$ must have been removed when forming $\mathcal T'$, which by the definition of $\mathcal T'$, means that $(C' \cap D') \subset \mat Z$. 
    But by the running intersection property (clique tree property 2), every cluster along the path from $C$ to $D$ must contain $C \cap D$---in particular, this must be true of both $C'$ and $D'$.
    Therefore,
    \[
        U \in C \cap D \subset C' \cap D' \subset \mat Z.
    \]
    So $\XX \cap \XNX \subset \mat Z$, as promised.  We will rather use it in the equivalent form $(\XX \cap \XNX) \cup \mat Z = \mat Z$. 
    
    Next, since $(\C, \cal T)$ is a tree decomposition of $\Ar$, each hyperarc $a \in \Ar$ can be assigned to some cluster $C_a$ that contains all of its variables; this allows us to lift the cluster partition $\C = \CX \sqcup \CNX$ to a partition $\Ar = \Ar_{\mat X} \sqcup \Ar_{\mat Y}^+$ of edges, and consequently, a partition of PDGs $\dg M = \dg M_{\mat X} \bundle \dg M_{\mat Y}^+$.
    Concretely: let $\dg M_{\mat X}$ be the sub-PDG of $\dg M$ induced by restricting to the variables $\XX \subseteq \X$ and arcs $\Ar_{\mat X} = \{ a \in \Ar : C_a \in \CX \} \subseteq \Ar$; define $\dg M_{\mat Y}^+$ symmetrically. (To be explicit: the other data of $\dg M_{\mat X}$ and $\dg M_{\mat Y}^+$ are given by restricting each of $\{\mathbb P,\balpha,\bbeta\}$ to $\Ar_{\mat X}$ and $\Ar_{\mat Y}^+$, respectively.)
    % Thus, $\dg M = \dg M_1 \bundle \dg M_2$. 

    This partition of $\dg M$ allows us to use the PDG Markov property.
    Suppose for some $\gamma > 0$ that $\mu \in \bbr{\dg M}^*_\gamma = \bbr{\dg M_{\mat X} \bundle \dg M_{\mat Y}^+}^*_\gamma$.
    We can then apply \cref{theorem:markov-property}, to find that
    $\XX$ and $\XNX$ are independent given $\XX \cap \XNX$.
    Then, we use standard properties of random variable independence
        \parencite[CIRV1--5 of][Theorem 4.4.4]{halpern2017reasoning} to find that $\mu$ must satisfy:
    \begin{align*}
        \XX  &\CI \XNX \mid \XX \cap \XNX\\
    % \qquad\overset{\mathrm{CIRV3}}\implies\qquad 
    \implies\qquad
        (\XX \setminus \mat Z) &\CI (\XNX\setminus \mat Z) \mid (\XX \cap \XNX) \cup \mat Z 
            & \big[\,\text{CIRV3}\,\big] \\
    \implies\qquad
        (\mat X \setminus \mat Z) &\CI (\mat Y \setminus \mat Z) \mid (\XX \cap \XNX) \cup \mat Z 
        & \big[\,\text{by CIRV2, as $\mat X \subseteq \XX$ and $\mat Y \subseteq \XNX$}\,\big] \\
    \implies\qquad
        (\mat X \setminus \mat Z) &\CI (\mat Y \setminus \mat Z) \mid \mat Z 
        & \big[\,\text{since $(\XX \cap \XNX) \cup \mat Z = \mat Z$}\,\big] \\
    \iff\qquad
        \mat X &\CI \mat Y \mid \mat Z 
        & \hspace{-4em}\big[\,\text{standard; e.g., Exercise 4.18 of \textcite{halpern2017reasoning}}\,\big] \\
    \end{align*}
     % = C_1 \cap C_2$. 
    
    Using only the PDG Markov property, we have now shown that every independence 
    modeled by the Markov Network $G$ also holds
    in every distribution $\mu \in \bbr{\dg M}^*_\gamma$. Moreover, $G$ is chordal (as we will prove momentarily),
    and it is well known that distributions that have the independencies of a chordal graph can be represented by clique trees \parencite[Theorem 4.12]{koller2009probabilistic}.
    %
    % Thus we can apply \cref{claim:can-use-cliquetree}, and conclude that
    %
    Therefore, there is a clique tree $\bmu$ representing every $\mu \in \bbr{\dg M}^*_\gamma$.
    
    \begin{iclaim}
        $G$ is chordal.
    \end{iclaim}
    \begin{proof}
        Suppose that $G$ contains a loop $X{-}Y{-}Z{-}W{-}X$.
         % and let $C_1, C_2, C_3, C_4$ be clusters shared, respectively, by $\{X,Y\}$, $\{Y,Z\}$, $\{Z, W\}$, and $\{W, X\}$. 
        % By the running intersection property (2 of the definition of being a tree decomposition of a graph), every 
        % Since every cluster on the unique path from $C_1$ to $C_4$ must contain $C_1 \cap C_4 \ni X$, that path cannot contain $C_2$ or $C_3$
        % Since it is a tree, $\cal T$ cannot contain all four edges of the loop $C_1{-}C_2{-}C_3{-}C_4{-}C_1$; suppose without loss of generality, that it does not contain $C_4{-}C_1$. The path from $C_1$ to $C_4$ must 
        Suppose further, for contradiction, that neither $X$ and $Z$ nor $Y$ and $W$ share a cluster.
        % Then, the unique path between $C_1$ and $C_2$, along which $Y$ is a member of every cluster, must be disjoint from the unique path between $C_3$ and $C_4$, along which $W$ is a member of every cluster.  Because $\cal T$ is a tree, 
        % \TODO
        % It is easy to see that property (2) of the tree decomposition ensures that,
        Given a variable $V$, it is easy to see that property (2) of the tree decomposition ensures that the subtree $\mathcal T(V) \subseteq \mathcal T$ induced by the clusters $C \in \C$ that contain $V$, is connected.
            % \footnote{In fact, this is is equivalent to the running intersection (2) tree decomposition property.}
        % So, there are connected subtrees that contain $X$ and $Y$, 
        By assumption, ${\cal T}(Y)$ and ${\cal T}(W)$ must be disjoint. 
        There is an edge between $Y$ and $Z$, so some cluster must contain both variables, meaning ${\cal T}(Y) \cap {\cal T}(Z)$ is non-empty. 
        Similarly, ${\cal T}(Z) \cap {\cal T}(W)$ is non-empty because of the edge between $Z$ and $W$.
        This creates an (indirect) connection in $\cal T$ between ${\cal T}(Y)$ and ${\cal T}(W)$. Because $\cal T$ is a tree, and ${\cal T}(Y) \cap {\cal T}(W) = \emptyset$,
        every path from a cluster $C_1 \in {\cal T}(Y)$ to a cluster $C_2 \in {\cal T}(W)$ must pass through ${\cal T}(Z)$, which is not part of ${\cal T}(Y)$ or ${\cal T}(W)$. 
        Now, ${\cal T}(X)$ and ${\cal T}(Y)$ intersect as well, meaning that, for any $C \in {\cal T}(X)$, there is a (unique) path from $C$ to that point of intersection, then across edges of ${\cal T}(Y)$, then edges of ${\cal T}(Z)$, and finally connects to the clusters of ${\cal T}(W)$. And also, since $\cal T$ is a tree, that path must be unique. 
        % The problem is that we , as do ${\cal T}(X)$ and
        % The problem is that ${\cal T}(X)$ and ${\cal T}(W)$ also intersect directly.
        The problem is that there is also an edge between $X$ and $W$, so there's some cluster that contains $X$ and $W$; let's call it $C_0$.
         % So the unique path from $C_0$ to the cluster $D_0$ 
        It's distinct from the cluster $D_0$ that contains $Z$ and $W$, since no cluster contains both $X$ and $Z$ by assumption.
        The unique path from $C_0$ to $D_0$
        % , like all paths from a cluster in ${\cal T}(X)$ to one in ${\cal T}(W)$
        intersects with ${\cal T}(Y)$.
        But now $W \in C_0 \cap D_0$, and by the running intersection property, every node along this unique path must contain $W$ as well. 
        But this contradicts our assumption that ${\cal T}(W)$ is disjoint from ${\cal T}(Y)$! So $G$ is chordal.
    \end{proof}
\end{lproof}

\textbf{\cref{prop:cluster-inc-correct}.}\textit{
    If $(\bmu, \mat u)$ is a solution to \eqref{prob:cluster-inc}, then
    \begin{enumerate}[label={(\alph*)},nosep]
    \item $\bmu$ is calibrated, with $\Pr_{\bmu} \in \bbr{\dg M}^*_0$, and
    \item the objective of \eqref{prob:cluster-inc} evaluated at $\mat u$ equals $\aar{\dg M}_0$.
    \end{enumerate}
}
\begin{lproof}\label{proof:cluster-inc-correct}
    The final constraints alone are enough to ensure that $\bmu$ is calibrated. 
    Much like before, the exponential conic constraints tell us that
    \[
        \forall (a, s,t) \in \V\!\Ar.\quad
            u_{a,s,t} \ge \mu_{C_{\!a}}\!(s,t) \log \frac{\mu_{C_{\!a}}\!(s,t)}{\mu_{C_{\!a}}\!(s)\p_a(t|s)}
    \]
    and they hold with equality (at least at those indices where $\beta_a > 0$) because $\mat u$ is optimal. 
    So
    \begin{align*}
        \sum_{(a,s,t) \in \V\!\Ar} \beta_a u_{a,s,t} 
        &= \sum_{(a,s,t) \in \V\!\Ar} \beta_a \mu_{C_{\!a}}\!(s,t) \log \frac{\mu_{C_{\!a}}\!(s,t)}{\mu_{C_{\!a}}\!(s)\p_a(t|s)} \\
        &= \sum_a \beta_a \sum_{(s,t) \in \V a}\mu_{C_{\!a}}\!(s,t) \log \frac{\mu_{C_{\!a}}\!(s,t)}{\mu_{C_{\!a}}\!(s)\p_a(t|s)} \\
        &= \OInc_{\dg M}(\Pr\nolimits_{\bmu}).
    \end{align*}
    Because $\bmu$ is optimal, it is the choice of calibrated clique tree that minimizes this quantity.
    By \cref{coro:can-use-cliquetree}, the distribution $\bbr{\dg M}^*$ can be represented by such a clique tree, and by \textcite[Prop. 3.4]{pdg-aaai}, 
    this distribution minimizes $\OInc_{\dg M}$. 
    All this is to say that there exist clique trees of this form whose corresponding distributions attain the minimum value $\OInc_{\dg M}(\Pr_{\bmu}) = \aar{\dg M}_0$.
    So $\bmu$ must be one of them, as it minimizes $\OInc(\Pr_{\bmu})$ among such clique trees by assumption. Thus $\Pr_{\bmu} \in \bbr{\dg M}^*_0$ and the objective value of \eqref{prob:cluster-inc} equals $\aar{\dg M}_0$. 
    % The result follows from the fact that 
\end{lproof}

\textbf{\cref{prop:cluster-small-gamma-correct}.}
\textit{ If $(\bmu, \mat u, \mat v)$ is a solution to \eqref{prob:cluster-small-gamma},
and $\bbeta \ge \gamma \balpha$, then
$\Pr_{\bmu}$ is the unique element of $\bbr{\dg M}^*_\gamma$,
and the objective of \eqref{prob:cluster-small-gamma} at $(\bmu, \mat u, \mat v)$ equals $\aar{\dg M}_\gamma$.
}

\begin{lproof}\label{proof:cluster-small-gamma-correct}
    Suppose that $(\bmu, \mat u, \mat v)$ is a solution to \eqref{prob:cluster-small-gamma}.    
    The first and fourth lines of constraints ensure that $\bmu$ is indeed a calibrated clique tree.  The second line of constraints plays exactly the same role that it did in the previous problems, most directly in the variant \eqref{prob:cluster-inc} for $\gamma=0$. In particular, it says
    \[
        \forall (a, s,t) \in \V\!\Ar.\quad
            u_{a,s,t} \ge \mu_{C_{\!a}}\!(s,t) \log \frac{\mu_{C_{\!a}}\!(s,t)}{\mu_{C_{\!a}}\!(s)\p_a(t|s)}
    \]
    and, as before, this holds with equality (at least at those indices where $\beta_a > \alpha_a\gamma$) because $\mat u$ is optimal.
     % 
    Because $\bbeta \ge \gamma \balpha$ by assumption, either $\beta_a > \gamma \alpha_a$ or the two are equal, for every $a \in \Ar$. Either way, 
    the argument used at this point in \hyperref[proof:cluster-inc-correct]{the proof of} \cref{prop:cluster-inc-correct} goes through, giving us:
    \begin{align*}
        \sum_{(a,s,t) \in \V\!\Ar} (\beta_a - \alpha_a\gamma) u_{a,s,t} 
        &= \sum_{(a,s,t) \in \V\!\Ar} (\beta_a - \alpha_a\gamma) \mu_{C_{\!a}}\!(s,t) \log \frac{\mu_{C_{\!a}}\!(s,t)}{\mu_{C_{\!a}}\!(s)\p_a(t|s)} \\
        &= \sum_a (\beta_a - \alpha_a\gamma) \sum_{(s,t) \in \V a}\mu_{C_{\!a}}\!(s,t) \log \frac{\mu_{C_{\!a}}\!(s,t)}{\mu_{C_{\!a}}\!(s)\p_a(t|s)} \\
        % &= \OInc_{\dg M}(\Pr\nolimits_{\bmu}).
        &= \sum_a (\beta_a - \alpha_a\gamma)~
            \kldiv[\Big]{\mu_{C_{\!a}}\!(\Src a, \Tgt a)}
                  {\mu_{C_{\!a}}\!(\Src a)\, \p_a(\Tgt a|\Src a)}
    \end{align*}
    This time, though, that's not the problem objective. In this regard, our problem \eqref{prob:cluster-small-gamma} is more closely related to \eqref{prob:joint-small-gamma}.
    
    Before we get to that, we have to first bring in the final collection of exponential constraints, which show that
    \begin{align*}
        \forall C \in \C.~ \forall c \in \V(C).\quad
            v_{C,c} \ge \mu_{C}(c) \log \frac{\mu_C(c)}{ \Pash_C(c) },
    \end{align*}
    and yet again these constraints hold with equality, 
    for otherwise $\mat v$ would not be optimal (since we assumed $\gamma > 0$). Therefore,
    \[
        \sum_{\mathclap{(C,c) \in \V\C}} v_{C,c}
        ~=~
        \sum_{\mathclap{(C,c) \in \V\C}}
        \mu_{C}(c) \log \frac{\mu_C(c)}{ \Pash_C(c) } 
        ~=~ - \H(\Pr\nolimits_{\bmu})\quad\text{by \cref{eq:cluster-ent-decomp}}. 
    \]
        
    Now, the objective of our problem \eqref{prob:cluster-small-gamma} is essentially the same as that of \eqref{prob:joint-small-gamma}, so the analysis in \hyperref[proof:joint-small-gamma-correct]{the proof of} \cref{prop:joint-small-gamma-correct} applies with only a handful of superficial modifications. 
    Using that proof to take a shortcut, the objective of \eqref{prob:cluster-small-gamma} must equal
    \begin{align*}
    % \textit{(\ref{prob:joint-small-gamma}.obj)} =
        &\sum_{\mathclap{(a,s,t) \in \V\!\Ar}}
            (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
        ~+ \gamma \sum_{\mathclap{(C,c) \in \V\C}} v_{C,c}
        ~- \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}
            \alpha_a\gamma \, \mu_{C_{\!a}}\!(s,t) \log \p_a (t|s) \\
    &=
        \sum_{\mathclap{(a,s,t) \in \V\!\Ar}}
             (\beta_a - \alpha_a\gamma) \mu_{C_{\!a}}\!(s,t) \log \frac{\mu_{C_{\!a}}\!(s,t)}{\mu_{C_{\!a}}\!(s)\p_a(t|s)}
        ~-~ \gamma \H(\Pr\nolimits_{\bmu})
        ~-~ \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}
            \alpha_a\gamma \, \mu_{C_{\!a}}\!(s,t) \log \p_a (t|s) \\
    &= 
        \sum_{a \in \Ar} \beta_a
           \Ex_{\mu_{C_{\!a}}}[ \log \p_a(\Tgt a | \Src a)]
        + \sum_{a \in \Ar} (\alpha_a \gamma \!-\! \beta_a)
           \H_{\Pr_{\bmu}}(\Tgt a | \Src a)
        - \gamma \H(\Pr_{\bmu})  \\
    &= \bbr{\dg M}_\gamma(\Pr\nolimits_{\bmu}).
    \end{align*}
    Finally, since $\bmu$ is such that this quantity is minimized, and because 
    its unique minimizer can be represented as a clique tree (per \cref{coro:can-use-cliquetree}), we conclude that $\bmu$ must be the clique tree representation of it.
    Therefore, $\Pr_{\bmu}$ is the unique element of $\bbr{\dg M}^*_\gamma$, and the objective at $(\bmu, \mat u, \mat v)$ equals $\aar{\dg M}_\gamma$, as promised.
\end{lproof}

\textbf{\cref{prop:cluster+idef-correct}.}\textit{
    If $(\bmu, \mat u)$ is a solution to \eqref{prob:cluster+idef},
    then $\bmu$ is a calibrated clique tree
    and $\bbr{\dg M}^*_{0^+} = \{ \Pr_{\!\bmu} \}$.
}
\begin{lproof}\label{proof:cluster+idef-correct}
    Suppose that $(\bmu, \mat u)$ is a solution to \eqref{prob:cluster+idef}. 
    % The same reasoning as in the last two proofs shows that
    The exponential cone constraints state that
    \begin{align*}
        \forall C \in \C.~ \forall c \in \V(C).\quad
        u_{C,c} &\ge \mu_{C}(c) \log \frac{\mu_C(c)}{k_{C,c} \Pash_C(c) } \\
        &= \mu_{C}(c) \log \frac{\mu_C(c)}{\Pash_C(c)}
            - \mu_{C}(c) \log \prod_{a \in \Ar_C} \nu_C (\Tgt a (c) | \Src a (c))^{\alpha_a} \\
        &= \mu_{C}(c) \log \frac{\mu_C(c)}{\Pash_C(c)}
         - \mu_{C}(c) \sum_{a \in \Ar_C} \alpha_a \log \nu_C (\Tgt a (c) | \Src a (c)),
    \end{align*}
    and once again this holds with equality, as each $u_{C,c}$ is minimal with this property.  
    The third line of constraints
    \[
        \forall a \in \Ar.~~\mu_{C_{\!a}}\!(\Src a, \Tgt a) \nu_{C_{\!a}}\!(\Src a) = \mu_{C_{\!a}}\!(\Src a) \nu_{C_{\!a}}\!(\Src a, \Tgt a)
    \]
    and the assumption that $\Pr_{\boldsymbol\nu} \in \bbr{\dg M}^*_0$, suffice to ensure that $\Pr_{\bmu} \in \bbr{\dg M}^*_0$ by \cref{prop:marginonly}.
    % we know that 
    % 
    They also allow us to replace each $\nu_{C_{a}}(\Tgt a(c) | \Src a (c))$ with 
    $\mu_{C_{a}}(\Tgt a(c) | \Src a(c))$, in cases where $\mu_{C_{a}}(\Src a(c)) \ne 0$. 
    Therefore, we calculate the objective to be:
    \begin{align*}
        \mat 1^{\sf T} \mat u &= 
        % \sum_{\!\!\!\!\mathrlap{(C,c) \in \V\C}}
        \sum_{C \in \C} \sum_{c \in \V(C)} \left( \mu_{C}(c) \log \frac{\mu_C(c)}{\Pash_C(c)} -
            \mu_{C}(c) \sum_{a \in \Ar_C} \alpha_a \log \nu_C (\Tgt a (c) | \Src a (c)) 
            \right)\\
        &= \sum_{C \in \C} \sum_{c \in \V(C)}
                \mu_{C}(c) \log \frac{\mu_C(c)}{\Pash_C(c)}
            - \sum_{C \in \C} \sum_{c \in \V(C)}
            \mu_{C}(c) \sum_{a \in \Ar} \mathbbm1[C = C_a] \alpha_a \log \nu_C (\Tgt a (c) | \Src a (c)) \\
        &= -\H(\Pr\nolimits_{\bmu}) - \sum_{a \in \Ar} \alpha_a \sum_{C \in \C} \mathbbm1[C = C_a] \sum_{c \in \V(C)}
            \mu_{C}(c)\log \nu_C (\Tgt a (c) | \Src a (c)) 
            \qquad\qquad\big[\,\text{by \eqref{eq:cluster-ent-decomp}}\,\big]\\
        &= -\H(\Pr\nolimits_{\bmu}) - \sum_{a \in \Ar} \alpha_a \sum_{c \in \V(C_a)}
                \mu_{C_{\!a}}\!(c) \log  \nu_{C_{\!a}}\! (\Tgt a (c) | \Src a (c)) \\
        &= -\H(\Pr\nolimits_{\bmu}) - \sum_{a \in \Ar} \alpha_a \sum_{c \in \V(C_a)}
                \mu_{C_{\!a}}\!(c) \log { \mu_{C_{\!a}}\! (\Tgt a (c) | \Src a (c))} 
            \qquad\Big[~\text{since $\mu_{C_{\!a}}\!(\Src a(c)) > 0$ whenever 
                $\mu_{C_{\!a}}\!(c) > 0$}~\Big]\\
        &= -\H(\Pr\nolimits_{\bmu}) + \sum_{a \in \Ar} \alpha_a \H_{\Pr_{\bmu}}(\Tgt a | \Src a ) \\
        &= \SInc_{\dg M}(\Pr\nolimits_{\bmu}).
    \end{align*}
    
    To summarize: $\Pr_{\bmu}$ minimizes $\SInc_{\dg M}(\Pr\nolimits_{\bmu})$ among calibrated clique trees with conditional marginals matching those of $\boldsymbol\nu$.
    Since we know that there is a unique distribution that minimizes $\SInc_{\dg M}$ among the elements of $\bbr{\dg M}_0^*$, and also that this distribution can be represented by a clique tree (by \Cref{coro:can-use-cliquetree}), we conclude that $\bmu$ must represent this distribution. Thus, $\Pr_{\bmu} = \bbr{\dg M}^*$ as desired. 
\end{lproof}

\begin{lemma} \label{lem:mainlemma}
    Fix integers $n_{\sf o}, n_{\sf e} \in \mathbb N$, and let $n:= 3n_{\sf e} + n_{\sf o}$.
    %joe6
    %    Suppose that $K = \mathbb R_{\ge 0}^{n_{\sf o}} \times K^{n_{\sf
    %oli6: you parsed this wrong; it requires more significant rework than this if you want to start with "If". Silently reverting/rewriting.
    %If
     Suppose that
     $K = \mathbb R_{\ge 0}^{n_{\sf o}} \times K^{n_{\sf e}}_{\exp} \subset \mathbb R^n$ is a product cone, consisting of $n_{\sf o}$ copies of the non-negative orthant and $n_{\sf e}$ copies of the exponential cone.
    % 
    If, for
    % $\mat c \in \Rext^{n}$, $\mat b \in \Rext^m$, and $\mat A \in \Rext^{m \times n}$,
    % $c \in \Rext^{n}$, $ b \in \Rext^m$, and $A \in \Rext^{m \times n}$,
    $c \in [-1,1]^{n}$, $ b \in [-1,1]^m$, and $A \in [-1,1]^{m \times n}$,
    the exponential conic program
    % \eqref{eq:exp-conic-program}
    \begin{align*}
        &
        \minimize_{
            % x \in K}~~ c^{\sf T} x
            \mat x \in K}~~ \mat c^{\sf T} \mat x
        \quad\subjto\quad A \mat x = \mat b,
        % ~~ x \in K,
        \tag{\ref{eq:exp-conic-program}}
        % \qquad 
        % \text{and its dual}\qquad
    \end{align*}
    %joe6: I don't know what it means for an exponential program to be
    %strictly feasible as its dual problem.  Is that standard terminology?
    %oli6: yes, but more importantly the definition is right here; I think the problem is that your reading frame got misaligned. This isn not the result of the lemma. 
    % admits an optimal solution $x^*$ in the interior of $K$, 
    is strictly feasible (i.e., if there exists $\mat x \in \mathrm{int}\, K$  such that $A \mat x = \mat b$ )
    as is its dual problem
    % and $A^{\sf T} \mat y + \mat s = \mat c$)
    % then both it and its dual
    \[
        \maximize
            % \limits_{\mathclap{ s \in \Rext^{3n}\!,\; \mat y \in \Rext^{m}}}
            \limits_{
            % s \in K_*,\, y \in \Rext^m} ~~ b^{\sf T} y
            \mat s \in K^*,\, \mat y \in \Rext^m} ~~ \mat b^{\sf T} \mat y
        % \quad\text{\sf subject to}~~  A^{\sf T} y  +  s = c,
        \quad\subjto\quad  A^{\sf T} \mat y  +  \mat s = \mat c,
            % ~~\mat s \in (K^*_{\exp})^n
    \]
    (i.e., if there exists $\mat s \in \mathrm{int}\,K^*$ such that $A^{\sf T} \mat y + \mat s = \mat c$), 
    then both
    can be simultaneously
    % solved to machine precision
    solved to precision $\epsilon$
    in $O(n (m+n)^{\omega} \log\frac{n+m}{\epsilon}
    % \log(\nicefrac1\epsilon)
    )$ time,
    where $\omega$ is the smallest exponent such that a linear system of $k$ variables and equations can be solved in $O(k^\omega)$ time. 
    %
    Furthermore, MOSEK solves this problem in $O(n (m+n)^3 \log \frac{n+m}{\epsilon})$ time. 
    % \TODO[What does ``$\epsilon$-close'' mean here? Can we get machine precision?]
\end{lemma}
\begin{lproof}
    For this, we begin by appealing to the algorithm and analysis of
    \textcite{badenbroek2021algorithm}, threading details through for this specific choice of cone $K$. 
    To finish the proof, however, we will also need to supplement that analysis with some other well-established results of \textcite{nesterov1996infeasible} that the authors were no doubt familiar with, but did not bother referencing.
    % where the conic constraint is $K_{\exp}^n$.
    
    % First, we'll need quite a few definitions.
    % First, some definitions.
    First, we'll need some background material from convex optimization.
    A \emph{logarithmically homogeneous self-concordant barrier} with parameter $\nu$ ($\nu$-LHSCB) for a cone $K$ is a thrice differentiable strictly convex function $F: \mathrm{int}\, K \to \mathbb R$ satisfying
    $F(tx) = F(x) - \nu \log t$
    for all $t > 0$ and $x \in \mathrm{int}\, K$. 
    In some sense, the point of such a barrier function is to augment the optimization objective so that we remain within the cone during the optimization process. 
    
    % Consider a point $\mat x = (\mat x_1, \mat x_2, \mat x_3) \in K_{\exp}^n$, 
        % where $\mat x_1, \mat x_2, \mat x_3 \in \mathbb R^{n}$. 
    % Consider a point $\mat x = 
    % (x^1_1, \ldots, x_n^1, x^2_1, \ldots, x^2_n, x^3_1, \ldots, x^3_n) 
    % (x_1^1, x_2^1, x_3^1, \ldots, x_1^n, x_2^n, x_3^n) 
    % \in K_{\exp}^n$.
        % where $\mat x_1, \mat x_2, \mat x_3 \in \mathbb R^{n}$. 
    
    % We now take a break from their presentation to 
    For the positive orthant cone $\mathbb R_{\ge 0}$, the function 
        $x \mapsto - \log x$ is a 1-LHSCB. 
    We now fill in some background facts about exponential cones.
    The \emph{dual} of the exponential cone is
    % https://www.seas.ucla.edu/~vandenbe/236C/lectures/conic.pdf, slide 25. 
    \begin{align*}
        K_{\exp}^* &:= \big\{ (s_1, s_2, s_3) \in \mathbb R^3 \;:\;
            \forall (x_1, x_2, x_3) \in K_{\exp}.~~
            x_1 s_1 + x_2 s_2 + x_3 s_3 \ge 0   \big\}\\
            &= \big\{
                (s_1, s_2, s_3) \;:\; - s_1 \log (- s_1 / s_3) + s_1 - s_2 \le 0, 
                    \, s_1 \le 0,\,  s_3 \ge 0
            \big\}.
    \end{align*}
    Consider points $x = (x_1, x_2, x_3) \in K_{\exp}$. 
    The function 
    \begin{equation}
        F_{\exp}(x) := - \log \Big(x_2 \log\frac{x_1}{x_2} - x_3\Big) - \log x_1 x_2
        % F(\mat x) := \sum_{i=1}^{n}
            % - \log \Big(x_2^i \log\frac{x_1^i}{x_2^i} - x_3^i\Big) - \log x_1^i x_2^i
        % F(\mat x) := - \log \Big(\mat x_2 \odot \log\frac{\mat x_1}{\mat x_2} - \mat x_3\Big) - \log (\mat x_1 \odot \mat x_2)
    \end{equation}
    % is a $3n$-LHSB for $K_{\exp}^n$, since
    is a $3$-LHSCB for $K_{\exp}$, since
    \begin{align*}
        F_{\exp}(t x) &= 
            -\log \Big( t x_2 \log \frac{t x_1}{t x_2} - t x_3\Big) - \log(t^2 x_1 x_2) \\
            % \sum_{i=1}^{n} - \log \Big( t x_2^i \log \frac{t x_1^i}{t x_2^i} - t x_3^i\Big) - \log(t^2 x_1^i x_2^i) \\ 
        % &= - \log \Big(t \big(\log \frac{x_1}{x_2} - x_3\big)\Big) - \log(x_1 x_2) - 2 \log t \\
        &= - \log \Big(t \big(x_2 \log \frac{x_1}{x_2} - x_3\big)\Big) - \log(x_1 x_2) - 2 \log t \\
        &= F_{\exp}(x) - 3 \log t.
    \end{align*}
    % Taking $k$ copies of a cone, each of which has a $\nu$-LHSCB, allows for a $(k\nu)$-LHSCB by summation.
    Such barrier functions can be combined to act on product cones by summation. 
    Concretely, suppose that for each $i \in \{1, \ldots, k\}$,
    we have a $\nu_i$-LHSCB $F_i: \mathrm{int}\, K_i \to \Rext$.
    Then, for $x = (x_i)_{i=1}^k \in \prod_i K_i$, the function
    $F(x) := \sum_{i=1}^k F_i(x_i)$ is a $(\sum_i \nu_i)$-LHSCB for $\prod_i K_i$,
    since
    \[
        F(tx) = \sum_{i=1}^k F_i(t x_i)
            = \sum_{i=1}^k \big( F_i(x_i) - \nu_i \log t \big)
            = F(x) - \Big(\sum_{i=1}^k \nu_i\Big) \log t.
    \]
    %
    In this way, our product cone $K = \mathbb R_{\ge 0}^{n_{\sf o}} \times K^{n_{\sf e}}_{\exp}$ admits a LHSCB $F$ with parameter $\nu = n_{\sf o} + 3 n_{\sf e} = n$. 
    Furthermore, $F$ can be evaluated in $O(n)$ time, as can each component of
    its gradient $F'(x)$ and Hessian $F''(x) \in \mathbb R^{n \times n}$ at $x$, all of which can be expressed analytically.
    % $9n^2$ entries.
    % Note that the latter has $(3n)^2$ entries. 
    In addition, the convex conjugate of $F$
    % \unskip, given by 
    % \begin{align*}
    %     F_*(s) &:=  
    %     % \sup_{x \in \mathrm{int}\, K_{\exp}^n} \{ - s^{\sf T} x- F(x) \}
    %     \sup\{ - s^{\sf T} x- F(x) : x \in \mathrm{int}\, K_{\exp}^n \}
    %         \\
    %     &= \color{red} \texttt{??? TODO}
    % \end{align*}
    also has a known analytic form.
    
    
    % Infeasible-start primal-dual interior point methods \parencite{nesterov1996infeasible}
    % Interior point methods 
    Generally speaking, 
    the idea behind primal-dual interior point methods \parencite{nesterov1994book} such as the one behind MOSEK, is 
    to maintain both a point $x \in K$ and a dual point $s \in K_*$ (as well as $y \in \mathbb R^m$)
    and iteratively update them, as we slowly relax the barrier and approach a point on the boundary of the cone. 
    The quantity $\mu(z) := \nf{\langle s, x \rangle}{\nu} \ge 0$, called the complementarity gap, is a measure of how close the process is to converging. 
    
    Because the initial points may not satisfy the constraints, instead 
    the standard algorithms work with ``extended points'' $\bar x = (x, \tau)$ and $\bar s = (s, \kappa)$, for which the analogous complementarity gap is 
    $\mu^{\sf e}(\bar x, \bar s) := (\langle x, s\rangle + \kappa\tau)/(\nu+1)$.
    Altogether, the data at each iteration may be summarized as a point $z = (y, x, \tau, s, \kappa) \in \mathbb R^m \times (K \times \mathbb R_{\ge 0}) \times (K_* \times \mathbb R_{\ge 0})$.
        % The development requires a
    The primary object of interest is then something called the \emph{homogeneous self-dual} model.
    Originally due to \textcite{nesterov1996infeasible} and also used by others \parencite{skajaa2015homogeneous},
    it can be defined as a linear operator:
    \begin{align*}
        G
            &: \Rext^{m + 2n + 2} \to \Rext^{n + m + 1} \\
        G(y,x,\tau,s,\kappa) 
            &:= \begin{bmatrix}
                0          &  A  &  -b \\
                -A^{\sf T} &  0  &  c \\
                b^{\sf T}  & -c^{\sf T} & 0
        \end{bmatrix}
        \begin{bmatrix}
            y \\ x \\ \tau
        \end{bmatrix}
        -
        \begin{bmatrix}
            0 \\ s \\ \kappa
        \end{bmatrix}.
    \end{align*}
    The reason for our interest is that if $z$ is such that $G(z) = 0$ and $\tau > 0$, then $(\nf x\tau)$ is a solution to the primal problem, and $\nf{(y,s)}{\tau}$ is a solution to the dual problem \parencite[Lemma 1]{skajaa2015homogeneous}, while if $G(z) = 0$ and $\kappa > 0$, then at least one of the two problems is infeasible. 
    
    % which is an $(3n+m+1)$-dimensional square matrix. 
    % which is a linear operator $G$
    We now are in a better position to describe the algorithm.    
    According to the MOSEK documentation \parencite{dahl2022primal},
        for the exponential cone,
            the algorithm begins with the initial point
        \[
            \mat v := (1.291, 0.805, -0.828) \in (K_{\exp} \,\cap\, K_{\exp}^*).
        \]
    Accordingly, for our particular product cone $K$,
    % the algorithm initializes a guess
    the algorithm begins at the initial point
    \begin{align*}
        z_0 := (y_0, x_0, \tau_0, s_0, \kappa_0)
        \qquad
                    \text{where}\quad
                % x_0 &= s_0 = (1.291, 0.805, -0.828)^n \in 
                %     (K_{\exp} \,\cap\, K_{\exp}^*)^n, \\
                x_0 &= s_0 = (\overbrace{\vphantom| 1, \ldots, 1}^{n_{\sf o} \text{ copies}},~
                    \overbrace{\vphantom|
                        \mat v, \ldots, \mat v}^{n_{\sf e}\text{ copies}})
                \in (\mathbb R_{\ge 0})^{n_{\sf o}} \times (K_{\exp} \,\cap\, K_{\exp}^*)^{n_{\sf e}}, \\
            \qquad
                y_0 &= \mat 0  \in \mathbb R^m,
            \quad
                \tau_0 = \kappa_0 = 1.
    \end{align*}
    
    \def\daff#1{\Delta {#1}^{\text{aff}}}
    \def\dcen#1{\Delta {#1}^{\text{cen}}}
    
    At each iteration, the first step is to predict a direction for which 
    \textcite{badenbroek2021algorithm}
    compute a scaling matrix $W$.
    To describe it, we first need to define \emph{shadow iterates}
    \[
        \tilde x := -F'_*(s)
        \qquad \text{and} \qquad
        \tilde s := - F'(x).
    \]
    which are in a sense reflections of $s$ and $x$ across their barrier functions, and can be computed in $O(n)$ time. 
    The analogous notion of complementarity can then be defined as $\tilde \mu(z) := \nicefrac{\langle \tilde x, \tilde s \rangle}{\nu}$.
    The scaling matrix, which we do not interpret here, can then be calculated as:
    \begin{equation}
        W :=
            \mu F''(x) + \frac{s s^{\sf T}}{\nu \mu}
            - \frac{\mu \tilde s \tilde s^{\sf T}}{\nu}
            + \frac{(s- \mu\tilde s)(s-\mu\tilde s)^{\sf T}}
                    {(s-\mu\tilde s)^{\sf T} (x - \mu \tilde x) }
            - \frac{\mu [ F''(x) \tilde x - \tilde \mu \tilde s]
                [ F''(x) \tilde x - \tilde \mu \tilde s]^{\sf T}}
                % { \Vert \tilde x \Vert^2_x - \nu \tilde \mu^2}
                { \tilde x^{\sf T} F''(x) \tilde x - \nu \tilde \mu^2}
        \label{eq:scalemat}
    \end{equation}
    Doing so requires $O(n^2)$ steps (although it may be parallelized). 
    The first four terms clearly require $O(n^2)$ steps, since each one is an $n \times n$ matrix (the first a scaled Hessian, and the other three outer products). 
    The last term computes a matrix-vector product (which requires $O(n^2)$ steps), and computes an outer product with the resulting vector, which takes $O(n^2)$ steps as well. 
    
    The next step involves finding a solution $\daff z = (\daff y, \daff x, \daff \tau, \daff s, \daff \kappa)$
    to the system of equations
    \begin{subequations} \label{eqns:sys1}
    \begin{align}
        G(\daff z) &= -G(z) \\
        \tau \daff\kappa + \kappa \daff\tau &= - \tau \kappa \\
        W \daff x + \daff s &= -s
    \end{align}
    \end{subequations}
    
    % Since $\mathrm{dim}\,(z) = 6n+m+2$ and $\mathrm{dim}\,(x) = 3n$,
    % \eqref{eqns:sys1} is a system of $(3n + m + 1) + 1 + (3n) = 6n + m + 2$ equations
    (\ref{eqns:sys1}a-c) describe a system of $(n + m + 1) + 1 + (n) = 2n + m + 2$ equations in equally many unknowns, 
    % which can be solved naively with Gaussian elimination in $O((n+m)^3)$ iterations,
    % or with slightly better asymptotic complexity of
    % $O((n+m)^{2.332})$.
    which can be solved in $O((n+m)^\omega)$ steps.
    It may be possible to exploit the sparsity of $G$ to do better.
    
    The next step is to center that search direction so that it lies on the central path. This is done by finding a solution $\dcen z$ to
    \begin{subequations}\label{eqns:sys2}
    \begin{align}
        G(\dcen z) &= G(z) \\
        \tau \dcen \kappa + \kappa \dcen \tau &= \mu^e \\
        W \dcen x + \dcen s &= \mu^e \tilde s
    \end{align}
    \end{subequations}
    which again can be done in $O((n+m)^3)$ steps with Gaussian elimination, or
    with a fancier solver in $O((n+m)^{2.332})$ steps. 
    %
    The two updates are then applied to the current point $z$ to obtain
    \[
        z_+ = (y_+, x_+, \tau_+, s_+, \kappa_+) := z + \alpha (\daff z + \gamma \dcen z).
    \]
    
    Finally, a ``correction step'', which is the primary innovation of \textcite{badenbroek2021algorithm} and used in MOSEK's algorithm, 
    is a third direction $\Delta z_+^{\text{cor}}$, which is found by solving the system of equations
    \begin{subequations}\label{eqns:sys3}
    \begin{align}
        G(\Delta z^{\text{cor}}) &= 0 \\
        \tau_+ \Delta \kappa^{\text{cor}}  + \kappa_+ \Delta \tau^{\text{cor}} &= 0 \\
        W_+ \Delta x^{\text{cor}} + \Delta s^{\text{cor}} &= \mu^e \tilde s
    \end{align}
    \end{subequations}
    where
    % \[
    %     W_+ :=
    %         \mu_+ F''(x_) + \frac{s s^{\sf T}}{\nu \mu}
    %         - \frac{\mu \tilde s \tilde s^{\sf T}}{\nu}
    %         + \frac{(s- \mu\tilde s)(s-\mu\tilde s)^{\sf T}}
    %                 {(s-\mu\tilde s)^{\sf T} (x - \mu \tilde x) }
    %         - \frac{\mu [ F''(x) \tilde x - \tilde \mu \tilde s]
    %             [ F''(x) \tilde x - \tilde \mu \tilde s]^{\sf T}}
    %             % { \Vert \tilde x \Vert^2_x - \nu \tilde \mu^2}
    %             { \tilde x^{\sf T} F''(x) \tilde x - \nu \tilde \mu^2}
    % \]
    $W_+$ is defined the same way that $W$ is, except that it uses the components of $z_+$ instead of $z$.     
    After adding the correction step $\Delta z_+^{\text{cor}}$ to $z_+$, we repeat the entire process. The full algorithm, then, is summarized as follows:

    \begin{algorithmic}
        % \State $z \gets (y_0, x_0, \tau_0, s_0, \kappa_0)$ as in \eqref{bbeq:init};
        \STATE $z \gets (y_0, x_0, \tau_0, s_0, \kappa_0)$;
        \WHILE{}
            \STATE Compute scaling matrix $W$ as in \eqref{eq:scalemat};
            \STATE Find the solution $\daff z$ to (\ref{eqns:sys1}a-c),
                and the solution $\dcen z$ to (\ref{eqns:sys2}a-c);
            \STATE $z_+ \gets z + \alpha (\daff z + \gamma \dcen z)$;
            \STATE Compute the scaling matrix $W_+$;
            \STATE Find the solution $\Delta z^{\text{cor}}_+$ to (\ref{eqns:sys3}a-c);
            \STATE $z \gets z_+ + \Delta z_+^{\text{cor}}$;
        \ENDWHILE
    \end{algorithmic}
    
    % and each iteration of it takes $O((n+m)^{2.332})$ time. 
    We have verified that each iteration of this process can be done in $O((n+m)^\omega)$ time.
    Their main result \parencite[Theorem 3]{badenbroek2021algorithm}, states that for every $\epsilon \in (0,1)$,
    the algorithm results in a solution $z$ satisfying
    \[
        \mu^{\sf e}(z)
        % \frac{x^{\sf T} s + \tau \kappa}{3n + 1}
        \le \epsilon
        \qquad \text{and}\qquad
        \Vert G(z) \Vert \le \epsilon 
            \Vert G(z_0) \Vert 
            % \sqrt{3n + 1}
    \]
    in $O(n \log (1/\epsilon))$ iterations, 
    for a total cost of 
    $O(n (m+n)^3 \log (1/\epsilon) )$ time with Gaussian elimination, or
    $O(n (m+n)^{2.332} \log (1/\epsilon) )$ time using the linear solver with best
         known asymptotic complexity as of 2022 \cite{duan2022faster}.

    \medskip
    \hrule
         
    \textbf{Verifying that the solution is approximately optimal.} 
    What we have at this point is not quite enough: simply because the residual quantity $G(z)$ is approximately zero (so that we have approximately solved the homogeneous model), does not mean that we've approximately solved the original problem.
    Specifically, it's entirely possible on the surface that the parameter $\tau$ goes to zero at the same rate as everything else, and the quantity $(x/\tau)$ does not converge to a solution to the primal problem.
    To address this issue, we must also trace the analysis of the seminal work of \textcite{nesterov1996infeasible}, who use slightly different quantities, conflicting with the notation we have been using thus far.
     
     
    Following \textcite[pg. 231]{nesterov1996infeasible}, fix an initial point $z_0$, and let the \emph{shifted feasible set} 
    $\mathcal F := \{ z
        \in \mathbb R^m \times K \times \mathbb R_{\ge 0} \times K^* \times \mathbb R_{\ge 0}
        : G(z) = G(z_0)\}$
    be the collection of all points that have the same residual as $z_0$.     
    \citeauthor*{nesterov1996infeasible} also refer to a complementary gap by $\mu(z)$ and define it identically, but the meaning of this parameter is different, because the set $\mathcal F$ on which it's defined is quite distinct from (if closely related to) the iterates of \citeauthor*{badenbroek2021algorithm}'s algorithm.
    In the service of clarity, 
    we will call this quantity $\mu^{\sf N}(z^{\sf N})$, for 
    $z^{\sf N}
    = (y^{\sf N}, x^{\sf N}, \tau^{\sf N}, s^{\sf N}, \kappa^{\sf N})
    \in \mathcal F$.
    
    
    Although we made a point of emphasizing that the two are distinct, 
    the actual relationship between them is straightforward.
    Let $z = (y, x,\tau, s, \kappa)$ be the final output of \textcite{badenbroek2021algorithm}. 
    % Because they also prove that
    In proving their main theorem, they also prove that
    $G(z) = \epsilon G(z_0)$,  and $\mu^{\sf e}=\epsilon$;
    because $G$ is linear, we know that $G(\nf{z}{\epsilon}) = G(z_0)$.
    This means that $z^{\sf N} 
    := \nf z\epsilon \in \mathcal F$. 
    % This is precisely what it means for $\frac{z}{\epsilon}$ to be a member of the \emph{shifted feasible set} $\mathcal F$ defined by \textcite[pg. 231]{nesterov1996infeasible}.
    Therefore, 
    \[  
        \mu^{\sf N}(z^{\sf N} )
        = \frac{1}{\nu+1} \left( \left\langle\frac{s}{\epsilon}, \frac{x}{\epsilon}\right\rangle + \frac{\tau}{\epsilon} \frac{\kappa}{\epsilon} \right) 
    = \frac{1}{\epsilon^2} \mu^{\sf e}(z) = \frac{1}{\epsilon}.
    \]
    % where $(*)$ is the definition of $\mu^{\sf e}$. 
    So, roughly speaking, $\mu^{\sf N}$ and $\mu^{\sf e}$ are reciprocals.
    % They also
    \citeauthor*{badenbroek2021algorithm} also
    prove that, every iterate $z$ satisfies their assumption (A2): for a fixed constant $\beta$ (equal to 0.9 in their analysis), $\beta \mu^{\sf e}(z) \le \tau \kappa$. 
    Consequently, 
    it happens that the same inequality holds with Nesterov's notation: 
    \[ 
        \tau^{\sf N} \kappa^{\sf N} = 
        \frac{\tau}{\epsilon} \frac{\kappa}{\epsilon}
            = \frac{\tau\kappa}{\epsilon^2} \ge \frac{\beta \epsilon}{\epsilon^2}
            = \frac\beta\epsilon = \beta \mu^{\sf N}
                % \left(\frac{z}{\epsilon}\right)
                (z^{\sf N}).
    \]
    This witnesses that $z^{\sf N} = \frac{z}{\epsilon}$ satisfies equation (81) of \citeauthor{nesterov1996infeasible}, which allows us to apply one of their main theorems, which addresses these issues.
    Supposing that the original problem is solvable, 
    let $(x^*, s^*)$ be any solution to the primal and dual problems,
    and define the value $\psi := 1 + \langle s_0, x^*\rangle + \langle s^*, x_0 \rangle \ge 1$, 
    which depends only on the problem and the choice of initialization.
    Then Theorem 1, part 1 of \citeauthor*{nesterov1996infeasible}, allows us to conclude that
    \[
        \frac{\kappa}{\epsilon}  \le \psi
        \quad\text{and}\quad
        \frac\tau\epsilon \ge \frac{\beta}{\epsilon\psi}
    %
    \qquad\qquad \iff\qquad\qquad
        \kappa \le \epsilon\psi
        \quad\text{and}\quad \tau \ge \frac\beta\psi.    
    \]
    Finally, the original theorem guarantees that 
    $\Vert G(z) \Vert \le \epsilon \Vert G(z_0)\Vert$, meaning that
    \[
        \Big\Vert A \Big(\frac x\tau\Big) - b \Big\Vert \tau
        ~+~\Big\Vert A^{\sf T} \Big(\frac y\tau\Big) + \frac{s}\tau - c \Big\Vert \tau
        ~+~\Big\Vert b^{\sf T} \left(\frac{y}{\tau}\right) - c^{\sf T} \left(\frac{x}{\tau}\right) - \frac\kappa\tau \Big\Vert \tau \le \epsilon \Vert G(z_0) \Vert
    \]
    Since the Euclidean norm is an upper bound on the deviation in any component
    ( $\Vert v \Vert := \sqrt{\sum_i v_i^2} \ge \sqrt{\max_i v_i^2} = \max_i |v_i| =: \Vert v\Vert_\infty$ ), this means that in light of our bound on $\tau$ above, we have
    \[
        \Big\Vert A \Big(\frac x\tau\Big) - b \Big\Vert_\infty
        ~+~\Big\Vert A^{\sf T} \Big(\frac y\tau\Big) + \frac{s}\tau - c \Big\Vert_\infty 
        ~+~\Big\Vert b^{\sf T} \left(\frac{y}{\tau}\right) - c^{\sf T} \left(\frac{x}{\tau}\right) - \frac\kappa\tau \Big\Vert_\infty
        \le \epsilon  \frac{\beta \Vert G(z_0) \Vert}{\psi }.
    \]
    The first two components show that the total constraint violation (in the primal and dual problems, respectively) is at most $\nicefrac{\epsilon\beta}{\psi} \Vert G(z_0) \Vert$.
    Meanwhile, the final component shows that the duality gap 
    $\mathit{gap} = b^{\sf T} (\frac{y}{\tau}) - c^{\sf T} (\frac{x}{\tau})$,
    which is positive and an upper bound on the difference between the objective at $x/\tau$ and the optimal objective value, satisfies
    \[   
        \mathit{gap}  \le \mathit{gap} + \frac\kappa\tau \le \frac{\epsilon \beta \Vert G(z_0) \Vert}{\psi}.
    \]    
    %oli14: adding some text to this to clarify
    Thus $x/\tau$ is an $(\epsilon \Vert G(z_0)\Vert)$-approximate solution to the original exponential conic problem.
    Since also $\psi \ge 1$, we may freely drop it to get a looser bound.
    All that remains is to investigate $\Vert G(z_0) \Vert$,
     the residual norm of the initial point
        chosen by the MOSEK solver, which equals:
    \[
        \Vert G(z_0) \Vert
        = \Vert A x_0 - b \Vert
            + \Vert A^{\sf T} y_0 + s_0 - c  \Vert
            + | c^{\sf T} x_0 - b^{\sf T} y_0 + 1 |.
    \]
    Making use of our assumption that every component of $A$, $b$, and $c$ is at most one, we find that
    \begin{align*}
        \Vert A x_0 - b \Vert^2 
            &= \sum_j (\sum_i A_{j,i} (1.3)  - b_j)^2 
            % \le \sum_j (1.3n - b_j)^2
            \le m (1.3n+1)^2 \in O(m n^2)
                &\subset O((m+n)^3)\\
        \Vert A^{\sf T}y_0 + s_0 - c \Vert^2 
            &= \sum_i \Big(\sum_j A_{j,i} (y_0)_j + (s_0)_i - c_i\Big)^2
            \le n (m + 2)^2 \in O(n m^2)
                &\subset O((m+n)^3) \\
        | c^{\sf T} x_0 - b^{\sf T} y_0 + 1 | ^2
            &\le (1.3n + m + 1)^2 \in O( (n+m)^2 )
                &\subset O((n+m)^3).
    \end{align*}
    %oli14: rewrote the rest of this proof to make it clearer; I didn't at first
    %  understand why I was saying what I was saying when I re-read it.
    Therefore, the residual norm of the initial point satisfies
        $\Vert G(z_0) \Vert \in O((n+m)^{\nf32})$.
    
    To obtain a solution at most $\epsilon_0$ away from the true
    solution in any coordinate, we need to select $\epsilon$ small enough
    that the final output of the algorithm $z$ satisfies 
    \[
        \epsilon \Vert G(z_0) \Vert  \le \epsilon_0
        \qquad\iff\qquad
        \frac1\epsilon \ge \frac1{\epsilon_0} \Vert G(z_0) \Vert
    \]
    It therefore suffices to choose 
    % $\frac{1}{\epsilon} \in O(\frac1{\epsilon_0} (n+m)^{\nf32})$.
    $\frac{1}{\epsilon} \in O(\frac1{\epsilon_0} (n+m)^{\nf32})$, 
    so that $\log \frac{1}{\epsilon} \in O( \log \frac{n+m}{\epsilon_0} )$, and hence
    $O(n \log \frac{n+m}{\epsilon_0})$ iterations suffice. 
    
    Thus, we arrive at our advertised total asymptotic time complexity of
    \[
        O \Big( n (n+m)^\omega \log \frac{n+m}{\epsilon_0} \Big). 
    \]
    
    In particular, to attain machine precision, we can fix
    $\epsilon_0$ to be the smallest gap between numbers representable
    (say with 64-bit floats, leading to $\epsilon_0 = 10^{-78}$ in the worst case), and omit the dependence on $\epsilon_0$ for the price of a relatively
    small constant (78, for 64-bit floats).
\end{lproof}

\discard{
\begin{prop}
    It can also be solved with the methods of \textcite{skajaa2015homogeneous} in
    time
    \[ O(\sqrt{n} (m+n)^\omega \log n ) 
        \quad\subset\quad O( (m+n)^{2.872} \log n ) 
        % \qquad \text{time.}
        \]
\end{prop}
\begin{lproof}
    
\end{lproof} 
}

Having combed through all of the details of the analysis of  \textcite{badenbroek2021algorithm} and \textcite{nesterov1996infeasible} for exponential
conic programs as we have defined them, we are ready to show that this algorithm solves the problems presented in \cref{sec:clique-tree-expcone} within polynomial time. 

\begin{lemma}\label{lem:cluster-inc-polytime}
Problem \eqref{prob:cluster-inc} can be solved to $\epsilon$ precision in time
\[
    O\Big( \big(|\V\!\Ar| + |\V\C|\big)^{1 + \omega} \Big( \log \frac{|\V\!\Ar| + |\V\C|}{\epsilon}
        % + \log(1+ \Vert\bbeta\Vert_\infty)) \Big)
        + \log \frac{\beta^{\max}}{\beta^{\min}} \Big) \Big)
    \quad \subset \quad
        \tilde O\Big( (\V\!\Ar + \V\C)^4 \Big),
\]
where $\beta^{\max} := \max_{a \in \Ar} \beta_a$ is the largest value of $\beta$,
 and $\beta^{\min}:= \min_{a \in \Ar} \{ \beta_a : \beta_a > 0\}$ is the smallest positive one.
\end{lemma}
\begin{lproof}
    Problem \eqref{prob:cluster-inc}
    % has $\V\!\Ar + $ constraints.
    can be translated via the DCP framework to 
    the following exponential conic program, which has:
    \begin{itemize}[label=$\blacktriangleright$]
    \item variables 
        $x = (\mat u, \mat v, \mat w, \bmu) \in K_{\exp}^{\V\!\Ar} \times \mathbb R_{\ge 0}^{\V \C}$, 
        where 
        \begin{itemize}[label=\textbullet]
        \item $\mathbf{u, v,w} \in \Rext^{\V\!\Ar}$
            are all vectors over $\V\!\Ar$,
            that at index $\iota = (a,s,t) \in \V\!\Ar$, have 
            components $u_\iota$, $v_\iota$, and $w_\iota$, respectively;
            % components 
            % $\mat u = [u_{a,s,t}]_{ast\in\V\!\Ar},
            %  \mat v = [v_{a,s,t}]_{ast\in\V\!\Ar}$, 
            % and $\mat w = [w_{a,s,t}]_{ast\in\V\!\Ar}$ respectively, 
            % respective components
            % $(\mat u; \mat v; \mat w) = []_{(a,s,t)\in\V\!\Ar}$
        \item
            $\bmu = [\mu_{C}(C{=}c)]_%
            % {(C,c) \in \V\C}
            {C \in \C,\,c \in\V(C)}
             \in \Rext^{\V \C}$ is a vector representation of a clique tree over clusters $\C$;
    \end{itemize}

    \item constraints as follows:
        \begin{itemize}[label=\textbullet]
            \item 
            two linear constraints for every $(a,s,t) \in \V\!\Ar$ to ensure that
            \begin{align*}
                v_{a,s,t} &= \mu_{C_{\!a}}\!(s,t)
                    \qquad\Big(~= \sum_{\bar c \in \V(C_a \setminus\{\Src a, \Tgt a\})}
                        \mu_{C_{\!a}}\!(\bar c, s, t) \Big) \\
                \qquad\text{and}\qquad
                w_{a,s,t} &= \mu_{C_{\!a}}\!(\Src a{=}s)\, \p_a(\Tgt a{=}t\mid\Src a{=}s)
                    \qquad\Big(~= \p_a(\Tgt a{=}t\mid\Src a{=}s) \sum_{\bar c \in \V(C_a \setminus\{\Src a\})}
                        \mu_{C_{\!a}}\!(\bar c, s) \Big)
            \end{align*}
            \item for every edge $(C\!{-}\!D) \in \cal T$, and every value $\omega \in \V(C \cap D)$ of the variables that clusters $C$ and $D$ have in common, a linear constraint
            \[
                \sum_{\bar c \in \V(C\setminus D)} \mu_C(\bar c, \omega) 
                    =
                \sum_{\bar d \in \V(D \setminus C)} \mu_D(\bar d, \omega)
            \]
            \item and one constraint for each cluster $C \in \C$ to ensure that $\mu_{C}$ lies on the probability simplex, i.e.,
            \[
                \sum_{c \in \V(C)} \mu_C(c) = 1.
            \]     
        \end{itemize}
    \end{itemize}
    
    Altogether this means that we have an exponential conic program in the form
    of \cref{lem:mainlemma}, with
    % \begin{itemize}
        % \item 
        $n = 3|\V\!\Ar| + |\V\C|$ variables,
        and
        % \item 
        $m = 2 |\V\!\Ar| + |\V \mathcal T| +  |\C|$ constraints,
    where
    $\V\mathcal T = \{ (C\!{-}\!D, \omega) :  C\!{-}\!D \in \mathcal T, \omega \in \V(C\cap D)\}$.
    Since we can simply disregard variables whose value sets are singletons, we can assume $|\V(C)| > 1$; summing over all clusters yields $|\V\C| > |\C|$. 
    At the same time, since $|\V\mathcal T| \le |\V\C|$, 
    we have 
    \[ m,n,(m + n) \in O(|\V\!\Ar| + |\V\C|).  \]
        % \footnote{by analogy to all of the other quantities}

    
    We now give the explicit construction of the data $(A, b,c)$ of the exponential conic program that \eqref{prob:cluster-inc} compiles to.
    The variables are indexed by tuples
    of the form $i = (\ell,a,s,t)$ for $(a,s,t) \in \V\!\Ar$ and $\ell \in \{u,v,w\}$, 
    or by tuples of the form $(C,c)$, for $c \in \V(C)$ and $C \in \C$, 
    while the
    constraints are indexed by tuples of the form
    $j = (\ell,a,s,t)$ for $(a,s,t) \in \V\!\Ar$ and $\ell \in \{v,w\}$, 
    of the form $(C\!{-}\!D, \omega)$, for an edge $(C\!{-}\!D) \in \mathcal T$ and $\omega \in \V(C \cap D)$, 
    or simply by $(C)$, the name of a cluster $C \in \C$. 
    The problem data $A = [A_{j,i}],b = [b_j],c = [c_i]$ of this program are zero, except (possibly) for the
        components:
    \begin{align*}
        c_{(u,a,s,t)} &= \beta_a \\
        % c_{(w,a,s,t)} &=  c_{(v,a,s,t)} = c_{(C,c)} = 0. 
        A_{(v,a,s,t), (C,c)} &= 
        \mathbbm1[C {=} C_{a} ~\land~ \Src a(c) {=} s ~\land~ \Tgt a(c) {=} t] \\
        %  \begin{cases}
        %     1 & \text{if }C = C_a,\,
        %         \Src a(c) = s,~\text{ and } \Tgt a(c) = t\\
        %     0 & \text{ otherwise}
        % \end{cases} \\
        A_{(w,a,s,t), (C,c)} &= 
           %  \begin{cases}
           %     \p_a(\Tgt a{=}t\mid \Src a{=}s)
           %     % \p_a(t|s)
           %      & \text{if }C = C_a\text{ and }
           %         \Src a(c) = s\\
           %     0 & \text{ otherwise}
           % \end{cases}  \\
           \p_a(\Tgt a{=}t\mid \Src a{=}s) 
            \mathbbm1[C {=} C_a ~\land~ \Src a(c) {=} s] \\
        A_{(w,a,s,t), (w,a,s,t)} &= -1 \\
        A_{(v,a,s,t), (v,a,s,t)} &= -1 \\
        % A_{(C'),(C,c)} &= \begin{cases}
        %         1 & \text{ if } C = C' \\ 0 & \text{otherwise}
        %     \end{cases}\\
        % A_{(C{-}D, \omega), (C,c)} &= 1\\
        % A_{(D{-}C, \omega), (C,c)} &= -1\\
        A_{(C\!{-}\!D, \omega),(C',c)} &= \mathbbm1[C{=}C'] - \mathbbm1[C'{=}D]\\
        % A_{(C), (C',c)} &= \mathbbm1[C{=}C'] \\
        A_{(C),(C,c)} &= 1 \\
        b_{(C)} &= 1,
    \end{align*}
    where $\mathbbm1[\varphi]$ is equal to 1 if $\varphi$ is true, and zero if $\varphi$ is false.
    We note that we can equivalently divide each $\beta_a$ by $\max_a \beta_a$ without affecting the problem, 
    although this could affect the approximation accuracy by the same factor. 
    Thus, we get another factor of 
    \[
        \log \max \big( \{1 \} \cup \{ \beta_a : a \in \Ar\} \big) ~\in~ O( \log (1 + \max_a \beta_a ) ).
    \]
    
    % Finally, \cref{fact:strongconcave} reveals that 
    Finally, to find a point that is $\epsilon$-close (say, in 2-norm) to the limiting point $\mu^*$ on the central path, as opposed to simply one for which the suboptimality gap is at most $\epsilon$, we can appeal to strong convexity of the objective function. 
    Now, (conditional) relative entropy is 1-strongly convex, and each relative entropy term is scaled by $\beta_a$.
    Furthermore, we're only considering marginal conditional entropy, so this convexity may not hold in all directions.
    Still, if the next step direction $\delta$ is not far from the gradient, as is the case if the interior point method has nearly converged, then in that direction, the objective will be at least ($\min_a \{ \beta_a : \beta_a > 0\}$)-strongly convex. 
    Therefore, by requiring precision to an additional factor of $\min_a \{ \beta_a : \beta_a > 0\}$, we can guarantee that our point is $\epsilon$-close to $\mu^*$, and not just in complementarity gap. 
    
    % We have arrived at our result:
    To summarize, applying \Cref{lem:mainlemma}, we find that we can solve problem \eqref{prob:cluster-inc} in time
    \begin{align*}
        O\Big( \big(|\V\!\Ar| + |\V\C|\big)^{1 + \omega} \Big( \log \frac{ |\V\!\Ar| + |\V\C| }{\epsilon} + \log \frac{\beta^{\max}}{\beta^{\min}} \Big) \Big)
        \quad
        \subset \tilde O\Big( \big(|\V\!\Ar| + |\V\C| \big)^4 \Big).
    \end{align*}
\end{lproof}

We now quickly step through the analogous construction for problems \eqref{prob:cluster-small-gamma} and \eqref{prob:cluster+idef}, which
solve the $\zogamma$-inference problem, and $0^+$-inference, respectively.

\begin{lemma}\label{lem:smallgamma-polytime}
    Problem \eqref{prob:cluster-small-gamma} is solved to precision $\epsilon$ in time
    \[
        O\bigg( |\V\!\Ar + \V\C|^{1 + \omega} \Big( \log \frac{ |\V\!\Ar| + |\V\C|}{\epsilon} + \log(1+ \Vert\bbeta\Vert_\infty) + \log \log \frac{1}{p} \Big) \bigg)
        \quad
        \subset 
        \quad
        \tilde O\Big( (\V\!\Ar + \V\C)^4 \Big)
    \]
    where $p$ is the smallest nonzero probability in the PDG.
\end{lemma}
\begin{lproof}
    Problem \eqref{prob:cluster-small-gamma} has 
    \begin{itemize}[label=$\blacktriangleright$]
    \item variables 
        $x = (\mat u, \mat y, \mat w,\,\, \mat v, \bmu, \mat z)
        % where 
        \in K_{\exp}^{\V\!\Ar} \times K_{\exp}^{\V \C}$
        % $x = (\mat u, \mat v, \mat w, \bmu) \in K_{\exp}^{\V\!\Ar} \times \mathbb R_{\ge 0}^{\V \C}$, 
        where 
        \begin{itemize}[label=\textbullet]
        \item 
        $\mathbf{u,y,w} \in \Rext^{\V\!\Ar}$
            are all vectors over $\V\!\Ar$
            that at index $\iota = (a,s,t) \in \V\!\Ar$, have 
            components $u_\iota$, $y_\iota$, and $w_\iota$, respectively;
        \item 
        Meanwhile, 
        $\mathbf{v,\bmu,z} \in \Rext^{\V\C}$
            are all vectors over $\V\C$ 
            which at index $(C,c)$, have 
            components $v_{C,c}$, $\mu_C(c)$, and $z_{C,c}$, respectively.
            Once again, $\bmu = [\mu_{C}(C{=}c)]_%
            % {(C,c) \in \V\C}
            {C \in \C,\,c \in\V(C)}
             \in \Rext^{\V \C}$ is intended to be a vector representation of a clique tree.
    \end{itemize}

    \item constraints as follows:
        \begin{itemize}[label=\textbullet]
            \item 
            two linear constraints for each $(a,s,t) \in \V\!\Ar$, to ensure that
            \[
                y_{a,s,t} = \mu_{C_{\!a}}(s,t)
                \qquad\text{and}\qquad
                w_{a,s,t} = \mu_{C_{\!a}}\!(\Src a{=}s)\, \p_a(\Tgt a{=}t\mid\Src a{=}s),
            \]
            \item for every edge $(C\!{-}\!D) \in \cal T$, and every value $\omega \in \V(C \cap D)$ of the variables that clusters $C$ and $D$ have in common, a linear constraint
            \[
                \sum_{\bar c \in \V(C\setminus D)} \mu_C(\bar c, \omega) 
                    =
                \sum_{\bar d \in \V(D \setminus C)} \mu_D(\bar d, \omega)
            \]
            \item for every $(a,s,t) \in \V\!\Ar^0$, a linear constraint
            that ensures
            \[
                0 = \mu_{C_a}\!(\Src a{=}s,\Tgt a{=}t) 
                \qquad\Big(~~
                   = \sum_{\bar c \in \V(C_a \setminus \{\Src a, \Tgt a\})} \mu_{C_a}(\bar c, s, t) \Big)
            \]
            
            \discard{\TODO[POSSIBLE ISSUE: doesn't this mean the problem isn't strictly feasible?]}

             
            \item a linear constraint for every value $c \in \V(C)$ of every cluster $C \in \C$, to ensure that
            \[
                z_{C,c} = \mu_C( \Pash_C(c) )
                    \qquad \Big(~~= \sum_{\bar c \in \V(C \setminus \Pash_C)}
                        \mu_{C}(\bar c, \Pash_C(c)) ~~\Big)
            \]
            \item and one constraint for each cluster $C \in \C$ to ensure that $\mu_{C}$ lies on the probability simplex, i.e.,
            \[
                \sum_{c \in \V(C)} \mu_C(c) = 1.
            \]
        \end{itemize}
    \end{itemize}
    So in total, there are 
    $n = 3 |\V \!\Ar| + 3 |\V \C|$ variables,
    and 
    $m = 2 |\V\!\Ar| + |\V \mathcal T| + |\V\!\Ar^0| + |\V \C| + |\C|$ constraints. 
    The same arguments made in \cref{lem:cluster-inc-polytime} show that both $n,m \in O(|\V \!\Ar + \V \C|)$.
    
    Also like before, it is easy to see that the components of $A$ and $b$ are all at most 1.  However, we will need to rescale the objective $c$ in order for each of its components to be at most 1. We can do this by dividing it by
    $\max \{ - \beta_a \log {p_a(t|s)} \}_{(a,s,t) \in \V\!\Ar} \cup \{ 1 \}$.
    
    Finally, to ensure that we have a solution that is $\epsilon$-close to the end of the central path, as opposed to one that is merely $\epsilon$-close in complementarity gap, we must appeal to convexity. 
    As in the proof of \cref{lem:cluster-inc-polytime}, this amounts to reducing the target accuracy by a factor of the smallest possible coefficient of strong convexity, along the next step direction. 
    In this case, the bound is simpler: because negative entropy is (unconditionally) 1-strongly convex, and since $\bbeta \ge \balpha \gamma$, the remaining terms are convex, this could be, at worst, $\frac1\gamma$. 
    % In this case, the worst possible value is $\min_a (\beta_a - \alpha_a)$
    
    This gives rise to our result: problem \eqref{prob:cluster-small-gamma} can be solved in 
    \begin{align*}
        O\left( |\V \!\Ar + \V \C|^{1+\omega} 
            \left(\log \frac{|\V \!\Ar + \V \C|}{\epsilon} + \log \frac1\gamma \left(1 + \max_{(a,s,t) \in \V\!\Ar} \beta_a \log \frac{1}{\p_a(t|s)} \right) \right)  \right) \\
        \subset 
        O\left( |\V \!\Ar + \V \C|^{1+ \omega} 
        \left\{
            \log \frac{|\V \!\Ar + \V \C|}{\epsilon}+
            \log  \frac{\beta^{\max}}{\gamma} +
            \log\log \frac{1}{p} 
        \right\}
        \right)
    \end{align*}
    operations, where $p$ is the smallest nonzero probability in the PDG, and $\beta^{\max}$ is the largest confidence in the PDG larger than 1.
\end{lproof}

\begin{lemma}\label{lem:cluster+idef-polytime}
    Problem \eqref{prob:cluster+idef} is solved to precision $\epsilon$ in 
    \[
        O\Big( |\V \C| |\V \!\Ar + \V \C|^{\omega} 
            \log \frac{|\V \!\Ar + \V \C|}{\epsilon} \Big) 
        \quad\subset\quad
        \tilde O\Big( |\V \C + \V \! \Ar|^{4} \Big) \text{~~time}.
    \]
\end{lemma}
\begin{lproof}
    Problem \eqref{prob:cluster+idef} is slightly more straightforward; having done
    \cref{lem:cluster-inc-polytime,lem:smallgamma-polytime} in depth, we do this one more quickly.
    In the standard form, problem \eqref{prob:cluster+idef} has variables 
    $x = (\mat u, \bmu, \mat w)
        \in K_{\exp}^{\V\C}$.
    The constraints  are:
     
    \begin{itemize}[label=\textbullet]
        \item 
        one linear constraint for each $(C,c) \in \V\C$, to ensure that
        \[
            w_{C,c} = k_{(C,c)} \mu_C( \Pash_C(c) )
            \qquad\Big(~~= \sum_{\bar c \in \V(C\setminus \Pash_C )} \mu_C(\bar c, \Pash_C(c))
                \Big)
        \]
        \item for every edge $(C\!{-}\!D) \in \cal T$, and every value $\omega \in \V(C \cap D)$ of the variables that clusters $C$ and $D$ have in common, a linear constraint
        \[
            \sum_{\bar c \in \V(C\setminus D)} \mu_C(\bar c, \omega) 
                =
            \sum_{\bar d \in \V(D \setminus C)} \mu_D(\bar d, \omega)
        \]
        \item for every $(a,s,t) \in \V\!\Ar$, a linear constraint
        that ensures
        \[
            \mu_{C_a}(\Src a{=}s, \Tgt a{=}t) \, \nu_{C_a}(\Src a{=}s)
                =
            \nu_{C_a}(\Src a{=}s, \Tgt a{=}t) \, \mu_{C_a}(\Src a{=}s).
        \]
        This is linear, because recall that $\nu$ is a constant in this optimization
        problem, found by having previously solved \eqref{prob:cluster-inc}. 
        
        \item and one constraint for each cluster $C \in \C$ to ensure that $\mu_{C}$ lies on the probability simplex.
    \end{itemize}
    So in total, there are 
    $n = 3 |\V \C|$ variables,
    and 
    $m =  |\V \C| + |\V \mathcal T| + |\V\!\Ar| + |\C|$ constraints. 
     % O(|\V \!\Ar + \V \C|)$.
    Once again the components of $A$ and $b$ are all at most one, and now the components of the cost function $c = \mat 1$ are identically one. 
    Furthermore, our objective is 1-strongly convex, so no additional multiplicative terms are required to convert an $\epsilon$-close solution in the sense of suboptimality, to an $\epsilon$-close solution in the sense of proximity to the true solution.

    Therefore \eqref{prob:cluster+idef} can be solved in 
    \begin{align*}
        O\Big( |\V \C| |\V \!\Ar + \V \C|^{\omega} 
            \log \frac{|\V \!\Ar + \V \C|}{\epsilon} \Big) 
        \quad\subset\quad
        \tilde O( |\V \C + \V \! \Ar|^{4} )
    \end{align*}
    operations.
\end{lproof}

\textbf{\cref{theorem:main}.} \textit{
Let $\dg M = (\X, \Ar, \mathbb P, \balpha, \bbeta)$
be a proper discrete PDG with $N = |\X|$ variables each taking at most $V$ values, and $A = |\Ar|$ arcs forming a hypergraph of treewidth $T$.
Then for all $\gamma \in \{0^+\}\cup (0,\,  \min_{a \in \Ar} \frac{ \beta_a}{\alpha_a}]$
and $\epsilon > 0$, 
we can do $\zogamma$-inference to precision $\epsilon$ in 
\[ 
    O\pqty[\bigg]{  (N\!+\!A)^4 V^{4T}
        \pqty[\Big]{ T \log V + \log \frac{N\!+\! A}{\epsilon} 
    + \log
    \frac{\beta^{\max}}{\beta^{\min}}
        }\! }%
\]
time,
\unskip\daggerfootnote{At the cost of substantial overhead and engineering effort, the exponent $4$ can be reduced to 2.872, by appeal to \textcite{skajaa2015homogeneous} and the current best matrix multiplication algorithm \parencite[$O(n^{2.372})$]{duan2022faster}
to invert 
$n{\times} n$
linear systems. }
where $\quad \beta^{\max} := \max_{{a \in \Ar}} \beta_a\quad$ and
\[
\beta^{\min} := 
\begin{cases}
    ~\displaystyle\min_{a \in \Ar} \{ \beta_a : \beta_a > 0\}& \text{if}~ \gamma = 0^+\\[-0.5ex]
    ~\hfill\gamma~~ & \text{if}~ \gamma > 0~~\text{.}
\end{cases}
\]
}
\begin{lproof}\label{proof:main}
    Suppose the PDG has $N$ variables 
    (each of which can take at most $V$ distinct values), 
    and $A$ hyperarcs, which together form a structure that has tree-width $T$. 
    
    Then each cluster (of which there are at most $N$) 
    can have at most $T$ variables, and so can take at most $V^T$ values.
    Therefore, $|\V \C| \le N V^T$.
    % Furthermore, because each 
    Since each arc must be entirely contained within some cluster, 
    $|\V\!\Ar| \le A V^T$. 
    So, $|\V\!\Ar + \V \C| \le (N+A) V^T$. 
    
    Applying \cref{lem:cluster-inc-polytime,lem:smallgamma-polytime,lem:cluster+idef-polytime}, we conclude that, for $\gamma \in (0, \min_a \frac{\beta_a}{\alpha_a}]$,  $\zogamma$-inference can be done in time
    \[
        O\Big(  (N+A)^4 V^{4T} \Big( \log \Big(\frac{N+A}{\epsilon} V^{4T} \frac{\beta^{\max}}{\gamma}\Big) + \log \log \frac{1}{p} \Big)  \Big)
        % \subset
        % =
        % O\Big(  (N+A)^4 \exp\log(V^{4T}) \Big) 
        % = O\Big(  (N+A)^4 \exp(4T\log V) \Big) 
    \]
    while $0^+$ inference can be done in time
    \[
        O\Big(  (N+A)^4 V^{4T} \Big( \log \Big(\frac{N+A}{\epsilon} V^{4T} \Big) 
            + \log \frac{\beta^{\max}}{\beta^{\min}} \Big) \Big).
        % \subset
        % =
        % O\Big(  (N+A)^4 \exp\log(V^{4T}) \Big) 
        % = O\Big(  (N+A)^4 \exp(4T\log V) \Big) 
    \]
    Finally, we suppress the factor of $\log \log \frac1p$ in the first case.
    We argue this is justified because it is small even for the smallest number representable with 64 bits, and because this notion of precision is dwarfed by the one captured by $\epsilon$. 
    There is also no problem when $p=0$, because then it simply becomes a constraint
    and is handled gracefully. 
\end{lproof}

\subsection{Hardness Results and Reductions}
% We now turn to he hardness results of \cref{theorem:inf-via-inc-oracle}
We now prove \cref{theorem:consistent-NP-hard} parts (a) and (b)
directly by reduction from 3-SAT.  They also follow from part (c) 
 combined with the hardness of inference in BNs, which PDGs generalize,
 but the direct proof is nicer. 

\textbf{\cref{theorem:consistent-NP-hard}.}\textit{
\begin{enumerate}[nosep,label={\rm{(\alph*)}}]
\item Determining whether or not there is a distribution %
    that shares all cpds with $\dg M$ is NP-hard.
\item Computing $\aar{\dg M}_\gamma$ is \#P-hard, for all $\gamma \ge 0$.
\item For $\gamma \in \{0^+\}\cup(0,\min_a \frac{\beta_a}{\alpha_a})$,
    there is an $O($size of query result$)$ reduction
    from $\zogamma$-inference to the problem of calculating
    $\gamma$-inconsistency. 
    Under bounded tree-width, there
    is also an $O(|\V \C|)$ reduction in the other direction,
    making the problems essentially equivalent.
\end{enumerate}}
\begin{lproof} \label{proof:consistent-NP-hard}
    \textbf{(a).}
	We can directly encode SAT problems in PDGs.
	Specifically, let
	$$\varphi := \bigwedge_{j \in \mathcal J} \bigvee_{i \in \mathcal I(j)} (X_{j,i})$$
	be a CNF formula over binary variables $\mat X := \bigcup_{j,i} X_{j,i}$. Let
	$\dg M_\varphi$ be the PDG containing every variable $X \in \mat X$ and a binary
	variable $C_j$ (taking the value 0 or 1) for each clause $j \in \mathcal J$, as well as the following edges, for each $j \in \mathcal J$:
	%\{$``$\varphi(\mat X)$''$\}$ with $\V(\varphi) = \{0,1\}$, and
	\begin{itemize}
		\item a hyperedge $\{X_{j,i} : i \in \mathcal I(j)\} \tto C_j$, together with a degenerate cpd
			encoding the boolean OR function (i.e., the truth of $C_j$ given $\{X_{j,i}\}$);
		\item an edge $\pdgunit \tto C_j$, together with a cpd asserting $C_j$ be equal to 1.
	\end{itemize}
	% We give each edge $\alpha = 0$ and $\beta = 1$.
	First, note that the number of nodes, edges, and non-zero entries in the cpds are polynomial in $|\mathcal J|$ and $|\mat X|$, and the total number of parameters in a simple matrix representation of the cpds is also polynomial if $\mathcal I$ is bounded (e.g., if $\varphi$ is a 3-CNF formula).
	A satisfying assignment $\mat x \models \varphi$ of the variables $\mat X$ can be regarded as a degenerate joint distribution $\delta_{\mat X = \mat x}$ on $\mat X$, and extends uniquely to a full joint distribution $\mu_{\mat x} \in \Delta \V(\dg M_\varphi)$ consistent with all of the edges, by
	\[ \mu_{\mat x} = \delta_{\mat x} \otimes \delta_{\{C_j = \vee_i  x_{j,i}\}} \]

 	Conversely, if $\mu$ is a joint distribution consistent with the edges above, then any point $\mat x$ in the support of $\mu(\mat X)$ must be a satisfying assignment, since the two classes of edges respectively ensure that $1 =\mu(C_j\!=\! 1 \mid \mat X \!=\! \mat x) = \bigvee_{i \in \mathcal I(j)} \mat x_{j,i}$ for all $j \in \mathcal J$, and so $\mat x \models \varphi$.

	Thus, $\SD{\dg M_\varphi} \ne \emptyset$ if and only if $\varphi$ is satisfiable, so
	an algorithm for determining if a PDG is consistent can also be adapted (in polynomial space and time) for use as a SAT solver, and so the problem of determining if a PDG is consistent is NP-hard.

% \end{lproof}
% \recall{prop:sharp-p-hard}
% \begin{lproof}\label{proof:sharp-p-hard}

    \medskip\hrule\smallskip

	\textbf{(b).}
    We prove this by reduction from \#SAT. Again, let $\varphi$ be some CNF formula over $\mat X$, and construct
	$\dg M_\varphi$ as in \hyperref[proof:consistent-NP-hard]{the proof} of
	\Cref{theorem:consistent-NP-hard}.
	Furthermore, let $\bbr{\varphi} := \{ \mat x : \mat x \models \varphi \}$ be the set of assignments to $\mat X$ satisfying $\varphi$, and $\#_\varphi := |\bbr{\varphi}|$ denote the number of such assignments. We now claim that
	\begin{equation}\label{eqn:number-of-solns}
		\#_\varphi = \exp \left[- \frac1\gamma \aar{ \dg M_\varphi }_\gamma \right].
	\end{equation}
 	If true, we would have reduced the \#P-hard problem of computing $\#_\varphi$ to the problem of computing $\aar{\dg M}_\gamma$ for fixed $\gamma$. We now proceed with the proof of \eqref{eqn:number-of-solns}.
	By definition, we have
	\[ \aar{\dg M_\varphi}_\gamma = \inf_\mu \Big[ \OInc_{\dg M_\varphi}(\mu) + \gamma \SInc_{\dg M_\varphi}(\mu) \Big]. \]
	We start with a claim about the first term.
	% For the particular PDG $\dg M_\varphi$, the

	\begin{iclaim} \label{claim:separate-inc-varphi}
		% $\OInc(\dg M_\varphi)$ is finite if and only if $\varphi$ is statisfiable.
		$\OInc_{\dg M_\varphi}\!(\mu) =
		% \begin{cases}
		% 	0 & \text{if}~  \mat x \models \varphi~\text{and}~\mat c = \mat 1
		% 	 	~\text{for all}~(\mat x, \mat c) \in \supp \mu\\
		% 	\infty & \text{otherwise}
		% \end{cases}
		\begin{cases}
			0 & \text{if}~  \supp \mu \subseteq \bbr{\varphi} \times \{ \mat 1\} \\
			\infty & \text{otherwise}
		\end{cases}$.
	\end{iclaim}
	\vspace{-1em}
	\begin{lproof}
		Writing out the definition explicitly, the first term can be written as
		\begin{equation}
			\OInc_{\dg M_\varphi}\!(\mu) = \sum_{j} \left[ \kldiv[\Big]{\mu(C_j)}{\delta_1} +
				\Ex_{\mat x \sim \mu(\mat X_j)} \kldiv[\Big]{\mu(C_j \mid \mat X_j = \mat x)}{\delta_{\lor_i \mat x_{j,i}}} \right], \label{eqn:explicit-INC-Mvarphi}
				% &= \sum_{j} \left[
				% 	\begin{matrix} \mu(C_j\!=\!0) (\infty) \\
				% 	 	+ \mu(C_j \!=\! 1) \log \mu(C_j \!=\! 1)
				% 	\end{matrix} +
				% 	\Ex_{\mat x \sim \mu(\mat X_j)} \kldiv[\Big]{\mu(C_j \mid \mat X_j = \mat x)}{\delta_{\lor_i \mat x_i}} \right],
		\end{equation}
		where $\mat X_j = \{X_{j,i} : i \in \mathcal I(j)\}$ is the set of variables that
		appear in clause $j$, and $\delta_{(-)}$ is the probability distribution placing all mass on the point indicated by its subscript.
		As a reminder, the relative entropy is given by
		\[ \kldiv[\Big]{\mu(\Omega)}{\nu(\Omega)} := \Ex_{\omega \sim \mu} \log \frac{\mu(\omega)}{\nu(\omega)},
		\quad\parbox{1.4in}{\centering and in particular, \\ if $\Omega$ is binary,}\quad
			\kldiv[\big]{\mu(\Omega)}{\delta_\omega} = \begin{cases}
				0 &  \text{if}~\mu(\omega) = 1 ; \\
				\infty & \text{otherwise}.
		\end{cases} \]
		Applying this to \eqref{eqn:explicit-INC-Mvarphi}, we find that either:
		\begin{enumerate}[itemsep=0pt]
			\item Every term of \eqref{eqn:explicit-INC-Mvarphi} is finite (and zero) so $\OInc_{\dg M_\varphi}(\mu) = 0$, which happens when $\mu(C_j = 1) = 1$ and $\mu(C_j = \vee_i~ x_{j,i}) = 1$ for all $j$.  In this case, $\mat c = \mat 1 = \{ \vee_i~x_{j,i} \}_j$ so $\mat x \models \varphi$ for every $(\mat{c,x}) \in \supp \mu$;
			\item Some term of \eqref{eqn:explicit-INC-Mvarphi} is infinite, so that $\OInc_{\dg M_\varphi}(\mu) = \infty$, which happens if, for some $j$, either

			\begin{enumerate}
				\item $\mu(C_j \ne 1) > 0$ --- in which case there is some $(\mat{x,c}) \in \supp \mu$ with $\mat c \ne 1$, or
				\item $\supp \mu(\mat C) = \{\mat 1\}$, but $\mu(C_j \ne \vee_i~ x_{j,i}) > 0$ --- in which case there is some $(\mat{x,1}) \in \supp \mu$ for which $1 = c_j \ne \vee_i~x_{j,i}\;$, and so $\mat x \not\models \varphi$.
			\end{enumerate}
		\end{enumerate}
		Condensing and rearranging slightly, we have shown that
		\[
			\OInc_{\dg M_\varphi}(\mu) =
			\begin{cases}
				0 & \text{if}~  \mat x \models \varphi~\text{and}~\mat c = \mat 1
				 	~\text{for all}~(\mat x, \mat c) \in \supp \mu\\
				\infty & \text{otherwise}
			\end{cases}~.
		\]
		% So if $\mat x \models \varphi$ for all $\mat x \in \supp \mu(X)$,
		%
		% $\OInc_{\dg M_\varphi}(\mu) = 0$
		% The first term is infinite if $\mu(C_j = 1) < 1$, and the second is infinite
		% if $\mu(C_j = \lor_i X_{i,j}) < 1$. Thus, if $\OInc_{\dg M_\varphi}(\mu)$ is finite, then $\mat x \sim \mu(\mat X)$ satisfies $\varphi$ with probability 1, and $\varphi$ must be satisfiable.
		% Conversely,
	\end{lproof}

	% Thus, if $\OInc_{\dg M_\varphi}(\mu)$ is finite, then every $\mat x \in \supp \mu$ is a satisfying assignment of $\varphi$.
	Because $\SInc_{}$ is bounded, it follows immediately that
 	$\aar{\dg M_\varphi}_\gamma$ is finite if and only if
	there is some distribution $\mu \in \Delta\V(\mat X,\mat C)$ for which $\OInc_{\dg M_\varphi}(\mu)$ is finite, or equivalently, by \Cref{claim:separate-inc-varphi}, iff there exists some $\mu(\mat X) \in \Delta \V(\mat X)$ for which $\supp \mu(\mat X) \subseteq \bbr{\varphi}$, which in turn is true if and only if $\varphi$ is satisfiable.

	In particular, if $\varphi$ is not satisfiable (i.e., $\#_\varphi = 0$), then $\aar{\dg M_\varphi}_\gamma = +\infty$, and
	\[
		\exp \left[ -\frac1\gamma \aar{\dg M_\varphi}_\gamma \right] =
	 		\exp [ - \infty ] = 0 = \#_\varphi,
	\]
	so in this case \eqref{eqn:number-of-solns} holds as promised. On the other hand, if $\varphi$ \emph{is} satisfiable, then, again by \Cref{claim:separate-inc-varphi}, every $\mu$ minimizing $\bbr{\dg M_\varphi}_\gamma$ (i.e., every $\mu \in \bbr{\dg M_\varphi}_\gamma^*$) must be supported entirely on $\bbr{\varphi}$ and have $\OInc_{\dg M_\varphi}\!(\mu) = 0$.  As a result, we have
	\[
		\aar{\dg M_\varphi}_\gamma =
			\inf\nolimits_{\mu \in \Delta \big[\bbr{\varphi} \times \{\mat 1\}\big]} \gamma\; \SInc_{\dg M_\varphi}(\mu) .
	\]
	A priori, by the definition of $\SInc_{\dg M_\varphi}$, we have
	\[
		\SInc_{\dg M_\varphi}(\mu) =
		 	- \H(\mu) + \sum_{j} \Big[ \alpha_{j,1} \H_\mu(C_j \mid \mat X_j)
						+ \alpha_{j,0} \H_\mu(C_j) \Big],
	\]
	where $\alpha_{j,0}$ and $\alpha_{j,1}$ are values of $\alpha$ for the edges of $\dg M_\varphi$, which we have not specified because they are rendered irrelevant by the fact that their corresponding cpds are deterministic. We now show how this plays out in the present case.
	Any $\mu \in \Delta\big[\bbr{\varphi} \times \{\mat 1\}\big]$ we consider has a degenerate marginal on $\mat C$. Specifically, for every $j$, we have $\mu(C_j) = \delta_1$, and since entropy is non-negative and never increased by conditioning,
	$$
		0 \le \H_\mu(C_j \mid \mat X_j) \le \H_\mu(C_j) = 0.
	$$
	Therefore, $\SInc_{\dg M_\varphi}(\mu)$ reduces to the negative entropy of $\mu$.
	Finally, making use of the fact that the maximum entropy distribution $\mu^*$ supported on a finite set $S$ is the uniform distribution on $S$, and has $\H(\mu^*) = \log | S |$, we have
	\begin{align*}
		\aar{\dg M_\varphi}_\gamma &= \inf\nolimits_{\mu \in \Delta \big(\bbr{\varphi} \times \{\mat 1\}\big)} \gamma\; \SInc_{\dg M_\varphi}(\mu) \\
			&= \inf\nolimits_{\mu \in \Delta \big(\bbr{\varphi} \times \{\mat 1\}\big)} -\, \gamma\, \H(\mu) \\
			&= - \gamma\, \sup\nolimits_{\mu \in \Delta \big(\bbr{\varphi} \times \{\mat 1\}\big)}  \H(\mu) \\
			&= - \gamma\, \log (\#_\varphi),
	\end{align*}
	\hspace{1in}giving us
	$$
		\#_\varphi = \exp \left[- \frac1\gamma \aar{ \dg M_\varphi }_\gamma \right],
	$$
	as desired. We have now reduced \#SAT to computing $\aar{\dg M}_\gamma$, for $\gamma > 0$ and an arbitrary PDG $\dg M$, which is therefore \#P-hard.

    To show the same for $\gamma = 0$, it suffices to add an additional hyperedge pointing to all variables, and associate it with a joint uniform distribution, and confidence 1, resulting in a new PDG $\dg M_\varphi'$.
    Then, because this new edge's contribution to $\OInc_{\dg M}$
    equals $\kldiv{\mu}{\mathsf{Unif}(\X)} = \log |\V\!\X| - \H(\mu)$,
    we have
    \[
        \bbr{\dg M_\varphi'}_0(\mu)
            = \OInc_{\dg M_\varphi'}(\mu)
            = \bbr{\dg M_\varphi}(\mu) + \log | \V\!\X | - \H(\mu)
            = \bbr{\dg M_\varphi}_{1}(\mu) + \log |\V\!\X |.
    \]
    Since this is true for all $\mu$, we conclude that
    \[
        \aar{\dg M_\varphi'} = \aar{\dg M_\varphi}_1 + \log |\V\!\X| = \log \frac{|\V\!\X|}{\#_\varphi}
    \]
    so the two differ by a constant, and both compute the number of satisfying assignments to $\varphi$. So in general, computing $\aar{\dg M}$ is \#P-hard as well.
\end{lproof}
We prove part (c) in the next section.

%oli13: moving to proofs
\subsection{Inference via Inconsistency Minimization}
    \label{sec:inf-via-inc}

We now address part (c) of \cref{theorem:inf-via-inc-oracle},
which is closely related to \textcite{pdg-aaai}'s original idea for an inference
algorithm.  While that idea does not yield an efficient inference algorithm,
it does yield a very efficient reduction from inference to inconsistency minimization.
% The approach is in the sprit of the with the general approach 
% towards probabilistic modeling in general proposed by
% \textcite{one-true-loss}. 
% Before we begin, we will need some lemmas.
%
In order to prove this, we first need another construction with PDGs.
%
A cpd between discrete variables can be represented by
    a stochastic matrix (i.e., a matrix whose rows sum to one).
It turns out that it is possible to use the machinery of PDGs
    to, effectively, give only one value of that matrix.
That is, for any $p \in [0,1]$, we can construct a PDG
    that represents the belief that $\Pr(Y{=}y|X{=}x) = p$, but says nothing about
    how the probability splits between other values of $y$, and also says nothing
    about the probability of $Y$ if $X \ne x$.
We now describe that construction.

\def\XxYy{{X{=}x\Vert Y{=}y}}
\def\XxYyshort{{X_x Y_y}}
\def\Yy{{Y{=}y}}
\def\Yyshort{{Y_y}}

First, we introduce two new auxiliary variables.
The first variable, which we might like to call ``$\Yy$'', but
    mostly refer to as $\Yyshort$ to prevent confusion with the synonymous
    event, is a binary variable, with $\V(\Yyshort) = \{y, \lnot y\}$, 
    and takes the value $y$ if $Y=y$, and $\lnot y$ if $Y \ne y$.
The second variable, which we would like to call ``$\XxYy$'',
    but instead mostly refer to as $\XxYyshort$ to prevent notational confusion, 
    can take three values: $\V(\XxYyshort):= \{ x, y, \lnot y \}$.
The value $x$ is meant to correspond exactly to the event $X{=}x$,
    much like before, so that $\XxYyshort = x$ if and only if $X = x$.  
The values $y$ and $\lnot y$ also correspond to their respective
    events, but more loosely; the variable $\XxYyshort$ only takes one of these
    values when $X \ne x$.
Note that both variables can be determined from $X$ and $Y$
(although we will need to enforce this with additional arcs), and
therefore there is a unique way to extend a
distribution over $X$ and $Y$ to also include the variables $\Yyshort$ and $\XxYyshort$. 
 
% Now, supposing that $\XxY$ takes its special value $X{=}x$ if and only if $X=x$, and otherwise takes the value of $Y$, then there's a clear choice of distribution on $B$ given $X$: if $X$ takes its special value, we $\Pr(B) = p$. Otherwise, $\Pr(B)$ is a point mass on the value $b$. And this value of $X$ an be calculated from joint values of $B$ and $A$.
With these definitions in place, there is
    now an obvious way to add an arc from the variable $(\XxYyshort)$
    to the variable $\Yyshort$, together with a cpd asserting that $\Pr(Y{=}y|X{=}x)=p$. 
This cpd is written as a stochastic matrix $\hat p$ on the right of the figure below.
The PDG we have just constructed is illustrated on the left of the figure below.
In addition to $\hat p$ and the new variables, this PDG
    includes the structural constraints $s_1$ and $s_2$ needed to define the variables
    $\XxYyshort$ and $\Yyshort$ in terms of $X$ and $Y$; they are deterministic functions,
    drawn in double-headed gray arrows.
% are illustrated below.

\begin{center}
% $\dg M^{y|x=p}:=$
    \begin{tikzpicture}[center base]
        \node[dpadded] (X) at (0,-0.5){$X$};
        \node[dpadded, right=1.6 of X](Y){$Y$};

        \node[tpt={astar|$x$}] at (0.2,1.5) {};
        \node[tpt={b1|$y$},right=0.45 of astar]{};
        \node[tpt={b2|$\lnot y$},right=0.3 of b1]{};
        % \node[right=0.2 of b2] (bdots){$\cdots$};
        % \node[tpt={bn|$y_n$},right=0.2 of bdots]{};

        \node[Dom={$\XxYy$[label distance=-1.5em, xshift=3.3em] (XxYy)
            around {\lab{astar}\lab{b1}\lab{b2}(1,1.65)}} ] {};
            
        
        \node[tpt={y1|$y$}] at (3.6, 1.5){};
        \node[tpt={y2|$\lnot y$},right=0.5 of y1]{};
        \node[Dom={$Y{=}y\,$[label distance=-1.4em, xshift=1.1em] (Yy)
            around {\lab{y1}\lab{y2}(3.6,1.7)}} ] {};

        %% structural arrows
        \mergearr[black!35!proofmatt,arr2,->>]{X}{Y}{XxYy}
        \node[black!35!proofmatt,below=2pt of center-XYXxYy]{$s_1$};
        
        %%
        \draw[black!35!proofmatt, arr2, ->>] (Y) to node[below right]{$s_2$} (Yy);

        \draw[arr2] (XxYy) to node[above]{$\hat p$} (Yy);
    \end{tikzpicture}
    \hspace{1cm}
    \begin{minipage}{0.3\textwidth}
        \begin{align*}
            s_1(\XxYyshort|X,Y) &:= \begin{cases}
                x & \text{if $X = x$} \\
                y & \text{if $X \ne x$ and $Y=y$} \\
                \lnot y & \text{if $X \ne x$ and $Y\ne y$} \\
            \end{cases}\\
            s_2(\Yyshort|Y) &:= \begin{cases}
                y & \text{if $Y=y$} \\
                \lnot y & \text{if $Y\ne y$} \\
            \end{cases}\\[1ex]
            % p \mathrm{~or~} b~(X) &:= \begin{cases}
            % 	p(B) & \text{if $X = a^*$} \\
            % 	\delta_{B=b} & \text{if $X = b$}
            % \end{cases}
            \hat p(\Yyshort|\XxYyshort) &= \begin{matrix}
                &  \begin{matrix} y & \lnot y \end{matrix} \\
                \begin{matrix} x \!\! \\ y \!\! \\ \lnot y \!\! \end{matrix} & 
                    \begin{bmatrix}
                        \;p & 1-p \; \\
                        \;1 & 0  \;\\
                        \;0 & 1 \;
                    \end{bmatrix}
            \end{matrix}
        \end{align*}

    \end{minipage}
\end{center}
    \medskip
    
So, when we add $\Pr(Y=y|X=x) = p$ to a PDG, we really first implicitly
convert this information to a PDG as above.  The first order of business
is to prove that this works as we should expect, semantically, in the case
we're interested in.

\begin{lemma}
    % For all PDGs $\dg M$ with $\bbeta \ge \mat 0$ and all $\gamma > 0$, we have that
    Suppose $\dg M$ is a PDG with variables $\X$ and $\bbeta \ge \mat 0$. 
    Then, for all $X,Y \subseteq \X$, $x \in \V X$, $y \in \V Y$, $p \in [0,1]$ and $\gamma \ge 0$, 
    we have that:
    \[
        \aar[\Big]{\dg M + ~\Pr(Y{=}y|X{=}x) = p}_\gamma \ge \aar{\dg M}_\gamma,
    \]
    with equality if and only if there exists $\mu \in \bbr{\dg M}^*_\gamma$
    such that $\mu(Y{=}y|X{=}x) = p$. 
\end{lemma}
\begin{lproof}
    The inequality is immediate; it is an instance of monotonicity of inconsistency
    \cite[Lemma 1]{one-true-loss}. Intuitively: believing more cannot make you any less
    inconsistent.  We now prove that equality holds iff there is a minimizer with the appropriate conditional probability.
    
    $(\impliedby)$. Suppose there is some $\mu \in \bbr{\dg M}^*_\gamma$ with $\mu(Y{=}y|X{=}x) = p$. 
    Because $\mu \in \bbr{\dg M}^*_\gamma$, we know that
    $\bbr{\dg M}_\gamma(\mu) = \aar{\dg M}_\gamma$. 
    Let $\hat \mu$ be the extension of $\mu$ to the new variables ``$\XxYy$'' and ``$\Yy$'',
        whose values are functions of $X$ and $Y$ according to $s_1$ and $s_2$. Then,
    \begin{align*}
        \aar[\Big]{\dg M + ~\Pr(Y{=}y|X{=}x) = p}_{\!\gamma}
            &\le \bbr[\Big]{\dg M + ~\Pr(Y{=}y|X{=}x) = p}_\gamma(\hat \mu) \\
            &= \bbr{\dg M}_\gamma(\mu) + \Ex_{\hat\mu}\left[ 
                \log \frac{\hat\mu(\Yyshort | \XxYyshort)}{ \hat p(\Yyshort | \XxYyshort)} \right]\\
            &\hspace{-6em}= \bbr{\dg M}_\gamma(\mu) + 
                \mu(X{=}x,Y{=}y) \log \frac{\mu(Y{=}y|X{=}x)}{p}
                + \mu(X{=}x,Y{\ne} y) \log \frac{\mu(Y{\ne} y|X{=}x)}{1-p} \\
            &\hspace{-3em}= \bbr{\dg M}_\gamma(\mu) + 
                \mu(X{=}x,Y{=}y) \log(1)
                + \mu(X{=}x,Y{\ne} y) \log(1) \\[1ex]
            &= \bbr{\dg M}_\gamma(\mu) 
            = \aar{\dg M}_\gamma.
    \end{align*}
    The equality between the second and third lines
    is perhaps the trickiest to see, but follows
    because for joint settings in which $X{\ne}x$, 
    one can easily see that $\hat\mu(\Yyshort|\XxYyshort)$
    equals 1 with probability 1, as does $\hat p(\Yyshort|\XxYyshort)$.
    So, after dividing one by the other and taking a logarithm,
        these cases contribute nothing to the expectation.
    What remains are the two possibilities where $X{=}x$, which are shown in the third line.
    
    To complete this direction of the proof, it suffices to observe
    that we already knew the inequality held in the opposite direction
    (by monotonicity), so the two terms are equal. 
    
    $(\implies)$.  Suppose the two inconsistencies are equal, i.e.,
    $
    \aar[\Big]{\dg M + ~\Pr(Y{=}y|X{=}x) = p}_\gamma = \aar{\dg M}_\gamma.
    $
    
    This time, choose $\hat\mu \in \bbr{\dg M+ ~\Pr(Y{=}y|X{=}x) = p}^*_\gamma$,
        and define $\mu$ to be its marginal on the variables of $\dg M$ 
        (which contains the same information as $\hat \mu$ itself). 
    % 
    Let $q := \mu(Y{=}y|X{=}x)$. Then, 
    
    \begin{align*}
        \aar{\dg M}_\gamma &= \aar[\Big]{\dg M + ~\Pr(Y{=}y|X{=}x) = p}_{\!\gamma} \\
         &= \bbr[\Big]{\dg M + ~\Pr(Y{=}y|X{=}x) = p}_\gamma(\hat \mu) \\
         &= \bbr{\dg M}_\gamma(\mu) + 
             \mu(X{=}x,Y{=}y) \log \frac{\mu(Y{=}y|X{=}x)}{p}
             + \mu(X{=}x,Y{\ne} y) \log \frac{\mu(Y{\ne} y|X{=}x)}{1-p} \\
        &= \bbr{\dg M}_\gamma(\mu) + 
            \mu(X {=} x) \left[ q \log \frac qp + (1-q) \log \frac{1-q}{1-p} \right] \\
        &= \bbr{\dg M}_\gamma(\mu) + \mu(X{=}x) \kldiv qp \\
        &\ge \aar{\dg M}_\gamma + \mu(X{=}x) \kldiv qp
    \end{align*}
    Therefore $0 \ge \mu(X{=}x) \kldiv qp$. But relative entropy is non-negative,
    by Gibbs' inequality. This shows $\mu(X{=}x) \kldiv qp = 0$. 
    So either $\mu(X{=}x) = 0$, or $p = \mu(Y{=}y|X{=}x)$, and the first case is just
    a special case of the second one.  
    In addition, the algebra above shows that $\mu \in \bbr{\dg M}^*_\gamma$, as its 
        score is $\aar{\dg M}_\gamma$.
    Thus, we have found $\mu \in \bbr{\dg M}^*_\gamma$ such that $\mu(Y{=}y|X{=}x) = p$, completing the proof. 
\end{lproof}

Next, we show that the overall inconsistency is convex in the parameter $p \in [0,1]$.
It's perhaps easier to give the more general result:

\begin{linked}{lemma}{inc-strictly-cvx}
    If $h$ is a cpd, then
    the function $h \mapsto \aar{\dg M + h}_\gamma$
    is strictly convex for  $\gamma \in (0, \min_a \frac{\beta_a}{\alpha_a})$.
\end{linked}
% \recall{lemm:smooth-and-strictly-cvx}
\begin{lproof}\label{proof:inc-strictly-cvx}
	% First, we deal with the convexity, for which we make use of \cref{lem:cvx2}.
	% \commentout{
	% 	\def\mw#1{{\mat w}_{\!_{#1}}}
	% 	\def\ofmw(#1|#2){(\mw{#1} | \mw{#2})}
	% 	\begin{align*}
	% 		\aar{\dg M \bundle p}_\gamma &= \inf_\mu \Big[ \OInc_{\dg M \bundle p}(\mu)
	% 			+ \SInc_{\dg M \bundle p}(\mu) \Big] \\
	% 		&=  \inf_{\mu} \Ex_{\mat w \sim \mu}
	% 			\left[\log \mu(\mat w) +
	% 			 	\beta_p \log \frac{\mu\ofmw(Y|X)}{p\ofmw(Y|X)} \; +  \!\sum_{\ed LAB} \beta_L \log \frac{\mu\ofmw(B|A)}{\bp\ofmw(B|A)} + \alpha_L \log \frac{0}{\mu\ofmw(B|A)}\right] \\
	% 		&= f
	% 	\end{align*}
	% }
	We start by expanding the definitions, obtaining
	\begin{align*}
		\aar{\dg M + h}_\gamma &= \inf_\mu ~\bbr{\dg M + h}_\gamma(\mu) \\
			&= \inf_\mu \left[ \bbr{\dg M }_\gamma(\mu)
				+ \Ex_{x\sim\mu_{\!_X}} \kldiv[\Big]{\mu(Y\mid x)}{h(Y\mid x)} \right]\\
			&= \inf_\mu \left[ \bbr{\dg M }_\gamma(\mu)
				+  \kldiv[\Big]{\mu(X,Y)}{h(Y \mid X)\, \mu(X)} \right].
	\end{align*}
	% % Choose $\gamma < \min (\{1\}\cup\{ \beta^{\dg M}_L : L \in \Ed^{\dg M}\})$.
	% Since $\bbr{\dg M}_\gamma$ is a $\gamma$-strongly convex function of $\mu$ for all
	% such $\gamma < \min_L \beta_L$, and
	% $\kldiv{\mu_{XY}}{\mu_X \; p_{Y\mid X}}$ is 1-strongly
	% convex in $p$ for fixed $\mu$ (\cref{lem:Dstrongcvx}),
	% % $\thickD$ is convex in both of its arguments,
	% their sum is $\gamma$-strongly convex in $\mu$ and in $p$.
	% By \cref{lem:cvx2} taking an infemum preserves this convexity,
	% and so
	% $
	%  	\inf_\mu \left[ \bbr{\dg M }_\gamma(\mu)
	% 	+  \kldiv[\big]{\mu_{XY}}{p_{Y \mid X}\; \mu_X} \right]
	% $, which equals $\aar{\dg M \bundle p}_\gamma$,
	% is $\gamma$-strongly convex in $p$.
	% % $\aar{\dg M \bundle p}_\gamma$ is smooth
	% % Smoothness.


	% Choose $\gamma < \min (\{1\}\cup\{ \beta^{\dg M}_L : L \in \Ed^{\dg M}\})$.
	Fix $\gamma < \min_a \frac{\beta_a}{\alpha_a}$. Then we know that $\bbr{\dg M}_\gamma(\mu)$ is a $\gamma$-strongly convex function for every PDG $\dg M$, and hence there is a unique joint distribution which minimizes it. We now
    show that the inconsistency is strictly convex.

	% \textbf{Strict Convexity.}/
	Suppose $h_1(Y \mid X)$ and $h_2(Y\mid X)$ are two cpds on $Y$ given $X$.
	Fix $\lambda \in [0,1]$, and define $h_\lambda := (1-\lambda) h_1 + \lambda h_2$.
	Let $\mu_1, \mu_2$ and $\mu_\lambda$ be the joint distributions that minimize $\bbr{\dg M + h_1}_\gamma$, $\bbr{\dg M + h_2}_\gamma$ and $\bbr{\dg M + h_\lambda}_\gamma$, respectively.  Then we have
	\begin{equation*}
		\aar{\dg M + h_\lambda}_\gamma
			= \bbr{\dg M}_\gamma(\mu_\lambda) + \kldiv[\Big]{\mu_\lambda(X,Y)}{h_\lambda(Y\mid X) \mu_\lambda( X)}.
	\end{equation*}
	By convexity of $\bbr{\dg M}$ and $\thickD$, we have
	\begin{align}
		\bbr{\dg M}_\gamma(\mu_\lambda)
		 	&\le (1-\lambda)\bbr{\dg M}_\gamma(\mu_1) + \lambda \bbr{\dg M}_\gamma(\mu_2)
			 	\label{eqn:score-cvx}\\
		\text{and}\qquad \kldiv[\Big]{\mu_\lambda(XY)}{h_\lambda(Y | X) \mu_\lambda( X)}
			&\le (1-\lambda)\kldiv[\Big]{\mu_1(XY)}{h_1(Y | X) \mu_1( X)} \nonumber \\
			&\qquad+ \lambda\;\;\kldiv[\Big]{\mu_2(XY)}{h_2(Y | X) \mu_2( X)}.
				\label{eqn:D-cvx}
	\end{align}
	If $\mu_1 \ne \mu_2$ then since $\bbr{\dg M}$ is strictly convex, \eqref{eqn:score-cvx} must
	be a strict inequality. On the other hand, if $\mu_1 = \mu_2$, then since $\mu_\lambda = \mu_1 = \mu_2$ and $\thickD$ is strictly convex in its second argument when its first argument is fixed, \eqref{eqn:D-cvx} must be a strict inequality.
	In either case, the sum of the two inequalities must be strict, giving us
	\begin{align*}
		\aar{\dg M \bundle h_\lambda}_\gamma &=
		\bbr{\dg M}_\gamma(\mu_\lambda) + \kldiv[\Big]{\mu_\lambda(XY)}{h_\lambda(Y | X) \mu_\lambda( X)} \\
		&<
		 (1-\lambda) \left[\bbr{\dg M}_\gamma(\mu_1)
			 	+ \kldiv[\Big]{\mu_1(XY)}{h_1(Y | X) \mu_1( X)} \right]
			 \\[-0.3em]&\qquad\qquad
			 + \lambda \left[ \bbr{\dg M}_\gamma(\mu_2)
			 	+ \kldiv[\Big]{\mu_2(XY)}{h_2(Y | X) \mu_2( X)}
			 	\right] \\
		 &= (1-\lambda) \aar{\dg M \bundle h_1}_\gamma + \lambda\,\aar{\dg M \bundle h_2}_\gamma,
	\end{align*}
	which shows that $\aar{\dg M \bundle h}$ is \emph{strictly} convex in $h$, as desired.
\end{lproof}


\begin{coro}
    As before, let $\dg M$ be a PDG with $\bbeta \ge \mat 0$ and 
    variables $\X$, and fix $X, Y \subseteq \X$, $x \in \V X$, $y \in \V Y$, and $\gamma > 0$.
    Then, for $p \in [0,1]$ the map
    \[
        p \mapsto \aar[\Big]{\dg M + ~\Pr(Y{=}y | X{=}x)=p}_\gamma
    \]
    is strictly convex. 
\end{coro}
\begin{lproof}
    Simply take $h$ to be the cpd $\hat p$, and
    absorb the other components of (the PDG representation of) $\Pr(Y{=}y|X{=}x)=p$
    into $\dg M$, and then apply \cref{lemma:inc-strictly-cvx}.
\end{lproof}


We are now ready to tackle the theorem itself.

\textbf{\cref{theorem:inf-via-inc-oracle}~{(c)}.~} 
\textit{ For $\gamma \in \{0^+\}\cup(0,\min_a \frac{\beta_a}{\alpha_a})$,
    there is an $O($size of query result$)$ reduction
    from $\zogamma$-inference to the problem of calculating
    $\gamma$-inconsistency. 
     % from inconsistency calculation to inference;
    Under bounded tree-width, there
    is also an $O(|\V \C|)$ reduction in the other direction,
    making the problems essentially equivalent. }
% \recall{theorem:inf-via-inc-oracle}
% 
% \[
% \]
% \begin{lproof}\label{proof:inf-via-inc-oracle}
%     More assumptions:
%     \begin{enumerate}[nosep]
%         \item $\bbeta < \boldsymbol\infty$. 
%     \end{enumerate}
% 
%     If $\aar{\dg M}_{\gamma} = \infty$, then in a sense all distributions are equally 
%     valid, and so inference is trivial.
% 
%     Otherwise, if $\aar{\dg M}_{\gamma} < \infty$, and $\beta$ is not
%     infinite, then we can show that $\aar{\dg M + h_0}_\gamma < \infty$, 
%     where $h_0(Y|X)$ is the uniform distribution on $Y$ for every value of $x$.
% 
%     Next, we need to bound $\aar{\dg M + H}_\gamma$.   
% 
% 
% \end{lproof}
\begin{lproof}
    The claim is that, with an inconsistency oracle (in particular, 
        with the ability to query the
        inconsistency of $\dg M  + ~\Pr(Y{=}y|X{=}x) = p$ for free),
    we can find $\bbr{\dg M}^*_\gamma(Y{=}y|X{=}x)$ in constant time. 
    Thus, if we are looking to find the entire matrix of conditional 
    probabilities $\bbr{\dg M}^*_\gamma(Y|X)$, it will take time linear
    in the number of entries in that matrix (i.e., $|\V(X,Y)|$).
    The reduction is also linear in another sense.        
    We will prove that we can approximate $\bbr{\dg M}^*_\gamma(Y{=}y|X{=}x)$ to within 
    precision $\epsilon$ in time $O(\log \nicefrac{1}{\epsilon})$.
    Thus the result is the strongest reduction one can hope for:
        it is linear in the number of bits we output. 
    
    We proceed by describing an algorithm
        that uses our inconsistency oracle
        to answer probabilistic queries with a variant of binary search.
    The state of the algorithm
    consists of three points $a \le b \le c$ in the interval $[0,1]$, 
    labeled with the values $f(a)$, $f(b)$, and $f(c)$, where
    $f$ is the function
    \begin{align*}
        &f : [0,1] \to \Rext\\
        &p \mapsto \aar{\dg M + (\Pr(Y{=}y|X{=}x)=p)}_\gamma
    \end{align*}
    that we assumed we have oracle access to.
    % is the inconsistency of $\dg M$ with the belief that $\Pr(Y{=}y|X{=}x)=p$,
    % .
    We can then find the minimizer $x^*$ of $f$ with the following algorithm.
    
    \medskip
    \rule{4in}{0.2ex}
    \begin{algorithmic}
        % \State $z \gets (y_0, x_0, \tau_0, s_0, \kappa_0)$ as in \eqref{bbeq:init};
        \STATE Initialize $(a,b,c) \gets (0, \frac12, 1)$;
        \FOR{$i = 1, 2,\ldots, \log(\nicefrac{1}{\epsilon}) / (\log \nicefrac43)$}
            \IF{$b-a \ge c-b$}
                \STATE Let $x := \frac{b + a}{2}$, and evaluate $f(x)$;
                \IF{$f(x) < f(b)$}
                    \STATE $(a,b,c) \gets (a,x,b)$;
                \ELSE
                    \STATE $(a,b,c) \gets (x,b,c)$;
                \ENDIF
            \ELSIF{$c-b > b-a$}
                \STATE Let $x := \frac{b + c}{2}$, and evaluate $f(x)$;
                \IF{$f(x) < f(b)$}
                    \STATE $(a,b,c) \gets (b,x,c)$;
                \ELSE
                    \STATE $(a,b,c) \gets (a,b,x)$;
                \ENDIF
            \ENDIF
        \ENDFOR
        \STATE \textbf{return} $b$;
    \end{algorithmic}
    \rule{4in}{0.2ex}
    % \medskip
    
    We begin by proving that the algorithm does, in fact, find $x^*$.
    Because $f$ is convex, 
    this procedure satisfies an important invariant: both $b$ and the minimizer of $f$
    always lie in the interval $[a,c]$.  
    
    \begin{lproof}
        \vspace{-1em}
        We proceed by induction. This is obviously true at first, because
        $[a,c] = [0,1]$ contains all points in the domain of $f$. 
        Let $x^*$ be the minimizer of $f$, and suppose inductively that
        $x^* \in [a,c]$. 
        \begin{itemize}[leftmargin=4em]
            \item [(case 1)] If $b-a \ge c-b$, then $x \in [a,b]$. 
            \begin{itemize}[leftmargin=-1em]
                \item Suppose $f(x) < f(b)$. Then for 
                all $y > b$, it must be the case that $f(y) > f(b)$ by convexity of $f$.
                (For if $f(y) < f(b)$, then the segment between $(x, f(x))$ and $(y, f(y))$ would lie entirely below $(b, f(b))$, which contradicts convexity).
                Thus, we can rule out all such $y$ as possible minimizers of $f$, so we can restrict our attention to $[a, b]$, which contains $x$. 
                                                
                \item On the other hand, if $f(x) > f(b)$, then it must be the case that no $y < x$ can be a minimizer of $f$ by convexity, with the same reasoning as above. 
                (Namely, if $f(y) < f(x)$ then the segment between $(y,f(y))$ and $(b,f(b))$ lies below $(x,f(x))$, contradicting convexity). 
                Thus the true minimizer lies in $[x,c]$, which contains the point $b$. 
            \end{itemize}
            \item [(case 2)] The other case is symmetric; we include it for completeness. Suppose $c-b > b-a$,
                and so $x = \frac{b+c}{2}$.
            \begin{itemize}[leftmargin=-1em]
                \item Suppose $f(x) < f(b)$. Then $f(y) > f(b)$ for  all $y < b$
                (because if $f(y) < f(b)$, then the segment between $(y, f(y))$ and $(x, f(x))$ would lie  below $(b, f(b))$).
                So $x^*, x \in [b, c]$. 
                                                
                \item On the other hand, if $f(x) > f(b)$, then $f(y) > f(x)$ for all $y > x$
                (because, if $f(y) < f(x)$ then the segment between $(y,f(y))$ and $(b,f(b))$ lies below $(x,f(x))$, contradicting convexity). 
                So $x^*, b \in [a,x]$. \qedhere
            \end{itemize}
        \end{itemize}
    \end{lproof}
    
    The other important aspect of the algorithm is that each iteration
    reduces the size of the interval by a factor of at least 3/4 (and possibly much more).
    % This is because the first branch of each case eliminates the entire
    % larger half of the remaining interval, while the second branch eliminates half of the larger half
    % of the interval, or at least 1/4 of it. 
    This is because in each case we focus on the larger half of the interval,
        and ultimately discard either half or all of it---so
        we reduce the size of the interval by at least one quarter.
        % because cases 1.2 and 1.2 eliminate at least 
    Thus, after $n$ iterations, the size of the interval is at most $(\nf34)^n$.
    At this point it should be clear that the number of iterations
    was selected to ensure that $|c-a| \le \epsilon$.
    Of course, $[a,c]$ contains both $b$ and $x^*$.
    Therefore the output of the algorithm ($b$) must be within $\epsilon$ of the true minimizer ($x^*$).
    
    It is easy to see that this algorithm takes constant space, and $O(\log \nf1\epsilon)$ time.
    And, fixing $\epsilon = 10^{-78}$ we get a constant-time algorithm
    that outputs the closest 64-bit number to $x^*$. 
    % Initialize $a = \frac13$, $b = 2/3$. 
    
    \textbf{Reducing inconsistency calculation to inference.}
    This reduction is much simpler, and shares more techniques with the primary
    thrust of the paper. First find a tree
    decomposition $(\C, \mathcal T)$ of the PDG's structure,
    and then query the marginals of each clique.  Because of the
    work we've already done, we know this is enough 
    information to evaluate the scoring function, 
    including the joint entropy, by \eqref{eq:cluster-ent-decomp}.     
    % After querying the conditional probabilities $\bbr{\dg M}_\gamma^*(\Tgt a | \Src a)$ 
    % along each arc $a \in \Ar$, one has enough information to evaluate
    % the scoring function $\bbr{\dg M}_\gamma$
\end{lproof}


%joe10*: This seems like an awfully weak section, and not ready for
%prime time.  It's worth 1-2 sentences in the conclusion.  I wouldn't
%give it a whole section.
%
%oli11*:  This is a story I think is important --- it's not just that
% I proved these results and am trying to find a home for them.
% I find it valuable to see how inference fits into the framework of
% the AISTATS paper, and I think it's my scholarly duty to report back
% about the possible route to inference in the AAAI paper.  I don't
% your reaction against it.  Yes, these are not powerful technical 
% results that can carry a paper, and that is why they are in the 
% appendix. I also think this perspective augments the end of 
% section 4, where we link to it.

%oli12: Re-branding this appendix to be closer to the story I think
% it should tell. 
% \section{Larger
%     \texorpdfstring{$\boldsymbol\gamma$}{Gamma} and the Convex-Concave Procedure (CCCP)}
\section{The Convex-Concave Procedure, and Implementation Details}
    \label{sec:cccp}

%oli14: from our conversation, rewriting the first part of this section.
Optimization problems \eqref{prob:joint-small-gamma} and \eqref{prob:cluster-small-gamma}
can be extended to apply slightly more broadly.
There are some cases where there is a unique optimal distribution
% $\{\mu^* \} =  \bbr{\dg M}_\gamma^*$, 
but $\gamma$ is large
enough that $\bbeta \not\ge \gamma\balpha$.
%oli14:
% In these cases, our disciplined convex programming approach as written will fail to typecheck---but it can still be a useful building block.
In these cases, our convex program will fail to satisfy the dcp requirements, and 
    so we cannot compile it to an exponential conic program---but 
    it turns out to still be a useful building block.
We now describe how we can still do inference in some of these cases with the
    convex-concave procedure, or CCCP \parencite{yuille2003concave}.
This will give us a local minimum of the
    PDG scoring function $\bbr{\dg M}_\gamma$,
    without requiring us to write this scoring function
    in a way that proves its convexity,
%oli14: adding
    (as is necessary in order to specify a disciplined convex program). 
At this point, if we happen to know that the problem is convex (or even
    just pseudo-convex) for other reasons, then finding this distribution
    suffices for inference. 
We now describe how this can be done in more detail.
% At a high level, because we can split the PDG scoring function $\bbr{-}_\gamma$
% into convex and concave parts, 


% Now, suppose we're no longer
%oli12: new starting material
% We now describe a useful technical construction in our implementation,
% one effect of which is to enable inference under \emph{slightly} milder
% conditions (i.e., for certain larger values of $\gamma$ under
%     specific conditions).
% However, in these cases, we lose the polynomial time guarantee. 

%oli14
% Let's now entertain the case where 
Suppose
    % $\gamma$ is large enough that
    $\beta_a < \gamma \alpha_a$ for some $a \in \Ar$.
%oli12: this material is unnecessary, and can be trimmed down
% meaning that the structural term $(\SInc_{})$,
% is large enough to overpower the observational one $(\OInc)$. 
% The primary difficulty is that $\SInc$ may not be convex. 
In this case $\bbr{\dg M}_\gamma$ may not be convex, in general.%
%oli12: now in footnote
\footnote{
    %oli12: reworking. 
    % It is easy to see that the set of distributions that make $X$ and $Y$ is independent,
    % If $\bbeta = \mat 0$, for instance, the minimizers of
    Consider the PDG $({\to} X,\, Y {\gets})$ for instance, which 
    has arcs to $X$ and $Y$, both with $\alpha = 1$ and $\beta = 0$. 
    The minimizers of
     $\bbr{{\to} X,\, Y {\gets}}_\gamma$ 
    % a PDG with edges $\{$
        are the distributions that make $X$ and $Y$ independent.
    It is easily seen that this set
    %
    is not convex: $X$ and $Y$ are independent if either variable is deterministic,
    and every distribution is a convex combination of deterministic distributions.
    }
    % \footnote{one can easily verify that set of distributions that make $X$ and $Y$ independent is not convex.}
%
% While we do not know 
%     how to find all minima and answer queries with respect to them,
%     there is still a way to find \emph{some} local minimum of the
%     PDG scoring function $\bbr{\dg M}_\gamma$.
However, we do know how to decompose $\bbr{\dg M}_\gamma$ into a sum
of a convex function $f(\mu)$ and a concave one $g(\mu)$.
Concretely: each term on the second line of \eqref{eq:altscore} is either convex or concave, depending on the sign of the quantity $\gamma \alpha_a - \beta_a$.
Once we sort the terms into convex terms $f(\mu)$ and strictly concave terms $g(\mu)$, 
    the CCCP tells us to repeatedly solve $f$ plus a 
%oli14: I guess I can just remove "frozen"; I don't think it's worth
% the effort to describe what I mean here because it'll be 
% clear in two sentences anyway.  I was just trying to paint a picture.
        % frozen linear approximation to $g$.
        linear approximation to $g$.
In more detail, the algorithm proceeds as follows. 
First, choose an initial guess $\mu_0$, and iteratively use the convex solver 
    as in the main paper to compute
%
\begin{align*}
    \mu_{t+1} &:= \argmin_{\mu} f(\mu) + (\mu - \mu_{t})^{\sf T}
        \nabla g(\mu_t).
\end{align*}
%
This can be slow because each iteration of the solver is expensive. 
Still, it is guaranteed to make progress, since
\def\tplus1{{t\mskip-2mu+\mskip-2mu1}}
\begin{align*}
    f(\mu_\tplus1) \!+\! g(\mu_\tplus1) &<  f(\mu_\tplus1) \!+\! (\mu_\tplus1 \!-\! \mu_t)^{\sf T} \nabla g(\mu_t) \!+\! g(\mu_t)
        % &\text{(concavity of $g$)}
        \\
    &\le  f(\mu_t) + (\mu_t - \mu_{t})^{\sf T}\nabla g(\mu_t)  + g(\mu_t)
        % &\text{(defn of argmin)}
    \\
&= f(\mu_t) + g(\mu_t).
\end{align*}
Furthermore, because in our case $g$ is bounded,
%oli14: this asside is probably not helpful; people will believe me
% and anyone who's made it this far can probably see why.
    % (since they are conditional entropies of a finitely supported probability measure),
    the process eventually converges to a local minimum
    %oli14: ading
    of $\bbr{\dg M}_\gamma$.
% \subsection*{Implementation Detail and Experiments with the CCCP}
% It is worth noting that if $\gamma = 0$
%
%oli14: merging paragraphs and tying to the sentence above
% This procedure always produces a local minimum,
This alone, however, is not sufficient for inference,
% because it may not find all local minima.
% Thus, 
%oli14:
% because we cannot use that information
because we may not be able to use this local minimum
    to answer queries in a way that is true of \emph{all} minimizing distributions. 
But, if it happens there is a unique local minimum, then the CCCP will
    find it, leading to an inference procedure.
%oli14: removing all of this, which is now covered above.
% In certain cases, however, it does suffice. For example, there
%     are situations when $\bbr{\dg M}_\gamma$ happens to be convex
%     but not because of the pointwise condition $\bbeta \ge \gamma \balpha$,
%     so it is not clear how to set up the disciplined convex program.
% We don't even need convexity; we only need the objective to be pseudo-convex
%     for the CCCP to give us an inference procedure. 

%joe10*: There is a nonsequitur here; this material does not follow
%from what you added.  Moreover, like the previous addition, I find
%this addition weak and not ready from prime time.  Again I would cut
%it and replace it by 1-2 high-level sentences.
%
%oli11*:  I tried putting 1-2 high-level sentences in the discussion,
% and you (probably correctly) advised me to remove them.  I don't
% think the prime time argument applies as well to the appendix. Here
% are some reasons I think this material should make it in:
%
%  First of all, it makes it much easier to understand the code, why
%  it's written this way, and what it's doing. If I were implementing
%  our method, I would go to the appendix whenever I got stuck, and
%  these four equations are a lot more convenient than sifting through
%  the CCCP paper, which has a lot more going on.
%
% Secondly, I want to refer to this particular algorithm in future 
% work. I already know that this process will be useful, and I expect
% it to be part of the PDG library.  Now, I understand that it
% complicates the inference story, which is why it's now in the
% appendix.  But I'm not telling an inference story here; I'm
% explaining the computations you can do with PDGs.  This material
% certainly isn't a core contribution to another paper; it's just a
% modest technical extension that unifies our implementation.  It's
% precisely the kind of thing I think belongs in an appendix.



%oli14: rewriting this, in light of our conversation
% Notice that in our implementation there is no 
% method corresponding to problems \eqref{prob:joint-small-gamma} or \eqref{prob:cluster-small-gamma} --- rather, we have only implemented 
% % the concave-convex procedure, or CCCP \parencite{yuille2003concave}. 
% the CCCP.
% This is because they are special cases of the CCCP, for when the problem happens to satisfy $\bbeta \ge \gamma \balpha$.
% %oli9: repurposing the above as an implemenation detail, as Joe suggested.
% In this case, the concave part of the function $g$ is identically zero, and one needs precisly one iteration. 
Notice that if $\bbeta \ge \gamma\balpha$, then the concave part $g$ is identically
zero, and CCCP converges after making just one call to the convex solver. 
Therefore, in the cases we could already handle, this extension 
reduces to the algorithm we described before.
For this reason, all of our code that handles problems
 \eqref{prob:joint-small-gamma} and \eqref{prob:cluster-small-gamma}
is augmented with the CCCP. 
 

% Notice that in our implementation there is no 
% method corresponding to problems \eqref{prob:joint-small-gamma} or \eqref{prob:cluster-small-gamma} --- rather, we have only implemented 
% the concave-convex procedure, or CCCP \parencite{yuille2003concave}. 
% the CCCP.
% This is because they are special cases of the CCCP, for when the problem happens to satisfy $\bbeta \ge \gamma \balpha$.
% %oli9: repurposing the above as an implemenation detail, as Joe suggested.
% In this case, the concave part of the function $g$ is identically zero, and one needs precisly one iteration. 



% The CCCP can be quite useful compared to black-box approaches as well. 
%oli14:
% Compared to the black-box approaches,
Compared to the black-box optimization baselines (Adam and LBFGS),
 which also only find one minimum,
    the CCCP still has some advantages. 
One can see in \cref{fig:gamma-v-gap-fine}, for example, that when $\gamma = 2 > 1 = \max_a {(\beta_a / \alpha_a)}$, 
CCCP performs better than the baselines. 
%oli14:
% In fact, the biggest factor holding back the accuracy
% In fact, a big factor holding back the accuracy of CCCP-augmented approach  in our experiments
In fact, the CCCP-augmented approach could probably achieve even higher accuracy,
    if we were not limiting it to
    %oli14:
    % our cap of a meager five iterations, 
    % is the fact that we do
    a maximum of only five iterations.
        % as each iteration is quite time consuming.
%oli14: we technically have a link to the appendix now, and
% no more groveling is necessary now that I think everyone can
% get on board with this as part of the appendix. 
%
% We kept the CCCP extension out of the main paper because the benefits for
%     inference are marginal, and because we do not want to give the false impression 
%     that we can solve inference in this regime. 


\section{Details on the Empirical Evaluation}\label{sec:expt-setup}
Imagine a very steep $V$-shaped canyon, and inside a small slow-moving stream at a gentle incline. The end of the river may be very far away, and the whole landscape may be smooth and strongly convex, but the gradient will still almost always point perpendicular to it, and rather towards the center of the river.
This intuition may help explain why, even though $\bbr{\dg M}_\gamma$ is infinitely differentiable in $\mu$ and $\gamma$-strongly convex, it can still be challenging to optimize, especially when the $\beta$'s are very different, or when $\gamma$ is small.
For example, a solution to \eqref{prob:cluster-inc} finds a minimizer of $\OInc$, but such minimizers may be very far away from $\bbr{\dg M}_{0^+}^*$, despite sharing an objective value.

We now see how this is true even when working with very small PDGs and joint distributions.

% One interesting feature of this class of problems is that they do not seem to be
% amenable to the same kinds of generic modeling techniques that
%joe4*: this feels like the werong story, but I would need to know
%exactly hwat experiments are performed to know the "right" story.
%oli4: fair enough; this will probably be rewritten. Still, I want to defend the core
% of it. If we could just use gradient descent or other standard black-box optimization
% tools used to train neural networks, at these problems, that would severely undercut
% the story of our paper.  The contrapositive of that is that this is a very expressive
% class of optimization problems (and, I have argued, includes most of probabilistic
% modeling in ML). Not only do those methods have no guarantees, but also it seems they
% do not perform well in practice. This would mean we have found an important class of
% problems on which we can emperically outperform the standard methods!
%joe5: please run a spellchecker.  There are mistakes tha you make
%consistently (like "emperically").  "Empirically" is unrelated to
%"emperors"! 
%oli5: haha it's good to finally know what "emperical" means after using it for all these years! I've replaced the remaining few of these and will run a complete spell check in the next iteration. 

\begin{figure}
    % \includegraphics[width=\linewidth]{figs/resources-fine}
    \includegraphics[width=\linewidth]{figs/rand-joint/resource-costs}
    \caption{\small
        Resource costs for the joint-distribution optimization setting of \cref{sec:inf-as-cvx-program}.
        We measure computation time (\texttt{total\_time}, top) and maximum memory usage (\texttt{max\_mem}, bottom) for the various optimization methods (by color), as the size of the PDG increases, as measured by the number of parameters in the PDG (\texttt{n\_params}$\,=\V\!\Ar$, left), and the size of a joint distribution over its variables (\texttt{n\_worlds}$\,=\V\!\X$, right).
        Note that the convex solvers for the 0 and $0^+$ semantics are significantly faster than LBFGS, and on par with Adam.
        % However, the small-$\gamma$ method (\verb|cccp_opt_joint|)
        However, all three convex-solver based approaches require significantly more memory than the black-box optimizers.
     }\label{fig:resources}
\end{figure}



% \textbf{Comparison to Black-Box Optimizers, on Joint Distributions. }
\subsection{Synthetic Experiment: Comparison with Black-Box Optimizers, on Joint Distributions.} \label{sec:joint-expt-details}
% Even without the fancier construction 
% From an optimization perspective, 
% one interesting feature of PDG inference
% is that, despite having incredibly nice properties
% We begin with some results on joint distributions; 
% the experimental setup is described in \cref{sec:expt-setup}.
 % applying black-box optimization methods such as gradient descent and LBFGS, do not seem to reliably find minima of \eqref{eqn:scoring-fn}.
% \textit{Resources.}
\begin{wrapfigure}{R}{4.8cm}
    \centering
    \vspace{-3ex}
    \includegraphics[width=4.5cm]{figs/rand-joint/representation}
    \caption{\small differences in performance between the Gibbs and simplex parameterizations of probabilities.}%
    \label{fig:representation}%
\end{wrapfigure}


Here is a more precise description of our first synthetic experiment,
on joint distributions, which contrasts the convex optimization approaches of \cref{sec:inf-as-cvx-program} with black-box optimizers.



\begin{itemize}
    \item Generate 300 PDGs. For each PDG, the following quantities are natural numbers chosen uniformly at random:
    \begin{itemize}
        % [nosep,label={$\blacktriangleright$}]
        \item $N \in \{5,\ldots,9\}$ of variables
            (so that $\X := \{1, \ldots, N\}$),
        \item $V_X \in \{2, 3\}$ values per variable
            (so that $|\V X| = V_X$ for each $X \in \X$)
        \item $A \in \{7, \ldots, 14\}$ hyperarcs,
        each $a \in \{1, \ldots, A\} =: \Ar$ of which has
        \item $N^S_a \in \{0, 1, 2, 3\}$ sources, and
        \item $N^T_a \in \{1,2\}$ targets.
    \end{itemize}
    \item For each arc $a \in \Ar$, $N^S_a$ of the $N$ variables are chosen without replacement to be sources $S_a \subseteq N$, and $N^T_a$ of the remaining variables are chosen to be targets. Finally, to each value of $S_a$ and $T_a$, a number $p_{a,s,t} \in [0,1]$ is chosen uniformly at random, and the cpd 
    \[
     \p_a(\Tgt a {=} t \mid \Src a {=} s) = 
        \frac{p_{a,s,t}}{\displaystyle\sum_{t' \in \V(T)} p_{a,s,t'}}
     \qquad\text{ is given by normalizing appropriately.}
    \]
    This defines a PDG $\dg M = (\X, \Ar, \mathbb P, \mat 1, \mat 1)$, that
    has $\balpha = \bbeta = \mat 1$, which will allow us to compare against
    belief propagation and other graphical models at $\gamma = 1$.
    The complexity of this PDG is summarized by two numbers:
    \begin{itemize}[nosep]
        % \item \texttt{n\_edges}, the number of edges in $\dg M$,
        \item \texttt{n\_params}$\,:= \V\!\Ar$, the total number of parameters in all cpds of $\dg M$, and
        \item \texttt{n\_worlds}$\,:= \V\!\X$, the dimension of joint distributions over $\dg M$'s variables.
    \end{itemize}
\end{itemize}

\begin{itemize}    
    \item Run MOSEK on \eqref{prob:joint-inc} to find a distribution that minimizes $\OInc$; we refer to this method as \texttt{cvx-idef}
    \item Use the result to run MOSEK on \eqref{prob:joint+idef} to find the special distribution $\bbr{\dg M}^*_{0^+}$; we refer to this method as \texttt{cvx+idef}. These names are due to the fact that $\SInc$ is called $\IDef{}$ in previous work \parencite{pdg-aaai,one-true-loss};
    thus, this refers to using the convex solver to compute minimizers of $\OInc$ with and without considering $\IDef{}$.
    
    \item Now for the torch baselines. 
    Let $\theta = [\theta_{\mat x}]_{\mat x \in \V\X} \in \mathbb R^{\V\X}$, be a vector of optimization variables, and choose a representation of the joint distribution, either by
    \[
    % $\displaystyle
        \Big(\begin{array}{c}\text{renormalized}\\
        \text{simplex}\end{array}\Big)\quad
        \mu_{\theta}(\mat x) = \frac{\max\{\theta_{\mat x}, 0\} }
            {\sum_{\mat y \in \V\!\X}\max\{\theta_{\mat y}, 0\} }
    % $.
    \qquad\text{or}\qquad
    \mu_{\theta}(\mat x) = \frac{\exp(\theta_{\mat x})}{\sum_{\mat y \in \V\!\X} \exp(\theta_{\mat y})}\quad(\text{Gibbs})
    \qquad\quad\text{(see \cref{fig:representation})}
    \]
    \item 
    Then, for each value of gamma: $\gamma \in \{0, 10^{-8}, 10^{-4}, 10^{-2}, 1\}$, and each learning rate $\texttt{lr} \in \{10^{-3}, 10^{-2}, 10^{-1}, 1\}$, and each optimizer $\mathit{opt} \in \{\texttt{adam}, \texttt{L-BFGS}\}$, 
    run $\mathit{opt}$ over the parameters $\theta$ to minimize $\bbr{\dg M}_\gamma(\mu_{\theta})$
     until convergence (or a maximum of 1500 iterations)  
     
     \item We collect the following data about the resulting distribution and the process of computing it:
     \begin{itemize}[nosep]
         \item the total time taken to arrive at $\mu$;
         \item the maximum memory taken by the process computing $\mu$;
         \item the objective and its component values:
         \vspace{-1ex}
         \[
            \texttt{inc} := \OInc_{\dg M}(\mu),
            \qquad \texttt{idef} := \SInc_{\dg M}(\mu),
            \qquad \texttt{obj} := \OInc_{\dg M}(\mu) + \gamma \SInc_{\dg M}(\mu) = \bbr{\dg M}_\gamma(\mu) 
        \]
     \end{itemize}
\end{itemize}



The numbers can then be recreated by running our experimental script as follows:
\begin{verbatim}
python random_expts.py -N 300 -n 5 9 -e 7 14 -v 2 3
    --ozrs lbfgs adam
    --learning-rates 1E0 1E-1 1E-2 1E-3
    --gammas 0 1E-8 1E-4 1E-2 1E0
    --num-cores 20
    --data-dir random-joint-data
\end{verbatim}
which creates a folder called \verb|random-joint-data|,
and fills it with \verb|.mpt| files corresponding to each distribution
and the method / parameters that gave rise to it. 

\textbf{Analyzing the Results.}
Look at \cref{fig:resources}.  Our theoretical analysis, and in particular the proof of \cref{lem:cluster-inc-polytime}, suggests that the magnitudes of $\V\!\X$ and $\V\!\Ar$ play similar roles in the asymptotic complexity of PDG inference. 
Our experiments reveal that, at least for random PDGs, the number of worlds is the far more important of the two; observe how much more variation there is on the left side of the figure than the right---and now note that the left side has been smoothed, while the right side has not.
% Observe also that the time complexity of L-BFGS seemst to be growing at a larger rate than 
The black-box PyTorch-based approaches clearly have an edge in that they can handle larger models, as evidenced by the cut-offs on the right-hand side of \cref{fig:gap-resource-fine-old}, when run with 5\,GB of memory.

Note that the exponential-cone-based methods for the observational limit (gold) are actually faster than L-BFGS (the black-box optimizer with the lowest gap), and also seem to be growing at a slower rate.
However, they use significantly more memory, and cannot handle large models.
In addition to being faster, our techniques also seem to be more precise; they achieve objective values that are consistently much better than the black-box methods.
% Recall 


Now look at \cref{fig:joint-gap-vs-time-by-gamma},
which contains a break-down of the information in \cref{fig:joint-gap-time}. The bottom half of the figure is just the same information, but with each value of $\gamma$ separated out, so that the special cases of the factor product and $0^+$ inference become clear, while the top half shows why it's more important to look at the gap than the actual objective value for these random PDGs. 
\Cref{fig:joint-gap-vs-time-by-gamma} also makes it clearer how larger problems take longer, and especially so for \texttt{cccp} (violet), which solves the most complex version of the problem \eqref{prob:joint-small-gamma}.

\begin{figure}
    \includegraphics[height=0.45\textheight]{figs/rand-joint/gap-vs-time-by-gamma}
    \caption{\small
        An un-compressed version of the information in \cref{fig:joint-gap-time}, that groups by the value of $\gamma$, and also gives the absolute values of the objectives (top row) in addition to the relative gaps (bottom row).
    }\label{fig:joint-gap-vs-time-by-gamma}
\end{figure}




\discard{\begin{figure}
    \includegraphics[height=0.45\textheight]{figs/gamma-vs-gap-bettergap}
    \caption{
        A graph of the gap (the difference between the attained objective value, and the best objective value obtained across all methods for that value of $\gamma$),
        as $\gamma$ varies. The x-axis is $\log_{10} ( \gamma + 10^{-15})$.
        As before, colors indicate the optimization method; here blue corresponds to \eqref{prob:joint+idef}, while orange corresponds to \eqref{prob:joint-small-gamma}, and green, as before, corresponds to all optimization baselines.
        The size of the circle illustrates the relative number of worlds.
        See \cref{fig:gamma-v-gap-fine} for a more detailed breakdown.
    }\label{fig:gamma-v-gap}
\end{figure}}


\begin{figure}
    \includegraphics[width=\linewidth]{figs/2}
    \caption{
        A graph of the gap (the difference between the attained objective value, and the best objective value obtained across all methods for that value of $\gamma$),
        as $\gamma$ varies. The x-axis is $\log_{10} ( \gamma + 10^{-15})$.
        As before, colors indicate the optimization method, and
        the size of the circle illustrates the number of optimization variables (i.e., the number of possible worlds).
        % A fine-grained variant of \cref{fig:gamma-v-gap}, which splits each method into sub-groups.
        \texttt{cvx-idef} corresponds to just solving \eqref{prob:joint-inc}, and \texttt{cvx+idef} corresponds to then solving problem \eqref{prob:joint+idef} afterwards.
         % depending on whether or not it also computed the second step described in \cref{sec:empirical-limit} to account for $\SInc_{}$.
        The CCCP runs are split into regimes where the entire problem is convex ($\gamma \le 1$, labeled \texttt{cccp-VEX}), and the entire problem is concave ($\gamma > 1$, labeled \texttt{cccp-CAVE}).
        The optimization approaches \texttt{opt\_dist} are split into three different optimizers: L-BFGS, Adam, and also a third one that 
        performs relatively poorly: accelerated gradient descent.
        Note that for small $\gamma$, the exponential-cone based methods significantly outperform the gradient-based ones.
    }\label{fig:gamma-v-gap-fine}
\end{figure}


\subsection{Synthetic Experiment: Comparing with Black-Box Optimizers, on Clique Trees} \label{sec:clus-expt-details}

\begin{figure}
    \centering
    \includegraphics[width=0.67\linewidth]{figs/rand-clus/gap-vs-time}
    \caption{An analogue of \cref{fig:joint-gap-time}, for the cluster setting.
    Note that there is even more separation between the exponential-cone based approaches, and the black-box optimization based ones.
    The new grey points on the bottom correspond to belief propagation, which is both faster and typically the most accurate.}
    \label{fig:clus-gap-vs-time--appendix}
\end{figure}
\begin{figure}
    \centering
    \includegraphics[width=0.67\linewidth]{figs/rand-clus/resource-costs}
    \caption{Resource costs for the cluster setting. Once again, the $\OInc$-optimizing exponential cone methods are in gold, the small-gamma and CCCP methods are in violet, and the baselines are in green. The bottom line is belief propagation, which is significantly faster and requires very little memory, but also only gives the correct answer under very specific circumstances.}
    \label{fig:clus-resource-costs}
\end{figure}


\begin{enumerate}
    \item Choose a number of variables $N \in \{ 8, \ldots, 32 \}$, and a treewidth $k \in \{1, \ldots, 4\}$ uniformly at random. 
    Then, draw a random $k$-tree and corresponding tree of clusters $(\C, \mathcal T)$, as follows:
    \begin{enumerate}
        \item initialize $G \gets K_{k+1}$ to a complete graph on $k+1$ vertices, and $\C \gets\{ K_{k+1} \}$ to be the set containing a single cluster, and $\mathcal T\gets \emptyset$.
        \item until there are $N$ vertices: add a new vertex $v$ to $G$, then randomly select a $k$-clique (fully-connected subgraph on $k$ vertices) $U \subset G$, and add edges between $v$ and every vertex $u \in U$.
        Then, add $U \cup \{v\}$ to $\C$, and add edges to every other cluster $C \in \C$ such that $U \subset C$.
    \end{enumerate}
    % It is well-known that this procedure gives a tree; see, for instance,
    % \cite{}
    \item Then, draw the same parameters $V_X \in \{2,3\}$, $A \in \{8, \ldots, 120\}$, $N_a^S \in \{0,1,2,3\}$, and $N_a^{T}\in \{1,2\}$ 
    as in \cref{sec:joint-expt-details} uniformly at random.
    While $N_a^S + N^T_a > k+1$, for any $a$, resample $N_a^S$ and $N_a^T$.
    
    \item Form a PDG whose structure $\Ar$ can be decomposed by $(\C, \mathcal T)$, as follows: 
    for each edge $a \in \Ar$, sample a cluster $C \in \C$ uniformly at random; then select $N_a^S$ nodes from that cluster without replacement as sources, and $N_a^T$ nodes as targets; this is possible because each cluster has $k+1$ nodes, and $N_a^S + N_a^T \le k+1$ by construction.
    \item Fill in the probabilities by drawing uniform random numbers and re-normalizing, just as before, to form a PDG $\dg M$.
    
    \item The black-box optimization baselines work in much the same way also, although now the optimization variables include not one distribution $\mu$ but a collection $\bmu$ of them; 
    this time, we use only the simplex representation of $\bmu_\theta$.
    More importantly, we want these clusters to share appropriate marginals; to encourage this, we add a term to the loss function, so overall, it is
    \[
        \ell(\theta) := \bbr{\dg M}_{\gamma}(\bmu_{\theta}) + \sum_{C{-}D \in \mathcal T} \left[ \exp \left(\sum_{w \in \V(C\cap D)} \Big(\mu_C(C\cap D{=}w) - \mu_D(C \cap D {=}w)\Big)^2 \right) - 1 \right].
    \]
    This is admittedly pretty ad-hoc; the point is just that it is zero and does not contribute to the gradient if $\bmu_\theta$ is calibrated, and otherwise quickly becomes overwhelmingly important.
    % \begin{enumerate}
    %     \item 
    % \end{enumerate}
\end{enumerate}

\textbf{Analyzing the Results.}
Observe in \cref{fig:clus-gap-vs-time} that the separation between the clique tree convex solver and the black-box algorithms is even more distinct. This is because, in this case, the penalty for violating constraints was too small, and the optimization effort was largely wiped out by the calibration before evaluation. 


\begin{figure}
    \centering
    \includegraphics[width=0.7\textwidth]{figs/bn/gap-v-time}
    \caption{Gap vs inference time for the small PDGs in the \href{https://www.bnlearn.com/bnrepository/}{\texttt{bnlearn}} repository}
    \label{fig:bn-gap-v-time}
\end{figure}

This illustrates another general advantage that the convex solver has over black-box optimizers: it is much less brittle, and much less reliant on tuning parameters exactly right. Note that even in this minimal example, there were many hyper-parameters that require tuning:
the regularization strengths that enforce soft constraints (clique tree calibration, normalization), as well as learning rate, not to mention
various other structural choices: the optimizer, the representation of the distribution, and the maximum number of iterations, none of which are clear-cut choices, but rather require first being tuned to the data.
While the convex solver does have internal parameters (tolerances and such), these do not need to be tuned to the problem under normal circumstances.

\subsection{Comparing to Belief Propagation, on Clique Trees}
    \label{sec:bn-expt-details}
 Since PDGs generalize other graphical models, one might wonder how our method stacks up against algorithms tailored to the more traditional models. In brief: our algorithm is much slower, and can only handle much smaller networks. 
 Concretely, our methods can handle all of the ``small'' networks, and some of the ``medium'' ones, from the \href{https://www.bnlearn.com/bnrepository/}{\texttt{bnlearn}} repository. 
  In these cases, we have verified that the two methods yield the same results.
  \Cref{fig:bn-gap-v-time} contains the analogue of \cref{fig:joint-gap-time,fig:clus-gap-vs-time}
  for the Bayesian Nets. This graph looks qualitatively quite similar to the other graphs we've seen, suggesting that the results in our synthetic experiments hold more broadly for small real-world models as well.
  

  

 \begin{figure}
     \includegraphics[height=0.45\textheight]{figs/1}
     \caption{
         A variant of \cref{fig:resources}, 
         with gap (accuracy) information on the left, and slightly different parameter settings. 
     }\label{fig:gap-resource-fine-old}
 \end{figure}


\discard{


%joe5*: this section should be cut.  At best, you can add a few
%sentences in the conlusion that discusses these results.
\section{OTHER APPROACHES TO PDG INFERENCE} \label{sec:other-inference}
% \section{EXTENSIONS}

\subsection{}
A stronger result than \cref{prop:markov-property} holds as well.
\begin{prop}\label{prop:same-set-dists}
    % For all $\gamma > 0$ and in the limit as $\gamma \to 0$,
    Let $\Ed$ be a set of (hyper)edges over $\X$.
    For every PDG $\dg M$ over $\X$ with edges $\Ed$, every $\gamma > 0$, and every optimum $\mu^* \in \bbr{\dg M}_\gamma^*$ of $\dg M$'s scoring function at $\gamma$,
    there is a factor graph $\Phi$ with factors along $\Ed$ such that $\Pr_\Phi = \mu^*$.
\end{prop}

In other words: every distribution that a PDG can pick out as optimal (for any choice of $\gamma > 0$ and also in the limit as $\gamma \to 0$), can also be described as a factor graph with the same structure as that PDG.
How do we square this with \citeauthor{pdg-aaai}'s claim that PDGs are more general than factor graphs?

% This may be surprising, given how \citeauthor{pdg-aaai} position their model as strictly more expressive than other graphical models, because it implies that the optimal distribution
\TODO[TODO: answer this question.\\
    The short answer: PDGs still compose differently, and in a way that respects the meaning of the probabilities. And just because you can find a factor graph that would have given you the right distribution after the fact, doesn't mean you could have specified the component factors.]
% The answer is simply that


\TODO[Also: don't get lost; figure out how to continue as below:]
\cref{prop:same-set-dists} suggests another approach to avoiding an exponential representation of $\mu$: given a PDG, fit a factor graph that has the same structure to it.

\subsection{Approximate Inference}
% \subsubsection{Relaxing the Marginal Polytope}
\textbf{Relaxing the marginal polytope.}
Just as it is possible to do belief propagation on cluster graphs that are not trees (e.g., loopy belief propagation)
so too is it possible to drop the requirement that the cluster that we use is indeed a tree decomposition.
This program is smaller, and will converge, but it will only be an approximate solution.
Like the original PDG itself, it might be inconsistent.

\subsubsection{Variational Approaches}

% Because of the deep connection between variational approaches
% shown in \parencite{one-true-loss}, there's





}



% \subsubsection*{Acknowledgements}
% Halpern and Richardson were supported in part by MURI grant
% W911NF-19-1-0217 and ARO grant W911NF-17-1-0592.
% De Sa was supported by NSF RI-CAREER award 2046760.


\ifbiblatex
    \subsubsection*{References}
    \printbibliography
\else
    \bibliography{refs}\fi



\end{document}
