\documentclass[accepted]{uai2023} %


% ---- Build switches --------------------------------------------------------
% \ifbiblatex : true  -> biblatex/biber citation setup (see the \ifbiblatex block below);
%               false -> natbib with \parencite/\textcite aliased to \citep/\citet.
\newif\ifbiblatex
    \biblatexfalse % natbib is the active bibliography backend
    
% \ifvfull : when true, "full version" material (\vfull{...}, vfullenv, and
%            \ifvfull ... \else ... \fi switches in the body) is typeset.
\newif\ifvfull
    \vfulltrue % full-version content is currently included
    
    % \ifvfullred : when true, \vfull / vfullenv content is coloured red!75!black.
    \newif\ifvfullred %
    \vfullredtrue

\usepackage[american]{babel}

% natbib branch: only executed when \ifbiblatex is false.
\ifbiblatex\else
    \usepackage{natbib} % provides \citep / \citet / \citeauthor
    \bibliographystyle{plainnat}
    % Typeset the bibliography heading as an unnumbered subsubsection.
    \renewcommand{\bibsection}{\subsubsection*{References}}
    \fi
\usepackage{booktabs} %
\usepackage{algorithm}
\usepackage{algorithmic}

        
% Full-version-only content.
%   \vfull{<text>}                        inline form
%   \begin{vfullenv} ... \end{vfullenv}   block form
% Both typeset their content only when \ifvfull is true, and tint it
% red!75!black when \ifvfullred is also true.
\newcommand\vfull[1]{{\ifvfullred\color{red!75!black}\fi\ifvfull#1\fi}}
% The environment body is grabbed as an argument (+b) rather than bracketed by
% an \ifvfull in the begin code and a \fi in the end code.  With the bracketed
% form, \vfullfalse makes TeX skip tokens without expanding \end{vfullenv}, so
% the closing \fi is never found and the skip runs on to the next unrelated \fi
% (or to the end of the file).
% Consequence of grabbing the body: verbatim material cannot appear inside
% vfullenv.
% \NewDocumentEnvironment is in the LaTeX kernel from 2020-10-01; loading
% xparse keeps this working on older kernels and is a no-op on newer ones.
\usepackage{xparse}
\NewDocumentEnvironment{vfullenv}{+b}%
    {\ifvfull\begingroup\ifvfullred\color{red!75!black}\fi#1\endgroup\fi}%
    {}

\usepackage{microtype}
\relax % ---- margin links to proofs ----
    % \ifmarginprooflinks : when true, the `linked' environment puts a
    % "link to proof" note in the margin; currently switched off.
    \newif\ifmarginprooflinks
    	\marginprooflinksfalse

\relax %
    \usepackage[dvipsnames]{xcolor}
    
    \usepackage{mathtools}
    \usepackage{amssymb}
		\DeclareMathSymbol{\shortminus}{\mathbin}{AMSa}{"39}
    
    \usepackage{bbm}
    \usepackage{faktor}
    \usepackage{booktabs}
    \usepackage{graphicx}
    \usepackage{scalerel}
    \usepackage{enumitem}
    \usepackage{wrapfig}
    \usepackage{nicefrac}\let\nf\nicefrac


    \usepackage{hyperref} %
        \hypersetup{colorlinks=true, linkcolor=blue!75!black, urlcolor=magenta, citecolor=green!50!black}
    

\relax %
    % Keep the original \H under the name \Horig, then free \H so that
    % \DeclareMathOperator can define it.
    \let\Horig\H \let\H\relax %
	\DeclareMathOperator{\H}{\mathrm{H}} % upright H (used for entropy in the text)
	\DeclareMathOperator{\I}{\mathrm{I}} % upright I
	\DeclareMathOperator*{\Ex}{\mathbb{E}} % expectation; starred form => limits go underneath in display style
	\DeclareMathOperator*{\EX}{\scalebox{1.5}{$\mathbb{E}$}} % same symbol, scaled up by 1.5
    
	\DeclareMathOperator{\supp}{\mathrm{supp}} % support
	\DeclareMathOperator*{\argmin}{arg\,min} % arg min with limits underneath
    
    % \infdivx{a}{b} -> ( a || b ), with the middle double bar sized like the parentheses.
    \DeclarePairedDelimiterX{\infdivx}[2]{(}{)}{%
        #1\;\delimsize\|\;#2} %
	\newcommand{\thickD}{I\mkern-8muD}  % "thick D": an I overlapped with a D via a -8mu kern
	\newcommand{\kldiv}{\thickD\infdivx} % \kldiv{a}{b}: thick D followed by ( a || b )
	\newcommand{\tto}{\rightarrow\mathrel{\mspace{-15mu}}\rightarrow} % two right arrows overlapped by 15mu
    
    % \Rext: overlined blackboard R (defined in the text as R together with infinity).
    \newcommand{\Rext}{\mskip1mu\overline{\mskip-1mu\mathbb R\!}\,}
    % NOTE: the next two are delimited \def's -- the macros are \Rextge and \Rge,
    % and each MUST be followed by a literal 0 (written \Rextge0 and \Rge0).
    \def\Rextge0{\Rext_{\scriptscriptstyle \ge 0}} % \Rext with subscript ">= 0"
    \def\Rge0{\mathbb{R}_{\scriptscriptstyle \ge 0}} % blackboard R with subscript ">= 0"
    \newcommand{\mat}[1]{\mathbf{#1}} % bold upright; used for vectors and sets of variables

    % mathtools paired delimiters: \norm{x} -> ||x||, \pqty{x} -> (x);
    % the starred forms auto-size, and an optional [\big]-style argument sets the size.
    \DeclarePairedDelimiter{\norm}{\Vert}{\Vert}
    \DeclarePairedDelimiter{\pqty}{\lparen}{\rparen}

	% \subalign{a & b \\ c & d}: a vertically centred, two-column block set in
	% \scriptstyle (first column right-aligned, second left-aligned).
	% Line spacing is computed from \fontdimen parameters of the script fonts.
	% NOTE(review): relies on amsmath internals (\Let@, \restore@math@cr,
	% \default@tag) and looks adapted from amsmath's \subarray -- confirm before editing.
	\makeatletter
	\newcommand{\subalign}[1]{%
	  \vcenter{%
	    \Let@ \restore@math@cr \default@tag
	    \baselineskip\fontdimen10 \scriptfont\tw@
	    \advance\baselineskip\fontdimen12 \scriptfont\tw@
	    \lineskip\thr@@\fontdimen8 \scriptfont\thr@@
	    \lineskiplimit\lineskip
	    \ialign{\hfil$\m@th\scriptstyle##$&$\m@th\scriptstyle{}##$\hfil\crcr
	      #1\crcr
	    }%
	  }%
    	}
	\makeatother
	% \numberthis: advance the equation counter by one and tag the current line with it.
	\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}


\usepackage{tikz}
	\usetikzlibrary{positioning,fit,calc, decorations, arrows, shapes, shapes.geometric}
	\usetikzlibrary{cd}

	% AmpRep: use \& instead of & as the cell separator inside TikZ matrices.
	\tikzset{AmpRep/.style={ampersand replacement=\&}}
	% center base: put the picture's baseline 0.8ex below the centre of its bounding box.
	\tikzset{center base/.style={baseline={([yshift=-.8ex]current bounding box.center)}}}
	% paperfig: center base plus a uniform 0.9 scale that is also applied to nodes.
	\tikzset{paperfig/.style={center base,scale=0.9, every node/.style={transform shape}}}

	% Padded, rounded, lightly filled node boxes in three sizes (large / compact / inline).
	\tikzset{dpadded/.style={rounded corners=2, inner sep=0.7em, draw, outer sep=0.3em, fill={black!50}, fill opacity=0.08, text opacity=1}}
	\tikzset{dpad0/.style={outer sep=0.05em, inner sep=0.3em, draw=gray!75, rounded corners=4, fill=black!08, fill opacity=1, align=center}}
	\tikzset{dpadinline/.style={outer sep=0.05em, inner sep=2.5pt, rounded corners=2.5pt, draw=gray!75, fill=black!08, fill opacity=1, align=center, font=\small}}

	% dpad={<extra keys>}: apply dpadded (plus the extra keys) to every node of every matrix.
 	\tikzset{dpad/.style args={#1}{every matrix/.append style={nodes={dpadded, #1}}}}
	\tikzset{light pad/.style={outer sep=0.2em, inner sep=0.5em, draw=gray!50}}

	% Thick arrows; the numeric suffix is the amount (in pt) shortened at both ends
	% (plain `arr' shortens by 3pt).
	\tikzset{arr/.style={draw, ->, thick, shorten <=3pt, shorten >=3pt}}
	\tikzset{arr0/.style={draw, ->, thick, shorten <=0pt, shorten >=0pt}}
	\tikzset{arr1/.style={draw, ->, thick, shorten <=1pt, shorten >=1pt}}
	\tikzset{arr2/.style={draw, ->, thick, shorten <=2pt, shorten >=2pt}}

	% Hyperarc drawing helpers (for use inside a tikzpicture).
	% In all four, the optional #1 is a list of extra TikZ options appended after `arr'.
	%
	% \cmergearr[opts]{A}{B}{T}{C}: two-source arc. Draws A -- C -- B with no arrow
	% tip, then an arrow from the junction C to T.
	\newcommand\cmergearr[5][]{
		\draw[arr, #1, -] (#2) -- (#5) -- (#3);
		\draw[arr, #1, shorten <=0] (#5) -- (#4);
		}
	% \mergearr[opts]{A}{B}{T}: as \cmergearr, with the junction placed at the
	% barycentre of A, B, T (weights 1, 1, 1.2) and named center-<A><B><T>.
	\newcommand\mergearr[4][]{
		\coordinate (center-#2#3#4) at (barycentric cs:#2=1,#3=1,#4=1.2);
		\cmergearr[#1]{#2}{#3}{#4}{center-#2#3#4}
		}
	% \cunmergearr[opts]{S}{A}{B}{C}: two-target arc. Draws S -- C with no tip,
	% then arrows from the junction C to A and to B.
	\newcommand\cunmergearr[5][]{
		\draw[arr, #1, -, shorten >=0] (#2) -- (#5);
		\draw[arr, #1, shorten <=0] (#5) -- (#3);
		\draw[arr, #1, shorten <=0] (#5) -- (#4);
		}
	% \unmergearr[opts]{S}{A}{B}: as \cunmergearr, with the junction at the
	% barycentre of S, A, B (weights 1.2, 1, 1).
	\newcommand\unmergearr[4][]{
		\coordinate (center-#2#3#4) at (barycentric cs:#2=1.2,#3=1,#4=1);
		\cunmergearr[#1]{#2}{#3}{#4}{center-#2#3#4}
		}
		
	% \lab{n} expands to "(n)(lab-n)": a node together with its label node,
	% e.g. for use in a `fit' list.
	\newcommand\lab[1]{(#1)(lab-#1)}
	% alternative={name|text|position}: a small filled dot named <name>, with a
	% \scriptsize label <text> at <position>; the label node is named lab-<name>.
	\tikzset{alternative/.style args={#1|#2|#3}{name=#1, circle, fill, inner sep=1pt,label={[name={lab-#1},gray!30!black, inner sep=1pt]#3:\scriptsize #2}} }
	% tpt={name|text}: `alternative' with the label placed below.
	\tikzset{tpt/.style args={#1|#2}{alternative={#1|#2|below}} }
	% Dom={<label>[<label opts>] (<name>) around <nodes>}: a dpadded box named <name>
	% fitted around <nodes>, with <label> at angle 150 (label node named lab-<name>).
	\tikzset{Dom/.style args={#1[#2] (#3) around #4}{dpadded, name=#3, label={[name={lab-#3},align=center,label distance=-1.9em, shading=axis, top color=white, bottom color=black!04,inner sep=0pt, #2]150:#1}, fit={ #4 }, inner sep=0.5em}}


\relax %
    % \nhphantom{<delim>}{<content>}: "negative horizontal phantom".
    % Measures, in box 0, a kern of -2\nulldelimiterspace followed by <delim>
    % sized (via \delimsize) to the height of <content>, then inserts a horizontal
    % skip of -0.97 times that width.  \delimsize is only defined inside a
    % mathtools paired delimiter, so this is meant to be used from one.
    \newcommand{\nhphantom}[2]{\sbox0{\kern-2%
    \nulldelimiterspace$\left.\delimsize#1\vphantom{#2}\right.$}\hspace{-.97\wd0}}
    \makeatletter
    \newsavebox{\abcmycontentbox}
    % \DeclareDoubleDelim{\cmd}{L1}{L2}{R1}{R2}: define \cmd{<content>} as a
    % doubled delimiter pair.  L1 and R2 are the outer delimiters handled by
    % \DeclarePairedDelimiterXPP; L2 and R1 are set inside them with \delimsize,
    % and \nhphantom pulls the inner and outer delimiters towards each other.
    % The content is typeset once into \abcmycontentbox (pre-code) and reused.
    \newcommand\DeclareDoubleDelim[5]{
    \DeclarePairedDelimiterXPP{#1}[1]%
        {%
            \sbox{\abcmycontentbox}{\ensuremath{##1}}%
        }{#2}{#5}{}%
        {%
            \nhphantom{#3}{\usebox\abcmycontentbox}%
            \hspace{1.2pt} \delimsize#3%
            \mathopen{}\usebox{\abcmycontentbox}\mathclose{}%
            \delimsize#4\hspace{1.2pt}%
            \nhphantom{#4}{\usebox\abcmycontentbox}%
        }%
    }
    \makeatother

\relax %
	% \ssub{x}: a doubly nested (hence smaller) subscript x with negative thin
	% spaces on both sides.
	\newcommand{\ssub}[1]{_{\!_{#1}\!}}
	
    % \pdgunit: two italic 1's overlapped with a 2.3mu offset (a "double-struck" 1).
    \newcommand{\pdgunit}{\mathrlap{\mathit 1} \mspace{2.3mu}\mathit 1}
	
	% Calligraphic shorthands for the sets used throughout the paper.
	\newcommand*\X{\mathcal{X}}
	\newcommand*\V{\mathcal{V}}
	\newcommand*\N{\mathcal{N}}
	\newcommand*\Ed{\mathcal{E}}
	\newcommand*\Ar{\mathcal{A}}
	% Hatted variant of \Ar.
	\newcommand*\ArST{\hat{\Ar}}

	% Bold Greek letters.
	\newcommand*\balpha{\boldsymbol{\alpha}}
	\newcommand*\bbeta{\boldsymbol{\beta}}

	% \bp[a], \bP[a]: bold p / bold P with small subscript a (default subscript: L).
	\newcommand{\bp}[1][L]{\mat{p}\ssub{#1}}
	\newcommand{\bP}[1][L]{\mat{P}\ssub{#1}}
	% Delimited \def: \p must be followed by `_' and one argument (\p_a or \p_{ab});
	% typesets blackboard P with a tightened subscript.
	\def\p_#1{\mathbb P_{\!#1\mskip-2mu}}	

	% \ifsuba: when true, \Src{a}/\Tgt{a} carry the subscript a; when false they
	% print a bare S / T.
	\newif\ifsuba \subatrue
	\let\psimp\p  % \psimp is an alias of \p as defined at this point
	\newcommand\Src[1]{\ifsuba S\mskip-2mu\vphantom{|}_{{#1}} \else S \fi}
	\newcommand\Tgt[1]{\ifsuba T\mskip-3mu\vphantom{|}_{{#1}} \else T \fi}

    

	% Math alphabets from the dutchcal font: medium (\mathdcal) and bold (\mathbdcal).
	\DeclareMathAlphabet{\mathdcal}{U}{dutchcal}{m}{n}
	\DeclareMathAlphabet{\mathbdcal}{U}{dutchcal}{b}{n}
	% \dg{M}: bold dutchcal letter (used in the text for a PDG, e.g. \dg M).
	\newcommand{\dg}[1]{\mathbdcal{#1}}
	\newcommand{\PDGof}[1]{{\dg M}_{#1}}
	\newcommand{\UPDGof}[1]{{\dg N}_{#1}}
	% Multi-letter italic identifiers with hand-tuned kerning between letters.
	\newcommand\VFE{\mathit{V\mkern-4mu F\mkern-4.5mu E}}

	\newcommand\Inc{\mathit{Inc}}
	\newcommand{\IDef}[1]{\mathit{IDef}_{\!#1}}
	\newcommand\OInc{\mathit{O\mskip-2.5muI\mskip-3.5mun\mskip-1.7muc}} %
	\newcommand{\SInc}{\mathit{S\mskip-2.5muI\mskip-3.5mun\mskip-1.7muc}} %
	% \ed{a}{S}{T}: typesets "S -> T" with the arc name a in \scriptscriptstyle
	% above the arrow.  Note the argument order: label first, then source, then target.
	\newcommand{\ed}[3]{#2%
	 {\overset{\smash{\mskip-5mu\raisebox{-1pt}{$\scriptscriptstyle
	        #1$}}}{\rightarrow}} #3}
	\newcommand{\bundle}{\mathbin{+}}
	% Doubled delimiters built with \DeclareDoubleDelim:
	%   \SD{x}  -> {{ x }}      \bbr{x} -> [[ x ]]
	\DeclareDoubleDelim
		\SD\{\{\}\}
	\DeclareDoubleDelim
		\bbr[[]]
	% \aar{x}   -> << x >>  (double angle brackets, via \aar@plain)
	% \aar*{x}  -> \aar@plain*{x}  (mathtools starred form)
	% \aar**{x} -> \aar@resize{x}: \Biggl/\Biggr double angle brackets scaled
	%              around x by \scaleleftright with a 3.8ex optional argument.
	\makeatletter
	\newsavebox{\aar@content}
	\newcommand\aar{\@ifstar\aar@one@star\aar@plain}
	\newcommand\aar@one@star{\@ifstar\aar@resize{\aar@plain*}}
	\newcommand\aar@resize[1]{\sbox{\aar@content}{#1}\scaleleftright[3.8ex]
		{\Biggl\langle\!\!\!\!\Biggl\langle}{\usebox{\aar@content}}
		{\Biggr\rangle\!\!\!\!\Biggr\rangle}}
	\DeclareDoubleDelim
		\aar@plain\langle\langle\rangle\rangle
	\makeatother


\relax %
    \newcommand\bmu{\boldsymbol\mu} % bold mu (used in the text for a clique tree)
    \newcommand\C{\mathdcal C} % dutchcal C (used in the text for the set of clusters)
    % Sans-serif keywords for optimization problems, scaled to 0.98.
    % \minimize / \maximize are math operators that take limits underneath;
    % \subjto is plain text with no operator spacing.
    \newcommand\minimize{\mathop{\scalebox{0.98}{$\mathsf{minimize}$}}\limits}
    \newcommand\maximize{\mathop{\scalebox{0.98}{$\mathsf{maximize}$}}\limits}
    \newcommand\subjto{{\scalebox{0.98}{$\mathsf{subject~to}$}}}
    
    
    % \CI: doubled \bot, set as a binary operator.
    \newcommand\CI{\mathbin{\bot\!\!\!\bot}}

\usepackage{amsthm,thmtools} %
	\usepackage[noabbrev,nameinlink,capitalize]{cleveref}
    % Theorem-like environments.
    % prop, conj and lemma share the `theorem' counter; coro and iclaim are
    % numbered within theorem; claim and remark have their own counters.
    \theoremstyle{plain}
    \newtheorem{theorem}{Theorem}
	\newtheorem{coro}{Corollary}[theorem]
    \newtheorem{prop}[theorem]{Proposition}
    \newtheorem{conj}[theorem]{Conjecture}
    \newtheorem{claim}{Claim}
	\declaretheorem[numberwithin=theorem,name=Claim]{iclaim}
    \newtheorem{remark}{Remark}
    \newtheorem{lemma}[theorem]{Lemma}
    % Definition-style environments, each closed by an end mark (square / triangle).
    \theoremstyle{definition}
    \declaretheorem[name=Definition, qed=$\square$]{defn}
    \declaretheorem[name=Example, qed=$\triangle$]{example}

	% cleveref names for environments whose names differ from their printed titles.
	\crefname{defn}{Definition}{Definitions}
	\crefname{prop}{Proposition}{Propositions}
    % NOTE(review): no `issue' environment or counter is defined in the visible
    % preamble -- confirm this name is still needed.
    \crefname{issue}{Issue}{Issues}

\relax %
	\usepackage{xpatch}
	\makeatletter
	% Patch thmtools' \thmt@restatable so that the wrapped environment's begin
	% code (first patch) and end code (second patch) are executed only when
	% \ifthmt@thisistheone is true.
	% NOTE(review): presumably this drops the theorem heading on restatement so
	% that \recall can print its own -- confirm against the thmtools source.
	% A failed patch is reported on the terminal / in the log via \typeout.
	\xpatchcmd{\thmt@restatable}%
	   {\csname #2\@xa\endcsname\ifx\@nx#1\@nx\else[{#1}]\fi}%
	   % replacement: the same begin code, guarded by \ifthmt@thisistheone
	   {\ifthmt@thisistheone\csname #2\@xa\endcsname\ifx\@nx#1\@nx\else[{#1}]\fi
	   \else\fi}
	   {}{\typeout{FIRST PATCH TO THM RESTATE FAILED}} %
	\xpatchcmd{\thmt@restatable}%
	   {\csname end#2\endcsname}
	   {\ifthmt@thisistheone\csname end#2\endcsname\else\fi}
	   {}{\typeout{FAILED SECOND THMT RESTATE PATCH}}

	% \recall{<name>}: restate a result previously stored by thmtools' restatable
	% under the macro name <name> (the `linked' environment stores results as
	% <type>:<label>).  Prints a bold cross-reference to the stored label
	% thmt@@<name>, followed by the emphasised statement obtained from the
	% starred form \<name>*.
	% \normalfont\bfseries replaces the obsolete two-letter command \bf, which is
	% only available when the document class provides it.
	% Must be defined while @ is a letter (the label contains thmt@@).
	\newcommand{\recall}[1]{\medskip\par\noindent{\normalfont\bfseries \Cref{thmt@@#1}.} \begingroup\em
	   \csname#1\endcsname* \endgroup\par\smallskip}

   	\setlength\marginparwidth{1.55cm}
	% \begin{linked}[<options>]{<type>}{<label>} ... \end{linked}
	% A restatable theorem of environment <type>, stored under the macro name
	% <type>:<label> and labelled <type>:<label>.  <options> is passed through to
	% restatable.  \linkedtype and \linkedproof record <type> and <label>.
	% When \ifmarginprooflinks is true, a bracketed "link to proof" note is placed
	% in the margin, hyperlinked to the label proof:<label>.
	\newenvironment{linked}[3][]{%
		\def\linkedproof{#3}%
		\def\linkedtype{#2}%
		\ifmarginprooflinks
		\marginpar{%
			\vspace{1.5em}
			\centering%
			\hyperref[proof:\linkedproof]{%
			\color{blue!30!white}%
			\scaleleftright{$\Big[$}{\,\mbox{\footnotesize\centering\tt\begin{tabular}{@{}c@{}}
				link to\\[-0.15em]
				proof
			\end{tabular}}\,}{$\Big]$}}~
			}%
		\fi
        \restatable[#1]{#2}{#2:#3}\label{#2:#3}%
        }%
		{\endrestatable%
		}
	\makeatother
		% lproof: a proof environment that also steps the counter proofcntr via
		% \refstepcounter, so a \label placed inside it refers to that counter.
		\newcounter{proofcntr}
		\newenvironment{lproof}{\begin{proof}\refstepcounter{proofcntr}}{\end{proof}}

        \usepackage{cancel} %
        % \Cancel[<colour>]{x}: strike through x with a line in <colour>
        % (default red); x itself is set in black.
        \newcommand{\Cancel}[2][red]{{\color{#1}\cancel{\color{black}#2}}}


		\usepackage{tcolorbox}
		\tcbuselibrary{most}
        % Colours for the proof box: proofleft = left border, proofmatt = background.
        \colorlet{proofleft}{blue!20!black!40!white}
        \colorlet{proofmatt}{blue!20!black!05!white}
		% Wrap every lproof in a breakable, frameless tcolorbox with a 4pt bar on
		% the left edge and the proofmatt background.
		\tcolorboxenvironment{lproof}{
			enhanced,
			parbox=false,
			boxrule=0pt,
			frame hidden,
			borderline west={4pt}{0pt}{proofleft},
			colback={proofmatt},
			sharp corners,
			breakable,
		}
        % proofmatDom: same key syntax as Dom, but the label shading starts from
        % proofmatt instead of white (for pictures drawn inside an lproof box).
        \tikzset{proofmatDom/.style args={#1[#2] (#3) around #4}{dpadded, name=#3, label={[name={lab-#3},align=center,label distance=-1.9em, shading=axis, top color=proofmatt, bottom color=black!04!proofmatt,inner sep=0pt, #2]150:#1}, fit={ #4 }, inner sep=0.5em}}
	% \begthm[<title>]{<env>}{<label>}: opens <env> with thmtools keys
	% name=<title>, restate=<label>, label=<label>.  The environment must be
	% closed explicitly with \end{<env>}.
	\newcommand{\begthm}[3][]{\begin{#2}[{name=#1},restate=#3,label=#3]}

\relax %
    % \TODO[<text>]: a red placeholder in angle brackets, set in typewriter type
    % (default text: INCOMPLETE).  Ends the current paragraph.
    \newcommand{\TODO}[1][INCOMPLETE]{{\color{red}\raggedright\hangindent=0.5cm\rightskip=0.4cm$\smash{\Big\langle}$~\texttt{#1}~\raisebox{-0.3ex}{${\Big\rangle}$}\hspace{-1.5cm}\par}}
    
    % \daggerfootnote{<text>}: a footnote marked with footnote symbol number 2
    % (via \fnsymbol) instead of a number.  The numeric footnote counter is
    % saved before and restored after, and \thefootnote is reset to arabic.
    \newcounter{daggerfootnote}
    \newcommand*{\daggerfootnote}[1]{%
        \setcounter{daggerfootnote}{\value{footnote}}%
        \renewcommand*{\thefootnote}{\fnsymbol{footnote}}%
        \footnote[2]{#1}%
        \setcounter{footnote}{\value{daggerfootnote}}%
        \renewcommand*{\thefootnote}{\arabic{footnote}}%
        }

\relax %

	\usepackage[american]{babel}
	\usepackage{csquotes}

    % Citation setup.  With \ifbiblatex true: biblatex (biber backend, authoryear
    % style) with citations wrapped in \bibhyperref, plus a bracketed \brakcite.
    % With \ifbiblatex false: \parencite and \textcite are mapped onto natbib's
    % \citep and \citet so the body can use the biblatex command names either way.
    \ifbiblatex
	\usepackage[backend=biber, style=authoryear]{biblatex}
	\DeclareLanguageMapping{american}{american-apa}
	\addbibresource{refs.bib}

	% Field formats that wrap their whole argument in \bibhyperref, after
	% aliasing the inner bibhyperref format to noformat.
	\DeclareFieldFormat{citehyperref}{%
	  \DeclareFieldAlias{bibhyperref}{noformat}%
	  \bibhyperref{#1}}

	% Same for \textcite; additionally closes an open parenthesis (tracked by the
	% cbx:parens flag) inside the link.
	\DeclareFieldFormat{textcitehyperref}{%
	  \DeclareFieldAlias{bibhyperref}{noformat}%
	  \bibhyperref{%
	    #1%
	    \ifbool{cbx:parens}
	      {\bibcloseparen\global\boolfalse{cbx:parens}}
	      {}}}

	% Save the stock macros so the redefinitions below can restore and call them.
	\savebibmacro{cite}
	\savebibmacro{textcite}

	\renewbibmacro*{cite}{%
	  \printtext[citehyperref]{%
	    \restorebibmacro{cite}%
	    \usebibmacro{cite}}}

	% For \textcite, the link wrapper is switched off (aliased to noformat) when a
	% prenote is present on the first citation or a postnote on the last one.
	\renewbibmacro*{textcite}{%
	  \ifboolexpr{
	    ( not test {\iffieldundef{prenote}} and
	      test {\ifnumequal{\value{citecount}}{1}} )
	    or
	    ( not test {\iffieldundef{postnote}} and
	      test {\ifnumequal{\value{citecount}}{\value{citetotal}}} )
	  }
	    {\DeclareFieldAlias{textcitehyperref}{noformat}}
	    {}%
	  \printtext[textcitehyperref]{%
	    \restorebibmacro{textcite}%
	    \usebibmacro{textcite}}}

	% \brakcite: a citation printed in square brackets, hyperlinked as a whole.
	\DeclareCiteCommand{\brakcite}
	  {\usebibmacro{prenote}}
	  {\usebibmacro{citeindex}%
	   \printtext[bibhyperref]{[\usebibmacro{cite}]}}
	  {\multicitedelim}
	  {\usebibmacro{postnote}}
      
     \else
     % natbib fallback: biblatex-style command names mapped to natbib equivalents.
     \gdef\parencite{\citep}
     \gdef\textcite{\citet}
     \makeatletter
     \makeatother
     \fi



% \discard{...}: swallow the argument and typeset nothing (used to keep
% cut material in the source).
\newcommand\discard[1]{}
% \zogamma: a gamma with a hat overlaid on it; the hat is taken from a hatted
% phantom x, lowered by 0.1ex.
\newcommand\zogamma{{\mathrlap{\raisebox{-0.1ex}{$\hat{\phantom{x}}$}}\gamma}}

\title{Inference for Probabilistic Dependency Graphs}

\author[1]{Oliver~E.~Richardson}
\author[1]{Joseph~Y.~Halpern}
\author[1]{Christopher De Sa}
\affil[1]{%
    Department of Computer Science\\
    ~Cornell University\\
    ~Ithaca NY 14853
}
  
\begin{document}
\maketitle

\begin{abstract}
    Probabilistic dependency graphs (PDGs) are a flexible class of probabilistic graphical models,
    subsuming Bayesian Networks and Factor Graphs.
    They can also capture inconsistent beliefs, and provide a way of measuring the degree of this inconsistency.
    We present the first tractable inference algorithm for
    PDGs with discrete variables,
    making the asymptotic complexity of PDG inference similar
    to that of the graphical models they generalize. 
    The key components are:
    (1) the observation that PDG inference can be reduced to convex optimization with exponential cone constraints, 
    (2) a construction that allows us to express these problems compactly for PDGs of bounded treewidth, for which we needed to further develop the theory of PDGs, and
    (3) an appeal to interior point methods that can solve such problems in polynomial time.
    We verify the correctness and time complexity of our approach, 
    and provide an implementation of it.
    We then evaluate our implementation, and demonstrate that 
    it outperforms
    baseline approaches.
    Our code is available at {\small\url{github.com/orichardson/pdg-infer-uai}}.
\end{abstract}


\section{Introduction}


\emph{Probabilistic dependency graphs (PDGs)} \parencite{pdg-aaai}
form a very general class of probabilistic graphical models,
that includes not only
Bayesian Networks (BNs) and
Factor Graphs (FGs),
but also more recent statistical models built out of neural networks,
such as Variational Autoencoders (VAEs) \parencite{kingma2013autoencoding}.
PDGs can also capture inconsistent 
beliefs, and provide a useful way to measure the degree of this inconsistency;
for a VAE, this is the loss function used in training
    \parencite{one-true-loss}.
    PDGs have some
significant advantages over other representations of probabilistic information. 
    Their flexibility allows them to model beliefs that BNs cannot, 
such as information from independent studies of the same variable
(perhaps with different controls, yielding probabilistic observations $p(Y|X)$ and $q(Y|Z)$).
PDGs
can deal gracefully with conflicting information from multiple sources.
Every subcomponent of a PDG has probabilistic
    meaning, independent of the other components;
compared to FGs, this makes PDGs more interpretable.
But up to now, there has been no practical way to do inference for
PDGs---that is,
to answer questions of the form
``what is the probability of $Y$ given $X$?''
This paper presents the first algorithm to do so.



Before discussing our algorithm, 
we must discuss what it even means to do inference for a PDG.  
A BN or FG represents 
a unique joint distribution. 
Thus, for example, when we ask ``what is the probability of $Y$ given that $X{=}x$?''
\def\BNPr{\mu}
in a BN, we mean ``what is $\BNPr(Y | X{=}x)$?'' for the probability 
measure $\BNPr$ that the BN represents.
But a PDG might, in general, represent more than just one distribution.

Like a BN, a PDG encodes
two types of information: ``structural'' 
information about the independence of causal mechanisms,
and ``observational'' information
about conditional probabilities.
Unlike in a BN, the two can conflict in a PDG.
Corresponding to these two types of information,
a PDG has two
loss functions,
which quantify how far a distribution $\mu$ is from
modeling the information of each type.
Given a number $\zogamma
\in [0,1]
$
indicating the importance of structure relative to observation,
we take the \emph{$\zogamma$-semantics} of a PDG to be the
set of distributions that minimize 
the appropriate convex combination of
losses.
We also consider the \emph{$0^+$\!-semantics}: the limiting case that
arises as $\zogamma$ goes to zero
(which focuses
 on observation, using structure only to break ties).
This set
can be shown to contain precisely one distribution
for PDGs satisfying a mild regularity condition 
(required by definition by \citeauthor{pdg-aaai});
we call such PDGs \emph{proper}.
Thus, we have
 a parameterized family of inference notions:
to do $\zogamma$-inference, for $\zogamma \in [0,1] \cup \{0^+\}$,
is to answer queries in a way that is true of all distributions in the $\zogamma$-semantics.

If there are distributions
fully
consistent with
both the observational and the structural information
in a PDG $\dg M$, 
then for $\zogamma \in (0,1) \cup \{0^+\}$, all
notions of $\zogamma$-inference 
coincide.
\discard{
    For PDGs satisfying a mild condition 
    (required by definition in \citeauthor{pdg-aaai})
    which for now we call \emph{proper},
    selecting a small enough positive value of $\zogamma$
    suffices to ensure that
    the $\zogamma$-semantics consists of only a single distribution.
    The $0^+$ semantics of a proper PDG is also a unique distribution,
    and one that does not depend on the choice of a small positive number. }%
\def\PrM{\mu_{\dg M}}%
If $\dg M$ is also proper,
    this means there is
    a single distribution $\PrM$
    that minimizes both loss functions, 
    in which case we want to answer queries with respect to $\PrM$
    no matter how we weight observational and structural information. 
Moreover, if $\dg M$ represents a BN,
then $\PrM$ is the distribution represented by the BN.  
However, if there is no distribution that is consistent with both types of information, then the choice of $\zogamma$ matters.  

Since PDGs subsume BNs, and inference for BNs is already NP-hard, the same must be true of PDGs.
At a high level, the best we could hope for would be tractability on the restricted
class of models on which inference has traditionally been tractable---that is, a polynomial algorithm for models whose
underlying structure has \emph{bounded treewidth} (see
\Cref{sec:tw} for formal definitions).
That is indeed what we have.  
More precisely, we show that
$0^+$\!-inference
and $\zogamma$-inference for small $\zogamma$ 
can be done 
for discrete PDGs of bounded treewidth containing $N$ variables in
$\tilde O(N^4)$ 
time.
 
Our algorithm
is based on a line of recent work in 
convex programming
that establishes
polynomial-time algorithms
for a class of optimization problems called \emph{exponential conic programs}
\parencite{badenbroek2021algorithm,skajaa2015homogeneous,nesterov1996infeasible}.
Our contribution is to show that the problem of inference in a PDG
of bounded treewidth
can be efficiently converted to a (sequence of) exponential conic program(s), at which point it can be solved with a commercial solver
(e.g., \textcite{mosek}) in polynomial time. 
The direct appeal to a solver allows us
to benefit from the speed and reliability of such highly optimized solvers, and also from future improvements in exponential conic optimization.
Thus, our result is not only a theoretical one, but practical as well.



Beyond its role as a probabilistic model,
a PDG is also of interest for its degree of inconsistency---%
that is, the minimum value of its loss function. 
As shown by 
\textcite{one-true-loss},
many loss functions and statistical divergences
can be viewed as measuring 
the inconsistency
of a PDG that models the context appropriately.
This makes calculating this minimum value of interest%
---but up to now, there has been no way to do so.
There is a deep connection between this problem and PDG inference;
for now, we remark that this number is a byproduct of our techniques.


\textbf{Contributions.}
We provide the first algorithm for inference in a PDG;
in addition, it calculates a PDG's degree of inconsistency. 
We prove that 
our algorithm
is correct, and also
fixed-parameter tractable: for PDGs of bounded treewidth,
it runs in polynomial time.
We also prove that PDG inference and inconsistency 
    calculation are equivalent problems.
Our algorithm reduces inference in PDGs to exponential conic programming
in a way that can be offloaded to powerful existing solvers.
We provide an implementation of this reduction in a
standard convex optimization framework, giving users an
interface between such solvers and the standard PDG Python library.
Finally, we evaluate our implementation. The
    results suggest our method is faster and 
    significantly more reliable than simple baseline approaches.

\section{Preliminaries \& Related Work}

\textbf{Vector Notation.}
For us, a vector is a map from a finite set $S$, called its 
\emph{shape},
to the extended reals $\Rext := \mathbb R \cup \{\infty\}$.
We write $\mat u := [u_i]_{i \in S}$ to define a vector $\mat u$ by its components.
\discard{\color{gray!30!white}
    We will sometimes use superscripts as well, especially when indices depend on one another. For example, if $\dg S$ is a finite set of finite sets, then
    $[u^S_s]^{S \in \dg S}_{s \in S}$ denotes a vector which has an element
    for each pair $(S,s)$, satisfying $s \in S \in \dg S$.
    By supplying just the upper index of such a vector, as in $\mat u^{S_0}$,
    we mean $[u^{S_0}_s]_{s \in {S_0}}$, the projection of $\mat u$ onto the subspace whose upper index is $S_0$.
}
Vectors of the same shape
can be added (+), partially ordered ($\le$), or multiplied ($\odot$) pointwise as usual.
$\mat 1$ denotes an all-ones vector, of a shape implied by context.
\discard{\vfull{
$\mat u^{\sf T}$ denotes the transpose of $\mat u$, and is used to
express the inner product $\mat u^{\sf T} \mat v$ of vectors $\mat u$ and
$\mat v$ of the same shape.}}

\discard{
    \color{gray!30!white} If $\mat u = [u_a]_{a \in A}$ is a vector over $A$ and $\mat v = [v_b]_{b \in B}$ is a vector over $B$, then $\mat u \mathbin{\otimes} \mat v := [ u_a \cdot v_b ]_{a \in A, b \in B}$ is a vector over $A \times B$. }

    \textbf{Probabilities.}
We write $\Delta S$ to denote the set of probability distributions over a finite set $S$.
Every variable $X$ can take on values from a finite set
$\V\mskip-1.5mu X$
of possible values.
We can regard sets of variables $\mat X$ as variables themselves, with
$\V \mat X = \prod_{X \in \mat X} \V X$.
A conditional probability distribution (cpd) $p(Y|X)$ is a map
\ifvfull
$p : \V\mskip-1.5mu  X \to \Delta \V Y$ assigning to each $x \in \V\mskip-1.5mu X$ a
probability distribution 
$p(Y|x) \in \Delta \V Y$, which is shorthand for $p(Y|X{=}x)$.
\else
$p$ that assigns each $x \in \V\mskip-1.5mu X$ a probability distribution
$p(Y|X{=}x) \in \Delta \V Y$.
\fi
\ifvfull
Given a distribution $\mu$ over (the values of) a set of variables including $X$ and $Y$,
\else
Given a joint distribution $\mu$,
\fi
we write $\mu(X)$ for its marginal 
on $X$,
and $\mu(Y|X)$ for the cpd obtained by 
\ifvfull
conditioning on $X$ and marginalizing to $Y$.
We also refer to $\mu$'s entropy
    $\H(\mu) := \Ex_{\mu} [\log \frac1\mu]$ and 
    conditional entropy $\H_\mu(Y|X) := \Ex_\mu[\log\nicefrac1{\mu(Y|X)}]$
    of $Y$ given $X$.
\else
conditioning on $X$ and marginalizing to $Y$.
We also refer to $\mu$'s entropy $\H(\mu) := \Ex_{\mu} [\log \frac1\mu]$ and conditional entropy $\H_\mu(Y|X) := \Ex_\mu[\log\nicefrac1{\mu(Y|X)}]$.
\fi




\textbf{Hypergraphs and Treewidth.} \label{sec:tw}
A hypergraph 
$
(V, \Ed)$ is a set $V$ of vertices and a
set
 $\Ed$ of \emph{hyperedges}, which correspond to subsets of $V$.
An ordinary graph may be viewed as the special case in which every hyperedge contains exactly two vertices.

\begin{defn}
    A \emph{directed hypergraph}
    $(N, \mathcal A)$ is a set of nodes $N$, and
    a set
    of \emph{(hyper)arcs} $\mathcal A$,
    each $a \in \mathcal A$ of which
    is associated with 
    a set of source nodes $\Src a \subseteq N$,
    and target nodes $\Tgt a \subseteq N$.
    We also write $\ed {a}{S}{T} \in \Ar$ to specify an
    arc $a$ together with its sources $S = \Src a$ and targets $T = \Tgt a$.
\end{defn}

A directed hypergraph 
can be viewed as
    a hypergraph
by joining
each source and target set,
thereby ``forgetting'' the direction of the arrow.
\ifvfull
Thus, notions defined for undirected hypergraphs (like that of
treewidth, which we now review), can be 
    applied to directed hypergraphs as well.
\fi

Many problems that are intractable for general graphs
are tractable for trees, and
some graphs are closer to being trees than others.
A tree decomposition of a (hyper)graph $G = (V, \Ed)$ is a tree $(\C, \mathcal T)$ whose vertices $C \in \C$, called
\emph{clusters}, are subsets of $V$ such that:

\begin{enumerate}[nosep]
    \item every vertex $v \in V$ and every hyperedge $E \in \Ed$ is contained in at least one cluster, and
        \item every cluster $D$ along the unique path from $C_1$ to $C_2$ in $\mathcal T$
         contains $C_1 \cap C_2$.
\end{enumerate}

The \emph{width} of a tree decomposition is one less than the size of its largest cluster,
and the \emph{treewidth} of a (hyper)graph $G$ is the smallest possible width of any tree decomposition of $G$.
It is NP-hard to determine the treewidth of a graph, but
if the treewidth is known to be bounded above, a tree decomposition may be constructed in linear time \parencite{bodlaender1993linear}.
For graphs of bounded treewidth, many problems 
(indeed, any problem expressible in a certain second-order logic \parencite{courcelle1990})
can be solved in
linear time.
This is also true of inference in 
standard graphical models.


\textbf{Graphical Models and Inference.}
A \emph{graphical model structure}
is a (directed) (hyper)graph whose vertices $\X$ are variables, and whose (hyper)edges 
somehow
indicate local influences between variables.
A \emph{probabilistic graphical model},
or simply  ``graphical model'',
is a
graphical model structure
together with quantitative information about these local influences.
Semantically,
a graphical model $\cal M$
typically
represents a joint probability distribution $\Pr_{\!\cal M}
 \in \Delta \V\!\X$ over its variables.
\discard{
    Although there is often more to the story,
    it can typically be
    expressed as a product
    $\Pr_{\!\cal M}(\X) \propto \prod_{E \in \Ed} \phi_{E}(E)$
    of factors $\boldsymbol\phi = 
    \{ \phi_E : \V E \to \mathbb R_{\ge 0} \}_{E \in \Ed}$
    over a hypergraph $(\X, \Ed)$ closely related to the structure of $\cal M$.
    For this reason, some authors use the term ``graphical model'' to refer to a tuple $(\X ,\Ed, \boldsymbol\phi)$,
    i.e., a factor graph.
    PDGs, however, do not represent probabilities this way.}
Inference for $\cal M$ is then the ability to calculate cpds $\Pr_{\!\cal M}(Y|X{=}x)$,
where $X,Y \subset \X$ and $x \in \V\! X$. 

\discard{
    To do inference in probabilistic model $\cal M$ is to answer probabilistic queries, of the form
    \textit{``what is the distribution of variables $Y$, given that $X\!=\!x$?''}
    If $\cal M$ represents the joint distribution $\Pr_{\cal M}$, then the
    appropriate answer to this question is $\Pr_{\cal M}(Y \mid X\!=\!x)$.}

Many inference algorithms (such as belief propagation),
when applied to tree-like graphical models,
run in linear time and are provably correct.
If the same algorithms are na{\"i}vely applied to graphs with cycles (as in loopy belief propagation),
then they may not converge, and even if they do,
may give incorrect (or even inconsistent) answers
\parencite{wainwright2008graphical}.
Nearly all exact inference algorithms
(including variable elimination, clique  calibration, and message-passing with and without division),
effectively construct a tree decomposition, and can be
viewed as running on a tree \parencite[\S9-11]{koller2009probabilistic}.
Indeed,
under widely believed assumptions,
every class of graphical models
for which inference is \emph{not} NP-hard
has bounded treewidth
\parencite{chandrasekaran2012complexity}.
Given a tree decomposition $(\C, \mathcal T)$ of the 
underlying model structure,
many of these algorithms
use a 
data structure
known as a \emph{clique tree}, which
is a collection
$\bmu \!=\! \{\mu_C(C)\}_{ C \in \C}$ 
of probabilities over the clusters \parencite[\S10]{koller2009probabilistic}.

A clique tree
$\bmu$ 
is said to be \emph{calibrated} if neighboring clusters' distributions agree on the variables 
they share.
In this case, 
it determines a joint distribution by
\begin{equation}
    \Pr\nolimits_{\bmu}
    (\X)
        = \quad 
        {\prod_{\mathclap{C \in \C}} \mu_C(C)~\,}\Big/
        {~~\prod_{\mathclap{(C{-}D) \mathrlap{\in \cal T}}} \mu_{C}(C \cap D)\,,}
    \label{eq:cliquedist}
\end{equation}
which has the property that $\Pr_{\bmu}(C) = \mu_C$ for 
all
$C \in \C$.
\discard{%
\vfull{A calibrated clique tree $\bmu$ summarizes the answers to 
many queries about $\Pr_{\bmu}$ at once. 
To see why
in a simple case, note that 
for an unconditional query about $Y$ contained within a single cluster $C$, we have
$\Pr_{\bmu}(Y) = \mu_C(Y)$.
\discard{%
    Note also that if $\C = \{ \X \}$ just contains one big cluster, 
    a clique tree is an explicit joint distribution, reducing queries to summation.}%
With some care, the general idea can be extended to arbitrary queries 
    \parencite[see][\S 10.3.3]{koller2009probabilistic};
    those conditional on evidence $X{=}x$ can be handled
    by conditioning the clusters that contain $X$,
    and then recalibrating $\bmu$ with 
    a standard algorithm like belief propagation.
    }
}
A calibrated clique tree summarizes the answers to
queries about $\Pr_{\bmu}$
\parencite[see][\S 10.3.3]{koller2009probabilistic}.
Therefore, to answer probabilistic queries with respect to a distribution $\mu$, it suffices to find a calibrated clique tree $\bmu$ that represents $\mu$, and appeal to standard algorithms.

\textbf{Probabilistic Dependency Graphs.}
Our presentation of PDGs is slightly
different from (but equivalent to) that of
\textcite{pdg-aaai}, which
the reader is encouraged to consult for more details and intuition.
At a high level, a PDG
is
just
a collection of cpds and causal assertions,
    weighted by confidence. More precisely:

\begin{defn}
    A PDG $\dg M \!=\! (\X\mskip-2mu, \Ar,
        \mathbb P, 
        \balpha, \bbeta )
    $
    is     
    a directed hypergraph 
    $(\X\mskip-2mu, \Ar)$ 
    whose nodes 
    are
    variables,
    together with 
    probabilities $\mathbb P$
    and
    confidence vectors
    $\balpha \!=\! [\alpha_a]_{a \in \Ar}$ 
    and $\bbeta \!=\! [\beta_a]_{a \in \Ar}$,
    so that
    each $\ed aST \! \in\! \Ar$ is associated with:
    
    \begin{itemize}[nosep,itemsep=2pt]
    \item
    a conditional probability distribution
    {\subafalse $\p_a(\Tgt a | \Src a)$}
    on the target variables given 
    values of
    the source variables,
    \item a weight 
    $\beta_a \in \smash{\Rext}$ 
    indicating
    the modeler's confidence in 
    the cpd {\subafalse $\p_a(\Tgt a | \Src a)$},
    \discard{(as measured by the number of independent observations that support $\p_a$), }
    and
    \item 
    a weight $\smash{\alpha_a \in \mathbb R}$
    indicating
    the modeler's confidence in the functional dependence of 
    {\subafalse$\Tgt a\mskip-2mu$ on $\Src a\mskip-2mu$}
    expressed by
    $a$.
    \discard{
    (as measured by the expected number of independent causal mechanisms corresponding to $a$,
    that determine $\Tgt a$ given $\Src a$).%
    }
    \end{itemize}
If $\bbeta \ge \mat 0$ and $\alpha_a\! > 0$ implies $\beta_a\! > 0$, we
write $\bbeta \gg \balpha$ and
call $\dg M$ \emph{proper}.
Note that  $\bbeta \gg \balpha$ if $\bbeta > \mat 0$.
\end{defn}

One significant advantage of PDGs is their modularity:
we can combine the information in $\dg M_1$ and $\dg M_2$ 
by taking the union of their variables and the disjoint union of their arcs (and associated data) to get a new PDG, denoted $\dg M_1 + \dg M_2$.


As mentioned in the introduction,
a PDG contains two types of information:
``structural'' information, in the hypergraph $\Ar$ and
weights $\balpha$, and ``observational'' data,
in the cpds  $\mathbb P$ and weights $\bbeta$.
PDG semantics are based on two scoring functions
that quantify discrepancy between 
each type of information and a distribution
$\mu \in \Delta \V \!\X$ over its variables.

The \emph{observational incompatibility} of $\mu$ with $\dg M$, which
can be thought of as a ``distance''  between $\mu$ and the cpds of $\dg M$,
is given by the weighted sum of relative entropies:
\begin{align*}
    \OInc_{\dg M}(\mu) :=
        \sum_{\ed aST \mathrlap{\,\in \Ar}} \subafalse
        \beta_a\, \kldiv[\Big]{\mu(\Tgt a,\Src a)}{\p_a(\Tgt a | \Src a) \mu(\Src a)}.
\end{align*}
Under a standard interpretation of the relative entropy $\kldiv{\mu}{p} \mskip-1.5mu=\mskip-1.5mu \Ex_{\mu}[\log \frac\mu p]$,
$\OInc_{\dg M}$ measures the excess cost of using codes optimized for the cpds of $\dg M$ 
(weighted by their confidences),
when reality is
distributed according to $\mu$.

The second scoring function measures
the extent to which
$\mu$ 
is incompatible with 
a causal picture consisting of independent mechanisms 
along each hyperarc. 
This is captured by the
\emph{structural incompatibility}
(of $\mu$ with $\dg M$), 
and given by
\begin{equation*}
    \SInc_{\dg M}(\mu) := \,
        \pqty[\Big]{\; \sum_{\ed aST \mathrlap{\,\in \Ar}}\subafalse \alpha_a\, \H_\mu(\Tgt a | \Src a) } - \H(\mu).
\end{equation*}
Note that
it
does not depend on the cpds
of $\dg M$, nor the possible values of the 
variables; it
is defined purely in terms of
the weighted hypergraph structure $(\Ar,\balpha)$.


When the observational and structural information conflict, then the distribution(s)
that best represent a PDG will depend on the relative importance of observation and structure.
\discard{ This is captured by a trade-off 
    parameter $\gamma \ge 0$, which 
    can be used to define the scoring function
    $\bbr{\dg M}_\gamma: \Delta \V\!\X \to \Rext$, as follows:}%
This is captured by a trade-off parameter $\zogamma \in [0,1]$,
which we can use to form 
the convex combination
$(1-\zogamma)\OInc + \zogamma \SInc$. 
So as to 
simplify the math
and match the notation in
previous work (\citeyear{pdg-aaai,one-true-loss}),
we instead use a rescaled variant with a different parameterization.
Let
$\gamma := \nicefrac{\zogamma}{(1-\zogamma)} \in [0,\infty]$,
which is almost identical to $\zogamma$ when $\zogamma \approx 0$.
With it,
 define the combined scoring function:
\begin{align*}
    \bbr{\dg M}_\gamma&(\mu) 
        := \OInc_{\dg M}(\mu) + \gamma \, \SInc_{\dg M}(\mu)
            \numberthis\label{eqn:scoring-fn} \\[-0.2ex]
        &\!\!\!= \scalebox{0.85}{$\displaystyle\frac{1}{1-\zogamma}$} \Big(\, (1-\zogamma) \OInc_{\dg M}(\mu) + \zogamma \, \SInc_{\dg M}(\mu)\, \Big) \\[-0.1ex]
        &= \Ex\nolimits_{\mu}\bigg[
            \,
            \sum_{\ed aST \mathrlap{\, \in \Ar}} \subafalse
            \log \frac
            {\mu(\Tgt a| \Src a)^{\beta_a - \gamma \alpha_a}}
            {\p_a(\Tgt a | \Src a)^{\beta_a}}
        \bigg] - \gamma \H(\mu)
        .
\end{align*}
Let $\bbr{\dg M}^*_\gamma := \argmin_\mu \bbr{\dg M}_\gamma(\mu)$ denote
the set of optimal distributions at a particular value $\gamma$.
One natural conception of inference in PDGs is then parameterized by
$\zogamma$:
to do $\zogamma$-inference
in $\dg M$ is to respond to probabilistic queries in a way that is sound with respect to every $\mu \in \bbr{\dg M}^*_\gamma$.
It is not too difficult to see that when $\bbeta \ge \gamma\balpha$, 
 \eqref{eqn:scoring-fn} is strictly convex, which ensures that
 $\bbr{\dg M}^*_\gamma$ is a singleton.
This paper demonstrates that
$\zogamma$-inference
is tractable for such PDGs.
\discard{%
    The former is a notational convenience,
    because for $\gamma \in (0, \infty)$,
    $\gamma$-inference is just $1$-inference for a
    slightly different PDG:
    $\bbr{
        \balpha, \bbeta}^*_\gamma = \bbr{
        \gamma \balpha, \bbeta}^*_1$.
    This paper demonstrates the tractability of
    1-inference for the specific case of PDGs satisfying $\bbeta \ge \balpha$, which is sufficient to ensure strict convexity of \eqref{eqn:scoring-fn}, and hence a unique optimal distribution.}%

The limiting behavior of the $\zogamma$-semantics as $\zogamma \to 0$,
which we denote $\bbr{\dg M}^*_{0^+}$ and call the \emph{$0^+$\!-semantics},
has some special properties.
\discard{Supposing that $\bbeta > \mat 0$,%
    \footnote{or, more generally, that $\bbeta \gg \balpha$,
        (i.e., if $\alpha_a>0$ then $\beta_a>0$), 
        which, for every $\gamma > 0$, is weaker than $\bbeta \ge \gamma \balpha$. }}%
If $\dg M$ is proper, then        
$\bbr{\dg M}^*_{0^+}$ contains precisely one distribution.
This distribution intuitively
reflects an extreme empirical
view: observational data trumps causal structure.
Note that in the absence of a causal picture
($\balpha = \mat0$), 
this
corresponds to the well-established 
practice of selecting
the maximum entropy distribution consistent with some observational constraints \parencite{jaynes1957information}.
One should be careful to distinguish 
$\bbr{\dg M}^*_{0^+}$
from $\bbr{\dg M}^*_0$,
the set of distributions that minimize
$\OInc_{\dg M}$; the latter  set
includes
 $\bbr{\dg M}^*_{0^+}$
\parencite[Prop 3.4]{pdg-aaai},
but may also contain other distributions.
This paper also shows how to efficiently answer queries with respect 
to the unique distribution in $\bbr{\dg M}^*_{0^+}$, which we call
\emph{$0^+$\!-inference}.

Given a PDG $\dg M$, the smallest possible value of its scoring function,
$
    \aar{\dg M}_\gamma := \inf_{\mu}\, \bbr{\dg M}_\gamma(\mu),
$
is known as its $\gamma$-inconsistency
and is interesting in its own right:
$\aar{\,\cdot\,}_\gamma$ 
seems to be a ``universal'' loss
function \parencite{one-true-loss}.










\textbf{Interior-Point Methods and Convex Optimization.}
Interior-point methods provide an iterative way of approximately solving linear programs in polynomial time \parencite{karmarkar1984new}.
With the theory of ``symmetric cones'', these methods were extended in the 1990s to handle second-order cone programs (SOCPs) and semidefinite programs (SDPs), which allow more expressive constraints.
But the constraints that these methods can handle are insufficient for
our purposes. We need what have been called \emph{exponential cone constraints}.
The \emph{exponential cone} is the convex set
\begin{align*}
        \begin{aligned}
        K_{\mskip-1mu\exp} :=
        \big\{ (x_1, x_2, x_3) &:
                x_3 \ge x_2 e^{x_1 / x_2}\!,\, x_2 > 0 \big\}
            \\[-0.6ex]\quad \mathbin{\cup}\,
        \big\{ (x_1, 0, x_3&) : x_1 \le 0,\, x_3 \ge 0 \big\}
        \qquad \subset \mathbb \Rext^3.
    \end{aligned}
\end{align*}
\discard{\voli{It is also sometimes called the ``relative entropy'' cone, because if $\mat m, \mat p \in \Delta^{n-1} \subset \mathbb R^n$ are points on a probability simplex, then $(-\mat u, \mat m, \mat p) \in K_{\exp}^n$ if and only if $\mat u$ is an upper bound on $\mat m \log (\nicefrac{\mat m}{\mat p})$, the pointwise contribution to relative entropy at each outcome.}}
Let $K \mskip-2mu := \mskip-2mu K_{\exp}^p \mskip-2mu \times \mskip-1mu
[0, \infty]^q 
\subset \smash\Rext^n$ be a product of $p$ exponential cones and $q = n - 3p$ non-negative orthants.
An \emph{exponential conic program} is then an optimization problem of the form
\begin{equation}
    \minimize
        _{\mat x}
        ~~ \mat c^{\sf T} \mat x
    \quad\subjto
    ~~ \mat A \mat x = \mat b,~~\mat x \in 
        K,
        \label{eq:exp-conic-program}
\end{equation}
where $\mat c \in \Rext^{n}$ is some cost vector,
the function $\mat x \mapsto \mat c^{\sf T} \mat x$ is called the \emph{objective},
and $\mat b \in \Rext^m$, $\mat A \in \Rext^{m \times n}$ encode linear constraints.
\textcite*{nesterov1996infeasible} first established that such problems can be solved in polynomial time, but at the cost of double the memory and eight times the running time of their symmetric counterparts. These drawbacks were eliminated by \textcite{skajaa2015homogeneous}.
The algorithm that seems to display the best empirical performance \parencite{dahl2022primal}, however, was only recently shown to
run in polynomial time \parencite{badenbroek2021algorithm}.

Disciplined Convex Programming \parencite{dcp-thesis} is a
compositional approach to convex 
optimization that imposes
certain 
restrictions on how problems can be formed;
a program 
conforming to
those rules is said to be dcp.
The reason to do so is that a dcp program can be efficiently compiled to a
standard form \parencite{agrawal2018rewriting}, 
which in our case is
 an exponential conic program.
Only two rules are relevant to us: a constraint of the form
$(x,y,z) \in K_{\exp}$ is dcp iff $x$, $y$, and $z$ are affine transformations of the
    optimization variables, 
and a linear program
augmented
with dcp exponential conic constraints is dcp.
Because all the optimization problems that we give are
of this form,
we can easily compile them
to exponential conic programs even if they do not exactly conform to \eqref{eq:exp-conic-program}.





\discard{\color{gray!80!white}
    \section{AN EXPRESSIVE CLASS OF OPTIMIZATION PROBLMS}

    \begin{itemize}
        \item
        Many optimization problems can be effectively solved with
    \end{itemize}

    \begin{itemize}
        \item
        For optimization people: a new class of optimization problems,
    \end{itemize}

}

\section{Inference as a Convex Program}
    \label{sec:inf-as-cvx-program}

Here is an obvious, if inefficient,
way
of calculating
$\Pr_{\!\cal M}(Y|X{=}x)$ in a
probabilistic model $\cal M$. 
First compute an explicit representation of the joint distribution 
$\Pr_{\!\cal M} \in \Delta\mskip-2mu\V\!\X$, 
then marginalize to 
$\Pr_{\!\cal M}(X,Y)$ and condition on $X{=}x$.
For a factor graph or BN,
each step is
straightforward;
the problem is the exponential time and space required to represent $\Pr_{\!\cal M}(\X)$ explicitly.
A key feature of inference algorithms for BNs and FGs is that they
do not represent joint distributions in this way.
For PDGs, though, it is not
obvious that
we can calculate the $\zogamma$-semantics,
even if
we know it is unique, and
we ignore the space required to represent it (as we do in this section).
Note that $\zogamma$-inference is already an optimization problem by definition:
\[
    \minimize_\mu\quad
        \bbr{\dg M}_\gamma(\mu)
    \quad \subjto\quad \mu \in \Delta\mskip-2mu\V\mskip-2mu\X.
\]
For small enough 
 $\gamma$,
it is even convex.
But can we solve it efficiently?
With exponential cone constraints,
the answer is yes, as we show in \Cref{sec:small-gamma}.
Moreover, we can compute the $0^+$\!-semantics with a sequence of two exponential conic programs (\Cref{sec:empirical-limit}).
To give a flavor of our constructions and ease into
the more complicated ones, we begin by 
minimizing
 $\OInc$, the simpler of the two scoring functions.



\subsection{%
    Minimizing Incompatibility
    (\texorpdfstring{$\boldsymbol\gamma\boldsymbol=\mat0$}{gamma=0})%
} \label{sec:minimize-inc}

When $\gamma = 0$, we want to find minimizers of $\OInc$,
which is a
weighted sum of conditional relative entropies.
There is a straightforward connection between the exponential cone and
relative entropy:
if $\mat m, \mat p \in
\Delta \{1, \ldots, n\}
\subset \mathbb R^n$ are points on
a probability simplex,
then $(-\mat u, \mat m, \mat p) \in K_{\exp}^n$ if and only if
$\mat u$ is an upper bound on $\mat m \log \frac{\mat m}{\mat p}$,
the pointwise contribution to relative entropy at each outcome.
Thus, perhaps unsurprisingly, we can use an exponential conic program to
find minimizers of $\OInc$.
If all beliefs are unconditional and over the same space,
the construction is standard;
we review it here, so that we can build upon it.

\textbf{Warm-up.}
\begingroup
Consider a PDG with
only one variable $X$
with
$\V\mskip-2mu X = \{1, \ldots, n\}$.
\discard{\vfull{
    Suppose further that for every arc $j \in \Ar = \{1, \ldots, k\}$, the cpd $\p_j(X)$ is an unconditional distribution over $X$.
    That is, $\Tgt j = \{ X \}$, and $\Src j = \emptyset$.
    Such unconditional probabilities may be identified with vectors $\mat p_j \in [0,1]^n$, and all $k$ of them may conjoined to form a
    matrix $\mat P = [\,p_{ij}] \in [0,1]^{n \times k}$.
    A candidate
    (joint)
    distribution $\mu(X)$
    may be represented as a vector $\mat m \in [0,1]^n$.}}
Suppose further that every arc $j \in \Ar = \{1, \ldots, k\}$
has $\Tgt j = \{ X \}$ and $\Src j = \emptyset$.
Then each $\p_j(X)$ can be identified with a vector $\mat p_j \in [0,1]^n$, and all $k$ of them can be conjoined to form a matrix $\mat P = [\,p_{ij}] \in [0,1]^{n \times k}$.
Similarly, a candidate distribution $\mu$ can be identified with $\mat m \in [0,1]^n$. 
Now consider a matrix $\mat U = [u_{i,j}] \in \Rext^{n \times k}$
that, intuitively, gives an upper bound on
the contribution to $\OInc$ due to each edge and value of $X$.
Observe that
\begin{align*}
    &&(- \mat U,~
        [\mat m,\,...\,, \mat m]
        ,~ \mat P) &\in K_{\exp}^{n \times k} \\
    &\iff& \forall  i,j.~~
            u_{ij} &\ge m_i \log (\nicefrac{m_i}{p_{ij}}) \\
    &\implies& \forall j.~~
        {\textstyle\sum_i} u_{ij}  &\ge \kldiv{\mu}{p_j} \\
    &\implies& {\textstyle\sum_{i,j}} \beta_j u_{ij}  &\ge {\textstyle\sum_j} \beta_j \kldiv{\mu}{p_j} \\
    &\iff& \mat 1^{\sf T} \mat U \bbeta &\ge \OInc(\mu) .
        \numberthis\label{eqn:warmup-logic}
\end{align*}

So now, if $(\mat U, \mat m)$ is a solution to the convex program
\begin{align*}
    \minimize_{\mat m, \mat U}~~
        \mat 1^{\sf T} \mat U \bbeta
    \quad\subjto\quad &
        \mat 1^{\sf T} \mat m  = 1, \\[-2ex]
        (-\mat U,\;&
            [\mat m,\,...\,, \mat m]
            ,\; \mat P
        )
            \in K_{\exp}^{n \times k},
\end{align*}
then (a) 
the
objective value $\mat 1^{\sf T} \mat U \bbeta$
equals
the inconsistency $\aar{\dg M}_0$, and (b) $\mu \in \bbr{\dg M}^*_0$,
meaning $\mu$ minimizes $\OInc_{\dg M}$.

\endgroup

\textbf{The General Case.}
We now show how the same construction can be used to find
 a distribution $\mu \in \bbr{\dg M}^*_0$
for an arbitrary PDG $\dg M = (\X, \Ar, \mathbb P, \balpha, \bbeta)$.
\discard{\color{gray}Now that we have a taste for how this works in terms of matrices,
    let's now move up a level,
    and identify distributions with their simplex representations.}
To further simplify the presentation,
for each arc $a \in \Ar$, let
$\V a := \V(\Src a, \Tgt a)$
denote all joint settings of $a$'s source and target variables, and
write
$
\V\!\Ar :%
    = \sqcup_{a \in \Ar} \V a
    = \{ (a, s, t) : a \in \Ar,\, (s,t) \in \V(\Src a,\Tgt a) \}
$
for the set of all choices of an arc together with values of its source and target.
For each $a \in \Ar$, 
we can regard $\mu(\Tgt a, \Src a)$ and $\mu(\Src a)\p_a(\Tgt a | \Src a)$, both distributions over $\{\Src a,\Tgt a\}$, 
as vectors of shape $\V a$.
As before, we introduce an optimization variable $\mat u$ that packages together
    all of the relevant pointwise upper bounds.
To that end, consider a 
vector
$\mat u = [u_{a,s,t}] \in \Rext^{\V\!\Ar}$
in the optimization problem
\discard{\begin{align*}
    \minimize_{\mu, \mat u} \quad
        \sum_{\mathrlap{(a,s,t) \in \V\!\Ar}} \beta_a \, u_{a,s,t}
        \quad~~&
    \numberthis\label{prob:joint-inc}\\[-3ex]
    &\subjto\quad \mu \in \Delta\V\!\X, \\[-0.3ex]
    \forall a \in \Ar.~\big({-}{\mat u}_a,\, \mu( \Tgt a,\Src a),&\, \p_a(\Tgt a | \Src a)  \mu(\Src a) \big) \in K_{\exp}^{\V a}
\end{align*}}%
{\begin{align*}
    \minimize_{\mu, \mat u} &\quad
        \sum_{\mathrlap{(a,s,t) \in \V\!\Ar}} \beta_a \, u_{a,s,t}
    \numberthis\label{prob:joint-inc}\\
    \subjto&\quad \mu \in \Delta\V\!\X, \\[-0.4ex]
        \forall a \in \Ar.~&\big(-{\mat u}_a,\, \mu( \Tgt a,\Src a),\, \p_a(\Tgt a | \Src a)  \mu(\Src a) \big) \in K_{\exp}^{\V a}
        .
\end{align*}}%
where $\mat u_a = [u_{a,s,t}]_{(s,t) \in \V a}$ consists of those
components of $\mat u$ associated with arc $a$.
Note that
the marginals
 $\mu(\Src a, \Tgt a)$ and $\mu(\Src a)$ 
are affine transformations of $\mu$, so \eqref{prob:joint-inc} is dcp.
A straightforward generalization of the logic in \eqref{eqn:warmup-logic} gives us:

\begin{linked}{prop}{joint-inc-correct}
    If $(\mu, \mat u)$ is a solution to \eqref{prob:joint-inc}, then
    $\mu \in \bbr{\dg M}_0^*$,
    and
    $%
        \sum_{(a,s,t) \in \V\!\Ar} \beta_a u_{a,s,t} = \aar{\dg M}_0$.
\end{linked}

Thus, a solution to \eqref{prob:joint-inc}
encodes a distribution that minimizes $\OInc$, and
the (0-)inconsistency
of $\dg M$.
This is a start, but to do 
$0^+$\!-inference,
among the minimizers of $\OInc$
we must find the unique distribution in $\bbr{\dg M}^*_{0^+}$, 
while for $\zogamma$-inference ($\zogamma > 0$), we need to find the optimizers of
$\bbr{\dg M}_\gamma$.
Either way, we must consider 
$\SInc$
in addition to $\OInc$. 

\subsection{%
    \texorpdfstring{$\boldsymbol\gamma$}%
    {gamma}-Inference
    for small
    \texorpdfstring{$%
    \boldsymbol\gamma%
    \boldsymbol>\mat0$}{gamma}%
} \label{sec:small-gamma}

When $\gamma > 0$ is small enough,
the scoring function \eqref{eqn:scoring-fn} is not only convex,
but admits a straightforward representation as an exponential conic program.
To see this, note that \eqref{eqn:scoring-fn} can be rewritten \parencite[Prop 4.6]{pdg-aaai} as:
\begin{equation}
    \begin{aligned}
        \bbr{\dg M}_\gamma(\mu) = &-\gamma\H(\mu) -
            \sum_{a \in \Ar}
                \beta_a\, \Ex_\mu
                    \log {\p_a(\Tgt a | \Src a)}
                \\[-0.6ex]
            &~~+ \sum_{a \in \Ar}
            (\gamma \alpha_a - \beta_a)
                \H_\mu (\Tgt a | \Src a).
    \end{aligned}
    \label{eq:altscore}
\end{equation}
The first term,
$-\gamma\H(\mu)$,
is strictly convex and has a well-known
translation into an exponential cone constraint;
the second one is linear in $\mu$.
Now,
if $0 < \gamma \le \min_{a} \frac{\beta_a}{\alpha_a}$, then
every summand of the last term is a negative conditional entropy, and 
can be captured by an exponential cone constraint.
The only wrinkle is that it is possible for a user to specify that some $\p_a(t\mid s) = 0$, in which case the linear term 
is undefined.
The result is a requirement that $\mu(s,t) = 0$ at such points,
which we can instead encode directly with linear constraints.
To do this formally,
divide $\V\!\Ar$ into two parts:
$\V\!\Ar^+ := \{ (a,s,t) \in\V\!\Ar : \p_a(t |s) > 0\}$ and
$\V\!\Ar^0 := \{ (a,s,t) \in\V\!\Ar : \p_a(t |s) = 0\}$.
Armed with this notation, consider upper bound vectors
$\mat u = [ u_{a,s,t}]_{(a,s,t) \in \V\!\Ar}$ and $\mat v = [v_w]_{w \in \V\!\X}$,
in the following optimization problem:
{%
\begin{align*}
\minimize_{\mu, \mat u, \mat v} & ~~
    \sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}}
        (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
        \,+
        \gamma
        \sum_{\mathclap{w \in \V\!\X}} v_w
    \numberthis\label{prob:joint-small-gamma}
    \\[-0.2ex]
    &\qquad
    - \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}} 
        \alpha_a \gamma \, 
        \mu(\Src a{=}s,\Tgt a {=} t) \log \p_a (t|s)
\\[0.2ex]
\subjto&\quad \mu \in \Delta\V\!\X, 
        \quad ( -\mat v,  \mu,  \mat 1) \in K_{\exp}^{\V\!\X},
    \\[-0.4ex]
    \forall a \in \Ar.~
        &\big(-\mat u_a, \mu( \Tgt a,\Src a),\p_a(\Tgt a | \Src a)  \mu(\Src a) \big)
            \in K_{\exp}^{\V a}, \\[-0.1ex]
    \forall (a,s,t) &\in \V\!\Ar^0\!.~
    \mu(\Src a{=}\mskip2mus, \Tgt a{=}\mskip2mut) = 0.
\end{align*}}

This optimization problem may look complex, but it 
falls out of
\eqref{eq:altscore} 
fairly
directly,
and gives us what we wanted.

\begin{linked}{prop}{joint-small-gamma-correct}
    If $(\mu, \mat u, \mat v)$ is a solution to \eqref{prob:joint-small-gamma},
    and $\bbeta \ge \gamma \balpha$,
    then
    $\mu$ is the unique element of
    $\bbr{\dg M}^*_\gamma$, and $\aar{\dg M}_\gamma$
    equals the objective of \eqref{prob:joint-small-gamma} evaluated at $(\mu, \mat u, \mat v)$.
\end{linked}

\subsection{
    Calculating the \texorpdfstring{$\mat 0^{\boldsymbol+}$\!}{0+}-semantics
    (\texorpdfstring{$\boldsymbol\gamma\boldsymbol\to\mat 0$}{gamma->0})}
    \label{sec:empirical-limit}
\Cref{sec:minimize-inc} shows how to find a distribution $\nu$ that minimizes
$\OInc$%
---but to do
$0^+$\!-inference,
we need to find the minimizer
that, uniquely among them, best minimizes
$\SInc$.
It turns out this can be done by
    using $\nu$ to construct a second optimization problem.
The justification requires two more results;
we start by characterizing the minimizers of $\OInc$.


\begin{linked}{prop}{marginonly}
    If $\dg M$ has arcs $\Ar$ and $\bbeta \ge \mat 0$,
    the minimizers of $\OInc_{\dg M}$ all have the same conditional
        marginals along $\Ar$.
    That is, for all $\mu_1, \mu_2 \in \bbr{\dg M}_0^*$
    and all $\ed aST \in \Ar$ 
    with $\beta_a > 0$, we have
    {\subafalse
    $\mu_1(\Tgt a, \Src a)\mu_2(\Src a) = \mu_2(\Tgt a, \Src a) \mu_1(\Src a)$.%
    \footnotemark
    }
\end{linked}
\footnotetext{Intuitively, this asserts 
$\mu_1(\Tgt a | \Src a) = \mu_2(\Tgt a | \Src a)$,
but also handles cases where some
$\mu_1(\Src a {=} s)$ or $\mu_2(\Src a {=} s)$ 
equals zero.}

As a result, once we find one minimizer $\nu$ of $\OInc_{\dg M}$
(e.g., via \eqref{prob:joint-inc}),
it then suffices to restrict attention to distributions that have the same
conditional marginals along the edges, and optimize $\SInc$.
But in attempting to do so, we run into a second issue: $\SInc$
is typically not convex.
Fortunately, it is if we constrain to distributions that minimize $\OInc$.
Moreover, on this restricted domain, it can be represented 
with dcp exponential cone constraints.

\begin{linked}{prop}{idef-frozen}
If $\mu \in \bbr{\dg M}_0^*$\,,
then
\begin{equation}
    \underset{{\dg M}}{\SInc_{}}(\mskip-0.5mu\mu\mskip-1mu) \!=\!
        \sum_{\mathclap{ w \in \V\!\X } }
            \mskip-1mu
            \mu(\mskip-1.5mu w \mskip-1.5mu)
            \log \!  \bigg(\!
                \faktor{\mu(\mskip-1.5mu w\mskip-1.5mu )}{\,\prod_{\mathclap{a \in \Ar}} 
                \mskip-1mu
                \nu\big(\mskip-1mu\Tgt a \mskip-2.2mu(\mskip-2mu w \mskip-2mu) 
                    \mskip-0.5mu \big|  \Src a \mskip-2mu(\mskip-2mu w \mskip-2mu)\mskip-2mu\big)^
                {\!\alpha\ssub a}
                }\!\mskip-2mu
            \bigg)\mskip-2mu
        ,\!
        \label{eq:idef-alt-constr}
\end{equation}
where $\{ \nu(\Tgt a | \Src a ) \}_{a \in \Ar}$ are the
marginals along the arcs $\Ar$
shared by all distributions in $\bbr{\dg M}^*_0$\
(per \cref{prop:marginonly}),
and $\Src a \mskip-1mu(\mskip-1mu w \mskip-1mu), \Tgt a \mskip-1mu(\mskip-1mu w \mskip-1mu)$ are the values of variables $\Src a$ and $\Tgt a$ in $w$.
\end{linked}

If we already know a distribution $\nu \in \bbr{\dg M}_0^*$,
perhaps by solving \eqref{prob:joint-inc}, then
the denominator of \eqref{eq:idef-alt-constr} does not depend on $\mu$ 
and so is constant in our search for minimizers of
$\SInc_{}$.
For ease of exposition, aggregate these values into a vector
\begin{equation}
    \mat k :=
        \Big[
        ~\prod_{\smash{a \in \Ar}} \nu(\Tgt a (w) | \Src a (w))^{\alpha\ssub a}
        \Big]%
        _{w \in \V\!\X}~\raisebox{-1ex}.
        \label{eq:cm-product}
\end{equation}
We can now capture $\bbr{\dg M}^*_{0^+}$ with a convex program.

\begin{linked}{prop}{joint+idef-correct}
If $\nu \in \bbr{\dg M}_0^*$
and $(\mu, \mat u)$ 
solves the problem
\begin{align*}
    \minimize_{\mu, \mat u} & \quad
        \smash{\mat 1^{\sf T} \mat u}
        \numberthis\label{prob:joint+idef}\\[-0.5ex]
    \subjto &\quad
        (-\mat u,  \mu, \mat k ) \in K_{\exp}^{\V\!\X},~~\quad \mu \in \Delta\V\!\X, \\[-0.4ex]
            \forall& \ed aST \subafalse \in \Ar.~~\mu(\Src a, \Tgt a)\, \nu(\Src a) = \mu(\Src a)\, \nu(\Src a, \Tgt a),
\end{align*}
then $\bbr{\dg M}^*_{0^+} = \{ \mu \}$
and $\mat 1^{\sf T} \mat u = \SInc_{\dg M}(\mu)$.
\end{linked}
Running \eqref{prob:joint+idef} through a convex solver gives rise to the 
first algorithm
that can reliably find $\bbr{\dg M}^*_{0^+}$.




\section{Polynomial-Time Inference Under Bounded Treewidth}
    \label{sec:clique-tree-expcone}
We have now seen how $\zogamma$-inference
(for small $\zogamma$) can be reduced to convex optimization
over joint distributions $\mu$---%
but $\mu$ grows exponentially with the number of variables in the PDG,
so we do not yet have a tractable inference algorithm.
We now show how $\mu$ can be replaced with a clique tree over the PDG's structure. 
What makes this possible is a key independence property of traditional graphical models,
which we now prove
holds for PDGs as well.



\begin{linked}[Markov Property for PDGs]{theorem}{markov-property}
  If\, $\dg M_1$ and $\dg M_2$ are PDGs
    over the sets of variables $\X_1$ and $\X_2$, respectively,
\discard{
    Then for all $\gamma > 0$, we have that
    \[  \bbr{\dg M_1 \bundle \dg M_2}^*_\gamma
			~\models~
		\X_1 \mathbin{\bot\!\!\!\bot} \X_2 \mid \X_1 \cap \X_2. \] 
    That is: for every distribution $\mu \in \bbr{\dg M_1 \bundle \dg M_2}^*_\gamma$,
    the variables of $\dg M_1$ and of $\dg M_2$ are conditionally independent given the variables they have in common.
}
then
$\X_1$
and $\X_2$
are conditionally independent given $\X_1 \cap \X_2$
in every
 $\mu \in \bbr{\dg M_1 \bundle \dg M_2}^*_\gamma$\,,
for all $\gamma > 0$ and
$\gamma=0^+$.
\end{linked}

For the remainder of this section, fix a PDG $\dg M$ and a tree decomposition $(\C, \mathcal T)$ of $\dg M$'s hypergraph.
One significant consequence of \cref{theorem:markov-property} is that, in the
search for optimizers of \eqref{eqn:scoring-fn}, we
need consider only distributions that satisfy those independencies,
all of which can be represented as a clique tree
$\bmu = \{\mu_C \in \Delta\V(C) \}_{C \in \C}$
over $(\C, \mathcal T)$.

\begin{linked}{coro}{can-use-cliquetree}
    If $\dg M$ is a PDG with arcs $\Ar$, 
    $(\C, \mathcal T)$ is a tree decomposition of $\Ar$,
    $\gamma > 0$, and
    $\mu \in \bbr{\dg M}^*_\gamma$, then there exists a clique tree
    $\bmu$ over $(\C, \mathcal T)$ such that $\Pr_{\bmu} = \mu$.
\end{linked}

For convenience, let
$\V\C := \{(C,c) : C \in \C, c \in \V(C)\}$ 
be
the set of all choices of a cluster together with a setting of its variables. 
Like before, we start by optimizing $\OInc$, this time
over calibrated clique trees $\bmu$,
which we identify with vectors
$
 \bmu
    \cong [\mu_C(C{=}c)]_{(C,c)\in\V\C} 
$.
We need the conditional marginals $\Pr_{\bmu}(\Tgt a | \Src a)$ of $\bmu$ along every arc $a$ in order
to calculate $\OInc_{\dg M}(\Pr_{\bmu})$; fortunately, they are readily available.
Since $(\C, \mathcal T)$ is a tree decomposition,
we know $\Src a$ and $\Tgt a$ lie entirely within some cluster $C_{\!a} \in \C$,
and $\Pr_{\!\bmu}(\Tgt a | \Src a) = \mu_{C_{\!a}}\!(\Tgt a | \Src a)$ if $\bmu$ is calibrated.
For $\mat u \in \Rext^{\V\!\Ar}$, consider the problem
\begin{align*}
    \minimize_{\bmu, \mat u} &\quad
        \sum_{\mathrlap{(a,s,t) \in \V\!\Ar}}\beta_a \,  u_{a,s,t}
    \numberthis\label{prob:cluster-inc}\\
    \subjto&\quad
        \forall C \in \C.~\mu_C \in \Delta\V(C), \\[-0.3ex]
        \forall a \in \Ar.~
            \big(&\!- \! \mat u_a,\, \mu_{C\!_a}\!(\Src a,\mskip-2mu \Tgt a),\, \mu_{C\!_a}\!(\Src a) \p_a(\Tgt a | \Src a)\big) \in K_{\exp}^{\V a} \\[-0.2ex]
        \forall (C,D) &\in \mathcal T.~~ \mu_{C}(C \cap D) = \mu_{D}(C \cap D),
\end{align*}
where again $\mat u_a$ is the restriction of $\mat u$ to components associated with $a$.
Problem \eqref{prob:cluster-inc} is similar to \eqref{prob:joint-inc}, except
that it requires local marginal constraints to restrict our search to calibrated clique trees.
It is analogous to problem 
\textsc{CTree-Optimize-KL}
of \textcite[p.~384]{koller2009probabilistic}.

\begin{linked}{prop}{cluster-inc-correct}
    If $(\bmu, \mat u)$ is a solution to \eqref{prob:cluster-inc}, then
    \begin{enumerate}[label={(\alph*)},nosep]
    \item $\bmu$ is calibrated, with $\Pr_{\bmu} \in \bbr{\dg M}^*_0$, and
    \item the objective of \eqref{prob:cluster-inc} evaluated at $\mat u$ equals $\aar{\dg M}_0$.
    \end{enumerate}
\end{linked}
We can now find a minimizer of $\OInc$ and
compute $\aar{\dg M}_0$ without storing a joint distribution.
\discard{
Note that
\eqref{prob:cluster-inc} is the result of modifying \eqref{prob:joint-inc} in the obvious way to deal with clique trees.
}
But to do anything else, we must deviate from the template laid out in \cref{sec:inf-as-cvx-program}.

\textbf{Dealing with Joint Entropy.}
In the construction of \eqref{prob:cluster-inc},
we rely heavily on the fact that each term of $\OInc_{\dg M}$
depends only on local marginal distributions $\mu_{C_{\mskip-2mu a}}\!(\Tgt a,  \Src a)$
and $\mu_{C_{\mskip-2mu a}}\!(\Src a)$.
The same is not true of $\SInc_{}$, which depends on the joint entropy $\H(\Pr_{\bmu})$ of the entire distribution.
At this point we should point out an important 
reason to restrict our focus to trees:
it allows the joint entropy to be expressed
in terms of the cluster marginals \parencite{wainwright2008graphical},
by
\begin{equation}\label{eq:bethe-entropy}
    -\H(\Pr\nolimits_{\bmu})
        = -\sum_{C \in \C} \H(\mu_C)
        ~+~ \sum_{\mathclap{(C,D) \in \mathcal T}} \H_{\bmu}(C \cap D).
\end{equation}
Even so,
it is not obvious that
\eqref{eq:bethe-entropy} can be
captured with dcp exponential cone constraints.
(Exponential conic programs can minimize negative entropy,
but not positive entropy, which is concave.)
We now describe how this can be done.

\def\Par#1{\mathrm{Par}(#1)}
\def\Pash{\mathit{V\mskip-5muC\mskip-3.5muP\!}}

Choose a root node $C_0$ of the tree decomposition, and orient each edge of $\mathcal T$ so that it points away from $C_0$.
Now each cluster $C \in \cal C$, except for $C_0$, has a parent cluster $\Par C$;
define $\Par{C_0} := \emptyset$ to be an empty cluster, since $C_0$ has no parent.
Finally, for each $C \in \C$, let $\Pash_C := C \mathbin{\cap} \Par C$ denote
the set of $\mathbf v$ariables that cluster $C$ has in $\mathbf c$ommon with its $\mathbf p$arent cluster.
\unskip\footnotemark 
As $\mathcal T$ is now a directed tree, this definition allows us to express
\eqref{eq:bethe-entropy} in a more useful form:
\begin{align*}
    - \H(&\Pr\nolimits_{\bmu}) =
        - \H(\mu_{C_0}) - \!
        \sum_{(C \to D)\mathrlap{ \in \mathcal T}}
        \H_{\Pr_{\bmu}}(D \mid C)\\[-1ex]
    &= 
        \sum_{C \in \cal C} \sum_{c \in \V(C)}
        \mu_C(C{=}c)
        \log \frac
            { \mu_C(C{=}c)}
            { \mu_C(\Pash_C(c)) }
        ,
            \numberthis\label{eq:cluster-ent-decomp}
\end{align*}
where $\Pash_C(c)$ is the restriction of the 
joint value $c \in \V(C)$
to the variables $\Pash_C \subseteq C$
\unskip.
Crucially, the denominator of \eqref{eq:cluster-ent-decomp} is an affine transformation of $\mu_C$.
The upshot: we have now rewritten the joint entropy
as a sum of functions of the clusters, each of which can be captured with a dcp exponential cone constraint.
This gives us analogues of the problems
in \cref{sec:small-gamma,%
sec:empirical-limit} that 
operate on clique trees.

\textbf{Finding clique trees for $\zogamma$-inference.}
The ability to decompose the joint entropy as in \eqref{eq:cluster-ent-decomp} allows us to adapt 
\eqref{prob:joint-small-gamma} 
to operate on calibrated clique trees, rather than joint distributions. 
Beyond the changes already present in \eqref{prob:cluster-inc},
the key
is to replace 
the exponential cone constraint
$( -\mat v,  \mu,  \mat 1) \in K_{\exp}^{\V\!\X}$,
which captures the entropy
of $\mu$,
with
\[
\big(-\mat v,\;\bmu,\, [\,\mu_{C}(\Pash_C(c))\,]_{(C,c)\in\V\C}\big) \in K_{\exp}^{\V\C},
\]
which captures the entropy of $\bmu$, by
\eqref{eq:cluster-ent-decomp}.
\ifvfull %
Over vectors
$\mat v, \bmu \in \Rext^{\V\C}$ and
$\mat u \in \Rext^{\V\!\Ar}$,
the problem becomes: 
{\allowdisplaybreaks
\begin{align*}
    \minimize_{\bmu, \mat u, \mat v} & ~~
    \sum_{\mathrlap{\!\!\!(a,s,t) \in \V\!\Ar}} (\beta_a \!- \alpha_a \gamma) u_{a,s,t}
    + \gamma \sum_{\mathclap{(C,c) \in \V\C}}  v_{C,c}
    \numberthis\label{prob:cluster-small-gamma}
    \\[-0.2ex]
    - \sum_{\mathrlap{\!\!\!(a,s,t) \in \smash{\V\!\Ar^+}}}&
        \alpha_a\gamma\,
        \mu_{C\!_a}\!(\Src a{=}s,\Tgt a {=} t)
        \log \p_a (\Tgt a{=}t\mid s)
\\[0.2ex]
\subjto&\quad
    \forall C \in \C.~\mu_C \in \Delta\V(C), \\[-0.2ex]
    \forall a \in \Ar.~
        \big(&\!- \! \mat u_a,\, \mu_{C\!_a}\!(\Src a,\mskip-2mu \Tgt a),\, \mu_{C\!_a}\!(\Src a) \p_a(\Tgt a | \Src a)\big) \in K_{\exp}^{\V a}, \\
    \forall (a,s,t) &\in \V\!\Ar^0\!.~
    \mu_{C\!_a}\!(\Src a{=}\mskip2mus, \Tgt a{=}\mskip2mut) = 0, \\[-0.2ex]
    \forall (C,D) &\in \mathcal T.~~ \mu_{C}(C \cap D) = \mu_{D}(C \cap D),\\[-0.3ex]
    \big(-\mat v,\;&\bmu,\, [\,\mu_{C}(\Pash_C(c))\,]_{(C,c)\in\V\C}\big) \in K_{\exp}^{\V\C}
    .\\[-5ex]
\end{align*}%
}%
\else%
This gives rise to an optimization problem 
over
$\mat v, \bmu \in \Rext^{\V\C}$ and
$\mat u \in \Rext^{\V\!\Ar}$,
that we call \eqref{prob:cluster-small-gamma}.
The rest of the details are less instructive, so we defer
them to \cref{appendix:prob-details} for brevity. 
\fi%

\footnotetext{
    Different choices of $C_0$ 
    yield
    different definitions of $\Pash$, 
    and ultimately optimization problems of different sizes;
    the optimal choice
    can be found with Edmonds' algorithm \parencite{chu1965shortest},
    which computes a directed analogue of the minimum spanning tree.}

\begin{linked}{prop}{cluster-small-gamma-correct}
    If $(\bmu, \mat u, \mat v)$ is a solution to \eqref{prob:cluster-small-gamma},
    and $\bbeta \ge \gamma \balpha$, then
    $\Pr_{\bmu}$ is the unique element of $\bbr{\dg M}^*_\gamma$,
    and the objective of \eqref{prob:cluster-small-gamma} at $(\bmu, \mat u, \mat v)$ equals $\aar{\dg M}_\gamma$.
\end{linked}%
\discard{%
\textbf{A clique tree for
    $\mat 0^{\boldsymbol+}$\!-inference.%
}}%
A related use of \eqref{eq:cluster-ent-decomp} is
to enable an analogue of
\ifvfull%
\eqref{prob:joint+idef} that operates on clique trees (rather than joint distributions),
to find
a compact representation of $\bbr{\dg M}^*_{0^+}$.
We begin with a straightforward adaptation of
    the relevant machinery in \Cref{sec:empirical-limit}.
Suppose that $\boldsymbol\nu {=} \{\nu_C : C \in \C\}$ is a calibrated clique tree over the tree decomposition $(\C, \mathcal T)$ representing a distribution $\Pr_{\boldsymbol\nu} \in \bbr{\dg M}^*_0$, say obtained by solving \eqref{prob:cluster-inc}.
For $C \in \C$, let $\Ar_C:= \{ a \in \Ar : C_a = C\}$ be the set of
arcs assigned to cluster $C$, and let
\[
    \mat k := \smash{\bigg[} \prod_{\smash{\mathrlap{a \in \Ar_C}}} \nu_C (\Tgt a (c) | \Src a (c))^{\alpha_a} \smash{\bigg]\mathclose{\vphantom{\Big|}}_{(C,c) \in \V \C}} ~~\in \Rext^{\V\C}
\]
be the analogue of \eqref{eq:cm-product} for a clique tree.
Once again, consider
$\mat u := [ u_{(C,c)} ]_{(C,c) \in \V\C}$
in the optimization problem
{\allowdisplaybreaks%
\begin{align*}
\minimize_{\bmu, \mat u} & \quad
    \mat 1^{\sf T} \mat u
    \numberthis\label{prob:cluster+idef}\\
\subjto &\quad
    \forall C \in \C.~\mu_C \in \Delta\V(C), \\[-0.2ex]
     \big({-}\mat u,\,  \bmu,\,\, &
            \mat k \odot
            \big[\;\mu_C(\Pash_C(c))\;\big]_{(C,c) \in \V\C}
            \big) \in K_{\exp}^{\V\C}, \\[-0.2ex]
    \forall a \in \Ar.&~~\mu_{C_{\!a}}\!(\Src a, \Tgt a) \nu_{C_{\!a}}\!(\Src a) = \mu_{C_{\!a}}\!(\Src a) \nu_{C_{\!a}}\!(\Src a, \Tgt a),\\
    \forall (C,D) &\in \mathcal T.~~ \mu_{C}(C \cap D) = \mu_{D}(C \cap D).
\end{align*}}%
The biggest change is in the second constraint: 
the upper bounds $[u_{(C,c)}]_{c \in \V(C)}$ for cluster $C$ now account only
for the additional entropy not already modeled by 
$C$'s
ancestors.
\else %
\eqref{prob:joint+idef} for clique trees,
resulting in an optimization problem that we call \eqref{prob:cluster+idef},
whose details we defer to \cref{appendix:prob-details} as well.
\fi

\begin{linked}{prop}{cluster+idef-correct}
    If $(\bmu, \mat u)$ is a solution to \eqref{prob:cluster+idef},
    then $\bmu$ is a calibrated clique tree
    and $\bbr{\dg M}^*_{0^+} = \{ \Pr_{\!\bmu} \}$.
\end{linked}


\begin{figure*}
    \centering
        \includegraphics[width=0.67\linewidth]{figs/rand-joint/joint-gap-vs-time-relabeled}
        \includegraphics[width=0.32\linewidth]{figs/rand-joint/time-diff}
    \caption{
        Accuracy and resource costs for the methods in \cref{sec:inf-as-cvx-program}.  
        Left: a scatter plot of several algorithms on random PDGs of $\approx 10$ variables. The $x$-axis is
            the difference in scores
            $\bbr{\dg M}_\gamma(\mu) - \bbr{\dg M}_{\gamma}(\mu^*) + 10^{-15}$,
        where $\mu$ is the method's output,
        and $\mu^*$ achieves the best (smallest) known value of $\bbr{\dg M}_{\gamma}$.
         (Thus, the best solutions lie on the far left.)
        The $y$-axis is the time required to compute $\mu$.
        Our methods are in gold $(0^+$\!-inference) and violet ($\zogamma$-inference, for $\zogamma>0$); the baselines (black-box optimizers applied directly to \eqref{eqn:scoring-fn}) are in green.
        The area of each circle is proportional to the size of the optimization problem, as measured by
        {\small\texttt{n\_worlds}}$:=$
        $|\V\!\X|$.
        Right: how the same methods scale in run time, as $|\V\!\X|$ increases.
     }\label{fig:joint-gap-time}
\end{figure*}

At this point, standard algorithms can use $\bmu$
to answer probabilistic queries about $\Pr_{\!\bmu}$ in polynomial time \parencite[\S 10.3.3]{koller2009probabilistic}.
\discard{
    Concretely: marginal probabilities can essentially be read off of a calibrated clique tree,
    and evidence $X{=}x$ may be incorporated by
    setting $\mu_C(c) := 0$ for every $C{=}c$ that conflicts with $X{=}x$
    and recalibrating the clique tree (e.g., with belief propagation). }
From \cref{prop:cluster+idef-correct,prop:cluster-small-gamma-correct}, it follows that
$\zogamma$-inference
(for small $\zogamma$, and for $0^+$)
can be reduced to a (pair of) convex optimization problem(s) with
a polynomial number of variables and constraints.
All that remains 
is to show that such a problem can be solved in polynomial time.
For this, we turn to interior-point methods.
As
\eqref{prob:cluster-small-gamma} and \eqref{prob:cluster+idef} are dcp, they
can be transformed via established methods \parencite{agrawal2018rewriting} into
a standard form
that can be solved in polynomial time by commercial solvers \parencite{mosek,ECOS}.
Threading the details of our constructions through
the analyses of \textcite{dahl2022primal}
and \textcite{nesterov1996infeasible}
results in our main theorem.





\discard{
\begin{linked}{theorem}{main}
We can do PDG inference to precision $\epsilon$ in 
    \[ 
    O\pqty[\Big]{  (N\!+\!A)^4 V^{4T}
        \pqty[\Big]{ T \log V + \log \frac{N\!+\! A}{\epsilon} 
    + \log
    \frac{\beta^{\max}}{\beta^{\min}}
        } }%
    \]
    time,
    \unskip\daggerfootnote{At the cost of substantial overhead and engineering effort, the exponent $4$ can be reduced to 2.872, by appeal to \textcite{skajaa2015homogeneous} and the 
    current best matrix multiplication algorithm \parencite[$O(n^{2.372})$]{duan2022faster}
    to invert 
    $n{\times} n$
    linear systems. }
    where:
    \begin{itemize}[nosep,%
            ]
        \item $N$ is the total number of variables,
        \item $V$ is the number of values per variable,
        \item $T$ is the tree-width of the PDG's structure, 
        \item $A$ is the number of hyperarcs,
        \item $\beta^{\max}$ is the maximum observational confidence,
        and
        \item $\beta^{\min}$ is either $\gamma$ (for $\zogamma$-inference)
         or the minimum non-zero observational confidence (for $0^+$\!-inference).
        \item $\gamma^{\max} = \max_{a \in \Ar} \beta_a/\alpha_a$
    \end{itemize}
\end{linked}
}
\begin{linked}{theorem}{main}
Let $\dg M = (\X, \Ar, \mathbb P, \balpha, \bbeta)$
be a proper discrete PDG with $N = |\X|$ variables each taking at most $V$ values, and $A = |\Ar|$ arcs forming a hypergraph of treewidth $T$.
Then for all $\gamma \in \{0^+\}\cup (0,\,  \min_{a \in \Ar} \frac{ \beta_a}{\alpha_a}]$
and $\epsilon > 0$, 
we can do $\zogamma$-inference to precision $\epsilon$ in 
\[ 
    O\pqty[\bigg]{  (N\!+\!A)^4 V^{4T}
        \pqty[\Big]{ T \log V + \log \frac{N\!+\! A}{\epsilon} 
    + \log
    \frac{\beta^{\max}}{\beta^{\min}}
        }\! }%
\]
time,
\unskip\daggerfootnote{At the cost of substantial overhead and engineering effort, the exponent $4$ can be reduced to 2.872, by appeal to \textcite{skajaa2015homogeneous} and the current best matrix multiplication algorithm \parencite[$O(n^{2.372})$]{duan2022faster}
to invert 
$n{\times} n$
linear systems. }
where $\quad \beta^{\max} := \max_{{a \in \Ar}} \beta_a\quad$ and
\[
\beta^{\min} := 
\begin{cases}
    ~\displaystyle\min_{a \in \Ar} \{ \beta_a : \beta_a > 0\}& \text{if}~ \gamma = 0^+\\[-0.5ex]
    ~\hfill\gamma~~ & \text{if}~ \gamma > 0~~\text{.}
\end{cases}
\]
\end{linked}


Our approach to $\zogamma$-inference computes
$\aar{\dg M}_\gamma$ as a side effect.
But suppose that we were interested in calculating only this inconsistency.
Might there be a more direct, asymptotically easier way 
to do so? In general, the answer is no.

\begin{linked}{theorem}{consistent-NP-hard}\label{prop:sharp-p-hard}
        \label{theorem:inf-via-inc-oracle}
    \begin{enumerate}[nosep,label={\rm{(\alph*)}}]
    \item Determining whether or not there is a distribution %
        that shares all cpds with $\dg M$ is NP-hard.
    \item Computing $\aar{\dg M}_\gamma$ is \#P-hard, for all $\gamma \ge 0$.
    \item For $\gamma \in \{0^+\}\cup(0,\min_a \frac{\beta_a}{\alpha_a})$,
        there is an $O($size of query result$)$ reduction
        from $\zogamma$-inference to the problem of calculating
        $\gamma$-inconsistency. 
        Under bounded tree-width, there
        is also an $O(|\V \C|)$ reduction in the other direction,
        making the problems essentially equivalent.
    \end{enumerate}
\end{linked}
Part (b) undermines \citeauthor{pdg-aaai}'s original idea of
    inferring the probability of $Y$ given $X$ by
    adding a hypothesis $h(Y|X)$ to $\dg M$, and 
    adjusting $h$ to minimize the overall inconsistency $\aar{\dg M + h}_\gamma$\,.
If we had oracle access to $\aar{\dg M + h}_\gamma$\,, however, then this procedure
would not only give the right answer, but also its running time would
not depend on the size of $\dg M$. Indeed, that is how we prove (c).
\vspace{0ex}


\section{Experiments} \label{sec:expts}

We have given the first algorithm to provably do inference in polynomial
time, but that does not mean that it is the best way of answering queries in practice;
it also makes sense to use black-box optimization tools such as
    Adam \parencite{kingma2014adam} or L-BFGS \parencite{fletcher2013practical}
    to find minimizers of $\bbr{\dg M}_\gamma$.
Indeed, this scoring function has several properties
    that make it highly amenable to such methods: it is
    infinitely differentiable, $\gamma$-strongly convex, and its
    derivatives have simple closed-form expressions.
So it may seem surprising that $\bbr{\dg M}_\gamma$ poses
a challenge to standard optimization tools---%
but it does, 
even when we optimize directly over joint distributions.

\begin{figure*}
    \centering
    \includegraphics[width=0.34\linewidth]{figs/rand-joint/mem-costs.png}
    \includegraphics[width=0.65\linewidth]{figs/rand-clus/gap-vs-time-agg9.png}
    \caption{Left: Memory footprint.
    The convex solver (violet, gold)
     requires more memory than baselines (green).
    Right: Analogue of \cref{fig:joint-gap-time} for the cluster setting.
     Here there is even more separation between exponential conic optimization
         (gold, violet) and black-box optimization (greens).
     The grey points represent belief propagation, which is fastest and most accurate---%
         but only applies in the special case when $\bbeta=\gamma\balpha$.}
    \label{fig:joint-mem}
    \label{fig:clus-gap-vs-time}
\end{figure*}

\textbf{Synthetic Experiment 1 ({\normalfont over joint distributions}).~~} 
Repeatedly do the following.
First, randomly generate a small PDG $\dg M$ containing 
at most 10 variables and 15 arcs. 
Then for various values of
$\gamma \in \{0, 0^+, 
    10^{-8}, 
     \ldots, \min_a \frac{\beta_a}{\alpha_a} \}$,
optimize $\bbr{\dg M}_\gamma(\mu)$ over joint distributions $\mu$, 
in one of two ways. 
\begin{enumerate}[wide,label=(\alph*),nosep,itemsep=0.2ex]
\item Use \verb|cvxpy| \parencite{diamond2016cvxpy}
to feed  
one of problems (\ref{prob:joint-inc},\ref{prob:joint-small-gamma},\ref{prob:joint+idef})
    to the MOSEK solver \parencite{mosek}, or
\item Choose a learning rate and a representation of $\mu$ in terms of optimization variables $\theta \in \mathbb R^n$.
    Then run a standard optimizer (Adam or L-BFGS) built into \verb|pytorch| \parencite{pytorch}
    to optimize $\theta$
    until $\mu_\theta$ converges to a minimizer of $\bbr{\dg M}_\gamma$ 
        (or a time limit is reached).
    Keep only the best result across all learning rates. 
\end{enumerate}


The results are shown in \cref{fig:joint-gap-time}.
Observe that the convex solver (gold, violet) is significantly more accurate than the baselines,
and also much faster for small PDGs.
Our implementation of $0^+\!$-inference (gold) also appears to scale better than L-BFGS
    in this regime, although
    that of $\zogamma$-inference (violet) seems to scale much worse.
We suspect that the difference comes from \verb|cvxpy|'s compilation process,
    because the two use similar amounts of memory (\cref{fig:joint-mem}),
    and so are problems of similar sizes.



\textbf{Synthetic Experiment 2 ({\normalfont over clique trees}).~~} 
For PDGs of bounded treewidth, \Cref{coro:can-use-cliquetree} allows us to express these optimization problems compactly not just for the convex solver, but for the black-box baseline approaches as well.
We adapt the previous experiment for clique trees as follows.
First randomly sample a maximal graph $G$ of tree-width $k$, called a 
    $k$-tree \parencite{patil1986structure}; 
then generate a PDG $\dg M$ whose hyperarcs lie within cliques of $G$.
This ensures that the maximal cliques of $G$ form a tree-decomposition $(\C, \mathcal T)$ of $\dg M$'s underlying
    hypergraph.
We can now proceed as before: 
    either encode
    (\ref{prob:cluster-inc},\ref{prob:cluster-small-gamma},\ref{prob:cluster+idef})
    as disciplined convex programs in \verb|cvxpy|,
or use \verb|torch| to directly minimize $\bbr{\dg M}_\gamma(\Pr_{\!\bmu})$
    amongst clique trees $\bmu$ over $(\C, \mathcal T)$.
    

In the latter case, however, there is now an additional difficulty:
    it is not easy to strictly enforce the calibration constraints with the black-box methods.
Common practice is to instead add extra loss terms to ``encourage'' calibration---%
but
    it can still be worthwhile for the optimizer to simply incur that loss
    in order to violate the constraints.
Thus, for fairness, we must recalibrate
    the clique trees returned by all methods before evaluation.
The result is an even more significant advantage for the convex solver; 
see \cref{fig:clus-gap-vs-time}.

\textbf{Evaluation on BNs.~~}
We also applied the procedure of Synthetic Experiment 2
to the smaller BNs in the
\href{https://www.bnlearn.com/bnrepository/}{\texttt{bnlearn}} repository,
and found similar results (but with fewer examples; see Appendix C.3). 
But for a PDG that happens to also be a BN, it is possible to use belief propagation, which is much faster and at least as accurate.

Explicit details about all of our experiments, 
and many more figures, can be found in Appendix C.

\section{Discussion and Conclusion}

In this paper, we have provided the first practical algorithm for
    inference in PDGs. 
In more detail, we have defined a parametric family of PDG inference notions, 
given a fixed-parameter tractable inference algorithm for a subset of these parameters,
    proven our algorithm correct, implemented it, and
    shown our code to empirically outperform baselines.
Yet many questions about PDG inference remain open.

Asymptotically, there may be a lot of room for improvement.
Our implementation runs in time $\tilde O(N^4)$, and our analysis suggests one of time $\tilde O(N^{2.872})$. 
But assuming bounded tree-width, most graph problems, including
inference for BNs and FGs, can be solved in time $O(N)$.


Moreover, we have shown how to do inference for only a subset of
possible parameter values, specifically,
when
either $\bbeta \ge \gamma \balpha$ or $\bbeta \gg \balpha$. 
The remaining cases are also of interest, and likely require different techniques. 
When $\bbeta = 0$ and $(\Ar, \balpha)$ encodes the structure of a BN,
    for instance,
    inference is about characterizing the BN's independencies.
While we do not know how to tackle this problem in general, 
our methods can be augmented with the convex-concave procedure 
    \parencite{yuille2003concave} to obtain an inference
    algorithm that applies slightly more broadly; see Appendix~B.


Given the long history of improvements 
to Bayesian network inference algorithms,
we are optimistic that further improvements on these problems are
possible.


\subsubsection*{Acknowledgements}
Halpern and Richardson were supported in part by MURI grant
W911NF-19-1-0217 and ARO grant W911NF-17-1-0592.
De Sa was supported by NSF RI-CAREER award 2046760.


\ifbiblatex
    \subsubsection*{References}
    \printbibliography
\else
    \bibliography{refs}\fi



\end{document}
