\documentclass[accepted]{uai2023}
\usepackage[british]{babel}


\usepackage{amsmath,amsfonts,bm}

% Tags for use inside figure captions. Each expands to an emphasised,
% parenthesised label such as "(Left)" or "(a)".
% \emph is used instead of the {\em ...} switch so that italic correction
% is applied automatically.
\newcommand{\figleft}{\emph{(Left)}}
\newcommand{\figcenter}{\emph{(Center)}}
\newcommand{\figright}{\emph{(Right)}}
\newcommand{\figtop}{\emph{(Top)}}
\newcommand{\figbottom}{\emph{(Bottom)}}
\newcommand{\captiona}{\emph{(a)}}
\newcommand{\captionb}{\emph{(b)}}
\newcommand{\captionc}{\emph{(c)}}
\newcommand{\captiond}{\emph{(d)}}

% \newterm{<text>}: typesets <text> in bold (presumably for a term at the
% point where it is introduced -- confirm against usage).
% \textbf replaces the obsolete LaTeX 2.09 font switch {\bf ...}.
\newcommand{\newterm}[1]{\textbf{#1}}


% Cross-reference helpers: each prints the name of the referenced object
% followed by \ref{<label>}. Lower-case names give a lower-case word,
% capitalised names give a capitalised word.
% A tie (~) precedes the first and last \ref of every macro so that a number
% is never separated from its word by a line break.
\def\figref#1{figure~\ref{#1}}
\def\Figref#1{Figure~\ref{#1}}
\def\twofigref#1#2{figures~\ref{#1} and~\ref{#2}}
\def\quadfigref#1#2#3#4{figures~\ref{#1}, \ref{#2}, \ref{#3} and~\ref{#4}}
\def\secref#1{section~\ref{#1}}
\def\Secref#1{Section~\ref{#1}}
\def\twosecrefs#1#2{sections~\ref{#1} and~\ref{#2}}
\def\secrefs#1#2#3{sections~\ref{#1}, \ref{#2} and~\ref{#3}}
% NOTE: amsmath is loaded before this point, so this \def replaces amsmath's
% \eqref; \eqref{x} prints "equation~N" rather than "(N)".
\def\eqref#1{equation~\ref{#1}}
\def\Eqref#1{Equation~\ref{#1}}
% Bare number only, no leading word.
\def\plaineqref#1{\ref{#1}}
\def\chapref#1{chapter~\ref{#1}}
\def\Chapref#1{Chapter~\ref{#1}}
% Range of chapters; the tie after "chapters" was missing, which glued the
% word to the first number ("chapters1--2").
\def\rangechapref#1#2{chapters~\ref{#1}--\ref{#2}}
\def\algref#1{algorithm~\ref{#1}}
\def\Algref#1{Algorithm~\ref{#1}}
\def\twoalgref#1#2{algorithms~\ref{#1} and~\ref{#2}}
\def\Twoalgref#1#2{Algorithms~\ref{#1} and~\ref{#2}}
\def\partref#1{part~\ref{#1}}
\def\Partref#1{Part~\ref{#1}}
\def\twopartref#1#2{parts~\ref{#1} and~\ref{#2}}

% \ceil{x} / \floor{x}: wrap the argument in ceiling / floor brackets.
\def\ceil#1{\lceil #1 \rceil}
\def\floor#1{\lfloor #1 \rfloor}
% \1: a "1" typeset through \bm.
\def\1{\bm{1}}
% Calligraphic D, alone and with an upright "valid" / "test" subscript
% (presumably the training / validation / test data sets -- confirm).
% NOTE(review): in \valid and \test the subscript sits inside the \mathcal
% argument; the inner \mathrm switches the font back for the subscript text.
\newcommand{\train}{\mathcal{D}}
\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}

% Short alias for \epsilon.
\def\eps{{\epsilon}}


% \r<letter>: the lower-case letter typeset with \textnormal (upright text
% font, also inside math). \reta does the same for the symbol \eta.
% Presumably notation for scalar random variables -- confirm against usage.
% The list goes from \rl straight to \rn, so no \rm is defined here and
% LaTeX's own \rm font command is left untouched.
% NOTE(review): \rq is also a LaTeX kernel command (right quote); the \def
% below silently replaces it.
\def\reta{{\textnormal{$\eta$}}}
\def\ra{{\textnormal{a}}}
\def\rb{{\textnormal{b}}}
\def\rc{{\textnormal{c}}}
\def\rd{{\textnormal{d}}}
\def\re{{\textnormal{e}}}
\def\rf{{\textnormal{f}}}
\def\rg{{\textnormal{g}}}
\def\rh{{\textnormal{h}}}
\def\ri{{\textnormal{i}}}
\def\rj{{\textnormal{j}}}
\def\rk{{\textnormal{k}}}
\def\rl{{\textnormal{l}}}
\def\rn{{\textnormal{n}}}
\def\ro{{\textnormal{o}}}
\def\rp{{\textnormal{p}}}
\def\rq{{\textnormal{q}}}
\def\rr{{\textnormal{r}}}
\def\rs{{\textnormal{s}}}
\def\rt{{\textnormal{t}}}
\def\ru{{\textnormal{u}}}
\def\rv{{\textnormal{v}}}
\def\rw{{\textnormal{w}}}
\def\rx{{\textnormal{x}}}
\def\ry{{\textnormal{y}}}
\def\rz{{\textnormal{z}}}

% \rv<letter>: the lower-case letter (or \epsilon / \theta) wrapped in
% \mathbf. Presumably notation for random vectors -- confirm against usage.
% Fix: the entry for the letter "i" was previously named \rvu, so \rvi was
% never defined and the definition was immediately overwritten by the real
% \rvu further down. It is now \rvi; \rvu is unchanged.
\def\rvepsilon{{\mathbf{\epsilon}}}
\def\rvtheta{{\mathbf{\theta}}}
\def\rva{{\mathbf{a}}}
\def\rvb{{\mathbf{b}}}
\def\rvc{{\mathbf{c}}}
\def\rvd{{\mathbf{d}}}
\def\rve{{\mathbf{e}}}
\def\rvf{{\mathbf{f}}}
\def\rvg{{\mathbf{g}}}
\def\rvh{{\mathbf{h}}}
\def\rvi{{\mathbf{i}}}
\def\rvj{{\mathbf{j}}}
\def\rvk{{\mathbf{k}}}
\def\rvl{{\mathbf{l}}}
\def\rvm{{\mathbf{m}}}
\def\rvn{{\mathbf{n}}}
\def\rvo{{\mathbf{o}}}
\def\rvp{{\mathbf{p}}}
\def\rvq{{\mathbf{q}}}
\def\rvr{{\mathbf{r}}}
\def\rvs{{\mathbf{s}}}
\def\rvt{{\mathbf{t}}}
\def\rvu{{\mathbf{u}}}
\def\rvv{{\mathbf{v}}}
\def\rvw{{\mathbf{w}}}
\def\rvx{{\mathbf{x}}}
\def\rvy{{\mathbf{y}}}
\def\rvz{{\mathbf{z}}}

% \erv<letter>: the lower-case letter typeset with \textnormal, one macro per
% letter a--z. Presumably elements of random vectors -- confirm against usage.
\def\erva{{\textnormal{a}}}
\def\ervb{{\textnormal{b}}}
\def\ervc{{\textnormal{c}}}
\def\ervd{{\textnormal{d}}}
\def\erve{{\textnormal{e}}}
\def\ervf{{\textnormal{f}}}
\def\ervg{{\textnormal{g}}}
\def\ervh{{\textnormal{h}}}
\def\ervi{{\textnormal{i}}}
\def\ervj{{\textnormal{j}}}
\def\ervk{{\textnormal{k}}}
\def\ervl{{\textnormal{l}}}
\def\ervm{{\textnormal{m}}}
\def\ervn{{\textnormal{n}}}
\def\ervo{{\textnormal{o}}}
\def\ervp{{\textnormal{p}}}
\def\ervq{{\textnormal{q}}}
\def\ervr{{\textnormal{r}}}
\def\ervs{{\textnormal{s}}}
\def\ervt{{\textnormal{t}}}
\def\ervu{{\textnormal{u}}}
\def\ervv{{\textnormal{v}}}
\def\ervw{{\textnormal{w}}}
\def\ervx{{\textnormal{x}}}
\def\ervy{{\textnormal{y}}}
\def\ervz{{\textnormal{z}}}

% \rm<LETTER>: the upper-case letter wrapped in \mathbf, one macro per letter
% A--Z. Presumably notation for random matrices -- confirm against usage.
\def\rmA{{\mathbf{A}}}
\def\rmB{{\mathbf{B}}}
\def\rmC{{\mathbf{C}}}
\def\rmD{{\mathbf{D}}}
\def\rmE{{\mathbf{E}}}
\def\rmF{{\mathbf{F}}}
\def\rmG{{\mathbf{G}}}
\def\rmH{{\mathbf{H}}}
\def\rmI{{\mathbf{I}}}
\def\rmJ{{\mathbf{J}}}
\def\rmK{{\mathbf{K}}}
\def\rmL{{\mathbf{L}}}
\def\rmM{{\mathbf{M}}}
\def\rmN{{\mathbf{N}}}
\def\rmO{{\mathbf{O}}}
\def\rmP{{\mathbf{P}}}
\def\rmQ{{\mathbf{Q}}}
\def\rmR{{\mathbf{R}}}
\def\rmS{{\mathbf{S}}}
\def\rmT{{\mathbf{T}}}
\def\rmU{{\mathbf{U}}}
\def\rmV{{\mathbf{V}}}
\def\rmW{{\mathbf{W}}}
\def\rmX{{\mathbf{X}}}
\def\rmY{{\mathbf{Y}}}
\def\rmZ{{\mathbf{Z}}}

% \erm<LETTER>: the upper-case letter typeset with \textnormal, one macro per
% letter A--Z. Presumably elements of random matrices -- confirm against usage.
\def\ermA{{\textnormal{A}}}
\def\ermB{{\textnormal{B}}}
\def\ermC{{\textnormal{C}}}
\def\ermD{{\textnormal{D}}}
\def\ermE{{\textnormal{E}}}
\def\ermF{{\textnormal{F}}}
\def\ermG{{\textnormal{G}}}
\def\ermH{{\textnormal{H}}}
\def\ermI{{\textnormal{I}}}
\def\ermJ{{\textnormal{J}}}
\def\ermK{{\textnormal{K}}}
\def\ermL{{\textnormal{L}}}
\def\ermM{{\textnormal{M}}}
\def\ermN{{\textnormal{N}}}
\def\ermO{{\textnormal{O}}}
\def\ermP{{\textnormal{P}}}
\def\ermQ{{\textnormal{Q}}}
\def\ermR{{\textnormal{R}}}
\def\ermS{{\textnormal{S}}}
\def\ermT{{\textnormal{T}}}
\def\ermU{{\textnormal{U}}}
\def\ermV{{\textnormal{V}}}
\def\ermW{{\textnormal{W}}}
\def\ermX{{\textnormal{X}}}
\def\ermY{{\textnormal{Y}}}
\def\ermZ{{\textnormal{Z}}}

% \v<letter>: the lower-case letter wrapped in \bm, plus \vzero, \vone, \vmu
% and \vtheta for 0, 1, \mu and \theta. Presumably notation for (deterministic)
% vectors -- confirm against usage.
\def\vzero{{\bm{0}}}
\def\vone{{\bm{1}}}
\def\vmu{{\bm{\mu}}}
\def\vtheta{{\bm{\theta}}}
\def\va{{\bm{a}}}
\def\vb{{\bm{b}}}
\def\vc{{\bm{c}}}
\def\vd{{\bm{d}}}
\def\ve{{\bm{e}}}
\def\vf{{\bm{f}}}
\def\vg{{\bm{g}}}
\def\vh{{\bm{h}}}
\def\vi{{\bm{i}}}
\def\vj{{\bm{j}}}
\def\vk{{\bm{k}}}
\def\vl{{\bm{l}}}
\def\vm{{\bm{m}}}
\def\vn{{\bm{n}}}
\def\vo{{\bm{o}}}
\def\vp{{\bm{p}}}
\def\vq{{\bm{q}}}
\def\vr{{\bm{r}}}
\def\vs{{\bm{s}}}
\def\vt{{\bm{t}}}
\def\vu{{\bm{u}}}
\def\vv{{\bm{v}}}
\def\vw{{\bm{w}}}
\def\vx{{\bm{x}}}
\def\vy{{\bm{y}}}
\def\vz{{\bm{z}}}

% \ev<name>: the plain (unstyled) math symbol in a brace group -- nine Greek
% letters first, then a--z. Presumably elements of the vectors written with
% the bold \v<letter> macros -- confirm against usage.
\def\evalpha{{\alpha}}
\def\evbeta{{\beta}}
\def\evepsilon{{\epsilon}}
\def\evlambda{{\lambda}}
\def\evomega{{\omega}}
\def\evmu{{\mu}}
\def\evpsi{{\psi}}
\def\evsigma{{\sigma}}
\def\evtheta{{\theta}}
\def\eva{{a}}
\def\evb{{b}}
\def\evc{{c}}
\def\evd{{d}}
\def\eve{{e}}
\def\evf{{f}}
\def\evg{{g}}
\def\evh{{h}}
\def\evi{{i}}
\def\evj{{j}}
\def\evk{{k}}
\def\evl{{l}}
\def\evm{{m}}
\def\evn{{n}}
\def\evo{{o}}
\def\evp{{p}}
\def\evq{{q}}
\def\evr{{r}}
\def\evs{{s}}
\def\evt{{t}}
\def\evu{{u}}
\def\evv{{v}}
\def\evw{{w}}
\def\evx{{x}}
\def\evy{{y}}
\def\evz{{z}}

% \m<LETTER>: the upper-case letter wrapped in \bm, A--Z, followed by the
% bold Greek symbols \mBeta, \mPhi, \mLambda and \mSigma. Presumably notation
% for (deterministic) matrices -- confirm against usage.
\def\mA{{\bm{A}}}
\def\mB{{\bm{B}}}
\def\mC{{\bm{C}}}
\def\mD{{\bm{D}}}
\def\mE{{\bm{E}}}
\def\mF{{\bm{F}}}
\def\mG{{\bm{G}}}
\def\mH{{\bm{H}}}
\def\mI{{\bm{I}}}
\def\mJ{{\bm{J}}}
\def\mK{{\bm{K}}}
\def\mL{{\bm{L}}}
\def\mM{{\bm{M}}}
\def\mN{{\bm{N}}}
\def\mO{{\bm{O}}}
\def\mP{{\bm{P}}}
\def\mQ{{\bm{Q}}}
\def\mR{{\bm{R}}}
\def\mS{{\bm{S}}}
\def\mT{{\bm{T}}}
\def\mU{{\bm{U}}}
\def\mV{{\bm{V}}}
\def\mW{{\bm{W}}}
\def\mX{{\bm{X}}}
\def\mY{{\bm{Y}}}
\def\mZ{{\bm{Z}}}
\def\mBeta{{\bm{\beta}}}
\def\mPhi{{\bm{\Phi}}}
\def\mLambda{{\bm{\Lambda}}}
\def\mSigma{{\bm{\Sigma}}}

% \mathsfit: a math alphabet taken from the sans-serif family (\sfdefault) in
% the current encoding -- medium series, slanted shape in the normal math
% version; bold-extended series, upright shape in the "bold" math version.
\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
% \tens{X}: X in \mathsfit, wrapped in \bm.
\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
% \t<LETTER>: shorthand for \tens{<LETTER>}, A--Z (presumably tensors --
% confirm against usage).
\def\tA{{\tens{A}}}
\def\tB{{\tens{B}}}
\def\tC{{\tens{C}}}
\def\tD{{\tens{D}}}
\def\tE{{\tens{E}}}
\def\tF{{\tens{F}}}
\def\tG{{\tens{G}}}
\def\tH{{\tens{H}}}
\def\tI{{\tens{I}}}
\def\tJ{{\tens{J}}}
\def\tK{{\tens{K}}}
\def\tL{{\tens{L}}}
\def\tM{{\tens{M}}}
\def\tN{{\tens{N}}}
\def\tO{{\tens{O}}}
\def\tP{{\tens{P}}}
\def\tQ{{\tens{Q}}}
\def\tR{{\tens{R}}}
\def\tS{{\tens{S}}}
\def\tT{{\tens{T}}}
\def\tU{{\tens{U}}}
\def\tV{{\tens{V}}}
\def\tW{{\tens{W}}}
\def\tX{{\tens{X}}}
\def\tY{{\tens{Y}}}
\def\tZ{{\tens{Z}}}


% \g<LETTER>: the upper-case letter in \mathcal, one macro per letter A--Z
% (presumably graphs or other calligraphic objects -- confirm against usage).
\def\gA{{\mathcal{A}}}
\def\gB{{\mathcal{B}}}
\def\gC{{\mathcal{C}}}
\def\gD{{\mathcal{D}}}
\def\gE{{\mathcal{E}}}
\def\gF{{\mathcal{F}}}
\def\gG{{\mathcal{G}}}
\def\gH{{\mathcal{H}}}
\def\gI{{\mathcal{I}}}
\def\gJ{{\mathcal{J}}}
\def\gK{{\mathcal{K}}}
\def\gL{{\mathcal{L}}}
\def\gM{{\mathcal{M}}}
\def\gN{{\mathcal{N}}}
\def\gO{{\mathcal{O}}}
\def\gP{{\mathcal{P}}}
\def\gQ{{\mathcal{Q}}}
\def\gR{{\mathcal{R}}}
\def\gS{{\mathcal{S}}}
\def\gT{{\mathcal{T}}}
\def\gU{{\mathcal{U}}}
\def\gV{{\mathcal{V}}}
\def\gW{{\mathcal{W}}}
\def\gX{{\mathcal{X}}}
\def\gY{{\mathcal{Y}}}
\def\gZ{{\mathcal{Z}}}

% \s<LETTER>: the upper-case letter in \mathbb (blackboard bold), presumably
% for sets -- confirm against usage.
% The list goes from \sD straight to \sF: no \sE is defined here (the reason
% is not visible in this file).
\def\sA{{\mathbb{A}}}
\def\sB{{\mathbb{B}}}
\def\sC{{\mathbb{C}}}
\def\sD{{\mathbb{D}}}
\def\sF{{\mathbb{F}}}
\def\sG{{\mathbb{G}}}
\def\sH{{\mathbb{H}}}
\def\sI{{\mathbb{I}}}
\def\sJ{{\mathbb{J}}}
\def\sK{{\mathbb{K}}}
\def\sL{{\mathbb{L}}}
\def\sM{{\mathbb{M}}}
\def\sN{{\mathbb{N}}}
\def\sO{{\mathbb{O}}}
\def\sP{{\mathbb{P}}}
\def\sQ{{\mathbb{Q}}}
\def\sR{{\mathbb{R}}}
\def\sS{{\mathbb{S}}}
\def\sT{{\mathbb{T}}}
\def\sU{{\mathbb{U}}}
\def\sV{{\mathbb{V}}}
\def\sW{{\mathbb{W}}}
\def\sX{{\mathbb{X}}}
\def\sY{{\mathbb{Y}}}
\def\sZ{{\mathbb{Z}}}

% \em<NAME>: the plain (unstyled) upper-case math symbol in a brace group:
% \Lambda, A--Z, \Sigma. Presumably entries of the matrices written with the
% bold \m<LETTER> macros -- confirm against usage.
\def\emLambda{{\Lambda}}
\def\emA{{A}}
\def\emB{{B}}
\def\emC{{C}}
\def\emD{{D}}
\def\emE{{E}}
\def\emF{{F}}
\def\emG{{G}}
\def\emH{{H}}
\def\emI{{I}}
\def\emJ{{J}}
\def\emK{{K}}
\def\emL{{L}}
\def\emM{{M}}
\def\emN{{N}}
\def\emO{{O}}
\def\emP{{P}}
\def\emQ{{Q}}
\def\emR{{R}}
\def\emS{{S}}
\def\emT{{T}}
\def\emU{{U}}
\def\emV{{V}}
\def\emW{{W}}
\def\emX{{X}}
\def\emY{{Y}}
\def\emZ{{Z}}
\def\emSigma{{\Sigma}}

% \etens{X}: X in the \mathsfit math alphabet, without the \bm wrapper that
% \tens adds.
\newcommand{\etens}[1]{\mathsfit{#1}}
% \et<NAME>: shorthand for \etens{<NAME>}: \Lambda and A--Z (presumably
% entries of tensors -- confirm against usage).
\def\etLambda{{\etens{\Lambda}}}
\def\etA{{\etens{A}}}
\def\etB{{\etens{B}}}
\def\etC{{\etens{C}}}
\def\etD{{\etens{D}}}
\def\etE{{\etens{E}}}
\def\etF{{\etens{F}}}
\def\etG{{\etens{G}}}
\def\etH{{\etens{H}}}
\def\etI{{\etens{I}}}
\def\etJ{{\etens{J}}}
\def\etK{{\etens{K}}}
\def\etL{{\etens{L}}}
\def\etM{{\etens{M}}}
\def\etN{{\etens{N}}}
\def\etO{{\etens{O}}}
\def\etP{{\etens{P}}}
\def\etQ{{\etens{Q}}}
\def\etR{{\etens{R}}}
\def\etS{{\etens{S}}}
\def\etT{{\etens{T}}}
\def\etU{{\etens{U}}}
\def\etV{{\etens{V}}}
\def\etW{{\etens{W}}}
\def\etX{{\etens{X}}}
\def\etY{{\etens{Y}}}
\def\etZ{{\etens{Z}}}

% Probability symbols: a base letter (p, P, \hat{p}, \hat{P} or \tilde{p})
% with an upright word as subscript.
% \mathrm{...} replaces the obsolete font switch \rm; the printed result is
% unchanged.
\newcommand{\pdata}{p_{\mathrm{data}}}
\newcommand{\ptrain}{\hat{p}_{\mathrm{data}}}
\newcommand{\Ptrain}{\hat{P}_{\mathrm{data}}}
\newcommand{\pmodel}{p_{\mathrm{model}}}
\newcommand{\Pmodel}{P_{\mathrm{model}}}
\newcommand{\ptildemodel}{\tilde{p}_{\mathrm{model}}}
\newcommand{\pencode}{p_{\mathrm{encoder}}}
\newcommand{\pdecode}{p_{\mathrm{decoder}}}
\newcommand{\precons}{p_{\mathrm{reconstruct}}}

% Upright word "Laplace".
\newcommand{\laplace}{\mathrm{Laplace}} 

% Single-symbol shorthands. The macro names suggest their intended meaning
% (expectation, loss, reals, learning rate, ...), but only the printed symbol
% is fixed here -- confirm the meaning against usage.
\newcommand{\E}{\mathbb{E}}
\newcommand{\Ls}{\mathcal{L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\emp}{\tilde{p}}
\newcommand{\lr}{\alpha}
\newcommand{\reg}{\lambda}
\newcommand{\rect}{\mathrm{rectifier}}
\newcommand{\softmax}{\mathrm{softmax}}
\newcommand{\sigmoid}{\sigma}
\newcommand{\softplus}{\zeta}
\newcommand{\KL}{D_{\mathrm{KL}}}
\newcommand{\Var}{\mathrm{Var}}
\newcommand{\standarderror}{\mathrm{SE}}
\newcommand{\Cov}{\mathrm{Cov}}
% L with superscript 0, 1, 2, p, \infty.
\newcommand{\normlzero}{L^0}
\newcommand{\normlone}{L^1}
\newcommand{\normltwo}{L^2}
\newcommand{\normlp}{L^p}
\newcommand{\normmax}{L^\infty}

% NOTE(review): expands to the bare letters "Pa"; in math mode these are set
% as two italic variables P and a.
\newcommand{\parents}{Pa} 

% Starred form: in display style, subscripts are set underneath the operator.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Unstarred form: subscripts stay at the side.
\DeclareMathOperator{\sign}{sign}
\DeclareMathOperator{\Tr}{Tr}
% \ab: short alias for \allowbreak.
\let\ab\allowbreak
% Bibliography: natbib with the plainnat style.
 \usepackage{natbib} \bibliographystyle{plainnat}
% Print the reference list under an unnumbered \subsubsection heading.
\renewcommand{\bibsection}{\subsubsection*{References}}


\usepackage{url}
\usepackage[utf8]{inputenc}
\usepackage{etoolbox}
\usepackage{amsmath, amsthm}
\usepackage{amsfonts, amssymb}
\usepackage{bm}
\usepackage{ifthen}
\usepackage{letltxmacro}
\usepackage{nicefrac}
\usepackage{xstring}
\usepackage{pgf}
\usepackage{tikz}
\usetikzlibrary{arrows,automata}
\usetikzlibrary{er,positioning,bayesnet}
\usepackage[linesnumbered,ruled]{algorithm2e}
% From here on \mathbf{X} expands to \bm{X}. Macros defined earlier with \def
% and a \mathbf in their body pick this up as well, because the body is only
% expanded when the macro is used.
\renewcommand{\mathbf}[1]{\bm{#1}}
\usepackage{mathrsfs}
\usepackage{graphicx}
\usepackage{framed}
% Author notes. These rely on \color, which is not loaded explicitly in this
% file (presumably provided via tikz/xcolor -- confirm).
% \todot{<text>}: bold dark-red "TODO: <text>".
\newcommand{\todot}[1]{\textbf{\color{red!75!black}TODO: #1}}
\usepackage{todo}
% \attention{<text>}: <text> in dark blue; \fixme{<text>}: <text> in dark red.
\newcommand{\attention}[1]{{\color{blue!70!black} #1}}
\newcommand{\fixme}[1]{{\color{red!80!black} #1}}
\usepackage{booktabs,siunitx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{cleveref}
% \arxiv{<id>}: link to https://arxiv.org/abs/<id>, printed as "arXiv:<id>".
% \biorxiv{<id>}: link to https://doi.org/10.1101/<id>, printed as
% "bioRxiv:<id>" with a math-mode \chi standing in for the "x".
% Both need \href, which is not loaded explicitly in this file (presumably
% hyperref comes from the document class -- confirm).
\newcommand{\arxiv}[1]{\href{https://arxiv.org/abs/#1}{arXiv:#1}}
\newcommand{\biorxiv}[1]{\href{https://doi.org/10.1101/#1}{bioR$\chi{}$iv:#1}}
% Theorem-like environments. `theorem` is numbered within each section; every
% other environment below shares the `theorem` counter, so all of them are
% numbered in one common sequence.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{observation}[theorem]{Observation}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{condition}[theorem]{Condition}
\newtheorem{example}[theorem]{Example}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{consequence}[theorem]{Consequence}
\newtheorem{question}[theorem]{Question}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{problem}[theorem]{Problem}
% Names printed by \cref for each label type: {type}{singular}{plural}.
% All names are capitalised, so \cref prints "Theorem 1" mid-sentence as well.
\crefname{theorem}{Theorem}{Theorems}
\crefname{observation}{Observation}{Observations}
\crefname{claim}{Claim}{Claims}
\crefname{algorithm}{Algorithm}{Algorithms}
\crefname{condition}{Condition}{Conditions}
\crefname{example}{Example}{Examples}
\crefname{fact}{Fact}{Facts}
\crefname{lemma}{Lemma}{Lemmas}
\crefname{corollary}{Corollary}{Corollaries}
\crefname{consequence}{Consequence}{Consequences}
% `question` is declared with \newtheorem alongside the other environments but
% had no name here, so a \cref to a question label had no name to print.
\crefname{question}{Question}{Questions}
\crefname{definition}{Definition}{Definitions}
\crefname{remark}{Remark}{Remarks}
\crefname{problem}{Problem}{Problems}
\crefname{proposition}{Proposition}{Propositions}
\crefname{section}{Section}{Sections}
% Appendix sections are referred to as supplementary-material sections.
\crefname{appendix}{Supplementary Material Section}{Supplementary Material Sections}

% \abs{x}: |x| with auto-sized bars.
\newcommand{\abs}[1]{\ensuremath{\left|#1\right|}}
% \norm[p]{x}: ||x||_p; the optional subscript defaults to empty.
\newcommand{\norm}[2][]{\ensuremath{\Vert #2 \Vert_{#1}}}
% \diff{f}{x}: ordinary derivative df/dx. The differential "d" is set with
% \mathrm so that it stays upright everywhere; the previous \text{d} took on
% the surrounding text font and became italic inside italic text such as
% theorem bodies.
\newcommand{\diff}[2]{\frac{\mathrm{d}#1}{\mathrm{d}#2}}
% \pdiff{f}{x}: partial derivative.
\newcommand{\pdiff}[2]{\frac{\partial #1}{\partial #2}}
% \grad{f}: nabla applied to a parenthesised argument.
\newcommand{\grad}[1]{\nabla\left(#1\right)}
% \Pr[sub]{event}: replaces the standard \Pr operator. Prints a blackboard P
% with optional subscript, followed by the event in auto-sized square brackets.
\renewcommand{\Pr}[2][]{\ensuremath{\mathbb{P}_{#1}\insq{#2}}}
% Upright "Po".
\newcommand{\Po}[0]{\ensuremath{\mathrm{Po}}}
% Auto-sized bracket pairs: \ina <...>, \inb {...}, \inp (...), \insq [...].
\newcommand{\ina}[1]{\left<#1\right>}
\newcommand{\inb}[1]{\left\{#1\right\}}
\newcommand{\inp}[1]{\left(#1\right)}
\newcommand{\insq}[1]{\left[#1\right]}
% \defeq prints ":=" and \eqdef prints "=:". The colon is assembled from two
% \cdot symbols, one raised and one lowered by 0.3ex and overlapped via \rlap,
% and placed inside \mathrel. \makeatletter is needed because the bodies use
% the internal macro \m@th.
\makeatletter
\newcommand*{\defeq}{\mathrel{\rlap{\raisebox{0.3ex}{$\m@th\cdot$}}\raisebox{-0.3ex}{$\m@th\cdot$}}=}
\newcommand*{\eqdef}{=
  \mathrel{\rlap{\raisebox{0.3ex}{$\m@th\cdot$}}\raisebox{-0.3ex}{$\m@th\cdot$}}}
\makeatother
% \nfrac[font]{a}{b} and \xfrac[font]{a}{b}: two names for the same thing;
% both forward their arguments unchanged to \nicefrac.
\newcommand{\nfrac}[3][]{\nicefrac[#1]{#2}{#3}}
\newcommand{\xfrac}[3][]{\nicefrac[#1]{#2}{#3}}
% \deg{v}: replaces the standard argument-less \deg operator with a
% one-argument form that prints an upright "deg" followed by (v).
\renewcommand{\deg}[1]{\ensuremath{\mathrm{deg}\inp{#1}}}
% Blackboard-bold Z.
\newcommand{\Z}[0]{\ensuremath{\mathbb{Z}}}
% \I{x}: x in auto-sized square brackets.
\newcommand{\I}[1]{\ensuremath{\insq{#1}}}
% \F{q}: blackboard-bold F with subscript q.
\newcommand{\F}[1]{\ensuremath{\mathbb{F}_{#1}}}
% Blackboard-bold N (no \ensuremath here: math mode only).
\newcommand{\N}[0]{\mathbb{N}}
% \emptyset now prints \varnothing.
\renewcommand{\emptyset}[0]{\varnothing}
% \comp{X}: X with an overline.
\newcommand{\comp}[1]{\overline{#1}}
% \poly{n}: upright "poly" as an operator, followed by (n).
\newcommand{\poly}[1]{\ensuremath{\mathop{\mathrm{poly}}\inp{#1}}}
% Emphasised "et al.".
\newcommand{\etal}[0]{\emph{et al.}}
% \vec{x} now expands to \mathbf{x} instead of drawing an arrow accent.
\renewcommand{\vec}[1]{\mathbf{#1}}
% \T: the \intercal symbol (presumably used as a transpose mark -- confirm).
\newcommand{\T}[0]{\ensuremath{\intercal}}
% The word "skeleton" set as text inside math.
\newcommand{\skel}[0]{\ensuremath{\text{skeleton}}}
% \countAMO{G}: upright "#AMO" followed by (G).
\newcommand{\countAMO}[1]{\ensuremath{\textup{\#AMO}\inp{#1}}}

% \undir{u}{v}: prints "u - v".
\newcommand{\undir}[2]{\ensuremath{{#1} - {#2}}}

\hypersetup{hidelinks} 
\title{Counting Background Knowledge Consistent Markov Equivalent Directed   Acyclic Graphs }

\author[1]{Vidya Sagar Sharma}
\affil[1]{School of Technology and Computer Science\\
  Tata Institute of Fundamental Research, Mumbai\\
  Maharashtra, India
}


% Margin flags: \fix prints "FIX" and \new prints "NEW" in the margin.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% Placeholder metadata (literal "MM", "YYYY" and an OpenReview URL with a
% dummy id).
% NOTE(review): \month and \year are TeX's built-in date registers; after
% these \defs they are plain text macros, so anything that reads them as
% numbers (e.g. \today in the standard classes) would fail. Confirm whether
% the document class actually needs these definitions.
\def\month{MM}  \def\year{YYYY} \def\openreview{\url{https://openreview.net/forum?id=XXXX}} 


\begin{document}


\maketitle

\begin{abstract}
  We study the problem of counting the number of directed acyclic graphs in a
  Markov equivalence class (MEC) that are consistent with \emph{background
    knowledge} specified in the form of the directions of some additional edges
  in the MEC.  A polynomial-time algorithm for the special case of the problem, when no background knowledge constraints are specified, was given by Wienöbst,
  Bannach, and Liśkiewicz (AAAI 2021), who also showed that the general case is
  NP-hard (in fact, \#P-hard).  In this paper, we show that the problem is
  nevertheless tractable in an interesting class of instances, by establishing
  that it is ``fixed-parameter tractable'': we give an algorithm that runs in
  time $O(k! k^2 n^4)$, where $n$ is the number of nodes in the MEC and $k$ is the
  maximum number of nodes in any maximal clique of the MEC that participate in
  the specified background knowledge constraints.  In particular, our algorithm
  runs in polynomial time in the well-studied special case of MECs of bounded
  tree-width or bounded maximum clique size.
\end{abstract}


 \section{Introduction}
\label{sec:introduction}
A graphical model is a combinatorial tool for expressing dependencies between random variables. Both directed and undirected versions have been used in the literature for modeling different kinds of dependency structures.  We study graphical models represented by directed acyclic graphs (DAGs), which represent conditional independence relations and causal influences between random variables by directed edges~\citep{Pearl2009}. Such graphical models have been used extensively for modeling causal relationships across several fields, e.g., material science~\citep{ren_embedding_2020}, game theory~\citep{kearns2013graphical}, and biology~\citep{friedman2004inferring,finegold2011robust}.

It is well known that given access only to observational data, the causal DAG underlying a system can only be determined up to its ``Markov equivalence class'' (MEC)~\citep{verma1991equivalence,meek2013causal,chickering_causal_1995}.  Two DAGs are said to be in the same MEC if they model exactly the same set of conditional independence relations between the underlying random variables. Distinguishing between two DAGs in the same MEC requires the use of interventional data~\citep{HB12}.

Finding the size of an MEC, therefore, becomes a question of key interest. In particular, the size of an MEC quantifies the uncertainty of the causal model given only observational data.  The problem, along with a proposed algorithm, was already mentioned by \citet[Section 4.1]{meek2013causal}, and has since then been the focus of a long line of work~\citep{madigan1996bayesian,he2015counting,he2016formulas,bernstein2017sampling,ghassami2019counting,talvitie2019counting,ganian2020efficient}, which culminated in a polynomial time algorithm for the problem by \citet{wienobst2020polynomial}.

\paragraph{Counting with background knowledge constraints} In applications, more
information about the directions of edges in the underlying DAG than that
encoded in the MEC may be available, for example, due to access to
domain-specific knowledge.  \citet{meek2013causal} referred to this as
\emph{background knowledge} and modeled it as a specification of the directions
of some of the edges of the underlying DAG. Thus, instead of finding the size of
the whole MEC, one becomes interested in counting those DAGs in the MEC that are
consistent with this specified background knowledge.  An algorithm for this
problem can also be used as an indicator of the efficacy of a particular
intervention in pinning down a DAG within an MEC, by measuring the ratio between
the size of the MEC and the number of those DAGs in the MEC that are consistent
with the extra background information yielded by the intervention.
\citet{wienobst2020polynomial} showed, however, that in general, this problem is
\#P-complete (i.e., as hard as counting satisfying assignments to a Boolean
formula). Our goal in this paper is to circumvent this hardness result when the
specified background knowledge has some special structure.

\subsection{Our Contributions}
We now formalize the above problem.  Our input is an MEC on $n$ nodes, and we
are given also the directions of another $s$ edges that are not directed in the
MEC: we refer to the set of these edges as the \emph{background knowledge},
denoted $\mathcal{K}$. Our goal is to count the number of DAGs in the input MEC
that are consistent with the background knowledge $\mathcal{K}$. The above
quoted result of \cite{wienobst2020polynomial} implies that (under the standard
P $\neq$ NP assumption) there \emph{cannot} be an algorithm for solving this
problem whose run-time is polynomial in both $n$ and $s$.

\paragraph{Main result}
The main conceptual contribution of this paper is to
define the following parameter which lets us identify important special
instances of the problem where we can circumvent the above hardness result.
Given the set $\mathcal{K}$ of background knowledge edges, we define the
\emph{max-clique-knowledge} $k$ of $\mathcal{K}$ to be the maximum number of
vertices in any clique in the input MEC that are part of a background knowledge
edge that lies completely inside that clique.  In particular, $k$ can be at most
twice $s$, but it can also be much smaller.  Our main result
(\cref{thm:main-result}) is an algorithm that counts the number of DAGs in the
MEC that are consistent with $\mathcal{K}$, and runs in time
$O(k! \cdot k^2 \cdot n^4)$.


\paragraph{Discussion and evaluation} In particular, the runtime of our
algorithm is polynomially bounded when the parameter $k$ above is bounded above
by a constant, \emph{even if} the actual size $s$ of the background knowledge is
very large.  For example, since $k$ is bounded above by the size of the largest
clique in the input MEC, it follows that our algorithm runs in polynomial time
in the well-studied special case (see, e.g., \citealp{talvitie2019counting}) when
the input MEC has constant tree-width (and hence constant maximum-clique
size). We provide an empirical exploration of the run-time of our algorithm,
including of the phenomenon that it depends on $\mathcal{K}$ only through $k$
and not through $s$, in \cref{sec:experimental_evaluation}.














\subsection{Related Work}
It is well known that an MEC can be represented as a partially directed graph,
known as an \emph{essential graph}, with special graph theoretic
properties~\citep{verma1991equivalence,meek2013causal,chickering_causal_1995,andersson1997characterization}. Initial
approaches to the problem of computing the size of an MEC
by~\citet{meek2013causal} and \cite{madigan1996bayesian} built upon ideas
underlying this characterization. More recently, \cite{he2015counting} evaluated
a heuristic based on the partition of essential graphs into chordal
components. \citet{ghassami2019counting} gave an algorithm that runs in
polynomial time for constant degree graphs, but where the degree of this
polynomial \emph{grows} with the maximum degree of the graph.


Our algorithm can be seen as an example of a \emph{fixed parameter tractable}
(FPT) algorithm in the well-studied framework of parameterized complexity
theory~\citep{cygan_parameterized_2015}.  Parameterized complexity offers an
approach to attack computationally hard problems (e.g., those that are NP-hard or
\#P-hard) by separating the complexity of solving the problem into two pieces --
a part that depends purely on the size of the input, and a part that depends
only on a well-chosen ``parameter'' $\rho$ of the problem.  An FPT algorithm for
a computationally hard problem has a runtime that is bounded above by
$f(\rho)\poly{n}$, where the degree of the polynomial does \emph{not} depend
upon the parameter $\rho$, but where the function $f$ (which does not depend
upon the input size $n$) may potentially be exponentially growing. In our setting, the underlying parameter is the max-clique-knowledge of the background
knowledge $\mathcal{K}$.

Thus, the algorithm of \cite{ghassami2019counting} cited above is \emph{not} an
FPT algorithm.  However, \citet{talvitie2019counting} improved upon it by giving
an FPT algorithm for computing the size of an MEC
(without background knowledge): for an undirected essential graph on $n$ nodes
whose maximum clique is of size $c$, their algorithm runs in time
$\mathcal{O}(c!2^c c^2n)$.  Finally, \citet{wienobst2020polynomial} presented the first polynomial time
algorithm for computing the size of any MEC (again, without background
knowledge).  As discussed above, they also showed that counting Markov
equivalent DAGs consistent with specified background knowledge is, in general,
\#P-complete.

We are not aware of any progress towards circumventing this hardness result by
imposing specific properties on the specified background knowledge, and to the
best of our knowledge, this paper is the first to give a fixed parameter
tractable algorithm for the problem. Our algorithm is motivated by the
techniques developed by \citet{wienobst2020polynomial}.

 \section{Preliminaries}
\label{sec:preliminaries}
We mostly follow the terminology and notation used by
\cite{andersson1997characterization} and \cite{wienobst2020polynomial} for
notions such as \textbf{\emph{graph unions}, \emph{chain graphs},
  \emph{directed} and \emph{undirected} graphs, \emph{skeletons}},
\textbf{v-\emph{structures}}, \textbf{\emph{cliques, separators, chordal graphs, undirected connected chordal
    graphs}} (which we will typically denote using the abbreviation
  \textbf{UCCG}), and \textbf{\emph{clique trees}}.  For completeness, we
  provide detailed definitions in the
Supplementary Material.

\textbf{Notations. } A graph $G$ is a pair $(V, E)$, where $V$ is said to be the set of vertices of $G$, and $E\subseteq V\times V$ is said to be the set of edges of $G$. For $u,v \in V$, if $(u,v),(v,u) \in E$ then we say there is an undirected edge between $u$ and $v$, denoted as $u-v$. For $u,v \in V$, if $(u,v) \in E$, and $(v,u)\notin E$ then we say there is a directed edge from $u$ to $v$, denoted as $u\rightarrow v$. For a graph $G$, we denote by $V_G$ the set of vertices of $G$, and by $E_G$ the set of edges of $G$. A clique is a set of pairwise adjacent vertices. We denote the set of all maximal cliques of $G$ by $\Pi(G)$.
For a set $X$, we denote by $\# X$ (and sometimes by $|X|$), the size of
$X$.











\textbf{Markov equivalence classes.} A causal DAG encodes a set of conditional independence relations between random variables represented by its vertices. Two DAGs are said to belong to the same \emph{Markov equivalence class} (MEC) if both encode the same set of conditional independence relations. \citet{verma1991equivalence} showed that two DAGs are in the same MEC if, and only if, both have (i) the same skeleton and (ii) the same set of v-structures.  An MEC can be represented by the graph union of all DAGs in it. \citet{andersson1997characterization} show that a partially directed graph representing an MEC is a chain graph whose undirected connected components (i.e., undirected connected components formed after removing the directed edges) are chordal graphs. We refer to these undirected connected components as \emph{chordal components} of the MEC.  With a slight abuse of terminology, we identify the chain graph with chordal components that represents an MEC with the MEC itself, and refer to both as an ``MEC''.

\textbf{AMO.} Given a partially directed graph $G$, an \emph{orientation} of $G$ is obtained by assigning a direction to each undirected edge of $G$.  Following \citet{wienobst2020polynomial}, we call an orientation of $G$ an \emph{acyclic moral orientation} (AMO) of $G$ if (i) it does not contain any directed cycles, and (ii) it has the same set of v-structures as $G$. For an MEC $G$, we denote the set of AMOs of $G$ by AMO($G$).

\textbf{PEO and LBFS orderings.} For an undirected graph, a linear ordering $\tau$ of its vertices is said to be a \emph{perfect elimination ordering} (PEO) of the graph if for each vertex $v$, the neighbors of $v$ that occur after $v$ form a clique. A graph is chordal if, and only if, it has a PEO \citep{fulkerson1965incidence}. \citet{rose1976algorithmic} gave a \emph{lexicographical breadth-first-search} (LBFS) algorithm to find a PEO of a chordal graph (see \cref{alg:AMO-Union-K}  below for a modified version of LBFS).  Any ordering of vertices that can be returned by the LBFS algorithm is said to be an \emph{LBFS ordering}.  








\textbf{Representation of an AMO.}
\label{preliminaries:representation-of-an-AMO}
Given a linear ordering $\tau$ of the vertices of a graph $G$, an AMO of $G$ is said
to be \emph{represented} by $\tau$ if for every edge $u \rightarrow v$ in the
AMO, $u$ precedes $v$ in $\tau$.  Every LBFS ordering of a UCCG $G$ represents
a unique AMO of $G$, and every AMO of a UCCG $G$ is represented by an LBFS
ordering of $G$ (Corollary 1, and Lemma 2 of
\citet{wienobst2020polynomial}). For a maximal clique $C$ of $G$, we say that
\emph{$C$ represents an AMO $\alpha$ of $G$} if there exists an LBFS ordering that
starts with $C$ and represents $\alpha$. Similarly, for a permutation $\pi(C)$ of a
maximal clique $C$ of $G$, we say $\pi(C)$ represents $\alpha$ if
there exists an LBFS ordering that starts with $\pi(C)$ and represents $\alpha$. We denote by AMO$(G,\pi(C))$ and AMO$(G, C)$, the set of AMOs of $G$ that can be represented by $\pi(C)$ and $C$, respectively.

\textbf{Canonical representation of AMO.} For an AMO $\alpha$ of a UCCG $G$,
\cite{wienobst2020polynomial} define a unique clique that represents
$\alpha$. We denote the clique as $C_{\alpha}$, and say that $C_{\alpha}$
\emph{canonically} represents $\alpha$.  For the purposes of this paper, we only
need certain properties of the canonical representative $C_\alpha$, and these
are quoted in \cref{lem:identification-of-C-alpha}. However, for completeness,
we also give the definition of $C_{\alpha}$ in the Supplementary Material.

\begin{figure}[h!]
    \centering
    % Two example graphs side by side (labelled "MEC 1" and "MEC 2") and,
    % below them, a hand-positioned table listing four background-knowledge
    % sets with their max-clique knowledge values.
    \begin{tikzpicture}
        % MEC 1: vertices a--f.
        \node[scale = 0.9](a){$a$};
        \node[scale = 0.9](b)[below left=0.5 and 0.5 of a] {$b$};
        \node[scale = 0.9](c)[below right=0.5 and 0.5 of a] {$c$};
        \node[scale = 0.9](d)[below =0.5 of c]{$d$};
        \node[scale = 0.9](e)[below left=0.5 and 0.5 of d] {$e$};
        \node[scale = 0.9](f)[below right=0.5 and 0.5 of d] {$f$};
        
        % MEC 1 edges: plain lines are undirected, arrows are directed.
        \draw[-](a)--(b);
        \draw[->](a)--(c);
        \draw[->](b)--(c);
        \draw[<-](c)--(d);
        \draw[-](d)--(e);
        \draw[-](d)--(f);
        \draw[-](e)--(f);

        % MEC 2: vertices 1--7, placed to the right of vertex a.
        \node[scale = 0.9](1)[right = 3.0 of a]{1};
        \node[scale = 0.9](2)[right=1.0 of 1]{2};
        \node[scale = 0.9](3)[below = 1.0 of 1]{3};
        \node[scale = 0.9](4)[right = 1.0 of 3]{4};
        \node[scale = 0.9](5)[below = 1.0 of 3]{5};
        \node[scale = 0.9](6)[right = 1.0 of 5]{6};
        \node[scale = 0.9](7)[below = 1.0 of 6]{7};
        % MEC 2 edges (all undirected).
        \draw[-](1)--(2){};
        \draw[-](1)--(3){};
        \draw[-](1)--(4){};
        \draw[-](2)--(3){};
        \draw[-](2)--(4){};
        \draw[-](3)--(4){};
        \draw[-](3)--(5){};
        \draw[-](3)--(6){};
        \draw[-](4)--(5){};
        \draw[-](4)--(6){};
        \draw[-](5)--(6){};
        \draw[-](5)--(7){};
        \draw[-](6)--(7){};
        % Titles above the two graphs.
        \node[scale = 0.9](mec1)[above =0.1 of a]{MEC 1};
        \node[scale = 0.9](mec2)[above right =0.1 and -0.1 of 1]{MEC 2};





        
        % Table header row, positioned relative to vertex 7.
        \node[scale =0.9](table-11)[below left=0.5 and 1.5 of 7]{Background Knowledge};
        \node[scale = 0.9](table-12)[right =0.0 of table-11]{Max-clique Knowledge};
        % Rows for Examples 1 and 2: row label, edge set, value.
        \node[scale = 0.9](eg-11)[below left= 0.1 and 3.5 of table-12]{Example 1};
        \node[scale = 0.9](bkeg-11)[right=0.0 of eg-11]{$\{a\rightarrow b$, $e\rightarrow d$, $f\rightarrow d\}$};
        \node[scale = 0.9](ckeg-11)[right=5.0 of eg-11]{3};
        \node[scale = 0.9](eg-12)[below=0.1 of eg-11]{Example 2};
        \node[scale = 0.9](bkeg-12)[right=0.0 of eg-12]{$\{a\rightarrow b$, $e\rightarrow d\}$};
        \node[scale = 0.9](ckeg-12)[right=5.0 of eg-12]{2};

% Rows for Examples 3 and 4; the edge set of Example 4 continues on a second
% line (node bkeg-221).
\node[scale = 0.9](eg-21)[below = 0.1 of eg-12]{Example 3};
        \node[scale = 0.9](bkeg-21)[right=0.0 of eg-21]{$\{1\rightarrow 2$, $3\rightarrow 6\}$};
        \node[scale = 0.9](ckeg-21)[right=5.0 of eg-21]{2};

        \node[scale = 0.9](eg-22)[below = 0.1 of eg-21]{Example 4};
        \node[scale = 0.9](bkeg-22)[right=0.0 of eg-22]{$\{1\rightarrow 2$, $2\rightarrow 3$, $1\rightarrow 3$,};
        \node[scale = 0.9](bkeg-221)[below=0.1 of bkeg-22]{$3\rightarrow 6$, $4\rightarrow 6$, $6\rightarrow 7\}$};
        \node[scale = 0.9](ckeg-22)[right=5.0 of eg-22]{3};


    \end{tikzpicture}
    \caption{Background Knowledge and Max-clique Knowledge:  In MEC 1, there are 2 maximal cliques $\{a,b\}$ and $\{d,e,f\}$. In MEC 2, there are 3 maximal cliques $\{1,2,3,4\}$, $\{3,4,5,6\}$ and $\{5,6,7\}$. Examples 1 and 2 are for MEC 1, and examples 3 and 4 are for MEC 2. In example 1, for the maximal clique $\{d,e,f\}$, $d$ and $e$ are part of the edge $e\rightarrow d$, and $f$ is part of the edge $f\rightarrow d$, i.e., every node of the clique is part of a background knowledge edge both of whose endpoints are inside the clique. By the definition of clique-knowledge, the clique-knowledge of the clique $\{d,e,f\}$ is therefore 3. Similarly, the clique-knowledge of the maximal clique $\{a,b\}$ is 2. Hence the max-clique-knowledge of the background knowledge in example 1 for MEC 1 is 3. Similarly, the max-clique-knowledge of the background knowledge in example 2 for MEC 1 is 2, and the max-clique-knowledge of the background knowledge in examples 3 and 4 for MEC 2 is 2 and 3, respectively.}
    \label{fig:background-knowledge-andclique-knowledge}
\end{figure} 
\textbf{Background Knowledge and Clique-knowledge.}
\label{def:background-knowledge-and-clique-knowledge}
For an MEC represented by a
partially directed graph $G$, \emph{background knowledge}\footnote{ Another way
  to represent background knowledge is by a maximally partially directed acyclic
  graph (MPDAG). We do not use MPDAGs in our paper because our run time
  depends only on the number of directed background knowledge edges that
  generate the MPDAG, and not on the (possibly much larger) number of directed
  edges in the MPDAG.} is specified as a set of directed edges
$\mathcal{K} \subseteq E_G$ \citep{meek2013causal}. A graph $G$ is said to be
consistent with $\mathcal{K}$ if, for any edge
$u \rightarrow v \in \mathcal{K}$, $v \rightarrow u \notin E_G$ (i.e., either $u-v\in E_G$ or $u\rightarrow v \in E_G$).  For a clique
$C$ of $G$, \emph{clique-knowledge} of $\mathcal{K}$ for $C$ is defined as the
number of vertices of $C$ that are part of a background knowledge edge both of whose endpoints are in $C.$
\emph{Max-clique-knowledge} of $\mathcal{K}$ for $G$ is the maximum value of
clique-knowledge over all cliques of $G$.

We denote the set of AMOs of $G$ consistent with background knowledge $\mathcal{K}$ by
AMO($G,{\mathcal{K}}$). Further, for any clique $C$ and any permutation $\pi(C)$
of the vertices of $C$, we denote by AMO$(G,\pi(C),\mathcal{K})$ and
AMO$(G,C,\mathcal{K})$, respectively, the set of $\mathcal{K}$-consistent AMOs
of $G$ that can be represented by $\pi(C)$ and
$C$.






 \section{Main Result}
\label{sec:main-result}
\citet{andersson1997characterization} show that a DAG is a member of an MEC $G$ if, and only if, it is an AMO of $G$. Thus, counting the number of DAGs in the MEC represented by $G$ is equivalent to counting AMOs of $G$.
We start with a formal description of the algorithmic problem we address in this paper.
\begin{problem}[\textbf{Counting AMOs with Background Knowledge}] \label{prob:countingAMO}
\textbf{INPUT:} (a) An MEC $G$ (in the form of a chain graph with chordal undirected components), (b) Background  Knowledge $\mathcal{K}\subseteq E_G$.

  \noindent \textbf{OUTPUT:} Number of DAGs in the MEC $G$ that are consistent  with $\mathcal{K}$, i.e., \#AMO($G, {\mathcal{K}}$).
\end{problem}

As discussed in the introduction, a polynomial time algorithm for the special
case of this problem where $\mathcal{K}$ is empty was given by
\citet{wienobst2020polynomial}, in the culmination of a long line of work on
that case. However, a hardness result proved
by \citet{wienobst2020polynomial}
implies, under the standard $P \neq NP$ assumption, that there cannot
be an algorithm for \cref{prob:countingAMO} that runs in time polynomial in both
$n$ and $\abs{\mathcal{K}}$.

We, therefore, ask: what are the other interesting cases of the problem which admit an efficient solution?  We answer this question with the following result.

\begin{theorem}[Main result]\label{thm:main-result}
  There is an algorithm for \cref{prob:countingAMO} which outputs
  $\#AMO(G, \mathcal{K})$ in time $O(k!k^2n^4)$, where $k$ is the max-clique-knowledge value of $\mathcal{K}$ for $G$, and $n$ is the number of vertices in $G$.
\end{theorem}
 
In the formalism of parameterized algorithms \citep{cygan_parameterized_2015}, 
the above result says that the problem of counting AMOs that are consistent with background knowledge is \emph{fixed parameter tractable}, with the parameter being the max-clique-knowledge of the background knowledge.
This contrasts with the \#P-hardness result of \citet{wienobst2020polynomial} for the problem. 

The starting point of our algorithm is a standard reduction to the following special case of the problem.
\begin{problem}[\textbf{Counting Background Consistent AMOs in chordal
    graphs}] \label{prob:countingAMOofUCCG} \textbf{INPUT:} (a) An undirected
  connected chordal graph (UCCG) $G$, (b) Background Knowledge
  $\mathcal{K}\subseteq E_G$.

  \noindent \textbf{OUTPUT:} Number of AMOs of $G$ that are consistent with
  $\mathcal{K}$, i.e., \#AMO($G, {\mathcal{K}}$).
\end{problem}

\begin{proposition}
  \label{lem:Counting-AMO-reduction}
  Let $G$ be an MEC, and let $\mathcal{K} \subseteq E_G$ be background knowledge consistent with $G$. Then,
  \#AMO$(G,\mathcal{K}) = \prod_{H}{\#\text{AMO}(H,\mathcal{K}[H])},$
  where the product ranges over all undirected connected chordal components $H$
  of the MEC $G$, and
  $\mathcal{K}[H] = \{u\rightarrow v: u,v\in H \text{, and } u\rightarrow v\in
  \mathcal{K} \}$ is the corresponding background knowledge for $H$.
\end{proposition}
\cref{lem:Counting-AMO-reduction} reduces \cref{prob:countingAMO} to
\cref{prob:countingAMOofUCCG}.  The proof of \cref{lem:Counting-AMO-reduction}
follows directly from standard arguments (a similar reduction to chordal
components of an MEC has been used in many previous works), and is given in the
Supplementary Material. 

The rest of the paper is devoted to providing an efficient algorithm for \cref{prob:countingAMOofUCCG}. Our strategy builds upon the recursive framework developed by \citet{wienobst2020polynomial}.  In the first step, in \cref{sec:reduction-of-the-problem}, we modify the LBFS algorithm presented by \citet{wienobst2020polynomial} to make it background aware. This algorithm is used to generate smaller instances of the problem recursively.  Further, we modify the recursive algorithm of \citet{wienobst2020polynomial}  to use this new LBFS algorithm.  While building upon prior work, both steps require new ideas to take care of the background information.  In particular, it is a careful accounting of the background knowledge $\mathcal{K}$ that requires the $k!$ factor in the runtime, where $k$ is the max-clique-knowledge of the background knowledge.
In \cref{sec:time-complexity}, we analyze the time complexity of the resulting algorithm. Finally, \cref{sec:experimental_evaluation} shows our experimental results.  Due to space constraints, proofs of many lemmas and theorems are provided in the Supplementary Material. For most of the important results, we provide the proof sketch in the main text. 




 \section{The Algorithm}
\label{sec:reduction-of-the-problem}
In this section, we give an FPT algorithm \cref{alg:Count-AMO} that solves \cref{prob:countingAMOofUCCG} using a parameter ``max-clique knowledge'' (\cref{sec:preliminaries}). \cref{alg:Count-AMO} is a background aware version of Algorithm 2 of \cite{wienobst2020polynomial}, which solved the special case of \cref{prob:countingAMOofUCCG} when there is no background knowledge. Similar to their algorithm, our algorithm uses a modified LBFS algorithm, \cref{alg:AMO-Union-K}, which is a background aware version of the modified LBFS algorithm presented by them. The simple \cref{alg:valid-permutaion-of-clique}, which counts background knowledge consistent permutations of a clique, is the new ingredient required in our final algorithm presented in \cref{alg:Count-AMO}.

We start with the partitioning of the AMOs. In \cref{sec:preliminaries}, we saw that each AMO of a UCCG is canonically represented by a unique maximal clique of the UCCG. We use this for partitioning the AMOs.



\begin{lemma}
\label{lem:partition-of-set-of-bkc-AMOs}
Let $G$ be a UCCG, and $\mathcal{K}$ be a given background knowledge. Then
$\#\text{AMO}(G,\mathcal{K})$ equals 
\begin{multline}
  \sum_{C\in \Pi(G)}{|\{\alpha: \alpha \in \text{AMO}(G,\mathcal{K}) \text{ and } C=C_{\alpha}\}|}. 
\end{multline}
\end{lemma}

Here, $\{\alpha: \alpha \in \text{AMO}(G,\mathcal{K}) \text{ and } C=C_{\alpha}\}$ is the set of $\mathcal{K}$-consistent AMOs of $G$ that are \emph{canonically} represented by a maximal clique $C$ of $G$. To compute this, we first compute the union of AMOs of $G$ that are represented by $C$. 
The following definition concerns objects from the work of \cite{wienobst2020polynomial} that are relevant to construct the union graph.

\begin{definition}[{$G^{\pi(C)}, G^C, \mathcal{C}_G(\pi(C))$ and $\mathcal{C}_G(C)$}, \citet{wienobst2020polynomial}, Definition 1]
\label{def:definitions-of-wienbost-paper}
Let $G$ be a UCCG, $C$ a maximal clique of $G$, and $\pi(C)$ a permutation
of $C$.  Then, $G^C$ (respectively, $G^{\pi(C)}$) denotes the union of all the
AMOs of $G$ that can be represented by $C$ (respectively, by $\pi(C)$).
$\mathcal{C}_{G}(C)$ (respectively, $\mathcal{C}_{G}(\pi(C))$) denotes the
undirected connected components of $G^C[V_G\setminus C]$ (respectively,
$G^{\pi(C)}[V_G\setminus C]$).
\end{definition}

 The structure of $G^C$ provides us the set of directed edges in $G^C$, which helps us to check the $\mathcal{K}$-consistency of $G^C$. Since $G^C$ is the union of all the AMOs that can be represented by $C$,  a directed edge in $G^C$ is a directed edge in all the AMOs that can be represented by $C$. Thus, if any such directed edge is not $\mathcal{K}$-consistent then none of the AMOs that can be represented by $C$ can be $\mathcal{K}$-consistent. And, if all the directed edges of $G^C$ are $\mathcal{K}$-consistent then we further reduce our problem into counting $\mathcal{K}$-consistent AMOs for the undirected connected components of $G^C$ (\cref{lem:final-counting-algorithm}).

In order to implement the above discussion, the main insight required is the following definition.

\begin{definition}[$\mathcal{K}$-consistency of $G^C$]
  \label{def:bc-CGC}
  Let $G$ be a UCCG and $C$ a maximal clique in $G$. Given background
  knowledge $\mathcal{K}$ about the directions of the edges of $G$, $G^C$ is said
  to be $\mathcal{K}$-consistent, if there does not exist a directed edge
  $u \rightarrow v \in G^C$ such that $v\rightarrow u \in \mathcal{K}$.
\end{definition}

As discussed above, \cref{def:bc-CGC} implies that for a maximal clique $C$ of $G$, if $G^C$ is not $\mathcal{K}$-consistent then there exists no $\mathcal{K}$-consistent AMO of $G$ that is represented by $C$. In other words, if $G^C$ is not $\mathcal{K}$-consistent then \[{|\{\alpha: \alpha \in \text{AMO}(G,\mathcal{K}) \text{ and } C=C_{\alpha}\}|}=0.\] Then, from \cref{lem:partition-of-set-of-bkc-AMOs},
\begin{multline*}
  \#\text{AMO}(G,\mathcal{K}) = \\ \sum_{C:G^C \text{ is } \mathcal{K}\text{-consistent}}{|\{\alpha: \alpha \in \text{AMO}(G,\mathcal{K}) \text{ and } C=C_{\alpha}\}|}.
\end{multline*}
We  construct \cref{alg:AMO-Union-K} to check the $\mathcal{K}$-consistency of $G^C$, for any maximal clique $C$ of $G$.
\citet{wienobst2020polynomial} give a modified LBFS algorithm that for input a chordal graph $G$, and a maximal clique $C$ of $G$, outputs the undirected connected components (UCCs) of $\mathcal{C}_G(C)$. Their algorithm outputs the UCCs of $\mathcal{C}_G(C)$ in such a way that by knowing the  UCCs of $\mathcal{C}_G(C)$, we can construct $G^C$. We use this fact and construct an LBFS algorithm \cref{alg:AMO-Union-K}, which also checks the $\mathcal{K}$-consistency of $G^C$. \cref{alg:AMO-Union-K} is a background aware version of the LBFS algorithm given by \citet{wienobst2020polynomial}.\footnote{If \cref{alg:AMO-Union-K} is executed with $C=\mathcal{K}=\emptyset$, the algorithm performs a normal LBFS with corresponding traversal ordering $\tau$, which is the reverse of a PEO of $G$.} 
 
 \begin{algorithm}[ht]
  \caption{LBFS($G, C,\mathcal{K}$) (Background aware LBFS, based on the modified LBFS of
    \citet{wienobst2020polynomial})}
\label{alg:AMO-Union-K}
\SetAlgoLined
\SetKwInOut{KwIn}{Input}
\SetKwInOut{KwOut}{Output}
\SetKwFunction{AMO-Union}{AMO-Union}
\KwIn{A UCCG $G$, a maximal clique $C$ of $G$, and background knowledge $\mathcal{K}\subseteq E_G$.}
    \KwOut{ $(1, \mathcal{C}_G(C))$: if $G^C$ is $\mathcal{K}$-consistent,\\
    $(0, \mathcal{C}_G(C))$: otherwise.
     }
$\mathcal{S}\leftarrow$ sequence of sets initialized with $(C,V\setminus C)$\label{alg:S-init}
    
    $\tau \leftarrow$ empty list, $\mathcal L\leftarrow$ empty list, $\texttt{flag}\leftarrow 1$, $Y\leftarrow$ empty list \label{alg:flag-init}
    
    \While{$\mathcal{S}$ is non-empty\label{alg:loop}}{
    $X\leftarrow$ first non-empty set of $\mathcal{S}$\label{alg:set-X}
    
    $v\leftarrow$ arbitrary vertex from $X$\label{alg:mod1}


    {
   \If{$v$ is neither in a set in $\mathcal L$ nor in $C$\label{alg:if-1-start}}{
    Append $X$ to the end of the list $\mathcal{L}$.
    
    Append undirected connected components of $G[X]$ to the end of $Y$.\label{alg:appendY}
    
}\label{alg:if-1-end}
    }
    
    Add vertex $v$ to the end of $\tau$.
    
    \If{$u \rightarrow v \in \mathcal{K}$ for some $u$ which is neither in a set in $\mathcal{L}$ nor in $C$\label{alg:mod-if}}
    {
      $\texttt{flag}=0$;\label{alg:mod-output-empty}
    }\label{alg:mod-if-end}

    


    
    
Replace the set $X$ in the sequence $\mathcal{S}$ by the set $X \setminus \inb{v}$. 
    
    $N(v)\leftarrow \{ x|x\notin \tau \textup{ and } \undir{v}{x}\in E\}$\label{alg:neighbour-v}
    
    Denote the current $\mathcal{S}$ by $(S_1,\ldots ,S_k)$.
    
    Replace each $S_i$ by $S_i \cap N(v), S_i \setminus  N(v)$.\label{alg:refine}
    
    Remove all empty sets from $\mathcal{S}$.
  }\label{alg:while-end}
    


\KwRet $(\texttt{flag}, Y)$ \label{alg:return}
\end{algorithm}
 
 We now describe our background-aware version of the modified LBFS algorithm:
 see \cref{alg:AMO-Union-K}.  We do not change any line from the LBFS algorithm
 of \citet{wienobst2020polynomial}. The modifications we do in their LBFS
 algorithm are (a) introduction of ``flag'', at \cref{alg:flag-init}, which is
 used to check the $\mathcal{K}$-consistency of $G^C$, (b) lines
 \ref{alg:mod-if}--\ref{alg:mod-if-end}, which are used to update the value of
 ``flag'', and (c) we also output the value of ``flag'' with $\mathcal{C}_G(C)$.
 The correctness of this modification (stated formally in
 \cref{lem:output-of-Alg-1}) is based on the following observation which in turn
 uses ideas implicit in the work of \citet{wienobst2020polynomial}.
 
\begin{observation}[Implicit in the work of \citet{wienobst2020polynomial}]
\label{lem:direction of-edges-of-GC-in-alg}
Let $G$ be a UCCG, $\mathcal{K}$ be the known background knowledge about $G$, and $C$ be a maximal clique of $G$. For input $G,C,$ and $\mathcal{K}$, suppose that at some iteration of \cref{alg:AMO-Union-K}, $\mathcal{L}=\{X_1,X_2,\ldots,X_l\}$.  Then,
\begin{enumerate}
    \item 
    \label{item-1-of-direction of-edges-of-G^C-in-alg}
    For any $u\in C$, and $v\notin C$, if $(u,v)\in E_G$ then $u\rightarrow v$ is a directed edge in $G^C$.
    \item
    \label{item-2-of-direction of-edges-of-G^C-in-alg}
    For $u\in X_i$, and $v \notin C\cup X_1 \cup X_2\cup \ldots \cup X_i$, if $(u,v)\in E_G$ then $u\rightarrow v$ is a directed edge in $G^C$.
    \item 
    \label{item-3-of-direction-of-edges-of-G^C-in-alg}
    For $u,v\in X_i$, for $1\leq i \leq l$, if $(u,v)\in E_G$ then $u-v$ is an undirected edge in $G^C$.
    \item 
    \label{item-3b-of-direction-of-edges-of-G^C-in-alg}
    For $u,v\in C$, $u-v$ is an undirected edge in $G^C$.
    
    \item 
    \label{item-4-of-direction-of-edges-of-G^C-in-alg}
    For any $X_i\in \mathcal{L}$, every undirected connected component of
    $G[X_i]$ is an element of $\mathcal{C}_G(C)$.
\end{enumerate}
\end{observation}

The following lemma encapsulates the correctness of \cref{alg:AMO-Union-K}.

 \begin{lemma}
 \label{lem:output-of-Alg-1}
 Let $G$ be a UCCG, $C$ be a maximal clique of $G$, and $\mathcal{K}$ be the known background knowledge about $G$. For the input $G,C,$ and $\mathcal{K}$, if $G^C$ is not $\mathcal{K}$-consistent, then \cref{alg:AMO-Union-K} outputs $(0,\mathcal{C}_G(C))$ on \cref{alg:return}, else it returns $(1,\mathcal{C}_G(C))$ on \cref{alg:return}.
 \end{lemma}

For any maximal clique $C$ of $G$ such that $G^C$ is $\mathcal{K}$-consistent, to compute the size of the set of $\mathcal{K}$-consistent AMOs of $G$ that are canonically represented by $C$, we further partition the set based on the different permutations of $C$. The simple \cref{lem:uniqueness-of-permutation-of-clique} below assists us in mapping each AMO of the set to a unique permutation $\pi(C)$ of $C$.
\begin{observation}
\label{lem:uniqueness-of-permutation-of-clique}
Let $G$ be a UCCG, and $\alpha$ an AMO of $G$ that is represented by a maximal clique $C$ of $G$. Then, there exists a unique permutation $\pi(C)$ of $C$ that represents $\alpha$. 
\end{observation}

By slightly extending the definition of the canonical representation of an AMO by a clique, we say that an AMO is \textbf{canonically represented by $\pi(C)$} if the AMO is represented by $\pi(C)$, and also canonically represented by the clique $C$. Then, \cref{lem:uniqueness-of-permutation-of-clique} implies that we can partition the set of $\mathcal{K}$-consistent AMOs that are canonically represented by $C$ into $\mathcal{K}$-consistent AMOs that are canonically represented by its permutations $\pi(C)$, i.e., $\{\alpha: \alpha \in \text{AMO}(G,\pi(C),\mathcal{K}) \text{ and } C=C_{\alpha}\}$. More formally,
\begin{multline*}
    |\{\alpha: \alpha \in \text{AMO}(G,\mathcal{K}) \text{ and } C=C_{\alpha}\}|=\\ \sum_{\pi(C)}{|\{\alpha: \alpha \in \text{AMO}(G,\pi(C),\mathcal{K}) \text{ and } C=C_{\alpha}\}|}
\end{multline*}



To compute the number of $\mathcal{K}$-consistent AMOs of $G$ that are canonically represented by $\pi(C)$,  we first have to go through the necessary and sufficient conditions for a maximal clique $C$ of $G$ to become $C_{\alpha}$, for an AMO $\alpha$ of $G$. 

\begin{lemma}[Claims 1, 2 and 3 of \citet{wienobst2020polynomial}]
\label{lem:identification-of-C-alpha}
Let $G$ be a UCCG. \citet{wienobst2020polynomial} fix a rooted clique tree of $G$ to define $C_{\alpha}$, for any AMO $\alpha$ of $G$. Let
$\mathcal{T}=(T,R)$ be the rooted clique tree (with root $R$) of $G$ on which $C_{\alpha}$ is defined, for each AMO $\alpha$ of $G$. For an AMO $\alpha$ of $G$, and a maximal clique $C$ of $G$, $C=C_{\alpha}$ if, and only if,
\begin{enumerate}
\item There exists an LBFS ordering of $G$ that starts with $C$, and represents $\alpha$, and
\item If $\pi(C)$ is the permutation of $C$ that represents $\alpha$ (from \cref{lem:uniqueness-of-permutation-of-clique}) then there does not exist any edge $C_i-C_j$ in the path in $T$ from $R$ to
  $C$ such that $\pi(C)$ has a prefix $C_i\cap C_j$.
\end{enumerate}
\end{lemma}
The set FP$(C,\mathcal{T})$ is defined to be the set of such forbidden prefixes
$C_i\cap C_j$.

\begin{definition}[$FP(C, \mathcal{T})$,  Definition 3 of \citet{wienobst2020polynomial}]
\label{def:FP}
Let $G$ be a UCCG, $\mathcal{T}=(T,R)$ a rooted clique tree of $G$, $C$ a
node in $T$ and $R=C_1 - C_2 - \ldots - C_{p-1} - C_p=C$ the unique path from $R$ to $C$ in
$T$. We define the set \emph{FP$(C,\mathcal{T})$ } to contain all sets of the
form $C_i\cap C_{i+1} \subseteq C$, for $1 \leq i < p$.
\end{definition}

Based on \cref{lem:identification-of-C-alpha}, we define $(\mathcal{K},\mathcal{T})$-consistency for a permutation $\pi(C)$ to simplify our computation. This definition is one of the main new ingredients that let us extend the result of \cite{wienobst2020polynomial}.

\begin{definition}[$(\mathcal{K},\mathcal{T})$-consistency of permutations of maximal cliques]
\label{def:background-consistent-pi-C}
Let $G$ be a UCCG, $C$ a maximal clique in $G$, $\pi(C)$ a permutation of $C$, $\mathcal{K}$ be a given background knowledge, and $\mathcal{T}=(T,R)$ a rooted clique tree of $G$ (on which $C_{\alpha}$ is defined). $\pi(C)$ is said to be $(\mathcal{K},\mathcal{T})$-consistent if  (a) $\pi(C)$ is $\mathcal{K}$-consistent, i.e., for any edge $u\rightarrow v \in \mathcal{K}$ such that $u,v\in C$, $u$ occurs before $v$ in $\pi(C)$, and (b) no element of FP$(C,\mathcal{T})$ (\cref{def:FP}) is a prefix of $\pi(C)$.
\end{definition}

If $\pi(C)$ itself is not $\mathcal{K}$-consistent then no $\mathcal{K}$-consistent AMO exists that is represented by $\pi(C)$. Also, if $\pi(C)$ has a prefix in  FP$(C,\mathcal{T})$ then from \cref{lem:uniqueness-of-permutation-of-clique,lem:identification-of-C-alpha}, there does not exist an AMO $\alpha$ of $G$ such that $\alpha$ is represented by $\pi(C)$, and $C=C_{\alpha}$. 
This yields the following observation.
\begin{observation}
\label{lem:formular-for-non-KT-consistent-permutaion}
If $\pi(C)$ is not $(\mathcal{K},\mathcal{T})$-consistent then there exist no $\mathcal{K}$-consistent AMOs of $G$ that can be canonically represented by $\pi(C)$, i.e.,  $|\{\alpha: \alpha \in \text{AMO}(G,\pi(C),\mathcal{K}) \text{ and } C=C_{\alpha}\}|= 0$.
\end{observation}

We thus focus on only those permutations $\pi(C)$ of $C$ that are
$(\mathcal{K},\mathcal{T})$-consistent.  The main ingredient towards this end is
the following recursive formula.
\begin{lemma}
\label{lem:formular-for-KT-consistent-permutaion}
Let $G^C$ be $\mathcal{K}$-consistent. Then, for any
$(\mathcal{K},\mathcal{T})$-consistent permutation $\pi(C)$ of $C$, the size of
the set
$\{\alpha: \alpha \in \text{AMO}(G,\pi(C),\mathcal{K}) \text{ and }
C=C_{\alpha}\}$ is $\prod_{H\in \mathcal{C}_G(C)}{\#AMO(H,\mathcal{K}[H])}$.
\end{lemma}
Note that the formula obtained in
\cref{lem:formular-for-KT-consistent-permutaion} depends only upon the clique
$C$ and \emph{not} on the permutation $\pi$ of the nodes of $C$ (as long as
$\pi$ is itself $(\mathcal{K},\mathcal{T})$-consistent)!  This implies immediately that the
number of $\mathcal{K}$-consistent AMOs of $G$ for which $C$ is the {canonical
  representative} is given by multiplying the product
$\prod_{H\in \mathcal{C}_G(C)}{\#AMO(H,\mathcal{K}[H])}$ with
the number of $(\mathcal{K},\mathcal{T})$-consistent permutations of $C$.


This motivates us to count  $(\mathcal{K},\mathcal{T})$-consistent permutations of a maximal clique $C$ of $G$. To count the $(\mathcal{K},\mathcal{T})$-consistent permutations of $C$, we define the following:
\begin{definition}
\label{def:phi-S-R-K}
Let $S$ be a set of vertices, $\mathcal{R}=\{R_1,R_2,\ldots, R_l\}$ such that $R_1\subsetneq R_2 \subsetneq \ldots \subsetneq R_l \subsetneq S$, and $\mathcal{K}\subseteq S\times S$. $\Phi(S,\mathcal{R}, \mathcal{K})$ is the number of $\mathcal{K}$-consistent permutations of $S$ that do not have a prefix in $\mathcal{R}$.
\end{definition}

\cref{lem:partition-of-set-of-bkc-AMOs,lem:formular-for-non-KT-consistent-permutaion},
along with \cref{lem:formular-for-KT-consistent-permutaion} and the discussion
following it, finally give us the following recursion.
 \begin{lemma}
 \label{lem:final-counting-algorithm}
 Let $G$ be a UCCG, $\mathcal{K}$ be a given background knowledge, and
$\mathcal{T}=(T,R)$ a rooted clique tree of $G$ on which $<_{\alpha}$ has been
defined. Then $\#AMO(G,\mathcal{K})$ equals
\begin{equation*}
\sum_{C}
  {\Phi(C,FP(C,\mathcal{T}),\mathcal{K}[C]) \times} 
  {\prod_{H\in \mathcal{C}_G(C)}{\#AMO(H,\mathcal{K}[H])}},
\end{equation*}
where the sum is over those $C$ for which $G^C$ is $\mathcal{K}$-consistent.
 \end{lemma}
 \cref{lem:final-counting-algorithm} solves our counting problem. The precondition of $\Phi(S,\mathcal{R}, \mathcal{K})$ in \cref{def:phi-S-R-K}  that $\mathcal{R}=\{R_1,R_2,\ldots,R_l\}$ has the property
$R_1\subsetneq R_2 \subsetneq \ldots \subsetneq R_l \subsetneq S$ is satisfied
at the beginning of the recursion, i.e. when $\mathcal{R} = FP(C, \mathcal{T})$,
by \cref{lem:elements-of-FP-is-arranged-as subsets}, which is a consequence of the
standard clique intersection property of clique trees of chordal graphs.
\begin{lemma}[\cite{wienobst2020polynomial}, Lemma 7]
\label{lem:elements-of-FP-is-arranged-as subsets}
We can order the elements of $FP(C,\mathcal{T})$ as $X_1\subsetneq X_2\subsetneq \ldots \subsetneq X_l \subsetneq C$.
\end{lemma}
The precondition of $\Phi(S,\mathcal{R}, \mathcal{K})$ is preserved throughout the recursion described in \cref{lem:phi-computaion} below. We now give a recursive method to compute
$\Phi(S,\mathcal{R}, \mathcal{K})$. 
\begin{lemma}
\label{lem:phi-computaion}
Let $S$ be a clique, and $\mathcal{K}\subseteq S\times S$ be a set of directed
edges. Let $\mathcal{R}=\{R_1,R_2,\ldots , R_l\}$ where $l\geq 1$ be such that
$R_1\subsetneq R_2 \subsetneq \ldots \subsetneq R_l \subsetneq S$.
Then,
\begin{enumerate}
\item \label{item-3-of-phi-computaion}
  $\Phi(S, \emptyset,\mathcal{K})= \frac{|S|!}{|V_{\mathcal{K}}|!} \times
  \Psi(V_{\mathcal{K}}, \mathcal{K})$, where
  $\Psi(V_{\mathcal{K}}, \mathcal{K})$ is the number of $\mathcal{K}$-consistent
  permutations of vertices in $V_{\mathcal{K}}$ ($V_{\mathcal{K}}$ is the set of
  end points of edges in $\mathcal{K}$). (Example: Suppose $S=\{1,2,3,4,5\}$, and $\mathcal{K}=\{1\rightarrow2, 2\rightarrow 3\}$. Then, $V_{\mathcal{K}}=\{1,2,3\}$. And, there exists only one permutation, $(1,2,3)$, of $V_{\mathcal{K}}$ that is $\mathcal{K}$-consistent, i.e., $\Psi(V_{\mathcal{K}}, \mathcal{K}) = 1$. This implies $\Phi(S, \emptyset,\mathcal{K})=20$.)

\item \label{item-1-of-phi-computaion}
    If there exists an edge $u\rightarrow v\in \mathcal{K}$ such that
    $u\in S\setminus R_l$ and $v\in R_l$, then
    $\Phi(S,\mathcal{R},\mathcal{K})= \Phi(S,\mathcal{R}-\{R_l\},\mathcal{K}).$
\item 
  \label{item-2-of-phi-computaion}
  If there does not exist an edge $u \rightarrow v\in \mathcal{K}$ such that
  $u\in S\setminus R_l$ and $v\in R_l$, then $\Phi(S,\mathcal{R},\mathcal{K})=$
  $\Phi(S,\mathcal{R}-\{R_l\},\mathcal{K}) -$
  ${\Phi(R_l,\mathcal{R}-\{R_l\},\mathcal{K}[R_l])\times \Phi(S\setminus
    R_l,\emptyset,\mathcal{K}[S\setminus R_l])}$.
\end{enumerate}
\end{lemma}
\begin{algorithm}[t]
\caption{Valid-Perm$(S,\mathcal{R},\mathcal{K})$}
\label{alg:valid-permutaion-of-clique}
\SetAlgoLined
\SetKwInOut{KwIn}{Input}
\SetKwInOut{KwOut}{Output}
\SetKwFunction{Counting-Permutations}{Counting-Permutations}
\KwIn{A clique $S$, $\mathcal{R}=\{R_1,R_2,\ldots,R_l\}$ such that $R_1\subsetneq R_2 \subsetneq \ldots \subsetneq R_l\subsetneq S$,  and background knowledge $\mathcal{K}\subseteq S\times S$.}
\KwOut{ $\Phi(S,\mathcal{R},\mathcal{K})$.
}

\label{alg-vp:R-is-empty} \If{$\mathcal{R}=\emptyset $} 
{




\KwRet
     $\frac{|S|!}{|{V_{\mathcal{K}}|!}} \cdot {\Psi(V_{\mathcal{K}},
       \mathcal{K})}$\label{alg-vp:return-K-empty} \label{alg-vp:sum-init-R-empty}
    
    
   }\label{alg-vp:R-is-empty-end}

\label{alg-vp:sum-init-R-not-empty}
$\texttt{sum}\leftarrow$ Valid-Perm$(S,\mathcal{R}-\{R_l\},\mathcal{K})$

\label{alg-vp:no-edge-from-Rl} \If{$\{(u,v):u\rightarrow v\in \mathcal{K}, v\in R_l$ and $u\notin R_l\}\neq \emptyset$}
{
\KwRet $\texttt{sum}$\label{alg-vp:return2}
}\label{alg-vp:no-edge-from-Rl-end}


\KwRet $\texttt{sum} - \text{Valid-Perm}(R_l,\mathcal{R}-\{R_l\},\mathcal{K}[R_l]) \times \text{Valid-Perm}(S\setminus R_l,\emptyset, \mathcal{K}[S\setminus R_l])$\label{alg-vp:final-return}
\end{algorithm}
 \begin{proof}[Proof of \cref{lem:phi-computaion}]
  Proofs of \cref{item-1-of-phi-computaion,item-2-of-phi-computaion} follow
  easily from the definition of the $\Phi$ function
  (\cref{def:phi-S-R-K}), and are similar in spirit to the
  corresponding results of \citet{wienobst2020polynomial} in the setting of no
  background knowledge.  We provide the details of these proofs in the
  supplementary material and focus here on proving \cref{item-3-of-phi-computaion}. If
  $\mathcal{R}=\emptyset$ then $\Phi(S,\mathcal{R}, \mathcal{K})$ is the number
  of $\mathcal{K}$-consistent permutations of $S$. There are
  $\frac{|S|!}{|V_{\mathcal{K}}|!}$ permutations of $S$ consistent with any given ordering of vertices in $V_{\mathcal{K}}$. The total number of
  $\mathcal{K}$-consistent permutations of the vertices in $V_{\mathcal{K}}$ is
  $\Psi(V_{\mathcal{K}},\mathcal{K})$. Therefore, the number of
  $\mathcal{K}$-consistent permutations of $S$ equals
  $\frac{|S|!}{|V_{\mathcal{K}}|!} \times \Psi(V_{\mathcal{K}}, \mathcal{K})$.
\end{proof}
\Cref{alg:valid-permutaion-of-clique} implements \cref{lem:phi-computaion} to
compute $\Phi(S,\mathcal{R},\mathcal{K})$, and its correctness, stated below, is
an easy consequence of \cref{lem:phi-computaion}.
\begin{observation}
\label{lem:alg-valid-perm-correctness}
For input $S, \mathcal{R} = \{R_1,R_2,\ldots, R_l\},$ and $\mathcal{K}$, where
$R_1\subsetneq R_2 \subsetneq \ldots \subsetneq R_l\subsetneq S$, and
$\mathcal{K}\subseteq S\times S$, \cref{alg:valid-permutaion-of-clique} returns
$\Phi(S,\mathcal{R},\mathcal{K})$.
\end{observation}


We now construct \cref{alg:Count-AMO} that computes $\#$AMO$(G,\mathcal{K})$. \Cref{alg:Count-AMO} evaluates the formula of \cref{lem:final-counting-algorithm}, utilizing memoization to avoid recomputations. 

\begin{algorithm}[t]
  \caption{$\texttt{count}(G, \mathcal{K}, \texttt{memo})$ (modification of an algorithm of
    \cite{wienobst2020polynomial})}
\label{alg:Count-AMO}
\SetAlgoLined
\SetKwInOut{KwIn}{Input}
\SetKwInOut{KwOut}{Output}
\SetKwFunction{Count-AMO}{Count-AMO}

    \KwIn{A UCCG $G$, background knowledge $\mathcal{K}\subseteq E_G$.}
    \KwOut{$\#$AMO$(G,\mathcal{K})$.}
    \SetKwProg{Fn}{function}{}
  
  
\If{$G \in \texttt{\textup{memo}}$} {\KwRet $\texttt{memo}[G]$}\label{alg-count:G-in-memory}


$\mathcal{T}=(T,R)\leftarrow  \text{ a rooted clique tree of $G$}$\label{alg-count:clique-tree-construction}


\If{$R=V_G$\label{alg:count:G-is-a-clique}}{


$\texttt{memo}[G]=$\label{alg-count:memo-update}
$\Phi(V_G,\emptyset,\mathcal{K})$
 
 \KwRet $\texttt{memo}[G]$
 }\label{alg-count:if-G-is-a-clique-end}
 
 
 $\texttt{sum} \leftarrow 0$\label{alg-count:sum-initialization}
 

 $Q\leftarrow \text{queue with single element $R$}$ \label{alg-count:Queue-construction}
 
 
 \While{$Q \text{ is not empty \label{alg-count:while-queue-is-not-empty}}$}{
 
 $C\leftarrow pop(Q)$\label{alg-count:clique-popped}\\
 
 
 push($Q$, children($C$))\label{alg-count:queue-pushed}\\
 
 
 $(\texttt{flag},\mathcal{L})\leftarrow$LBFS$(G,C,\mathcal{K})$\label{alg-count:LBFS}
 
 
 \If{$\texttt{\textup{flag}} = 1$ \label{alg-count:flag-is-1}}{
 
 
 $\texttt{prod} \leftarrow 1$\label{alg-count:prod-init}
 
 
\ForEach{$H\in \mathcal{L}$\label{alg-count:foreach-start}}{


$\texttt{prod} = \texttt{prod} \times \texttt{count}(G[H], \mathcal{K}[H], \texttt{memo})$\label{alg-count:prod-multiplication}
}\label{alg-count:foreach-end}


$\texttt{sum} = \texttt{sum}
+  \texttt{prod} \times  \Phi(C,  \text{FP}(C,\mathcal{T}),\mathcal{K}[C])$\label{alg-count:sum-update} \\
}\label{alg-count:if-end}
}\label{alg-count:while-end}

$\texttt{memo}[G]= \texttt{sum}$\label{alg-count:memory-update}
 
 
 \KwRet $\texttt{sum}$\label{alg-count:final-return}
\end{algorithm}




 
\begin{theorem}\label{lem:main-algorithm-correct}
For a UCCG $G$ and background knowledge $\mathcal{K}$, \cref{alg:Count-AMO} returns $\#$AMO$(G,\mathcal{K})$.
\end{theorem}
Proof of \cref{lem:main-algorithm-correct}:
We first fix a
clique tree $\mathcal{T}=(T,R)$ (at line \ref{alg-count:clique-tree-construction}) on which we define $<_{\alpha}$. Lines \ref{alg:count:G-is-a-clique}--\ref{alg-count:if-G-is-a-clique-end} deal with the base case when $G$ is a clique. If $G$ is not a clique, \cref{alg:Count-AMO} follows \cref{lem:final-counting-algorithm}.  Full details are given in the Supplementary Material due to lack of space.

















































 \section{Time Complexity Analysis}
\label{sec:time-complexity}
In this section, we analyze the run time of \cref{alg:Count-AMO}.  The proof of
the following observation, which shows that despite our modifications,
\cref{alg:AMO-Union-K} still runs in linear time, is given in the Supplementary
Material.

\begin{observation}
\label{thm:LBFS-alg}
For a UCCG $G$, a maximal clique $C$ of $G$, and background knowledge $\mathcal{K}$, \cref{alg:AMO-Union-K} runs in linear time $O(|V_G| +|E_G|)$.
\end{observation}

Similar to Wien{\"o}bst et al.'s $\texttt{count}$ function, our $\texttt{count}$ function
(\cref{alg:Count-AMO}) is also recursively called at most $2|\Pi(G)|-1$ times,
where $\Pi(G)$ is the set of maximal cliques of $G$.  Our approach to compute
the background aware version of $\Phi$ (\cref{alg:valid-permutaion-of-clique})
is similar to that of \citet{wienobst2020polynomial}, and the difference in time
complexity comes from the high time complexity of computation of
$\Phi(S,\emptyset,\mathcal{K})$ at \cref{item-3-of-phi-computaion} (it is $O(1)$
for $\mathcal{K}=\emptyset$, which is the setting considered by
\citet{wienobst2020polynomial}).  Proofs of the claims below can be found in the
Supplementary Material.

\begin{proposition}
\label{prop:number-of-times-alg-2-called}
Let $G$ be a UCCG, and $\mathcal{K}$ be the known background knowledge about $G$. The number of distinct UCCGs explored by the \texttt{count} function (as defined in \cref{alg:Count-AMO}) is bounded by $2|\Pi(G)|-1$.
\end{proposition}

\begin{lemma}
\label{lem:time-complexity-of-Phi}
For input $S$, $\mathcal{R}=\{R_1,R_2,\ldots, R_l\}$, and $\mathcal{K}$,
\cref{alg:valid-permutaion-of-clique} can be implemented using memoization to
use $O(k! \cdot k^2 \cdot |\Pi(G)|^2)$ arithmetic operations, where $k$ is the
max-clique knowledge of $\mathcal{K}$ (assuming factorials of integers from $1$
to $|V_G|$ are available for free).
\end{lemma}



\begin{theorem}[Final runtime bound of \cref{alg:Count-AMO}]
\label{thm:ACMO-Counting-BK}
For a UCCG $G$, and background knowledge $\mathcal{K}$, \cref{alg:Count-AMO} runs in time $O(k!k^2n^4)$, more precisely $O(k!k^2 \cdot |\Pi(G)|^4)$, where $n$ is the number of nodes in $G$, and $k$ is the max-clique knowledge of $\mathcal{K}$.
\end{theorem}



\begin{proof}[Proof of \cref{thm:main-result}]
  Together, \cref{lem:main-algorithm-correct,thm:ACMO-Counting-BK} prove our
  main result, \cref{thm:main-result}.
\end{proof}
 \section{Experimental evaluation}
\label{sec:experimental_evaluation}
In this section, we evaluate the performance of \cref{alg:Count-AMO} on a synthetic dataset. For each $n\in \{500, 510, 520, \ldots, 1000\}$, we construct 50 random chordal graphs with $n$ nodes, and for each
$k \in \{5,6,\ldots, 13\}$, we construct a set of background knowledge edges with $k$ as its max clique knowledge value. We then
measure the running time of \cref{alg:Count-AMO} for each of these (graph,
background knowledge) pairs, and take the mean running time over all such pairs
with the same value of $n$ and $k$.  Further details about the construction of
these instances can be found in the Supplementary Material.






\paragraph{Validating the run-time bound} To validate the $O(k! k^2 n^4)$
run-time bound established in \cref{sec:time-complexity}, we draw log-log plots
of the mean run-time $T$ against the size $n$ of the graph, for each fixed value
of $k$ (\cref{fig:comp-plot-k-vs-logT-5-13}).  As predicted by the polynomial
(in $n$) run-time bound in our theoretical result, we get, for each value of the
parameter $k$, a roughly linear log-log plot.




\begin{figure}[t]
  \centering   \includegraphics[width=0.48\textwidth]{logn_vs_log_T_k-5-15}
  \caption{Log-log plot of the mean run-time $T$ against the graph size $n$, for each fixed value of $k$. \label{fig:comp-plot-k-vs-logT-5-13}}
\end{figure}



\paragraph{The intercept of the log-log plot} While the plots in
\cref{fig:comp-plot-k-vs-logT-5-13} for $k\in \{5, 6, 7, 8, 9, 10, 11\}$ are
quite close to each other, the separation of the plots for $k=12$ and $k=13$ is
much larger. The reason is that the actual running time $T$ of
\cref{alg:Count-AMO} can roughly be
bounded as $\log T \leq \log a + \log (k!k^2 + b) + 4\log n$, where $a$ and $b$ are constants independent of $n$ and $k$.
This bound shows that until about $k = 11$, all the plots
have intercepts close to each other, as the value of $b$ dominates $k!k^2$ for
small values of $k$.  The difference starts increasing fast once the $k!k^2$ term
becomes larger than $b$.














\paragraph{Effect of the size of the background knowledge} An important feature
of our analysis of \cref{alg:Count-AMO} is that its run-time bound does not
depend directly upon the actual size of the background knowledge.  To validate
this, we conduct the following experiment: we fix a chordal graph of size $n$
and the max-clique knowledge value $k$, and then construct two different sets
$\mathcal{K}_1$ and $\mathcal{K}_2$ of background knowledge edges, of different
sizes such that both have the same $k$ value (the details of the construction
are given in the Supplementary Material).  In \cref{tab:1}, $T_1$, $T_2$ are the
running times of \cref{alg:Count-AMO} with background knowledge $\mathcal{K}_1$
and $\mathcal{K}_2$ respectively.
\begin{table}[h]
  \centering 
  \begin{tabular}{ |c|c|c|c|c|c| }
    \hline
    $n$ & $k$ &$|\mathcal{K}_1|$ & $|\mathcal{K}_2|$ & $T_1$ & $T_2$ \\ 
    \hline
    1000 & 10 & 65 & 116 & 370 & 353 \\  
    \hline
    1000 & 10 & 75 & 137 & 346 & 338 \\ 
    \hline
    1100 & 11 & 55 & 121 & 467 & 455 \\ 
    \hline
    1100 & 11 & 51 & 104 & 460 & 453 \\
    \hline
  \end{tabular}
  \caption{Exploring runtime dependence on the number of background knowledge
    edges~\label{tab:1}}
\end{table}
The table confirms the expectation that when the graph and $k$ are fixed, the
running time does not increase much when the size of the background-knowledge
increases.  More detailed data and discussion of this phenomenon are given in
the Supplementary Material.












 \section{Conclusion}
\label{sec:conclusion-open-problems}




Our main result shows that the max-clique-knowledge parameter we introduce plays
an important role in the algorithmic complexity of counting Markov equivalent
DAGs under background knowledge constraints. In particular, it leads to a
polynomial time algorithm in the special case of graphs of bounded
maximum-clique size. Note that an algorithm that runs in polynomial time in the
general case is precluded by the $\#$P-hardness result of
\cite{wienobst2020polynomial} (unless P = NP). However, the optimal dependence
of the run time on the max-clique-knowledge parameter is an interesting
problem left open by our work.

\section{Acknowledgment}
We acknowledge the support from the Department of Atomic Energy, Government of
India, under project no. RTI4001.  We want to thank Piyush Srivastava for his
invaluable suggestions, discussions, and help in the completion of this
paper. We thank all the anonymous reviewers for multiple useful suggestions which helped
in improving the presentation of this paper.


\begin{thebibliography}{21}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Andersson et~al.(1997)Andersson, Madigan, and
  Perlman]{andersson1997characterization}
Steen~A. Andersson, David Madigan, and Michael~D. Perlman.
\newblock {A} {Characterization} of {Markov} {Equivalence} {Classes} for
  {Acyclic} {Digraphs}.
\newblock \emph{Annals of Statistics}, 25\penalty0 (2):\penalty0 505--541,
  1997.

\bibitem[Bernstein and Tetali(2017)]{bernstein2017sampling}
Megan Bernstein and Prasad Tetali.
\newblock {On} {Sampling} {Graphical} {Markov} {Models}.
\newblock \emph{arXiv:1705.09717}, 2017.

\bibitem[Chickering(1995)]{chickering_causal_1995}
David~Maxwell Chickering.
\newblock A {Transformational} {Characterization} of {Equivalent} {Bayesian}
  {Network} {Structures}.
\newblock In \emph{Proceedings of the 11th Conference on Uncertainty in
  Artificial Intelligence (UAI 1995)}, pages 87--98, 1995.

\bibitem[Cygan et~al.(2015)Cygan, Fomin, Kowalik, Lokshtanov, Marx, Pilipczuk,
  Pilipczuk, and Saurabh]{cygan_parameterized_2015}
Marek Cygan, Fedor~V. Fomin, Łukasz Kowalik, Daniel Lokshtanov, Dániel Marx,
  Marcin Pilipczuk, Michał Pilipczuk, and Saket Saurabh.
\newblock \emph{Parameterized {Algorithms}}.
\newblock Springer International Publishing, 2015.

\bibitem[Finegold and Drton(2011)]{finegold2011robust}
Michael Finegold and Mathias Drton.
\newblock {Robust} {Graphical} {Modeling} of {Gene} {Networks} using
  {Classical} and {Alternative} $t$-{Distributions}.
\newblock \emph{Annals of Applied Statistics}, pages 1057--1080, 2011.

\bibitem[Friedman(2004)]{friedman2004inferring}
Nir Friedman.
\newblock {Inferring} {Cellular} {Networks} using {Probabilistic} {Graphical}
  {Models}.
\newblock \emph{Science}, 303\penalty0 (5659):\penalty0 799--805, 2004.

\bibitem[Fulkerson and Gross(1965)]{fulkerson1965incidence}
Delbert Fulkerson and Oliver Gross.
\newblock {Incidence} {Matrices} and {Interval} {Graphs}.
\newblock \emph{Pacific Journal of Mathematics}, 15:\penalty0 835--855, 1965.

\bibitem[Ganian et~al.(2020)Ganian, Hamm, and Talvitie]{ganian2020efficient}
Robert Ganian, Thekla Hamm, and Topi Talvitie.
\newblock {An} {Efficient} {Algorithm} for {Counting} {Markov} {Equivalent}
  {DAGs}.
\newblock In \emph{Proceedings of the 34th AAAI Conference on Artificial
  Intelligence (AAAI 2020)}, volume~34, pages 10136--10143, 2020.

\bibitem[Ghassami et~al.(2019)Ghassami, Salehkaleybar, Kiyavash, and
  Zhang]{ghassami2019counting}
AmirEmad Ghassami, Saber Salehkaleybar, Negar Kiyavash, and Kun Zhang.
\newblock {Counting} and {Sampling} from {Markov} {Equivalent} {DAGs} using
  {Clique} {Trees}.
\newblock In \emph{Proceedings of the 33rd AAAI Conference on Artificial
  Intelligence (AAAI 2019)}, volume~33, pages 3664--3671, 2019.

\bibitem[Hauser and B\"{u}hlmann(2012)]{HB12}
Alain Hauser and Peter B\"{u}hlmann.
\newblock Characterization and {Greedy} {Learning} of {Interventional} {Markov}
  {Equivalence} {Classes} of {Directed} {Acyclic} {Graphs}.
\newblock \emph{Journal of Machine Learning Research}, 13:\penalty0 2409--2464,
  2012.

\bibitem[He and Yu(2016)]{he2016formulas}
Yangbo He and Bin Yu.
\newblock {Formulas} for {Counting} the {Sizes} of {Markov} {Equivalence}
  {Classes} of {Directed} {Acyclic} {Graphs}.
\newblock \emph{arXiv:1610.07921}, 2016.

\bibitem[He et~al.(2015)He, Jia, and Yu]{he2015counting}
Yangbo He, Jinzhu Jia, and Bin Yu.
\newblock {Counting} and {Exploring} {Sizes} of {Markov} {Equivalence}
  {Classes} of {Directed} {Acyclic} {Graphs}.
\newblock \emph{Journal of Machine Learning Research}, 16\penalty0
  (1):\penalty0 2589--2609, 2015.

\bibitem[Kearns et~al.(2001)Kearns, Littman, and Singh]{kearns2013graphical}
Michael Kearns, Michael~L. Littman, and Satinder Singh.
\newblock {Graphical} {Models} for {Game} {Theory}.
\newblock In \emph{Proceedings of the 17th Conference on Uncertainty in
  Artificial Intelligence (UAI 2001)}, pages 253--260, 2001.

\bibitem[Madigan et~al.(1996)Madigan, Andersson, Perlman, and
  Volinsky]{madigan1996bayesian}
David Madigan, Steen~A. Andersson, Michael~D. Perlman, and Chris~T. Volinsky.
\newblock {Bayesian} {Model} {Averaging} and {Model} {Selection} for {Markov}
  {Equivalence} {Classes} of {Acyclic} {Digraphs}.
\newblock \emph{Communications in Statistics--Theory and Methods}, 25:\penalty0
  2493--2519, 1996.

\bibitem[Meek(1995)]{meek2013causal}
Christopher Meek.
\newblock Causal {Inference} and {Causal} {Explanation} with {Background}
  {Knowledge}.
\newblock In \emph{Proceedings of the 11th Conference on Uncertainty in
  Artificial Intelligence (UAI 1995)}, pages 403--410, 1995.

\bibitem[Pearl(2009)]{Pearl2009}
Judea Pearl.
\newblock \emph{{Causality}: {Models}, {Reasoning} and {Inference}}.
\newblock Cambridge University Press, 2009.

\bibitem[Ren et~al.(2020)Ren, Oviedo, Thway, Tian, Wang, Xue, Dario~Perea,
  Layurova, Heumueller, Birgersson, Aberle, Brabec, Stangl, Li, Sun, Lin,
  Peters, and Buonassisi]{ren_embedding_2020}
Zekun Ren, Felipe Oviedo, Maung Thway, Siyu I.~P. Tian, Yue Wang, Hansong Xue,
  Jose Dario~Perea, Mariya Layurova, Thomas Heumueller, Erik Birgersson,
  Armin~G. Aberle, Christoph~J. Brabec, Rolf Stangl, Qianxiao Li, Shijing Sun,
  Fen Lin, Ian~Marius Peters, and Tonio Buonassisi.
\newblock Embedding {Physics} {Domain} {Knowledge} into a {Bayesian} {Network}
  {Enables} {Layer}-by-{Layer} {Process} {Innovation} for {Photovoltaics}.
\newblock \emph{NPJ Computational Materials}, 6\penalty0 (1):\penalty0 9,
  January 2020.

\bibitem[Rose et~al.(1976)Rose, Tarjan, and Lueker]{rose1976algorithmic}
Donald~J. Rose, R.~Endre Tarjan, and George~S. Lueker.
\newblock {Algorithmic} {Aspects} of {Vertex} {Elimination} on {Graphs}.
\newblock \emph{SIAM Journal on Computing}, 5\penalty0 (2):\penalty0 266--283,
  1976.

\bibitem[Talvitie and Koivisto(2019)]{talvitie2019counting}
Topi Talvitie and Mikko Koivisto.
\newblock {Counting} and {Sampling} {Markov} {Equivalent} {Directed} {Acyclic}
  {Graphs}.
\newblock In \emph{Proceedings of the 33rd AAAI Conference on Artificial
  Intelligence (AAAI 2019)}, volume~33, pages 7984--7991, 2019.

\bibitem[Verma and Pearl(1990)]{verma1991equivalence}
Thomas Verma and Judea Pearl.
\newblock {Equivalence} and {Synthesis} of {Causal} {Models}.
\newblock In \emph{Proceedings of the 6th Conference on Uncertainty in
  Artificial Intelligence (UAI 1990)}, pages 220--227, 1990.

\bibitem[Wien{\"o}bst et~al.(2021)Wien{\"o}bst, Bannach, and
  Li{\'s}kiewicz]{wienobst2020polynomial}
Marcel Wien{\"o}bst, Max Bannach, and Maciej Li{\'s}kiewicz.
\newblock Polynomial-{Time} {Algorithms} for {Counting} and {Sampling} {Markov}
  {Equivalent} {DAGs}.
\newblock In \emph{Proceedings of the 35th AAAI Conference on Artificial
  Intelligence (AAAI 2021)}, pages 12198--12206, 2021.

\end{thebibliography}


\end{document}
