\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions (loaded once; it also
                       % provides \DeclareMathOperator and \DeclarePairedDelimiter below)
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{multirow} % table cells spanning several rows
\usepackage[dvipsnames]{xcolor} % dvipsnames provides ForestGreen, used by the TikZ leaf-node style
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{arrows,positioning}



\usepackage[T1]{fontenc}
\usepackage{float}
\usepackage{amsfonts}
\usepackage{dsfont}
\usepackage{enumitem}
% NOTE(review): algpseudocode and algorithm2e are alternative pseudocode
% packages; both are kept because other parts of the paper may rely on either.
\usepackage{algpseudocode}
\usepackage[algo2e,ruled,linesnumbered]{algorithm2e}
% Name of the proposed algorithm, typeset in small caps.
\newcommand{\alg}{{\scshape kicn}}
% Discards its argument; used to disable text or figure code without deleting it.
\newcommand{\eat}[1]{}
% Operators whose limits are set underneath in display style.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
% Paired delimiters: \ceil{x} and \floor{x}; the starred forms auto-size.
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}
% \usepackage[font=small]{caption}
\setlength{\algomargin}{1.2em}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Knowledge Intensive Learning of Cutset Networks}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<SaurabhSanjay.Mathur@utdallas.edu>?Subject=Your UAI 2023 paper}{Saurabh Mathur}{}}
\author[1]{Vibhav Gogate}
\author[1]{Sriraam Natarajan}

% Add affiliations after the authors
\affil[1]{%
    Erik Jonsson School of Engineering \& Computer Science\\
    University of Texas at Dallas\\
    Richardson, Texas, USA
}
  
\begin{document}
\maketitle

\begin{abstract}
  Cutset networks (CNs) are interpretable probabilistic representations that combine probability trees and tree Bayesian networks, to model and reason about large multi-dimensional probability distributions. Motivated by high-stakes applications in domains such as healthcare where (a) rich domain knowledge in the form of qualitative influences is readily available and (b) use of interpretable models that the user can efficiently probe and infer over is often necessary, we focus on learning CNs in the presence of qualitative influences. We propose a penalized objective function that uses the influences as constraints, and develop a gradient-based learning algorithm, \alg{}. We show that because CNs are tractable, \alg{} is guaranteed to converge to a local maximum of the penalized objective function. Our experiments on several benchmark data sets show that our new algorithm is superior to the state-of-the-art, especially when the data is scarce or noisy. 
\end{abstract}

\section{Introduction}\label{sec:intro}

\begin{figure*}[ht]
    \centering
    \resizebox{0.55\linewidth}{!}{
    
        % Node styles for the left panel. treenode is the shared base style;
        % or_node draws a black circle; leaf_node draws a dashed ForestGreen
        % rectangle (used for the $T_i$ nodes in the picture below).
        \tikzset{
  treenode/.style = {align=center, inner sep=0pt, text centered,
    font=\sffamily},
  or_node/.style = {treenode, circle, black, font=\tiny\sffamily, draw=black,
    fill=none, text width=0.5cm},  
  leaf_node/.style = {treenode, rectangle, ForestGreen, font=\tiny, draw=ForestGreen,dashed,
    fill=none, text width=0.5cm, minimum height=0.5cm}
}

% Left panel of the figure. The tree is laid out by hand: every node is placed
% relative to a sibling or to a helper coordinate (c1, c11, ...), then the
% edges are drawn explicitly with their numeric labels.
\begin{tikzpicture}[node distance=0.8cm]
% Root node.
\node[or_node, name=n1]{BMI};
% Second level: the four children of the root (n11..n14), anchored at c1.
\coordinate[below of = n1,xshift=-0.5cm,yshift=-0.5cm](c1);
\node[or_node,name=n11,left of = c1,xshift=-2cm]{PRS};
\node[or_node,name=n12,right of = n11,xshift=1.25cm]{Age};
\node[or_node,name=n13,right of = n12,xshift=1.25cm]{Age};
\node[or_node,name=n14,right of = n13,xshift=1.25cm]{PCOS};

% Third level, children of n11 (n21..n24), anchored at c11.
\coordinate[below of=n11,xshift=0.5cm,yshift=-0.5cm](c11);
\node[or_node,name=n21,left of = c11,xshift=-0.8cm]{Age};
\node[or_node,name=n22,right of = n21,xshift=-0.2cm]{Age};
\node[or_node,name=n23,right of = n22,xshift=-0.2cm]{Age};
\node[or_node,name=n24,right of = n23,xshift=-0.2cm]{Age};

% Third level, children of n12 (n25..n28), anchored at c12.
\coordinate[below of=n12,xshift=0cm,yshift=-0.5cm](c12);

\node[leaf_node,name=n25,left of = c12,xshift=0.05cm]{$T_1$};
\node[or_node,name=n26,right of = n25,xshift=-0.2cm]{METs};
\node[or_node,name=n27, right of = n26,xshift=-0.2cm]{PRS};
\node[or_node,name=n28,right of = n27,xshift=-0.2cm]{METs};

% Third level, children of n13 (n29..n32), anchored at c13.
\coordinate[below of=n13,xshift=1.2cm,yshift=-0.5cm](c13);
\node[leaf_node,name=n29,left of = c13,xshift=-0.8cm]{$T_2$};
\node[or_node,name=n30,right of = n29,xshift=-0.2cm]{PRS};
\node[or_node,name=n31,right of = n30,xshift=-0.2cm]{METs};
\node[leaf_node,name=n32,right of = n31,xshift=-0.2cm]{$T_3$};

% Third level, children of n14 (n33, n34), anchored at c14.
\coordinate[below of=n14,xshift=-0.5cm,yshift=-0.5cm](c14);
\node[or_node,name=n33,right of = c14,xshift=-0.4cm]{Age};
\node[or_node,name=n34,right of = n33,xshift=-0.2cm]{HiBP};


% Edges from the root to the second level. Each label sits at the midpoint of
% its edge, rotated along it (sloped), on a white box with 0.5pt padding so the
% line does not run through the number.
\draw (n1) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.17}
}(n11);
\draw  (n1) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.59}
}(n12);
\draw   (n1) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.20}
}(n13);
\draw  (n1) --  node[sloped,midway,align=center]{
   \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.04}
}(n14);
% Edges from n11 (PRS) to its children.
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.27}
}(n21);
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.36}
}(n22);
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.19}
}(n23);
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.18}
}(n24);
% Edges from n12 (Age) to its children.
\draw (n12) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.01}
}(n25);
\draw (n12) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.44}
}(n26);
\draw (n12) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.53}
}(n27);
\draw (n12) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.02}
}(n28);
% Edges from n13 (Age) to its children.
% NOTE(review): these four labels repeat the labels below n12 exactly --
% confirm this matches the learned model and is not a copy-paste leftover.
\draw (n13) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.01}
}(n29);
\draw (n13) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.44}
}(n30);
\draw (n13) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.53}
}(n31);
\draw (n13) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.02}
}(n32);
% Edges from n14 (PCOS) to its children.
\draw (n14) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.80}
}(n33);
\draw (n14) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.20}
}(n34);
\end{tikzpicture}
    }
    \resizebox{0.44\linewidth}{!}{
    
% Node styles for the right panel; same definitions as for the left panel.
% treenode is the shared base style, or_node a black circle, leaf_node a
% dashed ForestGreen rectangle.
\tikzset{
  treenode/.style = {align=center, inner sep=0pt, text centered,
    font=\sffamily},
  or_node/.style = {treenode, circle, black, font=\tiny\sffamily, draw=black,
    fill=none, text width=0.5cm},  
  leaf_node/.style = {treenode, rectangle, ForestGreen, font=\tiny, draw=ForestGreen,dashed,
    fill=none, text width=0.5cm, minimum height=0.5cm}
}
\eat{
\begin{tikzpicture}[]level 1/.style={level distance = 1cm, sibling distance=1.5cm}, level 2/.style={level distance = 1cm, sibling distance=0.5cm}] 
\tikzstyle{edge from parent}=[draw,black]
\node [or_node] {PRS} 
    child{ node [or_node] {BMI}
            child{ 
                node [or_node] {GDM}  
                edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.17}}
            }
            child{ 
                node [or_node] {PCOS}  
                edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.59}}
            }
            child{
                node [or_node] {HiBP}  
                edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.20}}
            }
            child{ 
                node [or_node] {PCOS}  
                edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.04}}
            }
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.25}}
            %edge from parent node[sloped, auto=left] {\tiny $0.0003$}
    } 
   child{ node [or_node] {Hist}  
            child{ node [or_node] {METs}
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.83}}
            }
            child{ node [or_node] {PCOS}
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.17}}
            }
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.28}}
            %edge from parent node[sloped, auto=left] {\tiny $0.0230$}
    } 
   child{ node [or_node] {HiBP}  
            %edge from parent node[sloped, auto=left] {\tiny $0.0230$}
            child{ node [or_node] {Hist}  
                edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.98}}
            }
            child{ node [leaf_node] {$T_2$}
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.02}}
            }
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.21}}
            %edge from parent node[sloped, auto=left] {\tiny $0.0003$}
    } 
   child{ node [or_node] {Hist}  
            child{ node [or_node] {PCOS}
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.78}}
            }
            child{ node [or_node] {HiBP}
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.22}}
            }
            edge from parent node[sloped] {\colorbox{white}{\tiny \sffamily 0.26}}
    } 
; 
\end{tikzpicture}
}

% Right panel of the figure. Laid out by hand like the left panel: nodes are
% placed relative to siblings or helper coordinates, then edges are drawn
% explicitly with their numeric labels.
\begin{tikzpicture}[node distance=0.8cm]
% Root node.
\node[or_node, name=n1]{PRS};
% Second level: the four children of the root (n11..n14), anchored at c1.
\coordinate[below of = n1,xshift=-0.5cm,yshift=-0.5cm](c1);
\node[or_node,name=n11,left of = c1,xshift=-0.5cm]{BMI};
\node[or_node,name=n12,right of = n11,xshift=0.5cm]{Hist};
\node[or_node,name=n13,right of = n12,xshift=0.5cm]{HiBP};
\node[or_node,name=n14,right of = n13,xshift=0.5cm]{Hist};

% Third level, children of n11 (n21..n24), anchored at c11.
\coordinate[below of=n11,xshift=0.5cm,yshift=-0.5cm](c11);
\node[or_node,name=n21,left of = c11,xshift=-0.8cm]{GDM};
\node[or_node,name=n22,right of = n21,xshift=-0.2cm]{PCOS};
\node[or_node,name=n23,right of = n22,xshift=-0.2cm]{HiBP};
\node[or_node,name=n24,right of = n23,xshift=-0.2cm]{PCOS};

% Third level, children of n12 (n25, n26), anchored at c12.
\coordinate[below of=n12,xshift=-1cm,yshift=-0.5cm](c12);
\node[or_node,name=n25,left of = c12,xshift=1.8cm]{METs};
\node[or_node,name=n26,right of = n25,xshift=-0.2cm]{PCOS};

% Third level, children of n13 (n27, n28), anchored at c13.
\coordinate[below of=n13,xshift=-0.75cm,yshift=-0.5cm](c13);
\node[or_node,name=n27,left of = c13,xshift=1.5cm]{Hist};
\node[leaf_node,name=n28,right of = n27,xshift=-0.1cm]{$T_1$};


% Third level, children of n14 (n29, n30), anchored at c14.
\coordinate[below of=n14,xshift=-0.5cm,yshift=-0.5cm](c14);
\node[or_node,name=n29,left of = c14,xshift=1.4cm]{PCOS};
\node[or_node,name=n30,right of = n29]{HiBP};
% Edges from the root to the second level. They start at explicit angular
% anchors on the root's border (225, 250, 275 and 300 degrees) and end at
% named anchors of the children. Labels sit on a white box with 0.5pt padding.
\draw (n1.225) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.25}
}(n11.north east);
\draw  (n1.250) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.28}
}(n12.north);
\draw   (n1.275) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.21}
}(n13.north west);
\draw  (n1.300) --  node[sloped,midway,align=center]{
   \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.26}
}(n14.north west);
% Edges from n11 (BMI) to its children.
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.17}
}(n21);
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.59}
}(n22);
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.20}
}(n23);
\draw (n11) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.04}
}(n24);
% Edges from n12 (Hist) to its children.
\draw (n12) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.83}
}(n25);
\draw (n12) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.17}
}(n26);
% Edges from n13 (HiBP) to its children.
\draw (n13) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.98}
}(n27);
\draw (n13) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.02}
}(n28);
% Edges from n14 (Hist) to its children.
\draw (n14) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.78}
}(n29);
\draw (n14) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{0.5pt}\colorbox{white}{\tiny \sffamily 0.22}
}(n30);

\end{tikzpicture}
    }
    \caption{\small The first 3 levels of cutset networks (CNs) learned from the nuMoM2b-b dataset using LearnCNet (left) and using \alg{} (right). While Polygenic Risk Score (PRS) is a very important risk factor for Gestational Diabetes in the literature~\citep{PRSPagel2022}, the CN learned from data has it in the second and third levels. Combining data with monotonicities obtained from the domain expert allows the CN on the right to select PRS as the root node. Moreover, the CN on the right is more concise (and thus more interpretable). 
    }
    \label{fig:gdm_cnet1}
    %\vspace{-1em}
\end{figure*}

% Probabilistic Circuits: A Unifying Framework for Tractable Probabilistic Models (Not published in a conference/journal)
Recently, there has been a growing interest in learning tractable probabilistic models (TPMs) or probabilistic circuits~\citep{Choi20} from data. The key advantage of these models is that they admit tractable, and in most cases, linear time \textit{exact} probabilistic inference as opposed to traditional probabilistic graphical models such as Bayesian and Markov networks (BNs and MNs) which require the use of approximate inference methods. %These approximate methods have large error rates in practice and therefore both predictions and explanations derived from them are often unreliable (in that they are not faithful to the underlying model).  This is especially problematic in high-stakes applications such as healthcare where it is necessary to validate, verify and explain the predictions made (via probabilistic inference). Thus, it is natural to prefer tractable models over Bayesian and Markov networks in such applications.
We consider a restricted class of TPMs called {\em cutset networks}~\citep{CNetsRahman2014} that are inspired by Pearl's cutset conditioning~\citep{Pearl1988,bidyuk2004}. These are essentially a combination of OR trees and tree BNs where the leaves of the OR tree are tree BNs. Cutset networks are tractable in that many reasoning queries such as computing the marginal probability over a subset of variables given observations and finding the most likely explanation for evidence can be solved in time that scales linearly in the size of the network. Another key virtue of cutset networks is that, unlike state-of-the-art TPMs such as arithmetic circuits~\citep{Darwiche03} and sum-product networks~\citep{SPNPoon2011}, they are also interpretable~\citep{rahman-icml19}\textemdash another key property that is necessary for models used in high-stakes applications.

%and thin junction trees~\cite{Bach2001}, cutset networks also allow for tractable learning~\cite{CNetsRahman2014} including a recent work on learning ensembles of these models~\cite{Rahman2016}.


% While efficient, these models are learned purely from the data. 
Currently, state-of-the-art algorithms for learning \textit{tractable, interpretable} cutset networks~\citep{CNetsRahman2014,rahman-icml19,mauro15} use training data alone. Recently, there has been a surge in developing systems that can effectively use human inputs that range from decision boundary constraints~\citep{Fung2002,Towell1994,Kunapuli2010,Kunapuli2013}, label preferences~\citep{OdomKPLL2015}, misclassification costs~\citep{Yang2014}, and privileged information~\citep{Vapnik2009,Sharmanska2013} to qualitative influences~\citep{AltendorfEtAl2005,YangEtAl2013,KiGBKokel2020}. Our key hypothesis in this work is that such knowledge can potentially allow for effective learning of cutset networks in settings where data is scarce or noisy.

Specifically, we consider a type of qualitative influence called \textit{monotonicities}, as an {\em inductive bias} when learning cutset networks. We consider this task in the context of a real-world problem, that of modeling gestational diabetes from a clinical study~\citep{numom2b}, a high-stakes application where using tractable, interpretable models such as cutset networks is necessary. The monotonicities obtained from the domain expert (a physician in our case) will serve as 
constraints on the model and allow for learning a more robust model. We develop a novel learning framework, {\em Knowledge Intensive Learning of Cutset Networks} (\alg{}) that enforces these constraints during either structure or parameter learning. Consequently, we present two variations of our learning algorithm -- one for parameter learning and one for structure learning. 

%on either a {\em local} or {\em global} level of the cutset network.  Consequently, we present two variations of our learning algorithm. 

%To understand the impact of knowledge in learning, consider the learned models in Figure~\ref{fig:gdm_cnet1}. The left part is learned only from the data while the right one is learned with data and knowledge for modeling gestational diabetes~\citep{numom2b}. It can be easily observed that while the data only model uses BMI as the top feature in modeling gestational diabetes. While the second model that uses domain knowledge from clinicians, faithfully constructs an interpretable model that employs polygenic risk scores (PRS) that have been viewed as a top indicator of gestational diabetes. These are precisely the type of models that we aim to learn using our \alg{} algorithm. 

To understand the impact of knowledge in learning, consider the learned models in Figure~\ref{fig:gdm_cnet1}. The model on the left is learned from data alone while the one on the right is learned from data and knowledge for modeling gestational diabetes~\citep{numom2b}. We observe that while the model learned from data alone uses BMI as the top feature in modeling gestational diabetes, the model learned using both data and domain knowledge from clinicians uses polygenic risk scores (PRS), a top risk factor for gestational diabetes according to the literature~\citep{PRSPagel2022}. These are precisely the type of models that we aim to learn using our \alg{} algorithm. 


We make the following key contributions: (1) As far as we are aware, we present the first work on employing rich qualitative information as inductive biases when learning tractable probabilistic models. (2) We develop an efficient learning framework (\alg{}) that uses this qualitative information as constraints during learning. (3) We outline a few variations of this framework based on where and how the constraints are enforced. (4) We perform extensive evaluation of the proposed framework on many standard data sets and demonstrate the  superiority of the proposed algorithm on two evaluation measures: test set log-likelihood score and mean-squared error on conditional probability queries. Most importantly, we present results on a real, high-impact, gestational diabetes data set where these learned models allow for interpretability and hence, can result in building effective treatment plans.


%The rest of the paper is organized as follows -- after providing the necessary background, we present our learning framework. Then we present the different adaptations of this framework (\alg{}) with the corresponding algorithms. Finally, we present our extensive empirical evaluation on several standard data sets and a real gestational diabetes data set before concluding by outlining areas of future research.



\section{Background}

%Let $X=\{X_1,\ldots,X_n\}$ denote a set of $n$ \textit{ordinal} random variables. Let $domain(X_i)$, where $1\leq i \leq n$ denote the domain of $X_i$, namely it denotes the set of values that $X_i$ can take. Let $x=(x_1,\ldots,x_n)$ denote an assignment to all variables in $X$ such that $x_i \in domain(X_i)$.



\subsection{Cutset Networks}

Cutset networks~\citep{CNetsRahman2014} are a class of tractable probabilistic models that compactly represent large multi-dimensional joint probability distributions. They combine two interpretable and tractable representations: OR probability trees and tree BNs.

%They consist of a rooted OR probability tree with tree-structured Bayesian Networks at each of its leaves. Each node in the OR tree is labeled with a random variable and each edge corresponds to a value assignment to the variable associated with its parent OR node. Each edge in the OR tree is labeled with a conditional probability of the corresponding value assignment to its parent OR node given the assignment from the root of the OR tree to its parent. Each tree Bayesian network represents the conditional probability distribution over all variables that are not included in the OR nodes on the path from the root node to the Bayesian network given the assignment from the root to the leaf node.

Formally, given a set of variables $X=\{X_1,\ldots,X_n\}$, a cutset network is defined as a pair $\mathcal{M} = (O, T)$ where $O$ is an OR tree having $l$ leaves where each OR node is labeled with a variable $X_i \in X$ and $T = \{ T_1, \dots, T_l\}$ is a set of tree BNs such that $T_j\in T$ is associated with the $j$-th leaf node of $O$. Similar to decision trees, we assume that each variable $X_i \in X$ appears \textit{at most once} on the path from the root to a leaf node in $O$. OR nodes in $O$ represent conditioning and each OR node labeled with $X_i$ has $|domain(X_i)|$ children, one for each value in $domain(X_i)$. Each edge from a parent OR node labeled with $X_i \in X$ to a child OR node labeled with $X_j \in X$ (or a leaf node $T_k$) is labeled with the conditional probability of $X_i$ taking the corresponding value given the assignment from the root node to the parent. Each tree BN $T_j \in T$ represents the conditional probability distribution over all variables from the set $X$ that are not included in the OR nodes on the path from the root node to $T_j$ given the assignment on the path from the root node to $T_j$.

Given an assignment (data-point) $x=(x_1,\ldots,x_n)$ to all variables in the set $X$, let $z = l(x)$ be the leaf node corresponding to $x$, $path_O(z)$ be the path from the root of the OR tree $O$ to the leaf $z$, $V_z$ be the variables in $X$ that are not included on the OR nodes in $path_O(z)$, and $W_z$ be the set of conditional probability labels on the edges in $path_O(z)$.  Then, the  joint probability distribution induced by the cutset network $\mathcal{M}$ is
\begin{align}
    P_{\mathcal{M}}(x) = \left( \prod_{w \in W_z} w \right)  P^{z}(x_{V_{z}}) 
\end{align}
where $x_{V_z}$ is the projection of $x$ on the subset $V_z$ of $X$. We assume that each tree BN $T_z \in T$ is defined by the parent map $\text{Pa}^{z}\colon V_z \to V_z$ and parameters $\theta^z$ as $T_z = (\text{Pa}^z, \theta^z)$. Further, we use the shorthand $\text{Pa}^z_i$ to refer to $\text{Pa}^{z}(X_i)$. Then, the probability distribution at each leaf is
\begin{align*}
    %P^z(x_{V_{z}}) &=& \prod_{i\in V_z{(x)}} P_i^{z} (x_i \mid \text{Pa}^{z}_i = x_{\text{Pa}^{z}_i})\\
     %&=& \prod_{i\in V_z{(x)}} \theta^{z}_{ijk} 
     P^z(x_{V_{z}}) = \prod_{X_i\in V_z} P_i^{z} (x_i \mid x_{\text{Pa}^{z}_i})=\prod_{X_i\in V_z} \theta^{z}_{ijk} 
\end{align*}
where $\theta^{z}_{ijk}$ is the conditional probability that the random variable $X_i$ at the leaf $z$ has the value $k$ given that its parent $\text{Pa}^{z}_i$ has the value $j$. \eat{In order to encode the constraints $\theta^{z}_{ijk} \leq 0$ and $\sum_{k'} \theta^{z}_{ijk'} = 1$, we parameterize $\theta^{z}$ using the softmax function, $S$ as  $\theta^{z}_{ijk} = S(\mu^{z}_{ij})_k$.} Let $W$ and $\theta$ denote the set of parameters associated with $O$ \eat{the OR tree }and $T$ \eat{all tree BNs }respectively.

Cutset networks are learned using the LearnCNet algorithm~\citep{CNetsRahman2014,mauro15}. It works by recursively splitting the given data on a heuristically selected variable until a termination condition is reached. For example, termination conditions could be defined in terms of the number of remaining variables or the amount of remaining data or both. When a termination condition is met, the Chow-Liu algorithm~\citep{ChowLiu1968} is used to fit a tree BN. The most widely used heuristic for variable selection is the maximum pairwise mutual information score heuristic where we select a variable having the maximum sum of pairwise mutual information.


\begin{figure}[t]
    \centering
    \resizebox{0.7\linewidth}{!}{
    
        % Node styles for the example cutset network. treenode is the shared
        % base style; or_node is a black circle; leaf_node is a dashed
        % ForestGreen rectangle for the tree BNs. product_node and sum_node are
        % defined here but not referenced by the tikzpicture in this figure.
        \tikzset{
  treenode/.style = {align=center, inner sep=0pt, text centered,
    font=\sffamily},
  or_node/.style = {treenode, circle, black, font=\sffamily\tiny, draw=black,
    fill=none, text width=1.5em},  
  product_node/.style = {treenode, circle, black, font=\sffamily\bfseries, draw=none,
    fill=blue!20, text width=1.5em},  
  sum_node/.style = {treenode, circle, black,  font=\sffamily\bfseries, draw=none, 
    fill=red!20, text width=1.5em},
  leaf_node/.style = {treenode, rectangle, ForestGreen, font=\tiny, draw=ForestGreen,dashed,
    fill=none, text width=2em, minimum height=2em}
}
\eat{
\begin{tikzpicture}[level/.style={level distance = 1.2cm}, level 1/.style={sibling distance=2.2cm}, level 2/.style={sibling distance=1.2cm}, level 4/.style={sibling distance=1cm}] 
\tikzstyle{edge from parent}=[draw,black]
\node [or_node] {$X_1$} 
    child{ node [or_node] {$X_2$}  
            child{ node [leaf_node] {$T_1$}
            edge from parent node[sloped] {\colorbox{white}{\sffamily\tiny 0.8}}
            }
            child{ node [leaf_node] {$T_2$}
            edge from parent node[sloped] {\colorbox{white}{\sffamily\tiny 0.2}}
            }
            edge from parent node[sloped] {\colorbox{white}{\sffamily\tiny 0.3}}
    } 
   child{ node [or_node] {$X_3$}  
            child{ node [leaf_node] {$T_3$} 
            edge from parent node[sloped] {\colorbox{white}{\sffamily\tiny 0.55}}
            } 
            child{ node [leaf_node] {$T_4$} 
            edge from parent node[sloped] {\colorbox{white}{\sffamily\tiny 0.45}}
            }
            edge from parent node[sloped] {\colorbox{white}{\sffamily\tiny 0.7}}
    };

\end{tikzpicture}
}

% Example cutset network: root X_1, OR nodes X_2 and X_3 below it, and four
% tree-BN leaves T_1..T_4. Nodes are placed by hand relative to the helper
% coordinates c1, c2 and c3; edges are drawn explicitly with their labels.
\begin{tikzpicture}[node distance=0.7cm]
% Root node.
\node[or_node, name=n1]{$X_1$};
% Second level: X_2 to the left and X_3 to the right of c1.
\coordinate[below of = n1,yshift=-0.25cm](c1);
\node[or_node,name=n11,left of = c1, xshift=-0.5cm]{$X_2$};
\node[or_node,name=n12,right of = c1, xshift=0.5cm]{$X_3$};
% Leaves below X_2.
\coordinate[below of = n11,yshift=-0.25cm](c2);
\node[leaf_node,name=n21,left of = c2]{$T_1$};
\node[leaf_node,name=n22,right of = c2]{$T_2$};
% Leaves below X_3.
\coordinate[below of = n12,yshift=-0.25cm](c3);
\node[leaf_node,name=n23,left of = c3]{$T_3$};
\node[leaf_node,name=n24,right of = c3]{$T_4$};
% Edges from the root. Each label sits at the midpoint of its edge, rotated
% along it, on a white box with 1pt padding.
\draw (n1.south west) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{1pt}\colorbox{white}{\tiny \sffamily 0.3}
}(n11.north east);
\draw  (n1.south east) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{1pt}\colorbox{white}{\tiny \sffamily 0.7}
}(n12.north west);
% Edges from X_2 to T_1 and T_2.
\draw   (n11.south west) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{1pt}\colorbox{white}{\tiny \sffamily 0.8}
}(n21.north);
\draw  (n11.south east) --  node[sloped,midway,align=center]{
   \setlength{\fboxsep}{1pt}\colorbox{white}{\tiny \sffamily 0.2}
}(n22.north);
% Edges from X_3 to T_3 and T_4.
\draw   (n12.south west) --  node[sloped,midway,align=center]{
    \setlength{\fboxsep}{1pt}\colorbox{white}{\tiny \sffamily 0.4}
}(n23.north);
\draw  (n12.south east) --  node[sloped,midway,align=center]{
   \setlength{\fboxsep}{1pt}\colorbox{white}{\tiny \sffamily 0.6}
}(n24.north);
\end{tikzpicture}
    }
    \caption{A cutset network over 5 binary random variables $\{X_1, \dots, X_5\}$. The children for each node are shown in ascending order of their corresponding values. Here, $T_1, \dots, T_4$ are tree Bayesian networks over the variables not included on the path from the root. Each edge label represents a conditional probability and can be interpreted as the proportion of data points belonging to the corresponding data partition.  
    %For example, the label on the edge $(X_2, T_1)$ represents $P(X_2 = 0 \mid X_1 = 0) = 0.8$.  
    }
    \label{fig:example_cnet}
    %\vspace{-1em}
\end{figure}
Figure~\ref{fig:example_cnet} shows a cutset network defined over a set of 5 binary valued random variables $\{X_1, \dots, X_5\}$. The nodes of the OR tree are shown as circular nodes, while tree BNs are shown as dashed square nodes labeled $T_1, \dots, T_4$. $T_1$ and $T_2$ represent the conditional probability distributions over $\{X_3, X_4, X_5\}$ given the assignments $(X_1=0,X_2=0)$ and $(X_1=0,X_2=1)$, respectively. $T_3$ and $T_4$ represent the conditional probability distributions over $\{X_2, X_4, X_5\}$ given the assignments $(X_1=1,X_3=0)$ and $(X_1=1,X_3=1)$, respectively. 

Cutset networks are tractable probabilistic models~\citep{CNetsRahman2014}. Queries such as finding the most probable explanation and computing the marginal probability distribution at each variable given observations can be answered in time that scales linearly with the size (number of parameters) of the network. In addition to being tractable, cutset networks are also interpretable~\citep{rahman-icml19} because they consist of OR-trees and tree BNs. OR-trees are structured like probabilistic decision trees and hence inherit their interpretability. Tree BNs at the leaves of the OR-tree are interpretable because all of their nodes correspond to observed variables. The parameters of both the OR-tree and the tree BNs have probabilistic interpretations because they are conditional probabilities of observed variables. 
%since the OR tree is a probabilistic decision tree while the tree BNs~\citep{Pearl1988} are sparse and thus highly interpretable. 
For example, in Figure~\ref{fig:example_cnet}, the path from the root node to $T_1$ can be interpreted as follows. The probability that $X_1$ takes the value $0$ is $0.3$; given $X_1=0$, the probability that $X_2$ takes the value $0$ is $0.8$; given $(X_1=0,X_2=0)$ the conditional distribution over the remaining variables is given by a \eat{sparse }tree BN $T_1$. 
These two properties -- tractability and interpretability -- make cutset networks {\bf a natural fit for high-stakes domains like healthcare} that require the models to be interpretable while being able to answer certain queries exactly in order to build trust with the domain expert~\citep{Rudin19}. 



\subsection{Qualitative influences in probabilistic models}

We approach the problem of learning interpretable and tractable probabilistic models for high-stakes domains by using qualitative influences given by a domain expert as an inductive bias.
% to learn cutset networks. 
As far as we are aware, this is the first work on learning interpretable tractable probabilistic models using qualitative influences. Qualitative influences have been previously used for learning probabilistic models~\citep{Wellman1990}. For instance, they have been used to learn more accurate discriminative models in the presence of noisy and sparse data~\citep{KiGBKokel2020, OdomKPLL2015}. However, their use in learning generative models has been limited to BNs~\citep{AltendorfEtAl2005, Campos2008QIBN, YangEtAl2013}, where exact inference is intractable in general~\citep{Cooper90}.
% TODO: Reverse

Additionally, while generative models like Sum-Product Networks (SPNs)  guarantee tractable inference~\citep{SPNPoon2011}, they are hard to explain because their internal nodes do not correspond to any observed variables.  The probabilistic prior constraints that have been used to learn SPNs~\citep{SPNConstraintsPapantonis2020} are harder to elicit from experts. In contrast, the structure of cutset networks makes it natural to encode a variety of domain knowledge such as conditional independences, context-specific independences, deterministic constraints, and quantitative constraints~\citep{Chavira2007,GogateDomingos2010,CNetsRahman2014}. In this work, we propose to learn cutset networks using qualitative influences which are easier to elicit from experts in domains like healthcare.




Concretely, we consider a specific type of qualitative influence called {\em monotonic influence}~\citep{AltendorfEtAl2005}. A random variable $X_j$ is said to positively monotonically influence another random variable $X_i$ if an increase in the value of $X_j$ increases the probability of higher values of $X_i$.
 We use ${X_j}_{\prec}^{M+}X_i$ to denote a positive monotonic influence. Similarly, a negative monotonic influence ${X_j}_{\prec}^{M-}X_i$ implies that an increase in the values of $X_j$ decreases the probability of higher values of $X_i$.
 
The positive and negative influences can be expressed respectively using the following constraints:
\begin{align}
\label{eq:pmc}
    \begin{aligned}
        P_\mathcal{M}(X_i \leq c\mid X_j = a) \leq P_\mathcal{M}(X_i \leq c\mid X_j = b)\\
        \forall a > b;\; a,b \in \text{domain}(X_j);\; c \in \text{domain}(X_i)
    \end{aligned}
\end{align}
\begin{align}
\label{eq:nmc}
    \begin{aligned}
        P_\mathcal{M}(X_i \leq c\mid X_j = a) \geq P_\mathcal{M}(X_i \leq c\mid X_j = b)\\
        \forall a > b;\; a,b \in \text{domain}(X_j);\; c \in \text{domain}(X_i)
    \end{aligned}
\end{align}
This form of monotonic influence relation has been expressed in prior work~\citep{AltendorfEtAl2005} in the context of BNs for the case where $X_j$ is a parent of $X_i$. This work on BNs was later extended to learn conditional distributions with causal independence and qualitative constraints~\citep{YangEtAl2013}. These relations were used as margin constraints to learn conditional probability tables. In our work, instead of using monotonic influences as constraints on conditional distributions, we use them as constraints on the joint distribution. 

% To this end, we propose a novel algorithm to learn the parameters of the leaves of cutset networks using qualitative influences as constraints on the joint distribution.

\section{Knowledge Intensive Learning of Cutset networks}

We hypothesize that knowledge in the form of monotonic influence statements integrates well with the patterns learned from data in a cutset network, producing more accurate and concise (and hence more interpretable) models. To test our hypothesis, we propose Algorithm \alg{}, which solves the following problem:

% A cutset network $\mathcal{M}$
\fbox{\centering\begin{minipage}{.95\linewidth}
    
    \textbf{Given:} Dataset ${\mathcal{D} = \{ x^{(i)}\}^{N}_{i=1}}$ over random variables ${X}$ and a set of qualitative influences $C$\\
    \textbf{To Do:} Learn a cutset network $\mathcal{M}\eat{ = (O, T)}$
    \end{minipage}
}\vspace{0.5em}
Mathematically, the above problem can be expressed as the following constrained optimization problem:
\begin{align}
\label{eq:opt1}
\begin{aligned}
    &\underset{\mathcal{M}}{\argmax}\ \mathcal{L}(\mathcal{M}, \mathcal{D})
    &\text{s.t. constraints in $C$}
\end{aligned}
\end{align}
where $\mathcal{L}(\mathcal{M}, \mathcal{D})$ is the log-likelihood of $\mathcal{D}$ w.r.t. $\mathcal{M} = (O, T) $ and is given by
\begin{align}
\label{eq:ll}
     \mathcal{L}(\mathcal{M}, \mathcal{D}) %&= \sum_{x \in \mathcal{D}} \log P_{\mathcal{M}}(x) \\
     &= \sum_{x \in \mathcal{D}}\left( \log P^{z}(x_{V_{z}}) + \sum_{w \in W_z} \log w    \right)
\end{align}
where $z=l(x)$ is the leaf node corresponding to $x$. An issue with the constrained optimization formulation given in Eq. \eqref{eq:opt1} is that it may not have any feasible solutions. For example, since the qualitative influence statements are elicited from a domain expert, they are {\em not guaranteed to be consistent with each other\eat{and with the structure of the network}}. As a result, exact constraint satisfaction {\em may not be possible}. To address this issue, we propose an algorithm to incorporate the qualitative influences in the parameters of Tree BNs. Finally, we propose the \alg{} framework which adapts the LearnCNet framework to leverage qualitative influences for both parameter learning and structure learning.

\subsection{Learning Tree BN Parameters}
Consider a tree BN $T_z$ having scope $V_z$. Let $\mathcal{D}_z$ be a dataset over $V_z$ and $C_z$ be a set of qualitative influences over $V_z$. Given the structure of $T_z$ defined by $\text{Pa}^z$, we define the optimization problem for parameter learning using qualitative influences as 
\begin{align}
\label{eq:opt2}
\begin{aligned}
    &\underset{\theta^z}{\argmax}\ \mathcal{L}_z(T_z, \mathcal{D}_z)
    &\text{s.t. constraints in $C_z$}
\end{aligned}
\end{align}
Here, $\mathcal{L}_z(T_z, \mathcal{D}_z)$ is the log likelihood function and is 
\begin{align}
    \begin{aligned}
        \mathcal{L}_z(T_z, \mathcal{D}_z) &= \sum_{x \in \mathcal{D}_z} \log P^z(x_{V_{z}})\\
       % &= \sum_{x \in \mathcal{D}_z} \sum_{i \in V_{z}} \log P^z_i(X_i = x_{i} \mid X_{pa^z_i} = x_{pa^z_i})\\
        &= \sum_{x \in \mathcal{D}_z} \sum_{X_i \in V_{z}} \log \theta^z_{ijk}
    \end{aligned}
\end{align}
where $k = x_{i}$ and $j = x_{pa^z_i}$.


Inspired by prior work of \citet{AltendorfEtAl2005}, we define the following margin constraint for each positive monotonic influence ${X_j}_{\prec}^{M+}X_i \in C$ (see Eq. \eqref{eq:pmc}) 
%Let $C$ denote the set of qualitative constraints 
%As the qualitative influence statements are elicited from a domain expert, they are {\em not guaranteed to be consistent with each other and with the structure of the network}. Thus, exact constraint satisfaction {\em may not be possible}. Consequently, inspired by prior work \cite{AltendorfEtAl2005}, we define the constraint for each positive monotonic influence ${X_j}_{\prec}^{M+}X_i \in C$ as the margin constraint 
\begin{align}
\label{eq:delta}
\begin{aligned}
\delta_{i,j,+}^{a, b, c} = &P(X_i \leq c\mid X_j = a) \\&-P(X_i \leq c\mid X_j = b) + \epsilon \leq 0
\end{aligned}
\end{align}
where $\epsilon \geq 0$ is a user-defined margin parameter. Similarly, each negative monotonic influence ${X_j}_{\prec}^{M-}X_i \in C$ can be encoded as margin constraint $\delta_{i,j,-}^{a, b, c} \leq 0$.\footnote{Essentially this is similar to the positive monotonic constraint with $a$ and $b$ reversed.}
The key difference from prior work~\citep[see][]{AltendorfEtAl2005} is that we interpret the monotonic influence as constraints on conditional distributions obtained by marginalizing other variables instead of fixing their values.

Using the notation given in Eq. \eqref{eq:delta}, we can express the optimization problem given in Eq. \eqref{eq:opt2} as
\begin{align}
    \nonumber
    \underset{\theta^z}{\argmax}\ \mathcal{L}_z(T_z, \mathcal{D}_z) \quad & \text{s.t.}\\
    \delta_{i,j,+}^{a, b, c} \leq 0 \quad & \forall {X_j}_{\prec}^{M+}X_i \in C\\
    \nonumber
    \delta_{i,j,-}^{a, b, c} \leq 0 \quad & \forall {X_j}_{\prec}^{M-}X_i \in C
\end{align}
A standard approach for solving the above optimization task is to use Lagrangian relaxation~\citep[see, for example,][]{bertsekas96}. A better alternative is the \textit{penalty method}, which we will use in this paper. This method relaxes the constrained optimization problem into an unconstrained one by adding a penalty term to the objective. The penalty term equals the product of a penalty parameter $\lambda$ and a function that is zero when the constraints are satisfied and non-zero (i.e., it penalizes the objective) when they are violated. It then optimizes the value of the penalty parameter $\lambda$ by progressively increasing it (e.g., by multiplying it by 10) until convergence. Several penalty functions have been proposed in the literature. In our work, we use the quadratic penalty.


To simplify our notation, we define the penalty function for each pair $X_i, X_j$ as $\zeta_{i,j} = \sum_c \sum_{a>b} \zeta_{i,j}^{a, b, c}$ where,
\begin{align}
\label{eq:con3}
\begin{aligned}
\zeta_{i,j}^{a, b, c} = \begin{cases}
 \mathds{1}_{\delta_{i,j,+}^{a, b, c} \geq 0}(\delta_{i,j,+}^{a, b, c})^2 & \text{If }{X_j}_{\prec}^{M+}X_i \in C\\
 \mathds{1}_{\delta_{i,j,-}^{a, b, c} \geq 0}(\delta_{i,j,-}^{a, b, c})^2 & \text{If }{X_j}_{\prec}^{M-}X_i \in C\\
0                        & \text{Otherwise}
\end{cases}
\end{aligned}
\end{align}

Using the penalty function given in Eq. \eqref{eq:con3}, we can solve the optimization problem given in Eq. \eqref{eq:opt2} using the following series (indexed by $t$) of penalized problems:
\begin{align}\label{eq:penalized_loglikelihood}
    \underset{\theta^z}{\argmax}\ \mathcal{L}_\text{pl}(T^z, \mathcal{D}_z, t)
\end{align}
where $\mathcal{L}_\text{pl}(T^z, \mathcal{D}_z, t)$ is the penalized log-likelihood and is
\begin{align*}
    \mathcal{L}_\text{pl}(T^z, \mathcal{D}_z, t) = \mathcal{L}_z(T^z, \mathcal{D}_z) - \lambda_t \sum_{i,j}\zeta_{i,j}(T^z, C_z) 
\end{align*}
Here, $t$ denotes the iteration number. At each iteration, we increase $\lambda_t$ (e.g. by a factor of 10), solve the unconstrained problem given in Eq. \eqref{eq:penalized_loglikelihood}, and use the values of $\theta^z$ as the initial guess for the next iteration. As we increase $\lambda_t$, the solution will eventually converge to the solution of the constrained optimization problem given in Eq. \eqref{eq:opt2}~\citep{Luenberger2016}.


\subsubsection{Gradients}
At each iteration $t$, the unconstrained optimization problem given in Eq. \eqref{eq:penalized_loglikelihood} can be solved in practice using a standard gradient ascent procedure. Since the objective is smooth, the gradient ascent will always converge to a local optimum. To complete the description of this gradient ascent procedure, we provide the expressions for gradients in this section. %(detailed derivations are given in the supplemental section).

To encode the constraints $0 \leq \theta^{z}_{ijk} \leq 1$ and $\sum_{k'} \theta^{z}_{ijk'} = 1$, we parameterize $\theta^{z}$ using the softmax function, $S$ as  $\theta^{z}_{ijk} = S(\mu^{z}_{ij})_k$. The gradient of the tree BN distribution with respect to the parameter $\mu^{z}_{ijk}$ is
\begin{align}\label{eq:p_grad}
\begin{aligned}
    \frac{\partial  P^z(x)}{\partial \mu^{z}_{ijk}} = \mathds{1}_{\text{Pa}^{z}_{x_i} = j } \frac{P^z(x)}{\theta^{z}_{ijk'}}  S'(\mu^{z}_{ij})_{k'}, 
\end{aligned}
\end{align}
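where $k' = x_i$ is the value of $X_i$ in $x$ and $S'(\mu^{z}_{ij})_{k'}$ is the partial derivative of the softmax output $S(\mu^{z}_{ij})_{k'}$ with respect to $\mu^{z}_{ijk}$.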

Now, without loss of generality, the gradient of penalty term due to the positive monotonic influence ${X_j}_{\prec}^{M+}X_i \in C$ is
\begin{align}
\begin{aligned}
\frac{\partial \zeta_{i,j}^{a, b, c}}{\partial \mu^{z}_{ijk}}  = 2\mathds{1}_{\delta_{i,j,+}^{a, b, c} \geq 0}\delta_{i,j,+}^{a, b, c}\frac{\partial \delta_{i,j,+}^{a, b, c}}{\partial \mu^{z}_{ijk}},
\end{aligned}
\end{align}
where the gradient of margin constraint is  
\begin{align}
\begin{aligned}
\frac{\partial  \delta_{i,j,+}^{a, b, c}}{\partial \mu^{z}_{ijk}} = & \frac{\partial P^z(X_i \leq c\mid X_j = a)}{\partial \mu^{z}_{ijk}}  \\
-&\frac{\partial P^z(X_i \leq c\mid X_j = b)}{\partial \mu^{z}_{ijk}} 
\end{aligned}
\end{align}
The gradient of each conditional distribution is 
\begin{align}
\begin{aligned}
\frac{\partial P^z(X_i = x_i\mid X_j = a)}{\partial \mu^{z}_{ijk}} =\frac{\frac{\partial P^z(X_i = x_i, X_j = a)}{\partial \mu^{z}_{ijk}}P^z(X_j = a)}{P^z(X_j = a)^2} \\
-\frac{P^z(X_i = x_i,X_j = a) \frac{\partial P^z(X_j = a) }{\partial \mu^{z}_{ijk}}}{P^z(X_j = a)^2},
\end{aligned}
\end{align}
where the gradient of each marginal distribution over a set of variables $Q$ can be computed using Eq.~\eqref{eq:p_grad} as 
\begin{align}
    \begin{aligned}
        \frac{\partial P^z(X_Q = x_Q) }{\partial \mu^{z}_{ijk}}= \sum_{\substack{x' \in \text{domain}(X)\\ \text{s.t. } x'_Q = x_Q}} \frac{\partial P^z(X = x')}{\partial \mu^{z}_{ijk}}
    \end{aligned}
\end{align}

% Let $\mathcal{D}_z$ be the subset of examples $x$ such that $l(x) = z$ and let $C_z$ be the subset of $C$ such that for each ${X_j}_{\prec}^{M+}X_i \in C_z$ and each ${X_j}_{\prec}^{M-}X_i \in C_z$, both $i$ and $j$ are in the scope $V_z$ of the leaf. 

\subsubsection{Parameter Learning Algorithm}

We use these gradients to optimize the penalized loglikelihood over the tree BN distribution (Equation \eqref{eq:penalized_loglikelihood}) using the L-BFGS-B algorithm. We describe the procedure to use the monotonic influences as constraints for the penalized loglikelihoods in Algorithm \ref{alg:FitParametersWithKnowledge}. Here, we iteratively increase the value of $\lambda$ until the penalty term is $0$. We use a parameter $t_\text{max}$ to limit the number of such iterations. 

Algorithm \ref{alg:FitParametersWithKnowledge} can be used to learn the parameters of the leaf distributions of Cutset Networks. Specifically, this can be done by setting $\mathcal{D}_z$ to the set of datapoints $x$ such that $l(x) = z$ and $C_z$ to the subset of $C$ such that for each ${X_j}_{\prec}^{M+}X_i \in C_z$ and each ${X_j}_{\prec}^{M-}X_i \in C_z$, both $i$ and $j$ are in the scope $V_z$ of the leaf. Algorithm \ref{alg:FitLeavesWithKnowledge} describes the procedure to learn leaf distributions of a cutset network using monotonic influences. It selects the data points $\mathcal{D}_z$ and constraints $C_z$ that are applicable to each leaf $z$ using the procedures {\em SelectDatapointsByPath} and {\em SelectInfluencesByScope} before performing the optimization over the parameters which are specific to that leaf.
%
%
\subsection{Variable selection heuristic}
A limitation of the above parameter learning approach is that it can only use the monotonic influences over variables that are present in the scope of leaf nodes. As a result, knowledge about the variables in the internal nodes of the OR tree cannot be incorporated. To address this issue, we propose a variable selection heuristic to incorporate monotonic influences in the OR-tree structure. At internal node $n$, the heuristic score for variable $X_m \in V_n$  is given as 
\begin{align*}
    \begin{aligned}
        \frac{\mathcal{L}(\mathcal{M'}_{nm}, \mathcal{D'}_n)}{|\mathcal{D'}_n|} - \log(|\mathcal{D'}_n|) \sum_{X_i,X_j \in V_n^2}\zeta_{i,j}(\mathcal{M'}_{nm}, C_n) 
    \end{aligned}
\end{align*}
where $\mathcal{D'}_n$ is the set of data points at node $n$, $\mathcal{M'}_{nm}$ is a cutset network of depth 1, rooted at $X_m$, and $\zeta_{i,j}(\mathcal{M'}_{nm}, C_n)$ is the penalty function (Eq.~\eqref{eq:con3}) defined over the cutset network distribution and the subset of qualitative influences $C_n$ that are applicable to scope $V_n$. Note that, up to the constant factor $1/|\mathcal{D'}_n|$, this score is the same as the penalized loglikelihood objective function from Equation \eqref{eq:penalized_loglikelihood} applied to a cutset network of depth 1 with the penalty weight $\lambda_t$ set to $|\mathcal{D'}_n|\log(|\mathcal{D'}_n|)$. Algorithm \ref{alg:ScoreWithKnowledge} describes the procedure to compute this variable selection heuristic score for a variable $X_m$.
%
% \em Knowledge Intensive Learning of Cutset Networks
%\vspace{-0.5em}
\subsubsection{Structure Learning Algorithm}
The knowledge-based parameter learning and variable selection heuristics described above can be integrated into a generalized framework for learning the structure and the parameters of a cutset network using qualitative influences. Algorithm \ref{alg:KILCN} describes the \alg{} algorithm which learns a cutset network recursively like LearnCNet but uses Algorithm \ref{alg:FitParametersWithKnowledge} to learn leaf parameters and uses Algorithm \ref{alg:ScoreWithKnowledge} for the variable selection heuristic. 

\begin{algorithm2e}[t!]
  \SetAlgoLined
  \caption{FitParameters}
  \label{alg:FitParametersWithKnowledge}
  \SetKwInOut{Input}{input}\SetKwInOut{Output}{output}
  \Input{Parent map for a Tree BN $\text{Pa}: X \mapsto X$,\\ 
  Scope of the Chow-Liu Tree $V$,  \\
  Data $\mathcal{D}$, \\
  Set of Monotonic Influences $C$, \\
  Maximum number of tries $t_\text{max}$}
  \Output{Parameters for Tree BN  $\theta$}
  \textbf{initialize:} $\mu = \underset{\mu}{\argmax}\ \mathcal{L}(\mu, \text{Pa}, \mathcal{D}), t = 1$ 
  
  \algorithmiccomment{start with maximum likelihood solution} 
  
  
  $\lambda_1 = 1$
    
  \While{${\sum_{X_i,X_j \in {V}^2}\zeta_{i,j}(\mu, C) \neq 0}$ and $t \leq t_\text{max}$} { \algorithmiccomment{while constraints are not satisfied} 
  
    $\mu = \underset{\mu}{\argmax}\ (\mathcal{L}(\mu, \text{Pa}, \mathcal{D}) - \lambda_t\sum_{X_i,X_j \in {V}^2}\zeta_{i,j}(\mu, C))$ 
    
    
    $\lambda_{t+1} = \lambda_{t} \times 10$ \algorithmiccomment{increase penalty weight} 
    
    $t = t + 1$
    
    
  }
  
  $\theta_{ijk} = S(\mu_{ij})_k,\ \forall i, j, k$ \algorithmiccomment{map into probability space} 
  
  \textbf{return} $\theta$
\end{algorithm2e}

\begin{algorithm2e}[t!]
  \SetAlgoLined
  \caption{FitLeaves}
  \label{alg:FitLeavesWithKnowledge}
  \SetKwInOut{Input}{input}\SetKwInOut{Output}{output}
  \Input{Cutset Network $\mathcal{M} = (O, T)$,\\
  Data $\mathcal{D}$, \\
  Set of Monotonic Influences $C$,\\
  Maximum number of tries $t_\text{max}$}
  \Output{Cutset Network with updated leaf parameters $\mathcal{M'}$}
  \textbf{initialize:} $\mathcal{M'} = \mathcal{M}$
  
   
  
  \For{$z$ in $1, \dots, |T|$}{
    $L_z$ = GetPathToLeaf($O$, $z$)
    
    $\mathcal{D}_z$ = SelectDatapointsByPath($\mathcal{D}$, $L_z$)

    $V_z$ = GetScope($O$, $z$)
    
    $C_z$ = SelectInfluencesByScope(C, $V_z$)

    $\text{Pa}^z$ = GetParentMap($T_z$)

    $\theta^z = \text{FitParameters}(\text{Pa}^z, V_z,\mathcal{D}_z, C_z, t_{max})$

    replace $T'_z \in \mathcal{M'}$ with $(\text{Pa}^z, \theta^z)$
     
  }
  \textbf{return} $\mathcal{M'}$
\end{algorithm2e}

\begin{algorithm2e}[t!]
  \SetAlgoLined
  \caption{ScoreWithKnowledge}
  \label{alg:ScoreWithKnowledge}
  \SetKwInOut{Input}{input}\SetKwInOut{Output}{output}
  \Input{Variable $X_m$,\\
  Scope $V$,\\
  Data $\mathcal{D}$,\\
  Set of Monotonic Influences $C$,\\ 
  Maximum number of tries $t_\text{max}$}
  \Output{Heuristic score for variable $X_m$}

  $O$ = OR-tree of depth 1 defined over $V$, rooted at $X_m$

  $T$ = Chow-Liu Trees at each leaf of $O$

  $\mathcal{M}_\text{init} = (O,T)$
  
  $\mathcal{M} = \text{FitLeaves}(\mathcal{M}_\text{init}, \mathcal{D}, C, t_\text{max})$

  MeanLL = $\frac{1}{|\mathcal{D}|} \mathcal{L}(\mathcal{M}, \mathcal{D}) $
  
  PenaltyTerm =  $\sum_{i,j \in V^2}\zeta_{i,j}(\mathcal{M}, C)$

  $\text{Score} = \text{MeanLL} - \log(|\mathcal{D}|) \cdot \text{PenaltyTerm}$
  
\textbf{return} Score
\end{algorithm2e}

\begin{algorithm2e}[t!]
  \SetAlgoLined
  \caption{\alg{}}
  \label{alg:KILCN}
  \SetKwInOut{Input}{input}\SetKwInOut{Output}{output}
  \Input{Data $\mathcal{D}$,\\
  Scope $V$,\\
  Set of Monotonic Influences $C$,\\ 
  Maximum number of tries $t_\text{max}$}
  \Output{Cutset Network $\mathcal{M}$ learned from $\mathcal{D}$ and $C$}

  \If{Termination condition is satisfied}{
    Pa = Structure of Tree BN using Chow-Liu algorithm
    
    $\theta = \text{FitParameters}(\text{Pa}, V, \mathcal{D}, C, {t_\text{max}})$

    $T = (\text{Pa}, \theta)$
    
    \textbf{return} $T$
  }

  Select a variable $X_m$ as $\underset{X_m \in V}{\argmax}\ \text{ScoreWithKnowledge}(X_m, V, \mathcal{D}, C, t_\text{max})$

  Child = List, W = List
  
  \For{$i$ in $\text{domain}(X_m)$}{
  
    $\mathcal{D}_i$ = $\{ x: x \in \mathcal{D}, x_m = i \}$

    $W_i = \frac{|\mathcal{D}_i|}{|\mathcal{D}|}$
    
    $V_i$ = $V \setminus \{X_m\}$
    
    $C_i$ = SelectInfluencesByScope($C$, $V_i$)
    
    $\text{Child}_i = \text{\alg{}}(\mathcal{D}_i, V_i, C_i, t_\text{max})$
  }
  O = (Child, W)
  
\textbf{return} O
\end{algorithm2e}


%
% Since they explicitly tradeoff between the data and the monotonic influences using a parameter $\lambda$, we can use the global and local penalized loglikelihoods to learn the leaf parameters using the influences either as hard constraints or as soft constraints.
%
% To use the influences as soft constraints, we solve the global and local optimization problems (Equations \ref{eq:global_penalized_loglikelihood_opt} and \ref{eq:local_penalized_loglikelihood_opt}) for some fixed value of $\lambda$. Algorithms \ref{alg:FitGlobalSoftConstraints} and \ref{alg:FitLocalSoftConstraints} describe the procedures to perform to learn leaf parameters of a cutset network using monotonic influences as soft constraints. While Algorithm \ref{alg:FitGlobalSoftConstraints} directly performs the optimization over the penalized loglikelihood, Algorithm \ref{alg:FitLocalSoftConstraints} selects the data points $\mathcal{D}_z$ and constraints $C_z$ that applicable to each leaf $z$ using the procedures {\em SelectDatapointsByPath} and {\em SelectInfluencesByScope} before performing the optimization over the parameters  $\mu^z$ which are specific to that leaf.
%
%
% For the hard constraints case, instead of using a fixed value of $\lambda$, we iteratively increase its value until the penalty term is $0$. We use a parameter $t$ to limit the number of such iterations. We describe the procedure to use the monotonic influences as hard constraints for the global and local penalized loglikelihoods in Algorithms \ref{alg:FitGlobalHardConstraints} and \ref{alg:FitLocalHardConstraints} respectively. 
% Just like in Section \ref{sss:global_optimization}, this local, penalized loglikelihood can be used to learn parameters using the monotonic influences either as soft constraints or as hard constraints. Here, we select the data points $\mathcal{D}_z$ and constraints $C_z$ that applicable to each leaf $z$ using the procedures SelectDatapointsByPath and SelectInfluencesByScope and perform the optimization with soft and hard constraints in Algorithms \ref{alg:FitLocalSoftConstraints} and \ref{alg:FitLocalHardConstraints} respectively.
%
\section{Empirical evaluation}
We aim to answer the following questions explicitly: 
\begin{enumerate}
    \item[({\bf Q1})] Are monotonicities useful in learning cutset networks from noisy and sparse data?
    \item[({\bf Q2})]  Does \alg{}  improve the accuracy of learned models?
    \item[({\bf Q3})] Does \alg{} learn an {\em interpretable, explainable yet accurate} probabilistic model in high-stakes, clinical settings?
\end{enumerate}

% ({\bf Q1}) Are monotonicities useful in learning cutset networks from noisy and sparse data? ({\bf Q2}) Does \alg{}  improve the accuracy of learned models? ({\bf Q3}) Does \alg{} learn an {\em interpretable, explainable yet accurate} probabilistic model in high-stakes, clinical settings?
To answer these questions, we compared the networks learned using \alg{} with networks learned using LearnCNet.

We used two types of data sets for our experiments -- 15 standard data sets to study the properties of \alg{} and 4 data sets from {\em high-stakes medical domains} to understand the interpretability and explainability of the models.




\begin{table}[t!]
\centering
\begin{tabular}{lrrr}
\toprule
\textbf{Data set}   & \textbf{LearnCNet}     & \textbf{\alg{} (P)}           & \textbf{\alg{}}\\
\midrule
              cpu &    -509.67 &     -490.98 &    \textbf{-468.76} \\
        ljubljana &  -1,059.62 &   -1,053.26 &  \textbf{-1,026.08} \\
        cleveland &  -1,498.21 &   -1,486.55 &  \textbf{-1,475.27} \\
         haberman &    -670.85 &     -668.82 &    \textbf{-643.76} \\
         diabetes &  -2,121.14 &   -2,084.99 &  \textbf{-2,078.09} \\
             auto &  -1,239.30 &   -1,233.32 &  \textbf{-1,230.08} \\
            yeast &  -6,040.67 &   -5,927.33 &  \textbf{-5,864.03} \\
              car &  -7,518.63 &   -7,501.16 &  \textbf{-7,485.55} \\
          redwine &  -5,769.67 &   -5,722.72 &  \textbf{-5,659.97} \\
        whitewine & -15,054.33 &  -15,017.73 & \textbf{-14,985.78} \\
          abalone & -13,461.99 &  -13,340.42 & \textbf{-12,992.09} \\

\midrule
            sachs &  -1,025.38 &  \textbf{-1,015.24} &         -1,015.92 \\
             asia &    -411.62 &     -397.13 &    \textbf{-389.92} \\
       earthquake &    -124.65 &     -121.25 &    \textbf{-116.94} \\
           survey &    -451.02 &     -450.04 &    \textbf{-449.93} \\
\midrule
              ppd &    -717.13 &     -711.90 &    \textbf{-710.02} \\
             adni &    -907.67 &     -901.85 &    \textbf{-867.09} \\
             %rare &  -1,796.21 &   \textbf{-1,785.55} &  -1,806.04 \\
  numom2b-a & -14,102.51 &  -14,102.81 & \textbf{-14,068.81} \\
  numom2b-b& -10,535.51 &  -10,515.25 & \textbf{-10,448.37} \\
\bottomrule
\end{tabular}
\caption{Test loglikelihood scores for cutset networks fit on UCI data sets with 30\% noise (rows 1--11), data sampled from Bayesian Networks (rows 12--15), and data from medical domains (rows 16--19) using LearnCNet, \alg{} with only parameter learning (\alg{}(P)) and \alg{} with both structure and parameter learning. The scores are averaged over 10 bootstrap samples.}\label{tab:loglik}
\end{table}


\paragraph{Benchmark data sets:} We used two types of benchmark data sets -- UCI repository~\citep{Dua2019} and classic Bayes net (BN) data sets. For UCI data sets, we considered the Computer Hardware (cpu), Breast Cancer (ljubljana), Haberman's Survival (haberman), Auto MPG (auto), Car Evaluation (car), Yeast (yeast), Wine quality (redwine and whitewine), Abalone (abalone), Heart disease (cleveland), and Pima Indians Diabetes (diabetes) data sets. Wherever discretization ranges were not available, we categorized each non-boolean variable into $3$ categories and split each data set into a 50:50 train-test split. Of these, Haberman's Survival, Heart disease, and Pima Indians Diabetes data sets had monotonic influences available in the literature~\citep{AltendorfEtAl2005, KiGBKokel2020}. For all the other data sets, we used the Qualitative Knowledge Extraction (QuaKE) algorithm~\citep{QuakeKaranamHayes2021} to generate monotonic influences. Since the QuaKE algorithm can work with any probabilistic model, we used cutset networks to infer the monotonic constraints. %using a cutset network fit on the training data as the generative model. Table \ref{tab:data} shows the number of monotonic influences for each data set.

Our key hypothesis is that these domain constraints are more useful in data-scarce and noisy domains. While 50:50 train-test split takes care of sparsity, we induced noise in the training data by replacing 30\% of the data points for each positive monotonic influence ${X_j}_{\prec}^{M+}X_i$ with $X_i = R_i - \floor{X_j\frac{R_i}{R_j}}$
%\begin{align}
 %   X_i = R_i - \floor{X_j\frac{R_i}{R_j}}
%\end{align}
where $R_i$ and $R_j$ are the maximum values of $X_i$ and $X_j$, respectively. Similarly, for each negative monotonic influence ${X_j}_{\prec}^{M-}X_i$, we use $X_i = \floor{X_j\frac{R_i}{R_j}}$. 
 The noisy examples computed using the above formulas encode the reverse monotonic influences in $C$.

%\subsubsection{BN-based data sets}
Our second type of benchmark data sets come from the BN community. We used Earthquake~\citep{EarthquakeKorb2010}, Asia~\citep{AsiaLauritzen1988}, Survey~\citep{SurveyScutari2014} and Sachs~\citep{Sachs2005} BNs. We sampled 100 examples (for sparsity) for generating both training and testing data from the BNs. We added 30\% noise to the sampled training data using the same formula as above. We computed the monotonicities using the QuaKE algorithm. 
% Table \ref{tab:data} shows the number of monotonic influences for each BN.
%\vspace{-0.5em}
\paragraph{High-stakes medical domains:} To understand the advantage of cutset networks over other deeper models in terms of interpretability, we used data from 3 studies, namely, Post-Partum Depression Survey (PPD;~\citealp{PPDNatarajan2017}), Alzheimer's Disease Neuroimaging Initiative (ADNI), and Nulliparous Pregnancy Outcomes Study: Monitoring Mothers-to-Be (nuMoM2b;~\citealp{numom2b}). 

%\paragraph{High-stakes medical-domain: Prediction of Gestational Diabetes} To understand the advantage of cutset networks over other deeper models in issues of interpretability, we used data from the {\bf Nulliparous Pregnancy Outcomes Study: Monitoring Mothers-to-Be} (nuMoM2b, ~\cite{numom2b}) study. This study enrolled $10,038$ nulliparous women, i.e., women that had not had a previous pregnancy lasting more than 20 weeks of gestation, to study Adverse Pregnancy Outcomes. The participants were enrolled from $10$ centers in the United States and the cohort is ethnically and racially diverse. The data was collected from the participants $4$ times during the course of the pregnancy. Our goal in this work is to learn a probabilistic model that would account for the constraints provided by the domain experts while being interpretable.
%Since this was a medical domain, the model would need to be interpretable to medical doctors.  


While we selected subsets of variables based on prior work on the PPD and ADNI domains, we considered two sub-cohorts of the nuMoM2b data, focusing on risk factors for Gestational Diabetes~\citep{PRSPagel2022}. 
The first sub-cohort (nuMoM2b-a) had 7 variables, namely, Body Mass Index (BMI), exercise in Metabolic Equivalent of Task units (METs), Age at first visit (Age), family history of diabetes (Hist), Polycystic Ovary Syndrome (PCOS), high Blood Pressure (HiBP), and Gestational Diabetes Mellitus (GDM). After excluding participants that had missing data for any of these variables, we had data from 6,164 participants in this sub-cohort. We obtained the following set of qualitative influences from an Obstetrics and Gynecology expert, Dr.~David Haas:
\begin{align*}
\begin{aligned}
    \{&\text{BMI}_{\prec}^{M+}\text{GDM},\ \text{METs}_{\prec}^{M-}\text{GDM}, \   \text{Age}_{\prec}^{M+}\text{GDM},\\
    &\text{Hist}_{\prec}^{M+}\text{GDM}, \
    \text{PCOS}_{\prec}^{M+}\text{GDM},\ \text{HiBP}_{\prec}^{M+}\text{GDM} \}
\end{aligned}
\end{align*}
The second sub-cohort (nuMoM2b-b) had the Polygenic Risk Score (PRS) as an additional variable. Further, since PRS is applicable only to non-Hispanic white participants with European ancestry, we excluded all the other participants. As a result, we had data from 3,657 participants in this sub-cohort. We categorized the non-boolean variables, namely, BMI, METs, Age, and PRS into 4 categories each. Apart from the influences listed above, we used the additional influence that $\text{PRS}_{\prec}^{M+}\text{GDM}$.

\begin{table}[t]
\centering
\begin{tabular}{lrrrrr}
\toprule
\textbf{Data set}   & \textbf{LearnCNet}     & \textbf{\alg{} (P)}           & \textbf{\alg{}}\\
\midrule
          sachs &     0.0708 &      0.0685 &  \textbf{0.0663} \\
           asia &     0.1923 &      0.1766 & \textbf{0.1696} \\
     earthquake &     0.1391 &      0.1296 & \textbf{0.1221} \\
         survey &     0.0181 &      0.0185 &  \textbf{0.0165} \\
\midrule
      cleveland &     0.2746 &      0.2655 &  \textbf{0.2477} \\
       haberman &     0.2121 &      0.2065 &  \textbf{0.1953} \\
       diabetes &     0.2298 &      0.2164 &  \textbf{0.2114} \\
\midrule
            ppd &     0.2043 &      0.1974 &  \textbf{0.1963} \\
           adni &     0.1825 &      0.1713 &  \textbf{0.1636} \\
%           rare &     0.2517 &      \textbf{0.2414} &  0.2600 \\
numom2b-a &     0.0397 &      0.0390 &  \textbf{0.0383} \\
numom2b-b &     0.0515 &      0.0490 &  \textbf{0.0445} \\
\bottomrule
\end{tabular}
\caption{Mean squared error (MSE) for conditional probability queries for cutset networks fit on data sampled from BNs, on UCI data sets with prior knowledge and on data from clinical studies using LearnCNet, \alg{} with only parameter learning (\alg{}(P)) and \alg{} with both structure and parameter learning. The MSE values are averaged over 10 bootstrap samples.}\label{tab:queries}
\end{table}

% \vspace{-0.5em}
\paragraph{Methods}
We compared the following versions of \alg{}\footnote{Code and supplementary material are available at \texttt{github.com/saurabhmathur96/KIL-CN}.}:
\begin{enumerate}[leftmargin=2.5em]
    \item[(1)] Parameter learning using knowledge (\alg{}(P)).
    \item[(2)] Parameter and structure learning using knowledge.
\end{enumerate}

%(1) Parameter learning using knowledge (denoted \alg{} (P)),  (2) Parameter and structure learning using knowledge. 
For version (1), the structure is pre-learned using LearnCNet and only the leaf parameters are updated. On the other hand, version (2) involves learning both the structure and the parameters of the cutset network. For both versions, we set the number of tries $t_\text{max}$ to $10$ and the margin parameter $\epsilon$ to $0.001$.
% \vspace{-0.5em}
\paragraph{Metrics}
We used two metrics in our evaluation: the log-likelihood of the cutset network on the test data (test log-likelihood)
and the Mean Squared Error (MSE) for conditional probability queries.
% We used conditional probability queries for BN-based data sets and the clinical data set. For each BN-based data set, use $P(X_i = x_i \mid X_j = x_j)$ for each positive monotonic influence ${X_j}_{\prec}^{M+}X_i$ or negative monotonic influence ${X_j}_{\prec}^{M-}X_i$. We compared these queries against the ground truth computed from the BNs. For the clinical data sets, we used the conditional probability query over the target variable given the values of all other variables, $P(GDM \mid X \setminus GDM)$. We evaluated these values against the ground truth values of $GDM$ from the test data.




\begin{table}[!t]
\centering
\begin{tabular}{lrrrr}
\toprule
& \multicolumn{2}{c}{\textbf{Edge count}} & \multicolumn{2}{c}{\textbf{Parameter count}} \\ \cmidrule(lr){2-3} \cmidrule(lr){4-5}  
                                  \textbf{Data set} & \textbf{LearnCNet}    & \textbf{\alg{}}    & \textbf{LearnCNet}     & \textbf{\alg{}}     \\ \midrule

ppd                                & \textbf{113.8}                & 114.1          & 205.7                 & \textbf{198.8}             \\
adni                               & 121.9                & \textbf{57.8}           & 343.3                 & \textbf{246.4}   \\
% rare                               & 401.8                & \textbf{355.4}          & 957.3                 & \textbf{817.2}              \\
numom2b-a                          & 179.4                & \textbf{108.6}          & 422.2                 & \textbf{366.3}  \\
numom2b-b                          & 416.5                & \textbf{220.9}          & 1,069.9               & \textbf{905.7}   \\ \bottomrule
\end{tabular}
\caption{The number of edges and the number of free parameters for cutset networks fit using LearnCNet and \alg{} on medical data sets,  averaged over 10 bootstrap samples.}\label{tab:structure}
\end{table}

\eat{
\begin{table*}[!t]
\centering
\begin{tabular}{lrrrrrr}
\toprule
& \multicolumn{2}{c}{\textbf{Edge count}} & \multicolumn{2}{c}{\textbf{Parameter count}} &  \multicolumn{2}{c}{\textbf{MSE on queries}}  \\ \cmidrule(lr){2-3} \cmidrule(lr){4-5}  \cmidrule(lr){6-7}    
                                  \textbf{Data set} & \textbf{LearnCNet}    & \textbf{\alg{}}    & \textbf{LearnCNet}     & \textbf{\alg{}} & \textbf{LearnCNet}     & \textbf{\alg{}}     \\ \midrule

ppd                                & \textbf{113.8}                & 114.1          & 205.7                 & \textbf{198.8}        & 0.2043 & \textbf{0.1963}      \\
adni                               & 121.9                & \textbf{57.8}           & 343.3                 & \textbf{246.4}           & 0.1825 & \textbf{0.1636}   \\
% rare                               & 401.8                & \textbf{355.4}          & 957.3                 & \textbf{817.2}              \\
numom2b-a                          & 179.4                & \textbf{108.6}          & 422.2                 & \textbf{366.3}          & 0.0397 & \textbf{0.0383}    \\
numom2b-b                          & 416.5                & \textbf{220.9}          & 1,069.9               & \textbf{905.7}          & 0.0515 & \textbf{0.0445}    \\ \bottomrule
\end{tabular}
\caption{The number of edges and the number of free parameters for cutset networks fit using LearnCNet and \alg{} on medical data sets,  averaged over 10 bootstrap samples.}\label{tab:structure}
\end{table*}
% \vspace{-1em}
}

\paragraph{Results}
\begin{itemize}[leftmargin=2.5em]%[topsep=0pt,itemsep=0ex,partopsep=1ex,parsep=1pt,leftmargin=1mm]
\item[({\bf Q1})] Table~\ref{tab:loglik} presents the log-likelihood on the test set for standard data sets (rows 1--15). The training data for each of these domains had $30\%$ noise.  Overall, using domain knowledge improves generative performance, and using knowledge for structure learning results in better performance than using knowledge only for parameter learning. This allows us to answer Q1 affirmatively.
\item[({\bf Q2})] Table~\ref{tab:queries} presents the mean squared error for conditional probability queries for the BN data sets (rows 1--4) and the UCI data sets with prior knowledge (rows 5--7). For the BN data sets, we compared queries of the form $P(X_i = x_i \mid X_j = x_j)$ for each positive monotonic influence ${X_j}_{\prec}^{M+}X_i$ or negative monotonic influence ${X_j}_{\prec}^{M-}X_i$ to the ground truth probabilities from the BN. For the UCI data sets, we used the conditional probability of the target given all the risk factors ($P(X_\text{target} \mid X \setminus X_\text{target})$) and compared it to the values of the target in the test data set. The use of monotonic influences results in a lower mean squared error and hence more accurate answers to the queries. Further, as with the log-likelihood scores, knowledge-based structure learning methods perform better than those using only parameter learning. Thus, we can answer Q2 affirmatively.% as well.
\item[({\bf Q3})] The last four rows of Table~\ref{tab:loglik} show the test log-likelihood for the PPD, ADNI, and the two nuMoM2b data sets. For all the data sets, \alg{} improves the test log-likelihood. The last four rows of Table~\ref{tab:queries} show the mean squared error for conditional probability queries. As with the UCI data sets with prior knowledge, we compared the probability $P(X_\text{target} \mid X \setminus X_\text{target})$ to the values of the target variable in the test set. The models learned using \alg{} give more accurate answers for the conditional probability query. Finally, Table~\ref{tab:structure} compares the edge count and free parameter count for the structures learned using LearnCNet and \alg{}. The structures learned using \alg{} are more concise than the ones learned using LearnCNet. Thus, Q3 is answered affirmatively.
\end{itemize}

Finally, recall that, as shown in Figure~\ref{fig:gdm_cnet1}, the learned model is not only interpretable\footnote{See supplementary material for additional results.} but also follows published research~\citep{PRSPagel2022}. It should be mentioned that while \alg{} uses the monotonic constraints on both PRS and BMI, it correctly infers that PRS is more important than BMI. Moreover, for the low values of PRS, BMI is chosen, indicating that while the person might have a low propensity risk of gestational diabetes, BMI can have a significant impact. Similarly, for high-risk scores, the history of gestational diabetes becomes more important than BMI. These findings not only reflect and validate current medical knowledge but also enhance it by identifying specific combinations that can allow for corresponding treatment plans.

\textbf{Discussion:} One of the limitations of \alg{} is that the knowledge in the form of monotonic influences must be valid regardless of the context. That is, if an influence ${X_i}_\prec^{M+} X_j$ is given, we assume that there does not exist any context $\{X_Q = x_Q\}$, where $X_Q \subseteq (X\setminus \{X_i,X_j\})$ and $x_Q \in \text{domain}(X_Q)$, such that ${X_i}_\prec^{M+} X_j \mid X_Q = x_Q$ is false. To account for this limitation, we used influences that were either independent of other variables or had a positive synergistic effect with them.
%Figure \ref{fig:gdm_cnet1} shows a cutset network that was fit on a sample from the nuMoM2b-a data. The first split is on Age. The next split on 87\% of the data is on BMI. This shows that Age and BMI are the most important variables in the joint distribution which is inline with the clinical knowledege~\cite{AgeGDMLi2020,BMIGDMNajafi2019} demonstrating that strong inductive bias can yield interpretable and accurate models. Age and BMI are also highly correlated with Race in the nuMoM2b dataset\footnote{Citation withheld for blind review}.
\section{Conclusion}

We considered the problem of incorporating rich domain knowledge in the form of qualitative constraints when learning cutset networks, a class of \emph{interpretable, tractable} probabilistic models. We developed \alg{} to leverage qualitative constraints to learn the structure and parameters of cutset networks. Our experiments on benchmark data sets and medical data sets demonstrated the efficacy of the proposed approach. Extending this work to deeper tractable models is an interesting future direction. Incorporating different types of domain knowledge including synergistic information, preferences over conditional distributions, privileged information, and imbalance tradeoffs is another direction. Finally, generating global explanations using the structure of these networks, and instance-level explanations constructed from the differences in the reasoning paths of the different examples can allow clinicians to develop treatment plans that mitigate adverse pregnancy outcomes.% and is an exciting direction for future research.

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
SN and SM acknowledge the support by the NIH grant R01HD101246 and ARO award W911NF2010224. VG was supported in part by the DARPA Perceptually enabled Task Guidance (PTG) Program under contract
number HR00112220005 and by the National Science Foundation CAREER award IIS-1652835.
\end{acknowledgements}

% References
\bibliography{mathur_549}
\end{document}
