\documentclass[accepted]{uai2024} 
\usepackage[american]{babel}
\usepackage{natbib,mathtools,amsfonts,amssymb,bm,dsfont,booktabs,tikz,algorithm,algorithmic}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\newcommand{\eat}[1]{}
\newcommand{\X}{\bm{X}}
\newcommand{\x}{\bm{x}}
\newcommand{\sm}[1]{\textcolor{blue}{[#1 \textsc{--SM}]}}
\title{Knowledge Intensive Learning of Credal Networks}
\author[1]{\href{mailto:<saurabhsanjay.mathur@utdallas.edu>?Subject=Your UAI 2024 paper}{Saurabh Mathur}{}}
\author[2]{Alessandro Antonucci}
\author[1]{Sriraam Natarajan}
\affil[1]{%
    Erik Jonsson School of Engineering \& Computer Science, University of Texas at Dallas, Richardson, Texas, USA
}
\affil[2]{%
    Istituto Dalle Molle di Studi sull’Intelligenza Artificiale (IDSIA) - Lugano, Switzerland
}
\begin{document}
\maketitle
\begin{abstract}
Bayesian networks are a popular class of directed probabilistic graphical models that allow for closed-form learning of the local parameters if complete data are available. However, learning the parameters is challenging when the data are sparse, incomplete, and uncertain. In this work, we present an approach to this problem based on \emph{credal networks}, a generalization of Bayesian networks based on set-valued local parameters. We derive an algorithm to learn such set-valued parameters from data using qualitative knowledge in the form of monotonic influence statements. Our empirical evaluation shows that using qualitative knowledge reduces uncertainty about the parameters without significant loss in accuracy.
\end{abstract}

\section{Introduction}
Bayesian networks (BNs) are a powerful tool for representing and reasoning under uncertainty. They have been successfully applied in a wide variety of domains~\citep{daly2011learning} including healthcare~\citep{lucas2004bayesian}, weather forecasting~\citep{abramson1996hailfinder}, software engineering~\citep{pendharkar2005probabilistic} and risk management~\citep{fan2004bbn}. However, learning the parameters of a BN requires complete and accurate data. In many real-world applications, such data may not be fully available.
 
To overcome the limitations of noisy and sparse data, domain knowledge might be used to learn BNs. Domain knowledge can concisely determine the direction and strength of relationships between variables~\citep{niculescu2006bayesian} and trends in these relationships~\citep{Wellman1990}. Incorporating domain knowledge has been studied more broadly in machine learning. Knowledge in the form of precision-recall trade-off~\citep{Yang2014}, label preferences~\citep{OdomKPLL2015}, privileged information and qualitative influence statements~\citep{AltendorfEtAl2005,YangEtAl2013,mathur2023noisyor,Mathur2023KICN} have been successfully used to learn more accurate and robust models. While these methods overcome the limitations of noisy and sparse data, they still cannot deal with incomplete and uncertain data.

Credal networks (CNs) address such a limitation by extending BNs to explicitly represent incompleteness and uncertainty about probability distributions~\citep{maua2020thirty}. They provide a more cautious approach to the specification of probabilistic models. This makes CNs especially useful for noisy, sparse, and incomplete data domains. However, inducing them purely from data can make the model too ``imprecise'' and result in vacuous inferences. 

Inspired by knowledge-guided learning of probabilistic models, we present a solution to the problem of learning accurate yet robust models in the presence of noisy, sparse, and possibly incomplete data by embedding domain knowledge in CNs. Specifically, we consider a subclass of qualitative influence statements called \emph{monotonic influence statements} to make CNs more precise. The main contributions of this paper are the development of a learning method for CNs that effectively exploits monotonic influence relationships in the domain as knowledge and the preliminary empirical evaluation of the corresponding learning algorithm. 

Specifically, we make the following key contributions: (i) we propose the {first method} for learning CNs from data and domain knowledge; (ii) we consider a specific type of knowledge -- monotonic influences as qualitative constraints -- to learn set-valued parameters; (iii) we demonstrate the effectiveness and efficacy of the learning algorithm on a combination of data sets sampled from BNs, benchmark data sets, three real healthcare data sets, and a high-impact real-world problem of mitigating adverse pregnancy outcomes.


The rest of this paper is organized as follows: after providing background about CNs and qualitative influences, we present our method for learning CNs from data using domain knowledge. We then present our empirical evaluation and conclude with a discussion of central outlooks.
% we first provide background about CNs and qualitative influence. Then, we detail our method for learning CNs from data using domain knowledge and report our empirical evaluation. We conclude with a summary and discussion of central outlooks.

\section{Background Concepts}
\paragraph{Bayesian and credal networks.} \emph{Bayesian networks} (BNs, \citeauthor{pgm} \citeyear{pgm}) are probabilistic graphical models that compactly represent joint \emph{probability mass functions} (PMFs). Formally, a BN over a set of variables $\X = \{ X_1, \dots, X_n \}$ is a pair $\langle \mathcal{G}, \theta \rangle.$ Here, $\mathcal{G}$ is a directed acyclic graph such that each node corresponds to a random variable in $\X$ and $\theta$ is a set of conditional PMFs specified for each variable, given all the possible values of its parents $\mathrm{Pa}_X \subset \X$ according to $\mathcal{G}$. Graph $\mathcal{G}$ represents conditional independence relations according to the Markov condition. As a result, the joint PMF induced by the BN can be expressed as the following factorization:
\begin{equation}\label{eq:joint}
P(\x)=\prod_{X\in\X} P(x|\mathrm{pa}_X)\,,
\end{equation}
for each state $\x\in\mathrm{Dom}(\X)$, where $\mathrm{pa}_X \in\mathrm{Dom}(\mathrm{Pa}_X)$ and $x\in\mathrm{Dom}(X)$ are the states consistent with $\x$. 

\emph{Credal networks} (CNs, \citeauthor{maua2020thirty} \citeyear{maua2020thirty}) are a generalization of BNs that allows us to define sets of joint PMFs. A set of PMFs over $X$ is called a \emph{credal set} (CS) and denoted as $K(X).$ CSs~\citep{levi1980enterprise,augustin2014introduction} allow us to explicitly represent incompleteness in uncertain specifications (e.g., a \emph{vacuous} CS including all the possible PMFs over $X$,  thus expressing a condition of complete ignorance). In this work, we consider closed and convex CSs that are also finitely generated, i.e., specified by a finite number of linear constraints on the PMFs $P(X)$ belonging to $K(X)$. This allows us to equivalently describe each CS by listing its extreme points, whose number is also finite.

In practice, the specification of a CN is the same as that of a BN except that each (conditional) PMF is replaced by a CS. The Markov condition can also be applied to CNs, provided that a suitable notion of independence is considered. Here we focus on the notion of \emph{strong} independence, i.e., $X$ and $X'$ are independent according to CS $K(X,X')$ if they are independent in the stochastic sense for each PMF in the extreme points of the CS. This allows us to define a joint CS $K(\X)$ as the convex closure of the set of all joint PMFs as in Eq.~\eqref{eq:joint} such that the conditional PMFs are taken from the conditional CSs in the CN specification or, equivalently, from their vertices \citep{antonucci2008b}. Inferences in CNs are consequently intended as the computation of the lower and upper bounds of a BN query w.r.t. such a joint CS. In spite of the hardness of the general inference \citep{maua2014b}, exact \citep{cabanas2021a} and approximate \citep{antonucci2015approximate} schemes to query possibly large CNs are available.

\paragraph{Decision-making in CSs.} Recall that decision-making in PMFs involves finding the state (decision) that minimizes a given loss function. With 0-1 losses, this corresponds to taking as optimal state $x^*:=\arg\max_{x\in\mathrm{Dom}(X)} P(x)$. Decision-making in CSs can be done using \emph{interval dominance}~\citep{NCCZaffalon2002,troffaes2007decision}. State $x\in\mathrm{Dom}(X)$ is said to interval-dominate another state $x'\in\mathrm{Dom}(X)$ according to the CS $K(X)$ if and only if:
\begin{equation}
\min_{P(X) \in K(X)} P(x)>\max_{P(X) \in K(X)} P(x')\,,
\end{equation}
where the two optimizations can be computed w.r.t. the linear constraints in the CS specification, or, equivalently, by only considering the extreme points. If a single state interval-dominates all other states, then that state can be selected as optimal for the decision. However, we might have more than one undominated state. In such cases, we can abstain from making a further decision and regard all the undominated states as optimal.

\paragraph{Learning CSs.} The \emph{imprecise Dirichlet model} (IDM,~\citeauthor{walley1996inferences} \citeyear{walley1996inferences}) is the most popular approach for learning CSs from categorical data. This is a generalization of a Bayesian approach combining a multinomial likelihood with a Dirichlet prior distribution. Instead of a single Dirichlet prior, the IDM posits a set of priors, called the imprecise Dirichlet prior, including all the Dirichlet prior distributions of given \emph{equivalent sample size} (ESS). Specifically, when learning from a data set $\mathcal{D}$ of observations of the random variable $X$, the set of Dirichlet priors is parameterized as $\mathrm{Dir}(st_X).$ Here, $s \in \mathbb{R}^+$ is the ESS and $t_X:=\{t_x\}_{x\in\mathrm{Dom}(X)}$ with $t_x \in [0,1]$ and $\sum_x t_x=1$. The probability induced by the IDM is therefore:
\begin{equation}
P(x)=\frac{N_x + st_x}{N+s}\,,
\end{equation}
where $N_x$ is the number of times $X=x$ occurs in the data and $N$ is the total number of observations in $\mathcal{D}$. The bounds w.r.t. the imprecise Dirichlet prior are therefore:
\begin{eqnarray}\label{eq:idm1}
\underline{P}(x)\!\!\!\!\!&:=&\!\!\!\!\!\!\!\!\!\!\!\min_{P(X)\in K(X)}\!\!\!\!P(x)=\!\!\!\underset{t_x\in[0,1]}{\min}\frac{N_x+st_x}{N+s}\!=\!\frac{N_x}{N+s}\,,\\
\overline{P}(x)\!\!\!\!\!&:=&\!\!\!\!\!\!\!\!\!\!\!\max_{P(X)\in K(X)}\!\!\!\!P(x)=\!\!\!\underset{t_x\in[0,1]}{\max}\frac{N_x+st_x}{N+s}\!=\!\frac{N_x+s}{N+s}\,,
\label{eq:idm2}
\end{eqnarray}
for each $x\in\mathrm{Dom}(X)$. Those bounds induce linear constraints on a PMF $P(X)$, thus defining a CS $K(X)$. Note that, for data sets whose cardinality is small w.r.t. the ESS $s$, these bounds can be quite broad. In the rest of the paper, we discuss a procedure based on domain knowledge to shrink these bounds.

\paragraph{Domain knowledge as qualitative influence statements.}
Qualitative influence statements (QISs, \citeauthor{Wellman1990} \citeyear{Wellman1990}) describe the influence of one or more variables over another variable. They allow domain experts to concisely express a trend in the distribution without needing to specify precise values. Here we focus on learning CNs using a class of QISs called \emph{monotonic influence statements} (MISs, \citeauthor{AltendorfEtAl2005} \citeyear{AltendorfEtAl2005}). 
MISs refer to ordinal, and hence also Boolean as a special case, variables. Given a variable $Y$ and a joint variable $\X$ in a probabilistic model, we say that $Y$ is \emph{positively monotonically influenced} by parent $X\in\X$ if higher values of $X$ stochastically result in higher values of $Y$, \textit{ceteris paribus} (i.e., with the values of all other parents held constant). Such an influence is denoted as $X_\prec^{M+} Y$ and corresponds to domain knowledge of the form ``as $X$ increases, $Y$ also increases''. We express such a MIS as the inequality:
\begin{equation}\label{eq:mis}
P(Y\leq y|x,\tilde{\bm{x}}) \geq P(Y\leq y|x',\tilde{\bm{x}})
\end{equation}
for each $x,x'\in\mathrm{Dom}(X)$ such that $x\leq x'$,  $y\in\mathrm{Dom}(Y)$, and $\tilde{\bm{x}}\in\mathrm{Dom}(\tilde{\bm{X}})$,  where $\tilde{\bm{X}}:=\bm{X}\setminus\{X\}$. Negative influence can be defined analogously and denoted as $X_\prec^{M-} Y$. 

\paragraph{Related work.} 
QISs have been used to induce more accurate precise probabilistic models from noisy and sparse data for both discriminative \citep{KiGBKokel2020, OdomKPLL2015} and generative learning settings \citep{VanDerGaag2004,AltendorfEtAl2005, Campos2008QIBN, YangEtAl2013,Plajner2020,mathur2023noisyor, Mathur2023KICN}. In this work, we deal with learning imprecise generative models from sparse, incomplete, and uncertain data. QISs have been previously used to make generative models more precise. \citet{RenooijG02} introduce influence-intervals and perform interval-propagation on qualitative probabilistic networks to shrink the intervals. In contrast, our method maintains probabilistic semantics by dealing with (closed and convex) CSs. QISs have also been used to learn conditional CSs. \citet{CamposC05} use qualitative influences as constraints on the imprecise Dirichlet prior distributions. However, in the presence of prior-data conflicts~\citep{evans2006checking}, this approach does not guarantee consistency with the qualitative knowledge. 

Our approach of directly constraining a CS provides a more flexible solution to this problem. This also makes it independent of the way that the CS is initially computed.

\section{Knowledge-intensive Learning}
We aim to improve the performance of CN models by incorporating qualitative domain knowledge into the learning process. The key idea is that qualitative knowledge can serve as a strong inductive bias. While one could envision sampling data from QISs/MISs (as a single piece of knowledge could generalize several data points in one fell swoop), we take a different approach of using the knowledge to define constraints on the learning model. 

From a Bayesian perspective, qualitative knowledge might guide the specification of the prior distribution (e.g., a comparative judgment inducing an analogous constraint on the corresponding parameters of a Dirichlet distribution). Yet, in the presence of prior-data conflicts~\citep{evans2006checking}, the Bayesian approach does not guarantee consistency with the qualitative knowledge. CS-based approaches are known to provide a more flexible solution to this problem~\citep{walter2009imprecision}. The problem of integrating qualitative knowledge (and in particular MISs) in the statistical learning of a credal model corresponds to the following learning task:

\fbox{\centering\begin{minipage}{.95\linewidth}
    \textbf{Given:} Data set ${\mathcal{D}:=\{y^{(i)},\x^{(i)}\}^{N}_{i=1}}$ over variables $(Y,\X)$ and a collection $C$ of MISs as in Eq.~\eqref{eq:mis}.\\
    \textbf{To Do:} Learn a collection of conditional CSs over $Y$,  say $\{K(Y|\x)\}_{\x \in\mathrm{Dom}(\X)}$,  that are compatible with $C$.
    \end{minipage}
}\vspace{0.5em}

\textbf{A healthcare example.} The above learning setting is crucial in several domains, including healthcare, that require cautious models to be learned from limited and noisy data sets. As an example, consider a simplistic problem of modeling \emph{Gestational Diabetes Mellitus} ($G$) based on two risk factors -- when the age at the start of pregnancy is greater than 35 ($A$) and when the Body Mass Index at the start of pregnancy is greater than 25 ($B$). Data-driven methods like the IDM can be used to learn the \emph{conditional} CSs (CCSs) for $G$ given $A$ and $B$. However, small and noisy datasets can induce wide bounds, making predictions uninformative.

Our intuition, to be empirically tested, is that domain knowledge, specifically in the form of qualitative constraints, could significantly shorten the (credal) bounds, thus leading to an actionable outcome. In our example, we might know that both age at the start of pregnancy and the Body Mass Index positively monotonically influence the risk of Gestational Diabetes Mellitus. This knowledge can then be used to filter out the PMFs that violate this rule to obtain narrower and more informative bounds.

Table~\ref{tab:example} illustrates this approach. The columns on the left show the CCSs learned purely from data, while those on the right present the CCSs obtained after filtering PMFs that are not compatible with the domain knowledge. While none of the intervals for $G$ dominate for any combination of $A$ and $B$ for the CCSs on the left, the CCSs on the right have a configuration ($\{A=1, B=1\}$) where the interval corresponding to $G=1$ interval-dominates the one corresponding to $G=0$.

\begin{table*}[htp!]
\centering
\begin{tabular}{cc|cc}
\toprule
$A$&$B$&$P(G=0\mid A,B)$&$P(G=1 \mid A,B)$\\
\midrule
$0$&$0$&$[0.4,0.7]$&$[0.3,0.6]$\\
$0$&$1$&$[0.3,0.7]$&$[0.3,0.7]$\\
$1$&$0$&$[0.0,1.0]$&$[0.0,1.0]$\\
$1$&$1$&$[0.0,0.7]$&$[0.3,1.0]$\\
\bottomrule
\end{tabular}
\quad
\begin{tabular}{cc|cc}
\toprule
$A$&$B$&$P(G=0\mid A,B)$&$P(G=1 \mid A,B)$\\
\midrule
$0$&$0$&$[0.5,0.7]$&$[0.3,0.5]$\\
$0$&$1$&$[0.3,0.5]$&$[0.5,0.7]$\\
$1$&$0$&$[0.4,0.5]$&$[0.5,0.6]$\\
$1$&$1$&$[0.0,0.2]$&$[0.8,1.0]$\\
\bottomrule
\end{tabular}
\caption{CCSs for the presence of Gestational Diabetes ($G$) given two risk factors -- age at pregnancy greater than 35 ($A$) and BMI greater than 25 ($B$) -- estimated from a small sample of data using the IDM (left) and the CCSs obtained by eliminating the PMFs that were incompatible with the knowledge that both risk factors positively monotonically influence the risk of $G$ (${A}_\prec^{M+} G$ and ${B}_\prec^{M+} G$ respectively) (right).}
\label{tab:example}
\end{table*}

\subsection{Our Approach}
\eat{
We approach the above problem by obtaining an initial set of CCSs from the data set $\mathcal{D}$ through the standard IDM learning and then deriving a procedure to shrink the IDM bounds by eliminating PMFs that violate the qualitative influence constraints (MISs) in $C$. We achieve that by jointly solving for all the maximum values, 
\begin{equation}
\overline{P}(y|\bm{x}) \forall y \in \mathrm{Dom}(Y), \bm{x} \in \mathrm{Dom}(\X)
\end{equation}
that satisfy the monotonicity constraints. 
}
We approach the above problem by obtaining an initial set of CCSs from the data set $\mathcal{D}$ (e.g., by standard IDM learning) and then shrinking the bounds by eliminating the PMFs that violate the qualitative influence constraints (MISs) in $C.$ 


Let the initial set of CCSs be specified by the lower and upper bounds $\underline{P}_0(y|\bm{x})$ and $\overline{P}_0(y|\bm{x}), \forall \bm{x}\in\mathrm{Dom}(\X), y\in\mathrm{Dom}(Y).$ Without loss of generality, we obtain new upper bounds by finding the largest values
\begin{equation}
\overline{P}(y|\bm{x}) \quad \forall y \in \mathrm{Dom}(Y),\ \bm{x} \in \mathrm{Dom}(\X)
\end{equation}
that satisfy the monotonicity constraints. This is equivalent to the following constrained optimization problem:

\begin{equation}\label{eq:opt}
\underset{\substack{\underline{P}_0(y|\bm{x}) \leq q_{y|\bm{x}} \leq \overline{P}_0(y|\bm{x})\\q_{y|\bm{x}} \models C\\ \bm{x}\in\mathrm{Dom}(\X)\\ y \in \mathrm{Dom}(Y)}} {\arg\max} 
\mathcal{L}(\bm{q})\,,
%\sum_{\substack{\bm{x} \in \mathrm{Dom}(\X)\\y \in \mathrm{Dom}(Y)}} q_{y|\bm{x}}\,,
\end{equation}
where $\bm{q}:=\{ q_{y|\bm{x}} \}^{y\in\mathrm{Dom}(Y)}_{\bm{x} \in \mathrm{Dom}(\X)}$ is the set of all optimization variables, the objective function $\mathcal{L}(\bm{q})$ is defined as
\begin{equation}\label{eq:obj}
\mathcal{L}(\bm{q}):=
\sum_{\substack{\bm{x} \in \mathrm{Dom}(\X)\\y \in \mathrm{Dom}(Y)}} q_{y|\bm{x}}\,,
\end{equation}
and $q_{y|\bm{x}}\models C$ denotes that the optimization variables entail the MIS constraints in $C$ as stated by Eq.~\eqref{eq:mis}.
% where $\underline{P}_0(y|\bm{x})$ and $\overline{P}_0(y|\bm{x})$ are the IDM constraints as in Eqs. \eqref{eq:idm1} and \eqref{eq:idm2}, 
%while $q_{y|\bm{x}}\models C$ denotes that the optimization variables entail the MIS constraints in $C$ as stated by Eq.~\eqref{eq:mis}.
Note that $C$ imposes constraints across the CCSs corresponding to different configurations of the parents. So, if $C = \emptyset$, then Eq.~\eqref{eq:opt} becomes equivalent to performing separate optimizations for each $q_{y|\bm{x}}$, which recovers the initial CCSs.
%Let us denote the set of all the optimization variables in the above optimization task as $\bm{q}$. 
An analogous optimization can be considered for the lower bounds. 

However, such linear programs are not guaranteed to have feasible solutions because some constraints might be unsatisfiable under the initial bound constraints. If this is the case, we address the optimization using the \emph{barrier penalty} method~\citep{Luenberger2016}. Specifically, we encode each MIS constraint $c\in C$ of the form ${X}_\prec^{M+} Y$ as $\delta_c(\bm{q},\epsilon)\leq 0$ where:
\begin{equation}
\delta_c(\bm{q},\epsilon)=\sum_{y'\leq y} q_{y'|x',\tilde{\bm{x}}} - \sum_{y''\leq y} q_{y''|x,\tilde{\bm{x}}} + \epsilon\,,
\end{equation}
and we introduce a penalty,  $\max \{0, \delta_c(\bm{q},\epsilon)\}^2$.
Now, instead of Eq.~\eqref{eq:opt}, we solve a sequence of optimization problems of the form:
\begin{align}\label{eq:constropt}
\underset{\substack{\underline{P}_0(y|\bm{x}) \leq q_{y|\bm{x}} \leq \overline{P}_0(y|\bm{x})\\\bm{x}\in\mathrm{Dom}(\X)\\y \in \mathrm{Dom}(Y)}}{\arg \max}
\left[\mathcal{L}(\bm{q}) - \lambda \underbrace{\sum_{c\in C} \max \{0, \delta_c(\bm{q},\epsilon) \}^2}_{\text{Penalty}}\right]
\,,
\end{align}
for $\lambda = 10^0, 10^1,10^2,\dots,10^L$ until the penalty term vanishes, where $\mathcal{L}(\bm{q})$ is the objective function in Eq.~\eqref{eq:obj}. If a feasible solution exists, then this method is guaranteed to converge to a solution in the limit~\citep{Luenberger2016}. We analogously proceed for the minimization task.

\subsection{Gradients}
As outlined in Eq.~\eqref{eq:constropt}, solving a series of optimization problems forms the core of our method. Each such optimization problem can be solved using a standard gradient ascent procedure that supports parameter bounds of the form 
\begin{equation}
\begin{split}
    \underline{P}_0(y|\bm{x}) \leq q_{y|\bm{x}} \leq \overline{P}_0(y|\bm{x})\ \forall&\bm{x} \in\mathrm{Dom}(\X),\\&y \in \mathrm{Dom}(Y)\,.
\end{split}
\end{equation}
We now present the details of the gradients of the objective function with respect to each element  $q_{y_i\mid\bm{x_j}}$ of the parameter vector $\bm{q}.$ The gradient of the objective function in Eq.~\eqref{eq:constropt} with respect to each $q_{y_i\mid\bm{x_j}}$ of $\bm{q}$ is
\begin{equation}
    \begin{split}
    &\frac{\partial}{\partial q_{y_i\mid\bm{x_j}}} [\mathcal{L}(\bm{q}) - \lambda \sum_{c\in C} \max \{0, \delta_c(\bm{q},\epsilon) \}^2]\\
    &= \frac{\partial \mathcal{L}(\bm{q})}{\partial q_{y_i\mid\bm{x_j}}}  - \lambda \sum_{c\in C} 2\max \{0, \delta_c(\bm{q},\epsilon) \} \frac{\partial \max \{0, \delta_c(\bm{q},\epsilon) \}}{\partial q_{y_i\mid\bm{x_j}}}\\
    &=1-\lambda \sum_{c\in C} 2\max \{0, \delta_c(\bm{q},\epsilon) \} \mathds{1}_{\delta_c(\bm{q},\epsilon)>0}\frac{\partial \delta_c(\bm{q},\epsilon)}{\partial q_{y_i\mid\bm{x_j}}}\,. 
    \end{split}
\end{equation}

Here, the gradient of the $\delta_c(\bm{q},\epsilon)$ term with respect to each $q_{y_i\mid\bm{x_j}}$ of $\bm{q}$ is
\begin{align}
    \begin{aligned}
        \frac{\partial \delta_c(\bm{q},\epsilon)}{\partial q_{y_i\mid\bm{x_j}}} 
        &= \frac{\partial}{\partial q_{y_i\mid\bm{x_j}}} \left( \sum_{y'\leq y} q_{y'|x',\tilde{\bm{x}}} - \sum_{y''\leq y} q_{y''|x,\tilde{\bm{x}}} + \epsilon \right)\\
        &=  \sum_{y'\leq y} \frac{\partial q_{y'|x',\tilde{\bm{x}}}}{\partial q_{y_i\mid\bm{x_j}}}  - \sum_{y''\leq y}  \frac{\partial q_{y''|x,\tilde{\bm{x}}}}{\partial q_{y_i\mid\bm{x_j}}}\,.
    \end{aligned}
\end{align}
Once these gradients are obtained, we can solve the optimization problem in Eq.~\eqref{eq:constropt} for a given value of $\lambda$ using a gradient ascent procedure with parameter bounds.

\subsection{Algorithm}
\begin{algorithm}[!t]
\caption{ConstrOpt}
\label{alg:opt}
\textbf{Input}: \begin{itemize}
    \item[] $\sigma$ (+1 if maximize and -1 if minimize)
    \item[] $\{\underline{P}(y|\x),\overline{P}(y|\x)\}_{y\in\mathrm{Dom}(Y),\x\in\mathrm{Dom}(\X)}$ (CS bounds)
    \item[] $C$ (MISs)
    \item[] $t_\text{max}$ (maximum number of iterations)
\end{itemize}
\textbf{Output}: \begin{itemize}
    \item[] upper/lower CS bounds satisfying $C$
\end{itemize}

\begin{algorithmic}[1] %[1] enables line numbers
\STATE Initialize $\bm{q} = 
\underset{\substack{\underline{P}(y|\bm{x}) \leq q_{y|\bm{x}} \leq \overline{P}(y|\bm{x})\\\bm{x}\in\mathrm{Dom}(\X)\\y \in \mathrm{Dom}(Y)}}
%\underset{Q \in K_\text{IDM}(Y\mid X)}
{\arg \max} \sigma \mathcal{L}(\bm{q})$
\STATE $\lambda = 1, t = 1$
\WHILE{$\sum_{c\in C} \max \{0, \delta_c(\bm{q},\epsilon)\}^2>0$ and $t \leq t_\text{max}$}
\STATE $\bm{q}={\arg\max} \, \left[\sigma \mathcal{L}(\bm{q}) - \lambda\sum_{c\in C} \max \{0, \delta_c(\bm{q},\epsilon)\}^2 \right]$
\STATE $\lambda = \lambda \times 10$
\STATE $t = t+1$
\ENDWHILE
\STATE \textbf{return} $\bm{q}$
\end{algorithmic}
\end{algorithm}
We use these gradients and parameter bounds to optimize the objective function in Eq.~\eqref{eq:constropt} using the L-BFGS-B algorithm. We describe the procedure to solve the series of optimization problems in Algorithm~\ref{alg:opt}. To perform the maximization (or minimization) we start with the upper bound (or the lower bound) and solve a series of optimization problems of the form described in Eq.~\eqref{eq:constropt}. For each of these optimization problems, we use the previous solution as the initialization and we increase the value of the penalty weight $\lambda$ by a factor of 10 to allow for a jump start and early convergence.

Algorithm~\ref{alg:learn} details our procedure (KnowLearnCCS for Knowledge-driven learning of Conditional Credal Sets) to obtain the consistent conditional CSs from the data set $\mathcal{D}$ and the MISs $C$. The algorithm begins by computing the IDM conditional CSs from $\mathcal{D}$. It then uses the MISs $C$ to shrink the CS bounds. It does so by finding the highest and lowest values in the initial CS that satisfy all the constraints in $C$. These values are obtained by constrained optimization based on the barrier penalty method. This is performed by the sub-procedure detailed in Algorithm~\ref{alg:opt}.






\begin{algorithm}[!t]
\caption{KnowLearnCCS}
\label{alg:learn}
\textbf{Input}: \begin{itemize}
\item[] $\mathcal{D}$ (data set over $\X$ and $Y$)
\item[] $C$ (MISs)
\item[] $t_\text{max}$ (maximum number of iterations)
\end{itemize}
\textbf{Output}: \begin{itemize}
\item[] CS bounds %$K(Y|\x)\ \forall \x \in \text{Dom.}(\X),$ a local CS
\end{itemize}

\begin{algorithmic}[1] %[1] enables line numbers
\STATE Initialize $\underline{P}(y|\x)=\underline{P}_0(y|\x)$,  $\overline{P}(y|\x)=\overline{P}_0(y|\x)$ for each $y\in\mathrm{Dom}(Y)$ and $\x\in\mathrm{Dom}(\X)$\\
%$K_\text{IDM}(Y \mid \x)$ using the IDM as $$[\underline{P_0}(Y \mid \x ), \overline{P_0}(Y \mid \x)]$ \\
\STATE $\{\overline{P}(y|\x)\}_{y,\bm{x}} = \text{ConstrOpt}(+1, \underline{P}_0, \overline{P}_0, C, t_\text{max})$ %\COMMENT{The highest PMF in $K$ satisfying $C$} 
\STATE $\{\underline{P}(y|\x)\}_{y,\bm{x}} = \text{ConstrOpt}(-1, \underline{P}_0, \overline{P}_0, C, t_\text{max})$ %\COMMENT{The lowest PMF in $K$ satisfying $C$} 
%\STATE Define $K(Y\mid \x)$ as 
\STATE \textbf{return} $\{[\underline{P}(y|\x),\overline{P}(y|\x)]\}_{y \in \mathrm{Dom}(Y),\x \in \mathrm{Dom}(\X)}$
\end{algorithmic}
\end{algorithm}
\section{Experimental Evaluation}

High-stakes domains like healthcare require models that support cautious decision-making. While data-driven approaches like the IDM can learn CSs induced by upper and lower bounds on the PMFs, these sets can be too broad when learned from small and noisy data sets as is common in such domains. We hypothesize that qualitative domain knowledge can be used to eliminate inconsistent PMFs from such CSs making them more informative for decision-making while remaining cautious. Concretely, we aim to answer the following research questions:
\begin{enumerate}[leftmargin=2.5em]
    \item[{\bf (Q1)}] Can MISs be used to improve the coverage of a collection of CCSs in small and noisy data sets?
    \item[{\bf (Q2)}] Does imposing MIS constraints directly on the posterior distribution result in more accurate models than when imposing the constraint on the prior?
    \item[{\bf (Q3)}]  Can MISs be used to learn more accurate yet cautious models on real medical data?
\end{enumerate}
\begin{figure*}[!ht]
    \centering
    \includegraphics{Partials/gdm-cn.pdf}
    \caption{CN structure for the nuMoM2b domain. The risk of Gestational Diabetes Mellitus (GDM) is influenced by seven risk factors -- the genetic predisposition to Diabetes as measured by a Polygenic Risk Score (PRS), family history of Diabetes (Hist), the presence of Polycystic Ovary Syndrome (PCOS), the presence of high Blood Pressure at start of pregnancy (HiBP), the age at start of pregnancy $\geq 35$ (Age), the Body Mass Index at start of pregnancy $\geq 25$ (BMI) and the amount of physical activity measured in Metabolic Equivalents of Task $\geq 450$ (METs). All risk factors except METs positively monotonically influence the risk of GDM, while METs negatively monotonically influences the risk of GDM.}
    \label{fig:gdm-cn}
\end{figure*}

\begin{table*}[!ht]
    \centering
    \input{Partials/acc-unc}
    \caption{Accuracy of precise CPTs learned using the Dirichlet prior (BN), and Accuracy and Uncertainty of CCSs learned using IDM (CN-IDM), using IDM with monotonicity constraints on the prior (CN-IDM-MIS-P) and using IDM with monotonicity constraints on the posterior (CN-IDM-MIS) for each data set.}
    \label{tab:acc-unc}
\end{table*}
\eat{
    \begin{table*}[!ht]
        \centering
        \include{Partials/unc}
    
        \caption{Uncertainty}
        \label{tab:discountedacc}
    \end{table*}
}
\begin{table*}[!ht]
    \centering
    \input{Partials/discountedacc}
    \caption{Discounted accuracy of precise CPTs learned using the Dirichlet prior (BN), and the conditional credal sets learned using IDM (CN-IDM), using IDM with monotonicity constraints on the prior (CN-IDM-MIS-P) and using IDM with monotonicity constraints on the posterior (CN-IDM-MIS) for each data set.}
    \label{tab:discountedacc}
\end{table*}

\begin{table*}[htp!]
    \centering
    \input{Partials/datasets}
    \caption{The number of examples ($|\mathcal{D}|$), the target ($Y$) and feature variables ($\X$) for each of the data sets used for empirical evaluation. The data sets are of three types -- BN based (rows 1--5), UCI Benchmark (rows 6--10) and medical study data (rows 11--14). A feature with the superscript + denotes a positive monotonic influence, and a feature with the superscript - denotes a negative monotonic influence.}
    \label{tab:datasets}
\end{table*}

\paragraph{Data sets.} To answer these research questions, we consider three types of data sets -- data sampled from BNs, benchmark data sets and medical study data sets. We used three standard BNs -- Asia, Cancer, and LUCAS -- to generate the first five data sets. These BNs represent well-defined causal relationships between variables providing a controlled environment where domain knowledge is guaranteed to be correct. We used five data sets from the UCI Machine Learning repository as benchmark data sets, namely, Haberman's Survival, Pima Indians Diabetes, Breast Cancer, Thyroid Disease, and Heart Disease. We use the same pre-processing and domain knowledge as in prior work~\citep{YangEtAl2013} for these data sets. Finally, we used data sets from four medical studies, namely, Alzheimer's Disease Neuroimaging Initiative (ADNI), Rare diseases Survey (Rare,~\citealp{macleod2016identifying}), Post-Partum Depression Survey (PPD,~\citealp{PPDNatarajan2017}), and Nulliparous Pregnancy Outcomes Study: Monitoring Mothers-to-Be (nuMoM2b,~\citealp{numom2b}). The target variables ($Y$) in all the data sets are Boolean and the parents ($\X$) are ordinal variables. Table~\ref{tab:datasets} details the size of the datasets, the Boolean target variables considered in our experiments, and the parent variables of the target together with the kind of monotonic influence they have on the target.

\paragraph{Methods.} We compare our algorithm (discussed in the previous section and denoted here as CN-IDM-MIS) against two baselines -- (i) a CN estimator based on the pure IDM (denoted as CN-IDM); (ii) a CN estimator that applies constraints in the Imprecise Dirichlet Prior (denoted as CN-IDM-MIS-P,~\citealp{CamposC05}). To illustrate the difference in the types of data sets, we also present the results for a precise BN estimator with a Dirichlet prior (denoted as BN).

We set the ESS $s=2$ for all data sets and models. Additionally, we set $\epsilon = 0.01$ for all the constraints in the BN data sets and $\epsilon = 0.001$ for the other data sets. The Python code used for the experiments is freely available in a public repository\footnote{\scriptsize\url{https://github.com/saurabhmathur96/credal-cpd}}.

We perform inference in the CN models by interval-dominance. If neither value of the Boolean target interval-dominates the other, we mark that data point as uncertain and do not make an inference for it. For BNs, we perform inference by predicting the positive value when its probability is $\geq 0.5$.

\paragraph{Metrics.}
We evaluate the methods using three metrics -- the fraction of uncertain data points (\emph{uncertainty}), \emph{accuracy} over non-uncertain data points, and a utility-discounted accuracy (\emph{discounted accuracy}). 

Compared to its Bayesian counterpart, whose uncertainty is zero by construction, a credal method typically achieves higher accuracy at the price of a growing uncertainty (see, e.g., \citealp{antonucci2017a}).
The discounted accuracy provides a summary of such a trade-off: this performance descriptor accounts for uncertain data points by assigning them a score of 0.5 while scoring the correct and incorrect classifications identically to accuracy (as 1 and 0 respectively). Discounted accuracy coincides with the accuracy for models that always make predictions (like BNs). 

We compute these metrics by five-fold cross-validation. Additionally, to simulate small data settings, we limit the training set size in the BN data sets to 50 data points.

\vspace{1em}
\paragraph{Results.}\begin{enumerate}[leftmargin=2.5em]
    \item[{\bf (Q1)}] Rows 1--5 and 6--10 of Table~\ref{tab:acc-unc} present the accuracy and the number of uncertain examples for the BN based and the UCI benchmark data sets. To simulate a small-data setting, we fixed the training set size for the BN data sets to 50. The UCI benchmark data sets are both noisy and small. This can be seen by the reduction in accuracy of the precise BN estimator from the BN data sets to the UCI data sets (from 85.2\% to 75.9\%).
    
    The IDM method achieves high accuracies for these data sets, but the price is being uncertain about a large number of test examples. The uncertainty rate is 10\% for the BN data sets on average. This increases to 35.5\% for the UCI benchmark data sets. 

    The methods using qualitative knowledge (CN-IDM-MIS-P and CN-IDM-MIS) reduce the number of uncertain examples relative to CN-IDM by 20.2\% on average with an average relative decrease in accuracy of just 0.4\%. Hence, Q1 is answered affirmatively.

    \item[{\bf (Q2)}] Recall that the CN-IDM-MIS-P method imposes the monotonic constraints on the imprecise Dirichlet prior while the CN-IDM-MIS method imposes the constraints directly on the posterior distribution. Rows 1--5 and 6--10 of Table~\ref{tab:discountedacc} present the discounted accuracy for the BN based and the UCI benchmark data sets. CN-IDM-MIS achieves the same or better discounted accuracy for all the data sets (same for LUCAS-c and Thyroid). The average improvement in discounted accuracy in CN-IDM-MIS relative to CN-IDM-MIS-P is 1.5\%. Hence, Q2 is answered affirmatively.

    \item[{\bf (Q3)}] Rows 11--14 of Table~\ref{tab:acc-unc} present the accuracy and the number of uncertain examples for the four medical data sets. The methods using qualitative knowledge reduce the uncertainty by 54.3\% on average with an average reduction of 0.7\% in the accuracy. Rows 11--14 of Table~\ref{tab:discountedacc} present the discounted accuracies for the medical data sets. On average, the methods using qualitative knowledge achieve a 2.7\% improvement in the discounted accuracy, and CN-IDM-MIS achieves an improvement of 3.3\% in discounted accuracy over CN-IDM-MIS-P. Hence, Q3 is answered affirmatively.
\end{enumerate}

Table~\ref{tab:acc-unc} shows that the data-driven credal approach CN-IDM typically achieves the highest accuracy at the price of a high uncertainty. On the other hand, precise models like BNs lack a way to abstain from prediction and as a result always make a prediction, even if the prediction might be unreliable. In such a situation, our approach CN-IDM-MIS might represent a reasonable balance. This can also be seen in the discounted accuracy scores in Table~\ref{tab:discountedacc}. Utility-discounted accuracy assigns a score of 0.5 to uncertain data points and tends to over-penalize credal models in comparison to precise models, making such a comparison unfair to the credal model~\citep{zaffalon2012evaluating}. In this light, the fact that our approach outperforms the precise BN on discounted accuracy on many data sets indicates significant benefit from the use of qualitative knowledge.

\section{Conclusion and Future work}
In many domains such as healthcare, an interval probability estimate often suffices and a precise point estimate is not required. For instance, if the probability intervals for an event occurring and not occurring do not overlap, the estimate can already result in an actionable outcome. Hence, we considered the problem of learning credal networks from data and domain-specific qualitative knowledge. We presented an IDM-based procedure to learn credal networks from data in a way that is also consistent with the qualitative knowledge expressed by monotonic influence statements. This is achieved by an iterative procedure shrinking the IDM bounds. Our empirical evaluation demonstrates that the proposed algorithm yields conditional credal sets that have higher coverage without losing much accuracy.

There are several directions for future research. The first is to extend the proposed method to support other qualitative influence statements like synergies, where one specifies the effect of more than one random variable on a target (for example, higher BMI with a lower HDL level increases the risk of heart attack). Next, one could consider a more general setup where the qualitative influence statements are not restricted to parent-child relations but are instead over joint distributions. Also, one could employ the recent generative AI models to provide weak knowledge. Finally, learning from multiple experts while assessing the credibility of each expert could open up human-allied learning to very large problems such as healthcare.

\subsection*{Acknowledgements}
The authors acknowledge the support by AFOSR award FA9550-23-1-0239, ARO award W911NF2010224, and NIH grant R01HD101246.

\bibliography{references}
\end{document}