\documentclass[accepted,hidelinks]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.

\usepackage{float}
\usepackage{multirow}
\usepackage{courier}
\usepackage{listings, lstautogobble,amsfonts}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{mdframed}
\usepackage{mathtools}
\usepackage[finalizecache=false,frozencache=false,newfloat]{minted}
\usepackage{textcmds}
\usepackage{xspace}
\usepackage{xcolor}
\usepackage[normalem]{ulem}
\usepackage{caption}
\usepackage{multicol}
\usepackage{bbm}
\usepackage{thmtools}
\usepackage{bm}
\usepackage{thm-restate}
%\usepackage{todonotes}
\usepackage[inline]{enumitem}
\usepackage{soul}
\usepackage{physics}
\usepackage{wrapfig}

\input{macros}

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables


% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Neural Probabilistic Logic Programming in Discrete-Continuous Domains}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<lennert.desmet@kuleuven.be>?Subject=Your UAI 2023 paper}{Lennert De Smet}{}}
\author[2]{Pedro Zuidberg Dos Martires}
\author[1]{Robin Manhaeve}
\author[1]{Giuseppe Marra}
\author[1]{Angelika Kimmig}
\author[1,2]{Luc De Raedt}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    KU Leuven\\
    Belgium
}
\affil[2]{%
    Center for Applied Autonomous Systems\\
    \"Orebro\\
    Sweden
}
\begin{document}
\maketitle

\begin{abstract}
    Neural-symbolic AI (NeSy) allows neural networks to exploit symbolic background knowledge in the form of logic. It has been shown to aid learning in the limited data regime and to facilitate inference on out-of-distribution data.
    Probabilistic NeSy focuses on integrating neural networks with both logic and probability theory, which additionally allows learning under uncertainty.
    A major limitation of current probabilistic NeSy systems, such as \dpl, is their restriction to finite probability distributions, i.e., discrete random variables.
    In contrast, deep probabilistic programming (DPP) excels in modelling and optimising continuous probability distributions.
    Hence, we introduce \dspl, a neural probabilistic logic programming language that incorporates DPP techniques into NeSy.
    Doing so results in the support of inference and learning of both discrete and continuous probability distributions under logical constraints.
    Our main contributions are
    1) the semantics of \dspl and its corresponding inference algorithm,
    2) a proven asymptotically unbiased learning algorithm, and
    3) a series of experiments that illustrate the versatility of our approach.
\end{abstract}

\section{Introduction}\label{sec:intro}


Neural-symbolic AI (NeSy)~\citep{garcez2002neural,dereadt2021statistical} focuses on the integration of symbolic and neural methods. 
The advantage of NeSy is that it combines the reasoning power of logical representations with the learning capabilities of neural networks.
Additionally, it has been shown to converge faster during learning and to be more robust~\citep{rocktaschel2017end,xu2018semantic,evans2018learning}.

The challenge of NeSy lies in combining discrete symbols with continuous and differentiable neural representations.
So far, such a combination has been realised for Boolean variables by interpreting the outputs of neural networks as the weights of these variables.
These weights can then be given either a fuzzy semantics~\citep{badreddine2022logic,diligenti2017sbr}
or a probabilistic semantics~\citep{manhaeve2021neural,yang2020neurasp}. 
The latter is also used in neural probabilistic logic programming (NPLP), where neural networks parametrise probabilistic logic programs.

A shortcoming of traditional probabilistic NeSy approaches is that they fail to capture models that integrate continuous random variables and neural networks --
a feature already achieved with mixture density networks~\citep{bishop1994mixture} and more generally deep probabilistic programming (DPP)~\citep{tran2017deep,bingham2019pyro}.
However, it is unclear whether DPP can be generalised to enable logical and relational reasoning.
Hence, a gap exists between DPP and NeSy as reasoning is, after all, a fundamental component of the latter.
We contribute towards closing this DPP-NeSy gap by introducing \dspl\footnote{\q{Sea} stands for the letter C, as in \textbf{c}ontinuous random variable.},
an NPLP language
with support for discrete-continuous random variables that retains logical and relational reasoning capabilities.
We achieve this integration by allowing
arbitrary and differentiable probability distributions expressed in a modern DPP language
while combining knowledge compilation~\citep{darwiche2002knowledge} with the reparametrisation trick~\citep{ruiz2016generalized} and continuous relaxations~\citep{petersen2021learning}.

Our main contributions are \begin{enumerate*}[label=(\arabic*)]
    \item the well-defined probabilistic semantics of \dspl (Section~\ref{sec:dspl}) with an inference algorithm based on
    weighted model integration (WMI)~\citep{belle2015probabilistic}
    (Section~\ref{subsec:reduction}),
    \item a proven asymptotically unbiased gradient estimate for WMI that turns \dspl into a differentiable, discrete-continuous NPLP language (Section~\ref{subsec:learning}), and
    \item an experimental evaluation showing the versatility of discrete-continuous reasoning and the efficacy of our approach\label{contrib3} (Section~\ref{sec:experiments})
\end{enumerate*}.

\section{Logic programming concepts}
\label{sec:lp}

A term~\probloginline{t} is either a constant~\probloginline{c}, a variable~\probloginline{V} or a structured term of the form
\probloginline{f(t@$_1$@,...,t@$_K$@)},
where \probloginline{f} is a functor and each \probloginline{t@$_i$@} is a term. 
Atoms are expressions of the form
\probloginline{q(t@$_1$@,...,t@$_K$@)}.
Here, \probloginline{q/@$K$@} is a predicate of arity $K$ %(of arity $n$, or $q/n$ in shorthand notation)
and each
\probloginline{t@$_i$@} is a term.
A literal is an atom or the negation of an atom 
$\neg \text{\probloginline{q(t@$_1$@,...,t@$_K$@)}}$.
A definite clause (also called a rule) is an expression of the form
\probloginline{h:- b@$_1$@,...,b@$_K$@}
where \probloginline{h} is an atom and each \probloginline{b@$_i$@} is a literal.
Within the context of a rule, \probloginline{h} is called the head and the conjunction of \probloginline{b@$_i$@}'s is referred to as the body of the rule. Rules with an empty body are called facts.
A logic program is a finite set of definite clauses. 
If an expression does not contain any variables, it is called ground.
Ground expressions are obtained from non-ground ones by means of substitution.
A substitution
$\theta = \{
    \text{\probloginline{V@$_1$@}}
    =
    \text{\probloginline{t@$_1$@}}
    ,
    \dots,
    \text{\probloginline{V@$_K$@}}
    =
    \text{\probloginline{t@$_K$@}}
\}$
is a mapping from variables~\probloginline{V@$_i$@} to terms~\probloginline{t@$_i$@}. 
Applying a substitution $\theta$ to an expression \probloginline{e} (denoted \probloginline{e@$\theta$@})
replaces each occurrence of \probloginline{V@$_i$@} in \probloginline{e} with the corresponding \probloginline{t@$_i$@}.

While \emph{pure} Prolog (or definite clause logic) is defined using the concepts above, practical implementations of Prolog extend definite clause logic with an external arithmetic engine~\citep[Section 8]{sterling1994art}. Such engines enable the use of system-specific routines in order to handle numeric data efficiently.
Analogous to standard terms in definite clause logic, as defined above, we introduce numeric terms. A numeric term  \probloginline{n@$_i$@} is either a numeric constant (a real, an integer, a float, etc.), a numeric variable \probloginline{N@$_i$@},
or a numerical functional term, which is an expression of the form
\probloginline{@$\varphi$@(n@$_1$@,...,n@$_K$@)} where \probloginline{@$\varphi$@} is an externally defined numerical function.
The difference between a standard logical term and a numerical term is that {\em ground} numerical terms are evaluated and yield a numeric constant. For instance,  if \probloginline{add} is a function, then 
\probloginline{add(3, add(5, 0))} evaluates to the numerical constant \probloginline{8}. 

Lastly, numeric constants can be compared to each other using a built-in binary comparison operator
$\bowtie\ \in
\left\{
\text{\probloginline{<}},
\text{\probloginline{=<}},
\text{\probloginline{>}},
\text{\probloginline{>=}},
\text{\probloginline{=:=}},
\text{\probloginline{=\=}}
\right\}$.
Here we use Prolog syntax to write comparison operators, which correspond to $\{ <, \leq, >, \geq, =, \neq \}$ in standard mathematical notation.
Comparison operators appear in the body of a rule, have two arguments, and are generally written as
$
\text{\probloginline{@$\varphi_l$@(n@$_{l,1}$@,...,n@$_{l,K}$@)}}
\bowtie
\text{\probloginline{@$\varphi_r$@(n@$_{r,1}$@,...,n@$_{r,K}$@)}}.
$
They evaluate
their left and right side and subsequently compare the results, assuming everything is ground.
If the 
%stated
comparison holds, it is interpreted 
%by the logic program 
as true, else as false.


\section{\dspl}
\label{sec:dspl}

\subsection{Syntax}


While facts in pure Prolog are deterministically true, in probabilistic logic programs they are annotated with the probability with which they are true. These are the so-called probabilistic facts~\citep{de2007problog}. When working in discrete-continuous domains, we need to use the more general concept of distributional facts~\citep{zuidberg2023declarative},
inspired by the distributional clauses of~\citet{gutmann2011magic}.

\begin{definition}[Distributional fact]  \emph{Distributional facts} are expressions of the form
\probloginline{x ~ distribution(n@$_1$@,...,n@$_K$@)}, where \probloginline{x} denotes a term,  the \probloginline{n@$_i$@}'s are numerical terms and
\probloginline{distribution} expresses the probability distribution according to which \probloginline{x} is distributed.
\end{definition}

\begin{example}[Distributional fact]
To declare a Poisson distributed variable \probloginline{x} with rate parameter $\lambda$, one would write \probloginline{x ~ poisson(@$\lambda$@).}
\end{example}

The meaning of a distributional fact is that all ground instances $\text{\probloginline{x}}\theta$ serve as random variables that are distributed according to $\text{\probloginline{distribution(n@$_1\theta$@,...,n@$_K\theta$@)}}$.
To obtain a neural-symbolic interface, we will allow neural networks to parametrise these distributions.

\begin{definition}[Neural distributional fact]
\label{def:ndf}
A \emph{neural distributional fact} (NDF) is a distributional fact \probloginline{x ~ distribution(n@$_1$@,...,n@$_K$@)} in which
a subset
% $\{ \text{\probloginline{f@$_j$@}}\}_{j=1}^{L} $ 
of the set of numerical terms $\{ \text{\probloginline{n@$_i$@}}\}_{i=1}^{K}$ is implemented by neural networks that depend on a set of parameters $\Neuralparams$.
\end{definition}

Random variables defined by NDFs can then be used in the logic in the form of comparisons, e.g., $\text{\probloginline{x}} > \text{\probloginline{y}}$, to reason about desired ranges of the variables.

\begin{definition}[Probabilistic comparison formula]
\label{def:pcf}
A \emph{probabilistic comparison formula} (PCF) is an expression of the form $(g(\boldsymbol{x}) \bowtie 0)$,
where $g$ is a function applied to the set of random variables $\variables{x}$ and $\bowtie\ \in
\left\{
\text{\probloginline{<}},
\text{\probloginline{=<}},
\text{\probloginline{>}},
\text{\probloginline{>=}},
\text{\probloginline{=:=}},
\text{\probloginline{=\=}}
\right\}$ is a binary comparison operator.
A PCF is called \emph{valid} if $\left\{\boldsymbol{x}\ |\ g(\boldsymbol{x}) \bowtie 0\right\}$ is a \emph{measurable} set.
\end{definition}

\begin{example}[Probabilistic comparison formula]
If \probloginline{x} is Poisson distributed and represents the number of chocolate pieces put in a chocolate biscuit, then we can use a simple PCF to define when such a biscuit passes a quality test through the rule \probloginline{passes_test :- (x > 11).}
\end{example}

Note that the general form of a PCF in Definition~\ref{def:pcf} has a $0$ on the right-hand side, which can always be obtained by subtracting the right-hand side from both sides of the relation.
With the definitions of NDFs and PCFs, a \dspl program can now be formally defined.

\begin{definition}[\dspl program]
A \emph{\dspl program} consists of
a finite set of  NDFs $\dfacts$ (defining random variables),
a finite set $\compset$ of  valid PCFs
and a set of logical rules $\lrules$ that can use any of those valid PCFs in their bodies.
\end{definition}

\begin{example}[\dspl program]
\label{ex:dsplprogram}
\probloginline{humid} denotes a Bernoulli
random variable that takes the value $1$ with a probability $p$ given by the output of a neural network \probloginline{humid_detector}. \probloginline{temp} denotes a normally distributed variable whose parameters are predicted by a network \probloginline{temperature_predictor}.
The program further contains two rules that deduce whether we have good weather or not. The first one expresses the case of snowy weather, while the second holds for a rather temperate and dry situation.
The atom \probloginline{query(good_weather(}$\inlineimage{Imagery/world.png}$\probloginline{))} declares that we want to compute the probability of \probloginline{good_weather} when evaluated on the data $\inlineimage{Imagery/world.png}$.
It illustrates the neural-symbolic nature of \dspl, as its ground argument is a sub-symbolic representation ($\inlineimage{Imagery/world.png}$) of the world. In an actual program, the $\inlineimage{Imagery/world.png}$ symbol would be represented by a variable.

{\footnotesize
\begin{problog}
humid(Data) ~ 
    bernoulli(humid_detector(Data)).
temp(Data) ~ 
    normal(temperature_predictor(Data)).

good_weather(Data):- 
    humid(Data) =:= 1, temp(Data) < 0.

good_weather(Data):- 
    humid(Data) =:= 0, temp(Data) > 15.

query(good_weather(@$\inlineimage{Imagery/world.png}$@)).
\end{problog}
}
\end{example}

Notice how the random variables \probloginline{humid} and \probloginline{temp} appear in the body of a logical rule with comparison operators.
In our probabilistic setting, the truth value of a comparison depends on the value of its random variables and is thus random itself.

\paragraph{\dspl generalises a range of existing PLP languages.}
For instance, if we were to remove the distributional fact on \probloginline{temp} and all the PCFs using them, we would obtain a \dpl program~\citep{manhaeve2021neural}. If we additionally replace the neural network in \probloginline{humid} with a fixed probability \probloginline{p}, we end up with a probabilistic logic program~\citep{de2007problog}. Replacing that constant probability \probloginline{p} by a constant \probloginline{1} yields a non-probabilistic Prolog program.
Alternatively, considering all rules and facts in Example~\ref{ex:dsplprogram} but
replacing the neural parameters of the normal distribution with numeric constants results in a Distributional Clause program~\citep{gutmann2011magic}.
We further discuss these connections
in Appendix A, where we also formally prove that \dspl strictly generalises \dpl.

\subsection{Semantics}

\dspl programs are used to
compute the probability that a ground atom \probloginline{q} is entailed.
That probability follows from the semantics of a \dspl program.
As is custom in (probabilistic) logic programming, we will define the semantics of \dspl with respect to ground programs.
We will assume that each ground distributional fact $f \in \dfacts$ defines a different random variable, as each random variable can only have one unique distribution.
Also notice that any ground neural distributional facts will contain the inputs to their neural functions.
In a sense, a \dspl program is conditioned on these neural network inputs.

To define the semantics of ground \dspl programs, we first introduce the possible worlds over the PCFs.
Every subset $\compsubset$ of a set of PCFs $\compset$ defines a possible world $\world_{\compsubset} = \compsubset \cup \{ h \theta \mid \lrules \cup \compsubset  \models h\theta \text{ and } h\theta \text{ is ground} \}$.
Intuitively speaking, the comparisons in such a subset are considered to be true and all others false. A rule with a comparison in its body that is not in this subset can hence not be used to determine the truth value of atoms. The deterministic rules $\lrules$ and the subset $\compsubset$ together define a set of all ground atoms $h\theta$ that are derivable, i.e., entailed by the program, and thus considered true. Such a set is called a \textit{possible world}. We refer the reader to the paper of~\citet{deraedt2015concepts} for a detailed account of possible worlds in a PLP context.
Following the distribution semantics of~\citet{sato1995statistical} and by taking inspiration from~\citet{gutmann2011magic}, we define the probability of a possible world.

\begin{definition}[Probability of a possible world]
Let \dsplprogram be a ground \dspl program
and $\compsubset=\{ c_1,\dots, c_H \} \subseteq \compset$ a set of PCFs that depend on the random variables declared in the set of distributional facts $\dfacts$. The probability $P(\world_{\compsubset})$ of a world $\world_{\compsubset}$
is then defined as
\begin{align}
        \int
        \left[
            \Big(\prod_{c_i\in \compsubset} \indicator(c_i) \Big)
            \Big( \prod_{c_i \in \compset\setminus \compsubset}  \indicator(\bar{c}_i) \Big)
        \right]
        \ \differential P_{\dfacts}.
        \label{eq:world_probability}
\end{align}

Here the symbol $\indicator$ denotes the indicator function, $\bar{c}_i$ is the complement of the comparison $c_i$ and $\differential P_{\dfacts}$ represents the joint probability measure of the random variables defined in the set of distributional facts $\dfacts$.
\end{definition}

\begin{example}[Probability of a possible world]
Given \dsplprogram as in Example \ref{ex:dsplprogram}, 
where \probloginline{humid_detector(data1)} predicts $p(\text{\probloginline{data1}})$ and \probloginline{temperature_predictor(data1)} predicts the tuple $(\mu(\text{\probloginline{data1}}),\sigma(\text{\probloginline{data1}}))$, the probability of the possible world $\omega_{\left\{\text{\probloginline{temp(data1)>15}},\ \text{\probloginline{humid(data1)=:=1}}\right\}}$ is given by
\begin{align}
    p(\text{\probloginline{data1}})\cdot
    \int\indicator(x {>} 15)
    \frac{
    \exp
    \left(
        -\frac{
            \left(x-\mu(\text{\probloginline{data1}}) \right)^2
        }
        {
            2 \sigma^2(\text{\probloginline{data1}})
        }
    \right)
    }{\sqrt{2\pi} \sigma(\text{\probloginline{data1}})}
    \ \differential x.
\end{align}
Indeed, the measure $\differential P_{\dfacts}$ decomposes into a counting measure and the product of a Gaussian density function with a differential. The counting measure leads to the factor $p(\text{\probloginline{data1}})$, since that is the probability that \probloginline{humid(data1)=:=1}. Hence, the products in Equation~\ref{eq:world_probability} reduce to a single indicator of the PCF $(x > 15)$.
\end{example}

\begin{definition}[Probability of query atom]
The probability of a ground atom $q$ is given by
\begin{align}
    P(q) = \sum_{\compsubset \subseteq \compset : q \in \world_{\compsubset} } P(\world_{\compsubset})
    \label{eq:query_probability}.
\end{align}
\end{definition}

\begin{restatable}[Measurability of query atom]{proposition}{querymeasurability}
\label{proposition:query_measurability}
Let \dsplprogram be a \dspl program, then
\dsplprogram 
defines, for an arbitrary query atom $q$, the probability that $q$ is true.
\label{prop:semantics}
\end{restatable}
% \phantom{=}
\begin{prf}
See Appendix B.
\end{prf}

\section{Inference and learning}\label{sec:learning}

\subsection{Inference via weighted logic}
\label{subsec:reduction}

A popular technique to perform inference in probabilistic logic programming uses a reduction to so-called \emph{weighted model counting} (WMC); instead of computing the probability of a query, one computes the weight of a propositional logical formula~\citep{chavira2008probabilistic,fierens2015inference}. For \dspl, the equivalent approach is to map a ground program onto a \emph{satisfiability modulo theories} (SMT) formula~\citep{barrett2018satisfiability}. The analogous concept to WMC for these formulas is \emph{weighted model integration} (WMI)~\citep{belle2015probabilistic}, which can handle infinite sample spaces.
In all that follows, for ease of exposition, we assume that all joint probability distributions are continuous.

\begin{restatable}[Inference as WMI]{proposition}{inferenceaswmi}
\label{proposition:inferenceaswmi}
Assume that the measure $\differential P_{\dfacts}$ decomposes into a joint probability density function $\weight(\variables{x})$ and a differential $\differential \variables{x}$, then the probability $P(q)$ of a query atom $q$ can be expressed as the weighted model integration problem
\begin{align}
    % P(q) =
    \int \left[
        \sum_{\compsubset \subseteq \compset : q \in \world_{\compsubset}} \prod_{c_i\in \compsubset {\cup} \negcompsubset}  \indicator(c_i(\variables{x}))
    \right]
    \weight(\variables{x})
    \ \differential\variables{x},
    \label{eq:probability_query_as_wmi}
\end{align}
where
$
    \negcompsubset \coloneqq \left\{\bar{c}_i\ |\ c_i \in \compset {\setminus} \compsubset\right\}
$
.
\end{restatable}
% \phantom{=}
\begin{prf}
See Appendix C.
\end{prf}

Being able to express the probability of a queried atom in \dspl as a weighted model integral allows us to adapt and deploy inference techniques developed in the weighted model integration literature for \dspl.
We opt for the approximate inference algorithm \q{Sampo} presented in~\citet{zuidberg2019exact} because of its more scalable nature.
Sampo uses knowledge compilation~\citep{darwiche2002knowledge}, a state-of-the-art technique for probabilistic logic inference~\citep{chavira2008probabilistic,fierens2015inference}.
Intuitively, knowledge compilation is a two-step procedure applied to a logical formula with PCFs, i.e., an SMT formula.
First, it infers the exact probability of all PCFs containing discrete variables through symbolic inference.
Then, it converts the remainder of the SMT formula into a polynomial in terms of those exact probabilities and the PCFs containing continuous random variables (Figure~\ref{fig:amcdiagram2}).
This polynomial is the integrand of Equation~\ref{eq:probability_query_as_wmi}. All that remains is to approximate the integration of this polynomial by sampling from the joint probability distribution $\weight(\variables{x})$ of the continuous random variables.
In other words, Sampo computes the expression
\begin{align}
    P(q) =
    \int \amc(\variables{x}) \cdot  \weight(\variables{x}) \, \differential\variables{x}
   % \ \approx\ 
   \approx
    \frac{1}{|\mathcal{X}|}
    \sum_{\variables{x} \in \mathcal{X}} \amc(\variables{x}),
    \label{eq:probability_query_approx}
\end{align}
where $\mathcal{X}$ denotes a set of samples drawn from $\weight(\variables{x})$ and $\amc(\variables{x})$ is the result of knowledge compilation, i.e., the sum of products of indicator functions in Equation~\ref{eq:probability_query_as_wmi}.

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{Imagery/AMCdiagram2.pdf}
    \caption{Diagrammatic representation of the result of knowledge compilation for the query in Example~\ref{ex:dsplprogram}.
    The \textcolor{celadon_blue}{blue} boxes originate from PCFs over discrete variables, while the \textcolor{orange_red}{orange} ones are PCFs over continuous variables. Note how the discrete variable PCFs are reduced to their exact probabilities while the continuous PCFs still need to be inferred.
    }
    \label{fig:amcdiagram2}
\end{figure}

We stress that the Sampo algorithm only samples random variables whose expected value with respect to the function $\amc(\variables{x})$ cannot be computed exactly.
Hence, in the absence of continuous random variables, our implementation of \dspl using Sampo coincides with \dpl on both a semantics level and inference level.


\subsection{Learning via differentiation}\label{subsec:learning}

A \dspl program depends on a set of (neural) parameters
$\Neuralparams$ (Definition~\ref{def:ndf}).
To optimise these parameters, we need the gradient, with respect to them, of a loss function that compares the probability $P(q)$ to a training signal. More precisely, we need to compute the derivative
\begin{align}
     \deriv \mathcal{L}(P_{\Neuralparams}(q)) =
%    \ \approx\ 
    \partial_{P_{\Neuralparams}(q)} \mathcal{L}(P_{\Neuralparams}(q))\cdot \deriv P_{\Neuralparams}(q), 
\end{align}
where we explicitly indicate the dependency of the probability on $\Neuralparams$ and $\neuralparam\in \Neuralparams$.
Differentiating $P_{\Neuralparams}(q)$ with respect to $\neuralparam$ presents two obstacles.
First,
the question of differentiating through the sampling  process of Equation~\ref{eq:probability_query_approx}
and second,
the non-differentiability of the indicator functions in $\amc(\variables{x})$~\citep{zuidberg2019differentiation}.

The non-differentiability of sampling is tackled using the reparametrisation trick~\citep{ruiz2016generalized}.
Reparametrisation offers better estimates than other approaches, such as REINFORCE~\citep{williams1992simple}, and is readily utilised in modern probabilistic programming languages such as TensorFlow Probability~\citep{tran2017deep} and Pyro~\citep{bingham2019pyro}.
Conversely, the non-differentiability of the indicator functions prevents swapping the order of differentiation and integration~\citep{flanders1973differentiation}, which we resolve by applying continuous relaxations following the work of~\citet{petersen2021learning}.
Together, we obtain the gradient estimate
\begin{align}
    \deriv P_{\Neuralparams}(q)
    &= 
    \deriv 
    \int \amc(\variables{x}) \cdot  \weight_{\Neuralparams}(\variables{x}) \, \differential\variables{x} \\
    &\approx\ 
    \int
    \left[
        \deriv \amc_\softened(\reparam)
    \right] \cdot 
    p(\variables{u})
    \ \differential \variables{u}
    \label{eq:probability_derivative}
    ,
\end{align}
where the subscript $s$ in $\amc_\softened(\variables{x}) $ denotes the continuously relaxed or \q{softened} version of $\amc(\variables{x})$ and $r(\variables{u}, \Neuralparams)$ is the reparametrisation function.

\paragraph{Our gradient estimate using relaxations is asymptotically unbiased.}
As an example of these relaxations, consider the indicator of a PCF $(g(\boldsymbol{x}) > 0)$, which is relaxed into the sigmoid $\sigma(\coolness\cdot g(\boldsymbol{x}))$.
Appendix D provides more details on relaxations of general PCFs.
The \textit{coolness} parameter $\coolness\in \mathbb{R}_+^0$
determines the strictness of the relaxation. Hence, we recover the hard indicator function when $\coolness{\rightarrow} +\infty$.
Note that relaxing indicator functions 
introduces bias.
\citet{petersen2021learning} already stated in their work that, in the infinite coolness limit, a relaxed function coincides with the non-relaxed one. Proposition~\ref{proposition:unbiasedapprox} extends this result
to the derivatives of relaxed and non-relaxed functions, proving that our gradient estimate is asymptotically unbiased.

\begin{restatable}[Unbiased in the infinite coolness limit]{proposition}{unbiasedapprox}
\label{proposition:unbiasedapprox}
Let \dsplprogram be a \dspl program with PCFs $(g_i(\boldsymbol{x}) \bowtie 0)$ and corresponding coolness parameters $\coolness_i$. \\
If all $\deriv (g_i \circ r)$ are locally integrable over $\mathbb{R}^k$ and
every $\coolness_i \rightarrow +\infty$,
then we have, for any query atom $q$, that
\begin{align}
    \deriv P(q)
    =
    \int
    \deriv\amc_\softened(\reparam)
    \cdot p(\variables{u})
    \ \differential \variables{u}
    .
\end{align}
\end{restatable}
\begin{proof}
The proof makes use of the mathematical theory of distributions~\citep{schwartz1957theorie}, which generalise the concept of functions, and is given in Appendix E.
\end{proof}

Finally, we obtain a practical and asymptotically unbiased estimate of $\deriv P_{\Neuralparams}(q)$ using a set of samples $\mathcal{U}$ drawn from 
$p(\variables{u})$.
\begin{align}
    \deriv P(q)
    &\approx\ 
    \int
        \left[
            \deriv      \amc_\softened(\reparam) 
        \right]
        \cdot p(\variables{u})
    \ \differential \variables{u} \\
    &\approx\ 
    \frac{1}{|\mathcal{U}|} \sum_{\variables{u} \in \mathcal{U}} \deriv \amc_\softened(\reparam).
    \label{eq:WMIDerFinalApprox}
\end{align}
Computing this gradient estimate does not require drawing new samples. Implementing the relaxations of PCFs in a \q{straight-through} manner allows us to directly apply automatic differentiation on the inferred probability.

\subsection{Probabilistic programming connections}

Since knowledge compilation symbolically infers discrete random variables, we only have to sample from a continuous joint probability distribution.
To sample such distributions, we can fully exploit the advanced inference and learning techniques~\citep{hoffman2014no} of modern probabilistic programming languages~\citep{tran2017deep, bingham2019pyro}. Our implementation of \dspl utilises TensorFlow Probability for this task, effectively using knowledge compilation as a differentiable bridge between logical and probabilistic reasoning.
While this bridge is limited to sampling techniques for now, it presents an interesting direction for future work to completely unify NeSy with DPP.



\subsection{Limitations}\label{subsec:limitations}

While the use of relaxations is well-known and used in recent gradient estimators~\citep{tucker2017rebar, grathwohl2018backpropagation}, the bias they introduce is often hard to deal with in practice.
In our case, this bias only reduces to zero in the infinite coolness limit (Proposition~\ref{proposition:unbiasedapprox}), meaning the use of annealing can be necessary. Finding a good annealing scheme for any problem is non-trivial and effectively introduces another component in need of optimisation. However, as relaxations allow the use of the reparametrisation trick, the resulting lower variance estimates together with our theoretical guarantees support our choice. A more detailed discussion of the current limitations of \dspl can be found in Appendix H.



\section{Related work}
\label{sec:related}

From a NeSy perspective, the formalism most closely related to \dspl is that of \emph{Logic Tensor Networks} (LTNs)~\citep{badreddine2022logic}. The main difference between LTNs and \dspl is the fuzzy logic semantics of the former and the probabilistic semantics of the latter.
Interestingly, LTNs and other NeSy approaches based on fuzzy logic also require relaxations to incorporate continuous values.
However, fuzzy-based approaches require these relaxations at the semantics level, in contrast to \dspl.
Even more, they can only compare continuous point values instead of more general continuous random variables.
LTNs' fuzzy semantics also exhibit drawbacks on a more practical level.
Unlike \dspl with its probabilistic semantics, LTNs are not inherently capable of neural-symbolic generative modelling (Section~\ref{subsec:vae}). 
For a broader overview of the field of neural-symbolic AI, we refer the reader to a series of survey papers that have been published in recent years~\citep{garcez2019neural, marra2021statistical,garcez2022neural}.

From a probabilistic programming perspective, \dspl is related to languages that handle discrete and continuous random variables such as \emph{BLOG}~\citep{milch2006probabilistic}, \emph{Distributional Clauses}~\citep{gutmann2011magic} and \emph{Anglican}~\citep{tolpin2016design}, which have all been given declarative semantics, i.e., the meaning of the program does not depend on the underlying inference algorithm. However, these languages have the drawback of non-differentiability.
This drawback stands in stark contrast to end-to-end (deep) probabilistic programming languages such as Pyro~\citep{bingham2019pyro} or TensorFlow Probability~\citep{dillon2017tensorflow}, but these have only been equipped with operational semantics and do not support logical constraints.
\dspl not only introduces the ability to express such logical constraints in the form of PCFs to construct challenging posterior distributions, but
does so in an end-to-end differentiable fashion.

Finally, our gradient estimate can be related to relaxation-based methods like REBAR~\citep{tucker2017rebar} or RELAX~\citep{grathwohl2018backpropagation}, but without the REINFORCE~\citep{williams1992simple} inspired component. Instead, we utilise the differentiability of knowledge compilation to obtain exact gradients of discrete variables.
Since our inference scheme innately requires knowledge compilation, the use of other discrete gradient estimators, such as that of~\citet{niepert2021implicit}, does not directly apply to \dspl.
Moreover, we exploit the structure of our problem by directly relaxing comparison formulae in a sound manner~\citep{petersen2021learning}, in contrast to introducing an artificial relaxation of the whole problem~\citep{grathwohl2018backpropagation}.



\section{Experimental Evaluation}\label{sec:experiments}

We illustrate the versatility of \dspl by tackling three different problems. Section~\ref{subsec:dates} discusses the detection of handwritten dates without location supervision. In Section~\ref{subsec:hybridnet} a hybrid Bayesian network with conditional probabilities dependent on the satisfaction of certain logical constraints will be optimised. Finally, Section~\ref{subsec:vae} introduces neural-symbolic variational auto-encoders, inspired by~\citet{misino2022vael}. 

The details of our experimental setup, including the precise \dspl programs, coolness annealing schemes and hyperparameters used for the neural networks are given in Appendix F.  

\subsection{Neural-symbolic attention}\label{subsec:dates}

% It is difficult to compare the performance of \dspl to other, existing methods because of its unique combination of discrete-continuous probabilistic logic and neural networks.
A problem that cannot yet be solved to a satisfactory degree by purely neural or other neural-symbolic systems is detecting handwritten years.
Given a single image with a handwritten year, the task is to predict the correct year as a sequence of 4 digits together with the location of these digits (Figure~\ref{fig:probattention}, left).
This year can be anywhere in the image and the only supervision is in the digits of the year, \emph{not} where these digits are in the image. In other words, the problem is equivalent to object detection \emph{without} bounding box supervision.

Solving such a problem seems to be out of scope for current methods.
On the one hand, existing neural approaches are often complex pipelines of neural components that break end-to-end differentiability~\citep{seker2022generalized}.
On the other hand, current neural-symbolic methods lack sufficient spatial reasoning capabilities in order to perform the necessary image segmentation.

We exploit probabilistic programming by modelling the location of a digit as a deep generalised normal distribution~\citep{nadarajah2005generalized}. 
That is, we use a convolutional neural network to regress the parameters of four generalised normal distributions, one for each digit of a year.
Then, we take inspiration from the spatial transformer literature~\citep{carion2020end} and convert the distribution of each location to an attention map (Figure~\ref{fig:probattention}, right).

\begin{figure}
    \centering
    \includegraphics[width=0.35\linewidth]{Imagery/2186_original.pdf}
    \hspace{0.10\linewidth}
    \includegraphics[width=0.35\linewidth]{Imagery/2186.pdf}
    \caption{
    On the left, an example of a handwritten year.
    On the right, the attention map for the digit \q{8} as a generalised normal distribution.
    Intuitively, we can view generalised normal distributions as differentiable bounding boxes. This allows gradients to flow from a downstream classification network to the regression component.
    }
    \label{fig:probattention}
\end{figure}

In our experimental validation we compare \dspl to a neural baseline and logic tensor networks.
The neural baseline applies the four probabilistic attention maps, one for the location of each of the four digits, to the input image.
The resulting four attenuated images correspond to the four digits from left to right and are passed on in that order to a classification network without additional reasoning.
Importantly, we maintain the same order from left to right for the classifications.
With \dspl, we encode that a year is a sequence of digits, i.e., the order matters, by enforcing an explicit order on the digit locations.
Doing so requires spatial reasoning, i.e., reasoning which digit is at which location.
For LTNs, we encode the same information.
However, as LTNs lack a proper distribution semantics, they can only reason on the level of the expected values of the generalised normal distributions. 

In our experiment, the sets of years appearing in the training, validation and test data are all disjoint.
Moreover, the sets of handwritten digits used to generate those years are also disjoint.
Partitioning the data in such a way leads to a challenging learning problem; 
the difficulty lies in out-of-distribution inference, as the years and handwritten digits in the validation and test set have never been seen during training.

We evaluate all methods in terms of accuracy and Intersection-over-Union (IoU).
For the accuracy, we compare the sequence of predicted digits to the correct sequence of digits constituting a year.
A prediction is correct if \emph{all} digits are correctly predicted in the right order.
For the IoU, we map each predicted generalised normal distribution to a bounding box by using the mean as the centre and the scale parameter as the width of the box. The IoU is then given by the overlap between this box and the true location of the handwritten digit.

\begin{table}[t]
    \caption{Mean accuracy and IoU with standard error for classifying the correct year, taken over 10 runs.
    }
    \label{table:yearresults}
    \begin{center}
    \begin{tabular}{ccc}
        \toprule
        Method & \multicolumn{2}{c}{Results} \\
        \midrule
        & acc. & IoU \\
        \cmidrule{2 - 3}
        \dspl & $93.77\pm 0.57\phantom{0}$ & $17.69\pm 0.23$  \\

        LTN &  $76.50\pm 12.10$ & $10.73\pm 1.69$ \\
        
        Neural Baseline &  $54.71\pm 14.33$ & \phantom{0}$6.26\pm 1.77$ \\
        \bottomrule
    \end{tabular}
    \end{center}
\end{table}

We present our results in Table~\ref{table:yearresults}.
The most striking observation is the poor performance and large variance of the neural baseline. 
It fails to predict the location of the digits in the right order, as can be seen from the lower IoU values.
Since classification depends on the predicted locations, these lower values also explain the lack of accuracy.
We can conclude that the neural baseline struggles to generalise to out-of-distribution data.
While LTNs fare better, the high standard error on the accuracy indicates that their continuous reasoning capabilities are insufficient to guarantee consistent solutions.
\dspl distinguishes itself by a higher and more consistent accuracy.
The reason is also clear; \dspl exploits the entire domain of the distribution of each location. This then leads to a higher IoU value that in turn results in a higher accuracy.



\subsection{Neural hybrid Bayesian networks}\label{subsec:hybridnet}

Hybrid Bayesian networks~\citep{lerner2003hybrid} are probabilistic graphical models that combine discrete and continuous random variables.
\dspl allows for the introduction of optimisable neural components and logical constraints to such models, as shown in Example~\ref{ex:dsplprogram}.
We further extend this example (Figure~\ref{fig:weathergraph}) and specify the datasets that form the input to the various neural networks.
The temperature is predicted from a real meteorological dataset~\citep{cho2020comparative} and we use CIFAR-10 images as proxies for observing clouds and humidity.
Moreover, dependencies on a number of constraints are added, which goes beyond the capabilities of traditional probabilistic programming.

\begin{figure}
    \centering
    \includegraphics[width=0.5\linewidth]{Imagery/bayesnet.pdf}
    \caption{Graphical model of \textbf{E}njoying the weather (\textbf{E}). \textbf{E} holds when \textbf{D}epressed (\textbf{D}) is not true and there is \textbf{G}ood weather (\textbf{G}).
    A person has a higher probability of being depressed when it is \textbf{C}loudy (\textbf{C}), while the degree of good weather is beta distributed depending on various logical constraints on \textbf{T}emperature (\textbf{T}) and \textbf{R}ain (\textbf{R}).
    Finally, rain is probable when it is both \textbf{C}loudy and \textbf{H}umid (\textbf{H}).}
    \label{fig:weathergraph}
\end{figure}

Our neural Bayesian model was optimised by only giving probabilistic supervision on whether \textbf{E} was true or false, i.e., the weather was enjoyed or not.
Given our model, such distant supervision only translates into a learning signal on different \emph{ranges} of temperature values that satisfy different PCFs. We will see that \dspl's reasoning over the full domain of the temperature distribution allows it to perform meaningful density estimation from such a signal.

The optimised Bayesian model can be evaluated in two ways. First, we measure the accuracy on CIFAR-10 of the networks utilised in \textbf{C}loudy and \textbf{H}umid, which was
$95.24\pm 3.32$ and $98.96\pm 0.11$, respectively.
Second, we measure the quality of the density estimation on \textbf{T}emperature by looking at the MSE between the true and predicted mean values, which was
$0.1799 \pm 0.0139$.
Importantly, \dspl was able to approximate the standard deviation of \textbf{T}emperature from just the distant supervision, deviating by only
$0.60\pm 0.22$. 



\subsection{Neural-symbolic variational auto-encoder}\label{subsec:vae}

Probabilistic programming is well-suited to generative tasks, but it cannot perform generation conditioned on logical constraints.
Inspired by the work of~\citet{misino2022vael}, we showcase how \dspl extends the generative power of probabilistic programming to such constraints.
To this end, we will consider the task of learning to generate 2 images of digits given the value of their subtraction.

A diagrammatic overview of our \dspl program is given in Figure~\ref{fig:vae_diagram}.
It uses a conditional variational auto-encoder (CVAE)~\citep{sohn2015learning} to generate images conditioned on a digit value.
\dspl finds those digit values from a given subtraction result by logical reasoning.
It can also condition generation on other variables in the CVAE latent space as this space is an integral part of \dspl's deep, relational model.
We will exploit this property later on when we extend the task to generating digits in the same writing style as a given image without \emph{any} additional optimisation.

\begin{figure}
    \centering
    \includegraphics[width=0.95\linewidth]{Imagery/nesyvae.pdf}
    \caption{
    Given example pairs of images and the value of their subtraction, e.g., $(\inlineimage{Imagery/2_6_6_7_original.png}, \inlineimage{Imagery/5_3_29_2_original.png})$ and $3$, the CVAE encoder \probloginline{vae_latent} first encodes each image into a multivariate normal NDF (\textcolor{orange_red}{\probloginline{latent}}) and a latent vector. The latter is the input of a categorical NDF \textcolor{celadon_blue}{\probloginline{digit}}, completing the CVAE latent space. Supervision is dual; generated images are compared to the original ones in a probabilistic reconstruction loss, while both digits need to subtract to the given value.
    }
    \label{fig:vae_diagram}
\end{figure}

Both the CVAE and digit classifier are successfully trained jointly. Example generations of images that satisfy the subtraction result
$
\inlineimage{Imagery/generation.pdf}
{-}
\inlineimage{Imagery/generation.pdf}
= 5
$
can be seen below. In general, \dspl finds all possible digits that subtract to a given value and generates images for each correct combination. Below, we left out 2 such combinations for clarity of exposition.

\begin{figure}[ht]
\includegraphics[width=0.12\linewidth]{Imagery/5generation3_fixed11gray.png}
\includegraphics[width=0.12\linewidth]{Imagery/5generation3_fixed12gray.png}
\hfill
\includegraphics[width=0.12\linewidth]{Imagery/5generation3_fixed41gray.png} 
\includegraphics[width=0.12\linewidth]{Imagery/5generation3_fixed42gray.png}
\hfill
\includegraphics[width=0.12\linewidth]{Imagery/5generation3_fixed51gray.png} 
\includegraphics[width=0.12\linewidth]{Imagery/5generation3_fixed52gray.png}
\end{figure}

While our program is inspired by the VAEL architecture of \citet{misino2022vael}, conceptual differences exist.
Most notably, for VAEL, the image generation resides outside the probabilistic logic program. Conversely, the CVAE, including its latent space, is explicitly declared and accessible in \dspl.
This difference allows \dspl to generalise to conditional generative queries that differ significantly from the original optimisation task.
For example, we can \emph{zero-shot} query the program to fill in the blank in 
$
\inlineimage{Imagery/imageconditionalgeneration_input2_grey.png}
{-}
\inlineimage{Imagery/generation.pdf}
= \text{\probloginline{Diff}}$
instead of the two blanks of the learning task
$
\inlineimage{Imagery/generation.pdf}
{-}
\inlineimage{Imagery/generation.pdf}
= \text{\probloginline{Diff}}$.
Even more, we can enforce that the generated digit is in the same writing style as the given digit by conditioning the generation on the latent space of the given image (Figure~\ref{fig:querygens}).
\begin{figure}[t]
\begin{minipage}{0.23\linewidth}
\includegraphics[width=\linewidth]{Imagery/imageconditionalgeneration_og3_grey.png}
\vspace{5pt}
\centering
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og3_-2diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og3_-5diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og3_3diff.png}
\end{minipage}
\hfill
\begin{minipage}{0.23\linewidth}
\includegraphics[width=\linewidth]{Imagery/imageconditionalgeneration_og4_grey.png}
\vspace{5pt}
\centering
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og4_-5diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og4_1diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og4_3diff.png}
\end{minipage}
\hfill
\begin{minipage}{0.23\linewidth}
\includegraphics[width=\linewidth]{Imagery/imageconditionalgeneration_og5_grey.png}
\vspace{5pt}
\centering
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og5_1diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og5_3diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og5_5diff.png}
\end{minipage}
\hfill
\begin{minipage}{0.23\linewidth}
\includegraphics[width=\linewidth]{Imagery/imageconditionalgeneration_og8_grey.png}
\vspace{5pt}
\centering
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og8_-1diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og8_4diff.png}
\includegraphics[width=0.30\linewidth]{Imagery/conditionalgen_og8_6diff.png}
\end{minipage}
\centering
\caption{Four random images of left digits (top row) and their generated right digits for 3 given random difference values (bottom row). Note the preservation of the style of the given minuends.}
\label{fig:querygens}
\end{figure}



\section{Conclusion}\label{sec:conclusion}

We presented \dspl, a novel neural-symbolic probabilistic logic programming language that integrates hybrid probabilistic logic and neural networks. Inference is dealt with efficiently through approximate weighted model integration while learning is facilitated by reparametrisation and continuous relaxations of non-differentiable logic components. Our experiments illustrate how \dspl is capable of intricate probabilistic modelling allowing for meaningful weak supervision while maintaining strong out-of-distribution performance. Moreover, they show how hybrid probabilistic logic can be used as a flexible structuring formalism for the neural paradigm that can effectively optimise and reuse neural components in different tasks. 

\begin{acknowledgements}
This research is funded by TAILOR, a project from the EU Horizon 2020 research and innovation programme under GA No 952215.
It was also supported by the Wallenberg AI, Autonomous Systems and Software Program (WASP) funded by the Knut and Alice Wallenberg Foundation.
We also acknowledge support from Flanders AI, FWO and the KU Leuven Research Fund.
Finally, we want to thank Gust Verbruggen for his suggestion of learning to classify handwritten dates, as it proved to be an excellent benchmark.
\end{acknowledgements}

\bibliography{references}  

\clearpage

\end{document}
