\documentclass[accepted]{uai2022} %

\usepackage[american]{babel}

\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{natbib} %
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %



% \swap[sep]{a}{b}: typesets the two mandatory arguments in reverse order
% with the separator in between, i.e. "b sep a"; sep defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} %

\usepackage{xspace}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{amssymb}
\usepackage{stmaryrd}
\usepackage{bbm}
\usepackage{mathtools}
\usepackage{cancel}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{cleveref}
\usepackage[noend]{algorithmic}
% Name of the entropy routine as typeset in the pseudocode (small caps).
\newcommand{\entropy}{\textsc{Ent}}
% Symbol for the memoization cache passed to the entropy routine.
\newcommand{\cache}{\mathsf{c}}

% \LineIf{cond}{stmt}: a one-line "if cond then stmt" statement for the
% algorithmic environment. Line ends inside the definition are guarded
% with % so that they do not add space tokens to the replacement text.
\newcommand{\LineIf}[2]{%
    \STATE \algorithmicif\ {#1}\ \algorithmicthen\ {#2}%
}
% \LineIfElse{cond}{stmt}{alt}: a one-line "if cond then stmt else alt"
% statement for the algorithmic environment.
\newcommand{\LineIfElse}[3]{%
    \STATE \algorithmicif\ {#1}\ \algorithmicthen\ {#2}\ \algorithmicelse\ {#3}%
}

% Theorem-like environments (amsthm). All of them are declared under the
% "definition" theorem style selected on the next line.
\theoremstyle{definition}
% Numbered with their own document-wide counters.
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{axiom}{Axiom}
% Numbered within the current section.
\newtheorem{proposition}{Proposition}[section]
\newtheorem{claim}{Claim}[section]

% \id{cond}: wraps its argument in double square brackets (\llbracket and
% \rrbracket); used in the derivations as an indicator of a condition.
\newcommand{\id}[1]{\llbracket{#1}\rrbracket}
% \Ind{cond}: indicator written as a blackboard-bold 1 followed by the
% condition in auto-sized braces; \ensuremath lets it be used in text too.
\newcommand{\Ind}[1]{{\ensuremath{\mathbbm{1}\!\left\{#1\right\}}}}

\usepackage{siunitx,etoolbox}
% \B: robust shorthand for \bfseries (defined with etoolbox's \newrobustcmd);
% presumably used to embolden table cells -- confirm against the tables.
\newrobustcmd{\B}{\bfseries}
% Tighten the horizontal padding between table columns by 2pt, document-wide.
\addtolength{\tabcolsep}{-2pt}

% The BibTeX logo.
\newcommand\BibTeX{B\textsc{ib}\TeX}

\title{Neuro-Symbolic Entropy Regularization %
}

% ---- Notation macros ----------------------------------------------------
% Macros built on \ensuremath + \xspace can be used in text and in math
% mode, and take care of the following space themselves.

% \rvar{X}: a single random variable, typeset in math italic.
\newcommand{\rvar}[1]{\ensuremath{\mathit{#1}}\xspace}
\newcommand{\Xr}{\rvar{X}}
\newcommand{\Yr}{\rvar{Y}}

% \rvars{X}: a set of random variables, typeset in bold.
\newcommand{\rvars}[1]{\ensuremath{\mathbf{#1}}\xspace}
\newcommand{\Xs}{\rvars{X}}
\newcommand{\Ys}{\rvars{Y}}
\newcommand{\Zs}{\rvars{Z}}
\newcommand{\Ws}{\rvars{W}}
\newcommand{\Es}{\rvars{E}}
\newcommand{\Qs}{\rvars{Q}}
\newcommand{\Fs}{\rvars{F}}
\newcommand{\Ps}{\rvars{P}}
\newcommand{\Is}{\rvars{I}}

% \jstate{x}: a joint instantiation (state) of a set of variables, typeset
% in bold; same definition as \rvars, intended for lowercase letters.
\newcommand{\jstate}[1]{\ensuremath{\mathbf{#1}}\xspace}
\newcommand{\xs}{\jstate{x}}
\newcommand{\ys}{\jstate{y}}
\newcommand{\zs}{\jstate{z}}
\newcommand{\ws}{\jstate{w}}
\newcommand{\es}{\jstate{e}}
\newcommand{\ps}{\jstate{p}}
\newcommand{\is}{\jstate{i}}

% Truth values (math mode only).
\newcommand{\true}{\mathit{true}}
\newcommand{\false}{\mathit{false}}

% \vect{p}: a vector, typeset with \mathsf nested inside \mathbf.
\newcommand{\vect}[1]{\ensuremath{\mathbf{\mathsf{#1}}}\xspace}
\newcommand{\pv}{\vect{p}}
\newcommand{\qv}{\vect{q}}

% Semantic-loss operator, typeset as an upright L with superscript s.
\newcommand{\sloss}{\operatorname{L^s}}

% Plain p and q, each wrapped in a group.
\newcommand{\p}{{p}}
\newcommand{\q}{{q}}
% \ch: typeset as sans-serif "in"; the text uses \ch(n) for the set of
% children (inputs) of a circuit node n.
\newcommand{\ch}{\ensuremath{\mathsf{in}}}
% Sans-serif operator names: vars, val and supp (below).
\newcommand{\vars}{\ensuremath{\mathsf{vars}}}
\newcommand{\val}{\ensuremath{\mathsf{val}}}
% Bold X/x, Y/y, Z/z without \xspace (render the same as \Xs, \xs, ...).
\newcommand{\X}{\ensuremath{\mathbf{X}}}
\newcommand{\x}{\ensuremath{\mathbf{x}}}
\newcommand{\Y}{\ensuremath{\mathbf{Y}}}
\newcommand{\y}{\ensuremath{\mathbf{y}}}
\newcommand{\Z}{\ensuremath{\mathbf{Z}}}
\newcommand{\z}{\ensuremath{\mathbf{z}}}
\newcommand{\supp}{\ensuremath{\mathsf{supp}}}

% breakablealgorithm: a non-floating stand-in for the algorithm environment.
% It is built on a center environment instead of a float, draws the top and
% bottom rules itself, and locally redefines \caption to print the
% "Algorithm N" heading. \makeatletter is needed for \ALG@name.
\makeatletter
\newenvironment{breakablealgorithm}
  {%
   \begin{center}
     % Step the algorithm counter so that \thealgorithm and \label work.
     \refstepcounter{algorithm}%
     % Top rule.
     \hrule height.8pt depth0pt \kern-2pt%
     % \caption[short]{long}: prints the bold heading followed by the long
     % text, then adds an entry to the list of algorithms ("loa").
     \renewcommand{\caption}[2][\relax]{%
       {\raggedright\textbf{\ALG@name~\thealgorithm} ##2\par}%
       % No optional argument given: list the long caption text;
       % otherwise list the short one.
       \ifx\relax##1\relax %
         \addcontentsline{loa}{algorithm}{\protect\numberline{\thealgorithm}##2}%
       \else %
         \addcontentsline{loa}{algorithm}{\protect\numberline{\thealgorithm}##1}%
       \fi
       % Rule separating the heading from the algorithm body.
       \kern2pt\hrule\kern2pt
     }
  }{%
     % Bottom rule.
     \kern2pt\hrule\relax%
   \end{center}
  }
\makeatother

% \ra{factor}: sets \arraystretch (row-height multiplier for tabular/array)
% to the given factor; the change follows normal TeX grouping rules.
\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}}

% Inline reviewer notes: \kareem[tag]{text}, \kaiwei, \guy, \eric.
% \newif creates the switch in the "false" state, so unless \commentstrue is
% issued before the \ifcomments test, the \else branch is taken and every
% note macro discards its arguments. \providecommand leaves any definition
% that already exists untouched.
\newif\ifcomments
\ifcomments
    % Notes are shown in colour, as "[Name:<bold tag> text]".
    \providecommand{\kareem}[2][]{{\protect\color{red}{[Kareem:\textbf{#1} #2]}}}
    \providecommand{\kaiwei}[2][]{{\protect\color{red}{[Kaiwei:\textbf{#1} #2]}}}
    \providecommand{\guy}[2][]{{\protect\color{purple}{[Guy:\textbf{#1} #2]}}}
    \providecommand{\eric}[2][]{{\protect\color{ForestGreen}{[Eric:\textbf{#1} #2]}}}
\else
    % Notes are swallowed and produce no output.
    \providecommand{\kareem}[2][]{}
    \providecommand{\kaiwei}[2][]{}
    \providecommand{\guy}[2][]{}
    \providecommand{\eric}[2][]{}
\fi

\usepackage{tikz}
\usetikzlibrary{circuits.logic.US}
\usetikzlibrary{shapes.misc}
\usetikzlibrary{shapes.arrows}
\usetikzlibrary{arrows,shapes.geometric}
\usetikzlibrary{external}
\usetikzlibrary{positioning}

\usepackage{pgfplots}
% gaussian(mu, sigma): normal density with mean #1 and standard deviation #2.
% The body refers to the free variable x rather than taking it as an
% argument, so x must be defined where the function is evaluated
% (presumably the pgfplots plot variable -- confirm if reused elsewhere).
\pgfmathdeclarefunction{gaussian}{2}{%
  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}%
}
% gaussianmixture2(mu1, sigma1, mu2, sigma2, w): two-component mixture
%   w * N(mu1, sigma1) + (1 - w) * N(mu2, sigma2),
% again evaluated at the free variable x.
\pgfmathdeclarefunction{gaussianmixture2}{5}{%
  \pgfmathparse{#5*(1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))) + (1-#5)*(1/(#4*sqrt(2*pi))*exp(-((x-#3)^2)/(2*#4^2)))}%
}

% \figref{label} / \tbref{label}: abbreviated cross-references, "Fig. N" and
% "Tab. N", with a non-breaking space before the number.
\newcommand{\figref}[1]{Fig.~\ref{#1}}
\newcommand{\tbref}[1]{Tab.~\ref{#1}}
% Scratch length register; no use is visible in this part of the file
% (presumably for sizing pictures -- confirm before removing).
\newlength{\picHeight}


\author[1]{\href{mailto:<ahmedk@cs.ucla.edu>?Subject=Your UAI 2022 paper}{Kareem Ahmed}{}}
\author[1]{Eric Wang}
\author[1]{Kai-Wei Chang}
\author[1]{Guy Van den Broeck}
\affil[1]{%
    Computer Science Department\\
    University of California Los Angeles\\
    USA
}

\DeclareMathOperator{\prob}{P}
\begin{document}
\maketitle

\begin{abstract}
In structured output prediction, the goal is to jointly predict several output variables that together encode a structured object -- a path in a graph, an entity-relation triple, or an ordering of objects.
Such a large output space makes learning hard and requires vast amounts of labeled data.
Different approaches leverage alternate sources of supervision.
One approach -- entropy regularization -- posits that 
decision boundaries should lie in low-probability regions.
It extracts supervision from unlabeled examples, but remains agnostic to the structure of the output space.
Conversely, neuro-symbolic approaches exploit the knowledge that not every prediction corresponds to a \emph{valid} structure in the output space. Yet, they do not further restrict the learned output distribution.
This paper introduces a framework that unifies both approaches.
We propose a loss, \emph{neuro-symbolic entropy regularization}, that encourages the model to confidently predict a valid object.
It is obtained by restricting entropy regularization to the distribution over only the valid structures.
This loss can be computed efficiently when the output constraint is expressed as a tractable logic circuit.
Moreover, it seamlessly integrates with other neuro-symbolic losses that eliminate invalid predictions.
We demonstrate the efficacy of our approach on a series of semi-supervised and fully-supervised structured-prediction experiments, where it leads to models whose predictions are more accurate as well as more likely to be valid.















\end{abstract}

\section{Introduction}


Neural networks have achieved breakthroughs across a wide range of domains.
Such breakthroughs are often only possible in the presence of large labeled datasets, which can be hard to obtain.
Increasing efforts are therefore being devoted to approaches that utilize alternate sources of supervision in lieu of \emph{more} labeled data. 
Entropy regularization constitutes one such approach~\citep{grandvalet2005,ssl}.
It posits that data belonging to the same class tend to form discrete clusters.
Minimizing the entropy of the predictive distribution can thus be regarded as minimizing a measure of class overlap under the learned representation.
Intuitively, a classifier guessing uniformly at random has \emph{maximum entropy} and has not learned features that are informative of the underlying class.
Consequently, we prefer a \emph{minimum entropy} classifier that learns features \emph{maximally informative} of the underlying class, even on unlabeled data.

The need for labeled data is only exacerbated in structured prediction, where the objective is to predict multiple interdependent output variables representing a discrete object.
Viewed as traditional classification, the number of classes in structured prediction is exponential in the number of output variables -- all possible output configurations.
Neuro-symbolic methods can provide additional supervision, leveraging symbolic knowledge regarding the structure of the output space~\citep{RaedtIJCAI2020}.
This knowledge, typically expressed in logic, characterizes the set of valid structures; for instance, a path in a graph is a series of \emph{connected} edges commencing at the source and terminating at the destination.%

In this paper, we take a principled approach to unifying the aforementioned forms of supervision.
Naively, we might consider simply optimizing both losses simultaneously. 
However, computed in that manner, entropy regularization does not account for the structure of the output space and is therefore likely to push the network towards invalid structures.
Instead, we restrict the entropy loss to the network's distribution over the valid structures, as characterized by the constraint, as opposed to the entire predictive distribution, proposing \emph{neuro-symbolic entropy regularization}.
That is, we require that the network's output distribution be maximally informative of the target \emph{subject to the constraint}.
Intuitively, the network should ``know'' the right structure among the valid structures.
Computing the entropy of a distribution subject to a constraint is, in general, computationally hard.
We provide an algorithm leveraging structural properties of tractable logical circuits to efficiently compute this quantity.
Our framework integrates seamlessly with other neuro-symbolic approaches that maximize the constraint probability, in effect ``eliminating'' invalid structures.


Empirically, we evaluate our loss on four structured prediction tasks, in both semi-supervised and fully-supervised settings. We observe it leads to models whose predictions are more accurate, and more likely to satisfy the constraint.

\noindent\textbf{Organization\; }
This paper is structured as follows.
We start by introducing the notation and background assumed throughout the paper.
Section~\ref{sec:nser} motivates, and formally defines, our neuro-symbolic entropy loss.
Section~\ref{sec:compute_nser} derives an algorithm that exploits certain structural properties of logical circuits that enable the efficient computation of our loss.
Section~\ref{sec:example} illustrates our algorithm on a toy constraint, where the probability and neuro-symbolic entropy computations are made explicit.
Section~\ref{sec:experiments} empirically validates our proposed approach on tasks in both semi-supervised and fully-supervised settings.
Section~\ref{sec:related_work} reviews, and draws connections to, the neuro-symbolic and the semi-supervised literatures.
We step through an example compiling a logical formula in Section~\ref{sec:compilingcircuits} and conclude in Section~\ref{sec:conclusion}. Our code can be found at \url{https://github.com/UCLA-StarAI/NeSyEntropy}.

\section{Neuro-Symbolic Entropy Loss}\label{sec:nser}

We first introduce background on logical constraints and probability distributions over output structures.
Afterwards, we motivate and define our neuro-symbolic entropy loss.

\subsection{Background}\label{sec:background}
We write uppercase letters ($X$, $Y$) for Boolean variables and lowercase letters ($x$, $y$) for their instantiation ($Y=0$ or $Y=1$).
Sets of variables are written in bold uppercase ($\Xs$, $\Ys$), and their joint instantiation in bold lowercase ($\xs$, $\ys$).
A literal is a variable ($Y$) or its negation ($\neg Y$).
A logical sentence ($\alpha$ or $\beta$) is constructed from variables and logical connectives ($\land$, $\lor$, etc.), and is also called a (logical) formula or constraint.
A state or world $\ys$ is an instantiation to all variables $\Ys$.
A state $\ys$ satisfies a sentence $\alpha$, denoted $\ys \models \alpha$, if the sentence evaluates to true in that world. A state $\ys$ that satisfies a sentence $\alpha$ is also said to be a model of $\alpha$.
We denote by $m(\alpha)$ the set of all models of $\alpha$.
The notation for states $\ys$ is used to refer to an assignment, the logical sentence enforcing the assignment, or the binary output vector capturing the assignment, as these are all equivalent notions.
A sentence $\alpha$ entails another sentence $\beta$, denoted $\alpha \models \beta$, if all worlds that satisfy $\alpha$ also satisfy $\beta$.

\paragraph{A Probability Distribution over Possible Structures}
Let $\alpha$ be a logical sentence defined over Boolean variables $\Ys = \{Y_1,\dots,Y_n\}$.
Let $\pv$ be a vector of probabilities for the same variables $\Ys$, where $\pv_i$ denotes the predicted probability of variable $Y_i$ and corresponds to a single output of the neural network.
The neural network's outputs induce a probability distribution $\prob(\cdot)$ over all possible states $\ys$ of $\Ys$:
\begin{equation}\label{eqn:pr_struct}
     \prob(\y) = \prod_{i: \y \models Y_i} \pv_i \prod_{i: \y \models \lnot Y_i} (1 - \pv_i).
\end{equation}

\paragraph{Semantic Loss}\label{sec:semantic_loss}
 The semantic loss \citep{Xu18} is a function of the logical constraint $\alpha$ and a probability vector $\pv$. 
It quantifies how close the neural network comes to satisfying the constraint by computing the probability of the constraint under the distribution $\prob(\cdot)$ induced by $\pv$.
It does so by reducing the problem of probability computation to weighted model counting (WMC): summing up the models of $\alpha$, each weighted by its likelihood under $\prob(\cdot)$.
It, therefore, maximizes the probability mass allocated by the network to the models of $\alpha$
\begin{equation}
\label{eq:sloss}
\mathbb{E}_{\y \sim \prob}\left[ \Ind{\y \models \alpha} \right] 
= \sum_{\y \models \alpha} \prob(\y).
\end{equation}
Taking the negative logarithm recovers semantic loss. We make use of semantic loss in our experiments to ``eliminate'' invalid structures under the neural network's distribution.


\subsection{Motivation and Definition}

% Figure fig:entsl: a 2x2 grid of schematic predictive distributions p(y|x).
% Each panel plots a two-component Gaussian mixture (gaussianmixture2) over
% x in [0,10]; the shaded area under the curve on [1.5,8.5] and the bracket
% drawn below the axis mark the region labelled m(alpha), i.e. the models
% of the constraint. Only the mixture parameters and the number of samples
% differ between the panels.
\begin{figure}[t]
% (a) top-left panel.
\begin{subfigure}[b]{0.48\columnwidth}
\centering
\begin{tikzpicture}
\begin{axis}[
    no markers, domain=0:10, samples=100,
    axis lines*=left, xlabel=$\y$, ylabel=$p(\y|x)$,
    every axis y label/.style={at=(current axis.above origin),anchor=south},
    every axis x label/.style={at=(current axis.right of origin),anchor=west},
    height=4cm,
    xtick=\empty, ytick=\empty,
    enlargelimits=false, clip=false, axis on top,
    grid = major,
    ]
    % Shaded mass over m(alpha), then the density curve itself.
    \addplot [fill=cyan!20, draw=none, domain=1.5:8.5] {gaussianmixture2(0.5,2,7,1,0.5)} \closedcycle;
    \addplot [very thick,cyan!50!black] {gaussianmixture2(0.5,2,7,1,0.5)};
    % Bracket below the axis marking the interval labelled m(alpha).
    \draw [yshift=-0.3cm, latex-latex, |-|](axis cs:1.5,0) -- node [fill=white] {$m(\alpha)$} (axis cs:8.5,0);
\end{axis}
\end{tikzpicture}
\caption{Network uncertain over both valid and invalid predictions}
\end{subfigure}
\hfill
% (b) top-right panel: narrow component (sigma = 0.2) centred at 0.6,
% outside the m(alpha) interval; more samples to resolve the peak.
\begin{subfigure}[b]{0.48\columnwidth}
\centering
\begin{tikzpicture}
\begin{axis}[
    no markers, domain=0:10, samples=500,
    axis lines*=left, xlabel=$\y$, ylabel=$p(\y|x)$,
    every axis y label/.style={at=(current axis.above origin),anchor=south},
    every axis x label/.style={at=(current axis.right of origin),anchor=west},
    height=4cm,
    xtick=\empty, ytick=\empty,
    enlargelimits=false, clip=false, axis on top,
    grid = major,
    ]
    \addplot [fill=cyan!20, draw=none, domain=1.5:8.5] {gaussianmixture2(0.6,0.2,4,2.5,0.6)} \closedcycle;
    \addplot [very thick,cyan!50!black] {gaussianmixture2(0.6,0.2,4,2.5,0.6)};
    \draw [yshift=-0.3cm, latex-latex, |-|](axis cs:1.5,0) -- node [fill=white] {$m(\alpha)$} (axis cs:8.5,0);
\end{axis}
\end{tikzpicture}
\caption{Network allocating most mass to one invalid prediction}
\end{subfigure}

% The blank line above ends the first row of panels.
% (c) bottom-left panel: both components centred inside m(alpha).
\begin{subfigure}[b]{0.48\columnwidth}
\centering
\begin{tikzpicture}
\begin{axis}[
    no markers, domain=0:10, samples=100,
    axis lines*=left, xlabel=$\y$, ylabel=$p(\y|x)$,
    every axis y label/.style={at=(current axis.above origin),anchor=south},
    every axis x label/.style={at=(current axis.right of origin),anchor=west},
    height=4cm,
    xtick=\empty, ytick=\empty,
    enlargelimits=false, clip=false, axis on top,
    grid = major,
    ]
    \addplot [fill=cyan!20, draw=none, domain=1.5:8.5] {gaussianmixture2(3,0.5,6,1,0.5)} \closedcycle;
    \addplot [very thick,cyan!50!black] {gaussianmixture2(3,0.5,6,1,0.5)};
    \draw [yshift=-0.3cm, latex-latex, |-|](axis cs:1.5,0) -- node [fill=white] {$m(\alpha)$} (axis cs:8.5,0);
\end{axis}
\end{tikzpicture}
\caption{Network allocating most mass to valid predictions}
\end{subfigure}
\hfill
% (d) bottom-right panel: narrow component (sigma = 0.2) centred at 3,
% inside the m(alpha) interval.
\begin{subfigure}[b]{0.48\columnwidth}
\centering
\begin{tikzpicture}
\begin{axis}[
    no markers, domain=0:10, samples=500,
    axis lines*=left, xlabel=$\y$, ylabel=$p(\y|x)$,
    every axis y label/.style={at=(current axis.above origin),anchor=south},
    every axis x label/.style={at=(current axis.right of origin),anchor=west},
    height=4cm,
    xtick=\empty, ytick=\empty,
    enlargelimits=false, clip=false, axis on top,
    grid = major,
    ]
    \addplot [fill=cyan!20, draw=none, domain=1.5:8.5] {gaussianmixture2(3,0.2,4,2.5,0.6)} \closedcycle;
    \addplot [very thick,cyan!50!black] {gaussianmixture2(3,0.2,4,2.5,0.6)};
    \draw [yshift=-0.3cm, latex-latex, |-|](axis cs:1.5,0) -- node [fill=white] {$m(\alpha)$} (axis cs:8.5,0);
\end{axis}
\end{tikzpicture}
\caption{Network allocating most mass to one valid prediction}
\end{subfigure}
\caption{%
A network's predictive distribution can be uncertain or certain ($\leftrightarrow$), and it can allow or disallow invalid predictions under the constraint $\alpha$ ($\updownarrow$).
Entropy regularization steers the network towards confident, possibly invalid predictions (b). 
Neuro-symbolic learning steers the network towards valid predictions without necessarily being confident (c).
Neuro-symbolic entropy-regularization guides the network to valid and confident predictions~(d).
}
\label{fig:entsl}
\end{figure}


Consider the plots in Figure \ref{fig:entsl}.
For any given data point $x$, the neural network can be fairly uncertain regarding the target class, accommodating for both valid and invalid structured predictions under its predicted distribution.

A common underlying assumption in many machine learning methods is that data belonging to the same class tend to form discrete clusters~\citep{ssl} -- an assumption deemed justified on the sheer basis of the existence of classes.
Consequently, a classifier is expected to favor decision boundaries lying in regions of low data density, separating the clusters.
Entropy-regularization \citep{grandvalet2005} directly implements the above assumption, requiring that the classifier output confident -- low-entropy -- predictive distributions, pushing the decision boundary away from unlabeled points, thereby supplementing scarce labeled data with abundant unlabeled data.
Seen through that lens, minimizing the entropy of the predictive distribution can be regarded as minimizing a measure of class overlap as a function of the features learned by the network.


Entropy regularization, however, remains agnostic to the underlying domain, failing to exploit situations where we have knowledge characterizing valid predictions in the domain.  
Therefore, it can often be detrimental to a model's performance, causing it to grow confident in invalid predictions.

Conversely, neuro-symbolic approaches steer the network towards distributions disallowing invalid predictions, by maximizing the constraint probability, but do little to ensure the network learn features conducive to classification.

Clearly then, there is a benefit to combining the merits of both approaches. We restrict the entropy computation to the distribution over models of the logical formula, ensuring the network only grow confident in valid predictions. Complemented with maximizing the constraint probability, the network learns to allocate all of its mass to models of the constraint, while being maximally informative of the target.

\paragraph{Defining the Loss}
More precisely, let $\Y$ be a random variable distributed according to Equation \ref{eqn:pr_struct}: $\Y\sim \prob$. We are interested in minimizing the entropy of $\Y$ conditioned on the constraint $\alpha$
\begin{equation}
    \begin{aligned}\label{eqn:nsentropy}
    H(\Y | \alpha)  &= - \sum_{\y \models \alpha} \prob(\y | \alpha) \log \prob(\y | \alpha)\\
                    &= - \mathbb{E}_{\Y | \alpha} \left[ \log \prob(\Y | \alpha) \right].
    \end{aligned}
\end{equation}


\section{Computing the Loss}\label{sec:compute_nser}
The above loss is, in general, hard to compute. 
To see this, consider the uniform distribution over models of a constraint $\alpha$.
That is, let $\prob(\y|\alpha) = \frac{1}{|m(\alpha)|}$ for all $\y \models \alpha$.
Then, $H(\Y | \alpha) = -\sum_{\y \models \alpha} \frac{1}{|m(\alpha)|} \log \frac{1}{|m(\alpha)|} = \log |m(\alpha)|$.
This tells us how many models of $\alpha$ there are, which is a well-known \#P-hard problem \citep{Valiant1979a,Valiant1979b}.
We will show that, through compilation into tractable circuits, we can compute Equation \ref{eqn:nsentropy} in time linear in the size of the circuit.

% Algorithm: recursive computation of the constrained entropy H(Y | alpha)
% on a smooth, deterministic and decomposable circuit, memoized per node.
\begin{algorithm}[tb]
   \caption{\textsc{Ent}($\alpha, \prob, \cache$)}
   \label{alg:Shannon-Entropy}
   {\bfseries Input:} a smooth, deterministic and decomposable logical circuit $\alpha$, a fully-factorized probability distribution $\prob(\cdot)$ over states of $\alpha$, and a cache $\cache$ for memoization\\
   {\bfseries Output:} $H(\Y | \alpha)$, where $\Y \sim \prob(\cdot)$

   \begin{algorithmic}[1]
   % Reuse the stored value if this node has already been visited.
   \LineIf{$\alpha \in \cache$}{\textbf{return} $\cache(\alpha)$}
   \IF{$\alpha$ is a literal}
   % Base case: a literal carries no uncertainty.
   \STATE $e \leftarrow 0$
   \ELSIF{$\alpha$ is an AND gate}
   % Decomposability: the entropy is the sum of the entropies of the
   % two conjuncts beta and gamma.
   \STATE $e \leftarrow \entropy(\beta, \prob, \cache) + \entropy(\gamma, \prob, \cache)$
   \ELSIF{$\alpha$ is an OR gate}
   % Determinism: entropy of the distribution over the children,
   % -sum_i P(beta_i) log P(beta_i), plus the expected entropy of the
   % children, sum_i P(beta_i) Ent(beta_i). The log term enters with a
   % minus sign so that e is the (non-negative) entropy H(Y | alpha).
   \STATE $e\leftarrow\sum_{i=1}^{|\ch(\alpha)|}\prob(\beta_i) \big[ \entropy(\beta_i, \prob, \cache) - \log \prob(\beta_i) \big]$
   \ENDIF
   % Store the result for later visits to the same node.
   \STATE $\cache(\alpha)\leftarrow e$
   \STATE \textbf{return} $e$
\end{algorithmic}
\end{algorithm}
\subsection{Computation through Compilation}
\paragraph{Tractable Circuit Compilation}
We resort to knowledge compilation techniques -- a class of methods that transform, or \emph{compile}, a logical theory into a target form with certain properties that allow certain probabilistic queries to be answered efficiently.
More precisely, we know of circuit languages that compute the probability of constraints~\citep{darwiche03}, and that are amenable to backpropagation.
We use the circuit compilation techniques in~\citet{darwiche11} to build a logical circuit representing our constraint.
Due to the structural properties of this circuit form, we can use it to compute both the probability of the constraint as well as its gradients with respect to the network's weights, in time linear in the size of the circuit~\citep{darwiche02}. 
This does not, in general, escape the complexity of the computation: in the worst case, the compiled circuit can be exponential in the size of the constraint. In practice, however, constraints often exhibit enough structure (repeated sub-problems) to make compilation feasible.
We refer to Section \ref{sec:compilingcircuits} for an illustrative example of  such a compilation.

\paragraph{Logical Circuits} 
More formally, a \emph{logical circuit} is a directed, acyclic computational graph representing a logical formula.
Each node $n$ in the DAG encodes a logical sub-formula, denoted $[n]$.
Each inner node in the graph is either an AND or an OR gate, and each leaf node encodes a Boolean literal ($Y$ or $\lnot Y$). 
We denote by $\ch(n)$ the set of $n$'s children, that is, the operands of its logical gate.

\paragraph{Structural Properties}  As already alluded to, circuits enable the tractable computation of certain classes of queries over encoded functions granted that a set of structural properties are enforced. We explicate such properties below.

A circuit is \emph{decomposable} if the inputs of every AND gate depend on disjoint sets of variables, i.e., for $\alpha = \beta \land \gamma$, $\vars(\beta) \cap \vars(\gamma) = \varnothing$.
Intuitively, decomposable AND nodes encode local factorizations of the function. For the sake of simplicity, we assume that decomposable AND gates always have two inputs, a condition that can be enforced on any circuit in exchange for a polynomial increase in its size~\citep{vergari2015simplifying,peharz2020einsum}.

A second useful property is \emph{smoothness}.
A circuit is \emph{smooth} if the children
of every OR gate depend on the same set of variables, i.e., for $\alpha = \bigvee_i \beta_i$, we have that $\vars(\beta_i) = \vars(\beta_j)\ \forall i,j$. Decomposability and smoothness are a sufficient and necessary condition for tractable integration over arbitrary sets of variables in a single pass, as they allow larger integrals to decompose into smaller ones~\citep{choi2020pc}.

Lastly, a circuit is said to be \emph{deterministic} if, for any input, at most one child of every OR node has a non-zero output, i.e., for $\alpha = \bigvee_i \beta_i$, we have that $ \beta_i \land \beta_j = \bot$ for all $i \neq j$. Figure~\ref{fig:example} shows an example of a smooth, decomposable and deterministic circuit.

\subsection{Algorithm}\label{sec:algorithm}
Let $\alpha$ be a \emph{smooth}, \emph{deterministic} and \emph{decomposable} logical circuit encoding our constraint, defined over Boolean variables $\Ys = \{Y_1,\dots,Y_n\}$. 
We now show that we can compute the constrained entropy in Equation~\ref{eqn:nsentropy} in time linear in the size of $\alpha$.
The key insight is that, using circuits, we are able to efficiently decompose an expectation with respect to a fully-factorized distribution by alternately splitting the query variables and the support of the distribution until we reach the leaves of the circuit, which are simple literals. In what follows, in a slight abuse of notation for brevity, all unconditional probabilities are implicitly conditioned on constraint $\alpha$; that is, we redefine $\prob(\cdot)$ as $\prob(\cdot | \alpha)$.

\subsubsection{Base Case: $\alpha$ is a literal}
When $\alpha$ is a literal, $\alpha = Y_i$ or $\alpha = \lnot Y_i$, we have that
\begin{align*}
    \prob(y_i|\alpha) &= \Ind{y_i \models [\alpha]}, \text{~~and} \\
    H(y_i | \alpha)  &= - \prob(y_i|\alpha) \log \prob(y_i|\alpha) = 0.
\end{align*}
Intuitively, a literal has no uncertainty associated with it.
\subsubsection{Recursive Case: $\alpha$ is a conjunction}
When $\alpha$ is a conjunction, decomposability enables us to write
\begin{equation*}
    \prob(\y|\alpha) = \prob(\y_1|\beta) \prob(\y_2|\gamma), \text{~where~}  \vars(\beta) \cap \vars(\gamma) = \varnothing
\end{equation*}
as it decomposes 
    $\alpha$ into two independent constraints $\beta$ and~$\gamma$,
    and $\y$ into two independent assignments $\y_1$ and~$\y_2$.
The neuro-symbolic entropy $- \mathbb{E}_{\Y | \alpha} \left[ \log \prob(\Y | \alpha) \right]$ is then
\begin{align*}
    &- \mathbb{E}_{\{\Y_1,\Y_2\} | \alpha} \Big[ \log \prob(\Y_1|\beta) + \log \prob(\Y_2|\gamma)\Big]\\
    &\quad =- \Big[\mathbb{E}_{\Y_1 | \beta} \big[ \log \prob(\Y_1|\beta) \big] + \mathbb{E}_{\Y_2 | \gamma} \big [\log \prob(\Y_2|\gamma)\big]\Big].
\end{align*}
That is, the entropy given a decomposable conjunction $\alpha$ is the sum of entropies given the conjuncts of~$\alpha$.
\subsubsection{Recursive Case: $\alpha$ is a disjunction}
When $\alpha$ is a smooth and deterministic disjunction, 
we have that $\alpha = \bigvee_i \beta_i$, where the $\beta_i$s are mutually exclusive, and therefore partition $\alpha$. Consequently, we have that
\begin{equation*}
    \prob(\y|\alpha) = \sum_i \prob(\beta_i) \cdot \prob(\y|\beta_i).
\end{equation*}
The neuro-symbolic entropy decomposes as well:
\begingroup
\allowdisplaybreaks
\begin{align*}
    &- \mathbb{E}_{\Y | \alpha} \left[ \log \prob(\Y | \alpha) \right] 
    = -\sum_{\y \models \alpha}\prob(\y|\alpha)\log \prob(\y|\alpha)\\
    &= -\sum_{\y \models \alpha} \sum_i \prob(\beta_i)\prob(\y|\beta_i) \log \Big[\sum_j \prob(\beta_j)\prob(\y|\beta_j)\Big]\\
    \begin{split}
      &= -\sum_{\y \models \alpha} \sum_i \prob(\beta_i)\prob(\y|\beta_i)\id{\y \models \beta_i}\\ 
      &\qquad\qquad\qquad\log \Big[\sum_j \prob(\beta_j)\prob(\y|\beta_j)\id{\y \models \beta_j}\Big],
    \end{split}
    \intertext{where by determinism, we have that, for any $\y$ such that $\y \models \alpha$, $\y \models \beta_i \implies \y \not \models \beta_j$ for all $i \neq j$. In other words, any state that satisfies the constraint $\alpha$ satisfies one and only one of its terms, and therefore, the above expression equals}
    &-\sum_{\y \models \alpha} \sum_i \prob(\beta_i) \prob(\y|\beta_i)\log \Big[\prob(\beta_i)\prob(\y|\beta_i)\Big]\id{\y \models \beta_i}\\
    &= -\sum_i \sum_{\y \models \beta_i} \prob(\beta_i) \prob(\y|\beta_i)\log \Big[\prob(\beta_i)\prob(\y|\beta_i)\Big].\\
    \intertext{Further simplifying the expression, expanding the logarithm, and using the fact that probability sums to 1 yields}
    \begin{split}
        &=-\sum_i \prob(\beta_i) \log \prob(\beta_i) \sum_{\y \models \beta_i} \prob(\y|\beta_i)\\
        &\qquad\qquad\qquad+ \prob(\beta_i) \sum_{\y \models \beta_i} \prob(\y|\beta_i)\log \prob(\y|\beta_i)
    \end{split}\\
    &=-\sum_i \prob(\beta_i) \log \prob(\beta_i) + \prob(\beta_i) \mathbb{E}_{\Y | \beta_i} \Big[ \log \prob(\Y|\beta_i) \Big].
\end{align*}
\endgroup
That is, the entropy of the random variable $\Y$ conditioned on a disjunction $\alpha$ is the sum of the entropy of the distribution induced on the children of $\alpha$, and the average entropy of its children. The full algorithm is illustrated in Algorithm \ref{alg:Shannon-Entropy}.


\begin{figure}[t!]
 \begin{subfigure}[b]{0.33\columnwidth}
 \centering
\scalebox{0.7}{
\begin{tikzpicture}[circuit logic US]
\node (output) at (4.25, 8) {};
\node (or1) [or gate, inputs=nn, rotate=90, scale=0.9] at (4.25,7) {};
\draw (or1) -- (output) node[pos=0.2, above right, color=YellowOrange] {$0.88$};

\node (and1) [and gate, inputs=nn, rotate=90, scale=0.9] at (3,5.8) {};
\node (and2) [and gate, inputs=nn, rotate=90, scale=0.9] at (5.7,5.8) {};

\node (c) at (2.2,4.6) {$C$};
\node (cval) at (2.2,3.8) {$\color{red}{0.2}$};
\draw[->] (cval) edge (c);
\node (nc) at (7,4.6) {$\neg C$};
\node (ncval) at (7,3.8) {$\color{red}0.8$};
\draw[->] (ncval) edge (nc);
\node (or2) [or gate, inputs=nnn, rotate=90, scale=0.9] at (3.1,4.6) {};
\node (or3) [or gate, inputs=nn, rotate=90, scale=0.9] at (5.6,4.6) {};

\node (and3) [and gate, inputs=nn, rotate=90, scale=0.9] at (3.1,3.4) {};
\node (and4) [and gate, inputs=nn, rotate=90, scale=0.9] at (5.1,3.4) {};
\node (and5) [and gate, inputs=nn, rotate=90, scale=0.9] at (6.1,3.4) {};

\node (a) at (4.6,2.2) {$A$};
\node (aval) at (4.6,1.4) {$\color{blue}0.3$};
\draw[->] (aval) edge (a);
\node (nb) at (5.2,2.2) {$\neg B$};
\node (nbval) at (5.2,1.4) {$\color{ForestGreen}0.5$};
\draw[->] (nbval) edge (nb);
\node (na) at (6,2.2) {$\neg A$};
\node (naval) at (6,1.4) {$\color{blue}0.7$};
\draw[->] (naval) edge (na);

\node (or4) [or gate, inputs=nn, rotate=90, scale=0.9] at (2.6,2.2) {};
\node (or5) [or gate, inputs=nn, rotate=90, scale=0.9] at (3.6,2.2) {};
\node (a1) at (2.1,1) {$A$};
\node (a1val) at (2.1,0.2) {$\color{blue}0.3$};
\draw[->] (a1val) edge (a1);
\node (na1) at (2.7,1) {$\neg A$};
\node (na1val) at (2.7,0.2) {$\color{blue}0.7$};
\draw[->] (na1val) edge (na1);
\node (b) at (3.5,1) {$B$};
\node (bval) at (3.5,0.2) {$\color{ForestGreen}0.5$};
\draw[->] (bval) edge (b);
\node (nb1) at (4.1,1) {$\neg B$};
\node (nb1val) at (4.1,0.2) {$\color{ForestGreen}0.5$};
\draw[->] (nb1val) edge (nb1);

\draw (or1.input 1) -- ++ (down: 0.25) -| (and1) node[pos=0.45, above right, color=YellowOrange] {$0.2$};
\draw (or1.input 2) -- ++ (down: 0.25) -| (and2) node[pos=0.1, above right, color=YellowOrange] {$0.68$};

\draw (and1.input 1) -- ++ (down: 0.25) -| (c);
\draw (and1.input 2) -- (or2) node[pos=0.45, right, color=YellowOrange] {$1$};

\draw (and2.input 2) -- ++ (down: 0.25) -| (nc);
\draw (and2.input 1) -- (or3) node[pos=0.45, left, color=YellowOrange] {$0.85$};

\draw (or2.input 2) -- (and3) node[pos=0.45, right, color=YellowOrange] {$1$};

\draw (or3.input 1) -- ++ (down: 0.25) -| (and4) node[pos=0.45, left, color=YellowOrange] {$0.15$};
\draw (or3.input 2) -- ++ (down: 0.25) -| (and5) node[pos=0.45, right, color=YellowOrange] {$0.7$};

\draw (and3.input 1) -- ++ (down: 0.25) -| (or4) node[pos=0.45, above, color=YellowOrange] {1};
\draw (and3.input 2) -- ++ (down: 0.25) -| (or5) node[pos=0.45, above, color=YellowOrange] {1};

\draw (and4.input 1) -- ++ (down:0.4) -| (a);
\draw (and4.input 2) edge (nb);

\draw (and5.input 1) edge (na);
\draw (and5.input 2) -- ++ (down:0.25) -| (or5);

\draw (or4.input 1) -- ++ (down:0.25) -| (a1);
\draw (or4.input 2) edge (na1);

\draw (or5.input 1) edge (b);
\draw (or5.input 2) -- ++ (down:0.25) -| (nb1);
\end{tikzpicture}
}
\end{subfigure}
\begin{subfigure}[b]{0.1\columnwidth}
\scalebox{.4}{
\begin{tikzpicture}[baseline=-330pt]
\node[circle,draw,minimum size=0.8cm] (a1) at (0,0.5) {};
\node[circle,draw,minimum size=0.8cm] (a2) at (0,1.5) {};
\node[circle,draw,minimum size=0.8cm] (a3) at (0,2.5) {};

\node[circle,draw,minimum size=0.8cm] (b1) at (1.5,0) {};
\node[circle,draw,minimum size=0.8cm] (b2) at (1.5,1) {};
\node[circle,draw,minimum size=0.8cm] (b3) at (1.5,2) {};
\node[circle,draw,minimum size=0.8cm] (b4) at (1.5,3) {};

\foreach \i in {1,...,3} {
    \foreach \j in {1,...,4} {
        \draw (a\i) -- (b\j);
    }
}

\node[circle,draw,minimum size=0.8cm] (c1) at (3.0,0) {};
\node[circle,draw,minimum size=0.8cm] (c2) at (3.0,1) {};
\node[circle,draw,minimum size=0.8cm] (c3) at (3.0,2) {};
\node[circle,draw,minimum size=0.8cm] (c4) at (3.0,3) {};

\foreach \i in {1,...,4} {
    \foreach \j in {1,...,4} {
        \draw (b\i) -- (c\j);
    }
}

\node[circle,draw,minimum size=0.8cm] (d1) at (4.5,0.5) {$\color{blue}0.3$};
\node[circle,draw,minimum size=0.8cm] (d2) at (4.5,1.5) {$\color{ForestGreen}0.5$};
\node[circle,draw,minimum size=0.8cm] (d3) at (4.5,2.5) {$\color{red}0.2$};

\foreach \i in {1,...,4} {
    \foreach \j in {1,...,3} {
        \draw (c\i) -- (d\j);
    }
}

\end{tikzpicture}
}
\end{subfigure}
\begin{subfigure}[b]{0.30\columnwidth}
\centering
\scalebox{0.7}{
\begin{tikzpicture}[circuit logic US]
\node (output) at (4.25, 8) {};
\node (or1) [or gate, inputs=nn, rotate=90, scale=0.9] at (4.25,7) {};
\node (or1val) [color=DarkOrchid] at ($(or1) + (0.7,0.1)$) {$\color{DarkOrchid}1.64$};
\draw (or1) -- (output);

\node (and1) [and gate, inputs=nn, rotate=90, scale=0.9] at (3,5.8) {};
\node (and1val) [color=DarkOrchid] at ($(and1) + (0.7,0.1)$) {$1.30$};
\node (and2) [and gate, inputs=nn, rotate=90, scale=0.9] at (5.7,5.8) {};
\node (and2val) [color=DarkOrchid] at ($(and2) + (0.7,0.1)$) {$1.04$};

\node (c) at (2.2,4.6) {$C$};
\node (cval) [color=DarkOrchid] at ($(c) - (0,0.4)$) {$0$};
\node (nc) at (7,4.6) {$\neg C$};
\node (ncval) [color=DarkOrchid] at ($(nc) - (0,0.4)$) {$0$};
\node (or2) [or gate, inputs=nnn, rotate=90, scale=0.9] at (3.1,4.6) {};
\node (or2val) [color=DarkOrchid] at ($(or2) + (0.7,0.1)$) {$1.30$};
\node (or3) [or gate, inputs=nn, rotate=90, scale=0.9] at (5.6,4.6) {};
\node (or3val) [color=DarkOrchid] at ($(or3) + (-0.7,0.1)$) {$1.04$};

\node (and3) [and gate, inputs=nn, rotate=90, scale=0.9] at (3.1,3.4) {};
\node (and3val) [color=DarkOrchid] at ($(and3) + (-0.7,0.1)$) {$1.30$};
\node (and4) [and gate, inputs=nn, rotate=90, scale=0.9] at (5.1,3.4) {};
\node (and4val) [color=DarkOrchid] at ($(and4) + (-0.6,0.1)$) {$0$};
\node (and5) [and gate, inputs=nn, rotate=90, scale=0.9] at (6.1,3.4) {};
\node (and5val) [color=DarkOrchid] at ($(and5) + (0.7,0.1)$) {$0.69$};

\node (a) at (5,1.7) {$A$};
\node (aval) [color=DarkOrchid] at ($(a) - (0,0.4)$) {$0$};
\node (nb) at (5.8,1.7) {$\neg B$};
\node (nbval) [color=DarkOrchid] at ($(nb) - (0,0.4)$) {$0$};
\node (na) at (6.6,1.7) {$\neg A$};
\node (naval) [color=DarkOrchid] at ($(na) - (0,0.4)$) {$0$};

\node (or4) [or gate, inputs=nn, rotate=90, scale=0.9] at (2.6,2.2) {};
\node (or4val) [color=DarkOrchid] at ($(or4) + (-0.7,0.1)$) {$0.61$};
\node (or5) [or gate, inputs=nn, rotate=90, scale=0.9] at (3.6,2.2) {};
\node (or5val) [color=DarkOrchid] at ($(or5) + (0.8,0.1)$) {$0.69$};

\node (a1) at (2.0,1) {$A$};
\node (a1val) [color=DarkOrchid] at ($(a1) - (0,0.4)$) {$0$};
\node (na1) at (2.7,1) {$\neg A$};
\node (na1val) [color=DarkOrchid] at ($(na1) - (0,0.4)$) {$0$};
\node (b) at (3.5,1) {$B$};
\node (bval) [color=DarkOrchid] at ($(b) - (0,0.4)$) {$0$};
\node (nb1) at (4.2,1) {$\neg B$};
\node (nb1val) [color=DarkOrchid] at ($(nb1) - (0,0.4)$) {$0$};

\draw (or1.input 1) -- ++ (down: 0.25) -| (and1);
\draw (or1.input 2) -- ++ (down: 0.25) -| (and2);

\draw (and1.input 1) -- ++ (down: 0.25) -| (c);
\draw (and1.input 2) -- (or2);

\draw (and2.input 2) -- ++ (down: 0.25) -| (nc);
\draw (and2.input 1) -- (or3);

\draw (or2.input 2) -- (and3);

\draw (or3.input 1) -- ++ (down: 0.25) -| (and4);
\draw (or3.input 2) -- ++ (down: 0.25) -| (and5);

\draw (and3.input 1) -- ++ (down: 0.25) -| (or4);
\draw (and3.input 2) -- ++ (down: 0.25) -| (or5);

\draw (and4.input 1) edge (a);
\draw (and4.input 2) -- ++ (down:0.8) -| (nb);

\draw (and5.input 1) -- ++ (down:0.55) -| (na);
\draw (and5.input 2) -- ++ (down:0.25) -| (or5);

\draw (or4.input 1) -- ++ (down:0.25) -| (a1);
\draw (or4.input 2) edge (na1);

\draw (or5.input 1) edge (b);
\draw (or5.input 2) -- ++ (down:0.25) -| (nb1);
\end{tikzpicture}
}
\end{subfigure}
\caption{
For a given data point, the network (middle) outputs a distribution over classes $A, B$ and $C$, highlighted in blue, green and red, respectively.
The circuit encodes the constraint $(A \land B) \implies C$.
For each leaf node $l$, we plug in $\prob(l)$ and $1 - \prob(l)$ for positive and negative literals, respectively.
The computation proceeds bottom-up, taking products at AND gates and summations at OR gates.
The value accumulated at the root of the circuit (left) is the probability allocated by the network to the constraint.
The weights accumulated on edges from OR gates to their children are of special significance: OR nodes induce a partitioning of the distribution's support, and the weights correspond to the mass allocated by the network to each mutually-exclusive event.
Complemented with a second upward pass, where the entropy of an OR node is the entropy of the distribution over its children plus the expected entropy of its children, and the entropy of an AND node is the sum of its children's entropies, we get the entropy of the distribution over the constraint's models -- the neuro-symbolic entropy regularization loss (right).
}
\label{fig:example}
\end{figure}
\section{An Illustrative Example}\label{sec:example}
Consider Figure \ref{fig:example}.
Given a data point, the neural network defines a distribution over Boolean random variables $A, B$, and $C$, where $\prob(A) = \pv_0$ and $\prob(\lnot A) = 1 - \pv_0$, $\prob(B) = \pv_1$ and $\prob(\lnot B) = 1 - \pv_1$, etc.
The circuit encodes the constraint $(A \land B) \implies C$.
To compute the probability of the constraint under the network's distribution, we feed the probabilities into the circuit, proceeding in a bottom-up fashion, taking products at AND gates and summations at OR gates, accumulating intermediate computations on the edges of the circuit.
The value accumulated at the root of the circuit is the probability mass allocated by the network to models of the formula, and corresponds to the probability of the constraint under the network's distribution -- this is exactly the semantic loss, up to a negative logarithm.
The weights accumulated on edges from OR gates to their children are of special significance: OR nodes induce a partitioning of the distribution's support, and the weights correspond to the mass allocated by the network to each mutually-exclusive event.
Complemented with another upward pass, where the entropy of every OR node is the entropy of the distribution over its children plus the expected entropy of its children, and the entropy of every AND node is the sum of its children's entropies, we calculate the entropy of the distribution over models of the constraint -- this is exactly the neuro-symbolic entropy regularization. Therefore, performing two upward sweeps of the circuit, we are able to compute the neuro-symbolic entropy regularization and the semantic loss.%















































\section{Experimental Evaluation}\label{sec:experiments}
In this section we set out to empirically test our neuro-symbolic entropy loss.
To that end, we devise a series of semi-supervised and fully-supervised
structured prediction experiments.
Such are settings where, contrary to their dominant use, classifiers are expected to predict structured
objects rather than scalar, discrete or real values. 
Such objects are defined in terms of constraints: a set of rules characterizing the set of solutions. 
We aim to answer the following:
\begin{enumerate}
    \item Does entropy regularization, in general, lead to predictive models 
    with improved generalization capabilities?
    \item If the answer to the above question is in the positive, it is our 
    expectation that restricting the distribution acted upon by entropy
    regularization to that over just the models of the constraint might
    seem more sensible as compared to entropy-regularizing the entire
    predictive distribution--including non-models of the constraint.
    Do experiments corroborate such a hypothesis?
    \item Finally, entropy regularization can be interpreted as clustering
    the different classes, and has intimate connections to transductive
    Support Vector Machines \citep{ssl}. Does such an interpretation carry
    over to models and non-models of the constraint? Put differently, can
    we expect entropy-regularized predictive models to better conform to
    our constraints, measured by the percentage of predictions satisfying
    the constraint \emph{regardless} of matching the groundtruth?
\end{enumerate}

\subsection{Semi-Supervised: Entity-Relation Extraction}

\newcommand{\tablescale}{0.9}

\begin{table*}[!htb]
\caption{Experimental results for entity-relation extraction on ACE05 and SciERC. \#Labels indicates the number of labeled data points available to the network per relation. The remaining training set is stripped of labels and utilized in an unsupervised manner. We report the F1-score where a prediction is correct if the relation and its entities are correct.}
\centering
\scalebox{\tablescale}{%
\begin{tabular}{llc|c|c|c|c|c|c}
\toprule
\# Labels  &    &3  &5  &10 &15 &25 &50 &75\\
\midrule
\multirow{6}{*}{\rotatebox[origin=c]{90}{ACE05}}
& Baseline
&4.92 $\pm$ 1.12          
&7.24 $\pm$ 1.75          
&13.66 $\pm$ 0.18 
&15.07 $\pm$ 1.79          
&21.65 $\pm$ 3.41          
&28.96 $\pm$ 0.98         
&33.02 $\pm$ 1.17 \\
& Self-training
&7.72 $\pm$ 1.21         
&12.83 $\pm$ 2.97            
&16.22 $\pm$ 3.08            
&17.55 $\pm$ 1.41            
&27.00 $\pm$ 3.66            
&32.90 $\pm$ 1.71           
&37.15 $\pm$ 1.42 \\
& Product t-norm
&8.89 $\pm$ 5.09       
&14.52 $\pm$ 2.13            
&19.22 $\pm$ 5.81            
&21.80 $\pm$ 7.67            
&30.15 $\pm$ 1.01           
&34.12 $\pm$ 2.75          
&37.35 $\pm$ 2.53 \\
\cmidrule{2-9}
& Semantic Loss
&12.00 $\pm$ 3.81
&14.92 $\pm$ 3.14 %
&22.23 $\pm$ 3.64
&27.35 $\pm$ 3.10
&30.78 $\pm$ 0.68
&36.76 $\pm$ 1.40
&38.49 $\pm$ 1.74\\
& + Full Entropy
&{\bf14.80} $\pm$ 3.70
&15.78 $\pm$ 1.90
&23.34 $\pm$ 4.07 %
&28.09 $\pm$ 1.46 %
&31.13 $\pm$ 2.26
&36.05 $\pm$ 1.00
&39.39 $\pm$ 1.21\\
& + NeSy Entropy
&14.72 $\pm$ 1.57
&{\bf18.38} $\pm$ 2.50
&{\bf26.41} $\pm$ 0.49
&{\bf31.17} $\pm$ 1.68
&{\bf35.85} $\pm$ 0.75
&{\bf37.62} $\pm$ 2.17
&{\bf41.28} $\pm$ 0.46\\
\midrule
\multirow{6}{*}{\rotatebox[origin=c]{90}{SciERC}}
& Baseline
&2.71 $\pm$ 1.10
&2.94 $\pm$ 1.00
&3.49 $\pm$ 1.80
&3.56 $\pm$ 1.10
&8.83 $\pm$ 1.00
&12.32 $\pm$ 3.00
&12.49 $\pm$ 2.60\\
&Self-training
&3.56 $\pm$ 1.40
&3.04 $\pm$ 0.90
&4.14 $\pm$ 2.60
&3.73 $\pm$ 1.10
&9.44 $\pm$ 3.80
&14.82 $\pm$ 1.20
&13.79 $\pm$ 3.90\\
&Product t-norm
&{\bf6.50} $\pm$ 2.00
&8.86 $\pm$ 1.20
&10.92 $\pm$ 1.60
&13.38 $\pm$ 0.70
&13.83 $\pm$ 2.90
&19.20 $\pm$ 1.70
&19.54 $\pm$ 1.70\\
\cmidrule{2-9}
&Semantic Loss
&6.47 $\pm$ 1.02
&{\bf9.31} $\pm$ 0.76
&11.50 $\pm$ 1.53
&12.97 $\pm$ 2.86
&14.07 $\pm$ 2.33
&20.47 $\pm$ 2.50
&23.72 $\pm$ 0.38
\\
&+ Full Entropy
&6.26 $\pm$ 1.21
&8.49 $\pm$ 0.85
&11.12 $\pm$ 1.22
&14.10 $\pm$ 2.79 
&17.25 $\pm$ 2.75
&{\bf22.42} $\pm$ 0.43
&24.37 $\pm$ 1.62\\
&+ NeSy Entropy
&6.19 $\pm$ 2.40
&8.11 $\pm$ 3.66
&{\bf13.17} $\pm$ 1.08
&{\bf15.47} $\pm$ 2.19
&{\bf17.45} $\pm$ 1.52
&22.14 $\pm$ 1.46
&{\bf25.11} $\pm$ 1.03\\
\bottomrule
\end{tabular}%
}
\label{table:results}
\end{table*}

We begin by testing our research questions in the semi-supervised setting.
Here the model is presented with only a portion of the labeled
training set, with the rest used exclusively in an unsupervised manner
by the respective approaches.

We make use of the natural ontology of entity types and their relations 
present when dealing with relational data. This defines a set of 
relations and their permissible argument types. As is with all of our
constraints, we express the aforementioned ontology in the language 
of Boolean logic.

Our approach to recognizing the named entities and their pairwise relations
is most similar to \citet{Zhong2020}. Contextual embeddings are first procured
for every token in the sentence. These are then fed into a named entity 
recognition module that outputs a vector of per-class probability for every 
entity. A classifier then classifies the concatenated contextual 
embeddings and entity predictions into a relation.

We employ two entity-relation extraction datasets, the Automatic Content
Extraction (ACE) 2005 \citep{walker2006} and SciERC datasets \citep{luan2018}.
ACE05 defines an ontology over $7$ entities and $18$ relations from mixed-genre
text, whereas SciERC defines $6$ entity types with $7$ possible relations between
them and includes annotations for scientific entities and their relations,
assimilated from $12$ AI conference/workshop proceedings.
We report the percentage of coherent predictions: data points for which the
predicted entity types, as well as the relations are correct.

We compare against five baselines. The first baseline is a purely supervised
model which makes no use of unlabeled data. The second is a classical
self-training approach based off of \citet{chang2007}, and uses integer linear
programming to impute the unlabeled data's most likely labels subject to the
constraint, and consequently augment the (small) labeled set. The third 
baseline is a popular instantiation of a broad class of methods, fuzzy logics,
which replace logical operators with their fuzzy t-norms and logical implications
with simple inequalities. Lastly, we compare our proposed method, dubbed 
``NeSy Entropy'', to vanilla semantic loss as proposed in \citet{Xu18} 
as well as another entropy-regularized baseline, dubbed ``Full Entropy'', which
minimizes the entropy of the entire predictive distribution, as opposed to just
the distribution over the constraint's models.

Our results are shown in Table \ref{table:results}. We observe that semantic loss outperforms
the baseline, self-training, and product t-norm across the board. We attribute
such a performance to the exactness of semantic loss, and its faithfulness to
the underlying constraint. We also observe that entropy-regularizing the
predictive model, in conjunction with training using semantic loss leads to
better predictive models, as compared with models trained solely using semantic
loss. Furthermore, it turns out that restricting entropy to the distribution
over the constraint's models, models that we know constitute the set of valid
predictions, compared to the model's entire predictive distribution, which
includes valid and invalid predictions, leads to a non-trivial
increase in the accuracy of predictions.


\begingroup
\begin{table}[t]
\centering
\caption {Grid shortest path test results}
{
\begin{tabular}{ @{} l l l l }
Test accuracy \%  & Coherent & Incoherent & Constraint \\
\midrule \midrule
5-layer MLP & 5.62 & {\bf 85.91} & 6.99 \\
\midrule
Semantic loss & 28.51 & 83.14 & 69.89 \\
+ Full Entropy & 29.02 & 83.76  & 75.23 \\ 
+ NeSy Entropy & {\bf 30.12} & 83.01 & {\bf 91.61} \\
\end{tabular}
}
\label{tab:grid}
\end{table}

\begin{table}[t]
\centering
\caption {Preference prediction test results}
{
\begin{tabular}{@{}l l l l @{}}
Test accuracy \%  & Coherent & Incoherent & Constraint\\
\midrule \midrule
3-layer MLP & 1.01 & {\bf 75.78} & 2.72 \\
\midrule
Semantic loss & 15.03 & 72.43 & 69.83 \\
+ Full Entropy & 17.52 & 71.80 & 80.21 \\
+ NeSy Entropy & {\bf 18.17} & 71.51 & {\bf 96.04} \\
\end{tabular}
}
\label{tab:pref}
\end{table}
\endgroup
\begin{table}[h]
\centering
\caption {Warcraft shortest path prediction results}
{
\begin{tabular}{@{}l l l l @{}}
Test accuracy \%  & Coherent& Incoherent & Constraint\\
\midrule \midrule
ResNet-18&  44.8 & 97.7  & 56.9\\
\midrule
Semantic loss& 50.9 &  97.7 & 67.4\\
+ Full Entropy& 51.5 &  97.6&  67.7\\
+ NeSy Entropy& {\bf55.0}& {\bf97.9}& {\bf69.8}\\
\end{tabular}
}
\label{tab:sp}
\end{table}

\subsection{Fully-Supervised Learning}
We now turn our attention to testing our hypotheses in a fully supervised setting,
where our aim is to examine the effect of constraints enforced on the training set.
We note that this is a seemingly harder setting in the following sense: In a
semi-supervised setting we might make the argument that, despite its abundance, imposing
an auxiliary loss on unlabeled data provides the predictive model with an unfair
advantage as compared to the baseline. We concern ourselves with two tasks: predicting paths in a grid and preference learning.

\paragraph{Predicting Simple Paths}
For this task, our aim is to find the shortest path in a graph, or more
specifically a 4-by-4 grid, $G = (V, E)$ with uniform edge weights. Our input is
a binary vector of length $|V| + |E|$, with the first $|V|$ variables indicating
the source and destination, and the next $|E|$ variables encoding a subgraph $G'
\subseteq G$. Each label is a binary vector of length $|E|$ encoding the
shortest \emph{simple} path in $G'$, a requirement that we enforce through our
constraint. We follow the algorithm proposed by \citet{nishino2017} to generate 
a constraint for each simple path in the grid, conjoined with indicators specifying
the corresponding source-destination pair. Our constraint is then the disjunction of all such conjunctions.

To generate the data, we begin by randomly removing one third of the edges in
the graph $G$, resulting in a subgraph, $G'$. Subsequently, we filter out
connected components in $G'$ with fewer than $5$ nodes to reduce degenerate
cases. We then sample a source and destination node uniformly at random. The
latter constitutes a single data point. We generate a dataset of $1600$
examples, with a $60/20/20$ train/validation/test split. %

\paragraph{Preference Learning}
We also consider the task of preference learning. Given the user's ranking of a subset of elements, we
wish to predict the user's preferences over the remaining elements of the set.
We encode an ordering over $n$ items as a binary matrix ${X_{ij}}$, where for
each $i, j \in \{1, \ldots, n\}$, $X_{ij}$ denotes that item $i$ is at position
$j$. Our constraint $\alpha$ requires that the network's output be a valid 
total ordering.
We use preference ranking data over $10$ types of sushi for $5,000$ individuals,
taken from PREFLIB \citep{MaWa13a}, split 60/20/20. Our inputs consist of the user's 
preference over $6$ sushi types, with the model tasked to predict the
user's preference, a \emph{strict} total order, over the remaining $4$. %

Tables \ref{tab:grid} and \ref{tab:pref} compare the baseline
to the same MLP augmented with semantic loss, semantic
loss with entropy regularization over the entire predictive distribution, dubbed
``Full Entropy'', and entropy regularization over the distribution over the
constraint's models, dubbed ``NeSy Entropy''.

Similar to \citet{Xu18}, we observe that the semantic loss has a marginal effect
on incoherent accuracy, but significantly improves the network's ability to output
coherent predictions. We also observe that, similar to semi-supervised settings,
entropy-regularization leads to more coherent predictions using
both ``Full Entropy'' and ``NeSy Entropy'', with ``NeSy Entropy'' leading to the
best performing predictive models. Remarkably, we also observe that ``NeSy Entropy''
leads to predictive models whose predictions almost always satisfy the constraint,
captured by ``Constraint''.

\paragraph{Warcraft Shortest Path}
Lastly, we consider a more real-world variant of the task of predicting simple paths.
Following \citet{Pogancic2020}, our training set consists of $10,000$
terrain maps curated using Warcraft II tileset.
Each map encodes an underlying grid of dimension $12 \times 12$, where each vertex is assigned a cost depending on the type of terrain it represents (e.g. earth has lower cost than water).
The shortest (minimum cost) path between the top left and bottom right vertices is encoded as
an indicator matrix, and serves as label. 
Figure~\ref{fig:sp-results} shows an example input presented to the network, the groundtruth, and the input with the annotated shortest path.
Figure~\ref{fig:warcraft-predictions} shows examples of baseline predictions and those obtained by training with constraints.

\setlength{\fboxsep}{0pt}
\setlength{\picHeight}{0.25\linewidth}
\begin{figure}[t]
    \centering
        \scalebox{0.9}{
		\parbox[b][\picHeight][c]{1em}{\rotatebox{90}{Input}}
		\includegraphics[height=\picHeight]{figs/example2_map_raw.png}
		\vbox to\picHeight{\vfil\hbox{\LARGE$\to$}\vfil}
		\parbox[b][\picHeight][c]{\picHeight}{
		\centering
		$$
			\begin{pmatrix}
				1&0&\cdots&0\\
				1&0&\cdots&0\\
				\vdots & \vdots & \ddots &\vdots\\
				0&0&\cdots&1
			\end{pmatrix}
		$$} ~~~~~~
		\includegraphics[height=\picHeight]{figs/example2_map_with_path.png}}
		\caption{Warcraft dataset. Each
		input (left) is a $12 \times 12$ grid corresponding to a Warcraft II
		terrain map, the output is a matrix (middle) indicating the shortest
		path from top left to bottom right (right).
		\label{fig:sp-results:dataset}
		\label{fig:sp-results:withpath}
		}
    \label{fig:sp-results}
\end{figure}
\setlength{\fboxsep}{0pt}
\setlength{\picHeight}{0.3\linewidth}
\begin{figure}[t]
    \centering
    \includegraphics[height=\picHeight, width=0.93 \linewidth]{figs/failure_1.png}
    \\
    \includegraphics[height=\picHeight]{figs/failure_3.png}
    \\
    \includegraphics[height=\picHeight]{figs/failure_4.png}
    \\
		\caption{Example maps from the Warcraft dataset (left) annotated with the baseline predictions in red (center), and the predictions obtained using constraints in yellow (right)}
    \label{fig:warcraft-predictions}
\end{figure}

Presented with an image of a terrain map, a convolutional neural network -- following \citet{Pogancic2020}, we use ResNet18 \citep{He2016} -- outputs a $12 \times 12$ binary matrix indicating the vertices that constitute the minimum cost path.
We report three metrics: ``Coherent'' denotes the percentage of optimal-cost predictions, ``Incoherent'' denotes the percentage of individual vertices matching the groundtruth, and ``Constraint'' indicates the percentage of predictions that constitute valid paths. Our results are shown in Table~\ref{tab:sp}.

In line with our previous experiments, we observe that incorporating constraints into learning improves the ``Coherent'' metric from $44.8\%$ to $50.9\%$, and the ``Constraint'' metric from $56.9\%$ to $67.4\%$.
Augmenting semantic loss with the entropy over the network's predictive distribution, ``Full Entropy'', we attain a modest improvement from $50.9\%$ to $51.5\%$ and $67.4\%$ to $67.7\%$ for the ``Coherent'' and ``Constraint'' metrics respectively. 
Restricting the entropy minimization to models of the constraint, ``NeSy Entropy'', we observe that we attain a large improvement to $55.0\%$ and $69.8\%$ for the ``Coherent'' and ``Constraint'' metrics resp.

\section{Related Work}\label{sec:related_work}
The idea of using a model's predictions to obtain artificial labels for unlabeled data is as old as time~\citep{scudder1965probability,mclachlan1975iterative},
and has often been known throughout the literature as pseudo-labeling or self-training.
Self-training is an iterative process by which a learner imputes the labels of examples which have been confidently classified in the previous step, and can therefore be viewed as implicitly minimizing the model's entropy.
This is done explicitly in \cite{grandvalet2005} with a loss term which minimizes the entropy of the model's predicted distribution for any given unlabeled data point, thereby rendering the entropy computation amenable to differentiation, and allowing finer control on the influence of the unlabeled data. 
It has been applied successfully across a wide range of domains, including NLP \citep{mcclosky2006effective}, object detection \citep{rosenberg2005semi}, image classification \citep{lee2013pseudo,xie2019self}, and domain adaptation \citep{zou2018domain}, to name a few.
It has also been used recently by a plethora of semi-supervised learning algorithms as a constituent of their training pipelines \citep{arazo2019pseudo,anonymous2019semi, miyato2018virtual, berthelot2019mixmatch}.
This is in contrast to entropy maximization, used in reinforcement learning, where the aim
is to capture the entire range of low-cost behaviors, not a single correct one~\citep{Toussaint2009robot}.

In an acknowledgment to the need for both symbolic as well as sub-symbolic reasoning, there has been a plethora of recent works studying how to best combine neural networks and logical reasoning, dubbed \emph{neuro-symbolic reasoning}. The focus of such approaches is typically making probabilistic reasoning tractable through first-order approximations, and differentiable, through reducing logical formulas into arithmetic objectives, replacing logical operators with their fuzzy t-norms, and implications with inequalities~\citep{kimmig2012short,rocktaschel2015,fischer19a}.

Constraint driven learning \citep{chang2007} is a classic work that lies at the intersection of both bodies of work. Therein, in a fashion similar to self-training, the learner imputes the labels of the samples that were confidently classified \emph{subject to the constraint}. Therefore, the imputed labels are guaranteed to be valid. CoDL, however, performs a first-order approximation, approximating the network's full posterior by the MAP. Furthermore, it is not differentiable.

\citet{diligenti2017} and \citet{donadello2017} use first-order logic to specify constraints on outputs of a neural network. They employ fuzzy logic to reduce logical formulas into differentiable, arithmetic objectives denoting the extent to which neural network outputs violate the constraints, thereby supporting end-to-end learning under constraints. More recently, \citet{Xu18} introduced semantic loss, which circumvents the shortcomings of fuzzy approaches, while still supporting end-to-end learning under constraints. More precisely, \emph{fuzzy reasoning} is replaced with \emph{exact probabilistic reasoning}, made possible by compiling logical formulae into structures supporting efficient probabilistic queries.


Another class of neuro-symbolic approaches have their roots in logic programming. DeepProbLog~\citep{manhaeve2018} extends ProbLog, a probabilistic logic programming language, with the capacity to process neural predicates, whereby the network's outputs are construed as the probabilities of the corresponding predicates. This simple idea retains all essential components of ProbLog: the semantics, inference mechanism, and the implementation. In a similar vein, \citet{dai2018} combine domain knowledge specified as purely logical Prolog rules with the output of neural networks, dealing with the network's uncertainty through revising the hypothesis by iteratively replacing the output of the neural network with anonymous variables until a consistent hypothesis can be formed. \citet{bosnjak2017programming} present a framework combining prior procedural knowledge, as a Forth program, with neural functions learned through data. The resulting neural programs are consistent with specified prior knowledge and optimized with respect to data.










\section{Conclusion}\label{sec:conclusion}
In conclusion, we proposed neuro-symbolic entropy regularization, a principled approach to unifying neuro-symbolic learning and entropy regularization. It encourages the network to output distributions that are peaked over models of the logical formula. We are able to compute our loss due to structural properties of circuit languages. We validate our hypothesis on four different tasks under semi-supervised and fully-supervised settings and observe an increase in \emph{accuracy} as well as the \emph{validity} of the model's predictions.

\begin{acknowledgements}
KA would like to thank Arthur Choi, Antonio Vergari, Yoojung Choi, and Tal Friedman for helpful discussions throughout the project.
This work is partially supported by a DARPA PTG grant, NSF grants \#IIS-1943641, \#IIS-1956441, \#CCF-1837129, Samsung, CISCO, and a Sloan Fellowship.


\end{acknowledgements}

\appendix

\section{Compiling Logical Formulas into Tractable Circuits}\label{sec:compilingcircuits}
At a high level, there exist off-the-shelf compilers \citep{choi2013dynamic, Oztok2015topdown, Darwiche2004advances, Muise2012dsharp, Lagniez2017improveddnnf, Takahisa2012implementing} utilizing SAT solvers, essentially through case analysis, to compile a logical formula into a tractable logical circuit.
NeSy Entropy is agnostic to the exact flavor of circuit so long as the properties outlined in Section \ref{sec:algorithm} are respected. 
In our experiments, we use PySDD\footnote{https://github.com/wannesm/PySDD}, a Python SDD compiler~\citep{darwiche11,choi2013dynamic}.
We will now step through an example of compiling a logical formula.
Consider the circuit in Figure \ref{fig:example} encoding constraint
\begin{equation*}
    (A \land B) \implies C,
\end{equation*}
to be construed as encoding, $\mathsf{animal} \land \mathsf{barks} \implies \mathsf{dog}$.

Intuitively, our aim is to transform the above logical formula into a \emph{compact}
target form representing all possible assignments to $A, B$ and $C$ satisfying the logical
formula.
We compile such a constraint by proceeding in a bottom up fashion,
where bottom-up compilation can be seen as composing Boolean sub-functions whose domain is determined by a variable ordering.
Concretely, starting from circuits for literals $A$ and $B$, we compile a circuit $\beta = A \land B$.
We compose the previously compiled circuit $\beta$ with the circuit for literal $C$.
We point out that this is achieved using a couple of simple API calls to a bottom-up compiler.
We will now step through the actual construction of the circuit.
We introduce logical circuits representing the literals%
\begin{equation*}
    {A}\qquad{\lnot A}\qquad{B}\qquad{\lnot B}\qquad{C}\qquad{\lnot C}
\end{equation*}
The compiler disjoins literals $A$ with $\lnot A$, and $B$ with $\lnot B$,
introducing deterministic and smooth OR nodes.\\[10pt]
\centerline
{
    \includegraphics[width=.1\columnwidth,page=1]{figs/Compiled_Circuits.pdf}\hspace{150pt}\includegraphics[width=.1\columnwidth,page=2]{figs/Compiled_Circuits.pdf}
}

An OR node represents \emph{disjoint solutions} to the logical formula, meaning there exist distinct assignments, characterized by the children, satisfying the constraint, e.g., $a, \lnot a, b$ and $\lnot b$ all occur as part of distinct solutions to the constraint.


Compilation proceeds by conjoining constraint circuits for 
$A \lor \lnot A$ with $B \lor \lnot B$, $\lnot A$ with $B \lor \lnot B$
and $A$ with $\lnot B$.%
    \\[10pt]
\centerline{
    \includegraphics[width=.3\columnwidth,page=3]{figs/Compiled_Circuits.pdf}
}
    
Decomposable AND nodes \emph{compose} functions over \emph{disjoint sets of variables}. These AND nodes represent Boolean functions $(A \lor \lnot A) \land (B \lor \lnot B)$, $\lnot A \land (B \lor \lnot B)$, and  $A \land \lnot B$.


The compiler disjoins $\lnot A \land (B \lor \lnot B)$ with $A \land \lnot B$, and $(A \lor \lnot A) \land (B \lor \lnot B)$ with $\mathsf{true}$, the multiplicative identity, guaranteeing alternating AND and OR nodes, for convenience. It is worth reiterating that every child of an OR node encodes disjoint solutions over the same set of variables.

So far, we have compiled logical circuits for the formula
\begin{equation}
\label{eq:c1}
(\lnot A \land (B \lor \lnot B)) \lor  (A \land \lnot B)
\end{equation}
as well as for the formula
\begin{equation}
\label{eq:c2}
    (A \lor \lnot A) \land (B \lor \lnot B)
\end{equation}

What remains is to conjoin \cref{eq:c1} with $\lnot C$, and \cref{eq:c2} with $C$, and disjoin the resulting circuits.
What we get is a disjunction over the possible solutions of the constraint:
predicting the presence of a barking animal implies the presence of a dog.
Otherwise, there might or might not be a dog.\\[10pt]
\centerline{
\includegraphics[width=.35\columnwidth,page=4]{figs/Compiled_Circuits.pdf}
}
Compilation techniques like the one we illustrated do not, however, escape the hardness of 
the problem: the compiled circuit can be
exponential in the size of the constraint, \textit{in the worst case}.
\textit{In practice}, however, we can obtain compact circuits because real-life logical constraints exhibit enough structure (e.g., repeated
sub-problems) that can be easily exploited by a compiler~\citep{darwiche02}.
\nocite{Paszke19}
\bibliography{ahmed_663}
\end{document}
