\documentclass[accepted]{uai2022} %

\usepackage[american]{babel}

\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{natbib} %
    % Bibliography setup: entries are formatted with the plainnat style, and
    % the heading of the reference list is an unnumbered \subsubsection
    % titled "References".
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} %
\usepackage{booktabs} %
\usepackage{tikz} %



% \swap[sep]{a}{b} emits "b sep a": the two mandatory arguments appear in
% reverse order with the optional separator (default "-") between them.
\newcommand{\swap}[3][-]{#3#1#2} %

\usepackage{bibentry}
\usepackage{xspace}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{amssymb}
\usepackage{stmaryrd}
\usepackage{bbm}
\usepackage{mathtools}
\usepackage{cancel}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{cleveref}
\usepackage[noend]{algorithmic}
% Notation shortcuts: \entropy prints "Ent" in small caps; \cache prints a
% sans-serif "c" (uses \mathsf, so it is only valid in math mode).
\newcommand{\entropy}{\textsc{Ent}}
\newcommand{\cache}{\mathsf{c}}

% One-line conditionals for the algorithmic environment.
% The % at the end of the opening line and before the closing brace keeps the
% line breaks of this definition from becoming space tokens in the macro body.
%
% \LineIf{cond}{stmt}: a single \STATE reading "if cond then stmt".
\newcommand{\LineIf}[2]{%
    \STATE \algorithmicif\ {#1}\ \algorithmicthen\ {#2}%
}
% \LineIfElse{cond}{stmt}{alt}: a single \STATE reading
% "if cond then stmt else alt".
\newcommand{\LineIfElse}[3]{%
    \STATE \algorithmicif\ {#1}\ \algorithmicthen\ {#2}\ \algorithmicelse\ {#3}%
}

% Theorem-like environments, all declared under the amsthm "definition" style.
% definition, assumption and axiom each have their own document-wide counter;
% proposition and claim are numbered within the current section.
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{axiom}{Axiom}
\newtheorem{proposition}{Proposition}[section]
\newtheorem{claim}{Claim}[section]

% \id{x}: x enclosed in double square brackets (\llbracket ... \rrbracket).
% \Ind{cond}: a \mathbbm "1" followed by cond in auto-sized curly braces;
% wrapped in \ensuremath so it can be used in text or math. Presumably the
% indicator-function notation -- confirm against the main paper.
\newcommand{\id}[1]{\llbracket{#1}\rrbracket}
\newcommand{\Ind}[1]{{\ensuremath{\mathbbm{1}\!\left\{#1\right\}}}}

\usepackage{siunitx,etoolbox}
% \B: robust shorthand for \bfseries (declared with etoolbox's \newrobustcmd).
\newrobustcmd{\B}{\bfseries}
% Reduce the horizontal padding between table columns by 2pt, document-wide.
\addtolength{\tabcolsep}{-2pt}

% \BibTeX: the BibTeX logo with a small-caps "ib".
\newcommand\BibTeX{B\textsc{ib}\TeX}

\title{Neuro-Symbolic Entropy Regularization (Supplementary material) %
}

% Random-variable notation.
% \rvar{X}: a single variable, set with \mathit. \ensuremath makes it usable
% in text or math, and \xspace restores the following space in running text.
\newcommand{\rvar}[1]{\ensuremath{\mathit{#1}}\xspace}
\newcommand{\Xr}{\rvar{X}}
\newcommand{\Yr}{\rvar{Y}}

% \rvars{X}: same construction as \rvar but set with \mathbf; the shortcuts
% below instantiate it for individual upper-case letters.
\newcommand{\rvars}[1]{\ensuremath{\mathbf{#1}}\xspace}
\newcommand{\Xs}{\rvars{X}}
\newcommand{\Ys}{\rvars{Y}}
\newcommand{\Zs}{\rvars{Z}}
\newcommand{\Ws}{\rvars{W}}
\newcommand{\Es}{\rvars{E}}
\newcommand{\Qs}{\rvars{Q}}
\newcommand{\Fs}{\rvars{F}}
\newcommand{\Ps}{\rvars{P}}
\newcommand{\Is}{\rvars{I}}

% \jstate{x}: bold symbol via \mathbf, usable in text or math (\ensuremath),
% with \xspace; the shortcuts below instantiate it for lower-case letters.
\newcommand{\jstate}[1]{\ensuremath{\mathbf{#1}}\xspace}
\newcommand{\xs}{\jstate{x}}
\newcommand{\ys}{\jstate{y}}
\newcommand{\zs}{\jstate{z}}
\newcommand{\ws}{\jstate{w}}
\newcommand{\es}{\jstate{e}}
\newcommand{\ps}{\jstate{p}}
\newcommand{\is}{\jstate{i}}

% Boolean constants as multi-letter math identifiers (math mode only).
\newcommand{\true}{\mathit{true}}
\newcommand{\false}{\mathit{false}}

% \vect{p}: nests \mathsf inside \mathbf.
% NOTE(review): math alphabets generally do not combine, so this most likely
% renders as plain sans-serif rather than bold sans-serif -- confirm the
% intended appearance before relying on \pv / \qv.
\newcommand{\vect}[1]{\ensuremath{\mathbf{\mathsf{#1}}}\xspace}
\newcommand{\pv}{\vect{p}}
\newcommand{\qv}{\vect{q}}

% \sloss: "L" with superscript "s", typeset as an operator name (math mode).
\newcommand{\sloss}{\operatorname{L^s}}

% \p, \q: the plain letters p and q wrapped in a group.
\newcommand{\p}{{p}}
\newcommand{\q}{{q}}
% Sans-serif operator-style names; note that \ch prints "in".
\newcommand{\ch}{\ensuremath{\mathsf{in}}}
\newcommand{\vars}{\ensuremath{\mathsf{vars}}}
\newcommand{\val}{\ensuremath{\mathsf{val}}}
% Bold X/x, Y/y, Z/z. Unlike \Xs, \xs, etc., these do not append \xspace.
\newcommand{\X}{\ensuremath{\mathbf{X}}}
\newcommand{\x}{\ensuremath{\mathbf{x}}}
\newcommand{\Y}{\ensuremath{\mathbf{Y}}}
\newcommand{\y}{\ensuremath{\mathbf{y}}}
\newcommand{\Z}{\ensuremath{\mathbf{Z}}}
\newcommand{\z}{\ensuremath{\mathbf{z}}}
\newcommand{\supp}{\ensuremath{\mathsf{supp}}}

% breakablealgorithm: an algorithm-like block built inside a center
% environment rather than a float. It steps the "algorithm" counter, draws the
% top, caption and bottom rules by hand, and locally redefines \caption.
% Presumably intended for algorithms that must be allowed to break across
% pages -- confirm. \makeatletter is needed because the body uses \ALG@name,
% which is not defined in this file.
\makeatletter
\newenvironment{breakablealgorithm}
  {%
   \begin{center}
     \refstepcounter{algorithm}%
     \hrule height.8pt depth0pt \kern-2pt%
     % Local \caption[short]{long}: prints "<\ALG@name> <number> long" in
     % bold/ragged-right, then adds an entry to the "loa" list.
     \renewcommand{\caption}[2][\relax]{%
       {\raggedright\textbf{\ALG@name~\thealgorithm} ##2\par}%
       \ifx\relax##1\relax % optional argument left at its \relax default: list the long caption
         \addcontentsline{loa}{algorithm}{\protect\numberline{\thealgorithm}##2}%
       \else % otherwise list the short caption
         \addcontentsline{loa}{algorithm}{\protect\numberline{\thealgorithm}##1}%
       \fi
       \kern2pt\hrule\kern2pt
     }
  }{%
     \kern2pt\hrule\relax%
   \end{center}
  }
\makeatother

% \ra{factor}: set \arraystretch (row-height multiplier for tabular/array)
% to the given factor.
\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}}

% Author-comment macros: \kareem[tag]{text}, \kaiwei, \guy, \eric.
% \newif creates \ifcomments in the false state and nothing in this file
% switches it on, so the \else branch is taken and every comment macro
% expands to nothing. Adding \commentstrue before the \ifcomments test would
% instead print the comments in color, prefixed with the author's name.
\newif\ifcomments
\ifcomments
    \providecommand{\kareem}[2][]{{\protect\color{red}{[Kareem:\textbf{#1} #2]}}}
    \providecommand{\kaiwei}[2][]{{\protect\color{red}{[Kaiwei:\textbf{#1} #2]}}}
    \providecommand{\guy}[2][]{{\protect\color{purple}{[Guy:\textbf{#1} #2]}}}
    \providecommand{\eric}[2][]{{\protect\color{ForestGreen}{[Eric:\textbf{#1} #2]}}}
\else
    \providecommand{\kareem}[2][]{}
    \providecommand{\kaiwei}[2][]{}
    \providecommand{\guy}[2][]{}
    \providecommand{\eric}[2][]{}
\fi

\usepackage{tikz}
\usetikzlibrary{circuits.logic.US}
\usetikzlibrary{shapes.misc}
\usetikzlibrary{shapes.arrows}
\usetikzlibrary{arrows,shapes.geometric}
\usetikzlibrary{external}
\usetikzlibrary{positioning}
% Turn on TikZ externalization with output files prefixed by "tikz/".
% NOTE(review): externalization normally needs shell escape and an existing
% tikz/ directory; "optimize command away=\includepdf" presumably makes the
% externalization runs skip \includepdf -- confirm against the PGF manual.
\tikzexternalize[prefix=tikz/,optimize command away=\includepdf]

\usepackage{pgfplots}
% gaussian(mu, sigma): normal density with mean #1 and standard deviation #2,
% evaluated at "x". x is not a parameter; it is read from the surrounding
% context (presumably the pgfplots plot variable -- confirm at the call site).
\pgfmathdeclarefunction{gaussian}{2}{%
  \pgfmathparse{1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))}%
}
% gaussianmixture2(mu1, sigma1, mu2, sigma2, w): two-component mixture
% w * N(mu1, sigma1) + (1 - w) * N(mu2, sigma2), also evaluated at "x".
% The component density is written out inline rather than calling gaussian.
\pgfmathdeclarefunction{gaussianmixture2}{5}{%
  \pgfmathparse{#5*(1/(#2*sqrt(2*pi))*exp(-((x-#1)^2)/(2*#2^2))) + (1-#5)*(1/(#4*sqrt(2*pi))*exp(-((x-#3)^2)/(2*#4^2)))}%
}

% Cross-reference helpers: "Fig.~N" / "Tab.~N" with a non-breaking space
% before the number.
\newcommand{\figref}[1]{Fig.~\ref{#1}}
\newcommand{\tbref}[1]{Tab.~\ref{#1}}
% Scratch length register; it is not used in the visible part of this file.
\newlength{\picHeight}


% Author list: all four authors carry affiliation mark 1. The first author's
% name is a mailto link with a pre-filled subject line.
% NOTE(review): the angle brackets around the address and the trailing empty
% {} look like leftovers of the conference template -- confirm they are wanted.
\author[1]{\href{mailto:<ahmedk@cs.ucla.edu>?Subject=Your UAI 2022 paper}{Kareem Ahmed}{}}
\author[1]{Eric Wang}
\author[1]{Kai-Wei Chang}
\author[1]{Guy Van den Broeck}
\affil[1]{%
    Computer Science Department\\
    University of California Los Angeles\\
    USA
}

% \prob: upright "P" with operator spacing (math mode only).
\DeclareMathOperator{\prob}{P}
\begin{document}
\maketitle
\section{Additional Experimental Details}
All of our models were implemented in PyTorch \citep{Paszke19}. 
Our code makes use of the PySDD library to compile the constraints.
In all the experiments, we performed a grid search on the coefficient of the constraint loss as well as the coefficient of the entropy loss over the values $\{1\times10^{0}, 1\times10^{-1}, 5\times10^{-1}, 1\times10^{-2}, 5\times10^{-2}, 1\times10^{-3}, 5\times10^{-3}, 1\times10^{-4}, 5\times10^{-4}\}$.
This is in addition to searching over other hyperparameters, which are listed below and vary by experiment.
All of our constraints are included with our code.
\subsection{Entity-Relation Extraction}
We begin by testing our research questions in the semi-supervised setting.
Here the model is presented with only a portion of the labeled
training set, with the rest used exclusively in an unsupervised manner
by the respective approach.

We make use of the natural ontology of entity types and their relations 
present when dealing with relational data. This defines a set of 
relations and their permissible argument types. As with all of our
constraints, we express the aforementioned ontology in the language 
of Boolean logic.

Our approach to recognizing the named entities and their pairwise relations
is most similar to \citet{Zhong2020}. Contextual embeddings are first procured,
using the BERT$_{\text{BASE}}$ model from the Hugging Face Transformers library\footnote{\url{https://github.com/huggingface/transformers}},
for every token in the sentence. These are then fed into a named entity 
recognition module that outputs a vector of per-class probabilities for every
entity. A classifier then classifies the concatenated contextual 
embeddings and entity predictions into a relation.

We employ two entity-relation extraction datasets, the Automatic Content
Extraction (ACE) 2005 \citep{walker2006} and SciERC datasets \citep{luan2018}.
ACE05 defines an ontology over $7$ entities and $18$ relations from mixed-genre
text, whereas SciERC defines $6$ entity types with $7$ possible relations between
them and includes annotations for scientific entities and their relations,
assimilated from $12$ AI conference/workshop proceedings.
We report the percentage of coherent predictions: data points for which the
predicted entity types, as well as the relations, are correct.
 
 
\paragraph{Constraint} The ACE05 specification lists all permissible relations and their arguments,
the conjunction of which represents our constraint.
Unlike ACE05, SciERC does not specify an ontology of entities and their permissible relations. 
Therefore, our constraint is determined through procuring the set of all possible relation-subject-object
triples in the training set, and applying a threshold to eliminate all noisy labelings in the training set.
The script for extracting such a constraint is provided with our code.

We used SGD as the optimizer with an initial learning rate of $1.0$, which was annealed by a decay rate of
$0.9$ for every $10$ epochs that there is no improvement on the validation set.
Every model was allowed to train for $100$ epochs, with early stopping if progress is not made for $20$ epochs.

\subsection{Predicting Simple Paths}
For this task, our aim is to find the shortest path in a graph, or more
specifically a 4-by-4 grid, $G = (V, E)$ with uniform edge weights. Our input is
a binary vector of length $|V| + |E|$, with the first $|V|$ variables indicating
the source and destination, and the next $|E|$ variables encoding a subgraph $G'
\subseteq G$. Each label is a binary vector of length $|E|$ encoding the
shortest \emph{simple} path in $G'$, a requirement that we enforce through our
constraint. We follow the algorithm proposed by \citet{nishino2017} to generate 
a constraint for each simple path in the grid, conjoined with indicators specifying
the corresponding source-destination pair. Our constraint is then the disjunction of all such conjunctions.

To generate the data, we begin by randomly removing one third of the edges in
the graph $G$, resulting in a subgraph, $G'$. Subsequently, we filter out
connected components in $G'$ with fewer than $5$ nodes to reduce degenerate
cases. We then sample a source and destination node uniformly at random. The
resulting subgraph and source-destination pair constitute a single data point. We generate a dataset of $1600$
examples, with a $60/20/20$ train/validation/test split. 
We keep all the hyperparameters provided by \citet{Xu18} fixed, employing a 5-layer MLP as our baseline,
with $50$ hidden units per layer, and the Adam optimizer with a learning rate of $1\times10^{-3}$.

\subsection{Preference Learning}
We also consider the task of preference learning. Given the user's ranking of a subset of elements, we
wish to predict the user's preferences over the remaining elements of the set.
We encode an ordering over $n$ items as a binary matrix $\{X_{ij}\}$, where for
each $i, j \in \{1, \ldots, n\}$, $X_{ij}$ denotes that item $i$ is at position
$j$. Our constraint $\alpha$ requires that the network's output be a valid 
total ordering.

We use preference ranking data over $10$ types of sushi for $5{,}000$ individuals,
taken from PREFLIB \citep{MaWa13a}, split 60/20/20. Our inputs consist of the user's 
preference over $6$ sushi types, with the model tasked to predict the
user's preference, a \emph{strict} total order, over the remaining $4$.
We keep all the hyperparameters provided by \citet{Xu18} fixed, employing a 3-layer MLP as our baseline,
with $25$ hidden units per layer, and the Adam optimizer with a learning rate of $1\times10^{-3}$.

\subsection{Warcraft Shortest Path}
Following \citet{Pogancic2020}, our training set consists of $10{,}000$
terrain maps curated using the Warcraft II tileset.
Each map encodes an underlying grid of dimension $12 \times 12$, where each vertex is assigned a cost depending on the type of terrain it represents (e.g., earth has lower cost than water).
The shortest (minimum cost) path between the top left and bottom right vertices is encoded as
an indicator matrix, and serves as the label.
Presented with an image of a terrain map, a convolutional neural network---following \citet{Pogancic2020}, we use ResNet18 \citep{He2016}---outputs a $12 \times 12$ binary matrix indicating the vertices that constitute the minimum cost path.

We keep all the hyperparameters provided by \citet{Pogancic2020} fixed, using an Adam optimizer with a learning rate of $5\times10^{-4}$.
To obtain the constraint, we compiled a constraint for a $6 \times 6$ grid, which was applied $9$ times, once to each overlapping region of the $12 \times 12$ grid.
\nobibliography{ahmed_663}
\end{document}
