%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{balance}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}   
\usepackage{pgfplots}
\usepackage{amsfonts}
\usepackage{subcaption}
\usepackage{float}
\usepackage{enumitem}
\let\proof\relax
\let\endproof\relax
\usepackage{amsthm} %http://ctan.org/pkg/amsthm
\newtheorem{theorem}{Theorem}
\newtheoremstyle{exampstyle}
  {\topsep} % Space above
  {\topsep} % Space below
  {} % Body font
  {} % Indent amount
  {\bfseries} % Theorem head font
  {.} % Punctuation after theorem head
  {.5em} % Space after theorem head
  {} % Theorem head spec (can be left empty, meaning `normal')
\theoremstyle{exampstyle} \newtheorem{example}{Example}
\theoremstyle{exampstyle} \newtheorem{remark}{Remark}
\theoremstyle{exampstyle} \newtheorem{definition}{Definition}
\theoremstyle{exampstyle} \newtheorem{lemma}{Lemma}
\theoremstyle{exampstyle} \newtheorem*{lemma*}{Lemma}
\renewcommand{\qedsymbol}{}

% LINQS
\usepackage{macros}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning Explainable Templated Graphical Models - Supplementary Material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<vembar@ucsc.edu>?Subject=Learning Explainable Templated Graphical Models}{Varun Embar *\thanks{Equal contribution}}{}}
\author[1]{Sriram Srinivasan *}
\author[1]{Lise Getoor}
% Add affiliations after the authors
\affil[1]{%
    Dept. of Computer Science and Engineering \\
    University of California, Santa Cruz \\
    USA
}

\begin{document}
\maketitle
\section{Strong convexity of PSL energy function}
We first reiterate the definition of strong convexity. 
\begin{definition}
\textbf{Strong Convexity:} A function $E: (\mathcal{Y},\mathcal{X}) \rightarrow \mathbb{R}$ is $\kappa$-strongly convex in $\mathcal{Y}$  (w.r.t the 1-norm) if $\mathcal{Y}$ is a convex set and, for $\mathbf{X} \in \mathcal{X}$ and any $\mathbf{Y},\mathbf{Y'} \in \mathcal{Y}$, $\tau \in [0,1]$,
\begin{align*}
    \tau(1 - \tau)\frac{\kappa}{2}\norm{\mathbf{Y} - \mathbf{Y'}}^2 + E(\tau\mathbf{Y} + (1 - \tau) \mathbf{Y'}, \mathbf{X}) \\
    \leq \tau E(\mathbf{Y}, \mathbf{X}) + (1 - \tau)E(\mathbf{Y'}, \mathbf{X})
\end{align*}
\label{def:kconvex}
\end{definition}
The energy function  $E$ is a summation of squared hinges and hence $E$ is convex.
Further, the prior template described in our approach acts as a regularizer of $\mathbf{Y}$ and is $\kappa$-strongly convex.
Hence $E$ is at least $\kappa$-strongly convex in $\mathcal{Y}$.

\section{Proof of Lemma 1}
\begin{lemma*}
\small
For a graphical model $G$ with a set of potentials $\mathbf{\Phi}$, let $Q_i$ denote the number of potentials that involve $\mathbf{X_i}$, and let $Q_G \triangleq \max_{i} Q_i$. Let $\norm{\mathbf{w}} < R$. Let $\mathbf{X}, \mathbf{X'} \in \mathcal{X}$  differ at a single coordinate $i$ by at most $\epsilon$. Then, for $\dot{\mathbf{Y}} \triangleq \argmin_{\mathbf{Y}} E(\mathbf{Y}, \mathbf{X})$ and $\dot{\mathbf{Y'}} \triangleq \argmin_{\mathbf{Y}} E(\mathbf{Y}, \mathbf{X'})$,
\begin{align*}
    \norm{E(\dot{\mathbf{Y'}}, \mathbf{X}) - E(\dot{\mathbf{Y'}}, \mathbf{X'})} \leq \epsilon R\sqrt{Q_G}.
\end{align*}
\end{lemma*}
\begin{proof}
\begin{align*}
    & \norm{E(\dot{\mathbf{Y'}}, \mathbf{X}) - E(\dot{\mathbf{Y'}}, \mathbf{X'})} \\
     &= \norm{ \mathbf{w}^T \mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X})}  - \mathbf{w}^T \mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X'})}} \\
     &\leq \norm{\mathbf{w}} \norm{\mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X})} - \mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X'})}} \quad \text{[From Cauchy--Schwarz]}\\
     &\leq R \norm{\mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X})} - \mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X'})}}
\end{align*}
because, by definition, $\norm{\mathbf{w}}$ is upper bounded by $R$. 
Note that $\mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X})}$ and $\mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X'})}$ only differ at any grounding involving $\mathbf{X_i}$.
The number of such groundings is $Q_i$, which is upper-bounded by $Q_G$, so at most $Q_G$ potentials will change.

Further, the squared hinge loss potential has the form $\max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\mathbf{Y} - c, 0\}^2$ where $\mathbf{C_x}, \mathbf{C_y}$ are coefficient vectors consisting of $1, -1, 0$.

\begin{align*}
    &\norm{\mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}}, \mathbf{X})} - \mathbf{\Phi}\mathbf{(\dot{\mathbf{Y'}},\mathbf{X'})}} \\
    &=\left({\sum_{\phi \in \mathbf{\Phi}} \mathbbm{1}\{\mathbf{C_{\mathbf{X}_i}} \neq 0\} ((\phi(\dot{\mathbf{Y'}}, \mathbf{X}) - \phi(\dot{\mathbf{Y'}}, \mathbf{X'})))^{2}}\right)^{1/2} \\
    &=\left(\sum_{\phi \in \mathbf{\Phi}} \mathbbm{1}\{\mathbf{C_{\mathbf{X}_i}} \neq 0\} (\max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\dot{\mathbf{Y'}} - c, 0\} \right. \\
    &\qquad \qquad \left. - \max\{\mathbf{C_x}^T\mathbf{X'} + \mathbf{C_y}^T\dot{\mathbf{Y'}} - c, 0\})^{2}\right)^{1/2} \\
    &\leq \left(\sum_{\phi \in \mathbf{\Phi}} \mathbbm{1}\{\mathbf{C_{\mathbf{X}_i}} \neq 0\} (\max\{\mathbf{C_x}^T(\mathbf{X}-\mathbf{X'})\right.\\
    &\qquad \qquad \left. + \mathbf{C_y}^T(\dot{\mathbf{Y'}}-\dot{\mathbf{Y'}}), 0\})^{2} \right) ^{1/2} \\
    & \leq \left(Q_i\right)^{1/2}\epsilon \leq \left(Q_G\right)^{1/2}\epsilon
\end{align*}
\end{proof}

\section{Proof of Lemma 2}
\begin{lemma*}
Let $E: (\mathcal{Y},\mathcal{X}) \rightarrow \mathbb{R}$ be $\kappa$-strongly convex, and let  $\dot{\mathbf{Y}} \triangleq \argmin_{\mathbf{Y}} E(\mathbf{Y}, \mathbf{X})$ and $\dot{\mathbf{Y'}} \triangleq \argmin_{\mathbf{Y}} E(\mathbf{Y}, \mathbf{X'})$, where $\mathbf{X}, \mathbf{X'} \in \mathcal{X}$  differ at a single RV $\mathbf{X_i}$ by at most $\epsilon$. Then,
\begin{align}
    \norm{\dot{\mathbf{Y'}} - \dot{\mathbf{Y}}}^2 \leq \frac{2}{\kappa}(E(\dot{\mathbf{Y'}}, \mathbf{X}) - E(\dot{\mathbf{Y'}}, \mathbf{X'}))
\end{align}
\end{lemma*}
\begin{proof}
Without loss of generality, assume that $E(\dot{\mathbf{Y}}, \mathbf{X}) \geq E(\dot{\mathbf{Y'}}, \mathbf{X'})$. (If $E(\dot{\mathbf{Y}}, \mathbf{X}) \leq E(\dot{\mathbf{Y'}}, \mathbf{X'})$, we can state this in terms of $\dot{\mathbf{Y'}}$.)
 Let $\Delta\mathbf{Y}\triangleq \dot{\mathbf{Y'}} - \dot{\mathbf{Y}}$. By \defnref{def:kconvex}, for any $\tau \in [0,1]$,
 \begin{align*}
     \tau(1 - \tau)&\frac{\kappa}{2}\norm{\dot{\mathbf{Y'}} - \dot{\mathbf{Y}}}^2 + E(\tau\dot{\mathbf{Y'}} + (1 - \tau) \dot{\mathbf{Y}}, \mathbf{X}) \\
    &\leq \tau E(\dot{\mathbf{Y'}}, \mathbf{X}) + (1 - \tau)E(\dot{\mathbf{Y}}, \mathbf{X})
 \end{align*}
 Since $\mathbf{\dot{Y}}$ is, by definition, the unique minimizer of $E(\mathbf{Y}, \mathbf{X})$, it follows that $E(\mathbf{\dot{Y}} + \tau \Delta\mathbf{Y}, \mathbf{X}) - E(\mathbf{\dot{Y}}, \mathbf{X}) \ge 0$, so the above inequality is preserved when this term is dropped. Thus, dividing both sides by $\tau\kappa/2$, we have that
 \begin{align*}
      (1-\tau)\norm{\Delta\mathbf{Y}}^2 &\leq \frac{2}{\kappa}(E(\dot{\mathbf{Y'}}, \mathbf{X}) - E(\dot{\mathbf{Y}}, \mathbf{X})) \\
      \norm{\Delta\mathbf{Y}}^2 &\leq \frac{2}{\kappa}(E(\dot{\mathbf{Y'}}, \mathbf{X}) - E(\dot{\mathbf{Y}}, \mathbf{X}))
 \end{align*}
 where the last inequality follows by letting $\tau \to 0$, since $(1 - \tau)$ is maximized at $\tau = 0$.\\
 Since $E(\dot{\mathbf{Y}}, \mathbf{X}) \geq E(\dot{\mathbf{Y'}}, \mathbf{X'})$, the following inequality holds
  \begin{align*}
     \norm{\dot{\mathbf{Y'}} - \dot{\mathbf{Y}}}^2 
     \leq \frac{2}{\kappa}(E(\dot{\mathbf{Y'}}, \mathbf{X}) - E(\dot{\mathbf{Y'}}, \mathbf{X'}))
 \end{align*}
\end{proof}

\section{Proof of Lemma 3}
\begin{lemma*}
Let the explaining function $f$ be defined as $f(\mathbf{X}, \mathbf{Y},\phi) = \norm{\frac{w\partial \phi(\mathbf{X,Y})}{\partial \mathbf{Y_i}}|_{y}}$. Let $\mathbf{X}, \mathbf{X'} \in \mathcal{X}$ differ at a single random variable $\mathbf{X_i}$ by at most $\epsilon$. Let $\norm{\mathbf{Y}- \mathbf{Y'}} < B$ for any two $\mathbf{Y}, \mathbf{Y'} \in \mathcal{Y}$ and $\norm{\mathbf{w}} < R$. Then:
\begin{align}
|f(\mathbf{X}, \mathbf{Y}, \phi) - f(\mathbf{X'}, \mathbf{Y'}, \phi)| \leq 2R(\epsilon+B)
\end{align}
\end{lemma*}
\begin{proof}
The hinge loss function $\phi$ has the form $\max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\mathbf{Y} - c, 0\}^2$ where $\mathbf{C_x}, \mathbf{C_y}$ are coefficient vectors consisting of $1, 0, -1$.\\
The partial derivative w.r.t.\ $\mathbf{Y_i}$
\begin{align*}
    &= \norm{\frac{\partial \phi(\mathbf{X,Y})}{\partial \mathbf{Y_i}}|_{y}} \\
    & = 2 * max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\mathbf{Y} - c, 0\} \\
    &\qquad \qquad * \norm{\frac{\partial max\{\mathbf{C_x}^T\mathbf{X} +\mathbf{C_y}^T\mathbf{Y} - c, 0\}}{\partial \mathbf{Y_i}}|_{y}} \\
    & = 2 * max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\mathbf{Y} - c, 0\}
\end{align*}
The last step comes from the fact that $\norm{\frac{\partial \max\{\mathbf{C_x}^T\mathbf{X} +\mathbf{C_y}^T\mathbf{Y} - c, 0\}}{\partial \mathbf{Y_i}}|_{y}}$ takes a value in $\{-1, 0, 1\}$. In all cases,
\begin{align*}
& 2 * max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\mathbf{Y} - c, 0\} \\
    &\qquad \qquad * \norm{\frac{\partial max\{\mathbf{C_x}^T\mathbf{X} +\mathbf{C_y}^T\mathbf{Y} - c, 0\}}{\partial \mathbf{Y_i}}|_{y}} \\
    & = 2 * max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\mathbf{Y} - c, 0\}
\end{align*}
Now consider $|f(\mathbf{X}, \mathbf{Y}, \phi) - f(\mathbf{X'}, \mathbf{Y'}, \phi)|$
\begin{align*}
%& = |f_k(\mathbf{X}, \mathbf{Y}, \phi_j) - f_k(\mathbf{X'}, \mathbf{Y'}, \phi_j)| \\
& = w \norm{\frac{\partial \phi(\mathbf{X,Y})}{\partial \mathbf{Y_i}}|_{y} - \frac{\partial \phi(\mathbf{X',Y'})}{\partial \mathbf{Y_i}}|_{y'}} \\
& = 2w \lVert max\{\mathbf{C_x}^T\mathbf{X} + \mathbf{C_y}^T\mathbf{Y} - c, 0\} \\
&\qquad \qquad - max\{\mathbf{C_x}^T\mathbf{X'} + \mathbf{C_y}^T\mathbf{Y'} - c, 0\} \rVert \\
& \leq 2w \norm{max\{\mathbf{C_x}^T(\mathbf{X}-\mathbf{X'}) + \mathbf{C_y}^T(\mathbf{Y}-\mathbf{Y'}), 0\}} \\
& \leq 2w \norm{max\{\epsilon + B,0\}} \\
& \leq 2R(\epsilon+B) 
\end{align*}
\end{proof}

\section{Datasets}
\textbf{Entity Resolution Dataset:} The Cora Citation entity resolution dataset is based on the citation references between scientific papers.
The task is to identify papers that are the same. This is represented by the target predicate $\pslpred{SAME\_BIB}$. 
The dataset contains 10 predicates. For each of the non-target predicates, we included the inverse predicates where the arguments are reversed. For example, for the predicate $\pslpred{SAME\_AUTHOR}(\pslarg{A},\pslarg{B})$ we include the predicate $\pslpred{\_SAME\_AUTHOR}(\pslarg{B},\pslarg{A})$. In total there are 19 predicates.

The set of predicates is $\pslpred{SAME\_AUTHOR}$, $\pslpred{SAME\_BIB}$, $\pslpred{SAME\_VENUE}$, $\pslpred{SAME\_TITLE}$, $\pslpred{AUTHOR}$, $\pslpred{VENUE}$, $\pslpred{TITLE}$, $\pslpred{HASWORD\_AUTHOR}$, $\pslpred{HASWORD\_TITLE}$, $\pslpred{HASWORD\_VENUE}$.
Since all predicates are explainable, we do not classify them as explainable and non-explainable predicates.
The dataset is split into 5 folds. We use the same splits as \citet{khot:icdm11}.

\textbf{Recommendation Dataset:}
Both YELP and LASTFM datasets contain 21 predicates or relations. 
We categorize the predicates as explainable and non-explainable predicates based on how easy it is for an end-user to understand the predicates.
There were 14 explainable predicates and 7 non-explainable predicates.
The list of predicates is as follows:
    
\textbf{Explainable predicates:} $\pslpred{USERS\_ARE\_FRIENDS}$, $\pslpred{SIM\_CONTENT\_ITEMS\_JACCARD}$, $\pslpred{SIM\_PEARSON\_ITEMS}$, $\pslpred{SIM\_COSINE\_ITEMS}$, $\pslpred{SIM\_ADJCOS\_ITEMS}$, $\pslpred{SIM\_MF\_COSINE\_ITEMS}$, $\pslpred{SIM\_MF\_EUCLIDEAN\_ITEMS}$, $\pslpred{SIM\_COSINE\_USERS}$, $\pslpred{SIM\_PEARSON\_USERS}$, $\pslpred{SIM\_MF\_COSINE\_USERS}$, $\pslpred{SIM\_MF\_EUCLIDEAN\_USERS}$,
$\pslpred{AVG\_ITEM\_RATING}$,
$\pslpred{RATING\_PRIOR}$,
$\pslpred{AVG\_USER\_RATING}$\\
\textbf{Non-explainable predicates:} $\pslpred{RATING}$, $\pslpred{RATED}$, $\pslpred{SGD\_RATING}$, $\pslpred{BPMF\_RATING}$, $\pslpred{ITEM\_PEARSON\_RATING}$, $\pslpred{USER}$, $\pslpred{ITEM}$\\

The \textbf{YELP} dataset is split in five folds. Each fold contains a train and a test split.
The train split contains 79240 observed ratings and 7924 ratings that need to be predicted.
The test split contains 99049 observed ratings and 19809 ratings that need to be predicted.

Similarly, the \textbf{LASTFM} dataset is split in five folds. Each fold contains a train and a test split.
The train split contains 74267 observed ratings and 18567 ratings that need to be predicted.
The test split contains 92834 observed ratings and 18567 ratings that need to be predicted.

\section{Mode declarations for \BOOST}
Modes are used to restrict/guide the search space and are a powerful tool in getting relational algorithms such as BoostSRL to work.
Below we give the mode declarations used by \BOOST\ for the recommendation and entity resolution datasets.\\

\textbf{Entity resolution dataset:}
\begin{align*}
&mode: author(+paper, -auth). \\
&mode: haswordauthor(+auth, -word).\\
&mode: haswordtitle(+title, -word).\\
&mode: haswordvenue(+venue, -word).\\
&mode: title(+paper, -title).\\
&mode: venue(+paper, -venue).\\
&mode: author(-paper, +auth).\\
&mode: haswordauthor(-auth, +word).\\
&mode: haswordtitle(-title, +word).\\
&mode: haswordvenue(-venue, +word).\\
&mode: title(-paper, +title).\\
&mode: venue(-paper, +venue).\\
&mode: samebib(+paper, +paper).\\
&mode: sametitle(+title, +title).\\
&mode: samevenue(+venue, +venue).\\
&mode: sameauthor(+auth, +auth).\\
&mode: recursive\_samebib(+paper, paper).\\
&mode: recursive\_sametitle(+title, title).\\
&mode: recursive\_samevenue(+venue, venue).\\
&mode: recursive\_sameauthor(+auth, auth).\\
&mode: recursive\_samebib(`paper, +paper).\\
&mode: recursive\_sametitle(title, +title).\\
&mode: recursive\_samevenue(venue, +venue).\\
&mode: recursive\_sameauthor(auth, +auth).\\
&mode: samebib(+paper, -paper).\\
\end{align*}
\begin{align*}
&mode: sametitle(+title, -title).\\
&mode: samevenue(+venue, -venue).\\
&mode: sameauthor(+auth, -auth).\\
&mode: samebib(-paper, +paper).\\
&mode: sametitle(-title, +title).\\
&mode: samevenue(-venue, +venue).\\
&mode: sameauthor(-auth, +auth).\\
&usePrologVariables: true.\\
&okIfUnknown: recursive\_sametitle/2.\\
&okIfUnknown: recursive\_samebib/2.\\
&okIfUnknown: recursive\_samevenue/2.\\
&okIfUnknown: recursive\_sameauthor/2.
\end{align*}
\textbf{Recommendation datasets:}
\begin{align*}
&mode: avg\_item\_rating(+item). \\
&mode: avg\_user\_rating(+user). \\
&mode: bpmf\_rating(+user, +item). \\
&mode: item\_pearson\_rating(+user, +item).\\
&mode: sgd\_rating(+user, +item).\\
&mode: users\_are\_friends(+user, -user). \\
&mode: users\_are\_friends(-user, +user).\\
&mode: sim\_adjcos\_items(+item, -item).\\
&mode: sim\_adjcos\_items(-item, +item).\\
&mode: sim\_content\_items\_jaccard(-item, +item).\\
&mode: sim\_content\_items\_jaccard(+item, -item).\\
&mode: sim\_cosine\_items(-item, +item).\\
&mode: sim\_cosine\_items(+item, -item).\\
&mode: sim\_cosine\_users(-user, +user).\\
&mode: sim\_cosine\_users(+user, -user).\\
&mode: sim\_mf\_cosine\_items(-item, +item).\\
&mode: sim\_mf\_cosine\_items(+item, -item).\\
&mode: sim\_mf\_cosine\_users(-user, +user).\\
&mode: sim\_mf\_cosine\_users(+user, -user).\\
&mode: sim\_mf\_euclidean\_items(-item, +item).\\
&mode: sim\_mf\_euclidean\_items(+item, -item).\\
&mode: sim\_mf\_euclidean\_users(-user, +user).\\
&mode: sim\_mf\_euclidean\_users(+user, -user).\\
&mode: sim\_pearson\_users(-user, +user).\\
&mode: sim\_pearson\_users(+user, -user).\\
&mode: sim\_pearson\_items(-item, +item).\\
&mode: sim\_pearson\_items(+item, -item). \\
&mode: rating(+user, +item). \\
\end{align*}
\begin{align*}
&bridger: friends/2. \\
&bridger: sim\_adjcos\_items/2. \\
&bridger: sim\_content\_items/2. \\
&bridger: sim\_cosine\_items/2. \\
&bridger: sim\_cosine\_users/2. \\
&bridger: sim\_mf\_cosine\_items/2. \\
&bridger: sim\_mf\_cosine\_users/2.\\
&bridger: sim\_mf\_euclidean\_items/2.\\
&bridger: sim\_mf\_euclidean\_users/2.\\
&bridger: sim\_pearson\_users/2.\\
&bridger: sim\_pearson\_items/2.
\end{align*}

\section{Learned Models}
Below we show the rules learned by our approach (\SL), Path ranking algorithm (\PRA) and Boost (\BOOST) for one of the folds on the $\cora$ and the $\yelp$ datasets. Note that model weights are relative.
\subsection{$\cora$ dataset}
\textbf{Model learned by \SL} 
\begin{equation*}
\footnotesize
\begin{split}
0.07: & \pslpred{TARGETS}(\pslarg{A0},\pslarg{A2}) \land \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A1})  
\land \\& \pslpred{SAMEBIB}(\pslarg{A1}, \pslarg{A2}) \pslthen \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A2}) \\
0.018: & \pslpred{TARGETS}(\pslarg{A0},\pslarg{A2})  \land \pslpred{TITLE}(\pslarg{A0}, \pslarg{A1}) \land\\& \pslpred{\_TITLE}(\pslarg{A1}, \pslarg{A2}) \pslthen \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A2}) \\
0.018: & \pslpred{TARGETS}(\pslarg{A0},\pslarg{A2}) \land \pslpred{VENUE}(\pslarg{A0}, \pslarg{A1})  \land \\&  \pslpred{\_VENUE}(\pslarg{A1}, \pslarg{A2}) \pslthen \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A2}) 
\end{split}
\end{equation*}

\textbf{Model learned by \BOOST} 
\begin{equation*}
\footnotesize
\begin{split}
0.35: & \pslpred{TARGETS}(\pslarg{A0},\pslarg{A1}) \land \pslpred{VENUE}(\pslarg{A1}, \pslarg{A2})  
\land \\& \pslpred{VENUE}(\pslarg{A0}, \pslarg{A2}) \pslthen \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A1}) \\
0.35: & \pslpred{TARGETS}(\pslarg{A0},\pslarg{A1})  \land \pslpred{TITLE}(\pslarg{A0}, \pslarg{A2}) \land\\& \pslpred{TITLE}(\pslarg{A1}, \pslarg{A2}) \pslthen \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A1}) \\
0.37: & \pslpred{TARGETS}(\pslarg{A0},\pslarg{A0}) \land \pslpred{AUTHOR}(\pslarg{A0}, \pslarg{A1})   \\& \pslthen \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A0}) 
\end{split}
\end{equation*}

\textbf{Model learned by \PRA} 
\begin{equation*}
\footnotesize
\begin{split}
0.07: & \pslpred{TARGETS}(\pslarg{A0},\pslarg{A2}) \land \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A1})  
\land \\& \pslpred{SAMEBIB}(\pslarg{A1}, \pslarg{A2}) \pslthen \pslpred{SAMEBIB}(\pslarg{A0}, \pslarg{A2}) \\
\end{split}
\end{equation*}
\subsection{$\yelp$ dataset}
\textbf{Model learned by \SL} 
\begin{equation*}
\footnotesize
\begin{split}
0.01:& \pslpred{1.00 * RATING}(\pslarg{A0},\pslarg{ A1}) = 1.00\\
0.07:& \pslpred{AVG\_ITEM\_RATING}(\pslarg{A0}) \land \pslpred{ITEM}(\pslarg{A0}) \land \\& \pslpred{RATED}(\pslarg{A1},\pslarg{ A0}) \pslthen \pslpred{RATING}(\pslarg{A1},\pslarg{ A0})\\
0.07:& \pslpred{AVG\_USER\_RATING}(\pslarg{A0}) \land \pslpred{USER}(\pslarg{A0}) \land \\&\pslpred{RATED}(\pslarg{A0},\pslarg{ A1}) \pslthen \pslpred{RATING}(\pslarg{A0},\pslarg{ A1})\\
0.05:& \pslpred{RATING}(\pslarg{A},\pslarg{ B}) \land \pslpred{SIM\_ADJCOS\_ITEMS}(\pslarg{B},\pslarg{ C}) \land \\&  \pslpred{RATED}(\pslarg{A},\pslarg{ B}) \land \pslpred{RATED}(\pslarg{A},\pslarg{ C})\pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ C})\\
0.07:& \pslpred{SGD\_RATING}(\pslarg{A0},\pslarg{ A1}) \land \pslpred{RATED}(\pslarg{A0},\pslarg{ A1})\\& \land \pslpred{RATED}(\pslarg{A0},\pslarg{ A1}) \pslthen \pslpred{RATING}(\pslarg{A0},\pslarg{ A1})\\
0.05:& \pslpred{SIM\_PEARSON\_USERS}(\pslarg{A0},\pslarg{ A1}) \land \pslpred{RATING}(\pslarg{A1},\pslarg{ A2}) \land \\& \pslpred{RATED}(\pslarg{A1},\pslarg{ A2})  \land \pslpred{RATED}(\pslarg{A0},\pslarg{ A2})\pslthen \pslpred{RATING}(\pslarg{A0},\pslarg{ A2})\\
0.05:& \pslpred{RATING}(\pslarg{A},\pslarg{ B}) \land \pslpred{SIM\_PEARSON\_USERS}(\pslarg{A},\pslarg{ C}) \land \\& \pslpred{RATED}(\pslarg{A},\pslarg{ B}) \land \pslpred{RATED}(\pslarg{C},\pslarg{ B}) \pslthen \pslpred{RATING}(\pslarg{C},\pslarg{ B})\\
0.01:& \pslpred{1.00 * RATING}(\pslarg{A0},\pslarg{ A1}) = 0.00\\
0.07:& \pslpred{SIM\_MF\_EUCLIDEAN\_USERS}(\pslarg{A0},\pslarg{ A1}) \land \\&  \pslpred{RATING}(\pslarg{A1},\pslarg{ A2}) \land  \pslpred{RATED}(\pslarg{A1},\pslarg{ A2}) \land \\& \pslpred{RATED}(\pslarg{A0},\pslarg{ A2})\pslthen  \pslpred{RATING}(\pslarg{A0},\pslarg{ A2})\\
\end{split}
\end{equation*}

\textbf{Model learned by \BOOST}
\begin{equation*}
\footnotesize
\begin{split}
0.12:& \pslpred{SIM\_PEARSON\_USERS}(\pslarg{A},\pslarg{ A1}) \land \pslpred{RATED}(\pslarg{A},\pslarg{ A1})\\& \pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ A1})\\
0.10:& \pslpred{AVG\_USER\_RATING}(\pslarg{A}) \land \pslpred{RATED}(\pslarg{A},\pslarg{ A1})\\& \pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ A1})\\
0.15:& \pslpred{RATED}(\pslarg{A1},\pslarg{ A}) \land \pslpred{SIM\_PEARSON\_ITEMS}(\pslarg{A},\pslarg{ A1})\\& \pslthen \pslpred{RATING}(\pslarg{A1},\pslarg{ A})\\
0.07:& \pslpred{AVG\_USER\_RATING}(\pslarg{A}) \land \\ & \pslpred{SIM\_PEARSON\_USERS}(\pslarg{A},\pslarg{ A1}) \land \pslpred{RATED}(\pslarg{A},\pslarg{A1}) \\& \pslthen  \pslpred{RATING}(\pslarg{A},\pslarg{A1})\\
0.06:& \pslpred{AVG\_USER\_RATING}(\pslarg{A}) \\& \land \pslpred{SIM\_PEARSON\_ITEMS}(\pslarg{B},\pslarg{ A1}) \land  \pslpred{RATED}(\pslarg{A},\pslarg{ B}) \\& \pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ B})\\
\end{split}
\end{equation*}
\begin{equation*}
\footnotesize
\begin{split}
0.06:& \pslpred{AVG\_USER\_RATING}(\pslarg{B}) \land \pslpred{RATED}(\pslarg{A},\pslarg{ A1}) \land \\& \pslpred{SIM\_PEARSON\_USERS}(\pslarg{A},\pslarg{ B}) \pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ A1})\\
0.06:& \pslpred{AVG\_USER\_RATING}(\pslarg{A}) \land \\& \pslpred{SIM\_MF\_COSINE\_ITEMS}(\pslarg{B},\pslarg{ A1}) \land \pslpred{RATED}(\pslarg{A},\pslarg{ B})\\& \pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ B})\\
0.05:& \pslpred{RATED}(\pslarg{A1},\pslarg{ A}) \land \pslpred{SIM\_PEARSON\_ITEMS}(\pslarg{A},\pslarg{ B}) \land \\& \pslpred{AVG\_ITEM\_RATING}(\pslarg{B}) \pslthen \pslpred{RATING}(\pslarg{A1},\pslarg{ A})\\
0.09:& \pslpred{ITEM\_PEARSON\_RATING}(\pslarg{A},\pslarg{ B}) \land \pslpred{RATED}(\pslarg{A},\pslarg{ B})\\& \pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ B})\\
0.10:& \pslpred{RATED}(\pslarg{A},\pslarg{ B}) \land \pslpred{SGD\_RATING}(\pslarg{A},\pslarg{ B})\\& \pslthen \pslpred{RATING}(\pslarg{A},\pslarg{ B})\\
\end{split}
\end{equation*}

\textbf{Model learned by \PRA} \\
The PRA model had over 75 rules. Here, we show some of the top weighted rules.
\begin{equation*}
\footnotesize
\begin{split}
0.10:& \pslpred{ITEM\_PEARSON\_RATING}(\pslarg{R2},\pslarg{ R3}) \land \\& \pslpred{SIM\_COSINE\_USERS}(\pslarg{R2},\pslarg{ R1}) \land \\& \pslpred{RATED}(\pslarg{R1},\pslarg{ R3}) \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{BPMF\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \\&\pslpred{SIM\_COSINE\_ITEMS}(\pslarg{R3},\pslarg{ R2}) \land \\& \pslpred{RATED}(\pslarg{R1},\pslarg{ R3})\pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{SGD\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \\&\pslpred{SIM\_COSINE\_ITEMS}(\pslarg{R3},\pslarg{ R2}) \land \\&  \pslpred{RATED}(\pslarg{R1},\pslarg{ R3})\pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{ITEM\_PEARSON\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \\&\pslpred{SIM\_COSINE\_ITEMS}(\pslarg{R3},\pslarg{ R2}) \land \pslpred{RATED}(\pslarg{R1},\pslarg{ R3})\\& \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{SIM\_ADJCOS\_ITEMS}(\pslarg{R3},\pslarg{ R2}) \land \\&\pslpred{SGD\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \pslpred{RATED}(\pslarg{R1},\pslarg{ R3})\\& \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{BPMF\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \pslpred{RATED}(\pslarg{R1},\pslarg{ R3}) \land \\& \pslpred{SIM\_ADJCOS\_ITEMS}(\pslarg{R2},\pslarg{ R3}) \\&\pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{SIM\_ADJCOS\_ITEMS}(\pslarg{R3},\pslarg{ R2}) \land \\& \pslpred{BPMF\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \pslpred{RATED}(\pslarg{R1},\pslarg{ R3})\\& \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3}) \\
0.10:& \pslpred{SIM\_ADJCOS\_ITEMS}(\pslarg{R3},\pslarg{ R2}) \land \\& \pslpred{ITEM\_PEARSON\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \\&\pslpred{RATED}(\pslarg{R1},\pslarg{ R3}) \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{SGD\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \pslpred{RATED}(\pslarg{R1},\pslarg{ R3}) \land \\& \pslpred{SIM\_ADJCOS\_ITEMS}(\pslarg{R2},\pslarg{ R3})\\& \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.10:& \pslpred{ITEM\_PEARSON\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \\& \pslpred{RATED}(\pslarg{R1},\pslarg{ R3}) \land  \pslpred{SIM\_ADJCOS\_ITEMS}(\pslarg{R2},\pslarg{ R3})\\& \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.11:& \pslpred{SGD\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \\& \pslpred{SIM\_COSINE\_ITEMS}(\pslarg{R2},\pslarg{ R3}) \land \pslpred{RATED}(\pslarg{R1},\pslarg{ R3})\\& \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
\end{split}
\end{equation*}
\begin{equation*}
\footnotesize
\begin{split}
0.10:& \pslpred{SIM\_COSINE\_ITEMS}(\pslarg{R2},\pslarg{ R3}) \land \\&\pslpred{BPMF\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \pslpred{RATED}(\pslarg{R1},\pslarg{ R3})\\& \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
0.11:& \pslpred{SIM\_COSINE\_ITEMS}(\pslarg{R2},\pslarg{ R3}) \land \\& \pslpred{ITEM\_PEARSON\_RATING}(\pslarg{R1},\pslarg{ R2}) \land \\& \pslpred{RATED}(\pslarg{R1},\pslarg{ R3}) \pslthen \pslpred{RATING}(\pslarg{R1},\pslarg{ R3})\\
\end{split}
\end{equation*}

\section{Timing Experiment}
\begin{figure}
    \centering
    \includegraphics[scale=0.35]{timing.png}
    \caption{\textbf{Runtime for weight learning}: Runtime increases exponentially for MLE but increases linearly for PPLL.}
    \label{fig:runtime}
\end{figure}

We evaluate the runtimes for the proposed PPLL weight learning approach and the standard Maximum Likelihood Estimate (MLE) approach. 
Given a set of rules, PPLL computes the weights only once for each rule. The presence of other rules in the model does not affect the weight of a rule. However, since MLE couples all the rules, we need to compute the weights for each subset of the rules and select the best model.
\figref{fig:runtime} shows the runtimes in seconds for PPLL and MLE as the number of rules in the model increases from 1 to 5. 
We observe that the runtimes increase exponentially for MLE but increase linearly for PPLL.
The decoupling of the rules in weight learning helps scale our approach to models with larger sets of rules.

\end{document}
