\setcounter{proposition}{0}
\section*{APPENDIX}
\section{Proofs}\label{apendix:proofs}

\begin{proposition}
    If parameterized by positive weights $\Theta^j$, the negative Conditional Log-Likelihood $-\log P_{\Theta^j}({\bf y}|{\bf x},\Phi^i_{\mathcal{M}})$ is a convex function.
\end{proposition}

\begin{proof}
    \begin{equation*}
    -\log P_{\Theta^j}({\bf y}|{\bf x},\Phi^i)=\log Z-\sum_{k}\theta^j_ks_k({\bf x},{\bf y},\Phi^i)
    \end{equation*}
    Note that in this case, we partition the ground atoms in the HMLN into query and evidence atoms and we assume that $\Phi^i$ is always observed as evidence. Thus, 
    \begin{equation*}
        \log Z = \log \left(\sum_{{\bf y}'}\exp\left(\sum_k\theta^j_ks_k({\bf x},{\bf y}',\Phi^i)\right)\right)
    \end{equation*}
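    The term $-\sum_{k}\theta^j_ks_k({\bf x},{\bf y},\Phi^i)$ is linear in $\Theta^j$, and $\log Z$ is a log-sum-exp of functions that are linear in $\Theta^j$ and is therefore convex in $\Theta^j$. Since the sum of a convex function and a linear function is convex, the result follows.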
\end{proof}

\begin{proposition}
    For any embedding $\phi$, let $w^*_i(\phi)$ be the exact DR and $w_i(\phi)$ be the approximate DR computed by the probabilistic classifier. If the value of each ground formula lies in $(0,1)$ and $\left|\frac{1}{w^*_i(\phi)} - \frac{1}{w_i(\phi)}\right|$ $\leq$ $\epsilon$, then $\ell^*-{\ell}$ $\leq$ $2\epsilon m$, where $m$ is the number of ground formulas, $\ell^*$ denotes the CLL reparameterized by the exact DR and $\ell$ denotes the CLL reparameterized by the approximate DR.
\end{proposition}

\begin{proof}
\begin{align*}
\ell^*({\bf y},~&{\bf x},\Phi) = \left(\sum_{i=1}^m\frac{\theta_i}{w_i^*(\Phi)}s_i({\bf x},{\bf y},\Phi)\right)-\log Z \\
\ell^*({\bf y},~&{\bf x},\Phi) = \left(\sum_{i=1}^m\frac{\theta_i}{w_i^*(\Phi)}s_i({\bf x},{\bf y},\Phi)\right)-\log \sum_{{\bf x},{\bf y}',\Phi}\exp\left(\sum_{i=1}^m\frac{\theta_i}{w_i^*(\Phi)}s_i({\bf x},{\bf y}',\Phi)\right) \\
&\leq\left(\sum_{i=1}^m\left(\frac{\theta_i}{w_i(\Phi)}+\epsilon\right) s_i({\bf x},{\bf y},\Phi)\right)-\log \sum_{{\bf x},{\bf y}',\Phi}\exp\left(\sum_{i=1}^m\left(\frac{\theta_i}{w_i(\Phi)}-\epsilon\right) s_i({\bf x},{\bf y}',\Phi)\right) \\
&= \sum_{i=1}^m\epsilon s_i({\bf x},{\bf y},\Phi)+\left(\sum_{i=1}^m\frac{\theta_i}{w_i(\Phi)} s_i({\bf x},{\bf y},\Phi)\right)-\log \sum_{{\bf x},{\bf y}',\Phi}\exp\left(\sum_{i=1}^m\frac{\theta_i}{w_i(\Phi)} s_i({\bf x},{\bf y}',\Phi)-\sum_{i=1}^m\epsilon s_i({\bf x},{\bf y}',\Phi)\right) \\
& \leq 2\epsilon m + \ell({\bf y},{\bf x},\Phi)
\end{align*}
The last step uses the fact that each $s_i$ lies in $(0,1)$, so that $\sum_{i=1}^m\epsilon s_i\leq\epsilon m$ both in the first term and inside the exponent of the partition function.
\end{proof}

\begin{proposition}
    Given an HMLN $[\theta:f(x,y)*({\tt R}(x) \vee {\tt S}(y))]$, where $f(x,y)$ is real-valued, if $|\Delta_x|$ $=$ $|\Delta_y|$ $=$ $n$ then for the non-reparameterized distribution, the marginal probability for a single-variable query $P({\tt R}(A))$ converges to a constant, i.e., $\lim_{n\to\infty} P({\tt R}(A))$ $=$ $1$.
\end{proposition}

\begin{proof}
    \begin{equation*}
        P({\tt R}(A))=\frac{1}{1+\frac{Z_{{\tt R}(A)=0}}{Z_{{\tt R}(A)=1}}}
    \end{equation*}
    Using the lifted inference rules from~\citet{gogate2011probabilistic}, we write the two partition functions as follows.
    \begin{align*}
    Z_{{\tt R}(A)=1} &= e^{\theta*v*n}*2^n\\
    Z_{{\tt R}(A)=0} &= (1+e^{\theta*v})^n\\
    \lim_{n\to\infty} \frac{Z_{{\tt R}(A)=0}}{Z_{{\tt R}(A)=1}} &= \lim_{n\to\infty}\frac{(1+e^{\theta*v})^n}{e^{\theta*v*n}*2^n}=0\\
    \lim_{n\to\infty} P({\tt R}(A)) &= \frac{1}{1+0}=1.
    \end{align*}
\end{proof}

\begin{proposition}
        Given an HMLN $[\theta:f(x,y)*({\tt R}(x) \vee {\tt S}(y))]$, where $f(x,y)$ is real-valued, if $|\Delta_x|$ $=$ $|\Delta_y|$ $=$ $n$, $f(x,y)=v$, and the importance weight is $1/n$ for each grounding, then the marginal probability for a single-variable query $P({\tt R}(A))$ converges to a function over $\theta,v$, i.e., $\lim_{n\to\infty} {P}({\tt R}(A))$ $=$ $\frac{1}{1+e^{\frac{-\theta*v}{2}}}$.
\end{proposition}

\begin{proof}
     \begin{equation*}
         P({\tt R}(A))=\frac{1}{1+\frac{Z_{{\tt R}(A)=0}}{Z_{{\tt R}(A)=1}}}
     \end{equation*}
     %Since we assume a uniform distribution over the groundings, the probability associated with each grounding is $\frac{1}{n}$. 
     Using the lifted inference rules from~\citet{gogate2011probabilistic}, we write the two partition functions as follows.
\begin{align*}
Z_{{\tt R}(A)=1} &= e^{(\theta*v/n)*n}*2^n = e^{\theta*v}*2^n \\
Z_{{\tt R}(A)=0} &= (1+e^{\theta*v/n})^n \\
\lim_{n\to\infty} \frac{Z_{{\tt R}(A)=0}}{Z_{{\tt R}(A)=1}} &= \lim_{n\to\infty}\frac{(1+e^{\theta*v/n})^n}{e^{\theta*v}*2^n}=e^{-\theta*v/2} \\
\lim_{n\to\infty} {P}({\tt R}(A)) &= \frac{1}{1+e^{-\theta*v/2}}
\end{align*}
\end{proof}

\section{Deep Knowledge Tracing}\label{apendix:dkt}

Knowledge Tracing~\citep{bkt} is a classical cognitive model that models student knowledge over time to encode latent student skills.
Specifically, students work out problems and the model observes whether the student answered each problem correctly or incorrectly to model the knowledge acquired by the student over time. Knowledge tracing aims to model this acquired knowledge so that it can be used to predict how the student may perform on future problems. This can be used in several applications, such as developing interventions that target specific areas of deficiency, improving student engagement, and choosing alternative strategies for teaching a student.
Bayesian Knowledge Tracing (BKT)~\citep{bkt} is a classical approach for knowledge tracing that uses a Hidden Markov Model to learn from temporal data. Specifically, in BKT, a student's knowledge is represented by a set of latent variables. As the student answers exercises related to specific {\em skills}, BKT updates the latent variable probabilities based on the correctness (or incorrectness) of their answers. 
Deep Knowledge Tracing (DKT)~\citep{NIPS2015_bac9162b} leverages DNNs to learn dense embeddings representing student skills. Specifically, DKT is a Sequence2Sequence model in which knowledge over skills is represented by the hidden layer.
The model is trained over sequential observations that simulate exercises of varying difficulty that students work on.
To train the model, the exercises are generated using Item Response Theory (IRT)~\citep{irt1990}. 
Specifically, given parameters $\alpha$, $\beta$ that represent student skill in a specific concept and exercise difficulty respectively, the probability that the student completes the exercise correctly is $P(\mathit{correct}\mid\alpha,\beta)$ $=$ $c$ $+$ $\frac{1-c}{1+\exp(\beta-\alpha)}$, where $c$ is the probability of a random guess (which is set to 0.25). The dataset initializes the difficulty level for each exercise and also sets initial skill levels for each student. We used difficulty levels ranging from 1 to 5, with 5 being the most difficult. The students' skills are updated over time as they encounter more exercises related to the same concept.

\section{Algorithms}
The algorithm to train the mixture model is summarized in Algorithm~\ref{alg:mhmlns}. The algorithm for performing marginal inference is summarized in Algorithm~\ref{alg:ginfer}.

\begin{algorithm}[h]
\caption{Mixture of HMLNs}
\label{alg:mhmlns}

\textbf{Input}: HMLN structure $\mathcal{M}$, $\mathcal{D}$ $=$ $({\bf y},{\bf x})$, Representations $\{\Phi^i\}_{i=1}^n$\\
\textbf{Output}: Mixture Model with HMLN parameters $\{\Theta^i\}_{i=1}^K$\\ 

\begin{algorithmic}[1]
    \STATE Initialize $\alpha_1,\ldots,\alpha_K$
    \STATE Initialize $\{\Theta^i\}_{i=1}^K$
    \WHILE{not converged and $t$ $\leq$ $\mathit{maxiters}$}
        \STATE // {\em E-Step}
        \STATE Compute the component weight matrix with weights $\gamma_{ij}$ using Eq.~\eqref{eq:weightmat}
        % static eqn no 7
        \STATE // {\em M-Step}
        \FOR{$j$ $=$ 1 through $K$}
            \STATE // {\em Perform gradient descent}
            \STATE Compute the MAP assignment and MAP objective $M_j$ to all non-evidence variables given weights $\Theta^j$\\
            \FOR{$\theta_k^j$}
                \STATE Compute the expected value of the $k$-th formula in the MAP assignment\\
                \STATE Update $\theta_k^j$ using the gradient in Eq.~\eqref{eq:gradient}\\
                % static eqn no 9
            \ENDFOR
        \ENDFOR 
        \FOR{$j$ $=$ 1 through $K$}
            \STATE Update the mixture coefficients $\alpha_j^{(t)}$\\
            %$=$ $(\alpha_i^{(t-1)}+\hat{M}_j)/2$, where $\hat{M}_j$ is the MAP objective value normalized over all components.
            \ENDFOR
    \ENDWHILE
\end{algorithmic}
\end{algorithm}
% \normalsize



\begin{algorithm}[h]
 \small
\caption{Marginal Inference}
\label{alg:ginfer}
% \linesnumbered

\textbf{Input}: evidence $\hat{\bf x}$, non-evidence $\hat{\bf y}$, test representation $\hat{\Phi}$, probabilistic classifier $\mathcal{C}$, HMLN parameters $\{\Theta^i\}_{i=1}^K$\\
\textbf{Output}: Marginal probabilities for $\hat{\bf y}$\\

\begin{algorithmic}[1]
    \STATE Initialize $\hat{\bf y}^{(0)}$ to a random state
    \WHILE{Not converged}
        \STATE Select component HMLN $\Theta^j$ to sample with probability $\alpha_j$
        \STATE Compute the DR for each grounding using $\mathcal{C}$
        \STATE Reparameterize the $j$-th HMLN with the DRs using Eq.~\eqref{eq:mlnreparam3}
        % static eqn no 15
        \STATE {\em Gibbs sampling steps}
        \FOR{$y\in\hat{\bf y}$}
        \STATE From the reparameterized HMLN, sample a single non-evidence variable $y$ using the Gibbs kernel
        \IF{burn-in complete}
            \STATE Update marginal estimates for $\hat{\bf y}$ using the estimator in Eq.~\eqref{eq:margest}
            % static eqn no 2
        \ENDIF
        \ENDFOR
    \ENDWHILE
    \STATE return  marginal estimates for $\hat{\bf y}$
\end{algorithmic}
\end{algorithm}
% \normalsize

\section{More Results}
Tables~\ref{tab:appres1}--\ref{tab:appres4} show additional results comparing the log-likelihoods of the different components while learning the Mix-HMLN. These results are for inference on the Cora and Citeseer datasets, both on the original graphs and on the covariate shifted graphs. Fig.~\ref{fig:rel-diag-components} shows the reliability diagrams for the calibrated neural network which we use to compute the importance weights.

\begin{table*}
    \centering
    \caption{Conditional Log-Likelihood on the original Cora graphs for the individual HMLN components.}
    \scalebox{1}{
    \begin{tabular}{c c c c c }
         \hline
         \multirow{2}{*}{\textbf{Experiments}} & \multicolumn{4}{c}{\textbf{Cora}}\\
         \cline{2-5}
         & \textbf{CLL} & \textbf{\makecell{Avg. \\Marginals}} & \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}} \\
         \hline
         \hline
         HMLN comp 1 & -0.21$\pm$0.08 & 0.83 & 55.43$\pm$3.35 & 49.58$\pm$1.5 \\
         HMLN comp 2 & -0.34$\pm$0.04 & 0.71 & 56.37$\pm$2.13 & 50.24$\pm$2.6 \\
         HMLN comp 3 & -0.33$\pm$0.05 & 0.71 & 55.22$\pm$1.05 & 49.87$\pm$1.9 \\
         HMLN comp 4 & -0.27$\pm$0.08 & 0.78 & 60.28$\pm$2.1 & 52.4$\pm$2.05\\
         HMLN comp 5 & -0.39$\pm$0.12 & 0.87 & 69.53$\pm$2.3 & 51.2$\pm$1.7 \\
          HMLN comp 6 & -0.26$\pm$0.08 & 0.71 & 64.75$\pm$3.88 & 51.85$\pm$2.19 \\
         HMLN comp 7 & -0.28$\pm$0.08 & 0.70 & 59.8$\pm$1.88 & 52.16$\pm$3.27 \\
         HMLN comp 8 & -0.35$\pm$0.08 & 0.75 & 60.54$\pm$3.73 & 52.62$\pm$1.59 \\
         HMLN comp 9 & -0.27$\pm$0.04 & 0.75 & 57.48$\pm$3.46 & 51.74$\pm$1.34\\
         HMLN comp 10 & -0.30$\pm$0.04 & 0.74 & 59.97$\pm$1.6 & 52.72$\pm$1.79 \\
         \hline
    \end{tabular}
    }
    \label{tab:appres1}
\end{table*}

\begin{table*}
    \centering
    \caption{Conditional Log-Likelihood on the original Citeseer graphs for the individual HMLN components.}
    \scalebox{1}{
    \begin{tabular}{c c c c c }
         \hline
         \multirow{2}{*}{\textbf{Experiments}} & \multicolumn{4}{c}{\textbf{Citeseer}}\\
         \cline{2-5}
         & \textbf{CLL} & \textbf{\makecell{Avg. \\Marginals}} & \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}} \\
         \hline
         \hline
         HMLN comp 1 & -0.10$\pm$0.05 & 0.81 & 59.95$\pm$2.10 & 47.27$\pm$1.3 \\
         HMLN comp 2 & -0.16$\pm$0.04 & 0.77 & 59.32$\pm$2.23 & 44.3$\pm$1.6 \\
         HMLN comp 3 & -0.12$\pm$0.05 & 0.80 & 57.97$\pm$2.19 & 49.11$\pm$1.1 \\
         HMLN comp 4 &-0.12$\pm$0.04 & 0.80 & 58.45$\pm$2.02 & 49.2$\pm$2.1\\
         HMLN comp 5 & -0.16$\pm$0.07 & 0.76 & 61.33$\pm$0.79 & 47.1$\pm$2.1 \\
         HMLN comp 6 & -0.17$\pm$0.06 & 0.80 & 58.80$\pm$1.34 & 48.51$\pm$1.6 \\
         HMLN comp 7 & -0.13$\pm$0.05 & 0.78 & 61.56$\pm$2.30 & 48.86$\pm$1.8 \\
         HMLN comp 8 & -0.12$\pm$0.07 & 0.78 & 60.35$\pm$2.23 & 48.66$\pm$1.9 \\
         HMLN comp 9 &-0.16$\pm$0.07 & 0.79 & 59.87$\pm$1.47 & 49.81$\pm$1.4\\
         HMLN comp 10 & -0.15$\pm$0.05 & 0.76 & 58.84$\pm$1.21 & 49.44$\pm$1.3 \\
         \hline
    \end{tabular}
    }
    \label{tab:appres2}
\end{table*}

\begin{table*}
    \centering
    \caption{Conditional Log-Likelihood on the covariate shifted Cora graph for the individual HMLN components with reparameterization.}
    \scalebox{1}{
    \begin{tabular}{c c c c c }
         \hline
         \multirow{2}{*}{\textbf{Experiments}} & \multicolumn{4}{c}{\textbf{Noisy Cora}}\\
         \cline{2-5}
         & \textbf{CLL} & \textbf{\makecell{Avg. \\Marginals}} & \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}} \\
         \hline
         \hline
         HMLN comp 1 & -0.166$\pm$0.07 & 0.87 & 58.92$\pm$1.79 & 48.98$\pm$2.1 \\
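         HMLN comp 2 & -0.19$\pm$0.6 & 0.80 & 56.67$\pm$2.23 & 49.65$\pm$1.8 \\ % NOTE(review): the CLL deviation 0.6 is ~10x larger than every other row (0.03--0.07); possibly a typo for 0.06 -- confirm against the raw results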
         HMLN comp 3 & -0.18$\pm$0.07 &0.79 & 59.72$\pm$2.19 & 48.5$\pm$2.3 \\
         HMLN comp 4 & -0.17$\pm$0.06 & 0.81 & 60.15$\pm$1.88 & 52.2$\pm$1.6\\
         HMLN comp 5 & -0.19$\pm$0.04 & 0.75 & 63.22$\pm$1.24 & 50.21$\pm$1.5 \\
         HMLN comp 6 & -0.16$\pm$0.03 & 0.73 & 60.20$\pm$1.37 & 50.92$\pm$1.4 \\
         HMLN comp 7 & -0.15$\pm$0.04 & 0.79 & 63.47$\pm$2.09 & 49.63$\pm$2.1 \\
         HMLN comp 8 & -0.18$\pm$0.04 & 0.78 & 60.32$\pm$1.30 & 48.62$\pm$1.2 \\
         HMLN comp 9 & -0.19$\pm$0.03 & 0.73 & 61.05$\pm$2.06 & 49.03$\pm$1.9\\
         HMLN comp 10 & -0.16$\pm$0.06 & 0.80 & 61.11$\pm$2.28 & 48.17$\pm$1.7 \\
         \hline
    \end{tabular}
    }
    \label{tab:appres3}
\end{table*}

\begin{table*}
    \centering
    \caption{Conditional Log-Likelihood on the covariate shifted Citeseer graph for the individual HMLN components with reparameterization.}
    \scalebox{1}{
    \begin{tabular}{c c c c c }
         \hline
         \multirow{2}{*}{\textbf{Experiments}} & \multicolumn{4}{c}{\textbf{Noisy Citeseer}}\\
         \cline{2-5}
         & \textbf{CLL} & \textbf{\makecell{Avg. \\Marginals}} & \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}} \\
         \hline
         \hline
         HMLN comp 1 & -0.29$\pm$0.08 & 0.75 & 64.31$\pm$2.14 & 47.14$\pm$1.6 \\
         HMLN comp 2 & -0.30$\pm$0.06 & 0.71 & 62.34$\pm$2.23 & 45.46$\pm$1.6 \\
         HMLN comp 3 & -0.33$\pm$0.07 & 0.72 & 61.34$\pm$2.19 & 42.18$\pm$1.1 \\
         HMLN comp 4 & -0.28$\pm$0.04 & 0.72 & 62.13$\pm$1.93 & 48.3$\pm$2.1\\
         HMLN comp 5 & -0.30$\pm$0.06 & 0.72 & 61.72$\pm$1.39 & 48.44$\pm$3.1 \\
         HMLN comp 6 & -0.27$\pm$0.04 & 0.74 & 64.80$\pm$1.84 & 44.24$\pm$2.1 \\
         HMLN comp 7 & -0.31$\pm$0.03 & 0.75 & 64.49$\pm$1.37 & 45.19$\pm$2.1 \\
         HMLN comp 8 & -0.28$\pm$0.03 & 0.70 & 60.53$\pm$1.55 & 46.71$\pm$3.4 \\
         HMLN comp 9 & -0.28$\pm$0.07 & 0.70 & 61.49$\pm$1.59 & 46.04$\pm$2.8\\
         HMLN comp 10 & -0.33$\pm$0.05 & 0.75 & 60.16$\pm$2.36 & 46.51$\pm$2.7 \\
         \hline
    \end{tabular}
    }
    \label{tab:appres4}
\end{table*}

%\newpage
\begin{figure*}
    \centering
    \scalebox{0.9}{
        \includegraphics{exfigs/rel-diag-components}
    }
    \caption{Reliability diagram for calibrating a few of the HMLN components for MixHMLNs on the covariate shifted Cora dataset. The figures on the left are before temperature scaling and the figures on the right are after temperature scaling.}
    \label{fig:rel-diag-components}
\end{figure*}