\begin{table}
    \centering
    \caption{HMLN structures.}\label{tab:hmlns}
    % \tabcolsep=0.15cm
    \scalebox{0.74}{
    \begin{tabular}{|c|c|}
        \hline
        \multirow{2}{*}{GNN} & ${\tt Class}(+x_1,+c)$ $\wedge$ ${\tt Cites}(+x_1,x_2)$ $\Rightarrow$ ${\tt Class}(x_2,+c)$\\
        & \colorbox{gray!25}{${\tt Dist}(+e_{x_1},e_{x_2})<\tau * ({\tt Class}(+x_1,+c)\Leftrightarrow{\tt Class}(x_2,+c))$}\\
         \hline
         \multirow{2}{*}{DKT}  & $({\tt Correct}(+s,p_1)\wedge{\tt PreRequisite}(p_1,p_2) \Rightarrow{\tt Correct}(+s,p_2))$\\
          & \colorbox{gray!25}{${\tt Dist}(+e_{s_1},e_{s_2})<\tau*({\tt Correct}(+s_1,p)\Leftrightarrow{\tt Correct}(s_2,p))$}\\
        \hline
    \end{tabular}
    }
    
\end{table}






\section{Experiments}

\begin{table*}
    \centering
    \caption{The top table shows results when the train and test embeddings are generated using GCNs based on the original graph benchmarks. The bottom table shows results for benchmarks when we introduce covariate shift in the test embeddings. Time shown indicates time taken for inference. Results (mean and standard deviation over 5 runs of the experiment) are shown for 1000 test nodes in the benchmarks. To compute accuracy for a test node, we select the class that has the maximum marginal probability corresponding to that node.}\label{tab:results1}
    \scalebox{1}{
    \begin{tabular}{|c| c| c| c| c| c| c| }
         \hline
          \multirow{2}{*}{\textbf{Experiments}} & \multicolumn{3}{|c|}{\textbf{Cora}} & \multicolumn{3}{|c|}{\textbf{Citeseer}} \\
         \cline{2-7}
         & \textbf{CLL}& \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}}  & \textbf{CLL} & \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}}  \\
         \hline
         \hline
         DeepStochLog & -0.74$\pm$0.2 & 72.8$\pm$1.2 & 155.61$\pm$6.26 & -1.2$\pm$0.1 & 65.5$\pm$2.8 & 83$\pm$11.42 \\
         PSL & -0.64$\pm$0.11 & 72.84$\pm$2.15 & \textbf{4.56}$\pm$0.29  & -0.61$\pm$0.10 & 51.9$\pm$1.2 & 4.78$\pm$1.24\\
         NeuPSL & -0.28$\pm$0.06 & \textbf{81.2}$\pm$0.76 & 6.5$\pm$0.4 & -0.34$\pm$0.08 & \textbf{68.48}$\pm$1.14 & \textbf{3.71}$\pm$0.08\\
         \hline
        HMLN & {-0.23}$\pm$0.08 & 65.43$\pm$3.35 & 49.58$\pm$1.5 & -0.11$\pm$0.05 & 59.95$\pm$2.10 & 47.27$\pm$1.3 \\
         MIX-HMLN & \textbf{-0.21}$\pm$0.08 & 79.37$\pm$1.15 & 151$\pm$2.1 & \textbf{-0.09}$\pm$0.06 & 65.14$\pm$1.8 & 146.1$\pm$3.3\\
         
         \hline
         \multirow{2}{*}{\textbf{}} & \multicolumn{3}{|c|}{\textbf{Cora (covariate shifted)}} & \multicolumn{3}{|c|}{\textbf{Citeseer (covariate shifted)}}\\
         \cline{2-7}
         & \textbf{CLL}& \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}}  & \textbf{CLL} & \textbf{Accuracy (\%)} & \textbf{\makecell{Time \\(secs)}}  \\
         \hline
         \hline
         DeepStochLog & -0.64$\pm$0.06 & 49.3$\pm$0.12 & 181.26$\pm$8.73 & -0.71$\pm$0.04 & 28.8$\pm$0.5 & 74.4$\pm$8.78 \\
         PSL & -0.97$\pm$0.04 & 65.92$\pm$1.98 & \textbf{4.05}$\pm$0.21  & -1.10$\pm$0.04 & 40.14$\pm$1.31 & \textbf{3.38}$\pm$0.27\\
         NeuPSL & -0.62$\pm$0.02 & 66.54$\pm$1.73 & 7.37$\pm$0.19 & -0.55$\pm$0.13 & 36.2$\pm$2.68 & 5.64$\pm$0.24\\
         \hline
         HMLN & -0.29$\pm$0.11 & 58.67$\pm$3.23 & 50.16$\pm$1.6  & -0.21$\pm$0.09 & 53.52$\pm$3.88 & 48.16$\pm$1.5 \\
         HMLN(DR) & -0.166$\pm$0.07 & 58.92$\pm$1.79 & 48.98$\pm$2.1  & -0.29$\pm$0.08 & 64.31$\pm$2.14 & 47.14$\pm$1.6 \\
         MIX-HMLN & \textbf{-0.149}$\pm$0.08 & \textbf{77.028}$\pm$1.12 & 178.5$\pm$3.15 & \textbf{-0.11}$\pm$0.09 & \textbf{64.93}$\pm$2.21  & 165.65$\pm$2.73\\
         \hline
    \end{tabular}
    } 
\end{table*}

\subsection{Implementation}
\paragraph{HMLNs.} 
We used Gurobi to implement the voted perceptron parameter learning for HMLNs. We ran at most 100 iterations (or until convergence) to learn the mixture model. In each gradient step, we used a learning rate of $0.01$. For the HMLNs, we used an approach similar to the one used in MLNs to learn multiple parameters for a formula instead of a single weight for all groundings, since a single shared weight limits the type of distributions that we can represent. Specifically, in MLNs, a variable marked with a ``+'' sign indicates that we learn a separate weight for each grounding of that variable. Thus, we can control the number of groundings with shared weights, and in our case, we use a single ``+'' variable in each of our HMLN formulas. We assumed HMLN semantics for real-valued features over embeddings. Specifically, to represent the soft inequality, $\alpha < t$, we use the log-sigmoid function, $-\log(1+e^{(\alpha-t)})$. 
% \noindent 
\paragraph{Probabilistic Classifier.} 
For the probabilistic classifier, we used a calibrated neural network to estimate the DR. Specifically, we trained a single-hidden-layer network to distinguish between training and test embeddings with a cross-entropy loss function. To improve calibration, we applied temperature scaling~\citep{pmlr-v70-guo17a} to the output probabilities of the classifier. 
\paragraph{Gibbs Sampling.} 
For inference, we implemented the mixture of Markov chains using 10 parallel Markov chains and a total of 100K samples. We determined convergence using the PSRF (potential scale reduction factor) of the Gelman-Rubin diagnostic~\citep{gelman1992inference}. PSRF is a standard diagnostic method to assess whether the sampler has converged by comparing the within-chain variance to the variance across parallel MCMC chains. We used a burn-in period of 5K samples, after which we used the samples to compute the marginal probabilities.

We implemented our approach on Google Cloud with a Tesla T4 GPU (16GB). Our implementation is publicly available.\footnote{https://github.com/anupshakya07/uquant}

\subsection{Graph Embeddings}

We used two benchmark datasets, Cora and Citeseer, where the DNN learns embeddings to perform node classification. The HMLN structure is shown in Table~\ref{tab:hmlns}. The first formula specifies the homophily relation, i.e., nodes that are connected in the graph have the same class. The second formula specifies that embeddings whose distance is smaller than $\tau$ (with $\tau=0.2$) have similar classes. The soft inequality function is used to specify the real-valued feature. Since the hybrid formula has $n^2$ groundings, we reduce this number by eliminating groundings where the distance between embeddings is large, because such groundings have a very small value.

To learn the mixture model, we need to generate variants of embeddings. To do this, we use the dropout mechanism~\citep{NIPS2017_84ddfb34}. Specifically, as shown by~\citet{gal2016dropout}, dropout is a Bayesian approximation of a Gaussian process. Thus, using dropout, we can sample from the family of embeddings generated by the DNN. To learn the mixture model, we set the initial dropout value to 0.1 and increment it in steps of 0.05. For each value of dropout, we learn a different embedding, and we use 10 components in our mixture model.

\begin{table*}
    \centering
    \caption{Ablation Study. We vary the train and test models used to learn the embeddings. For the results on GCN (Test), the GAT model is used for training the HMLN and for GAT (Test), the GCN is used in training. We show the mean and standard deviation over 5 runs.}\label{tab:ablation}
    \scalebox{0.78}{
    \begin{tabular}{|c|c|c|c|c|c|c|c|c| }
         \hline
          \multirow{3}{*}{\textbf{Method}} & \multicolumn{4}{|c|}{\textbf{Cora}} & \multicolumn{4}{|c|}{\textbf{Citeseer}}\\
         \cline{2-9}
         & \multicolumn{2}{|c|}{\textbf{GCN (Test)}} & \multicolumn{2}{|c|}{\textbf{GAT (Test)}} & \multicolumn{2}{|c|}{\textbf{GCN (Test)}} & \multicolumn{2}{|c|}{\textbf{GAT (Test)}} \\
         \cline{2-9}
         & \textbf{CLL}& \textbf{Accuracy (\%)}  & \textbf{CLL} & \textbf{Accuracy (\%)} & \textbf{CLL}& \textbf{Accuracy (\%)}  & \textbf{CLL} & \textbf{Accuracy (\%)}  \\
         \hline
         \hline
         HMLN & -0.28$\pm$0.18 & 61.55$\pm$1.26 & -0.35$\pm$0.13 & 74.53$\pm$1.13 & -0.20$\pm$0.15 & 57.26$\pm$2.19 & -0.14$\pm$0.16 & 75.51$\pm$0.1 \\
         HMLN $+$ MIX & -0.22$\pm$0.09 & 65.34$\pm$1.26 & -0.28$\pm$0.11 & 68.35$\pm$0.86 & -0.15$\pm$0.09 & 62.86$\pm$1.06 & -0.12$\pm$0.9 & 73.54$\pm$2.06 \\
         HMLN $+$ MIX $+$ DR & \textbf{-0.18}$\pm$0.05 & \textbf{81.27}$\pm$0.29 & \textbf{-0.2}$\pm$0.06 & \textbf{78.9}$\pm$0.37 & \textbf{-0.14}$\pm$0.03 & \textbf{64.23}$\pm$0.89 & \textbf{-0.09}$\pm$0.05 & \textbf{76.29}$\pm$0.83 \\
         \hline
        
    \end{tabular}
    }
\end{table*}


We compare our approach with DeepStochLog~\citep{deepstoch}, Probabilistic Soft Logic (PSL)~\citep{bach_psl17} and NeuPSL~\citep{ijcai2023p0461}. PSL is a purely statistical relational model, while DeepStochLog and NeuPSL are both state-of-the-art extensions of well-known statistical relational models (ProbLog and PSL, respectively) to incorporate DNN representations.
In each case, we compare the conditional log-likelihood (CLL) scores on the set of test nodes. We perform marginal inference on test nodes and, similar to prior approaches, approximate the test CLL as the log average of the marginals computed over the test nodes. We run 5 experiments and compute the mean and standard deviation of the test CLL estimates. We also compare three variants of our approach: (i) HMLN, where we learn a single HMLN model; (ii) HMLN(DR), where we learn a single HMLN model but perform reparameterization during inference; and (iii) MIX-HMLN, where we learn a mixture of HMLNs and perform reparameterization during inference.



In the first set of experiments, we learn the HMLN using embeddings from Graph Convolutional Networks (GCNs)~\citep{kipf2017semisupervised}. The homophily property that the HMLN encodes is aligned with the representation learned by GCNs~\citep{ma2022is}. Our results showing the CLL on test nodes are reported in Table~\ref{tab:results1} (top). MIX-HMLN has the largest CLL compared to the other approaches. This indicates that MIX-HMLN has the smallest uncertainty in inferring the alignment between the embeddings and the relational property. NeuPSL has a slightly higher accuracy in labeling the test nodes; however, it also has a higher uncertainty (as indicated by the smaller CLL).

Next, we introduce covariate shifts in the test embeddings using an approach similar to \citet{pmlr-v206-alchihabi23a}. Specifically, we remove edges and add Gaussian noise to the node features to create a {\em perturbed} graph. We learn embeddings (with covariate shift) by training the GCN on the perturbed graph. The GCN accuracy of test node classification on the perturbed graph is similar to the accuracy on the original graph (the difference in accuracy was less than $1\%$). Thus, the embeddings from the perturbed graph encode the same relationships as the embeddings on the original graph.

Our results are shown in Table~\ref{tab:results1} (bottom). As shown here, MIX-HMLN outperforms all the other methods in CLL scores by significantly higher margins in this case since we account for the covariate shift during inference. As shown in the table, our approach, which is a mixture model, takes longer to run than the other single-model approaches. At the same time, the mixture reduces uncertainty, which is important in the context of learning using embeddings since deep models that generate these embeddings may converge to distinct local minima. Thus, we trade off efficiency for lower uncertainty (evidenced by the higher CLL scores). In the future, we plan to develop more efficient inference methods based on activating specific components in the mixture, similar to approaches used in mixture of experts.

\paragraph{Ablation Study.}
Here, we introduce covariate shift by modifying the DNN architecture during inference. Specifically, we use GCNs and Graph Attention Networks (GATs)~\citep{velickovic&al18} where one of them generates training embeddings and the other one generates test embeddings. Table~\ref{tab:ablation} shows our results for the three variants of our approach. As seen here, the CLL consistently increases when we add the mixture model learning and further increases when we add the reparameterization in each case.

\subsection{Deep Knowledge Tracing}

Deep Knowledge Tracing (DKT)~\citep{NIPS2015_bac9162b} uses DNNs to learn dense embeddings representing student skills over latent concepts. Specifically, DKT is a Sequence2Sequence model trained over observations that simulate exercises of varying difficulty that students work on.
To train the model, the exercises are generated using Item Response Theory (IRT)~\citep{irt1990} (details in Appendix~\ref{apendix:dkt}). 
The HMLN for this task shown in Table~\ref{tab:hmlns} represents {\em pre-requisite relational structure}, i.e., the first formula specifies that if a student gets an exercise correct then they have acquired skills corresponding to pre-requisites associated with that exercise. The second formula is a hybrid formula that relates student performance to their DKT embeddings.
To learn the mixture model, we learn DKT embeddings over multiple problem orderings but we maintain the same pre-requisite structure over problems in each ordering. Specifically, if $X$ is pre-requisite to $Y$, then $X$ must appear before $Y$ in the problem ordering. We used a total of 5 different orderings to learn the mixture model. 

\begin{table}
    \centering
    \caption{$(p,n,c)$ denotes that the IRT model uses $p$ problems, $n \times 1000$ students, and $c$ latent concepts to generate the training data. Results show the F1-score for predictions made on 200 test problems for all the students in the training data.}\label{tab:dkt_result}
    % \tabcolsep=0.15cm
    \resizebox{0.47\textwidth}{!}{
    \begin{tabular}{c c c c}
        \hline
        {\bf Training Dataset} & {\bf DKT} & {\bf HMLN} & {\bf MIX-HMLN}\\
       \hline
       \hline
       $(50,1,2)$ & \textbf{0.80} & 0.7 & \textbf{0.80} \\
       $(75,2,3)$ & 0.76 & 0.67 & \textbf{0.78}\\
       $(100,4,5)$ & 0.74 & 0.57 & \textbf{0.78} \\
       $(200,5,5)$ & 0.62 & 0.54 & \textbf{0.72} \\
       $(400,8,10)$ & 0.71 & 0.56 & \textbf{0.74} \\
       
       \hline
    \end{tabular}
    }
\end{table}

For training embeddings, we use data with problem difficulty values (encoded in the IRT model parameters) ranging from 1 to 3. For the test embeddings, we use difficulty values from 4 to 5. Thus, we simulate the condition when the students work on new exercises of increasing difficulty. This induces a covariate shift in the DKT embeddings learned for test data. We compute the marginal probability over the atoms ${\tt Correct}(S,P)$, where $S$ and $P$ represent the student and problem, respectively. Table~\ref{tab:dkt_result} compares the F1-scores of predicting student performance (we threshold the probability at 0.5) with the original DKT model over all students for all the test problems. As seen here, MIX-HMLN matches or outperforms DKT across all settings, with the two tied only on the smallest dataset. This illustrates that utilizing pre-requisite relational structure combined with reparameterization in the presence of covariate shift improves generalization as the number of problems, latent concepts, and students in the data grows larger.


