\documentclass[accepted]{uai2022} 

\usepackage[american]{babel}
% \usepackage[british]{babel}


\usepackage{natbib} 
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
% \usepackage{siunitx} 
\usepackage{booktabs} 
\usepackage{tikz}
\usepackage{multirow}

\usepackage{graphicx}
% \graphicspath{{figures/}}
\usepackage{subcaption}
\captionsetup{subrefformat=parens}
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

\usepackage{bm}
\usepackage[inline=true, margin=false]{fixme}

\usepackage{xr}
\externaldocument{huang_696-supp}

% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\title{A Mutually Exciting Latent Space Hawkes Process Model\\
for Continuous-time Networks}


\author[1]{\href{mailto:<zhipeng.huang@rockets.utoledo.edu>?Subject=Your UAI 2022 paper}{Zhipeng Huang}{}}
\author[1]{Hadeel Soliman}
\author[2]{Subhadeep Paul}
\author[1]{Kevin S. Xu}

% Add affiliations after the authors
\affil[1]{%
    Department of Electrical Engineering and Computer Science\\
    University of Toledo\\
    Toledo, OH, USA
}
\affil[2]{%
    Department of Statistics\\
    The Ohio State University\\
    Columbus, OH, USA
}

  
  \begin{document}
\maketitle

\begin{abstract}
Networks and temporal point processes serve as fundamental building blocks for modeling complex dynamic relational data in various domains. 
We propose the \emph{latent space Hawkes (LSH)} model, a novel generative model for continuous-time networks of relational events, using a latent space representation for nodes. 
We model relational events between nodes using mutually exciting Hawkes processes with baseline intensities dependent upon the distances between the nodes in the latent space and sender and receiver specific effects. 
We demonstrate that our proposed LSH model can replicate many features observed in real temporal networks including reciprocity and transitivity,  while also achieving superior prediction accuracy and providing more interpretable fits than existing models.
\end{abstract}

\section{Introduction}
Dynamic networks are used to represent time-varying relationships (edges) between a set of nodes. 
They are useful in a variety of application settings, including 
messages between users on online social networks and transactions between users on online marketplaces. 
In such settings, the network typically evolves over time through a set of \emph{timestamped relational events}. Each event is a triplet $(u,v,t)$ denoting that node $u$ initiated an interaction with node $v$ (e.g.~$u$ sent a message to $v$) at timestamp $t$. 
We refer to this type of dynamic network as a \emph{continuous-time network} because it is continuously evolving through these relational events.

A topic of much recent interest is identifying latent representations for nodes in networks. 
These latent representations are often referred to as node embeddings, and node embedding-based approaches for common network analysis tasks including link prediction have gained significant attention in recent years \citep{grover2016node2vec, goyal2018graph, cui2018survey}. 
Prior to this surge of interest, latent space models have been used in statistics and mathematical sociology for exploratory analysis of networks \citep{hoff2002latent, hoff2005bilinear, hoff2007modeling, handcock2007model, krivitsky2009representing}. 
Latent representations have also been developed for dynamic networks evolving over discrete time steps  \citep{sewell2015latent} or in continuous time \citep{nguyen2018continuous}. 

Latent space representations can be combined with temporal point processes (TPPs) to form a probabilistic generative model for continuous-time networks, which we consider in this paper. 
Augmenting the latent representation with a TPP enables one to generate timestamps for the edges between nodes. 
\citet{yang2017decoupling} proposed the dual latent space (DLS) generative model that combines two types of latent spaces with bivariate Hawkes processes.
They found that using two types of latent spaces, one to capture homophily and one to capture reciprocity, provides a richer model that also leads to improved link prediction accuracy. 
However, much of the interpretability of the latent space, which was the original motivation of the latent space model of \citet{hoff2002latent}, is lost by using multiple high-dimensional latent spaces. 
Furthermore, the DLS model has issues with stability of the generative process due to the multiple latent spaces. 
It also uses only reciprocal excitation and not self excitation. 
Self excitation is important in application settings such as modeling text messages, where person $u$ may send multiple messages to $v$ in rapid succession before $v$ responds.

In this paper, we consider using a single latent space representation to provide a more interpretable model. 
The single latent space limits the flexibility of the model compared to the DLS, so we increase flexibility by adding self excitation and sender and receiver effects.
We demonstrate that our proposed latent space Hawkes (LSH) model is competitive with other models in predictive and generative tasks on 4 real network datasets while providing more interpretable and stable model fits.
Furthermore, we apply our LSH model to perform exploratory analysis on a dataset of militarized disputes to reveal network structure between countries.

\section{Background}


\subsection{Hawkes Processes}
\label{sec:background_hawkes}

The Hawkes process model was introduced for temporal point processes by \citet{hawkes1971spectra}. The defining characteristic of a Hawkes process is that it is self exciting, meaning that each event increases the rate of future events for some period of time. Mutually exciting Hawkes processes allow events from different processes to excite each other in addition to self excitation \citep{laub2021elements}. 
An $m$-dimensional mutually exciting Hawkes process is characterized by a conditional intensity function for each dimension $i$:
\begin{equation}
\label{eq:1}
\lambda_i^*(t) = \lambda_i (t|\mathcal{H}_t) 
= \mu_i + \sum_{j=1}^m \sum_{k:t_k < t} \phi_{ij}(t-t_k),
\end{equation}
where $\mathcal{H}_t$ denotes the history of the process up to time $t$,  $\mu_i$ denotes the baseline rate of events in dimension $i$, and $\phi_{ij}(\cdot)$ is a kernel function that describes how an event in dimension $j$ influences dimension $i$. 

The most commonly used kernel function is the exponential kernel $\phi(t-t_k)=\alpha \beta e^{-\beta(t-t_k)}$ for $\alpha > 0$ and $\beta > 0$. With each event arrival, the conditional intensity jumps by $\alpha\beta$. The influence of the arrival then exponentially decays at rate $\beta$ over time. In practice, both $\alpha$ and $\beta$ are unknown parameters that need to be estimated from data, which is usually done using maximum likelihood estimation \citep{laub2021elements}. However, estimators for the decay parameter $\beta$ are poorly behaved \citep{santos2021surfacing}, and 
it is more computationally efficient to choose a fixed $\beta$ rather than estimating it \citep{lemonnier2014nonparametric}. 

An approach that is more general than fixing the value of $\beta$ is the sum of exponential kernels method \citep{lemonnier2014nonparametric}, which defines $\phi(t-t_k)= \sum_{b=1}^B \alpha \beta_b e^{-\beta_b(t-t_k)}$, where $B$ denotes the number of exponential kernels. This method generalizes better as it handles different time scales, which makes the modeling less sensitive to the choice of $\beta$. 
We use the sum of exponential kernels decay in this paper.




\subsection{Latent Space Models}
\label{sec:LSMs}

The latent space model (LSM), first proposed by \citet{hoff2002latent}, is a popular model-based approach for social network analysis. 
Designed initially for a single static undirected network, the LSM allows the probability of an edge between two nodes to depend on their Euclidean distance in an unobserved or latent space using a logistic regression model. 
Let $A$ denote the adjacency matrix of a network, with $a_{uv} = 1$ for node pairs $(u,v)$ with an edge and $a_{uv} = 0$ otherwise.
By assuming conditional independence between node pairs, the log-likelihood can be written as
\begin{equation*}
\log P(A|\eta) = \sum_{u < v} \left[ \eta_{uv} a_{uv} - \log(1 + e^{\eta_{uv}}) \right],
\end{equation*}
where entry $\eta_{uv}$ in the matrix $\eta$ denotes the log odds of an edge being formed between nodes $(u,v)$. 
$\eta_{uv}$ is parameterized as follows:
$\eta_{uv} = \xi - \|z_u - z_v\|_2$,
where $z_u$ denotes the latent position of node $u$ in a $d$-dimensional latent space, and $\xi$ is an intercept term. 
Under this parameterization, two nodes with closer latent positions have higher probability of forming an edge.

The latent space model provides a visual and interpretable model-based spatial representation of social relationships. 
It has been extended by many researchers. \citet{handcock2007model} developed a latent position cluster model to capture transitivity, homophily, and community structure simultaneously. The latent space models were later extended to include node-specific random effects by  \citet{krivitsky2009representing}. 
Latent space models have also been extended for more complex network based data structures, including multiple networks \citep{gollini2016joint,salter2017latent}, discrete-time dynamic networks \citep{sewell2015latent,sewell2016latent, friel2016interlocking, gracious2021neural}, and multimodal networks \citep{wang2019joint}.
We use the latent space model as the building block for our proposed continuous-time LSH model.

\subsection{Related Work}

\paragraph{Dynamic Network Embeddings}

One line of related work is focused on node embeddings for dynamic networks. 
Compared to static network embedding methods, dynamic network embedding methods assign nodes low-dimensional representations that effectively preserve the temporal information. \citet{nguyen2018continuous} proposed continuous-time dynamic network embeddings (CTDNE), a general framework to learn a time-respecting embedding from continuous-time dynamic networks. Their framework acts as a basis for incorporating temporal dependencies into existing node embedding and deep graph models based on random walks. 
Other approaches for dynamic network embedding have also been proposed \citep{chen2018scalable, sankar2018dynamic, goyal2020dyngraph2vec}, many of which are discussed in a recent survey on dynamic network embedding \citep{xie2020survey}.


\paragraph{TPP-based Network Models}
TPP-based network models are generative models for continuous-time dynamic networks that incorporate both a generative process for the nodes $(u,v)$ that form an edge and the time $t$ at which an edge is formed. 
These timestamped edges or events can be viewed as triplets $(u,v,t)$. 
Many TPP-based network models utilize a discrete latent variable representation for the nodes \citep{Blundell2012,Dubois2013,miscouridou2018modelling,junuthula2019block,arastuie2020chip, soliman2022multivariate}, dividing them into different blocks or communities.

The most closely related work to this paper is the dual latent space (DLS) model \citep{yang2017decoupling}, which also utilizes a continuous latent variable representation inspired by the latent space model. 
The DLS model uses bivariate Hawkes processes to capture the homophily and reciprocity of dynamic networks. 
They observed that the latent dimensions of users which affect link formation may be different from the latent dimensions of users which affect reciprocity. 
We discuss shortcomings of the DLS model and its relation to our proposed model in Section \ref{sec:relation_DLS}.

Another TPP-based network model using a continuous latent space is proposed by \citet{rastelli2021continuous}. 
It assumes that the latent positions of nodes may change at a set of predefined change points rather than being fixed over time.

\paragraph{Other Continuous-time Network Models}
Earlier continuous-time network models were proposed by \citet{wasserman1980analyzing, wasserman1980stochastic}, who modeled the evolution of network data using continuous-time Markov chains. 
Later on, \citet{snijders2005models, snijders2017modeling} proposed a set of network models that offers more flexibility to represent a variety of network effects, such as transitivity, reciprocity, etc. \citet{fan2012learning} explored the inference for these models and proposed a sampling-based learning algorithm for continuous-time social network models.

\section{Proposed Model}

In our model, we employ a latent space to learn hidden node attributes underlying the network and mutually exciting Hawkes processes to capture the temporal dynamics of communication. We model the communications between each pair of nodes as realizations from a bivariate Hawkes process whose conditional intensity function $\lambda_{uv}(t|\mathcal{H}_t)$ includes three components: a baseline rate, a self-exciting term, and a reciprocal term. 

Let $z_u$ and $z_v$ denote the latent positions for nodes $u$ and $v$, respectively. We model baseline rate $\mu_{uv}$ as a function of Euclidean distances between $z_u$ and $z_v$. \citet{gollini2016joint} showed that  squared Euclidean distances are computationally more efficient than  Euclidean distances yet resulted in similar latent positions. Thus, we use squared Euclidean distances $||z_u - z_v||_2^2$ in the model for $\mu_{uv}$, similar to DLS \citep{yang2017decoupling}. 
We further add sender and receiver node effect terms $\delta_u, \gamma_v$ to the model as in \cite{hoff2005bilinear,krivitsky2009representing,wang2019joint} to capture the degree heterogeneity, namely the tendency of some nodes to send and receive events more than others, respectively. 

A Hawkes process with exponential kernel has been found to be a good model for conversation event sequences as well as other relational temporal event data \citep{masuda2013self}. 
We use a sum of $B$ exponential kernels in our Hawkes processes. 
We set $\beta = (\beta_1, \beta_2, \ldots, \beta_B)$ as a set of fixed known decays and $C = (C_1, C_2, \ldots,C_B)$ as a set of scaling parameters for the kernel with $\sum_{b=1}^B C_b = 1$. 
The conditional intensity function can be written as follows:
\begin{equation} \label{eq:lsh_lambda}
\begin{split}
\lambda_{uv}^*(t) &= \mu_{uv} +   \sum_{t_{uv} < t}\sum_b^B C_b \alpha_1 \beta_b e^{-\beta_b(t-t_{uv})} \\
&+   \sum_{t_{vu} < t}\sum_b^B C_b \alpha_2 \beta_b e^{-\beta_b(t-t_{vu})}, \quad \forall u \neq v
\end{split}
\end{equation}
where the baseline rate $\mu_{uv}$ is given by
\begin{equation}
\label{eq:baseline}
\mu_{uv} = e^{-\theta_1||z_u - z_v||^2_2 + \theta_2 + \delta_u + \gamma_v}.  
\end{equation}

\subsection{Model Parameters}
The LSH model has parameters ($Z$, $\alpha_1$, $\alpha_2$, $\theta_1$, $\theta_2$, $\delta$, $\gamma$). Each node has a $d$-dimensional latent position $z_u$, a sender propensity parameter $\delta_u$ and a receiver propensity parameter $\gamma_u$. $\alpha_1$ and $\alpha_2$ are the jump size parameters for self-excitation and reciprocal-excitation. $Z$ is an $n \times d$ matrix where each row is a latent position vector $z_u$ for a node, and $d$ is the latent dimension. Each of $\delta$ and $\gamma$ is a vector of size $n$. 
$\theta_1$ and $\theta_2$ are slope and intercept terms, respectively, associated with the baseline rate and latent positions. A positive slope $\theta_1$ provides node pairs closer together in the latent space with a higher probability of forming edges, while a negative slope does the reverse.


\paragraph{Identifiability}
There are two sets of identifiability problems that need to be discussed. From the observed event times, the Hawkes process parameters $\mu_{uv}, \alpha_1, \alpha_2$ can be identified as shown by \citet{ozaki1979maximum}. With the baseline intensity parameter $\mu_{uv}$ correctly identified, we explore the identifiability of the parameters in the model for $\mu_{uv}$. The identifiability of parameters in the latent space model has been discussed by \cite{ma2020universal} for a single network and by \cite{zhang2020flexible} for multilayer networks. 

Let $1_n$ denote the $n$-dimensional vector of all 1's and $J_n=1_n 1_n^T$ the $n\times n$ matrix whose elements are all 1's. We first note that the magnitude of the parameter $\theta_1$ is not identifiable since it enters the equation for $\mu$ as a product with $\|z_u - z_v\|^2_2$. 
However, the sign of $\theta_1$ is identifiable since $\|z_u - z_v\|^2_2$ is always positive. In the following, we set $\theta_1=1$ and examine the conditions for identification of other parameters.
We have
\begin{align*}
\log (\mu_{uv}) &  = \theta_2 - \|z_u\|^2 - \|z_v\|^2 + 2z_u^Tz_v + \delta_u +\gamma_v \\
& = \theta_2 + 2z_u^Tz_v + \tilde{\delta}_u +\tilde{\gamma}_v,
\end{align*}
where $\tilde{\delta}_u = \delta_u - \|z_u\|^2$ and $\tilde{\gamma}_v = \gamma_v - \|z_v\|^2$.
Now  let $\tilde{\delta}$ and $\tilde{\gamma}$ denote the $n$-dimensional vectors whose elements are $\tilde{\delta}_u$ and $\tilde{\gamma}_v$, respectively. (All vectors are column vectors.) Writing in matrix form, the above expression is 
\begin{equation*}
\log (\mu) = \theta_2 J_n + 2ZZ^T + \tilde{\delta}1_n^T + 1_n \tilde{\gamma}^T. 
\end{equation*}

\begin{theorem}
\label{thm:identifiability}
Under the following assumptions:
\begin{enumerate}
    \item The latent positions are centered, i.e., $HZ=Z$, where $H=I-\frac{1}{n}1_n1_n^T$, and
    \item The total nodal effects sum to 0, i.e., $1_n^T \tilde{\delta}=0$ and $\tilde{\gamma}^T1_n = 0$,
\end{enumerate}
if two sets of parameters $\theta_2, Z,\gamma,\delta$ and $\theta_2', Z',\gamma',\delta'$ lead to the same $\log (\mu)$, then
\begin{equation*}
\theta_2 =\theta_2', \,\, \delta = \delta', \,\, \gamma=\gamma'\,\, \text{ and }  Z =Z'O,
\end{equation*}
where $O$ is a $d \times d$ orthogonal matrix.
\label{identifiability}
\end{theorem}
The proof is provided in Appendix \ref{sec:supp_proof}. 
Thus, under the constraints that the true latent positions $Z$ are centered and total nodal sender and receiver effects sum to 0, the parameters $\theta_2$, $\delta$, $\gamma$ and the vector distances $ZZ^T$ are exactly identified, while $Z$ is identified up to an orthogonal matrix $O$.

\subsection{Relation to DLS Model}
\label{sec:relation_DLS}

The most similar model to ours is the dual latent space (DLS) model \citep{yang2017decoupling}. 
It uses the following form for the conditional intensity function\footnote{They include also a periodic kernel in addition to the exponential kernels, which we exclude for ease of comparison.}: 
\begin{equation} \label{eq:dls_lambda}
\begin{split}
&\lambda_{uv}^*(t) = e^{-||z_u - z_v||^2_2 + \theta_2} \\
&+   \sum_{t_{vu} < t}\sum_b^B \alpha_2 e^{-||x_u^{(b)} - x_v^{(b)}||^2_2} \beta_b e^{-\beta_b(t-t_{vu})}, \quad \forall u \neq v
\end{split}
\raisetag{40pt}
\end{equation}
By comparing the form of the conditional intensity function for DLS \eqref{eq:dls_lambda} with that of our proposed LSH model \eqref{eq:lsh_lambda}, we identify 3 key differences, each addressing a concern regarding the DLS model:
\begin{enumerate}
\item The DLS utilizes reciprocal latent spaces $X^{(b)}$ to allow different rates of reciprocity between node pairs. 
This increase in flexibility of the model comes with a key drawback: the estimated latent positions for a node pair $(u,v)$ and kernel $b$ may result in the jump size $\alpha_2 e^{-||x_u^{(b)} - x_v^{(b)}||^2_2} > 1$, which leads to an unstable process. 
We were unable to simulate new networks from the DLS model fits to real networks due to the instability as we discuss in Section \ref{sec:generative_accuracy}. 
In contrast, we use just a single jump size $\alpha_2$ for all node pairs in our LSH model. 
While this may be less flexible, it does not lead to instability like the reciprocal latent space.

\item The DLS does not have a self excitation component, unlike our proposed LSH (second term in \eqref{eq:lsh_lambda}). 
The lack of self excitation prevents the DLS from modeling repeated edges from node $u$ to $v$ with no response from $v$ back to $u$. 
For example, this setting occurs frequently in militarized conflicts between countries, where one country repeatedly threatens or takes action against another country that does not retaliate.

\item The DLS does not have nodal effects parameters ($\delta_u$ and $\gamma_v$ in \eqref{eq:baseline}). 
This limits its ability to model nodes with different rates of sending or receiving events.
\end{enumerate}

Furthermore, a primary motivation of the latent space model is to embed the network into a single Euclidean space that can be easily visualized and interpreted. 
By using a single latent space, our proposed LSH is able to provide a much more interpretable model fit compared to DLS.

\section{Estimation Procedure}

Our model consists of mutually exciting bivariate Hawkes processes over all pairs of nodes. 
Using the likelihood theorem of \citet{daley2003introduction}, we can write the log-likelihood as
\begin{equation} \label{eq:3}
\log \mathcal{L} = \sum_{u\neq v} \left\{ \sum_{i=1}^k \log(\lambda_{uv}^*(t_i)) - \int_0^{t_k} \lambda_{uv}^* (t) dt \right\},
\end{equation}
where $k$ denotes the total number of events and $\lambda_{uv}^*(t)$ takes on the form in \eqref{eq:lsh_lambda}. 
We simplify the log-likelihood and improve the efficiency of the estimation by deriving a recursive form as in \cite{ozaki1979maximum}. More details and the full log-likelihood derivation for our LSH model are provided in Appendix \ref{sec:appendix_ll}, resulting in the simplified expression in \eqref{eq:5}. 

Latent space models typically assume that the probability of forming an edge between two nodes is inversely proportional to the distances between the node positions in the latent space.
Thus, the observation of an edge between two nodes typically pulls them closer together in the latent space.
The presence of the slope parameter $\theta_1$ in the baseline rate $\mu_{uv}$ for our LSH model \eqref{eq:baseline} allows us to either pull node pairs with events closer together by constraining $\theta_1 > 0$ or push them further apart by constraining $\theta_1 < 0$. 
Or, we could leave $\theta_1$ unconstrained---we find that this usually results in the estimate $\hat{\theta}_1 > 0$.

We use the L-BFGS-B algorithm \citep{byrd1995limited} to minimize the negative log-likelihood (NLL). 
The gradients of the log-likelihood can be computed using the Autograd package \citep{maclaurin2015autograd} for automatic differentiation of standard Python functions. 
We consider also an alternating minimization approach that alternates between estimating the latent space and the model parameters, which we show in Appendix \ref{sec:alternating}. 
Our alternating minimization approach is partly inspired by the projected gradient method of \citet{ma2020universal}, which also alternates between estimating the latent space and the model parameters in a static
latent space model. 
We find that the alternating minimization approach generally converges more slowly than L-BFGS-B, so the results we present in this paper use L-BFGS-B.


\begin{figure*}[t]
\newcommand{\figwidth}{0.32\textwidth}
\newcommand{\figheight}{1.5in}
\centering %centers figure
    \hfill
    \begin{subfigure}[c]{\figwidth}
        \centering
        \includegraphics[height=\figheight]{Figures/RMSE_simulation/simulate_actual.pdf}
        \caption{Actual latent space}
        \label{fig:sim1_actual}
    \end{subfigure}
    \hfill
    \begin{subfigure}[c]{\figwidth}
        \centering
        \includegraphics[height=\figheight]{Figures/RMSE_simulation/simulate_estimate.pdf}
        \caption{Estimated latent space}
        \label{fig:sim1_estimate}
    \end{subfigure}
    \hfill
    \begin{subfigure}[c]{0.34\textwidth}
        \centering
        \includegraphics[height=\figheight, clip=true, trim=5 10 25 25]{Figures/RMSE_simulation/RMSE_z_large.pdf}
        \caption{Latent positions estimation error}
        \label{fig:rmse_sim_z}
    \end{subfigure}
    \hfill
\caption{Comparison of \subref{fig:sim1_actual} actual latent space and \subref{fig:sim1_estimate} estimated latent space (with Procrustes transformation) on a $20$ node simulated network with duration $T=100$. 
The recovered latent node positions are close to the actual positions. 
\subref{fig:rmse_sim_z} The RMSE over $30$ simulated networks ($\pm \, 2$ standard errors) decreases as the duration $T$ increases.}
\label{fig:sim1}
\end{figure*}


We use a multidimensional scaling algorithm as an initialization for the latent space positions $Z$ as in the original latent space model proposed by \citet{hoff2002latent}. We set random values to initialize all other parameters $\Theta = (\alpha_1, \alpha_2, \theta_1, \theta_2, \delta, \gamma)$. 


\section{Experiments}


In this section, we perform evaluation tasks for our proposed model on simulated networks and real networks\footnote{Python code to reproduce our results is available at \url{https://github.com/IdeasLabUT/Latent-Space-Hawkes}}.
We use a sum of $B = 3$ exponential kernels and utilize decays with time scales of an hour, a day, and a week, as was done by \citet{yang2017decoupling} in their DLS model. We also fix $C = (1/3, 1/3, 1/3)$ for simplicity\footnote{We also experimented with estimating $C$ but did not find much difference in the results.}.

\subsection{Simulated Networks}
\label{sec:sim_experiment}



We first test our L-BFGS-B estimation procedure on networks simulated from our latent space Hawkes (LSH) model. 
We simulate networks of $20$ nodes in a 2-D latent space using parameters $\theta_1 = 1$, $\theta_2 = -3.2$, $\alpha_1 =  0.01$, and $\alpha_2=0.02$.
Each dimension of the latent positions as well as sender and receiver effects for nodes are sampled independently from a standard Normal distribution: $z_{u}, \delta_u, \gamma_u \sim \mathcal{N}(0,1)$. 
We increase the time duration $T$ from $50$ to $3{,}000$ and evaluate the estimation accuracy for the latent positions and other parameters.
Additional details on the simulation process are provided in Appendix \ref{sec:sim_appendix}. 
A comparison of the actual and estimated latent positions for a simulated network is shown in Figure \ref{fig:sim1} along with the root mean squared error (RMSE) for estimated latent positions over 30 simulated networks. 
As expected, the error decreases for increasing time duration $T$. 
The error for the other parameters decreases also, as we show in Figure \ref{fig:sim1_supp} in Appendix \ref{sec:sim_appendix}. 
Thus, L-BFGS-B appears to accurately estimate latent positions and model parameters.


\subsection{Real Networks}

\begin{table}[t]
    \centering
    \caption{Summary statistics of real network datasets}
    \label{tab:dataStats}
    \begin{tabular}{cccc}
    \toprule
    Dataset  & Nodes    & Events & Time Duration \\
    \midrule
    Reality  & $65$     & $2{,}150$      & $8$ months \\
    Enron    & $155$    & $9{,}646$      & $15$ months \\
    MID      & $145$    & $5{,}088$      & $23$ years \\
    FB-forum & $899$    & $33{,}720$    & $5.5$ months \\
    \bottomrule
    \end{tabular}
\end{table}

We perform experiments on several real network datasets: Reality Mining \citep{eagle2006reality}, Enron emails \citep{klimt2004enron}, Militarized Interstate Disputes (MID) \citep{palmer2021mid5}, and Facebook-forum \citep{nr}.
Summary statistics for the datasets are shown in Table \ref{tab:dataStats}, and  
additional details are provided in Appendix \ref{sec:supp_datasets}.  
Each dataset consists of a set of relational events, each denoted by a sender, a receiver, and a timestamp.


\paragraph{Baselines for Comparisons}
We compare against three other Hawkes process-based continuous-time network models. The dual latent space (DLS) model \citep{yang2017decoupling} is the most similar to ours, and we provide a detailed comparison of the DLS model with our proposed LSH model in Section \ref{sec:relation_DLS}. 
We also compare against two recently proposed Hawkes process-based block models: the community Hawkes independent pairs (CHIP) model \citep{arastuie2020chip} and the block Hawkes model (BHM) \citep{junuthula2019block}. 
Finally, we compare also against the non-generative continuous-time dynamic network embeddings (CTDNE) \citep{nguyen2018continuous} approach. 
Additional information on these models for comparison along with implementation details are provided in Appendix \ref{sec:supp_models}.

\subsubsection{Predictive Accuracy}
\label{sec:real_experiment}
We first evaluate the predictive ability of our proposed LSH model. 
We split each dataset into a training set containing the first $80\%$ of events and a test set containing the remaining $20\%$ of events. 
We estimate model parameters on the training set and evaluate prediction accuracy on the test set. 
We choose the number of latent dimensions $d$ (for LSH and DLS) and the number of blocks $K$ (for BHM and CHIP) that maximizes the log-likelihood evaluated on the test set. 

\paragraph{Test Log-likelihood}

\begin{table}[t]
  \caption{Evaluation metrics for predictive accuracy on real network datasets. 
  Bold entry denotes highest accuracy for each metric on a dataset. 
  Test log-lik.~shows the mean test set log-likelihood per event and the number of latent dimensions $d$ or blocks $K$ that maximize it. 
  The AUC column shows the mean (standard deviation) of the AUC across 100 time points for dynamic link prediction. 
  DLS does not scale to the FB-forum data.
  CTDNE is not generative so test log-likelihood is not applicable.
  }
  \label{tab:predictive_accuracy}
  \centering
  \setlength{\tabcolsep}{3pt}
  \begin{tabular}{cccc}
    \toprule
    Dataset                   & Model  & Test log-lik.       & AUC  \\
    \midrule
    \multirow{5}{*}{Reality}  & LSH    & $\bm{-3.71}\,(d=4)$ & $0.945(0.028)$\\
                              & DLS    & $-5.64\,(d=300)$    & $0.940(0.034)$ \\
                              & BHM    & $-5.31\,(K=50)$     & $\bm{0.957(0.022)}$\\
                              & CHIP   & $-4.70\,(K=1)$      & $0.937(0.028)$\\
                              & CTDNE  &                     & $0.936(0.033)$\\
    \midrule
    \multirow{5}{*}{Enron}    & LSH    & $\bm{-4.87}\,(d=4)$ & $0.946(0.024)$ \\
                              & DLS    & $-5.29\,(d=100)$    & $\bm{0.947(0.017)}$\\
                              & BHM    & $-6.35\,(K=14)$     & $0.839(0.035)$\\
                              & CHIP   & $-5.34\,(K=4)$      & $0.895(0.053)$ \\
                              & CTDNE  &                     & $0.912(0.035)$\\

    \midrule
    \multirow{5}{*}{MID}     & LSH     & $\bm{-3.38}\,(d=3)$ & $\bm{0.988(0.018)}$\\
                             & DLS     & $-4.52\,(d=100)$    & $0.977(0.007)$\\
                             & BHM     & $-4.97\,(K=95)$     & $0.971(0.031)$ \\
                             & CHIP    & $-3.63\,(K=2)$      & $0.958(0.035)$\\
                             & CTDNE   &                     & $0.953(0.018)$\\
    \midrule
    \multirow{4}{*}{FB-forum}& LSH     & $\bm{-7.21}\,(d=8)$ & $\bm{0.932(0.009)}$  \\
                             & BHM     & $-11.16\,(K=57)$    & $0.839(0.017)$ \\
                             & CHIP    & $-7.65\,(K=2)$      & $0.919(0.011)$ \\
                             & CTDNE   &                     & $0.788(0.028)$\\
    \bottomrule
  \end{tabular}
\end{table}

We use the mean log-likelihood per event on the test set, also used by \citet{Dubois2013} and \citet{arastuie2020chip}, as an evaluation metric for the model's predictive ability on future data. 
As shown in Table~\ref{tab:predictive_accuracy}, our latent space Hawkes (LSH) model significantly outperforms the other models in terms of test log-likelihood on all datasets. 
The test log-likelihood is maximized for the LSH at relatively small latent dimensions $d$ compared to the DLS model. 
The low-dimensional latent representation using a single latent space makes the LSH fit more interpretable than the high-dimensional DLS representation using multiple latent spaces. 
Furthermore, these results suggest that the addition of nodal effects and self excitation in the LSH significantly improves the predictive ability compared to DLS. 



\paragraph{Dynamic Link Prediction}
We further explore the performance of the learned model in a dynamic link prediction task. 
We use the same experiment set-up as \citet{yang2017decoupling}. 
We randomly sample 100 time points $t_i$ during the test period. We then compute the probability of a link appearing between each pair of nodes in the $[t_i, t_i+\delta]$ time window. We set $\delta$ to be two weeks for the Reality, Enron, and FB-forum datasets and two months for the MID data, which takes place over a longer period of time. For each of these $\delta$ intervals, we obtain the Receiver Operating Characteristics (ROC) curve and compute the Area Under the Curve (AUC) measured across all pairs of nodes according to the predicted probabilities given by the model. 

The mean AUC values are shown in Table \ref{tab:predictive_accuracy} with the value inside the parentheses indicating the standard deviation over these 100 time intervals. 
The ROC curves and box plots for the corresponding AUC values are presented in Appendix \ref{sec:supp_dynamic_lp}.
Our proposed LSH model is competitive at the dynamic link prediction task, achieving the highest mean AUC on FB-forum and MID and the second highest on Reality and Enron. 


\subsubsection{Generative Accuracy}
\label{sec:generative_accuracy}





To evaluate generative accuracy of our proposed LSH model, we simulate networks with our fitted parameters and perform posterior predictive checks (PPCs) using network statistics such as  reciprocity and transitivity.  
While our LSH model has no issues simulating networks, the DLS is problematic due to its model formulation. 
The jump size for reciprocal excitation depends on distances between nodes in a reciprocal latent space and is further scaled by the parameter $\alpha_2$ in \eqref{eq:dls_lambda}. 
Since the maximum jump size is not constrained, this results in some node pairs having unstable Hawkes processes so that the simulation does not terminate. 
To enable us to make comparisons with the DLS model, we stabilize it by fixing the scaling parameter for the jump size $\alpha_2 = 1$. 

We simulate 15 networks from the fitted model on each real dataset, with the exception of DLS, which does not scale to the FB-forum data. 
We then perform PPCs on the number of events generated, average run length, and 4 static network statistics: transitivity (global clustering coefficient), reciprocity, average local clustering coefficient (LCC), and average degree. 
The run length is the number of consecutive events in the same direction, e.g.~in the sequence $(u,v), (v,u), (v,u), (v,u), (v,u), (u,v)$, the run length for $(v,u)$ is 4 because it appears 4 times consecutively before the reciprocal event $(u,v)$ appears. 


\begin{table}[t]
\caption{Comparison of generative accuracy between models using mean statistic over 15 simulated networks.
Bold entry denotes the simulated statistic closest to the actual statistic. 
While both LSH and DLS can replicate the static network statistics from the actual networks, DLS generates far too many events compared to the actual networks.}
\label{tab:generative_accuracy}
\centering
\setlength{\tabcolsep}{5pt}
    \begin{tabular}{ccccc}
    \toprule
    Dataset                  & Statistic       & Actual     & LSH             & DLS       \\ 
    \midrule
    \multirow{6}{*}{Reality} & \# of events    & $2,\!148$  & $\bm{2,\!190}$  & $9,\!493$   \\
                             & Avg.~run length & $2.49$     & $\bm{2.62}$     & $1.91$      \\
                             & Transitivity    & $0.29$     & $0.34$          & $\bm{0.32}$      \\
                             & Reciprocity     & $0.80$     & $\bm{0.86}$     & $0.52$      \\
                             & Avg.~LCC        & $0.25$     & $0.19$          & $\bm{0.21}$      \\
                             & Avg.~degree     & $4.86$     & $\bm{4.45}$     & $7.50$      \\
    \midrule
    \multirow{6}{*}{Enron}   & \# of events    & $9,\!646$  & $\bm{11,\!010}$ & $675,\!621$ \\
                             & Avg.~run length & $2.44$     & $\bm{2.63}$     & $1.87$      \\
                             & Transitivity    & $0.31$     & $0.39$          & $\bm{0.30}$      \\
                             & Reciprocity     & $0.65$     & $\bm{0.65}$     & $\bm{0.65}$      \\
                             & Avg.~LCC        & $0.40$     & $0.51$          & $\bm{0.36}$      \\
                             & Avg.~degree     & $18.46$    & $25.86$         & $\bm{18.43}$     \\ 
    \midrule
    \multirow{6}{*}{MID}     & \# of events    & $5,\!088$  & $\bm{3,\!996}$  & $412,\!890$ \\
                             & Avg.~run length & $2.88$     & $\bm{2.71}$     & $1.89$      \\
                             & Transitivity    & $0.13$     & $0.24$          & $\bm{0.20}$      \\
                             & Reciprocity     & $0.64$     & $\bm{0.57}$     & $0.52$      \\
                             & Avg.~LCC        & $0.25$     & $\bm{0.29}$     & $\bm{0.29}$      \\
                             & Avg.~degree     & $6.80$     & $\bm{7.05}$     & $9.57$      \\ 
    \bottomrule
    \end{tabular}
\end{table}
A comparison between the actual statistics and mean simulated statistics is shown in Table \ref{tab:generative_accuracy}. We compare LSH and DLS since they are both based on the latent space model. The DLS model generates significantly more events than exist in the actual network, ranging from roughly a 4x increase (Reality) to an 80x increase (MID). 
We believe that this is due to the reciprocal latent space used in the DLS model. 
Even though we stabilized the model by setting $\alpha_2 = 1$, some nodes are likely still extremely close in the reciprocal latent space, causing too many events to be generated. 

We also find that the lack of self-excitation in DLS prevents it from replicating the run length of directed event sequences. Since DLS only has reciprocal excitation, its generated networks have an average run length of about 2 regardless of the average run length in the actual network. 
On the other hand, the DLS model performs quite well at replicating the static network statistics, and in many cases, even better than our proposed LSH. 
We believe that this is partially due to the much higher latent dimension $d$ that maximizes the test log-likelihood for DLS. 
The LSH could potentially achieve better generative accuracy using higher $d$ as well.
Additional results on generative accuracy, including plots comparing the actual statistics with the distribution of the simulated statistics, are provided in Appendix \ref{sec:supp_ppc}.




\section{Case Study}

\begin{figure*}[p]
\centering %centers figure
\newcommand{\figwidth}{4.65in}
    \begin{subfigure}[c]{\figwidth}
        \centering
        \hfill
        \includegraphics[width=\figwidth]{Figures/LSP/LSP_MID_pos.pdf}
        \caption{Estimated latent positions from model with positive slope}
        \label{fig:mid_latent_pos}
    \end{subfigure}
    \\[12pt]
    \begin{subfigure}[c]{\figwidth}
        \centering
        \hfill
        \includegraphics[width=\figwidth]{Figures/LSP/LSP_MID_neg.pdf}
        \caption{Estimated latent positions from model with negative slope}
        \label{fig:mid_latent_neg}
    \end{subfigure}
\caption{2-D latent space plots for LSH model fit to MID data. 
Edges are shown for the 10 most frequently occurring incidents. 
The 5 countries that initiate the most incidents and the 5 countries that receive the most incidents are shown in blue and green, respectively. 
Pakistan (PAK) is among the top 5 initiators and receivers and is shown in red.
\subref{fig:mid_latent_pos} The model with positive slope places countries with lots of conflicts close together. 
The most active countries tend to appear centrally in this latent space. 
A zoomed in version of the center of the latent space is shown in Figure \ref{fig:mid_latent_zoomed} in Appendix \ref{sec:supp_case_study}. 
\subref{fig:mid_latent_neg} The model with negative slope places countries with lots of conflicts far apart. 
The most active countries tend to appear on the periphery of this latent space.}
\label{fig:mid_latent}
\end{figure*}

\begin{figure*}[t]
\centering %centers figure
\newcommand{\figwidth}{4.65in}
        \includegraphics[width=\figwidth]{Figures/LSP/LSP_MID_cont.pdf}
\caption{2-D latent space plot for MID data with positive slope and countries colored by continent. A zoomed in version of the center of the latent space is shown in Figure \ref{fig:mid_latent_cont_zoomed} in Appendix \ref{sec:supp_case_study}.}
\label{fig:mid_latent_continents}
\end{figure*}


We now present a case study demonstrating our proposed LSH model being used for exploratory analysis on a real continuous-time network: the Militarized Interstate Disputes (MID) incident network. 
Timestamped edges in this network correspond to individual incidents within disputes between countries. 
Incidents include threats, displays, and uses of force initiated by one country towards another. 

Incidents in the MID network are indicative of negative relationships between countries. 
As a result, one might expect the network to be disassortative. 
On the other hand, incidents frequently occur between countries that are geographically close, particularly if they share a boundary, which suggests that the network may also have an assortative structure. 
Thus, we conduct exploratory analysis of this network using two different parameterizations of our model. 
We fix the latent dimension to be $d=2$ in both models so that we can visualize the latent positions of the countries. 

We first consider a \emph{positive slope} model by constraining $\theta_1 > 0$ in \eqref{eq:baseline} so that two countries with lots of incidents between them are pulled closer together in the latent space, as is typically the case for assortative networks. 
In this parameterization, countries that engage in lots of incidents are likely to appear centrally in the latent space. 
We next consider a \emph{negative slope} model by constraining $\theta_1 < 0$ in \eqref{eq:baseline} so that two countries with lots of incidents between them are pushed further apart in the latent space. 
Under this parameterization, countries that engage in lots of incidents are likely to appear on the periphery of the latent space.

\paragraph{Findings and Discussion}
We show the 2-D latent space plot with both positive and negative slope terms in Figure \ref{fig:mid_latent}. 
We first consider the latent positions from the positive slope model. 
Notice that the most active nodes tend to appear centrally, and the node pairs with the most frequent incidents tend to be placed close together. 
For example, Israel (ISR) and Lebanon (LEB) have latent positions very close together, which makes sense given that they have the most incidents in the data set: 588 total incidents.
Additionally, countries that are geographically close do mostly appear close together in the latent space. 
This can be seen from Figure \ref{fig:mid_latent_continents}, where nodes are colored by continent. 
The estimated parameters are $\theta_1 = 1.2, \theta_2 = -9.3, \alpha_1 = 0.77, \alpha_2 = 0.13$. 
The high value for $\alpha_1$ compared to $\alpha_2$ indicates the importance of self excitation in addition to reciprocal excitation.



Next, we consider the negative slope model. 
From examining the latent positions, we find that most active nodes tend to appear on the periphery of the latent space, which is reasonable because the model attempts to push nodes with many incidents far apart. 
For example, Israel and Lebanon are on opposite sides of the latent space.  
The estimated parameters for this model are $\theta_1 = -0.008, \theta_2 = -1.24, \alpha_1 = 0.83, \alpha_2 = 0.15$. 
While the parameters used for modeling the baseline intensity have changed significantly, the $\alpha$ parameters modeling self and reciprocal excitation are very similar to the positive slope model.

Additional results are presented in Appendix \ref{sec:supp_case_study}. 
We note that this case study is intended to be exploratory rather than confirmatory. 
We caution readers against jumping to conclusions about countries from our results.

\section{Conclusion}
We proposed the latent space Hawkes (LSH) model for continuous-time networks of relational events, 
which models interactions between each pair of nodes as realizations of mutually exciting Hawkes processes whose intensity functions include a baseline rate along with both self and reciprocal excitation terms. 
The LSH model makes use of a single latent space along with sender and receiver effects to provide a more interpretable fit while remaining competitive in accuracy compared to the dual latent space (DLS) model. 
We performed an exploratory analysis of militarized disputes between countries using the LSH, where the latent space was quite informative of the dispute network structure. 
We also found that self excitation was stronger than reciprocal excitation in this network, demonstrating the importance of self excitation, which is not present in the DLS model. 
We hope this paper inspires future work combining continuous latent space representations with TPPs, which have not gotten as much attention as block model-based TPPs.

\paragraph{Limitations}
While our proposed model shows superior empirical performance and interpretability, there are also several limitations. 
We use a single reciprocal jump size $\alpha_2$ for all node pairs, which results in a less flexible model compared to the DLS, but it is
more stable. 
While our estimation procedure scales to networks with about $1{,}000$ nodes, it does not scale to extremely
large networks with $>10{,}000$ nodes, unlike the CHIP \citep{arastuie2020chip} and MULCH \citep{soliman2022multivariate} latent block models. 
Additionally, the latent positions of nodes in our LSH model are fixed over time, just like in the DLS. If there are significant changes in the
network structure over time, a more flexible model that allows latent positions to change over time, such as the model of \citet{rastelli2021continuous}, may be a
better fit.
Finally, one could model more complex dependencies among the nodes that go beyond self and reciprocal
excitation using a multivariate Hawkes process, as in the MULCH latent block model \citep{soliman2022multivariate}, instead of a bivariate Hawkes process.

\begin{contributions} % will be removed in pdf for initial submission,
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    % Briefly list author contributions.
    % This is a nice way of making clear who did what and to give proper credit.

    % H.~Q.~Bovik conceived the idea and wrote the paper.
    % Coauthor One created the code.
    % Coauthor Two created the figures.
Z.~Huang, S.~Paul, and K.~S.~Xu contributed to the model and algorithm development. 
Z.~Huang and H.~Soliman wrote the code and developed the experiments. 
All authors contributed to writing the paper.
\end{contributions}

\begin{acknowledgements} 

This material is based upon work supported by the National Science Foundation under grants IIS-1755824, DMS-1830412, IIS-2047955, and DMS-1830547. 

\end{acknowledgements}

\nocite{gower1975generalized, StellarGraph}

\bibliography{huang_696}

\end{document}
