% \documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{multirow}
\usepackage[normalem]{ulem}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{amsmath}
\usepackage{rotating}
\usepackage{multirow}
\usepackage{bbding}
\usepackage{makecell}
\usepackage{float}
\usepackage[algo2e,ruled,vlined]{algorithm2e}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{DyGMAE: A Novel Dynamic Graph Masked Autoencoder for Link Prediction}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2025 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{\href{mailto:<weixiongliu@m.scnu.edu.cn>}{Weixiong Liu}}
\author[1]{\href{mailto:<jung@m.scnu.edu.cn>}{Junwei Cheng}}
\author[1]{\href{mailto:<2023023258@m.scnu.edu.cn>}{Zhongyu Pan}}
\author[1]{\href{mailto:<hechaobo@foxmail.com>}{Chaobo He}\thanks{Corresponding author: hechaobo@foxmail.com}}

\author[2]{\href{mailto:<gql@jnu.edu.cn>}{Quanlong Guan}}  

\affil[1]{%
    School of Computer Science\\
    South China Normal University\\
    Guangzhou, China
}
\affil[2]{%
    College of Information Science and Technology\\
    Jinan University\\
    Guangzhou, China
}
  
\begin{document}
\maketitle


\begin{abstract}
  Dynamic link prediction (DLP) is a crucial task in graph learning, aiming to predict future links between nodes at subsequent time steps in dynamic graphs. Recently, graph masked autoencoders (GMAEs) have shown promising performance in self-supervised learning. However, their application to DLP is under-explored. Existing GMAEs struggle to capture temporal dependencies, and their random masking causes crucial information loss for DLP. Moreover, most existing DLP methods rely on local information, ignoring global information and failing to capture complex features in real-world dynamic graphs. To address these issues, we propose DyGMAE, a novel dynamic GMAE method specifically designed for DLP. DyGMAE introduces a Multi-Scale Masking Strategy (MSMS), which generates multiple graph views by masking parts of the edges and tries to reconstruct them. Additionally, a multi-scale masking representation alignment module with a contrastive learning objective is employed to align the representations encoded from the unmasked edges across these views. Through this design, different masked views can provide diverse information to alleviate the drawbacks of random masking, and contrastive learning can align different views to address the difficulty of exploiting local and global information simultaneously. Experiments on benchmark datasets show that DyGMAE achieves superior performance in the DLP task.
\end{abstract}

\section{Introduction}
Dynamic graphs \citep{gravina2024deep} exhibit a remarkable capacity to model real-world interaction changes. This distinct feature enables their extensive application in diverse dynamic systems, such as social networks \citep{min2021stgsn, cheng2025community}, disease transmission networks \citep{zhu2022interpretable}, and transportation systems \citep{yu2020forecasting}. In dynamic graph learning, dynamic link prediction (DLP) is one fundamental task, which aims to forecast the appearance or disappearance of links over time \citep{qin2023temporal, firouzkouhi2024generalized}, and can be regarded as a temporal extension of static link prediction \citep{he2023community}. DLP plays a crucial role in diverse applications including traffic forecasting \citep{li2023dynamic} and disease control \citep{wang2022dynamic}. Given the complexity of dynamic graphs, which involves intricate structural patterns and temporal dependencies, finding effective methods for DLP remains a significant challenge.
% Among the numerous tasks in the realm of dynamic graph learning, dynamic link prediction (DLP) stands out as a fundamental one.

During the pursuit of addressing this challenge, self-supervised learning (SSL) has emerged as a promising approach due to its ability to leverage large amounts of unlabeled data \citep{gao2022novel, zhang2023dyted}. In particular, graph masked autoencoders (GMAEs) \citep{hou2022graphmae, liu2024mask, liu2024hi}, a generative SSL framework, have recently excelled in graph-related tasks. However, their potential in DLP has not been fully explored. Specifically, GMAE extends the masked autoencoder, an SSL framework in computer vision \citep{he2022masked} and natural language processing \citep{devlin2018bert} without labeled data, to graphs. Compared with other SSL frameworks, GMAEs have achieved success in the static graph link prediction task by masking and reconstructing edges \citep{li2023s}, which has been found to remarkably boost the prediction accuracy. Based on the above considerations, we reasonably speculate that GMAE might be adaptable to dynamic graphs to enhance the performance of DLP. However, despite their success in static graphs, current GMAE methods face several limitations when applied to dynamic graphs. First, simple random masking easily causes critical information loss, resulting in suboptimal link prediction performance. Second, they struggle to model time-evolving structures vital for DLP because they are designed for static graphs.

In addition to the limitations of current GMAE in the dynamic graph scenario, existing DLP methods also suffer from significant drawbacks. Specifically, as \citet{gao2022novel} pointed out, most of these methods are unable to capture the complex features in real-world dynamic graphs comprehensively. They tend to focus only on local information while overlooking global information. In fact, both global and local information in dynamic graphs are crucial for DLP. Local information offers node-specific short-term dynamics, while global information reveals the overall structures and long-term dependencies of the graph. Together, they enable a comprehensive understanding of the graph. For instance, some methods \citep{hajiramezanali2019variational, YANG2022289, yang2021discrete, jiao2024deep} focus solely on local structural information using graph neural networks (GNNs), neglecting the broader context. These methods often struggle to capture local and global patterns simultaneously, limiting their ability to represent the full complexity of dynamic graphs. Although DGCN \citep{gao2022novel} tries to capture global information by maximizing mutual information between local and global representations, its heavy reliance on local information for global representation aggregation restricts its performance.

Intuitively, GMAE uses a masking strategy to generate different structural views and reconstruct the masked parts, facilitating the exploration of diverse structures and evolving patterns. Considering GMAE's limitations in DLP and the problem of exploiting both local and global information, we propose DyGMAE, a novel dynamic graph masked autoencoder specifically designed for DLP. In DyGMAE, we introduce a Multi-Scale Masking Strategy (MSMS) to tackle both of the above problems. MSMS generates diverse masked and unmasked views by applying different edge masking techniques and edge masking ratios, and attempts to reconstruct the masked edges from the unmasked portions. Furthermore, in MSMS, we incorporate an alignment module with a contrastive learning objective \citep{he2025signed}. This module aims to integrate the information obtained from different masking views, including both local and global information, by aligning different views. By applying the MSMS, DyGMAE can more effectively explore different aspects of the structural patterns and temporal dependencies in multi-reconstruction phases, significantly enhancing its ability to capture comprehensive information and thus improving DLP performance. Finally, the refined embeddings are processed by a gated recurrent unit (GRU) based temporal modeling module, which captures temporal dependencies in dynamic graphs, and are subsequently used for DLP. As a result, DyGMAE effectively captures both the structural evolution and temporal dependencies of dynamic graphs, addressing key challenges in DLP. We summarize our main contributions as follows:

\begin{itemize}
\item We propose DyGMAE, a novel dynamic graph masked autoencoder tailored for DLP. It extends the GMAE framework to effectively capture both structural patterns and temporal dependencies in dynamic graphs.

\item DyGMAE integrates an MSMS to mitigate the information loss induced by random masking and address the challenge of acquiring both local and global information. To our knowledge, this is the first attempt to combine GMAEs with contrastive learning for DLP.

\item Our experimental results show that DyGMAE achieves superior performance in the DLP task across several real-world dynamic graph datasets, outperforming state-of-the-art methods.
\end{itemize}


\section{Preliminaries}
In this section, we first introduce notations and formulate the problems, then illustrate the graph masked autoencoder framework.
\label{sec:Preliminaries}
\subsection{Notations and Problem Formulation} Without loss of generality, we model dynamic graphs as consisting of a series of snapshots, denoted as \(\mathcal{G} = \{{G_1, G_2, \ldots, G_T}\}\), where \(T\) represents the total number of snapshots. Each snapshot $G_t = (V_t, E_t)$, for $1 \leq t \leq T$, consists of a node set ${V}_t$ and an edge set ${E}_t$. The adjacency matrix at time step $t$ is denoted by $\mathbf{A}_t \in \{0, 1\}^{|{V}_t| \times |{V}_t|}$.

DLP aims to forecast the future link states in a dynamic graph by utilizing the historical snapshots $\mathbf{A}_1, \mathbf{A}_2, \ldots, \mathbf{A}_T$. Its objective is to predict the link states in the next snapshot: $\hat{\mathbf{A}}_{T+1}$. Dynamic New Link Prediction (DNLP) is a specialized type of DLP task. It focuses on identifying the links that exist in the current snapshot $G_T$ of the dynamic graph but were absent in the previous snapshot $G_{T-1}$. Compared with DLP, DNLP is more challenging as it demands stronger generalization capabilities. 


\subsection{Graph Masked Autoencoder}
We take a static graph as an example to illustrate the GMAE framework. It has three core components: a masking module \( f_{\text{M}}(\cdot) \), an encoder \( f_{\text{E}}(\cdot) \), and a decoder \( f_{\text{D}}(\cdot) \). Given graph \( G \), the masking module creates a masked graph \( {G}^{{m}} \) and an unmasked graph \( {G}^{{u}} \) by masking nodes, features, or edges. The unmasked graph \( {G}^{{u}} \) goes through the encoder \( f_{\text{E}}(\cdot) \) to generate latent representations \( \mathbf{Z} \) containing the graph's key information. The decoder \( f_{\text{D}}(\cdot) \) then reconstructs an approximation \( \hat{{G}} \) of the original graph from \( \mathbf{Z} \). 
  

\begin{figure*}[htbp]
    \centering
    \includegraphics[scale=1.3]{figs/framework.pdf}
    \caption{
    The overall architecture of DyGMAE.
    % (A) Overall architecture of DyGMAE. It involves the Multi-Scale Masking Strategy (MSMS), GRU-based temporal modeling, and dynamic link prediction. The dynamic graph snapshots \( \{ G_1, G_2, \dots, G_T \} \) are processed through the MSMS, generating multiple unmasked views. The aggregated representations from all unmasked views are fed into the GRU to model temporal dependencies, producing the final node representations \( \mathbf{Z}_T \), which are then used for dynamic link prediction tasks.
    % (B) Multi-Scale Masking Strategy (MSMS). The module generates masked graphs \( G_t^{m} \) and unmasked graphs \( G_t^{u} \), which are passed through the encoder and projector to obtain node representations. The contrastive loss \( \mathcal{L}^C_t \) aligns the representations across different unmasked views, while the edge and adjacency reconstruction losses \( \mathcal{L}^E_t \) and \( \mathcal{L}^A_t \) refine the embeddings. These graph representations are used to compute the Multi-Mask Representation Fusion (MMRF) across different views, which are then passed to the GRU for further temporal modeling.
    }
    \label{fig:framework}
\end{figure*}

\section{Methodology} 
\label{sec:Methodology}

The overall framework of the proposed DyGMAE is shown in Figure~\ref{fig:framework}, detailed as follows:
\textbf{(A) Overall architecture of DyGMAE}: Dynamic graph snapshots \( \{ G_1, G_2, \dots, G_T \} \) are processed by MSMS to generate multiple unmasked views. Aggregated representations from these views are fed into the GRU for temporal modeling, yielding final node representations \( \mathbf{Z}_T \) for DLP. \textbf{(B) Multi-Scale Masking Strategy (MSMS)}: This module generates unmasked graphs \( G_t^{u} \) by applying diverse masking strategies to each snapshot; these graphs are passed through an encoder and a projector to obtain node representations. The contrastive loss \( \mathcal{L}^C_t \) aligns representations of different unmasked views, while edge and adjacency reconstruction losses \( \mathcal{L}^E_t \) and \( \mathcal{L}^A_t \) refine embeddings. Graph representations are used for Multi-Scale Masking Representation Fusion (MSMRF) across views, then sent to the GRU for further temporal modeling. In the subsequent sections, we explain each DyGMAE module in detail, emphasizing how the GMAE framework adapts to dynamic graphs and how MSMS boosts DLP performance.

\subsection{Edge Masking Strategy}
\label{subsection:edge_masking}
Many related GMAE methods \citep{hou2022graphmae} mask node features and attempt to reconstruct them. However, our task is DLP, and thus we need to bridge the gap between feature reconstruction and link prediction. To this end, we employ two edge-masking strategies as part of our MSMS: random edge masking and path-wise random masking. 

\noindent\textbf{Random Edge Masking.}
Random edge masking is performed by randomly masking a subset of edges in the graph. The edges to be masked, denoted as $E^{m}$, are sampled using a Bernoulli distribution as follows:
\begin{equation}
    E^{m} \sim \text{Bernoulli}(p),
\end{equation}
\begin{equation}
    E^{u} = E - E^{m},
\end{equation}
where \( p \) denotes the probability of masking an edge, and \( E^{u} \) represents the set of remaining edges. 

\noindent\textbf{Path-Wise Random Masking.}  
Inspired by \citet{li2023s}, we adopt the path-wise random masking strategy. Different from random edge masking, it masks edges via random walks \citep{berahmand2022preference} from root nodes. This disrupts local connections, forcing path reconstruction to capture complex structural dependencies and higher-order node relationships. Formally, the masked edges are sampled as:
\begin{equation}
    E^{m} \sim \text{RandomWalk}(R, n_{\text{walk}}, l_{\text{walk}}),
\end{equation}
where \( R \) is a set of randomly selected root nodes, \( n_{\text{walk}} \) denotes the number of random walks per root node, and \( l_{\text{walk}} \) is the length of each random walk. 

For dynamic graphs, we define the unmasked snapshot at time step \(t\) as \( G_{t}^{u} = (V_t, E_{t}^{u}) \), with \( E_{t}^{u} \subseteq E_t \) being the remaining unmasked edges. The masked snapshot is \( G_{t}^{m} = (V_t, E_{t}^{m}) \), where \( E_{t}^{m} = E_t - E_{t}^{u} \). This approach adds stochasticity and compels the method to leverage the partially unmasked graph \(G_{t}^{u}\) and historical information, facilitating the capture of more crucial information.

\subsection{Multi-Scale Masking Strategy (MSMS)}
\label{sec:Multi-Scale Masking Strategy (MSMS)}
\noindent\textbf{Multi-Scale Masking.}
Facing the challenges in DLP with GMAE, which involve the critical information loss induced by random masking strategies and the limitations in acquiring local and global information for complex data modeling, we put forward the MSMS as a solution. MSMS generates multiple masked and unmasked views for each snapshot by applying different edge masking strategies and varying the masking probabilities \(p\) as we introduced in Section~\ref{subsection:edge_masking}. Specifically, given a dynamic graph snapshot \(G_t = (V_t, E_t)\), the multi-scale masking creates \(K\) unmasked versions of the snapshot through the set of masking strategies \(\{f_\text{M}^1, f_\text{M}^2, \ldots, f_\text{M}^K\}\), denoted as \(\{G_t^{u1}, G_t^{u2}, \ldots, G_t^{uK}\}\). Each unmasked view captures distinct structural perspectives and different evolving dynamics. Edges are masked according to predefined probabilities \(\{p_1, p_2, \ldots, p_K\}\), with each \(p_i\) controlling the extent of masking in the corresponding view. These different edge masking settings lead to different characteristics of the unmasked views. For example, one view may use a higher masking ratio to encourage the method to infer global structures, while another view may apply a lower masking ratio to preserve local and fine-grained information.

By introducing multiple unmasked views, the multi-scale masking strategy enriches the diversity of training data and allows the method to extract richer structural patterns and temporal dependencies from various unmasked views in the edge reconstruction phase. This capability is crucial for DLP, as it enables the model to capture both local and global information in the graph over time, thereby improving its ability to predict future links accurately. 

\noindent\textbf{Unmasked Graph Encoder.}  
The dynamic graph snapshot is processed through the multi-scale masking, which generates multiple unmasked views of the graph. Then one such view, \(G_t^{u} = (V_t, E_t^{u})\), is fed into the encoder to compute the latent node representations. The encoder consists of a stack of \(L\) GNN layers, which operate on the unmasked part of the graph. The latent representation at the \((l+1)\)-th layer is computed as:  
\begin{equation}
\label{GNN_layer}
\mathbf{H}_{t,l+1}^{} = \sigma_1\left(\tilde{\mathbf{D}}_{t}^{-\frac{1}{2}} \tilde{\mathbf{A}}_{t}^{u} \tilde{\mathbf{D}}_{t}^{-\frac{1}{2}} \mathbf{H}_{t,l}^{} \mathbf{W}_l\right),
\end{equation}  
where \(\mathbf{H}_{t,l+1}^{} \in \mathbb{R}^{N \times d_\text{out}}\) represents the node embeddings at layer \(l+1\) for the \(t\)-th snapshot. In this equation, \(N\) denotes the number of nodes in the graph, and \(d_\text{out}\) is the dimensionality of the output embeddings. \(\tilde{\mathbf{A}}_{t}^{u} = \mathbf{A}_{t}^{u} + \mathbf{I} \in \mathbb{R}^{N \times N}\) represents the unmasked adjacency matrix of the \(t\)-th snapshot with self-loops added, where \(\mathbf{A}_{t}^{u}\) is the adjacency matrix of the unmasked graph \(G_t^{u}\), generated by one of the multi-scale masking views. The diagonal degree matrix corresponding to \(\tilde{\mathbf{A}}_{t}^{u}\) is denoted as \(\tilde{\mathbf{D}}_{t} \in \mathbb{R}^{N \times N}\). \(\mathbf{H}_{t,l}^{} \in \mathbb{R}^{N \times d_\text{in}}\) represents the input node embeddings at layer \(l\), where \(d_\text{in}\) is the dimensionality of the input features. Finally, \(\mathbf{W}_l \in \mathbb{R}^{d_\text{in} \times d_\text{out}}\) is the learnable weight matrix of the \(l\)-th GNN layer, and \(\sigma_1(\cdot)\) is an activation function such as ReLU. 

\noindent\textbf{Projector.}  
To refine and enhance the latent representations produced by the encoder for link prediction, we introduce a multi-layer perceptron (MLP) as a projector following the encoder. The transformation is defined as:  
\begin{equation}
\label{Projector_mlp}
\mathbf{H}_t = \text{MLP}(\mathbf{H}_{t,L}),
\end{equation}  
where \(\mathbf{H}_t \in \mathbb{R}^{N \times d_\text{proj}}\) represents the refined node embeddings after projection, and \(\mathbf{H}_{t,L} \in \mathbb{R}^{N \times d_\text{out}}\) denotes the latent representations output from the last GNN layer of the encoder. The MLP is used to project the embeddings into a new space, where \(d_\text{proj}\) is the dimensionality of the projected embeddings.


\noindent\textbf{Representation Refinement via Reconstruction.} 
In order to ensure that the representations generated by each unmasked view contain more informative and critical structural features for reconstructing the masked edges and get better DLP performance, we feed each representation into the decoder to enhance its reconstruction ability and mitigate the issues of noise and redundancy. We employ two types of loss functions to guide the training process. Next, for simplicity, we will explain the loss functions using one view as an example.

The first loss function, edge reconstruction loss \(\mathcal{L}_{t}^{E}\), is designed to leverage the unmasked edges to reconstruct the masked edges. It maximizes the predicted link probability for node pairs connected by masked edges in the original graph while minimizing the predicted link probability for unlinked node pairs, thereby promoting the reconstruction of the graph structure and enhancing the method's reconstruction ability. The loss is formulated using a binary cross-entropy function, as follows:
\begin{equation}
\label{Loss_E1}
\mathcal{L}^+_{t} = \frac{1}{|E^+_{t}|} \sum_{(i,j) \in E^+_{t}} \log p_f(\mathbf{h}_{t,i}, \mathbf{h}_{t,j}),
\end{equation}  
\begin{equation}
\label{Loss_E2}
\mathcal{L}^-_{t} = \frac{1}{|E^-_{t}|} \sum_{(i',j') \in E^-_{t}} \log(1 -  p_f(\mathbf{h}_{t,i'}, \mathbf{h}_{t,j'})),
\end{equation}  
\begin{equation}
\label{Loss_E}
\mathcal{L}_{t}^{E} = - \left( \mathcal{L}^+_{t} + \mathcal{L}^-_{t} \right),
\end{equation}  
\normalsize
where the set of positive edges \(E^+_{t}\) corresponds to the edges present in the masked graph at time \(t\), while \(E^-_{t}\) is the set of negative edges, with the number of negative edges equal to the number of positive edges. $p_f(\mathbf{h}_{t,i}, \mathbf{h}_{t,j})$ computes the probability that there is an edge between nodes $i$ and $j$, based on their embeddings $\mathbf{h}_{t,i}$ and $\mathbf{h}_{t,j}$ at time $t$. By minimizing \(\mathcal{L}_{t}^{E}\), the method learns to reconstruct the masked edges, resulting in better DLP performance.

The second loss function, adjacency matrix reconstruction loss \(\mathcal{L}_{t}^{A}\), is designed to recover the original graph connections from the latent representations derived from the unmasked parts of the graph. The reconstruction error for the graph structure at time \(t\) is defined as:
\begin{equation}
\label{Loss_a1}
    \mathcal{L}_{t}^{A} = \| \mathbf{A}_{t} -\tilde{\mathbf{A}}_{t,\text{pre}}\|_F^2,
\end{equation}
where \(\tilde{\mathbf{A}}_{t,\text{pre}}\) represents the predicted adjacency matrix at time \(t\), decoded from $\mathbf{H}_t$ through two MLP layers, and \(\mathbf{A}_t\) is the ground-truth adjacency matrix. Since the elements of \(\mathbf{A}_t\) are predominantly zeros due to the graph's sparsity, we adopt the following loss to improve the method's reconstruction ability \citep{10.5555/3042573.3042584}:
\begin{equation}
\label{Loss_A}
    \mathcal{L}_{t}^{A} = \| \mathbf{A}_{t} - \tilde{\mathbf{A}}_{t,\text{pre}} \|_F^2 
+ \gamma \| \tilde{\mathbf{A}}_{t,\text{pre}} \|_1 
+ \delta \| \tilde{\mathbf{A}}_{t,\text{pre}} \|_*,
\end{equation}
\normalsize
where the term \(\| \tilde{\mathbf{A}}_{t,\text{pre}} \|_1\) imposes sparsity on the predicted matrix, while \(\| \tilde{\mathbf{A}}_{t,\text{pre}} \|_*\) is the nuclear norm that encourages a low-rank structure. The weights \(\gamma\) and \(\delta\) control the importance of the sparsity and low-rank constraints, respectively.

By optimizing both loss functions, DyGMAE improves its ability to reconstruct the masked edges and to capture the connectivity among all nodes across several unmasked views. However, as the representations of different views are learned separately, there exists a challenge in effectively integrating them. 

\noindent\textbf{Alignment Module With Contrastive Learning.}  
After obtaining representations from multiple unmasked graph views, \(\{\mathbf{H}_{t}^{1}, \mathbf{H}_{t}^{2}, \ldots, \mathbf{H}_{t}^{K}\}\), we apply contrastive learning to align these latent representations, preserving complex features. This contrastive alignment enhances the consistency of the learned features and boosts the integration of local and global information across different views.

We adopt the following contrastive loss on two views to enforce alignment between representations of the same node across different unmasked views at time \(t\):
\begin{equation}
\label{Loss_c1}
\mathcal{L}_{t}^{C} = - \frac{1}{N} \sum_{i=1}^N \log \frac{\exp\left(\text{sim}(\mathbf{h}_{t,i}^{1}, \mathbf{h}_{t,i}^{2}) / \tau\right)}{\sum_{j=1}^N \exp\left(\text{sim}(\mathbf{h}_{t,i}^{1}, \mathbf{h}_{t,j}^{2}) / \tau\right)},
\end{equation}  
where $\mathbf{h}_{t,i}^{1}$ and $\mathbf{h}_{t,i}^{2}$ stand for the latent representations corresponding to the same node $i$ at time \(t\) within two separate unmasked views, \(\text{sim}(\cdot, \cdot)\) denotes a similarity function (e.g., cosine similarity), and \(\tau\) is a temperature parameter that controls the sharpness of the distribution.

\noindent\textbf{Multi-Scale Masking Representation Fusion.}  
Although individual representations \(\{\mathbf{H}_{t}^{1}, \mathbf{H}_{t}^{2}, \ldots, \mathbf{H}_{t}^{K}\}\) generated from different unmasked views capture distinct structural and evolving features, they fail to fully represent the comprehensive characteristics of the graph. To address this, we introduce a Multi-Scale Masking Representation Fusion (MSMRF) to aggregate the latent representations derived from multiple unmasked views, combining information across different perspectives. This fusion process enhances the method’s ability to learn multi-level features, including both global and fine-grained structural patterns and temporal dependencies.
The fused representation of the \(t\)-th snapshot is computed as:
\begin{equation}
\label{aggregate_1}
\mathbf{H}_t = \text{Aggregate}(\mathbf{H}_t^{1}, \mathbf{H}_t^{2}, \ldots, \mathbf{H}_t^{K}),
\end{equation}
where \(\text{Aggregate}(\cdot)\) denotes the aggregation function, which can be implemented through a variety of approaches. These include calculating the mean, performing summation, using the max operation, and applying attention mechanisms.


\subsection{Temporal Dependency Modeling with GRU}  
Capturing temporal dependencies is crucial for DLP, as the relationships between nodes evolve over time. In dynamic graphs, the probability of a node forming a link with another node at the current snapshot depends not only on the current graph structure but also on the temporal evolution of node interactions, which is influenced by previous snapshots. To capture these evolving dependencies, we employ a GRU \citep{cho2014learning}, a model well-suited for sequential data processing, enabling the effective capture of temporal dependencies in the graph. Specifically, the current latent representation \(\mathbf{H}_t\), generated by the encoder, and the final representation \(\mathbf{Z}_{t-1}\) from the previous time step are combined as inputs to compute the final representation \(\mathbf{Z}_t\) at time \(t\). For the initial step (\(t=0\)), \(\mathbf{Z}_0\) is initialized randomly. The GRU updates are defined as follows:
\begin{align}
\label{gru}
\mathbf{Q}_t &= \sigma_2(\mathbf{W}_q \mathbf{H}_t + \mathbf{U}_q \mathbf{Z}_{t-1}), \\ 
\mathbf{R}_t &= \sigma_2(\mathbf{W}_r \mathbf{H}_t + \mathbf{U}_r \mathbf{Z}_{t-1}), \\ 
\hat{\mathbf{Z}}_t &= \phi(\mathbf{W}_h \mathbf{H}_t + \mathbf{U}_h (\mathbf{R}_t \odot \mathbf{Z}_{t-1})), \\ 
\mathbf{Z}_t &= (1 - \mathbf{Q}_t) \odot \mathbf{Z}_{t-1} + \mathbf{Q}_t \odot \hat{\mathbf{Z}}_t,
\end{align}
where \(\mathbf{Q}_t\) and \(\mathbf{R}_t\) denote the update gate and reset gate, respectively, and \(\hat{\mathbf{Z}}_t\) is the candidate state. Here, \(\sigma_2(\cdot)\) represents the sigmoid activation function, \(\phi(\cdot)\) denotes the hyperbolic tangent function (\(\tanh\)), and \(\odot\) is the element-wise product. The parameters \(\mathbf{W}_q, \mathbf{W}_r, \mathbf{W}_h\) and \(\mathbf{U}_q, \mathbf{U}_r, \mathbf{U}_h\) are learnable weight matrices. 

\subsection{Decoder for DLP}
After obtaining the representations \(\mathbf{Z}_t\) for each time step, these representations are used to predict the existence of edges in the dynamic graph at the next time step \(t+1\) for the DLP task. We utilize the dot product as the decoder. The decoder operation is defined as follows:
\begin{equation}
    \hat{\mathbf{A}}_{t+1} = f_\text{Dec}(\mathbf{Z}_t),
\end{equation}
where \(\hat{\mathbf{A}}_{t+1}\) is the predicted adjacency matrix for time step \(t+1\), and a higher value in \(\hat{\mathbf{A}}_{t + 1}\) indicates a higher probability of an edge between nodes.

\subsection{Final Objective}  
DyGMAE integrates three key components for its overall objective: edge reconstruction loss, adjacency matrix reconstruction loss, and contrastive loss. The overall loss function for a single snapshot \(t\), with a weighting coefficient \(\lambda\) on the contrastive term, is defined as:
\begin{equation} 
\label{loss_all_t}
    \mathcal{L}_t = \mathcal{L}_t^E + \mathcal{L}_t^A + \lambda \mathcal{L}_t^C.
\end{equation}

Extending this to the entire dynamic graph, the total loss is aggregated across all snapshots \(t \in \{1, 2, \dots, T\}\) and all masked views \(k \in \{1, 2, \dots, K\}\), and is expressed as:
\begin{equation}
\label{loss_all}
    \mathcal{L} = \sum_{t = 1}^T \left(\sum_{k = 1}^K \left(\mathcal{L}_{t}^{Ek} + \mathcal{L}_{t}^{Ak}\right) + \lambda \mathcal{L}_{t}^{C}\right),
\end{equation}
where \(\mathcal{L}_{t}^{Ek}\) represents the edge reconstruction loss and \(\mathcal{L}_{t}^{Ak}\) is the adjacency matrix reconstruction loss at the \(k\)-th view of snapshot \(t\). The weighting coefficient \(\lambda\) controls the relative importance of the contrastive loss.

By integrating these components across every snapshot and view, our method successfully captures the structural patterns and temporal dependencies in both local and global aspects. The full algorithm and complexity analysis of DyGMAE are presented in Appendix~\ref{sec:algorithm}. 


\section{Experiments}
\label{sec:Experiment}
In this section, we conduct experiments on five real-world dynamic graph datasets to verify the effectiveness of our proposed DyGMAE method.
\input{table/LP}
\input{table/NLP}

\subsection{Experimental Settings}

\noindent{\textbf{Datasets.}}  
We conduct experiments on five dynamic graph datasets, including Enron \citep{benson2018simplicial}, DBLP, Facebook \citep{hajiramezanali2019variational}, Email \citep{gao2022novel}, and AS733 \citep{yang2021discrete}. Detailed information about these datasets is provided in Appendix~\ref{sec:datasets}.

\noindent{\textbf{Baselines.}}  
The baselines include static autoencoder methods such as GAE and VGAE \citep{kipf2016variational}, as well as dynamic autoencoder methods like DynAE, DynRNN, DynAERNN \citep{goyal2020dyngraph2vec}, and VGRNN \citep{hajiramezanali2019variational}. Additionally, we compare with advanced methods like EvolveGCN \citep{pareja2020evolvegcn}, DGCN \citep{gao2022novel}, DySAT \citep{dysat_deep}, HTGN \citep{yang2021discrete}, and HGWaveNet \citep{bai2023hgwavenet}. More details about baselines are provided in Appendix~\ref{sec:baselines}.

\noindent{\textbf{Evaluation Tasks and Metrics.}}  
To evaluate the effectiveness of our method, we conduct experiments on two link prediction tasks: DLP and DNLP here. The evaluation measures the ability of the methods to distinguish true from false edges by calculating Average Precision (AP) and Area Under the Receiver Operating Characteristic Curve (AUC) scores. In this setup, all known edges in the test snapshots are treated as positive samples (true links), while an equal number of non-existent edges are sampled as negative samples (false links).
 
\subsection{Implementation Details}  
We conduct all experiments following the same settings as in \citet{hajiramezanali2019variational}. The training process uses the snapshots from the training set, while the performance of all methods is evaluated on the test snapshots for DLP and DNLP. During training, the training snapshots are masked, while the test snapshots use the original unmasked graph data. For a fair comparison, each experiment is run five times with different random seeds to minimize the impact of randomness, and the results are reported as averages with standard deviations. The other implementation details can be found in Appendix~\ref{sec:implementation_details}.


\subsection{Experimental Results}
The results of DLP and DNLP are presented in Table~\ref{tab:LP} and Table~\ref{tab:NLP}, respectively. For the test sets, both the average values and standard deviations are reported. Certain results are based on findings from previously published papers. We also conduct experiments on multi-scale masking representation fusion type analysis as well as multi-step dynamic link prediction, which are presented in Appendix~\ref{sec:msrf} and Appendix~\ref{sec:msdlp}, respectively.

\noindent{\textbf{Dynamic Link Prediction.}}
Results of DLP are shown in Table~\ref{tab:LP}. As we can see, DyGMAE consistently outperforms state-of-the-art methods in DLP, achieving higher AUC and AP scores across various real-world dynamic graph datasets. Specifically, on the Facebook dataset, DyGMAE achieves improvements of 3.88\% and 4.41\% in AUC and AP, respectively, compared with the second-best method VGRNN. Although HTGN and HGWaveNet achieve good performance in the hyperbolic space, they can only capture the local information, which limits their performance. Compared to DGCN, which can capture global information, we achieve significant improvements across all five datasets. The strong performance of DyGMAE indicates its superiority in capturing both local and global structural patterns and temporal dependencies. Compared with other autoencoders, including the static autoencoders like GAE and VGAE, as well as the dynamic autoencoders such as DynAE, DynRNN, DynAERNN, and VGRNN, our approach yields remarkable improvements. This outcome not only validates the feasibility of the GMAE framework in the DLP task but also confirms the rationality of our initial inspiration.

\noindent{\textbf{Dynamic New Link Prediction.}}
As shown in Table~\ref{tab:NLP}, in the task of DNLP, which demands that all methods should have enhanced inductive capabilities and acquire more global information, DyGMAE consistently outperforms other methods. Especially, on the Facebook dataset, DyGMAE achieves a significant improvement of 3.25\% in AUC and 4.51\% in AP compared to the second-best method DySAT. This notable gap indicates that DyGMAE has a stronger ability to capture the potential patterns and changes in the social graph structure within this dataset. Moreover, when looking at other datasets, DyGMAE also demonstrates superior generalization. This outstanding performance is attributed to the simultaneous capture of local and global information, which enhances the ability to model changes and enables excellent generalization across diverse scenarios.

\begin{figure*}[htbp]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/ablation_study_results_facebook.pdf}
        \caption{Facebook}
        \label{fig:Facebook_ablation}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/ablation_study_results_email.pdf}
        \caption{Email}
        \label{fig:email_ablation}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/ablation_study_results_as733.pdf}
        \caption{AS733}
        \label{fig:as733_ablation}
    \end{subfigure}
   \caption{Ablation study results in Facebook, Email, and AS733 datasets. AUC and AP refer to DLP, while New\_AUC and New\_AP correspond to DNLP.}
    \label{fig:ablation_combined}
\end{figure*}


\begin{figure*}[htbp]
    \centering
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/parameter_results_facebook.pdf}
        \caption{Facebook}
        \label{fig:Facebook_parameter}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/parameter_results_email.pdf}
        \caption{Email}
        \label{fig:email_parameter}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.33\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/parameter_results_as733.pdf}
        \caption{AS733}
        \label{fig:as733_parameter}
    \end{subfigure}
    \caption{Parameter study results in Facebook, Email, and AS733 datasets. AUC and AP refer to DLP, while New\_AUC and New\_AP correspond to DNLP.}
    \label{fig:parameter_combined}
\end{figure*}


\subsection{Ablation Study}
To better understand the contributions of different modules to the performance improvements of DyGMAE, we conduct an ablation study by removing key components. Specifically, we analyze two variants:
\textbf{w/o MSMS:} This variant removes the MSMS and replaces it with a single masking strategy. \textbf{w/o RA:} In this variant, we eliminate the multi-scale masking representation alignment module, leaving only multi-scale masking without representation alignment in MSMS. We conduct experiments on three datasets: Facebook, Email, and AS733, with the results shown in Figure~\ref{fig:ablation_combined}. Overall, DyGMAE achieves the best performance when all components are included, while removing either the MSMS module or the contrastive alignment leads to significant performance drops across all metrics.

Regarding the impact of removing MSMS, on Facebook, removing MSMS results in a decrease of 0.48\% in AUC, 1.41\% in AP, 0.54\% in New\_AUC, and 1.52\% in New\_AP. For Email, which has a long snapshot sequence in which dynamic dependencies are difficult to capture, MSMS helps alleviate this issue: removing MSMS leads to drops of 4.18\% in AUC, 3.25\% in AP, 5.4\% in New\_AUC, and 4.23\% in New\_AP. On AS733, the improvement is most significant for DNLP: MSMS brings increases of 3.38\% in AUC, 2.76\% in AP, 9.91\% in New\_AUC, and 8.34\% in New\_AP. Given the long snapshot sequences and test snapshots of these datasets, MSMS is key for capturing long-term dependencies and ensuring good generalization. The ablation results further confirm the validity of our motivation that GMAE benefits DLP and our design related to MSMS.

Furthermore, removing the multi-scale masking representation alignment module (w/o RA) also degrades performance. In the Facebook, Email, and AS733 datasets, the drops in AUC are 0.07\%, 2.30\%, and 1.95\% respectively; in AP, 0.31\%, 1.81\%, and 1.77\%; in New\_AUC, 0.16\%, 2.50\%, and 1.16\%; and in New\_AP, 0.49\%, 1.79\%, and 2.32\%. Interestingly, the performance of the w/o RA variant is better than the w/o MSMS variant in all three datasets, but still shows a clear gap when compared to the full DyGMAE. This demonstrates the effectiveness of the representation alignment in extracting more meaningful and comprehensive representations, especially in larger graph datasets where it is more challenging to acquire local and global information. Additionally, in all three datasets, DyGMAE exhibits smaller variance compared to the other two variants, validating its robustness. 

Overall, these findings underline the crucial role of both multi-scale masking and contrastive learning in enhancing the discriminative power and generalization ability of the learned representations.


\subsection{Hyper-parameter Sensitivity Analysis}
We perform parameter analysis experiments on multiple datasets to examine the sensitivity of the hyperparameter \(\lambda\), which controls the contribution of the contrastive loss objective for aligning the representations across different masked views in MSMS. The results, shown in Figure~\ref{fig:parameter_combined}, indicate that the sensitivity of \(\lambda\) varies across different real-world dynamic graph datasets for both DLP and DNLP tasks.

When \(\lambda\) is too small, DyGMAE fails to align the representations effectively, causing the loss of complex features and comprehensive representations, which results in a performance decline in both tasks. As \(\lambda\) increases, the alignment between representations becomes more pronounced, leading to improved consistency in representations across different views. However, when \(\lambda\) exceeds a certain threshold, the method starts to over-align the representations, leading to the loss of important details and consequently decreasing performance. We observe that the best performance for the two tasks is achieved at different values of \(\lambda\), and that the shape of the performance curve as \(\lambda\) varies differs across datasets. In all cases, performance degrades when \(\lambda\) is either too small or too large, highlighting the sensitivity of the method to this hyperparameter.

\input{table/ablation_reconstruction}
\subsection{Effectiveness of Reconstruction Objectives}
To evaluate the impact of the reconstruction-based training objectives in DyGMAE, we perform ablation studies by selectively removing the edge-level reconstruction loss ($\mathcal{L}_t^E$) and the adjacency-level reconstruction loss ($\mathcal{L}_t^A$). We assess model performance on both standard and new link prediction settings across three datasets: Enron, Facebook, and Email.

The results are summarized as follows. We observe that removing either reconstruction objective leads to a consistent performance drop. Specifically, removing the edge-level loss ($\mathcal{L}_t^E$) causes moderate degradation, indicating that reconstructing individual edges is essential for capturing fine-grained local patterns. Removing the adjacency-level loss ($\mathcal{L}_t^A$) leads to a more pronounced performance decline, especially on the new link prediction metrics. This suggests that adjacency-level modeling provides richer context and structural awareness, which is particularly important for discovering new interactions. The performance gap is most evident on the Facebook and Email datasets, where removing $\mathcal{L}_t^A$ leads to drops of up to 5\%--7\% in AUC/AP on the new link prediction task. These findings reinforce that both $\mathcal{L}_t^E$ and $\mathcal{L}_t^A$ are crucial and complementary. Together with contrastive alignment, they enable DyGMAE to effectively model both local connectivity and global topology evolution, thereby significantly enhancing its generalization in dynamic link prediction.

\section{Conclusion}
\label{sec:Conclusion}
In this study, we propose DyGMAE, a novel dynamic graph masked autoencoder, specifically designed for dynamic link prediction. DyGMAE extends the GMAE framework to dynamic graphs and fully leverages the advantages of GMAE in link prediction. In DyGMAE, we propose a Multi-Scale Masking Strategy to learn structural and dynamic features. This strategy involves applying different masking strategies to the graph and attempting to reconstruct it. Within MSMS, a contrastive learning objective is incorporated to further align representations across masked views and enable the model to capture complex features. As a result, MSMS can effectively mitigate the information loss caused by random masking and capture the comprehensive representation of local and global information in dynamic graphs. Extensive experiments conducted on five real-world dynamic graph datasets demonstrate the superior performance of DyGMAE and validate the effectiveness of our proposed design. However, our method requires careful tuning of hyperparameters. Our future work plans to focus on adapting DyGMAE for other dynamic graph tasks and extending the GMAE framework to continuous-time dynamic graphs.

\begin{acknowledgements}This work was supported by the National Natural Science Foundation of China (62477016, 62377028, 62077045), the Guangdong Basic and Applied Basic Research Foundation (2024A1515011758, 2023B1515120064), and the Guangzhou Science and Technology Planning Project (202206030007, Nansha District: 2023ZD001, Development District: 2023GH01). \end{acknowledgements}


% References
\bibliography{uai2025-template}

\newpage
\onecolumn
\title{DyGMAE: A Novel Dynamic Graph Masked Autoencoder for Link Prediction\\(Supplementary Material)}
\maketitle

% This Supplementary Material should be submitted together with the main paper.

\appendix

\section{Algorithm and Complexity}
\label{sec:algorithm}
In this section, we will present a detailed account of the algorithm underlying our proposed DyGMAE method and conduct a comprehensive complexity analysis.
\subsection{Algorithm}
We summarize the details of our proposed method DyGMAE in Algorithm~\ref{alg:DyGMAE}. The inputs are the dynamic graph snapshots $\mathcal{G}$ and the parameters. During training, for each individual snapshot, we generate \(K\) masks in MSMS. For each masked view, the encoder generates representations. Subsequently, we calculate the edge reconstruction loss and the adjacency matrix reconstruction loss using Equations~\ref{Loss_E} and \ref{Loss_A}, respectively. For each snapshot, we also compute the contrastive loss according to Equation~\ref{Loss_c1}. Then, we aggregate the representations from all views and pass them through the GRU. After that, we calculate the overall loss to update the parameters. Finally, we output the predicted adjacency matrix for the next time step.
\begin{algorithm2e}[t]
	\caption{Dynamic Graph Masked Autoencoder (DyGMAE)} 
	\label{alg:DyGMAE}
	\SetAlgoLined 
	\LinesNumbered  
	\KwIn{Dynamic graph snapshots $\mathcal{G} = \{G_1, G_2, \ldots, G_T\}$; Number of masks \(K\); Set of masking strategies \(\{f_\text{M}^1, f_\text{M}^2, \ldots, f_\text{M}^K\}\); Weight coefficient \(\lambda\); Maximum iteration \(\mathit{epoch\_max}\)}  
	\KwOut{Predicted adjacency matrix $\hat{\mathbf{A}}_{T+1}$}

    \While{\(\mathit{epoch} \leq \mathit{epoch\_max}\)}{
        \For{each snapshot \(t \in \{1, 2, \ldots, T\}\)}{
            \For{each view \(k \in \{1, 2, \ldots, K\}\)}{
                Generate unmasked view \(G_t^{uk}\) using \(f_\text{M}^k\)\;
                Compute latent representation \(\mathbf{H}_{t}^{k}\) using the encoder and projector (Equations~\ref{GNN_layer} and \ref{Projector_mlp})\;
                Compute reconstruction losses \(\mathcal{L}_t^{Ek}\) and \(\mathcal{L}_t^{Ak}\) (Equations~\ref{Loss_E} and \ref{Loss_A})\;
            }
            
            Compute contrastive loss \(\mathcal{L}_t^C\) to align multi-scale masking representations (Equation~\ref{Loss_c1})\;

            Aggregate representations from all masked views to obtain \(\mathbf{H}_t\) (Equation~\ref{aggregate_1})\;

            Update temporal dependencies using GRU to compute \(\mathbf{Z}_t\) (Equation~\ref{gru})\;
        }

        Compute total loss \(\mathcal{L}\) (Equation~\ref{loss_all})\;
        Update parameters using gradient descent\;
    }
    \Return Predicted adjacency matrix $\hat{\mathbf{A}}_{T+1}$\;
\end{algorithm2e}

\subsection{Complexity Analysis}
We conduct a time-complexity analysis of DyGMAE. First, generating masked views for each snapshot \( t \) and each masking strategy \( k \) requires \( O(T \cdot K \cdot M) \), where \( T \) is the number of snapshots, \( K \) is the number of masking strategies, and \( M \) is the number of edges. Next, computing the latent representations \( \mathbf{H}_t^k \) using the encoder and projector involves a complexity of \( O(T \cdot K \cdot (L \cdot (N + M) \cdot d^2 + N \cdot d^2)) \), where \( L \) is the number of GNN layers, \( N \) is the number of nodes, and \( d \) is the feature dimension. All reconstruction losses \( \mathcal{L}_t^{Ek} \) and \( \mathcal{L}_t^{Ak} \) are computed with a complexity of \( O(T \cdot K \cdot N \cdot d) \). For the contrastive loss \( \mathcal{L}_t^C \), aligning multi-mask representations incurs a complexity of \( O(T \cdot K^2 \cdot N \cdot d) \). Aggregating representations from all masked views to obtain \( \mathbf{H}_t \) requires \( O(T \cdot K \cdot N \cdot d) \). Finally, updating temporal dependencies using GRU has a complexity of \( O(T \cdot N \cdot d^2) \). Overall, the dominant factors in the time complexity are the number of snapshots \( T \), the graph size \( N \) and \( M \), and the feature dimension \( d \), leading to an approximate complexity of \( O(T \cdot (N + M) \cdot d^2) \) when \( K \) and \( L \) are small constants.



\section{Datasets}
\label{sec:datasets}
\input{table/dataset}
We conduct experiments on five datasets, which vary in size and the length of their snapshots. Some datasets are large while others are small, and some have long-term snapshots whereas others have short-term ones. Table~\ref{tab:dataset} summarizes the key statistics of the datasets used in our experiments, providing an overview of their size and structural properties. The notation ``Train : Test'' represents the number of time steps in the training snapshots and test snapshots, respectively. The datasets include: Enron\footnote{\url{https://www.cs.cornell.edu/~arb/data/email-Enron/}}, an email communication dataset where edges represent emails exchanged between employees; DBLP\footnote{\url{https://github.com/VGraphRNN/VGRNN}}, an academic co-authorship network where nodes represent authors and edges represent collaborations; Facebook, a social communication network; Email\footnote{\url{http://networkrepository.com/dynamic.php}}, an email communication dataset; and AS733\footnote{\url{https://snap.stanford.edu/data/as-733.html}}, a graph representing autonomous systems (AS) of routers and their traffic exchanges via the Border Gateway Protocol.

\section{Baselines}
\label{sec:baselines}
\begin{itemize}
    \item GAE \citep{kipf2016variational}: GAE consists of an encoder and a decoder. The encoder learns to map the input graph data into a low-dimensional latent space, while the decoder reconstructs the graph data from the latent vectors. 
    \item VGAE \citep{kipf2016variational}: The VGAE combines the concepts of the VAE and the GCN, and it consists of an encoder and a decoder. The encoder uses the GCN to map the node features of a graph to a latent space and outputs the mean and variance of the latent variables to represent the distribution of the node features.
    \item DynAE \citep{goyal2020dyngraph2vec}: It is a dynamic autoencoder method extended from the static graph autoencoder. It processes the adjacency matrix information of multiple time steps through fully connected layers. It is suitable for capturing short-term dynamic patterns, but has limited ability to model long-term dependencies.
    \item DynRNN \citep{goyal2020dyngraph2vec}: It inputs the sequence of node adjacency vectors into the LSTM (Long Short-Term Memory) units and learns the evolutionary patterns across time steps through the memory gating mechanism. The LSTM also serves as the decoder.
    \item DynAERNN \citep{goyal2020dyngraph2vec}: DynAERNN uses a fully connected encoder to reduce the dimension of the adjacency matrix into a low-dimensional representation. Then, it inputs the low-dimensional sequence into the LSTM for temporal modeling. The decoder used is a fully connected layer. 
    \item VGRNN \citep{hajiramezanali2019variational}: It is a novel hierarchical variational model for representation learning on dynamic graphs. It extends the graph convolutional recurrent neural network (GCRN) to form a graph recurrent neural network (GRNN), then enhances it by introducing high-level latent random variables to create the variational graph recurrent neural network (VGRNN).
    \item EvolveGCN \citep{pareja2020evolvegcn}: EvolveGCN offers a solution for dynamic graphs by adapting GCN over time without relying on node embeddings. It uses an RNN to dynamically update GCN parameters, with two architectures explored for this process. 
    \item DGCN \citep{gao2022novel}: DGCN is a GCN-based dynamic graph representation learning method. It maximizes the mutual information between local node and global graph representations to capture snapshot-level global structure and uses LSTM to update GCN weight parameters across time steps. A new Dice similarity is proposed to guide the aggregation and better distinguish the importance of neighboring nodes.
    \item DySAT \citep{dysat_deep}: It employs two self-attention mechanisms, the structural attention block and the temporal attention block, to capture information from two dimensions: structural neighborhood and historical representations.
    \item HTGN \citep{yang2021discrete}: HTGN maps the representation space to hyperbolic space. It utilizes a Hyperbolic Graph Neural Network (HGNN) and a Hyperbolic Gated Recurrent Unit (HGRU) to obtain topological and dynamic information. Moreover, it introduces the Hyperbolic Temporal Contextual Self-Attention (HTA) module to focus on historical states and the Hyperbolic Temporal Consistency (HTC) module to ensure stability and generalization ability. 
    \item HGWaveNet \citep{bai2023hgwavenet}: HGWaveNet uses hyperbolic diffusion graph convolution (HDGC) to aggregate neighborhood information and hyperbolic dilated causal convolution (HDCC) to obtain historical state information.
\end{itemize}

\section{Implementation Details} 
\label{sec:implementation_details}
DyGMAE is implemented in Python 3.9, PyTorch 1.11 and executed on Intel Core i5-12490F CPUs and NVIDIA RTX 3070 GPUs. The detailed hyperparameters of our model for each dataset are presented in Table~\ref{tab:parameters}.
\input{table/parameters}


\section{Multi-Scale Masking Representation Fusion Type Analysis}
\label{sec:msrf}
In this section, we perform a comprehensive analysis of diverse fusion types within the Multi-Scale Masking Representation Fusion (MSMRF) introduced in Section~\ref{sec:Multi-Scale Masking Strategy (MSMS)}. To gauge the efficacy of MSMRF, we conducted experiments employing four distinct fusion strategies: mean, sum, max, and an attention-based mechanism. The outcomes of the DLP and DNLP tasks are presented in Table~\ref{tab:msrf_lp} and Table~\ref{tab:msrf_nlp}, respectively. 

The experimental results indicate that different datasets require different strategies to achieve optimal performance, and there are significant differences in the performance of different strategies on various datasets. The max strategy demonstrates good performance across multiple datasets and tasks. However, the performance of each strategy varies significantly among different datasets. This suggests that in practical applications, we need to select appropriate fusion strategies according to the characteristics of the datasets to improve the accuracy and reliability of predictions. 
% For example, in the Enron dataset, the max strategy achieves the best results, and there is a large gap between the max strategy and others like the sum strategy in the DNLP task. In the Facebook dataset, the mean and sum strategies have similar performance, but they differ greatly from the max and attention strategies.
\input{table/MSRF_LP}
\input{table/MSRF_NLP}

\section{Multi-Step Dynamic Link Prediction Results}
\label{sec:msdlp}
Multi-step dynamic link prediction aims to train a projection function \(f\) that, given a sequence of dynamic graph adjacency matrices of length \(l\), denoted as \(\{ \mathbf{A}_1, \mathbf{A}_2, \dots, \mathbf{A}_l \}\), maps this input snapshot sequence to the future adjacency matrices \(\{ \mathbf{A}_{l + 1}, \mathbf{A}_{l + 2}, \dots, \mathbf{A}_{l + k} \}\), i.e., \(\{ \mathbf{A}_{l + 1}, \mathbf{A}_{l + 2}, \dots, \mathbf{A}_{l + k} \} = f(\mathbf{A}_1, \mathbf{A}_2, \dots, \mathbf{A}_l)\). It must be pointed out that multi-step dynamic link prediction has no access to any of the test snapshots. In contrast, standard dynamic link prediction can see the snapshot at the time step immediately preceding the snapshot to be predicted, although the model parameters cannot be updated through training on the test set. Compared with DLP, multi-step dynamic link prediction requires methods to be more effective in capturing the dynamic evolution information of graph structures. We conducted multi-step dynamic link prediction and multi-step dynamic new link prediction experiments on three datasets and compared with several state-of-the-art methods. The experimental settings were the same as those in the DLP experiments. The results are presented in Table~\ref{tab:mdlp} and Table~\ref{tab:mdnlp}, respectively. As can be seen, our method achieves the best performance on these two tasks, demonstrating that DyGMAE can better capture the dynamic dependencies.
\input{table/MDLP}
\input{table/MDNLP}

\section{Related Work}
In this section, we systematically review the relevant works regarding DLP and provide a brief introduction to the graph masked autoencoder.
\label{sec:related_work}
\subsection{Dynamic Link Prediction}
The field of DLP has seen significant progress and attention recently and many related methods have been proposed to address this challenge. For example, EvolveGCN \citep{pareja2020evolvegcn} uses RNNs to update graph convolution network (GCN) weight parameters dynamically at each time step, modeling temporal changes in graph sequences. Building on this, ComGCN \citep{pham2021comgcn} captures both node-level and community-level structural and evolutionary dynamics to improve the DLP performance. HTGN \citep{yang2021discrete} and HGWaveNet \citep{bai2023hgwavenet} extend GCNs to hyperbolic space, better capturing the structural and temporal dependencies. HTNE \citep{zuo2018embedding} models the DLP task as the neighborhood formation sequences with multivariate Hawkes process \citep{lima2023hawkes}, and infers the current neighbor formation events. TDGNN \citep{qu2020continuous} devised a novel temporal aggregator to incorporate dynamic information into the message propagation process of GNN, and developed an effective static GNN extension TDGNN for DLP. Following the idea of TDGNN, GSNOP \citep{luo2023graph} integrates the neural ordinary differential equations, and it not only obtains better DLP performance but also effectively alleviates the sparsity issue of dynamic graphs. SSL offers a powerful approach to leverage the abundant unlabeled data available in dynamic graphs. Among SSL-based methods for dynamic graph learning, contrastive approaches dominate. DDGCL \citep{tian2021self} is the first self-supervised framework for dynamic graphs, extending contrastive learning by contrasting temporal views of the same node identity. Similarly, DySubC \citep{chen2023self} uses temporal subgraph contrastive learning to capture both structural and dynamic features while maximizing mutual information. 
Other SSL generative methods, such as VGRNN \citep{hajiramezanali2019variational}, combine variational autoencoders with RNNs to model time-evolving node representations, using probabilistic inference to capture temporal dependencies and model uncertainty. In addition, \citet{goyal2020dyngraph2vec} proposed three autoencoder methods (DynAE, DynRNN, and DynAERNN) to deal with DLP.


\subsection{Graph Masked Autoencoders}
Graph masked autoencoders have attracted significant attention in graph learning for their ability to leverage self-supervised signals through masking and reconstruction, enabling models to learn meaningful representations without requiring labeled data. GraphMAE \citep{hou2022graphmae}, the first GMAE-based method, focuses on masking and reconstructing node features, achieving notable performance improvements in node classification task. Building upon GraphMAE, to enhance the robustness, GraphMAE2 \citep{hou2023graphmae2} introduces the multi-view random re-mask decoding and latent representation prediction strategies. MaskGAE \citep{li2023s} extends this by corrupting edges and paths, reconstructing edge and degree information to capture structural features. StructMAE \citep{liu2024mask} refines the masking strategy by introducing a structure-guided approach, where nodes are scored based on structural significance and an easy-to-hard masking process gradually enhances structural awareness. Additionally, AUG-MAE \citep{wang2024rethinking} introduces adversarial masking and a uniformity regularizer to improve alignment and representation consistency. Other methods \citep{tian2023heterogeneous, ye2023graph, liu2024hi, luo2024masked} integrate GMAE with contrastive learning, heterogeneous graphs, and sequential recommendation tasks. However, these methods are designed for static graphs and cannot address both the structural patterns and temporal dependencies of dynamic graphs.

\end{document}
