%\documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{multirow} 
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{algorithm}
\usepackage{algorithmic}

% add by Duan
\usepackage{bm}

\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Improving Graph Contrastive Learning with Community Structure}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Xiang Chen}
\author[1,2]{Kun Yue}
\author[1,2]{\href{mailto:<duanl@ynu.edu.cn>?Subject=Your UAI 2025 paper "IGCL-CS"}{Liang Duan}{}}
\author[1,2]{Lixing Yu}
% Add affiliations after the authors
\affil[1]{%
    School of Information Science and Engineering\\
    Yunnan University\\
    Kunming, China
}

\affil[2]{%
    Yunnan Key Laboratory of Intelligent Systems and Computing\\
    Yunnan University\\
    Kunming, China
}

  
  \begin{document}
\maketitle

\begin{abstract}
    Graph contrastive learning (GCL) has demonstrated remarkable success in training graph neural networks (GNNs) by distinguishing positive and negative node pairs without human labeling. However, existing GCL methods often suffer from two limitations: the repetitive message-passing mechanism in GNNs and the quadratic computational complexity of exhaustive node pair sampling in the loss function. To address these issues, we propose an efficient and effective GCL framework that leverages community structure rather than relying on the intricate node-to-node adjacency information. Inspired by the concept of sparse low-rank approximation of graph diffusion matrices, our model delivers node messages to the corresponding communities instead of individual neighbors. By exploiting community structures, our method significantly improves GCL efficiency by reducing the number of node pairs needed for contrastive loss calculation. Furthermore, we theoretically prove that our model effectively captures essential structure information for downstream tasks. Extensive experiments conducted on real-world datasets illustrate that our method not only achieves state-of-the-art performance but also substantially reduces time and memory consumption compared with other GCL methods. Our code is available at \href{https://github.com/chenx-hi/IGCL-CS}{https://github.com/chenx-hi/IGCL-CS}.
\end{abstract}

% Keywords: Graph contrastive learning, Graph neural network, Community structure, Graph partition, Community contrastive loss

\section{Introduction}\label{sec:intro}
    Graph neural networks (GNNs) are essential for analyzing complex graph-structured data by learning effective node representations that capture rich structural information through message passing on the graph topology \citep{gat}. Most GNNs are trained in a semi-supervised manner, where their performance heavily depends on the availability of labeled nodes \citep{ggd}. However, obtaining these labels is often expensive and labor-intensive. To address this challenge, graph contrastive learning (GCL) has emerged as a successful method for training GNNs without requiring specific task labels, making it particularly useful in fields such as social network analysis and recommendation systems~\citep{gcl-survey}. 

    The core technology of GCL revolves around optimizing a contrastive loss that discriminates positive and negative node pairs to maximize feature consistency across augmented graph views~\citep{gca,gcl-uai}. While some GCL variants enhance performance by improving the quality and quantity of sampled node pairs and achieve results comparable to or even surpassing those of methods trained with ground truth labels~\citep{ncla,uai-multiview}, their scalability remains severely constrained by two inherent bottlenecks: (1) the intensive message passing in GNNs, and (2) the quadratic computational complexity of node pairs in contrastive loss. Several methods improve efficiency by simplifying the computation process of GCL, such as eliminating the need for negative node pairs in contrastive loss \citep{bgrl, sgcl}, or reducing the number of graphs that require encoding by GNNs \citep{sugrl}. Despite these improvements, the high computational cost of the message-passing mechanism remains a bottleneck. Other strategies aim to speed up GNNs by decoupling graph convolution and embedding transformation \citep{sgc} or performing message passing on subgraphs \citep{coarse-gnn}. However, these techniques are not directly applicable to unlabeled scenarios.


    Community structures, characterized by dense internal connections and sparse external connections, are prevalent in many real-world graphs \citep{se_toit}. This inherent property aligns well with the diffusion process on graphs \citep{community, coarse-gnn, structcomp}, and recent methods have demonstrated the effectiveness of utilizing community structures in GCL to improve downstream task performance \citep{gcool, csgcl}. However, these methods primarily focus on enhancing task performance and often overlook the scalability challenges associated with GCL. Generally, given a problem, simpler data structures can lead to simpler and more efficient algorithms. The partition matrix indicating node-community memberships offers a straightforward yet essential structure for graph representation~\citep{hpn}. This naturally raises the question: can we leverage community structures to simultaneously address the two scalability issues, while still maintaining high downstream task performance?
    

    
    For the computation consumption caused by GNNs, we utilize a community partition matrix instead of the original graph structure for message passing. Based on the dense internal connections within the community, we treat each community as a subgraph where internal nodes are interconnected. Thus, all nodes within a community share the same representation, referred to as the community centroid. This approach simplifies the message-passing process by allowing node features in a community to be aggregated at the community centroid instead of individual nodes, avoiding the issue of exponential growth in the number of nodes that need to be computed during message passing. 
    
 
    For the quadratic computational complexity of node pair similarity in contrastive loss, we propose an approach that effectively exploits both intra-community and inter-community information to reconstruct the loss. Specifically, our method encourages the embedding representations of community centroids to be similar to their internal nodes, and leverages the hierarchical structure of communities to implicitly model long-range dependencies between nodes in adjacent communities for capturing both basic (intra-community) and higher-level (inter-community) structural information. Since the number of communities is typically much smaller than the number of individual nodes, this approach significantly reduces the computational load required for calculating node pair similarities in GCL. In addition, our reconstructed loss facilitates the construction of positive and negative samples more effectively, avoiding the mislabeling of closely connected nodes as negative samples to improve the performance of downstream tasks.
    
    The main contributions are summarized as follows:
    \begin{itemize}
        \item We propose a simple and effective method to improve the scalability and performance of GCL by leveraging community structure instead of fine-grained adjacency information between nodes. 
        
        \item We provide theoretical analysis showing that our community structure-based loss can effectively capture the essential structural information and achieves good generalization performance.
        
        \item Extensive experiments on widely used benchmarks across different scales and homophily levels show that our method significantly reduces computational overhead while achieving the best performance. 
    \end{itemize}


\section{Preliminaries}
\textbf{Graph Neural Network.} Let $\mathcal{G} = (\mathcal{V}, \mathcal{E})$ denote a graph with $n$ nodes, where $\mathcal{V} = \{v_1, \cdots, v_n\}$ is the node set and $\mathcal{E} \subseteq \mathcal{V} \times \mathcal{V} $ is the edge set. The original graph structure can be represented by an adjacency matrix $\mathbf{A} \in \{0, 1\}^{n \times n}$, where $\mathbf{A}_{i,j} = 1$ if there is an edge~$(v_i,v_j) \in \mathcal{E}$, otherwise $\mathbf{A}_{i,j} = 0$. The node features are represented by a feature matrix $\mathbf{X} \in \mathbb{R}^{n \times h}$, where $\mathbf{x}_i \in \mathbf{X}$ is a $h$-dimensional feature vector of node $v_i$. Thus, the complete graph (i.e., graph structure together with node features) can be denoted as $G = (\mathbf{A}, \mathbf{X})$. For scenarios of isolating node features from the graph structure, we define a feature-only graph as $G^0 = (\mathbf{I}_n, \mathbf{X})$, where $\mathbf{I}_n$ is the $n \times n$ identity matrix indicating the absence of edge connections. 


In this paper, we focus on training a GNN encoder $f_\theta(G): \mathcal{G} \rightarrow \mathbb{R}^{n \times d}$ parameterized by $\theta$ in the absence of labeled data to generate node representations $\mathbf{v}_i = f_\theta(G)[v_i] \in \mathbb{R}^d$ optimized for downstream tasks (e.g., node classification). Specifically, a single-layer GNN~\citep{gcn} can be formulated as 
\begin{equation} \label{equ-gnn}
	f_\theta(G) = \sigma(\hat{\mathbf{A}}\mathbf{X}\mathbf{W})
\end{equation}
\noindent where $\sigma(\cdot)$ denotes a non-linear activation function, $\hat{\mathbf{A}}$ represents the symmetrically normalized adjacency matrix of $\mathbf{A}$, and $\mathbf{W} \in \mathbb{R}^{h \times d}$ is the learnable weight matrix corresponding to $\theta$. It means that the GNN leverages the graph structure by $\hat{\mathbf{A}}$ and the node features by $\mathbf{X}$ to produce effective node embeddings suitable for downstream applications.



\noindent \textbf{Graph Partition.} 
Let $\mathcal{P} = \{P_1, \cdots, P_m\}$ represent a partition of $G$, where each $P_j$ denotes a community within $G$ that preserves regional structure and clustering properties. For any $j \neq k$, we have $P_j \cap P_{k} = \emptyset $. A node $v_i \in P_j$ must satisfy the condition that its internal degree within $P_j$ exceeds its external degree. We use $\mathbf{P} \in \{0, 1\}^{n \times m} $ to denote the partition matrix corresponding to $\mathcal{P}$, where $\mathbf{P}_{i,j} = 1$ if node $v_i \in P_j$, and $\mathbf{P}_{i,j} = 0$ otherwise. The normalized partition matrix is denoted as $\hat{\mathbf{P}}$. The adjacency matrix of the community-level graph can be constructed by $\mathbf{P}^\mathrm{T}\mathbf{A}\mathbf{P}$, where each entry indicates the connections between communities. This formulation enables the analysis of higher-level interactions among communities, instead of focusing on individual nodes.



\noindent \textbf{Graph Contrastive Learning.}
The typical GCL framework consists of a shared GNN encoder $f_\theta$, a MLP projection head $g_\varphi$, and a graph contrastive loss $\mathcal{L}_{gc}$. Initially, GCL generates two augmented views $G^1$ and $G^2$ by applying random perturbations to the input graph $G$, such as DropEdge and feature masking~\citep{mvgrl}. These augmented views are then fed into $f_\theta$ to obtain node representations. During the training phase, the projection head $g_\varphi$ maps the node representations from both views into a common embedding space for contrastive learning. The contrastive loss $\mathcal{L}_{gc}$ is designed to pull together the representations of the same node from different views (i.e., positive node pairs) while pushing apart the representations of different nodes (i.e., negative node pairs). Formally, $\mathcal{L}_{gc}$ can be written as
\begin{equation} \label{equ-gcl}
	\mathcal{L}_{gc} = - \frac{1}{n} \sum_{v_i \in \mathcal{V}} \log{\frac{\exp{(g_\varphi(\mathbf{v}_i^1)^\mathrm{T} g_\varphi(\mathbf{v}_i^2)/\tau)}}{\sum_{v_j \in \mathcal{V}^-} \exp{(g_\varphi(\mathbf{v}_i^1)^\mathrm{T} g_\varphi(\mathbf{v}_j)/\tau)}}}
\end{equation}
where $\mathbf{v}_i^1 = f_\theta(G^1)[v_i]$ and $\mathbf{v}_i^2 = f_\theta(G^2)[v_i]$ are the embedding representations of the same node $v_i$ in the two augmented views, $\mathcal{V}^-$ denotes the set of negative node pairs from the two views~\citep{gca}, and $\tau > 0$ is the temperature parameter. In addition, given the graph homophily assumption, positive node pairs can be extended from the two augmented views to the neighbors of node $v_i$. An illustration of this scheme is presented in Figure~\ref{fig-gcl}.

\begin{figure}
   \centering
   \centerline{\includegraphics[width=0.92\columnwidth]{fig-gcl.pdf}}
   \caption{An Example of Contrastive Scheme.}
   \label{fig-gcl}
\end{figure}

\begin{figure*}[!htb]
	\centering
	\includegraphics[width=1.0\textwidth]{fig-framework.pdf} 
	\caption{The Framework of Our Proposed GCL Method.}
	\label{fig:framework}
\end{figure*}

\section{Methodology}
In this section, we present the technical details of our method and provide a theoretical analysis to ensure its effective application to downstream tasks.


\subsection{Community Contrastive Learning}
The basic idea behind our method is to encourage the representations of community centroids to be similar to those of their internal nodes, while pushing dissimilar community centroids apart. As illustrated in Figure~\ref{fig:framework}, our method does not utilize a GNN encoder during the training phase, thereby reducing the computational overhead associated with generating negative sample pairs. This simplified GCL framework significantly decreases the computational resources required for model training.



\subsubsection{Partition Convolutional Network}
Conventional GCL methods suffer from prohibitive complexity due to the exponential growth of the number of nodes that need to be computed in layer-wise message passing and the redundant computations across augmented views. Prior attempts to reduce graph instances processed by GNNs~\citep{sugrl} have not addressed the root complexity bottleneck, i.e., dense message passing. We replace the adjacency matrix with a sparse partition matrix, which enables a low-rank approximation of the $k$-step diffusion matrix:  $\hat{\mathbf{A}}^k \approx \mathbf{P}\hat{\mathbf{P}}^\mathrm{T}$~\citep{gs,structcomp}. Then, the GNN encoder in Eq.~\ref{equ-gnn} simplifies to:
\begin{equation} \label{equ-pcn}
	f_\theta(G) = \sigma(\hat{\mathbf{A}}\mathbf{X}\mathbf{W}) \approx  \sigma(\mathbf{P}\hat{\mathbf{P}}^\mathrm{T}\mathbf{X}\mathbf{W}) = f_\theta(\mathcal{P})
\end{equation}
where $\hat{\mathbf{P}}^\mathrm{T}\mathbf{X}\mathbf{W}$ denotes the community centroid representations $\mathbf{C} \in \mathbb{R}^{m \times d}$. We refer to Eq.~\ref{equ-pcn} as a single-layer Partition Convolutional Network (PCN), which performs message passing exclusively within each community. 

In practice, we use the fast graph partitioning algorithm METIS~\citep{metis} to generate $\mathbf{P}$. The sparsity of the partition matrix $\mathbf{P}$ ensures that PCN requires fewer computational resources, making it particularly efficient for large-scale graphs. Moreover, by focusing on message passing within communities, PCN can effectively exchange information among nodes within the same community without unnecessary interactions across communities. Our experimental results further show that even with a single-layer architecture, PCN achieves high accuracy while significantly reducing computational costs.


\subsubsection{Community Contrastive Loss}
The success of existing GCL methods lies in emphasizing similarities in the neighborhood representations of the same node across different augmented views \citep{gca, ncla}. Inspired by this, we leverage the dense connections within communities to strengthen the learning of neighborhood similarities. Building on the principle of PCN, which focuses on message passing in communities to capture local structural information, we make node representations within the same community more similar, ensuring that intra-community nodes have consistent and closely aligned embeddings. Moreover, to fully extract hierarchical structure information, we also encourage closer proximity between neighboring community centroids. 


%The state-of-the-art GCL methods emphasize similarities in the neighborhood representations of the same node in different augmented views~\citep{gca,ncla}. In contrast, we also hope that our method can learn neighborhood similarity to ensure the performance of downstream tasks. As a potential structure of graph, the community is characterized by a denser connection between nodes \citep{community,se_toit}. Therefore, another nature idea of capturing the neighborhood signal is to make node representations in the same community similar, which is consistent with PCN.

%\textbf{Community Reconstruction Loss.}
\textbf{Intra-community Reconstruction Loss.} We utilize the community centroid representations to reconstruct the original features of internal nodes, leveraging the fact that nodes within a community share the same centroid representation. This approach naturally reduces the distance between the internal node representations without needing to compute all pairwise distances within the same community. Formally, the intra-community reconstruction loss is defined as
\begin{equation} \label{lcr}
	\mathcal{L}_{cr} = \frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert g_\varphi(\mathbf{c}_i) - \mathbf{v}_j \rVert_2^2
\end{equation}
where $\mathbf{c}_i = f_\theta(\mathcal{P})[P_i]$ is the centroid representation of $P_i$ and $\mathbf{v}_j = f_\xi(G^0)[v_j]$ is the feature-only representation of $v_j$. The term $\lvert P_i \rvert$ is the number of nodes in $P_i$ and $\lVert \cdot \rVert_2$ denotes the L2 norm. Minimizing $\mathcal{L}_{cr}$ ensures the embeddings of nodes within the same community are closely aligned with their community centroids. 



%\textbf{Community Neighbor Loss.}

\textbf{Inter-community Neighborhood Loss.} Note that adjacent communities tend to merge into larger communities, forming hierarchical structures, a phenomenon widely observed in real-world graph data \citep{se_toit}. To incorporate this hierarchical structure information and address the limitations of $\mathcal{L}_{cr}$, where nodes are adjacent but do not belong to the same community, we introduce the inter-community neighborhood loss, defined as
\begin{equation} \label{lcn}
	\mathcal{L}_{cn} = \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \lVert \mathbf{c}_i - \mathbf{c}_k \rVert_2^2
\end{equation}
where $\mathbf{c}_k = f_\theta(\mathcal{P})[P_k]$, and $\mathcal{N}(P)$ represents the neighbors of $P$, which can be obtained from the community adjacency matrix $\mathbf{P}^\mathrm{T}\mathbf{A}\mathbf{P}$. Minimizing $\mathcal{L}_{cn}$ ensures that neighboring communities have similar centroid representations, effectively capturing the hierarchical structure information. 


\textbf{Community Uniformity Regularization Loss. } Although $\mathcal{L}_{cr}$ and $\mathcal{L}_{cn}$ provide a framework for combining intra-community similarity and inter-community hierarchy, these losses alone may lead to collapsed representations~\citep{bgrl}. In such a scenario, all node representations degenerate to the same vector on the hyperplane, minimizing the loss but rendering the model ineffective for downstream tasks. To address this issue, we introduce the community uniformity regularization loss to further enhance the representation diversity, defined as 
\begin{equation}  \label{lcu}
	\mathcal{L}_{cur} = - \frac{1}{m^2} \sum_{P_i, P_t \in \mathcal{P}} \lVert \mathbf{c}_i - \mathbf{c}_t \rVert^2_2
\end{equation}
where $P_t$ denotes any community distinct from $P_i$ (i.e., $i \neq t$), and $\mathbf{c}_t$ is the representation of $P_t$. By maximizing the distances between different community centroids, this loss encourages diversity in the learned representations.


\textbf{Overall Loss.} Directly jointly optimizing the three losses can result in an overall loss $\mathcal{L}_{all} = \mathcal{L}_{cr} + \mathcal{L}_{cn} + \mathcal{L}_{cur}$. However, minimizing $\mathcal{L}_{all}$ is not a suitable optimization objective because $\mathcal{L}_{cur}$ can cause $\mathcal{L}_{all}$ to become negative, leading to abnormal training. Although scaling $\mathcal{L}_{cur}$ via a hyperparameter can mitigate this issue, it increases the complexity and time required to find optimal model parameters. Therefore, we integrate the three losses and derive an upper bound representation as the overall loss:
\begin{equation}\label{loss}
     \mathcal{L}_\mathcal{P} \leq - \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{ \lvert \mathcal{N}(P_i) \rvert } \sum_{P_k \in \mathcal{N}(P_i)} \log \ell (P_i)
\end{equation}
where $\ell (P_i) =$
\begin{equation*}
        \frac{\exp ({ \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j + \alpha \mathbf{c}_i^\mathrm{T}\mathbf{c}_k)}}
        {\exp (\frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j) + \sum_{P_t \in \mathcal{P}} \exp{\mathbf{c}_i^\mathrm{T}\mathbf{c}_t}}, \label{loss-lv}
\end{equation*}
and $\alpha$ controls the influence of the neighboring communities. The larger $\alpha$ is, the more the model focuses on the global information of $G$. For reading simplicity, we omit the temperature parameter $\tau$ in Eq.~\ref{loss}; its specific value can be found in Appendix~\ref{app-para}. The derivation of the overall loss is outlined below, with more details available in Appendix~\ref{app-loss}.
\begin{proof}
Since vectors in contrastive losses are usually normalized, we have
\begin{equation}\label{lcr1}
        \min \mathcal{L}_{cr} \Leftrightarrow \min -\frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\varphi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j
\end{equation}
\begin{equation}\label{lcn1}
    \begin{split}
        \min \mathcal{L}_{cn} &\Leftrightarrow \min - \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \textbf{c}_i^\mathrm{T} \textbf{c}_k \\
    \end{split}
\end{equation}  
\begin{equation}\label{lcu1}
        \min \mathcal{L}_{cur} \Leftrightarrow \min \frac{1}{m^2} \sum_{P_i, P_t \in \mathcal{P}} \textbf{c}_i^\mathrm{T}\textbf{c}_t
\end{equation}
Let $\mathcal{L}_{\mathcal{P}}$ be the sum of Eq.~\ref{lcr1}, Eq.~\ref{lcn1} and Eq.~\ref{lcu1}, and define $\mathbf{B} =  \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j + \mathbf{c}_i^\mathrm{T}\mathbf{c}_k$. Then, according to Jensen's inequality, we have
\begin{equation*}
        \begin{split}
             &\mathcal{L}_{\mathcal{P}} \stackrel{\text{c}}{=} -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} (\textbf{B} - \frac{1}{m} \sum_{P_t \in \mathcal{P}} \textbf{c}_i^\mathrm{T}\textbf{c}_t)  \\       
            &\leq -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} (\textbf{B} -  \log \sum_{P_t \in \mathcal{P}} \frac{\exp \textbf{c}_i^\mathrm{T}\textbf{c}_t}{m} ) \\
            % &\leq -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} (\textbf{B} -  \log \sum_{P_t \in \mathcal{P}} \exp \textbf{c}_i^\mathrm{T}\textbf{c}_t ) \\            
             &\leq -\frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} [ \log \exp\textbf{B} ~~-\\
             &~~~~~~~~~~~~~\log{ (\exp{ (\frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j) }  + \sum_{P_t \in \mathcal{P}} \exp {\textbf{c}_i^\mathrm{T}\textbf{c}_t })} ] \\
             & = -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \log \ell (P_i)
        \end{split}
\end{equation*}
where $\stackrel{\text{c}}{=}$ denotes that the two sides are equivalent with respect to minimization. Since $\min \mathcal{L}_{all} \Leftrightarrow \min \mathcal{L}_{\mathcal{P}}$, we derive the upper bound of the overall loss, as shown in Eq.~\ref{loss}. 
\end{proof}


\subsubsection{Model Training}
Once the training is completed, we retain the model parameters learned in PCN and replace the partition convolutional operator with a graph convolutional operator, i.e., $\sigma(\hat{\mathbf{A}}^k\mathbf{X}\mathbf{W})$ to obtain node representations for downstream tasks. In this way, we avoid complex calculations during the training process and also use the graph convolutional operator to improve model performance, which we have verified in experiments. Note that the weight parameters $\xi$ of the MLP encoder are updated via the exponential moving average (EMA) of the PCN encoder weights $\theta$ (i.e., $\xi \leftarrow w\theta+(1-w)\xi$, where $w \in [0, 1]$ is the target decay rate). This is a common method in contrastive learning~\citep{bgrl}. The overall procedure of our model is illustrated in Algorithm~\ref{alg-train}. 

Given a graph $G$ with $n$ nodes and $m$ communities, the time complexity of our method primarily arises from PCN and contrastive loss $\mathcal{L}_\mathcal{P}$. In the former, aggregating node representations to community centroids takes $O(nd)$, where $d$ is the representation dimension. In the latter, the comparison between centroids and internal nodes is $O(nd)$, and the comparison between communities is $O(m^2d)$. Since $m \ll n$, we can complete the computations in linear time.


\begin{algorithm}
	\caption{Model Training}
	\label{alg-train}
	\textbf{Input}: a graph $G = (\mathbf{A}, \mathbf{X})$\\
	\textbf{Parameter}: number of communities $m$, hidden dimensions $d$, training epochs $T$, PCN encoder $f_\theta$, MLP encoder $f_\xi$, projection head $g_\varphi$\\
	\textbf{Output}: node representations $\mathbf{H}$ \\
        \textbf{Steps}:
	\begin{algorithmic}[1] %[1] enables line numbers
		\STATE Initialize parameters $\theta$, $\xi$ and $\varphi$;
		\STATE $\mathcal{P} \leftarrow$ construct a partition of $G$ by METIS;
		\STATE Construct adjacency matrix based on $\mathcal{P}$;
		\FOR{$t=1$ to $T$}
		\STATE $\mathbf{c} \leftarrow$ generate community representations via Eq. \ref{equ-pcn};
		\STATE $\mathbf{v} \leftarrow$ generate node representations via $f_\xi(\mathbf{X})$;
		\STATE $\mathcal{L}_\mathcal{P} \leftarrow$ calculate the overall loss via Eq. \ref{loss};
		\STATE Update model parameters $\theta$ and $\varphi$ via $\mathcal{L}_\mathcal{P}$;
		\STATE Update model parameters $\xi$ via EMA;
        \ENDFOR
		\STATE $\mathbf{H} \leftarrow$ generate node representation via $\sigma(\hat{\mathbf{A}}^k\mathbf{X}\mathbf{W})$;
		\STATE \textbf{return} $\mathbf{H}$
	\end{algorithmic}
\end{algorithm}

\subsection{Properties of Overall Loss} \label{section-theorem}
We provide theoretical evidence to prove that our method can capture essential structural information for downstream tasks, and all detailed proofs can be found in Appendix~\ref{app-proof}. First, we demonstrate that our method can capture structure information of one-hop neighborhood, which is beneficial for heterophilic graphs, as nodes from the same semantic class tend to share similar neighborhood contexts~\citep{graphacl}.
    
    \begin{theorem} \label{theorem:one-hop}
		Let $\mathcal{N}(v)$ denote the neighbors of $v$. Minimizing the community contrastive loss $\mathcal{L}_\mathcal{P}$ will try to minimize the alignment loss between one-hop neighbors, which is defined as
		
		\begin{equation}
			\mathcal{L}_{alig} = \frac{1}{n} \sum_{v_j \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_j) \rvert} \sum_{v_i \in \mathcal{N}(v_j)} \lVert \mathbf{v}_j - g_\varphi(\mathbf{v}_i) \rVert_2  
		\end{equation}
		
		\noindent where $\mathbf{v}_j = f_\theta(G)[v_j]$ and $\mathbf{v}_i = f_\theta(G^0)[v_i]$.
    \end{theorem}

Theorem~\ref{theorem:one-hop} shows that our method can capture the one-hop neighborhood context from the central node representations, which captures the local structure information of the graph. Next, we prove that our method can capture higher-level structure information, i.e., multi-hop neighborhood dependencies.

\begin{theorem} \label{theorem:high-order}
    Suppose the contrastive loss $\mathcal{L}_\mathcal{P}$ and $\mathcal{L}_{gc}$ are L-Lipschitz continuous. Then, $\mathcal{L}_\mathcal{P}$ can be approximated by $\mathcal{L}_{gc}$ under the graph homophily assumption 
    
    \begin{equation}
        \lVert \mathcal{L}_{gc} - \mathcal{L}_\mathcal{P} \rVert \leq L \lVert \hat{\mathbf{A}}^k - \mathbf{P}\hat{\mathbf{P}}^\mathrm{T} \rVert \lVert \mathbf{X} \rVert \lVert \mathbf{W}_{\text{all}} \rVert
    \end{equation}
    
    \noindent where $L$ is the Lipschitz constant and $\mathbf{W}_{\text{all}}$ denotes the learnable parameters of the GCL model.
\end{theorem}

Theorem~\ref{theorem:high-order} shows that our loss in Eq.~\ref{loss} can estimate the original contrastive loss $\mathcal{L}_{gc}$ of the diffusion matrix. Minimizing $\lVert \hat{\mathbf{A}}^k - \mathbf{P}\hat{\mathbf{P}}^\mathrm{T} \rVert$ is the minimum cut problem in graph theory, which is consistent with the goal of graph partition methods~\citep{cut}. Moreover, we also provide formal guarantees on the generalizability for downstream tasks. 

\begin{theorem} \label{theorem:tasks}
    Let $f^*_\theta$ be the optimal model parameters learned by the global minimizer of $\mathcal{L}_\mathcal{P}$, and $y(v_i)$ denote the label of $v_i$. Then, there exists a linear classification function $\hat{y}: \mathcal{V} \rightarrow \mathbb{R}^c$ such that the error upper bound is
    \begin{equation}
        \mathbb{E}_{v_i \in \mathcal{V}} \lVert y(v_i) - \hat{y} [f^*_\theta(v_i)] \rVert^2_2 \leq \frac{1 - \phi_\mathcal{P}}{\lambda_{d+1}}
    \end{equation}
\noindent where $\lambda_{d+1}$ is the $d+1$ smallest eigenvalue of diffusion matrix $\hat{\mathbf{A}^k}$ and $\phi_\mathcal{P}$ is partition homophily ratio, defined as
    \begin{equation}
        \mathcal{\phi}_\mathcal{P} = \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert ^2} \sum_{v_j,v_k \in P_i} \mathbbm{1}[y(v_j) = y(v_k)]
    \end{equation}
\noindent where $\mathbbm{1}[\cdot]$ is the indicator function. 
\end{theorem}

Theorem~\ref{theorem:tasks} shows that the classification error of the learned representations is bounded by the partition homophily ratio $\phi_\mathcal{P}$ and the embedding dimension $d$, which means that we can dynamically adjust the number of communities and $d$ to improve performance. It is worth noting that the nodes within the same community in a homophilic graph tend to be of the same class $(\phi_\mathcal{P} \rightarrow 1)$, while the opposite may be true in a heterophilic graph $(\phi_\mathcal{P} \rightarrow 0)$. This theorem indicates that heterophilic graphs might require a larger dimension to ensure model performance.

\begin{table*}[t!]  %The Best and Second Best Performance are \textbf{Bold} and \underline{Underline}, Respectively
	\centering
		%\small
        \caption{Node Classification Accuracy ($\%$) on Homophilic Graphs (\textbf{Bold}: Best, \underline{Underline}: Second Best). }
		\setlength{\tabcolsep}{4.5pt}
		\begin{tabular}{lcccccccc}
			\toprule[1pt]
			Methods & Cora & CiteSeer & PubMed & Wiki-CS  & Computer & Photo & CS & Physics \\
			\midrule[0.5pt]
			GCN & 81.5$\pm$0.2 & 70.3$\pm$0.4 & 79.0$\pm$0.5 & 76.9$\pm$0.4 & 86.3$\pm0.5$ & 92.2$\pm$0.2 & 93.1$\pm$0.2 & 95.4$\pm$0.2 \\
			GAT & 83.0$\pm$0.7 & 72.5$\pm$0.3 & 79.0$\pm$0.3 & 77.4$\pm$0.2 & 87.1$\pm0.4$ & 92.7$\pm$0.5 & 92.4$\pm$0.3 & 95.3$\pm$0.2 \\	
			SGC & 81.0$\pm$0.1 & 71.9$\pm$0.1 & 78.9$\pm$0.0 & 79.4$\pm$0.3 & 85.9$\pm$0.7 & 92.3$\pm$0.2 & 92.6$\pm$0.2 & 95.1$\pm$0.2 \\
				NodeFormer & 82.7$\pm$0.8 & 72.4$\pm$1.2 & 79.6$\pm$0.9 & \underline{80.4$\pm$0.4} & 86.6$\pm$0.3 & 92.9$\pm$0.1 & 93.5$\pm$0.2 &\underline{95.9$\pm$0.2} \\
			\midrule[0.5pt]
			DGI & 82.3$\pm$0.5 & 71.5$\pm$0.7 & 79.4$\pm$0.3 & 75.7$\pm$0.2 & 84.1$\pm0.4$ & 91.6$\pm$0.2 & 92.0$\pm$0.4 & 94.6$\pm$0.4 \\
			MVGRL & 82.9$\pm$0.7 & 72.8$\pm$0.4 & 80.1$\pm$0.4 & 77.9$\pm$0.3 & 87.1$\pm0.3$ & 92.0$\pm$0.1 & 91.9$\pm$0.2 & 95.5$\pm$0.5 \\						
			gCooL & 82.1$\pm$0.4 & 71.4$\pm$0.5 & \underline{82.1$\pm$0.3} & 78.7$\pm$0.1 & 88.9$\pm0.1$ & 92.8$\pm$0.2 & 92.8$\pm$0.1 & 95.1$\pm$0.1 \\
			CSGCL & 81.2$\pm$0.2 & 71.1$\pm$0.1 & 81.2$\pm$0.3 & 78.6$\pm$0.1 & 90.2$\pm0.2$ & 93.2$\pm$0.4 & \underline{93.6$\pm0.1$} & 95.3$\pm$0.2 \\
			\midrule[0.5pt]
			BGRL & 82.7$\pm$0.6 & 71.6$\pm$0.4 & 79.9$\pm$0.4 & 80.0$\pm$0.1 & 89.7$\pm0.4$ & 92.9$\pm$0.3 & 93.3$\pm$0.4 & 95.6$\pm$0.4 \\
			SUGRL & 83.4$\pm$0.5 & \underline{73.0$\pm$0.4} & 81.9$\pm$0.3 & 79.8$\pm$0.3 & 88.9$\pm0.2$ & 93.1$\pm$0.2 & 92.7$\pm$0.1 & 94.1$\pm$0.4 \\
			GGD & \underline{83.9$\pm$0.4} & \underline{73.0$\pm$0.6} & 81.3$\pm$0.8 & 78.7$\pm$0.6 & 90.1$\pm0.9$ & 92.5$\pm$0.6 & 92.4$\pm$0.2 & 95.0$\pm$0.2 \\
			SGCL & 83.0$\pm$0.2 & 72.6$\pm$0.3 & 81.3$\pm$0.3 & 79.9$\pm$0.5 & \underline{90.7$\pm0.3$} & \underline{93.4$\pm$0.3} & 93.3$\pm$0.2 & 95.7$\pm$0.1 \\
			StructComp & 82.3$\pm$0.8 & 71.6$\pm$1.0 & 78.3$\pm$2.5 & 80.1$\pm$0.1 & 89.1$\pm1.4$ & 92.7$\pm$1.0 & 93.1$\pm$0.4 & 95.0$\pm$0.1 \\
			\midrule[0.5pt]											
			Ours & \textbf{84.9$\pm$0.5} & \textbf{74.2$\pm$0.9} & \textbf{82.8$\pm$0.7} & \textbf{81.1$\pm$0.3} & \textbf{90.9$\pm$0.1}  & \textbf{93.8$\pm$0.2} & \textbf{94.3$\pm$0.3} & \textbf{96.3$\pm$0.1} \\	
			\bottomrule[1pt]
		\end{tabular}		
	\label{tab:nc-homo}
\end{table*}

\section{Experiments}
In this section, we conduct extensive experiments to evaluate the effectiveness and scalability of our method.
\subsection{Experimental Settings}
\textbf{Datasets.} 
We choose fourteen benchmark datasets for experiments, including: (a) eight homophilic graph datasets Cora, CiteSeer, PubMed, Wiki-CS, Amazon Computer, Amazon Photo, Coauthor CS and Coauthor Physics~\citep{gcn,amazon-data,wiki-data}, (b) four heterophilic graph datasets Texas, Wisconsin, Cornell and Actor~\citep{cornell-data}, and (c) two large-scale homophilic datasets Ogbn-Arxiv and Ogbn-Products~\citep{ogbn-data}. 

\noindent \textbf{Splitting Strategies.} For the Cora, CiteSeer and PubMed datasets, we adopt the public splits, with each class having 20 nodes for training, another fixed 500 nodes and 1000 nodes for validation and testing, respectively~\citep{gcn, dgi}. For the other five homophilic datasets, we adopt the 10\%/10\%/80\% training/validation/testing splits following previous works~\citep{greet}. For heterophilic and large-scale graphs, we adopt the splits that come with the datasets~\citep{ggd, structcomp}. The details of the datasets are summarized in Appendix~\ref{app-data}.

%We choose fourteen open benchmark datasets for experiments, including eight homophilic graph datasets (\textit{i.e.,} Cora, CiteSeer, PubMed, Wiki-CS, Amazon Computer, Amazon Photo, Coauthor CS and Coauthor Physics~\citep{gcn,amazon-data,wiki-data}), four heterophilic graph datasets (\textit{i.e.,} Texas, Wisconsin, Chameleon and Actor~\citep{cornell-data}), and two large-scale datasets (\textit{i.e.,} Ogbn-Arxiv and Ogbn-Products~\citep{ogbn-data}). For all datasets, we adopt the public splits. The detail of datasets are summarized in Appendix~\ref{app-data}. 

\noindent \textbf{Comparison Methods.} We compare our method with the following four categories of methods: 
\begin{itemize}
	\item Semi-supervised learning methods: GCN~\citep{gcn}, GAT~\citep{gat}, SGC~\citep{sgc} and NodeFormer~\citep{nodeformer}.
	\item Classical GCL methods: DGI~\citep{dgi}, MVGRL~\citep{mvgrl}, gCooL~\citep{gcool} and CSGCL~\citep{csgcl}.
	\item Efficiency-oriented GCL methods: BGRL~\citep{bgrl}, SUGRL~\citep{sugrl}, GGD~\citep{ggd}, SGCL~\citep{sgcl} and StructComp~\citep{structcomp}. 
	\item Heterophily-aware GCL methods: HGRL~\citep{hgrl}, DSSL~\citep{dssl}, SP-GCL~\citep{sp-gcl}, GraphACL~\citep{graphacl}, GREET~\citep{greet} and HEATS~\citep{heats}.
\end{itemize}

\noindent \textbf{Metrics.} We adopt node classification and node clustering tasks to evaluate the quality of node representations. For node classification, we train a logistic regression classifier on the frozen representations and report the test accuracy. For node clustering, we perform $K$-Means clustering on the representations, and report the Normalized Mutual Information (NMI) and Adjusted Rand Index (ARI) scores. We verify the scalability of our method using GPU memory usage and training time per epoch.

\noindent \textbf{Implementation Details.} We randomly initialize all weight parameters of the model and use the Adam optimizer to train the encoder. All experiments are implemented in PyTorch and conducted on a machine equipped with an Intel i9 13900KF CPU, 128GB RAM, and NVIDIA RTX 4090 GPU. Each experiment is repeated 20 times, and the average performance and standard deviation are reported here. The detailed settings of the model and specific hyper-parameters can be found in Appendix~\ref{app-para}. 


\subsection{Experimental Results}
\textbf{Node Classification. } The node classification results on homophilic and heterophilic graphs are reported in Table~\ref{tab:nc-homo} and Table~\ref{tab:nc-hete}, respectively. These results tell us that: (a) Our method achieves the best performance across all datasets, especially on heterophilic graphs where it outperforms heterophily-aware GCL methods. (b) Unsupervised GCL methods demonstrate significant competitiveness compared to semi-supervised methods that leverage label information during the training process. (c) Community-based GCL methods, such as gCooL and CSGCL, also achieve impressive results, confirming the utility of community information in enhancing contrastive learning. These empirical findings support the theoretical analysis in Section~\ref{section-theorem} and validate the effectiveness of our method. 

\begin{table}
    %\small
    \caption{Node Classification Accuracy $(\%)$ on Heterophilic Graphs (\textbf{Bold}: Best, \underline{Underline}: Second Best).}
    \setlength{\tabcolsep}{4pt}
    \centering
        \begin{tabular}{lcccc}
			\toprule[1pt]
			Methods & Texas & Wisconsin & Cornell & Actor \\
			\midrule[0.5pt]
			HGRL & 61.8$\pm$0.7 & 63.9$\pm$0.6 & 51.8$\pm$1.0 & 28.0$\pm$0.3 \\
			DSSL & 62.1$\pm$1.5 & 62.3$\pm$0.6 & 53.2$\pm$1.3 & 28.2$\pm$0.3 \\
			SP-GCL & 59.8$\pm$1.3 & 60.1$\pm$0.4 & 52.3$\pm$1.2 & 28.9$\pm$0.7 \\
			GraphACL & 71.1$\pm$0.3 & 69.2$\pm$0.4 & 72.7$\pm$3.7 & 30.0$\pm$0.1 \\
			GREET & \underline{84.6$\pm$4.2} & \underline{80.9$\pm$5.2} & \underline{72.9$\pm$1.7} & \underline{36.1$\pm$1.2} \\
			HEATS & 64.9$\pm$4.7 & 65.9$\pm$5.6 & 67.0$\pm$5.9 & 30.1$\pm$1.2 \\
			\midrule[0.5pt]
			Ours & \textbf{85.4$\pm$5.6} & \textbf{83.7$\pm$3.2} & \textbf{74.6$\pm$5.0} & \textbf{37.4$\pm$1.3} \\
			\bottomrule[1pt]
		\end{tabular}		
	\label{tab:nc-hete}
\end{table}

\textbf{Node Clustering. } We select the methods with clustering effects in their original papers for comparison and report the results in Table~\ref{tab:cluster}. The results tell us that: (a) Our method performs better than other methods due to the effective utilization of community structure information. (b) Our method outperforms gCooL and CSGCL which only use the intra-community information. This suggests that leveraging both intra-community and inter-community relationships is crucial for node representation. These results not only validate the effectiveness of our method but also demonstrate its adaptability to other downstream tasks.

\begin{table}
	%\small
	\setlength{\tabcolsep}{4.1pt}
        \caption{Node Clustering Results Measured by NMI $(\%)$ and ARI $(\%)$. $K$-Means Represents Clustering Directly on the Original Node Features.}
        \centering
        \begin{tabular}{lcc|cc|cc}
			\toprule[1pt]
			\multirow{2}{*}{Methods} &\multicolumn{2}{c}{Photo} &\multicolumn{2}{c}{CS} &\multicolumn{2}{c}{Physics}  \\
			\cmidrule[0.5pt](lr){2-7} 
			&NMI &ARI &NMI &ARI &NMI &ARI\\
            \midrule[0.5pt]
                $K$-Means &25.8 &14.5 &60.1 &40.4 &48.9 & 27.6 \\
                gCooL &56.6 &43.1 &75.3 &62.1 &65.2 &57.8 \\
                CSGCL &58.8 &46.3 & \underline{77.1} & \underline{63.6} &66.1 &58.3\\
                SUGRL & \underline{63.6} &\underline{52.8} &76.6 &62.5 &65.7 &60.4 \\
                GREET & 52.3 & 37.1 &75.8 &62.1 &\underline{66.4} &\underline{63.6}\\
                GraphACL &61.1 &47.9 &74.7 & 62.8 &64.2 &62.5 \\
                \midrule[0.5pt]
                Ours &\textbf{64.8} &\textbf{54.4} &\textbf{77.4} &\textbf{63.9} &\textbf{69.3} &\textbf{66.1}\\
			\bottomrule[1pt]
		\end{tabular}		
	\label{tab:cluster}
\end{table}

\noindent \textbf{Scalability Evaluation. } We compare the node classification accuracy and training consumption of our method with efficiency-oriented GCL methods, and report the results on large-scale datasets in Table~\ref{tab:nc-large}. For fairness, we omit the memory usage of methods trained in a mini-batch manner. These results tell us that: (a) Our method achieves the best accuracy while simultaneously reducing time consumption and memory usage. (b) The efficiency improvement of our method becomes more significant as the dataset scale increases. Notably, on Ogbn-Products, where many methods must adopt mini-batch training, our method can be trained in full-batch mode, which is attributed to our method's significant reduction in message passing and contrastive loss calculation consumption. These results validate the scalability of our method.


\begin{table} % 
	%\small
	\caption{Scalability Evaluation on Node Classification. \textquotesingle Acc\textquotesingle: Accuracy ($\%$), ~\textquotesingle Time\textquotesingle: Training Time per Epoch, ~\textquotesingle Mem\textquotesingle: GPU Memory (GB), ~\textquotesingle -\textquotesingle: Training in Mini-batch.}    
	\setlength{\tabcolsep}{4.1pt}
	\centering
		\begin{tabular}{lccc|ccc}
			\toprule[1pt]
			\multirow{2}{*}{Methods} &\multicolumn{3}{c}{Ogbn-Arxiv} &\multicolumn{3}{c}{Ogbn-Products}  \\
			\cmidrule[0.5pt](lr){2-7} 
			&Acc &Time(s) &Mem &Acc &Time(m) &Mem\\
			\midrule[0.5pt]
			BGRL &71.6 & 0.29 &10.7 &64.0 & 53.3  & -		\\
			SUGRL &67.8 &\textbf{0.05} &\textbf{2.6} &72.9 &1.5 & 23.5		\\
			GGD &71.6 &0.95 &14.3 &75.7 & 12.7 & -		\\
			SGCL &71.0 & 0.09 & 5.1  &\underline{76.0} & 1.9 & - 		\\
			S.Comp &\underline{71.7} &\textbf{0.05} &\underline{3.4} &75.7 &\underline{0.06} &\underline{12.0}		\\
			\midrule[0.5pt]
			Ours &\textbf{71.9} &\underline{0.06} & 5.4 &\textbf{76.8} &\textbf{0.001} &\textbf{6.3}		\\
			\bottomrule[1pt]
		\end{tabular}		
	\label{tab:nc-large}
\end{table}


\noindent \textbf{Impacts of Parameters. } We explore the impacts of partition rate $\beta$ and node embedding dimension $d$ in Figure~\ref{fig-hidden}, and report the effect of coefficient $\alpha$ in Appendix~\ref{app-alpha}. These results tell us that: (a) When $\beta$ becomes very large, the accuracy tends to flatten or even decrease, indicating that setting $\beta$ to a smaller value is a practical choice. Moreover, a smaller $\beta$ requires less GPU memory. (b) A larger $d$ can generally improve node classification accuracy, especially on heterophilic graphs. This observation supports Theorem~\ref{theorem:tasks}, which posits that a larger dimension can effectively reduce the upper bound of the classification error. (c) On homophilic graphs, a smaller $\alpha$ is required to emphasize the local information of the graph, while on heterophilic graphs, increasing the value of $\alpha$ is needed to focus on global information.


\begin{figure}
	\centering
		\centerline{\includegraphics[width=\columnwidth]{fig-hidden.pdf}}
		\caption{Impacts of Hyperparameters $\beta$ and $d$ (\textquotesingle$\beta = 1\%$\textquotesingle: The Number of Communities is Fixed to $0.01 \times n$).}
		\label{fig-hidden}
\end{figure}

\noindent \textbf{Impacts of Partition Algorithms.} We evaluate the impacts of different graph partition methods, including Graph Cut (GC) \citep{gs}, Louvain \citep{louvain}, and Structural Entropy (SE) \citep{se_toit}, on node classification in Figure~\ref{fig-partition}. The results tell us that: (a) SE performs better across all datasets because it does not require manual specification of the number of communities, which avoids overfitting to some extent. This indicates that a more powerful partition method leads to a larger accuracy improvement. (b) Despite its simplicity, METIS also demonstrates good performance on node classification. Thus, we adopt METIS in our method. The graph partition time consumption for all datasets is summarized in Appendix~\ref{app-time}, showing that it requires only a few seconds on large-scale datasets.




\noindent \textbf{Ablation Study.} We conduct an ablation study, as shown in Table~\ref{tab:ablation-study}. The results tell us that: (a) All components contribute to the performance improvement of our method. (b) The message passing mechanism is critical for improving the accuracy of node classification on homophilic graphs. (c) For node classification, the impact of $\mathcal{L}_{cr}$ is greater than that of $\mathcal{L}_{cn}$, which indicates that intra-community information is more useful than inter-community information. (d) Even without the graph convolutional operator, our method still outperforms semi-supervised MLP and remains competitive with GCN, which verifies the rationality of our method. These results clarify the contributions of the different components of our method. 

\begin{table}[t!]
    %\small
    \caption{Ablation Study on Node Classification. \textbf{A1}: Removing Graph Convolutional Operator during Testing Phase, \textbf{A2}: Removing Intra-community Reconstruction Loss $\mathcal{L}_{cr}$, \textbf{A3}: Removing Inter-community Neighborhood Loss $\mathcal{L}_{cn}$, ~\textquotesingle - \textquotesingle: Without using Graph Convolutional Operator, that is, $k=0$.}    
    \setlength{\tabcolsep}{3pt}
    \centering
		\begin{tabular}{lcccc}
			\toprule[1pt]
			Baselines &CiteSeer &PubMed &Photo &Actor \\
			\midrule[0.5pt]
			MLP & 56.1$\pm$0.4 & 71.4$\pm$0.1 & 78.5$\pm$0.1 & 35.6$\pm$0.9\\
			GCN & 70.3$\pm$0.4 & 79.0$\pm$0.5 & 92.4$\pm$0.2 & 30.8$\pm$0.7\\			
			\midrule[0.5pt]
			\textbf{A1} (w/o GC) & 69.5$\pm$0.2 & 76.2$\pm$0.8 & 88.3$\pm$0.2 & -\\			
			\textbf{A2} (w/o $\mathcal{L}_{cr}$) & 70.8$\pm$0.8 & 45.7$\pm$1.7 & 90.2$\pm$0.5 & 34.5$\pm$1.2\\			
			\textbf{A3} (w/o $\mathcal{L}_{cn}$) & \underline{72.7$\pm$0.2} &\underline{80.9$\pm$0.3} & \underline{93.3$\pm$0.1} & \underline{36.1$\pm$1.4}\\			
			\textbf{A1 \& A2} &  52.6$\pm$0.2 & 35.9$\pm$0.4 & 75.1$\pm$0.6 & -\\
			\textbf{A1 \& A3} &  68.9$\pm$0.2 & 74.6$\pm$0.4 & 88.4$\pm$0.1 & -\\
			\midrule[0.5pt]
			Ours & \textbf{74.2$\pm$0.9} & \textbf{82.8$\pm$0.7} & \textbf{93.8$\pm$0.2} & \textbf{37.4$\pm$1.3}\\
			\bottomrule[1pt]
		\end{tabular}		
	\label{tab:ablation-study}
\end{table}

\begin{figure}
	\centering
		\centerline{\includegraphics[width=\columnwidth]{fig-partition.pdf}}
		\caption{Comparison of Different Partition Algorithms.}
		\label{fig-partition}
\end{figure}


\begin{figure*}[ht]
    \centering
    \begin{minipage}{0.33\textwidth}
        \centering
        \includegraphics[width=0.95\linewidth]{fig-cora.pdf}
    \end{minipage}
    \hfill
    \begin{minipage}{0.33\textwidth}
        \centering
        \includegraphics[width=0.95\linewidth]{fig-citeseer.pdf}
    \end{minipage}
    \hfill
    \begin{minipage}{0.33\textwidth}
        \centering
        \includegraphics[width=0.95\linewidth]{fig-actor.pdf}
    \end{minipage}
    \caption{The Pair-wise Similarity Distribution of Randomly Sampled Nodes, One-hop Neighbors and Multi-hop Neighbors.}
    \label{fig:visualization}
\end{figure*}

\noindent \textbf{Visualization.} We evaluate the effectiveness of our loss in capturing graph structural information by removing the GNN during the test phase, and report the pairwise cosine similarity of node representations for randomly sampled nodes, one-hop neighbors, and multi-hop neighbors in Figure~\ref{fig:visualization}. The results tell us that: (a) Our method increases the similarity of node representations for one-hop neighbors compared to random pairs, indicating effective preservation of local structural information. (b) Node representations of multi-hop neighbors maintain high similarity, demonstrating our method's ability to capture higher-level structural patterns without stacking multiple GNN layers. These results confirm that our loss function effectively captures essential graph structural information.  




\section{Related Works}
\textbf{Scalable Graph Neural Networks.}
 GNNs facilitate feature propagation between nodes through the message passing mechanism~\citep{gcn}. To improve the scalability of GNNs, GraphSAGE~\citep{sage} and Cluster-GCN~\citep{cluster-gcn} employ subgraph sampling techniques or mini-batch processing mode to train models. SGC~\citep{sgc} simplifies GNNs by removing the non-linear function of graph convolutional layers. Coarse-GNN~\citep{coarse-gnn} proposes to use a compressed graph for scalable training of GNNs. Other methods use global attention mechanisms with linear complexity to process large-scale graphs, such as NodeFormer~\citep{nodeformer}. However, these methods are not suitable for graph representation learning without task-specific labels.


\noindent\textbf{Graph Contrastive Learning.}
GCL has demonstrated excellent performance in graph representation learning tasks without labels~\citep{ncla, gcl-survey}. DGI~\citep{dgi} and MVGRL~\citep{mvgrl} maximize the mutual information between local and global embeddings to learn node representations. gCooL~\citep{gcool} and CSGCL~\citep{csgcl} improve node representations by introducing community structure to construct positive and negative samples. Recent works focus on the scalability of GCL~\citep{gcl-survey}. For example, BGRL~\citep{bgrl} and SGCL~\citep{sgcl} compute the contrastive loss without negative node pairs. SUGRL~\citep{sugrl} reduces the number of graphs that need to be processed by GCL. GGD~\citep{ggd} directly uses binary cross entropy loss to distinguish between positive and negative samples. StructComp~\citep{structcomp} conducts contrastive learning on the constructed compressed graph, significantly improving the scalability of GCL. However, it relies on a fixed graph coarsening process, leading to overly homogeneous center representations and loss of node-level information~\citep{cluster-former}. 

There are also some methods that explore the potential of GCL on heterophilic graphs~\citep{uai-heter}. HGRL captures distant neighbors to learn node representations \citep{hgrl}. DSSL decouples different neighborhood contexts of nodes \citep{dssl}. SP-GCL studies the concentration property of features on heterophilic graphs \citep{sp-gcl}. GraphACL captures one-hop neighborhood information and two-hop monophily similarity \citep{graphacl}. GREET learns node representations by distinguishing homophilic and heterophilic edges \citep{greet}. HEATS optimizes  positive sampling techniques for heterophilic graphs \citep{heats}. However, most of these methods still need to use mini-batch mode when processing large-scale graphs.


\section{Conclusion}
In this paper, we propose a simple and efficient method to improve the scalability of GCL by leveraging community structures. The core idea is to replace finer-grained node adjacency information with community-level structures. Specifically, we use a sparse partition matrix for message passing with linear time complexity, and design an efficient contrastive loss function that considers both intra-community and inter-community structural information. Theoretical analysis shows that our loss can effectively capture the basic and high-level structural information of the graph and has good generalization performance guarantees. Experimental results demonstrate that our method achieves the best performance while significantly reducing the time and memory overhead. We plan to explore a simple adaptive graph partition technique to improve the robustness of our method, addressing potential issues where community structures become unreliable due to noise in the graph. 





% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    This work was supported by the Key Program of National Natural Science Joint Foundation of China (U23A20298); Program of Yunnan Key Laboratory of Intelligent Systems and Computing (202405AV340009); Yunnan Fundamental Research Projects (202501AS070102, 202401AS070138); Scientific Research Fund Project of Yunnan Education Department (2025Y0061). For any correspondence, please refer to Liang Duan.
\end{acknowledgements}


% References
\bibliography{igcl}

\newpage

\onecolumn

\title{Improving Graph Contrastive Learning with Community Structure\\(Appendix)}
\maketitle

\appendix
\section{ Proof Details} \label{app-proof}
\subsection{The Overall Loss} \label{app-loss}
We provide a detailed derivation of the overall loss in Eq.~\ref{loss}.
\begin{proof}
    For the loss $\mathcal{L}_{cr}$ in Eq.~\ref{lcr}, we have
    \begin{equation}
        \begin{split}
            \mathcal{L}_{cr} &= \frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert g_\varphi(\mathbf{c}_i) - \mathbf{v}_j \rVert_2^2 \\
            &=\frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} (g_\varphi(\mathbf{c}_i) - \mathbf{v}_j)^\mathrm{T} (g_\varphi(\mathbf{c}_i) - \mathbf{v}_j)  \\
            &=\frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} (g_\varphi(\mathbf{c}_i)^\mathrm{T}g_\varphi(\mathbf{c}_i) + \mathbf{v}_j^\mathrm{T}\mathbf{v}_j - 2g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j)
        \end{split}
    \end{equation}
    Since vectors in contrastive losses are usually normalized, the terms $g_\varphi(\mathbf{c}_i)^\mathrm{T}g_\varphi(\mathbf{c}_i)$ and $\mathbf{v}_j^\mathrm{T}\mathbf{v}_j$ are constants. Thus, we have
    \begin{equation}\label{lcr11}
        \min \mathcal{L}_{cr} \Leftrightarrow \min -\frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j
    \end{equation}
    Similarly, minimizing $\mathcal{L}_{cn}$ in Eq.~\ref{lcn} can be written as 
    \begin{equation}\label{lcn11}
        \begin{split}
            \min \mathcal{L}_{cn} &\Leftrightarrow \min - \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert  \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \textbf{c}_i^\mathrm{T}\textbf{c}_k \\
            % &\Leftrightarrow \min - \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \textbf{c}_i^\mathrm{T} \textbf{c}_j
        \end{split}
    \end{equation}
    Minimizing $\mathcal{L}_{cur}$ in Eq.~\ref{lcu} can be written as
    \begin{equation}\label{lcu11}
        \min \mathcal{L}_{cur} \Leftrightarrow \min \frac{1}{m^2} \sum_{P_i \in \mathcal{P}} \sum_{P_t \in \mathcal{P}} \textbf{c}_i^\mathrm{T}\textbf{c}_t
    \end{equation}
    Combining Eq.~\ref{lcr11}, Eq.~\ref{lcn11} and Eq.~\ref{lcu11}, we have
    \begin{equation}
        \begin{split}
            \mathcal{L}_{\mathcal{P}} &= -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k - \frac{1}{m}\sum_{P_t \in \mathcal{P}} \textbf{c}_i^\mathrm{T}\textbf{c}_t \\
            &= -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k - \sum_{P_t \in \mathcal{P}} \frac{1}{m}\log \exp \textbf{c}_i^\mathrm{T}\textbf{c}_t  \\
        \end{split}
    \end{equation}
    According to Jensen’s inequality, we have
    \begin{equation}
        \begin{split}
            \mathcal{L}_{\mathcal{P}} &= -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k - \sum_{P_t \in \mathcal{P}} \frac{1}{m}\log \exp \textbf{c}_i^\mathrm{T}\textbf{c}_t \\
            % &\stackrel{\text{c}}{=} -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k - \sum_{P_t \in \mathcal{P}} \log \frac{ \exp \textbf{c}_i^\mathrm{T}\textbf{c}_t}{m} \\
            & \leq -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k - \log \sum_{P_t \in \mathcal{P}} \frac{\exp \textbf{c}_i^\mathrm{T}\textbf{c}_t}{m}  \\
            &\leq -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k - \log \sum_{P_t \in \mathcal{P}} \exp \textbf{c}_i^\mathrm{T}\textbf{c}_t\\
            &\leq -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k - \log (\exp ( \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} {g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j)} + \sum_{P_t \in \mathcal{P}} \exp \textbf{c}_i^\mathrm{T}\textbf{c}_t)\\
             &=  -\frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \log \frac{ \exp (\frac{1}{\lvert P_i \rvert}  \sum_{v_j \in P_i} g_\phi(\textbf{c}_i)^\mathrm{T}\mathbf{v}_j + \textbf{c}_i^\mathrm{T}\textbf{c}_k)}{\exp {(\frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} g_\varphi(\mathbf{c}_i)^\mathrm{T}\mathbf{v}_j)} +\sum_{P_t \in \mathcal{P}} \exp \textbf{c}_i^\mathrm{T}\textbf{c}_t}
        \end{split}
    \end{equation}
Thus, we obtain an upper bound of the combined loss, as shown in Eq.~\ref{loss}.
\end{proof}

\addtocounter{theorem}{-3}
\subsection{Proof of Theorem \ref{theorem:one-hop}}
	\begin{theorem}
		Let $\mathcal{N}(v)$ denote the neighbors of $v$. Minimizing the community contrastive loss $\mathcal{L}_\mathcal{P}$ will try to minimize the alignment loss between one-hop neighbors, which is defined as
		
		\begin{equation}
			\mathcal{L}_{alig} = \frac{1}{n} \sum_{v_j \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_j) \rvert} \sum_{v_i \in \mathcal{N}(v_j)} \lVert \mathbf{v}_j - g_\varphi(\mathbf{v}_i) \rVert_2  
		\end{equation}
		
		\noindent where $\mathbf{v}_j = f_\theta(G)[v_j]$ and $\mathbf{v}_i = f_\theta(G^0)[v_i]$.
	\end{theorem}

\begin{proof}
    Let $S(v_i)$ represent the set of one-hop neighbor nodes of $v_i$ in the same community and $M(v_i) = \mathcal{N}(v_i) \setminus S(v_i)$ represent the set of one-hop neighbor nodes of $v_i$ not in the same community. Thus, we have

    \begin{equation}
        \begin{split}
            \mathcal{L}_{cr} &= \frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert \mathbf{c}_i - \mathbf{v}_j\rVert^2_2  \\
            &\stackrel{\text{c}}{=} \frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert \sum_{v \in P_i} \mathbf{v} - \mathbf{v}_j\rVert^2_2  \\
            &\Rightarrow -\frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} (\mathbf{v}_j^\mathrm{T} \sum_{v \in S(v_j)} \mathbf{v}) \\
            &\stackrel{\text{c}}{=} \frac{1}{m}\sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert \mathbf{v}_j - \sum_{v \in S(v_j)} \mathbf{v} \rVert_2
        \end{split}
    \end{equation}
    For the loss $\mathcal{L}_{cn}$ in Eq.~\ref{lcn}, we have
    \begin{equation}
        \begin{split}
            \mathcal{L}_{cn} &= \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \lVert \mathbf{c}_i - \mathbf{c}_k \rVert_2^2 \\
            &\stackrel{\text{c}}{=} \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \lVert \sum_{v_j \in P_i}\mathbf{v}_j - \sum_{v \in P_k} \mathbf{v} \rVert_2^2 \\
            &\Rightarrow \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert \mathbf{v}_j - \sum_{u \in M(v_j)} \mathbf{u} \rVert_2 \\
        \end{split}
    \end{equation}
    Combining the above two equations, we have
    \begin{equation}
        \begin{split}
            \mathcal{L}_{cr} + \mathcal{L}_{cn} &\geq \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert 2\mathbf{v}_j - (\sum_{u \in M(v_j)} \mathbf{u} + \sum_{v \in S(v_j)} \mathbf{v}) \rVert_2 \\
            &\stackrel{\text{c}}{=} \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert 2\mathbf{v}_j - \sum_{v \in \mathcal{N}(v_j)} \mathbf{v} \rVert_2 \\
            &\stackrel{\text{c}}{=} \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \lVert \mathbf{v}_j - \sum_{v \in \mathcal{N}(v_j)} \mathbf{v} \rVert_2 \\
            &\stackrel{\text{c}}{=} \frac{1}{n} \sum_{v_j \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_j) \rvert} \sum_{v_i \in \mathcal{N}(v_j)} \lVert \mathbf{v}_j -  g_\varphi(\mathbf{v}_i) \rVert_2 
        \end{split}
    \end{equation}
    The last equation holds since it is consistent with the goal of $\mathcal{L}_G$, which is to minimize the distance between neighbor representations.

\end{proof}



\subsection{Proof of Theorem \ref{theorem:high-order}}
\begin{theorem} 
    Suppose the contrastive losses $\mathcal{L}_\mathcal{P}$ and $\mathcal{L}_{gc}$ are $L$-Lipschitz continuous. Then, $\mathcal{L}_\mathcal{P}$ can be approximated by $\mathcal{L}_{gc}$ under the graph homophily assumption:
    
    \begin{equation}
        \lVert \mathcal{L}_{gc} - \mathcal{L}_\mathcal{P} \rVert \leq L \lVert \hat{\mathbf{A}}^k - \mathbf{P}\hat{\mathbf{P}}^\mathrm{T} \rVert \lVert \mathbf{X} \rVert \lVert \mathbf{W}_{\text{m}} \rVert
    \end{equation}
    
    \noindent where $L$ is the Lipschitz constant and $\mathbf{W}_{\text{m}}$ denotes the model parameters in the GCL framework.
\end{theorem}

\begin{proof}
    Let the computational process of contrastive learning be represented as a function $l(\cdot)$. Then, we have
    \begin{equation}
        \begin{split}
            \lVert \mathcal{L}_{gc} - \mathcal{L}_\mathcal{P} \rVert &= \lvert l(\hat{\mathbf{A}}^k) - l(\mathbf{P}\hat{\mathbf{P}}^\mathrm{T}) \rvert \\
            &\leq L \lVert \hat{\mathbf{A}}^k\mathbf{X}\mathbf{W}_{\text{m}} - \mathbf{P}\hat{\mathbf{P}}^\mathrm{T}\mathbf{X}\mathbf{W}_{\text{m}} \rVert \\
            & = L \lVert (\hat{\mathbf{A}}^k - \mathbf{P}\hat{\mathbf{P}}^\mathrm{T})\mathbf{X}\mathbf{W}_{\text{m}} \rVert\\
            &\leq L \lVert \hat{\mathbf{A}}^k - \mathbf{P}\hat{\mathbf{P}}^\mathrm{T} \rVert \lVert \mathbf{X} \rVert \lVert \mathbf{W}_{\text{m}} \rVert
        \end{split}
    \end{equation}
    Intuitively, our loss can be viewed as standard contrastive loss performed on $\mathbf{P}\hat{\mathbf{P}}^\mathrm{T}$.
\end{proof}



\subsection{Proof of Theorem \ref{theorem:tasks}}
To prove Theorem~\ref{theorem:tasks}, we first introduce a lemma, which provides the following theoretical guarantees for the model learned using spectral contrastive loss~\citep{pscl}.

\begin{lemma} \label{lemma}
    Let $f^*$ be the minimizer of the spectral contrastive loss $\mathcal{L}_{scl} = -2 \sum_{x, x^\prime} w_{xx^\prime} \cdot f(x)^\mathrm{T}f(x^\prime) + \sum_{x, x^\prime} w_x w_{x^\prime} \cdot (f(x)^\mathrm{T}f(x^\prime))^2$, where $w_{xx^\prime}$ is the probability of a random positive pair being $(x,x^\prime)$ and $w_x$ is the probability of a randomly selected data point being $x$. Then, we have
    \begin{equation}
        \mathbb{E}_{v_i \in \mathcal{V}} \lVert y(v_i) - \hat{y} [f^*(v_i)] \rVert^2_2 \leq \frac{1 - \phi_G}{\lambda_{d+1}}
    \end{equation}
    \noindent where $\lambda_{d+1}$ is the $(d+1)$-th smallest eigenvalue of the normalized matrix $\hat{\mathbf{A}}$ and $\phi_G$ is the graph homophily ratio, defined as
    \begin{equation}
        \phi_G = \frac{1}{n} \sum_{v_i \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_i) \rvert} \sum_{v_j \in \mathcal{N}(v_i)} \mathbbm{1}[y(v_i) = y(v_j)]
    \end{equation}
\end{lemma}


According to Lemma~\ref{lemma}, we only need to prove that our loss in Eq.~\ref{loss} can be expressed as a spectral contrastive loss to complete the proof.

\begin{theorem}
    Let $f^*_\theta$ be the optimal model parameters obtained by the global minimizer of $\mathcal{L}_\mathcal{P}$ and $y(v)$ denote the label of $v$. Then, there exists a linear classification function $\hat{y}: \mathcal{V} \rightarrow \mathbb{R}^c$ such that the error upper bound is
    \begin{equation}
        \mathbb{E}_{v \in \mathcal{V}} \lVert y(v) - \hat{y} [f^*_\theta(v)] \rVert^2_2 \leq \frac{1 - \phi_\mathcal{P}}{\lambda_{d+1}}
    \end{equation}
\noindent where $\lambda_{d+1}$ is the $(d+1)$-th smallest eigenvalue of the diffusion matrix $\hat{\mathbf{A}}^k$ and $\phi_\mathcal{P}$ is the partition homophily ratio, defined as
    \begin{equation}
        \phi_\mathcal{P} = \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert P_i \rvert} \frac{1}{\lvert P_i \rvert} \sum_{v_j \in P_i} \sum_{v_k \in P_i} \mathbbm{1}[y(v_j) = y(v_k)]
    \end{equation}
\noindent where $\mathbbm{1}[\cdot]$ is the indicator function.
\end{theorem}

\begin{proof}
    Let $\mathcal{N}(v_i)$ denote the neighbors of node $v_i$ in the matrix $\mathbf{P}\hat{\mathbf{P}}^\mathrm{T}$. According to Theorem~\ref{theorem:one-hop}, the positive node pairs of our loss in Eq.~\ref{loss} can be expressed as the one-hop neighbors of each node. Thus, we have
    \begin{equation} \label{lp+}
        \begin{split}
            \mathcal{L}_{\mathcal{P}}^+ &= \frac{1}{n} \sum_{v_i \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_i) \rvert} \sum_{v_j \in \mathcal{N}(v_i)} \lVert \mathbf{v}_i - \mathbf{v}_j \rVert^2_2 \\
            &\stackrel{\text{c}}{=} -\frac{1}{n} \sum_{v_i \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_i) \rvert} \sum_{v_j \in \mathcal{N}(v_i)} \mathbf{v}_i^\mathrm{T}\mathbf{v}_j\\
            &\stackrel{\text{c}}{=} - \frac{1}{n} \sum_{v_i \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_i) \rvert} \sum_{v_j \in \mathcal{N}(v_i)} 2\cdot\mathbf{v}_i^\mathrm{T}\mathbf{v}_j\\
        \end{split}
    \end{equation} 
    For the negative node pairs in Eq.~\ref{loss}, we have
    \begin{equation}
        \begin{split}
            \mathcal{L}_\mathcal{P}^- &= \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \log \sum_{P_t \in \mathcal{P}} \exp(\mathbf{c}_i^\mathrm{T}\mathbf{c}_t)\\
            % &\geq \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{|P_i|} \sum_{v_j \in P_i} \log ( \sum_{P_t \in \mathcal{P}} \exp(\mathbf{c}_i^\mathrm{T}\mathbf{c}_t)) \\
            &= \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \log ( \sum_{P_t \in \mathcal{P}} \frac{\exp(\mathbf{c}_i^\mathrm{T}\mathbf{c}_t)}{m} m) \\
            &= \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \log  \sum_{P_t \in \mathcal{P}} \frac{\exp(\mathbf{c}_i^\mathrm{T}\mathbf{c}_t)}{m} + \log m\\
            &\geq \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \log  \sum_{P_t \in \mathcal{P}} \frac{\exp(\mathbf{c}_i^\mathrm{T}\mathbf{c}_t)}{m}\\
        \end{split}
    \end{equation}
    According to Jensen's inequality, we have
    \begin{equation} \label{lp-}
        \begin{split}
            \mathcal{L}_\mathcal{P}^-  &\geq \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \log  \sum_{P_t \in \mathcal{P}} \frac{\exp(\mathbf{c}_i^\mathrm{T}\mathbf{c}_t)}{m}\\
             &\geq \frac{1}{m} \sum_{P_i \in \mathcal{P}} \frac{1}{\lvert \mathcal{N}(P_i) \rvert} \sum_{P_k \in \mathcal{N}(P_i)} \ \frac{1}{m} \sum_{P_t \in \mathcal{P}} \mathbf{c}_i^\mathrm{T}\mathbf{c}_t\\
             &\Rightarrow \frac{1}{m^2} \sum_{P_i \in \mathcal{P}} \sum_{v_j \in P_i} \sum_{v_t \in \mathcal{V}} \mathbf{v}_j^\mathrm{T}\mathbf{v}_t\\
             &\stackrel{\text{c}}{=} \frac{1}{n^2} \sum_{v_i \in \mathcal{V}} \sum_{v_j \in \mathcal{V}} \mathbf{v}_i^\mathrm{T}\mathbf{v}_j\\
             &\stackrel{\text{c}}{=} \frac{1}{n^2} \sum_{v_i \in \mathcal{V}} \sum_{v_j \in \mathcal{V}} (\mathbf{v}_i^\mathrm{T}\mathbf{v}_j)^2
        \end{split}
    \end{equation}
    Combining Eq.~\ref{lp+} and Eq.~\ref{lp-}, we have
    \begin{equation}
        \mathcal{L}_{\mathcal{P}} \geq  - \frac{1}{n} \sum_{v_i \in \mathcal{V}} \frac{1}{\lvert \mathcal{N}(v_i) \rvert} \sum_{v_j \in \mathcal{N}(v_i)} 2\cdot\mathbf{v}_i^\mathrm{T}\mathbf{v}_j + \frac{1}{n} \frac{1}{n} \sum_{v_i \in \mathcal{V}} \sum_{v_j \in \mathcal{V}} (\mathbf{v}_i^\mathrm{T}\mathbf{v}_j)^2
    \end{equation}
    Note that $\mathbf{P}\hat{\mathbf{P}}^\mathrm{T}$ can be used as a low-rank approximation of the $k$-step diffusion matrix $\hat{\mathbf{A}}^k$, and that it connects nodes within the same community to each other while removing the connections between nodes in different communities. Hence, our loss can be expressed as a spectral contrastive loss on $\mathbf{P}\hat{\mathbf{P}}^\mathrm{T}$, and the claimed bound follows from Lemma~\ref{lemma}. Thus, we complete the proof of Theorem~\ref{theorem:tasks}.
\end{proof}


\begin{table*}[h!]
	\caption{Dataset Statistics.}
    \centering
		\begin{tabular}{lccccc}
			\toprule[1pt]
			Dataset & Nodes & Edges & Classes & Features  & Train / Val / Test  \\
			\midrule[0.75pt]
			Cora  & 2708 & 10556 & 7 & 1433  & 140 /500 / 1000 \\
			CiteSeer & 3327 & 9104 & 6 & 3703  & 120 / 500 / 1000 \\
			Pubmed & 19717 & 88648 & 3 & 500 &60 / 500 / 1000 \\
			    Wiki-CS & 11701 & 431206 & 10 & 300 &1170 / 1171 / 9360 \\
			Amazon-Computer & 13752 & 491722 & 10 & 767  & 1375 / 1376 / 11001 \\
			Amazon-Photo & 7650 & 238162 & 8 & 745  & 765 / 765 / 6120 \\
			Coauthor-CS & 18333 & 163788 & 15 & 6805  & 1833 / 1834 / 14666 \\
			Coauthor-Physics	& 34493 & 495924 & 5 & 841  & 3449 / 3450 / 27594 \\
                \midrule[0.75pt]
			Texas  & 183 & 309 & 5 & 1703 & 87 /59 / 37 \\
			Wisconsin  & 251 & 499 & 5 & 1703 & 120 /80 / 51 \\
			Cornell  & 183 & 295 & 5 & 1703 & 87 /59 /37 \\
			Actor  & 7600 & 29926 & 5 & 932  & 3634 /2432 / 1520 \\
                \midrule[0.75pt]
			Ogbn-Arxiv  & 169343 & 1166243 & 40 & 128 & 90941 /29799 / 48603 \\
			Ogbn-Products  & 2449029 & 61859140 & 47 & 100 & 196615 /39323 / 2213091 \\

			\bottomrule[1pt]
		\end{tabular}
		\label{tab:dataset}
\end{table*}
\section{Experimental Study} \label{app-exper}
\subsection{Details of Datasets} \label{app-data}
The statistics of all datasets are summarized in Table \ref{tab:dataset}.
\begin{itemize}
	\item \textbf{Cora, CiteSeer and Pubmed.} They are three citation network datasets, where nodes represent articles, edges represent citation relationships, features consist of bag-of-words representations of articles, labels correspond to the academic domains or metadata of the articles.
	\item \textbf{Wiki-CS.} It is a citation network extracted from Wikipedia dataset, where nodes represent articles about computer science, edges represent the hyperlinks between two articles, features consist of bag-of-words representations of articles, and labels are different fields of each article.
        \item \textbf{Amazon-Computer and Amazon-Photo.} They are two co-purchase networks from Amazon dataset, where nodes represent products, edges represent pairs of products often bought together, features consist of bag-of-words representations of product reviews, and labels are the category of products.
        \item \textbf{Coauthor-CS and Coauthor-Physics.} They are two co-authorship networks extracted from Microsoft Academic Graph in KDD Cup 2016 challenge, where nodes represent authors, edges represent co-authorship relationships, features consist of bag-of-words representations of article keywords, and labels are the research fields of authors.
        \item \textbf{Texas, Wisconsin and Cornell.} They are three subsets of WebKB dataset, where nodes represent web pages, edges represent hyperlinks, features are described by a word vector comprising keywords extracted from page content, labels represent the categories of the web pages.
% 	\item \textbf{Chameleon.} It is a Wikipedia network, where nodes represent web pages, edges represent the links between two
% pages, features contain the informative nouns on pages, and labels represent the average traffic of page.
	\item \textbf{Actor.} It is an actor co-occurrence network, where nodes represent actors, edges indicate that two actors co-occur in the same movie, features represent the keywords in the Wikipedia pages, and labels are the words of corresponding actors.
	\item \textbf{Ogbn-Arxiv and Ogbn-Products.} They are two datasets in the Open Graph Benchmark. Ogbn-Arxiv is a citation network, where nodes represent articles, edges represent citation relationships, features are obtained by averaging the embeddings of the words in each article's title and abstract, and labels represent the categories of the articles. Ogbn-Products is a co-purchase network, where nodes represent products, edges represent pairs of products often bought together, features consist of bag-of-words representations of product reviews, and labels are the category of the products.
\end{itemize}



\subsection{Parameters Settings} \label{app-para}
For fair comparison, we use the results provided by the authors in their original papers. For baselines not reported on specific datasets or those not utilizing standard public data splits, we carefully tune the hyper-parameters based on the authors' official code. We implement our method in PyTorch with the Adam optimizer. All experiments are conducted on a machine with an Intel 13900KF CPU, 128GB RAM and an RTX4090 GPU, running Windows 11. Each experiment is repeated 20 times.

We use one layer of PCN and one layer of MLP as the mapping head to implement our method. The learning rate $lr$ is chosen from $\{0.0001, 0.0005, 0.001, 0.002, 0.005, 0.1\}$. The number of training epochs $T$ is chosen from $\{25, 50, 75, 100, 150\}$. The order of the diffusion matrix $k$ is chosen from $\{0, 1, 2, 3\}$, except for Ogbn-Products, where it is set to 10 since the graph is more complex. The partition rate $\beta$, temperature parameter $\tau$ and coefficient $\alpha$ are selected from 0 to 1. The embedding dimension $d$ is chosen from $\{1024, 1500, 2048, 4096\}$, except for Ogbn-Products, where it is set to 128. The hyper-parameters for each dataset are listed in Table~\ref{tab:parametrs}. More detailed hyper-parameters can be found in the provided code.

\begin{table}
	\setlength{\tabcolsep}{4.1pt}
    \centering
    \caption{Details of the Hyper-parameters in Our Method.}
		\begin{tabular}{lccccccc}
			\toprule[1pt]
			Dataset & $lr$ &$T$ &$k$ &$\beta$ & $d$ & $\alpha$ & $\tau$\\
			\midrule[0.75pt]
                Cora  & 0.005 &100 &3 & 0.02 & 1024 & 0.1  & 0.1\\
			CiteSeer & 0.0005 &100 &2 & 0.04 & 2048 & 0.2 & 0.8\\
			Pubmed & 0.001 & 75 &2 & 0.05 & 1024 &0.4 &0.45\\
			    Wiki-CS & 0.005 & 50 & 2 & 0.01 &2048 &0.9 &0.35\\
			Amazon-Computer & 0.0005 &150 &2 &0.1  &2048  &0.6 &0.2 \\
			Amazon-Photo & 0.001 & 150 & 1 &0.03  &2048 &0.4 &0.6\\
			Coauthor-CS & 0.005 & 100 & 1 & 0.05  & 1024 &0.2 &0.6\\
			Coauthor-Physics &0.1 & 25 & 1 & 0.09  & 2048 &0.5 &0.55\\
                \midrule[0.75pt]
			Texas  & 0.0005 & 100 & 0 & 0.05 & 4096 &1.0 &0.8\\
			Wisconsin  & 0.001 & 25 & 0 & 0.07 & 4096 &0.9 &1.0\\
			Cornell  & 0.0005 & 150 & 0 & 0.06 & 2048 &0.9 &0.3\\
			Actor  & 0.001 & 25 & 0 & 0.01  & 2048 &0.9 &0.75 \\
                \midrule[0.75pt]
			Ogbn-Arxiv  & 0.001 &25 & 3 & 0.03 & 1500 &0.6 &0.4\\
			Ogbn-Products  & 0.002 &25 &10 &0.001 &128 &0.8 &0.7\\
            			\bottomrule[1pt]

		\end{tabular}
		\label{tab:parametrs}
\end{table}
\subsection{Impacts of Parameter $\alpha$} \label{app-alpha}
We explore the impacts of coefficient $\alpha$ for controlling community neighbor loss in Figure~\ref{fig-alpha}. According to Table~\ref{tab:partition-time}, we found across 13 datasets that the optimal $\alpha$ generally correlates with the homophily level: higher homophily ratio requires a smaller $\alpha$ (emphasizing local information), while lower homophily ratio requires a larger $\alpha$ (emphasizing high-order structural information). This trend was observed in most of the datasets.

\subsection{Pretext Time} \label{app-time}
We report the time consumption for community segmentation using METIS (the fastest algorithm) and Structure Entropy (the algorithm achieving the highest accuracy) for all datasets in Table~\ref{tab:partition-time}. It can be seen that METIS is very efficient. Given that METIS offers a considerable speed advantage in constructing partitions, we propose a practical recommendation based on the performance-efficiency trade-off: structural entropy is suitable for medium-scale graphs to potentially achieve better performance, whereas METIS is recommended for large-scale graphs to ensure scalability. 

\begin{figure}[tb]
	\begin{center}
		\centerline{\includegraphics[width=\columnwidth]{fig-alpha.pdf}}
		\caption{Sensitivity Analysis of the Hyper-parameter $\alpha$. We Omitted the Variance Lines in Cornell and Actor.}
		\label{fig-alpha}
	\end{center}
\end{figure}


\begin{table}[H]
	\caption{Partition Time on All Datasets.}
    \centering
		\begin{tabular}{lcccc}
			\toprule[1pt]
			Dataset & METIS Time &SE Time &Homophily Ratio &$\alpha$\\
			\midrule[0.75pt]
			Cora  & 0.021s &1.43s &0.77 &0.1\\
			CiteSeer & 0.027s &1.51s &0.63 &0.2\\
			Pubmed & 0.385s &12.42s &0.66 &0.4\\
			    Wiki-CS & 0.247s &178.14s &0.57 &0.9\\
			Amazon-Computer &0.773s  &7.04m &0.70 &0.6\\
			Amazon-Photo & 0.275s &62.58s &0.77 &0.4\\
			Coauthor-CS &  0.437s &33.22s &0.76 &0.2\\
			Coauthor-Physics	&1.226s  &11.72m &0.85 &0.5 \\
                \midrule[0.75pt]
			Texas  & 0.002s &1.18s &0.0013 &1.0\\
			Wisconsin  &  0.003s &1.18s &0.0941  &0.9 \\
			Cornell  &  0.002s &1.17s &0.0311 &0.9\\
			Actor  &  0.095s &3.72s &0.0110 &0.9\\
                \midrule[0.75pt]
			Ogbn-Arxiv  &  6.995s & 3h &0.42 &0.6 \\
			Ogbn-Products  & 79.73s & $>$12h &0.46 &0.8\\

			\bottomrule[1pt]
		\end{tabular}
		\label{tab:partition-time}
\end{table}



% \section{Model Training} \label{app-train}
% The overall procedure of our method is shown in Algorithm \ref{alg-train}.
% \begin{algorithm}
% 	\caption{Model Training}
% 	\label{alg-train}
% 	\textbf{Input}: a graph $G = (\mathbf{A}, \mathbf{X})$\\
% 	\textbf{Parameter}: number of communities $m$, hidden dimensions $d$, training epochs $T$, PCN encoder $f_\theta$, MLP encoder $f_\xi$, projection head $g_\varphi$\\
% 	\textbf{Output}: final representations $\mathbf{H}$
% 	\begin{algorithmic}[1] %[1] enables line numbers
% 		\STATE Initiate parameters $\theta$, $\xi$ and $\varphi$;
% 		\STATE $\mathcal{P} \leftarrow$ construct a partition of $G$ by METIS;
% 		\STATE Construct adjacency matrix of communities based on $\mathcal{P}$;
% 		\FOR{$t=1$ to $T$}
% 		\STATE $\mathbf{c} \leftarrow$ generate community representations via Eq. \ref{equ-pcn};
% 		\STATE $\mathbf{v} \leftarrow$ generate node representations via $f_\xi(\mathbf{X})$;
% 		\STATE $\mathcal{L}_\mathcal{P} \leftarrow$ calculate the overall loss via Eq. \ref{loss};
% 		\STATE Update model parameters $\theta$, $\xi$ and $\varphi$ via $\mathcal{L}_\mathcal{P}$;
% 		\ENDFOR
% 		\STATE $\mathbf{H} \leftarrow$ generate node representation via $\sigma(\hat{\mathbf{A}}^k\mathbf{X}\mathbf{W})$;
% 		\STATE \textbf{return} $\mathbf{H}$
% 	\end{algorithmic}
% \end{algorithm}


\end{document}
