\vspace{-2mm}
\section{Preliminaries}
\textbf{Notations.}
We %consider self-supervised graph representation learning, and 
denote by $\mathcal{G} = (\mathcal{V},\mathcal{E})$ an undirected graph, where $\mathcal{V}=\{v_1, v_2, \dots, v_N\}$ represents the node set, and $\mathcal{E} \subseteq \mathcal{V} \times \mathcal{V}$ represents the edge set. We denote by $\pmb{A}\in\{0,1\}^{N\times N}$ the symmetric adjacency matrix of graph $\mathcal{G}$, where $\pmb{A}_{ij} = 1$ if and only if $(v_i,v_j) \in \mathcal{E}$, and $\pmb{A}_{ij} = 0$ otherwise. We denote the feature matrix by $\pmb{X}$, where $\pmb{X}_{i.} \in \mathbb{R}^m$ is the feature vector of the $i^{th}$ node, 
and $\pmb{x}_{}\in\mathbb{R}^N$ is a column of the matrix and represents a graph signal. 
$\pmb{D}$ is the degree matrix of the graph, with $\pmb{D}_{ii} = \sum_j \pmb{A}_{ij}$, 
and $\mathcal{N}_i=\{j: \pmb{A}_{ij}=1\}$ is the neighborhood of node $i$. $\pmb{L}$ is the Laplacian matrix of the graph, defined as $\pmb{L} = \pmb{D} - \pmb{A}$. The normalized Laplacian matrix is denoted by $\pmb{L}_{sym} = \pmb{D}^{-\frac{1}{2}}\pmb{LD}^{-\frac{1}{2}}$, and the normalized adjacency matrix is defined as $\pmb{A}_{sym} = \pmb{D}^{-\frac{1}{2}}\pmb{AD}^{-\frac{1}{2}}$. Here, we use the renormalized version of the adjacency matrix $\pmb{\hat{A}}_{sym}= \pmb{\bar{D}}^{-\frac{1}{2}}\pmb{\bar{A}} \pmb{\bar{D}}^{-\frac{1}{2}}$ as introduced in ~\citep{kipf2016semi}, where $\pmb{\bar{A}} = \pmb{A} + \pmb{I}$, $\pmb{\bar{D}} = \pmb{D} + \pmb{I}$. 
Similarly, the renormalized Laplacian matrix is defined as $\pmb{\hat{L}}_{sym} = \pmb{I} - \pmb{\hat{A}}_{sym}$. 
%%%
$\pmb{\hat{L}}_{sym}$ is a real symmetric matrix, with orthonormal eigenvectors $\{\pmb{u}_i\}^N_{i=1} \subset \mathbb{R}^N$, and corresponding eigenvalues $\lambda_i \in [0, 2)$~\citep{chung1997spectral}. 
% Therefore, we can write $\pmb{\hat{L}}_{sym} = \pmb{U} \pmb{\Lambda} \pmb{U}^T$, where $\pmb{\Lambda} = \text{diag}([\lambda_1, \lambda_2, \cdots , \lambda_n])$.
For $\pmb{\hat{A}}_{sym}$ we have $\lambda_i(\pmb{\hat{A}}_{sym})\in(-1,1]$. \looseness=-1

\subsection{Graph CL under Homophily} \label{sec: homophily_methods}
State-of-the-art graph CL methods %work by 
explicitly augment the input graph using feature or topology augmentations, encode the augmented graphs using a GNN-based encoder, and contrast the encoded node representations
~\citep{zhu2020deep,zhu2021empirical,velickovic2019deep,thakoor2021large,qiu2020gcc}, as we will discuss in more detail next. \looseness=-1
% Below, we discuss each of these steps in more detail.

\noindent\textbf{Graph Augmentation.} 
% The augmented views are generated by %first
First, the input graph is
explicitly augmented, by altering its topology or node features. Topology augmentation methods remove or add nodes or edges, and feature augmentation methods alter the node features by masking particular columns, dropping features at random, or randomly shuffling the node features~\citep{zhu2020deep,zhu2021empirical,velickovic2019deep,thakoor2021large}.

\noindent\textbf{GNN Encoder.} The augmented graphs are then passed through a GNN-based encoder to obtain the augmented node views.
The GNN encoder produces node representations by aggregating the node features in a neighborhood as follows:\looseness=-1
\begin{equation}
 \pmb{H}^{l} = \sigma(\pmb{\hat{A}}_{sym} \pmb{H}^{l-1} \pmb{W}^{l-1}),\quad \pmb{H}^0=\pmb{X},
\end{equation}
where %$\pmb{\hat{A}}_{sym},\pmb{\hat{L}}_{sym}$ are the low-pass and high-pass filters,
$\pmb{H}^l$ contains the node representations at layer $l$ of the encoder, $\pmb{W}^{l-1}\in\mathbb{R}^{d_{l-1} \times d_{l}}$ is the weight matrix that maps the representations of layer $l-1$ to layer $l$ (with $d_l$ the representation dimension at layer $l$ and $d_0=m$), and $\sigma$ is the activation function.
%%%
Crucially, the adjacency matrix $\pmb{\hat{A}}_{sym}$ %is low-pass filter, which only preserves the low-frequency signals by 
is a low-pass filter that
aggregates every node's features with the features of nodes in its immediate neighborhood. 
% Using the low-pass filter within a 
For a multi-layer graph encoder, it
iteratively aggregates features in a multi-hop neighborhood of every node to learn its representation. 
Hence, it smooths out the node representations and produces similar representations for the nodes within the same multi-hop neighborhood. 
% The final representation of node $i$ are obtained at the last layer $L$ of the encoder. %by $\pmb{z}_i=\pmb{H}^L_{i.}$. \looseness=-1

\noindent\textbf{Contrastive Loss.}
Finally, the contrastive loss distinguishes the representations of the same node in two different augmented views, from other node representations.
%%%%
For example, the commonly used InfoNCE loss~\citep{oord2018representation} is: 
\begin{equation}
-\log \frac{e^{\text{sim}_{\tau}(\pmb{u}_{}^i, \pmb{v}_{}^i)}}{e^{\text{sim}_{\tau}(\pmb{u}_{}^i, \pmb{v}_{}^i)} + \sum\limits_{\substack{k \neq i}} e^{\text{sim}_{\tau}(\pmb{u}_{}^i, \pmb{v}_{}^k)}},
\end{equation}
where $\pmb{u}^i, \pmb{v}^i$ are the representations of two different augmented views of node $i$, $\text{sim}_{\tau}(\pmb{u}^i, \pmb{v}^k)$ is the temperature-scaled cosine similarity between $\pmb{u}^i$ and $\pmb{v}^k$, and $\tau$ is the temperature parameter.

\vspace{-2mm}
\subsection{High-pass and Low-pass graph filters} \label{sec:filter}
The adjacency and Laplacian matrices can be leveraged to filter the smooth and non-smooth graph components, and capture similarity and dissimilarity of node features to their neighborhoods.
\noindent Specifically, multiplying a graph signal by the Laplacian, $\pmb{\hat{L}}_{sym}\pmb{x}=\sum_i \lambda_i \pmb{u}_i\pmb{u}_i^T \pmb{x}$, acts as a filtering operation over $\pmb{x}$, adjusting the scale of the components of $\pmb{x}$ in the frequency domain.
The entries of every eigenvector $\pmb{u}_i$ align with a cluster of connected nodes in the graph. 
For the Laplacian matrix, a smaller eigenvalue $\lambda_i$ %is the amplitude of
corresponds to a lower frequency %}components %and corresponds to a 
(smoother) {eigenvector} 
$\pmb{u}_i$, 
and a larger cluster of connected nodes.
On the other hand, a larger $\lambda_i$ %is the amplitude of a {high frequency graph components}, and 
corresponds to a higher frequency (non-smooth)
{eigenvector} $\pmb{u}_i$,
which identifies a smaller cluster of closely connected nodes in the graph.
A Laplacian filter magnifies the {high frequency signals} that align well with basis functions corresponding to large eigenvalues $\lambda_i\in(1,2)$ and suppresses the {low frequency} signal that aligns with basis functions corresponding to small eigenvalues $\lambda_i\in[0,1]$.
That means, for small clusters of nodes that have a large alignment with $\pmb{u}_i$ corresponding to $\lambda_i>1$, the projection %$\lambda_i\pmb{u}_i\pmb{u}_i\pmb{x}$ 
{$\lambda_i \pmb{u}_i\pmb{u}_i^T \pmb{x}$} amplifies $\pmb{x}$ within the cluster and 
% the multiplication by the Laplacian, $\pmb{\hat{L}}_{sym}\pmb{x}$, enlarges their features. and 
consequently magnifies the difference in $\pmb{x}$ among the nodes within that cluster. On the other hand, for the larger clusters that align well with $\pmb{u}_i$ corresponding to $\lambda_i<1$, the projection %$\lambda_i\pmb{u}_i\pmb{u}_i\pmb{x}$ 
{$\lambda_i \pmb{u}_i\pmb{u}_i^T \pmb{x}$} suppresses $\pmb{x}$ within the cluster and reduces the differences in $\pmb{x}$ among the nodes within that cluster. 
Hence the Laplacian matrices can be generally regarded as high-pass filters ~\citep{ekambaram2014graph}, %which filters out the smooth components. 
that enlarge the differences in node features over small clusters, and smooth out the differences over larger clusters in the graph.
% Thus, eigenvectors corresponding to larger eigenvalues indicate smaller clusters of nodes in the graph.
%%
% The multiplication of Laplacian with a graph signal $\pmb{Lx}=\sum_i \lambda_i \pmb{u}_i\pmb{u}_i^T \pmb{x}$, acts as a filtering operation over $\pmb{x}$, adjusting the scale of the components of $\pmb{x}$ in frequency domain. 
% That is, the part of the signal that aligns with a large $\lambda_i>1$ will be amplified, while the part that aligns with a small $\lambda_i<1$ will be suppressed.
% In summary, the Laplacian matrix magnifies the differences in node features over small clusters of the graph, and smooths out the differences over larger clusters in the graph.
%
In contrast, affinity matrices, such as the normalized adjacency matrix, can be treated as low-pass filters ~\citep{nt2019revisiting}, which suppress and filter out non-smooth components 
%
%and suppress the differences as it 
of the signals. This is because all of the eigenvalues of the affinity matrices have magnitude at most 1, i.e.,  $\lambda_i\in(-1,1]$.
% \textbf{High-pass and Low-pass graph filters.}
% Left multiplying the Laplacian or Affinity matrices with a graph signal $\pmb{x}$ is a filtering operation over $\pmb{x}$. %which adjusts the scale of the components of $\pmb{x}$ in frequency domain.

\noindent On the node level, left-multiplying $\pmb{x}$ by the filters $\pmb{\hat{L}}_{sym}$ and $\pmb{\hat{A}}_{sym}$ can be understood as diversification and aggregation operations, respectively~\citep{luan2020complete}.
% \begin{align}\label{eq:operations} (\pmb{\hat{L}}_{sym}\pmb{x})_i=\sum_{j\in\mathcal{N}_i}\frac{1}{\pmb{D}_{ii}}(\pmb{x}_i-\pmb{x}_j), \quad (\pmb{\hat{A}}_{sym}\pmb{x})_i=\sum_{j\in\mathcal{N}_i}\frac{1}{\pmb{D}_{ii}}\pmb{x}_j.
% \end{align}
In particular, a typical GNN filters smooth graph frequencies by aggregating the node representations with those of their neighbors, using the adjacency matrix, i.e., %$\pmb{\hat{A}}_{sym}\pmb{x}$. 
\begin{align}\label{eq:operations_A} \vspace{-2mm}(\pmb{\hat{A}}_{sym}\pmb{x})_i=\sum_{j\in\mathcal{N}_i}\frac{1}{\pmb{\bar{D}}_{ii}}\pmb{x}_j.
\end{align}
Hence, it results in similar representations for the nodes in a neighborhood. In contrast, the high-pass filter only preserves the high frequencies, using the Laplacian matrix, i.e., %$\pmb{\hat{L}}_{sym}\pmb{x}$. 
\begin{align}\label{eq:operations_L} (\pmb{\hat{L}}_{sym}\pmb{x})_i=\sum_{j\in\mathcal{N}_i}\frac{1}{\pmb{\bar{D}}_{ii}}(\pmb{x}_i-\pmb{x}_j). %\quad (\pmb{\hat{A}}_{sym}\pmb{x})_i=\sum_{j\in\mathcal{N}_i}\frac{1}{\pmb{D}_{ii}}\pmb{x}_j.
\end{align}
In doing so, it magnifies the dissimilarities between the nodes %with different features 
and
makes the representations of nodes in a neighborhood distinguishable.
% Since $\pmb{\hat{L}}_{sym}+\pmb{\hat{A}}_{sym} = \pmb{I}$, both filters capture complementary information and their combination using a contrastive loss allow learning richer representations.

% \ba{I think we cannot drop the definition of homophily ratio when talking about heterophily graphs!!}

\textbf{Homophily Ratio.}
The homophily ratio quantifies how likely nodes with the {same labels} are to be connected in the graph. Formally, the homophily ratio, $\beta$, is defined as follows~\citep{pei2020geom}:
\begin{align}
\beta = \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \frac{\text{No. of neighbors of } v \text{ with the same label as } v}{\text{No. of neighbors of } v}.
\end{align}