%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
%\usepackage{subfigure} % Do not include this package; if so, it will ruin subfigure formatting. ONLY use \usepackage{subcaption}
\usepackage{booktabs} % for professional tables
\usepackage{multirow}
\usepackage[table,xcdraw]{xcolor}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% Additional packages to preamble the formulas, tables
\usepackage{macros}
\usepackage{enumitem}
%\usepackage{geometry}      % do not include this package--it will change the required UAI 2023 format
\usepackage{url}            % simple URL typesetting
\usepackage{makecell}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{floatrow}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{amsmath}
% \usepackage{multirow}
% \usepackage{graphicx}
% \usepackage[table,xcdraw]{xcolor}

% Greek-letter shorthands used throughout the math: \a -> \alpha, \b -> \beta.
% \renewcommand (rather than \newcommand) is used, i.e. both names already
% exist at this point and are overridden for the whole document.
% NOTE(review): in standard LaTeX \a and \b are text-accent helpers (tabbing
% accents / bar-under accent) -- confirm nothing in the text or the .bib file
% relies on them.
\renewcommand{\a}{\alpha}
\renewcommand{\b}{\beta}
% \newcommand{\xhdr}[1]{\vspace{0.1mm}\noindent{{\bf #1.}}}
% Space-saving layout tweaks: negative glue above/below displayed equations.
% NOTE(review): font-size commands such as \normalsize usually reset the four
% display-skip lengths, so these preamble settings may have no effect -- confirm.
\setlength{\abovedisplayskip}{-30pt}
\setlength{\belowdisplayskip}{-15pt}
\setlength{\abovedisplayshortskip}{-30pt}
\setlength{\belowdisplayshortskip}{-15pt}
% Fixed (non-stretchable) 10pt gap between floats and the text.
\setlength{\textfloatsep}{10pt plus 0.0pt minus 0.0pt}
% No extra vertical space above or below captions.
\setlength\abovecaptionskip{0pt}
\setlength\belowcaptionskip{0pt}
% Table float box with bottom caption

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\makeatletter

% \addFileDependency{<file>}: announce <file> (name including extension) as a
% dependency of this document. It
%   1. writes "(<file>)" to the terminal/log with \typeout,
%   2. passes <file> to the internal macro \@addtofilelist,
%   3. writes "No file <file>." if \IfFileExists reports the file as missing.
% Because \@addtofilelist contains an @, the definition sits between
% \makeatletter (above) and the \makeatother on its closing line.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
\typeout{(#1)}% latexmk will find this if $recorder=0
% however, in that case, it will ignore #1 if it is a .aux or
% .pdf file etc and it exists! If it doesn't exist, it will appear
% in the list of dependents regardless)
%
% Write the following if you want it to appear in \listfiles
% --- although not really necessary and latexmk doesn't use this
%
\@addtofilelist{#1}
%
% latexmk will find this message if #1 doesn't exist (yet)
\IfFileExists{#1}{}{\typeout{No file #1.}}
}\makeatother

% \myexternaldocument{<name>}: wrapper around \externaldocument{<name>} that
% additionally registers <name>.tex and <name>.aux through \addFileDependency.
% <name> is given without extension. The trailing % on each line keeps the
% line ends from inserting spaces.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
\myexternaldocument{donnat_261}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; the optional <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Studying the Effect of GNN Spatial Convolutions On The Embedding Space's Geometry \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Claire Donnat}
\author[1]{So Won Jeong}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    The University of Chicago\\
    Chicago, Illinois, USA
}
%\affil[2]{%
%    Second Affiliation\\
%    Address\\
%    …
%}
  
\begin{document}
  
\onecolumn %% Turn this off if a single column is desired for the supplement
\maketitle

\appendix
\section{Comparison of GNN operators}\label{appendix:gnn}
%%[H][b] not available if use * only [t](top of the page) [p](end of the document available
\begin{table*}[h] 
\centering
%\begin{adjustbox}{max width=0.9\textwidth}
\resizebox{0.9\textwidth}{!}{%
\begin{tabular}{|c|c|c|c|}
\hline
\textbf{Method} & \textbf{Operator Type} & \textbf{Convolution Family} & 
  \textbf{Operator} \\ \hline \hline
GCNConv \citep{kipf2016semi} & Spatial  & $\mathcal{F}_{\alpha=0.5, \beta=1}$ &
  \begin{tabular}[c]{@{}c@{}}$ X' = \hat{D}^{-1/2} \hat{A} \hat{D}^{-1/2} XW$\\ $ x_i' =  \sum_{j \in \mathcal{N}(i)} \frac{e_{ij}}{\sqrt{\hat{d}_i\hat{d}_j}}x_j $\end{tabular} \\ \hline
ChebConv \citep{defferrard2016convolutional} &Spectral & NA &
  \begin{tabular}[c]{@{}c@{}}$X' =  X W_1 + \hat{L}X W_2 + (2\hat{L}^2X - X)W_3$\\ with $\hat{L} = \frac{2}{\lambda_{\max}}L-I$\end{tabular} \\ \hline
SAGEConv \citep{hamilton2017inductive}&Spatial &  $\mathcal{M}_{\alpha=0, \beta=1}$  &
  \begin{tabular}[c]{@{}c@{}}$x_i' = W_1x_i + W_2 \bar{X}_{j \in \mathcal{N}(i)} $\\ with \\ $ \bar{X}_{j \in \mathcal{N}(i)} =\frac{\sum_{j \in \mathcal{N}(i)}x_j}{d_i}$\end{tabular} \\ \hline
GraphConv\citep{morris2019weisfeiler} &Spatial & $\mathcal{F}_{\alpha=0, \beta=0}$ &
  $x_i' = W_1 x_i + W_2\sum_{j \in \mathcal{N}(i)} e_{ij} x_j$ \\ \hline
GatedGraphConv \citep{li2015gated} &Spatial & Variant of $\mathcal{F}_{\alpha=0, \beta=0}$ &
  \begin{tabular}[c]{@{}c@{}}$h_i^{(0)} = x_i ||0$\\ $m_i^{(l+1)} = \sum_{j \in \mathcal{N}(i)} e_{j,i} Wh_j^{(l)}$\\ $h_i^{(l+1)} = GRU(m_i^{(l+1)}, h_i^{(l)}) $\end{tabular} \\ \hline
ResGatedGraphConv \citep{bresson2017residual} &Spatial & Variant of $\mathcal{F}_{\alpha=0, \beta=0}$ &
  \begin{tabular}[c]{@{}c@{}}$x_i' = W_1x_i + \sum_{j \in \mathcal{N}(i)} \eta_{ij} \circ W_2x_j$\\ with\\ $\eta_{ij} =\sigma(W_3x_i + W_4x_j) $\end{tabular} \\ \hline
 \begin{tabular}[c]{@{}c@{}} GAT \citep{velivckovic2017graph}\\ GATv2Conv \citep{brody2021attentive}
 \end{tabular}&
Spatial & Variant of $\mathcal{M}_{\alpha=0, \beta=1}$  &
  \begin{tabular}[c]{@{}c@{}}$x_i' = \alpha_{ii}Wx_i + \sum_{j\in \mathcal{N}(i)}\alpha_{ij}Wx_j $\\ with \\ $\alpha_{ij} =\frac{\exp\{ \text{LeakyReLU}(a^T[\Theta x_i || \Theta x_j])\}}{\sum_{k\in \mathcal{N}(i) \cup \{i\} } \exp\{ \text{LeakyReLU}(a^T[\Theta x_i || \Theta x_k])\}}$ \end{tabular} \\ \hline
AGNN \citep{thekumparampil2018attention}&Spatial &  $\mathcal{M}_{\alpha=0, \beta=1}$ &
  \begin{tabular}[c]{@{}c@{}}$X' =PX$\\ $P_{ij} = \frac{ \exp\{ \beta \cdot \cos(x_i,x_j) \}}{\sum_{k \in \mathcal{N}(i) \cup \{i\} }\exp\{ \beta \cdot \cos(x_i,x_k) \} }$ \end{tabular} \\ \hline
 Transformer Conv \citep{shi2020masked}&Spatial & Variant of $\mathcal{M}_{\alpha=0, \beta=0}$  &
  \begin{tabular}[c]{@{}c@{}}$x_i' = W_1 x_i +\sum_{j \in \mathcal{N}(i)} \alpha_{ij} W_2 x_j$\\ $\alpha_{ij} =\text{Softmax} \frac{(W_3 x_i)^T (W_4 x_j)}{\sqrt{d}} $\end{tabular} \\ \hline
TAGConv \citep{du2017topology}&Spectral & NA &
  $ X' = \sum_{k=0}^K (D^{-1/2} AD^{-1/2})^k XW_k$ \\ \hline
GINConv \citep{xu2018powerful}&Spatial &  $\mathcal{F}_{\alpha=0, \beta=1+\epsilon}$  &
  $X'  = h_{\theta}\Big( (\mathbf{A} + (1+\epsilon) \mathbf{I} )X \Big)$ \\ \hline
GINEConv \citep{hu2019strategies}&Spatial & Variant of $\mathcal{F}_{\alpha=0, \beta=0}$ &
  $x'_i  = h_{\theta}\Big( (1+\epsilon) x_i +\sum_{j \in \mathcal{N}(i)} \text{ReLU}(x_j +e_{ij}) \Big) $ \\ \hline
ARMAConv \citep{bianchi2021graph} &Spectral & NA &
  $X' =\frac{1}{K} \sum_{k=1}^K X_k^{(T)}$ \\ \hline
SGCConv \citep{wu2019simplifying} &Spatial & $\mathcal{F}_{\alpha=0.5, \beta=1}$ &
  $X' = (\hat{D}^{-1/2} \hat{A}\hat{D}^{-1/2} )^KXW$ \\ \hline
\end{tabular}
%\end{adjustbox}
}
\caption{Comparison of some of the different convolution operators. We report here some of the most famous existing convolutions --- but we deliberately omitted those applicable to edges, dynamic graphs, heterogeneous graphs, hypergraphs and other extensions. We report the type of convolution family (as defined in Section~\ref{sec:convolutions}) corresponding to each of the proposed convolutions. The term ``variant'' denotes some adaptation of the base family (for instance, learning the appropriate edge weights as part of the training procedure, or treating the source node differently than the sum of the neighbors).}\label{tab:operators_all}
\end{table*}

\section{Proofs of Section \ref{sec:geometry}}\label{appendix:extrinsic}
\begin{proof}[Proof of Lemma~\ref{lemma:dis}]
As per Section~\ref{sec:geometry}, we analyse the embedding that is fed into the last linear layer, denoted as:
\begin{equation*}
%\resizebox{0.475\textwidth}{!}{
\begin{split} 
H^{(K)}_{u\cdot} = \big(S\s( H^{(K-1)})\big)_{u\cdot} = \sum_{v \in \mc{N}(u) \cup\{u\}} \frac{A_{uv}}{(d_u +\beta)^{\alpha}(d_v +\beta)^{\alpha}} Z_{v\cdot}
\end{split}
%}
\end{equation*}
where $ \mc{N}(u)$ denotes the neighbourhood of node $u$, $A_{uv}$ is the $(u,v)$ entry of the (potentially weighted) adjacency matrix, with diagonal equal to $\beta$, and $Z_{v\cdot}  = \s( H^{(K-1)}_{v\cdot})$.

Writing $\Delta_v = {d_v-d_u}$, note that $H^{(K)}_{u\cdot}$ can be rewritten as:
\begin{align}\label{eq:taylor}
     H^{(K)}_{u\cdot} & = \frac{1}{(d_u +\beta)^{2\alpha}}\sum_{v \in \mc{N}(u) \cup\{u\}} \frac{A_{uv}}{(1 + \frac{\Delta_v}{d_u +\beta})^{\alpha}} Z_{v\cdot} 
\end{align}


Note that $\frac{\Delta_v}{d_u+\beta} > -1$ is equivalent to $d_v-d_u > -d_u -\beta  \iff  d_v > -\beta$, which necessarily holds since $d_v\geq 1$.  Since the function $x \mapsto  ( x+ 1)^{-\alpha}$ is infinitely differentiable for $x \in (-1, \infty)$, Taylor's theorem applied to $( x+ 1)^{-\alpha}$ around $0$ (with Lagrange remainder) guarantees that there exists $ \xi\in [\min(0, x),\max(0, x)]$ such that:
 \begin{align} \label{eq:holder} \frac{1}{(x+1)^{\a}}  =1  - \a x +  \frac{\alpha(\alpha+1)}{2} \frac{ x^2}{(\xi+1)^{\alpha+2}}   \end{align}
It is easy to check that if $\frac{\Delta_v}{d_u+\beta}\geq 0$, then $\xi \geq 0$ and thus $\frac{1}{(1+\xi)^{\alpha+2}}\leq 1$.  Otherwise, if $d_v < d_u$, then $\frac{1}{(\xi+1)^{\alpha+2}} \leq \frac{1}{(1+\frac{\Delta_v}{d_u+\beta})^{\alpha+2}} \leq \frac{1}{(1+\frac{1-d_u}{d_u+\beta})^{\alpha+2}} = \frac{1}{(\frac{\beta+1}{d_u+\beta})^{2+\alpha}}\leq (d_{\max} + \beta)^{2 + \alpha}= M$. In both cases, $\frac{1}{(\xi+1)^{\alpha+2}} \leq M$.

Equation~\ref{eq:taylor} thus becomes: 
\begin{equation}\label{eq:ineq}
\begin{split}
%\resizebox{0.9\textwidth}{!}{
||H^{(K)}_{u\cdot}||_2 &=||(S Z)_{u\cdot}||_2 \leq \frac{1}{(d_u + \b)^{2\a} } \sum_{v \in \mathcal{N}(u)\cup\{u\}} \frac{A_{uv}}{( 1 +  \frac{\Delta_v}{d_u + \beta} )^{\a}}  ||Z_{v\cdot}||_{2} \\
&\leq \frac{ || Z||_{2,\infty}}{(d_u + \b)^{2\a} } \sum_{v \in \mathcal{N}(u)\cup\{u\}} A_{uv} \Big( 1 -\alpha  \frac{\Delta_v}{d_u + \beta}+ \frac{\alpha(\alpha+1)M}{2}  \frac{\Delta^2_v}{(d_u + \beta)^2}\Big) \\
&={ ||Z||_{2,\infty}}  \Big ( ( d_u + \b)^{1-2\alpha} -  \alpha  \frac{\bar{\Delta}_u}{(d_u + \beta)^{2\alpha}}+ \frac{\alpha(\alpha+1)M}{2}  \frac{\overline{\Delta^2}_u}{(d_u + \beta)^{1 + 2\alpha}} \Big)
\end{split}
\end{equation}
where $\bar{\Delta}_u$ (respectively $\overline{\Delta^2}_u$) is the weighted average of the degree differences: $\bar{\Delta}_u = \frac{ \sum_{v \in \mathcal{N}(u)\cup\{u\}} A_{uv} \Delta_v} {d_u + \beta}$ (respectively, of the squared degree differences: $\overline{\Delta^2}_u = \frac{ \sum_{v \in \mathcal{N}(u)\cup\{u\}} A_{uv} \Delta_v^2} {d_u + \beta}$). In the previous equation, we have also introduced the notation $||Z||_{2, \infty} = \max_{v} ||Z_{v\cdot}||_2$.

% \begin{remark}
% Note that, when $\s=\text{ReLU}$, then, by positivity of the entries in $Z$, we have the following (entrywise) inequality:
% \begin{align}\label{eq:ineq2}
% 0&\leq S Z \leq \frac{1}{(d_u + \b)^{2\a} } \sum_{v \in \mathcal{N}(u)\cup\{u\}} \frac{A_{uv}}{( 1 +  \frac{\Delta_v}{d_u + \beta} )^{\a}}  Z_v \notag\\
% &\leq{ ||Z||_{\infty}}  \Big ( ( d_u + \b)^{1-2\alpha} -  \alpha  \frac{\bar{\Delta}_v}{(d_u + \beta)^{2\alpha}} + \frac{\alpha(\alpha+1)M}{2}  \frac{\overline{\Delta^2}_v}{(d_u + \beta)^{1 + 2\alpha}} \Big) 
% \end{align}
% \end{remark}
\end{proof}



\section{Proofs of Section \ref{sec:geometry-intrinsic}}\label{appendix:intrinsic}
\subsection{Section \ref{sec:geometry-intrinsic}: Proof of the observations}



We begin by revisiting in greater detail the observations made in Section~\ref{sec:geometry-intrinsic}.
To see how the two families of spatial operators differ in the importance they attribute to topology and node feature information, consider a simple two-layer GCN as suggested by \citet{kipf2016semi}. In this setting, node embeddings can be written as:
 $H = S  \s(S XW + b),$ so that the output of the network is $Y = S  \s(S XW + b)W^{(2)}+b^{(2)}=HW^{(2)} + b^{(2)}$. We also choose the non-linearity $\s$ to be the ReLU function. In this case, for the directions in which the term is positive,  the embedding $H$ (i.e., the transformed features that are being fed into the last linear layer) can be re-written as:
\begin{align}\label{eq:emb}
H_{u\cdot}= \sum_{k=1}^d \sum_{\substack{
        v \in \tilde{\mc{N}}(u)  \\
        (S XW + b)_{vk} \geq 0
    }} \big( S_{uv}(S XW)_{vk}W^{(2)}_{k\cdot} + S_{uv}b_{vk}W^{(2)}_{k\cdot} \big)
\end{align} 
    The embedding is thus the sum of two components: a function of a (subset of the) neighbouring feature vectors and a term that has the potential to encode local topology. To see why this is the case, consider a scenario where nodes in $\mc{N}(u)$ are all such that $ (S XW + b)_{vk} \geq 0$ for all $k$ or $ (S XW + b)_{vk} < 0$ for all $k$. Denote $\tilde{A}(u) = \{ v \in \tilde{\mc{N}}(u):\quad (S XW + b)_{vk} \geq 0 \quad \text{for all } k \}$. In this case, Equation~\eqref{eq:emb} becomes:
$H_{u\cdot}= \sum_{
        v \in {\tilde{A}(u)}} \big( S_{uv}(S X\tilde{W})_{v\cdot} + S_{uv}\tilde{b}_{v\cdot} \big)$, with $\tilde{b} = bW^{(2)}$ and $\tilde{W}=WW^{(2)}$.
Therefore, for symmetric convolutions, the term $ (\sum_{
        v \in {\tilde{A}(u)}} S_{uv})\tilde{b}  $ encodes information about the neighborhood (it is proportional to the number of terms in the sum, $|{\tilde{A}(u)}|$).
        Conversely, for row-normalized convolutions, this term is identically equal to $\tilde{b}$, resulting in an embedding that is less sensitive to topology.
        
        
\subsection{Proof of lemma \ref{lemma:inh}: Symmetric Convolutions}
In this subsection, we prove the results stated in Lemma~\ref{lemma:inh} for symmetric convolutions. We remind the reader of the setting of Lemma~\ref{lemma:inh}: we consider two nodes $u$ and $u'$ with structurally equivalent neighbourhoods (meaning that there exists a mapping $\phi$ that maps each node in the neighborhood of $u$ to its corresponding node in the neighborhood of $u'$ --- see Figure~\ref{fig:str_eq}), but whose feature vectors differ. Mathematically, we model this situation as:
\[ \forall v \in \mc{N}(u) \cup \{u\}, \quad  X_v = X_{\phi(v)} +\epsilon_v \]
where each $\epsilon_v$ is a vector with independent centered Gaussian entries with variance $\sigma^2$.  The purpose of this subsection is to analyze the effect of the convolution on the relative distance between embeddings. 

Lemma \ref{lemma:inh} is re-written here, to make this appendix self-contained:



\xhdr{Lemma~\ref{lemma:inh}}
{\it 
For symmetric convolutions, with probability at least $1-\delta$, with $M$ as in Lemma~\ref{lemma:dis}, we have: 
\begin{equation*}
\begin{split}
||H_u -H_{u'}||^2  &\leq \mu +  2 \sqrt{2}\s||W||_{2, \infty}(d_u +\beta)^{1-2\a} \times \sqrt{1 +  2\a |\overline{\Delta}_u|  +  \a(2\a+1) M \frac{\overline{\Delta^2}_u}{d_u}} \log(1/\delta) 
\end{split}
\end{equation*}
  where $\mu = \sigma^2||W||^2 \Big( (d_u +\beta)^{2-4\a} +  2\a |\overline{\Delta}_u | +  \a(2\a+1) M \frac{\overline{\Delta^2}_u}{d_u}\Big).$
Conversely, for row-normalized convolutions:
%\vspace{-0.2cm}
\begin{equation*}
\begin{split}
    ||H_u -H_{u'}||^2 
  & \leq \mu +  2 \sqrt{2}\s||W||_2\sqrt{\sum_{v \in \tilde{\mathcal{N}}(u)} \frac{1}{(d_v +\beta)^{2\a}}} \log(1/\delta),\\
  & \mu = \frac{\sigma^2||W||^2}{\sum_{v \in \tilde{\mc{N}}(u)} (d_v +\beta)^{-2\a}} \frac{1}{1+\beta}  
\end{split}
\end{equation*}
}



%In this case, the input that is being fed into the at the next layer has the form:
% $ Z = \sigma(S X W + b)$.
% In this proof, we restrict to considering non-linearities that are contracting (meaning that $||\sigma(X) -\s(Y)  ||_2^2 \leq  || X-Y ||^2$ --- which is the case for the ReLU function.
As previously stated, the purpose of this subsection is to analyze the effect of the convolution on the relative distance between embeddings. Consequently, we consider a simplified one-layer setting, with no non-linearities. We argue that this is indeed sufficient to characterize the effect of the convolution on the organization of the data, and we expect results for deeper networks to follow by induction, and to hold by 1-Lipschitzness of the ReLU activation for ReLU non-linear GNNs. 

\begin{proof}[Proof of Lemma~\ref{lemma:inh} (symmetric convolutions)]
In the simplified setting, the difference between the outputs of a GCN layer for nodes $u$ and $u'$ can be written as:
\begin{equation}
    \begin{split}
 %||\s(SH^{(k)}_uW+ b)  -\s(H^{(k)}_{u'}W+ b)||_2^2 &\leq  || H^{(k)}_uW -H^{(k)}_{u'}W||_2^2\\
H^{(k)}_u -H^{(k)}_{u'}&= (SX_{u}-SX_{u'})W\\
&=\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}}{(d_u+\beta)^{\alpha} (d_v +\beta)^{\alpha}}  (X_{v} -X_{\phi(v)})W \\
&= \sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}}{(d_u+\beta)^{\alpha} (d_v +\beta)^{\alpha}} \epsilon_vW
    %   &\leq \frac{1}{(d_u+1)^{2\alpha}} \sum_{v\in \mc{N}(u)} \frac{1}{ (d_j +1)^{2\alpha}} \mathbb{E}[||    X_{j} -X_{\phi(j)}||^2]\\
    %   &= p\sigma^2  \frac{1}{(d_u+1)^{2\alpha}}  \sum_{j\in N(i)} \frac{1}{ (d_j +1)^{2\alpha}} 
        \end{split}
\end{equation}
Since $\epsilon_v \sim \mc{N}(0, \sigma^2),$ each entry of the vector $ H^{(k)}_u -H^{(k)}_{u'}$ is Gaussian:
\begin{equation*}
\begin{split}
  H^{(k)}_{uj} -H^{(k)}_{u'j}&=  \sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}}{(d_u+\beta)^{\alpha} (d_v +\beta)^{\alpha}} \sum_{k=1}^d \e_{vk}W_{kj} \\ &\sim \mc{N}(0, \frac{\sigma^2||W_{\cdot j}||^2}{(d_u+\beta)^{2\alpha}}\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}^2}{(d_v +\beta)^{2\alpha}})
\end{split}
\end{equation*}

The mean of $||H^{(k)}_u -H^{(k)}_{u'}||^2$ is simply given by:
\begin{equation*}
\begin{split}
\mu &= \E[||H^{(k)}_u -H^{(k)}_{u'}||^2] \\
    &= \frac{\sigma^2||W||^2}{(d_u+\beta)^{2\alpha}}\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}^2}{(d_v +\beta)^{2\alpha}}\\
&\leq  \frac{\sigma^2||W||^2}{(d_u+\beta)^{4\alpha}}\sum_{v\in \mc{N}(u)\cup \{u\}}A_{uv}^2  (1 \begin{aligned}[t]&-  2\a \frac{\Delta_v}{d_u + \beta} +  \a(2\a+1) M \frac{\Delta_v^2}{(d_u + \beta)^2}) \end{aligned}
\end{split}
\end{equation*}
for some constant $M$, using the same Taylor expansion reasoning as for Lemma~\ref{lemma:dis}. Therefore, denoting $\tilde{d}_u =\sum_{v \in \mc{N}(u)} A_{uv}^2$ and $\overline{\tilde{\Delta}}_u = \frac{\sum_{v\in \mathcal{N}(u)}A_{uv}^2 \Delta_v}{\tilde{d}_u }, \overline{\tilde{\Delta}^2}_u = \frac{\sum_{v\in \mathcal{N}(u)}A_{uv}^2 \Delta_v^2}{\tilde{d}_u }$, we have:
\begin{equation*}
\begin{split}
\mu  &\leq  \frac{\sigma^2||W||^2}{(d_u+\beta)^{4\alpha}} \Big( (\tilde{d}_u +\beta^2) -  2\a \overline{\tilde{\Delta}}_u \frac{\tilde{d}_u}{d_u + \beta}  +   \a(2\a+1) M \overline{\tilde{\Delta}^2}_u  \frac{\tilde{d}_u}{(d_u + \beta)^2}\Big) \\
 &\overset{(i)}{\leq}  \frac{\sigma^2||W||^2}{(d_u+\beta)^{4\alpha}} \Big( (d_u +\beta)^2 +  2\a |\overline{\tilde{\Delta}}_u| \frac{{d}_u}{d_u + \beta}  +\a(2\a+1) M \frac{\overline{\tilde{\Delta}^2}_u}{d_u +\beta}\Big) \\
& \leq  \sigma^2||W||^2 \Big( (d_u +\beta)^{2-4\a} +  2\a |\overline{\tilde{\Delta}}_u |({d_u + \beta})^{-4\alpha}  +  \a(2\a+1) M \overline{\tilde{\Delta}^2}_u({d_u + \beta})^{-1-4\alpha}\Big)
\end{split}
\end{equation*}
where line (i) follows from the fact that, assuming the edge weights are less than 1, $A_{uv}^2 \leq A_{uv}$, implying that $\tilde{d}_u \leq d_u$. 

% Note that in the case where $\b\leq 1$ (so that $\b^2\leq 1)$, this bound can be made even tighter by considering:
% \begin{equation*}
%     \begin{split}
%     \mu \leq  \sigma^2||W||^2 \Big( (d_u &+\beta)^{2-4\a}\\&+  2\a |\overline{\Delta}_u |({d_u + \beta})^{-4\alpha} \\
%     &+  \a(2\a+1) M \overline{\Delta^2}_u({d_u + \beta})^{-1-4\alpha}\Big).
%     \end{split}
% \end{equation*}


Let us now turn to the analysis of the concentration of this norm. By Gaussianity of each of its coordinates, the squared norm
$|| H^{(k)}_u -H^{(k)}_{u'}||^2 = \sum_{j=1}^p \big(\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}}{(d_u+\beta)^{\alpha} (d_v +\beta)^{\alpha}} \e_{v\cdot} W_{\cdot j}\big)^2 $ is  sub-exponential.% with parameters $(\frac{2 \sigma^2||W||_2^2}{(d_u+\beta)^{2\alpha}}\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}^2}{(d_v +\beta)^{2\alpha}}$.

To see this, note that since each of the $p$ coordinates of the vector $H^{(k)}_u -H^{(k)}_{u'}$ is Gaussian with variance $\tilde{\s}_j^2=\frac{\sigma^2||W_{\cdot j}||^2}{(d_u+\beta)^{2\alpha}}\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}^2}{(d_v +\beta)^{2\alpha}}$, its square is sub-exponential with parameters $(2\tilde{\s}_j^2 ,4\tilde{\s}_j^2)$ \citep{wainwright2019high}, so the squared norm (i.e., the sum of the squared entries) is sub-exponential with parameters:
\begin{equation*}
    \begin{split}
    (&2\sum_{j=1}^p \tilde{\s}_j^2 , 4\max_j \tilde{\s}_j^2) = 
    \Big( 2\frac{\sigma^2||W||^2}{(d_u+\beta)^{2\alpha}}\begin{aligned}[t]\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}^2}{(d_v +\beta)^{2\alpha}},4 \frac{\sigma^2||W||_{2, \infty}^2}{(d_u+\beta)^{2\alpha}}\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}^2}{(d_v +\beta)^{2\alpha}} \Big).\end{aligned}
    \end{split}
\end{equation*}
By property of the sub-exponential tail, we know that:
\begin{equation}
\begin{split}
 \P[||H^{(k)}_u -&H^{(k)}_{u'}||^2  - \mu \geq t] \leq \max(e^{-t^2/(4\sum_{j=1}^p \tilde{\s}_j^2)}, e^{-t/(2\sqrt{2}\sqrt{\sum_{j=1}^p \tilde{\s}_j^2})} )
\end{split}
\end{equation}

 Therefore, with probability at least $1-\delta$, for any $\delta \in (0,1)$, we must have:
\begin{equation*}
\begin{split}
&||H^{(k)}_u -H^{(k)}_{u'}||^2  \\
&\leq \mu +  2\sqrt{2}
 \sqrt{\frac{\sigma^2||W||_{2, \infty}^2}{(d_u+\beta)^{2\alpha}}\sum_{v\in \mc{N}(u)\cup \{u\}} \frac{A_{uv}^2}{(d_v +\beta)^{2\alpha}}}\log(1/\delta) \\
& \leq \mu \begin{aligned}[t]&+  2 \sqrt{2}\s||W||_{2, \infty}(d_u +\beta)^{1-2\a} \\
  &\times \sqrt{1 +  2\a |\overline{\Delta}_u | +  \a(2\a+1) M \frac{\overline{\Delta^2}_u}{d_u+\beta} } \log(1/\delta)\end{aligned}
\end{split}
\end{equation*}

The concentration is thus a function of the node degree: the leading term is in $(d_u + \beta)^{1-2\alpha}$, and we observe again the existence of a critical threshold at $\alpha=0.5$.
\end{proof}

\subsection{Proof of lemma \ref{lemma:inh}: the case of row-normalized Convolutions}

We now turn to the proof of Lemma~\ref{lemma:inh} for row-normalized convolutions.

\begin{proof}[Proof of Lemma~\ref{lemma:inh} (row-normalized convolutions)]
In the case of row-normalized convolutions, we have instead:
\begin{equation}
    \begin{split}
(SX_{u}-SX_{u'})W &= \sum_{v\in \mc{N}(u)\cup \{u\}}s_{uv} \epsilon_vW
        \end{split}
\end{equation}
where, as highlighted in section \ref{sec:geometry}, $s_{uv}$ is proportional to  $\frac{1}{(d_v+ \beta)^{\a}}$, but does not depend on $d_u$. In this case, following a similar reasoning to the previous subsection:

\begin{align*} 
\mu &= \E[||H^{(k)}_u -H^{(k)}_{u'}||^2] \\
&= \frac{\sigma^2||W||^2}{Z^2}\sum_{v\in \mc{N}(u)\cup \{u\}} s_{uv}^2 \quad \text{with } Z = \sum_{v\in \mc{N}(u)\cup \{u\}} s_{uv} \\
&\leq  \frac{\sigma^2||W||^2}{Z} \max_{v\in \mc{N}(u)\cup \{u\}}\{s_{uv}\} \quad \text{ by H\"older's inequality} \\
&\leq  \frac{\sigma^2||W||^2}{Z} \beta \quad \text{ assuming } \beta \geq 1 \implies \max_{v\in \mc{N}(u)\cup \{u\}}\{s_{uv}\} \leq \beta
% &\leq  \frac{\sigma^2||W||^2}{Z^2} \frac{1}{(d_{\min} +\beta)^{\alpha}}\\
% &\leq  \frac{\sigma^2||W||^2}{Z^2} \frac{1}{(1+\beta)^{\alpha}} 
\end{align*}
\end{proof}

\subsection{Toy example 2}

In this second toy example, we consider the converse situation: $u$ and $u'$ have radically different neighborhoods from a topological perspective, but have similar features:
\[ \forall j \in \mc{N}(u) \cup \mc{N}(u') ,\quad X_j = \bar{X} \]

\begin{description}[noitemsep, leftmargin=0.5cm]
\item[In the symmetric case:]
\begin{equation*}
    \begin{split}
  &||(SX)_{u\cdot} - (SX)_{u'\cdot}||^2  \\
&= || \sum_{v\in \tilde{\mc{N}}(u)} \begin{aligned}[t]&\frac{A_{uv}}{(d_u+\beta)^{\alpha}  (d_v +\beta)^{\alpha}}  \bar{X} \\&- \sum_{v'\in \tilde{\mc{N}}(u')} \frac{A_{u'v'}}{(d_{v'}+\beta)^{\alpha} (d_{u'} +\b)^{\alpha}} \bar{X} ||^2\end{aligned}\\
&= \Big( \sum_{v\in \tilde{\mc{N}}(u)} \begin{aligned}[t]&\frac{A_{uv}}{(d_u+\beta)^{\alpha}  (d_v +\beta)^{\alpha}} \\&- \sum_{v'\in \tilde{\mc{N}}(u')} \frac{A_{u'v'}}{(d_{v'}+\beta)^{\alpha} (d_{u'} +\b)^{\alpha}} \Big)^2||\bar{X}||^2\end{aligned}\\ 
&= \Big( \sum_{v\in \tilde{\mc{N}}(u)} \begin{aligned}[t]&\frac{A_{uv}}{(d_u+\beta)^{2\alpha}  (1 + \frac{\Delta_v}{d_u +\beta})^{\alpha}} \\&- \sum_{v'\in \tilde{\mc{N}}(u')} \frac{A_{u'v'}}{(d_{u'}+\beta)^{2\alpha} (1 +\frac{\Delta_{v'}}{d_{u'} +\beta})^{\alpha}} \Big)^2||\bar{X}||^2\end{aligned}\\ 
&=  \Big( \begin{aligned}[t]&(d_u + \beta)^{1-2\alpha} -\alpha \overline{\Delta}_u \\&+ \frac{\alpha(\alpha+1)}{2(d_u + \beta)^{2 + 2\alpha}}  \sum_{v\in \tilde{\mc{N}}(u)}  \frac{A_{uv}(d_v-d_u)^2}{((1-t_v) + t_v\frac{\Delta_v}{d_u +\beta} + \beta)^{2 + 2\alpha}} \\
&- (d_{u'} + \beta)^{1-2\alpha} +\alpha \overline{\Delta}_{u'} \\&- \frac{\alpha(\alpha+1)}{2(d_{u'} + \beta)^{2 + 2\alpha}}  \sum_{v'\in \tilde{\mc{N}}(u')}  \frac{A_{u'v'}(d_{v'}-d_{u'})^2}{((1-t_{v'}) + t_{v'}\frac{\Delta_{v'}}{d_{u'} + \beta} + \beta)^{2 + 2\alpha}} \Big)^2||\bar{X}||^2  \end{aligned}
\end{split}
\end{equation*}

where $t_v, t_{v'} \in [0,1]$.
In this case, note that:
\begin{itemize}
    \item When $\a=0$, this difference can be written as:
    $ ||(SX)_{u\cdot} - (SX)_{u'\cdot}||^2   = (d_u  -d_{u'})^2 ||\bar{X}||^2 $, and is thus extremely sensitive to the degree of the nodes,
    \item When $\a=1$, the difference can be written as:
\begin{equation*}
\begin{split}
&||(SX)_{u\cdot} - (SX)_{u'\cdot}||^2    \\
&=  \Big( \begin{aligned}[t]&\frac{1}{d_u + \beta} - \overline{\Delta}_u \\&+ \frac{1}{(d_u + \beta)^{4}}  \sum_{v\in \tilde{\mc{N}}(u)}  \frac{A_{uv}(d_v-d_u)^2}{((1-t_v) + t_v\frac{\Delta_{v}}{d_{u} + \beta} + \beta)^{4}} \\&- \frac{1}{d_{u'} + \beta} + \overline{\Delta}_{u'} \\&- \frac{1}{(d_{u'} + \beta)^{4}}  \sum_{v'\in \tilde{\mc{N}}(u')}  \frac{A_{u'v'}(d_{v'}-d_{u'})^2}{((1-t_{v'})+ t_{v'}\frac{\Delta_{v'}}{d_{u'} + \beta} + \beta)^{4}} \Big)^2||\bar{X}||^2 \end{aligned}
\end{split}
\end{equation*}
   In this case, the leading terms are functions of the inverse of the node degrees $\frac{1}{d_{u} + \beta}- \frac{1}{d_{u'} + \beta} $ and the difference in local homogeneity of topology $\overline{\Delta}_u - \overline{\Delta}_{u'}$. Consequently, the distance is still sensitive to topological properties of the neighborhood.
   \item When $\alpha=0.5$: in this case, the distance writes as:
\begin{equation*}
    \begin{split}
     ||(SX)_{u\cdot} - (SX)_{u'\cdot}||^2 & =  \Big(  \frac{1}{2}\overline{\Delta}_{u'}-\frac{1}{2}\overline{\Delta}_u + \frac{3}{8(d_u + \beta)^{3}}  \sum_{v\in \tilde{\mc{N}}(u)}  \frac{A_{uv}(d_v-d_u)^2}{((1-t_v) + t_v\frac{\Delta_v}{d_u +\beta} + \beta)^{3}}\\
&- \frac{3}{8(d_{u'} + \beta)^{3}}  \sum_{v'\in \tilde{\mc{N}}(u')}  \frac{A_{u'v'}(d_{v'}-d_{u'})^2}{((1-t_{v'}) + t_{v'}\frac{\Delta_{v'}}{d_{u'} +\beta} + \beta)^{3}} \Big)^2||\bar{X}||^2   
    \end{split}
\end{equation*}

Consequently, this distance is less directly related to the degree of the node, and relies more on the topological traits of the neighborhood.
\end{itemize}

\item[In the row-normalized case:]
\begin{equation}
    \begin{split}
        &||(SXW_1 + b_1)_u-(SXW_1 + b_1)_{u'} ||^2 =  0
        \end{split}
\end{equation}
In this case, the distance is entirely driven by the features.
\end{description}


\xhdr{Results of the experiments} 
\begin{figure*}[h]
     \centering
\includegraphics[width=\textwidth]{NEW_FIGURES/hubs_and_spokes_panel_norm.004.png}
\caption{Results for our Structural Equivalents experiment}\label{fig:str_eq}
\end{figure*}
%\vspace{-0.3cm}



\section{Proofs of Section \ref{sec:properties}}\label{appendix:properties}
To better formalize our setting, we propose considering a specific family of graphs: the degree-corrected Stochastic Block Model (DC-SBM) \citep{karrer2011stochastic} on two classes of equal size $n$. Let each node have class $Z_i \in \{1, 2\}$, and denote 
 $X_i = \mu^{(Z_i)} + \epsilon_i$ its attributes. According to the DC-SBM, each edge in the network is sampled according to a Bernoulli distribution:
$ A_{ij}  \sim \text{Bernoulli}(\theta_i \theta_j \omega_{Z_iZ_j}),$ where $\theta_i$ is a popularity parameter such that, for each group $g$:
$ \sum_{i=1}^n \theta_{i}1_{Z_i=g} = n,$
and $\omega_{gh}$ is the parameter of the model corresponding to the probability of connection between groups $g$ and $h$. Note that, under this model, the expected number of edges from community $g$ to community $h$ is simply $m_{gh} =n^2 \omega_{gh}$. In particular, picking $\theta_i=1$ for all $i$ corresponds to the traditional stochastic block model.
We will also assume that $\forall i, \theta_i \in [\frac{1}{\kappa}, \kappa] $ where $\kappa>1$. In other words, the degree distribution cannot be too skewed.

The degree of each node $i$ can thus be rewritten as:
\begin{small}
\begin{align*}
d_i & = \sum_{j=1, j\neq i}^n A_{ij} \\
\E[d_i]& = \sum_{j=1, j\neq i}^n \theta_i \theta_j \omega_{Z_iZ_j}= \theta_i [ (n-\theta_i)\omega_{11} +n \omega_{12}] = n \theta_i [ (1-\frac{\theta_i}{n})\omega_{11} + \omega_{12}] = \theta_i \bar{\omega} n + O(1) .
\end{align*}
\end{small}
where $\bar{\omega}  = \omega_{11} + \omega_{12}.$

A trivial application of the bounded difference inequality shows that the scaled degree $n^{-1}d_i$ concentrates rapidly around its mean \citep[see][Chapter~2]{wainwright2019high}:
\[ \mathbb{P}\Big[ \frac{1}{n}| d_i -\mathbb{E}[d_i]| > t \Big] \leq 2 e^{-2nt^2}. \]


\xhdr{Effect of the convolution under the DCSBM model} Let us now focus on the effect of the convolution. We have:
\begin{equation}
    \begin{split}
        Z_i & = \frac{\beta}{(\beta + d_i)^{2\alpha}} X_i + \sum_{j=1, j\neq i}^n \frac{A_{ij}}{(\sum_{k\neq j, i} A_{ki} + A_{ij} + \beta)^{\alpha}(\sum_{k\neq j, i} A_{kj} + A_{ij} + \beta)^{\alpha } }X_j \\
        \implies         \E[Z_i] & = \E[\frac{\beta}{(\beta + d_i)^{2\alpha}}]\E[X_i] + \sum_{j=1, j\neq i}^n \E\big[\E[\frac{A_{ij}}{(z_i + A_{ij})^{\alpha}(z_j + A_{ij})^{\alpha } }\Big|  \sum_{k\neq j, i} A_{ki} +\beta = z_i, \sum_{k\neq j, i} A_{kj} + \beta= z_j]\big]\E[X_j] 
    \end{split}
\end{equation}
Consider the term $\E[\frac{A_{ij}}{(z_i + A_{ij})^{\alpha}(z_j + A_{ij})^{\alpha } }\Big| \beta + \sum_{k\neq j, i} A_{ki} = z_i, \beta + \sum_{k\neq j, i} A_{kj} = z_j]$. Conditionally on $z_i$ and $z_j$, the variable inside the expectation is binary: it takes the value $\frac{1}{(z_i+1)^{\alpha}(z_j+1)^{\alpha}} $ with probability $\theta_i \theta_j \omega_{Z_iZ_j}$, and 0 otherwise.
Therefore:

$$ \E\Big[\frac{A_{ij}}{(z_i + A_{ij})^{\alpha}(z_j + A_{ij})^{\alpha } }\Big| \beta + \sum_{k\neq j, i} A_{ki} = z_i, \beta + \sum_{k\neq j, i} A_{ki} = z_j\Big] = \frac{\theta_i \theta_j \omega_{Z_iZ_j}}{(z_i+1)^{\alpha}(z_j+1)^{\alpha}} $$
Thus the trick becomes to characterize the behaviour of the random variable $\tilde{Y}=\frac{1}{(z_i+1)^{\alpha}(z_j+1)^{\alpha}} $. Note that, by construction, $z_j$ and $z_i$ are independent of one another. Since the function $\phi: x \to x^{-\alpha}$ is continuous, by the continuous mapping theorem, we know that $\phi(X_n)$ converges to $\phi(\E[X]) = \frac{1}{(\E[\lim_{n\to \infty} X_n ])^{\alpha} }$. Here, we have shown above that:
$$ \lim_{n\to \infty} \frac{d_i}{n} =\theta_i \bar{\omega}.  $$
Therefore, assuming (without loss of generality) that $Z_i=1$, so that $\E[X_i]= \mu^{(1)}$:
\begin{equation}
    \begin{split}     
      n^{2\alpha}  \E[Z_i] & = \E[\frac{\beta}{(\frac{\beta}{n} + \frac{d_i}{n})^{2\alpha}}]\E[X_i] + \sum_{j=1, j\neq i}^n \E\big[\frac{\theta_i \theta_j \omega_{Z_iZ_j}}{(\frac{z_i}{n}+\frac{1}{n})^{\alpha}(\frac{z_j}{n}+\frac{1}{n})^{\alpha}} \big]\E[X_j] \\
    n^{2\alpha} \E[Z_i] & = \frac{\beta}{(\bar{\omega} \theta_i)^{2\alpha}}\E[X_i] + \sum_{j=1, j\neq i}^n \frac{\theta_i \theta_j \omega_{Z_iZ_j}}{\theta_i^{\alpha}\theta_j^{\alpha}\bar{\omega}^{2\alpha}}\mu^{(Z_j)}  + O(1)\\
    \end{split}
\end{equation}
Therefore
\begin{equation}
    \begin{split}     \E[Z_i] &  = \frac{\beta}{n^{2\alpha} \bar{\omega}^{2\alpha} \theta_i^{2\alpha}} \mu^{(1)} +   
n^{-2\alpha} \theta_i^{1-\alpha} \frac{\omega_{11}}{\bar{\omega}^{2\alpha}} \sum_{j \neq i, Z_j=Z_i} \theta_j^{1-\alpha} \mu^{(1)} +   
n^{-2\alpha} \theta_i^{1-\alpha} \frac{\omega_{12}}{\bar{\omega}^{2\alpha}} \sum_{j \neq i, Z_j\neq Z_i} \theta_j^{1-\alpha} \mu^{(2)} +o(n^{-2\alpha})\\
    &  = \frac{\beta}{n^{2\alpha} \bar{\omega}^{2\alpha} \theta_i^{2\alpha}} \mu^{(1)} +   
n^{-2\alpha} \theta_i^{1-\alpha} \frac{\omega_{11}}{\bar{\omega}^{2\alpha}}  (S_1 - \theta_i^{1-\alpha} \mu^{(1)}) +   
n^{-2\alpha} \theta_i^{1-\alpha} \frac{\omega_{12}}{\bar{\omega}^{2\alpha}} S_2  +o(n^{-2\alpha})
    \end{split}
\end{equation}
where $S_1 = \sum_{j: Z_j = Z_i} \theta_j^{1-\alpha} \mu^{(1)}$ and $S_2 = \sum_{j: Z_j \neq Z_i} \theta_j^{1-\alpha} \mu^{(2)}$.
 
This shows that the embedding scales as $n^{1-2\alpha}$: for $\alpha=1$, we see that the embedding will converge to $0$, as is observed empirically. Conversely, for $\alpha=0$, the embedding can expand. 
This expression is also informative about the role of the degree parameter. As we can see, the embedding is directly proportional to $\theta_i^{1-\alpha}$. Consequently, for $\alpha=1$, the leading term is independent of $\theta_i$, whereas for $\alpha=0$, the embedding is directly proportional to $\theta_i$. 

To see this, we provide the following example. Consider a DC-SBM graph on 300 nodes with two classes, with connectivity parameters $\omega_{11}=\omega_{22}=0.1$ and  $\omega_{12}=\omega_{21}=0.005$. The features here are taken to be multivariate normal with  $\mu^{(1)}=2,\mu^{(2)}=-2$ and standard deviation equal to 4. We generate the $\theta_i$ for each group from a lognormal distribution, with mean 0 and standard deviation 3.   The histogram of the degree distribution is provided in Figure~\ref{fig:deg}, along with a plot of the original features $X \in \R^{n \times 2}$ in Figure~\ref{fig:original_embeddings}.



\begin{figure}
     \centering
     \begin{subfigure}[t]{0.49\textwidth}
         \centering
         \includegraphics[width=\textwidth]{FIGS/degree_distribution_sbm.png}
         \caption{Degree distribution}
         \label{fig:deg}
     \end{subfigure}
     \hfill
     \begin{subfigure}[t]{0.49\textwidth}
         \centering
         \includegraphics[width=\textwidth]{FIGS/original_emb.png}
         \caption{Original node attributes.}
         \label{fig:original_embeddings}
     \end{subfigure}
     \caption{Degree distribution and raw attributes in the DC-SBM serving as our example.}
\end{figure}



\begin{figure}
         \centering
         \includegraphics[width=\textwidth]{FIGS/embeddings_after_conv.png}
         \caption{Attributes after convolution for different values of $\alpha$ and $\beta=1.$}
         \label{fig:emb}
\end{figure}


% It is further possible to show that the embedding concentrates as well around its mean. To this end, we apply the bounded difference inequality to the embedding function $Z_i= f(A, X)$ as a function of the adjacency matrix $A$.
% Note that in this case, a change of a single entry in $A_{ij}$ results in a change that can be upper-bounded as follows:
% \begin{description}
%  \item[If $A_{ij}$ is flipped from 0 to 1:] in this case, we have:
%  \begin{equation}
%     \begin{split}
%         Z_i -\tilde{Z}_i &= \frac{\mu}{( 1 + d_i)^\alpha( 1+ d_j)^\alpha} + \sum_{k\neq j} \frac{A_{ik}\mu_{Z_k}}{d_k^{\alpha}} [ \frac{1}{( 1+ d_i)^\alpha} - \frac{1}{( d_i)^\alpha}] \\
%         \|  Z_i -\tilde{Z}_i \|   & \leq \frac{\mu_{\infty}}{( d_i+1)^\alpha} \Big[ \frac{1}{(\beta)^\alpha} + \sum_{k\neq j}\frac{A_{ik}}{\beta^\alpha} \big| \frac{(d_i+1)^{\alpha}}{\beta^{\alpha}}  - 1\big| \Big] \quad \text{ using } \frac{1}{d_k^{\alpha}} \leq \frac{1}{\beta^{\alpha}} \\
%          & \leq \frac{\mu_{\infty}}{\beta^\alpha d_i^\alpha} \Big[ 1  + d_i   \frac{\alpha}{d_i} \Big]\\
%         &\leq \frac{C}{d_i^{\alpha}}  \leq \frac{C}{\theta_i n^{\alpha} \bar{\omega}^{\alpha}}
%     \end{split}
% \end{equation}
% Therefore, with high probability, the difference is bounded.
%  \item[If $A_{ij}$ is flipped from 1 to 0:] then in this case, 
%   \begin{equation}
%     \begin{split}
%         Z_i -\tilde{Z}_i &= \frac{\mu}{d_i^\alpha d_j^\alpha} + \sum_{k\neq j} \frac{A_{ik}\mu_{Z_k}}{d_k^{\alpha}} [ \frac{1}{(d_i-1)^\alpha} - \frac{1}{( d_i)^\alpha}] \\
%         \|  Z_i -\tilde{Z}_i \|   & \leq \frac{\mu_{\infty}}{d_i^\alpha\beta^\alpha}\Big[ 1 + \sum_{k\neq j}A_{ik} \big| \frac{(d_i+1)^{\alpha}}{d_i^{\alpha}}  - 1\big| \Big]\\
%          & \leq \frac{\mu_{\infty}}{d_i^\alpha\beta^\alpha} \Big[ 1 + \alpha \Big]\\
%         &\leq \frac{C}{d_i^{\alpha}} 
%     \end{split}
% \end{equation}
% \end{description}
% Thus, with high probability, the differences are bounded. 
% On that set, by the bounded difference inequality, we know that the (appropriately rescaled) embedding will concentrate around its mean:

% $$ \mathbb{P}[ n^{-1}| Z_i -\mathbb{E}[Z_i]| > t ] \leq 2 e^{-2\frac{t^2n^{2}}{\tilde{C} \sum_{j=1, j\neq i}^{n} \theta_i C \theta_i^{-\alpha} \bar{\omega}^{-\alpha} n^{-\alpha}}}\leq 2 e^{-2\frac{t^2n^{2-4\alpha}}{\tilde{C} \theta_i C \theta_i^{-\alpha} \bar{\omega}^{-\alpha} n^{1-\alpha}}} = 2 e^{-2\frac{t^2n^{\alpha}}{\tilde{C} \theta_i C \theta_i^{-\alpha} \bar{\omega}^{-\alpha}}} $$
% So the higher the $\alpha,$ the faster the convergence.



% So the variables are only weakly coupled and the embedding converges to its mean.
% Now, we knoe that:
% $ \E[Z_i] = \sum_{ j} \frac{\theta_i \theta_j n^2 p}{(1+d_i)^{\alpha}(1+d_j)^{\alpha}}$
% \begin{lemma}
% For symmetric embeddings, the embedding $Z_i$ for node $i$ concentrates around its mean:
% $$Z_i = \sum_{j=1}^n \frac{\theta_i^{1-\alpha} \theta_j^{1-\alpha}}{ ((1-\theta_i)\omega_{11}  +  \omega_{12})^{\alpha} ((1-\theta_j)\omega_{11}  +  \omega_{12})^{\alpha}} \omega_{Z_iZ_j} \mu_{Z_i} $$
% \end{lemma}




\begin{equation}
    \begin{split}
    \E[\frac{A_{ij}}{d_i^{\alpha}}] = \E [ \frac{\theta_i \theta_j \omega_{Z_iZ_j}}{ \E[d_i] + W}] \leq \frac{\theta_i \theta_j \omega_{Z_iZ_j}}{\E[d_i]} ( 1 - \frac{W}{\E[d_i]} +  \frac{W^2}{\E[d_i]^2})
        \end{split}
\end{equation}
So as long as the fluctuations around the mean are controlled, the entire expression remains manageable.
\begin{description}
\item[For Row-normalized convolutions:]
\begin{equation}
    \begin{split}
        H^{(1)} &= \mathbb{E}[\sum_{i\in G_1} \frac{A_{ij}}{\beta + \sum_{k}A_{ik}} X  + \sum_{i\in G_2}^n \frac{A_{ij}}{\beta + \sum_{k}A_{ik}} X   ]\\
 &=  \mathbb{E}[ (\frac{\beta}{(\beta + d_i)^{2\alpha}}  + 
\sum_{ j \in G_1} \frac{1}{(\beta + d_i)^{\alpha}} \frac{1}{(\beta + d_j)^{\alpha}}  ) \mu^{(1)} \\
&+  \sum_{i\in G_2}^n \frac{\theta_i \theta_j q}{\beta + \theta_i \sum_j\theta_j } \mu^{(2)}   ] \\
    \end{split}
\end{equation}

\begin{equation}
    \begin{split}
        H^{(1)} &= \sum_{i\in G_1} \frac{\theta_{i}\theta_{j}p}{\theta_i^{\alpha} (m_1-\theta_i + m_2)^{\alpha}\theta_j^{\alpha} (m_1-\theta_j + m_2)^{\alpha}}\mu^{(1)}  \\
        &+ \sum_{i\in G_2}^n \frac{\theta_i\theta_j q}{\theta_i^{\alpha} (m_1-\theta_i + m_2)^{\alpha}\theta_j^{\alpha} (m_1-\theta_j + m_2)^{\alpha}} \mu^{(2)} \\
 &= \theta_i^{1-\alpha}\Big( \sum_{i\in G_1} \frac{\theta_{j}^{1-\alpha}p}{(m_1-\theta_i + m_2)^{\alpha} (m_1-\theta_j + m_2)^{\alpha}}\mu^{(1)}  \\
        &+ \sum_{i\in G_2}^n \frac{\theta_j^{1-\alpha} q}{(m_1-\theta_i + m_2)^{\alpha}(m_1-\theta_j + m_2)^{\alpha}} \mu^{(2)} \Big)\\
    \end{split}
\end{equation}
Several cases:
\begin{itemize}
    \item When $\alpha =0:$
    $$  H^{(1)}= \theta_i\Big( \sum_{i\in G_1} \theta_{j} p \mu^{(1)}  + \sum_{i\in G_2}^n \theta_j q  \mu^{(2)} \Big)$$
        $$  H^{(1)}= \theta_i (m_1 - \theta_i) p \mu^{(1)}  +  \theta_im_2 q  \mu^{(2)}$$
        \item When $\alpha =1:$
    $$  H^{(1)}=  \Big(  \frac{p}{(m_1+m_2)^{2}} \mu^{(1)} +   \frac{q}{(m_1+m_2)^{2}} \mu^{(2)} \Big) $$
\end{itemize}

We also have:
\begin{equation}
    \begin{split}
   A &=      \sum_{i\in G_1} \frac{\theta_{i}^{1-\alpha}\theta_{j}^{1-\alpha}p}{(m_1 + m_2)^{2\alpha}}(1 -\alpha \frac{\theta_i}{m_1})(1 -\alpha \frac{\theta_j}{m_1})\mu^{(1)} \\
   &+ \sum_{i\in G_2} \frac{\theta_{i}^{1-\alpha}\theta_{j}^{1-\alpha}q}{(m_1 + m_2)^{2\alpha}}(1 -\alpha \frac{\theta_i}{m_1})(1 -\alpha \frac{\theta_j}{m_1})\mu^{(2)} \\
    \end{split}
\end{equation}
\item[For Row-normalized convolutions:]
\begin{equation}
    \begin{split}
   A &=   \theta_{i}^{1-\alpha} \Big[ \sum_{i\in G_1}\frac{\theta_{j}^{1-\alpha}p}{(m_1 + m_2)^{2\alpha}}(1 -\alpha (\frac{\theta_i}{m_1} + \frac{\theta_j}{m_1}))  \\
   &+ \sum_{i \in G_2}\frac{\theta_{j}^{1-\alpha}p}{(m_1 + m_2)^{2\alpha}}(1 -\alpha (\frac{\theta_i}{m_1} + \frac{\theta_j}{m_1}))  \Big]\\
    B/A &=  \frac{\theta_{j}^{1-\alpha}p}{(m_1 + m_2)^{2\alpha}}(1 -\alpha (\frac{\theta_i}{m_1} + \frac{\theta_j}{m_1})) / \Big[ \sum_{i\in G_1}\frac{\theta_{j}^{1-\alpha}p}{(m_1 + m_2)^{2\alpha}}(1 -\alpha (\frac{\theta_i}{m_1} + \frac{\theta_j}{m_1}))  \\
   &+ \sum_{i \in G_2}\frac{\theta_{j}^{1-\alpha}p}{(m_1 + m_2)^{2\alpha}}(1 -\alpha (\frac{\theta_i}{m_1} + \frac{\theta_j}{m_1}))  \Big]
       \end{split}
   \end{equation}
\begin{equation}
    \begin{split}
   A &=      \sum_{i\in G_1} \frac{\theta_{i}^{1-\alpha}\theta_{j}^{1-\alpha}p}{(m_1 + m_2)^{2\alpha}}(1 -\alpha \frac{\theta_i}{m_1})(1 -\alpha \frac{\theta_j}{m_1})\mu^{(1)} \\
   &+ \sum_{i\in G_2} \frac{\theta_{i}^{1-\alpha}\theta_{j}^{1-\alpha}q}{(m_1 + m_2)^{2\alpha}}(1 -\alpha \frac{\theta_i}{m_1})(1 -\alpha \frac{\theta_j}{m_1})\mu^{(2)} \\
    \end{split}
\end{equation}

Maybe a good way of understanding things is through a sensitivity analysis.
\end{description}

\section{Further results and experiments}\label{appendix:experiments}

We analyzed the impact of the choice of operator across $\alpha$ and $\beta$ using standard benchmark datasets, focusing on the node classification task. The performance turns out to depend on the choice of operator as well as on the inherent characteristics of the datasets. We use visualizations to further investigate the properties of each embedding space in relation to the choice of operator. The code for the experiments can be found \href{https://github.com/sowonjeong/gnn-geometry-uai}{here}.

\subsection{Dataset statistics}

% Data Related Statistics
\begin{table*}%[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|c|r|r|r|r|r|r|r|r|}
\hline
\rowcolor[HTML]{C0C0C0} 
Name &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{Node}} &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{Edge}} &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{Features}} &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{Class}} &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Avg.\\ Degree\end{tabular}}} &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Mean\\ Centrality\end{tabular}}} &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{$h_{\mathrm{edges}}$}} &
  \multicolumn{1}{c|}{\cellcolor[HTML]{C0C0C0}\textbf{$h_{\mathrm{nodes}}$}} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{Cora}          & 2,708   & 10,556         & 1,433 & 7  & 3.90  & 1.65E-03                 & {\color[HTML]{000000} 0.81} & {\color[HTML]{000000} 0.83} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{pubMed}        & 19,717  & 88,648         & 500   & 3  & 4.50  & 2.71E-04                 & {\color[HTML]{000000} 0.80} & {\color[HTML]{000000} 0.79} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{Citeseer}      & 3,327   & \textit{9,104} & 3,703 & 6  & 2.74  & 1.02E-03                 & {\color[HTML]{000000} 0.74} & {\color[HTML]{000000} 0.71} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{Coauthor CS}   & 18,333  & 163,788        & 6,805 & 15 & 8.93  & 2.42E-04                 & {\color[HTML]{000000} 0.81} & {\color[HTML]{000000} 0.83} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{Amazon Photos} & 7,650   & 238,162        & 745   & 8  & 31.13 & 3.82E-04                 & {\color[HTML]{000000} 0.83} & {\color[HTML]{000000} 0.84} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{Actor}         & 7,600   & 30,019         & 932   & 5  & 3.95  & 3.18E-04                 & {\color[HTML]{000000} 0.22} & {\color[HTML]{000000} 0.21} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{Cornell}       & 183     & 280            & 1,703 & 5  & 1.53  & 1.07E-04                 & {\color[HTML]{000000} 0.31} & {\color[HTML]{000000} 0.21} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{Wisconsin}     & 251     & 515            & 1,703 & 5  & 2.05  & 2.42E-04                 & {\color[HTML]{000000} 0.20} & {\color[HTML]{000000} 0.13} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{PATTERN}       & 108     & 4,884          & 3     & 2  & 45.22 & 5.45E-03                 & {\color[HTML]{000000} 0.67} & {\color[HTML]{000000} 0.69} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{CLUSTER}       & 117     & 4,104          & 7     & 6  & 35.08 & 6.07E-03                 & {\color[HTML]{000000} 0.37} & {\color[HTML]{000000} 0.36} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{WikiCS}        & 11,701  & 297,110        & 300   & 10 & 25.39 & 1.76E-04                 & {\color[HTML]{000000} 0.69} & {\color[HTML]{000000} 0.64} \\ \hline
\rowcolor[HTML]{FFFFFF} 
\textbf{OGBN-arxiv}    & 169,343 & 1,166,243      & 128   & 40 & 6.89  & \cellcolor[HTML]{9B9B9B} & 0.66                        & \cellcolor[HTML]{9B9B9B}    \\ \hline
\end{tabular}%
}
\caption{Statistics for datasets used for experiments. Node and edge homophily indices are calculated using the formulas suggested by \citet{Pei2020GeomGCN} and \citet{Zhu2020BeyonHomophily}, respectively.}
\label{tab:data-stats}
\end{table*}


\xhdr{Datasets} We used twelve datasets for experiments, including eight standard benchmark datasets, namely Cora, Pubmed, Citeseer, Coauthor CS, Amazon Photos, Actor, Cornell, and Wisconsin, and four novel benchmarks proposed by \citet{dwivedi2020benchmarking}. These datasets include, in particular, synthetic graphs ({PATTERN}, {CLUSTER})---which offer a more controlled environment to perform experiments---as well as social/academic networks such as {WikiCS} and {OGBN-arxiv}. We use the processed version provided by PyTorch Geometric~\citep{fey2019fast}. Detailed statistics for the datasets used in the experiments are shown in Table~\ref{tab:data-stats}. 

We further expand on our experimental results by considering the benchmarks proposed by \citet{dwivedi2020benchmarking}. 

\textit{Citation networks}. Cora, Citeseer, and Pubmed are standard citation network benchmark datasets~\citep{Yang2019Revisiting}. In these networks, nodes represent scientific publications, and edges denote citation links between publications. Node features are the bag-of-words representation of papers, and
the node label is the academic topic of a paper.

\textit{Coauthor}. In the Coauthor CS network~\citep{Shchur2018Pitfalls}, each node represents an author of scientific publications, and an edge indicates that two authors have coauthored a paper. Node features are bag-of-words representations of these publications, and node labels denote the field of study. 

\textit{Amazon}. In the Amazon Photo network~\citep{Shchur2018Pitfalls}, nodes represent goods and edges indicate that two goods are frequently bought together. Node features are bag-of-words representations of product reviews.

\textit{WebKB}. WebKB~\citep{Craven1998WebKB} is a webpage dataset collected from computer science departments of various universities by Carnegie Mellon University. We use Cornell and Wisconsin among them. Nodes represent web pages, and edges are hyperlinks between them. Node features are the bag-of-words representation of web pages. The web pages are manually classified into five categories: student, project, course, staff, and faculty.

\textit{Co-occurrence network}. The Actor dataset is the actor-only induced subgraph of the film-director-actor-writer network~\citep{Pei2020GeomGCN}. Each node corresponds to an actor, and an edge between two nodes denotes co-occurrence on the same Wikipedia page. Node features correspond to keywords in the associated Wikipedia pages. 

\textit{PATTERN and CLUSTER} We used the processed version provided by PyTorch Geometric\citep{fey2019fast}. In \citep{dwivedi2020benchmarking}, the 10,000 training graphs are used to train the model for node classification task. In our experiments, we only used the first graph from the respective datasets and randomly split the training and test nodes within the graph for each training epoch. 

\textit{WikiCS}. We used the processed version provided by PyTorch Geometric\citep{fey2019fast}. 

\textit{OGBN-arxiv}. We used dataset from \href{https://github.com/snap-stanford/ogb}{\textit{Open Graph Benchmark github repository}} \citep{hu2020ogb}.

\subsection{Experiment setup}

\xhdr{Models} We use a two-layer GCN~\citep{kipf2016semi} model with varying families of spatial convolution operators across the choice of $\alpha$ while keeping $\beta \in \{0,1\}$. For each experiment, we randomly split the data into training and test sets (using the default number of train and test points in PyTorch Geometric). The number of training nodes used is specified in Table~\ref{table:training-details}. 

The GCN model in \citet{dwivedi2020benchmarking} uses batch normalization between GCN layers, unlike our experiments for standard benchmark datasets (e.g., Cora, Pubmed, Citeseer). Batch normalization alters the geometry of the embedding space, which is the main focus of this paper. Consequently, to enable the comparison between the experiments presented in this section and those on traditional benchmark datasets, we train the model with and without batch normalization. Further training details, including data split, number of experiments, and learning rate, are also summarized in Table~\ref{table:training-details}. 
\begin{table*}[]
\resizebox{\textwidth}{!}{%
\begin{tabular}{|
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |
>{\columncolor[HTML]{FFFFFF}}c |}
\hline
\cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} \\
\cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} &
  \cellcolor[HTML]{C0C0C0} \\
\multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{Dataset}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Batch\\ Normalized\end{tabular}}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Num of\\ training\\ nodes\end{tabular}}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Learning\\ rate\end{tabular}}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{Epoch}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Num of\\ exp\end{tabular}}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Training\\ time(sec)\end{tabular}}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Dim of \\ hidden\\ layers\end{tabular}}} &
  \multirow{-3}{*}{\cellcolor[HTML]{C0C0C0}\textbf{\begin{tabular}[c]{@{}c@{}}Num of\\ GCN\\ layers\end{tabular}}} \\ \hline
\textbf{Cora} &
  \textbf{X} &
  140 &
  0.02 &
  200 &
  50 &
  9.58 &
  32 &
  2 \\ \hline
\textbf{pubMed} &
  \textbf{X} &
  140 &
  0.001 &
  500 &
  30 &
  100.49 &
  32 &
  2 \\ \hline
\textbf{Citeseer} &
  \textbf{X} &
  1694 &
  0.05 &
  500 &
  30 &
  31.73 &
  32 &
  2 \\ \hline
\textbf{Coauthor CS} &
  \textbf{X} &
  9194 &
  0.05 &
  200 &
  30 &
  150.68 &
  32 &
  2 \\ \hline
\textbf{Amazon Photos} &
  \textbf{X} &
  3844 &
  0.05 &
  200 &
  30 &
  88.04 &
  32 &
  2 \\ \hline
\textbf{Actor} &
  \textbf{X} &
  3804 &
  0.02 &
  200 &
  30 &
  13.02 &
  32 &
  2 \\ \hline
\textbf{Cornell} &
  \textbf{X} &
  87 &
  0.01 &
  500 &
  30 &
  2.95 &
  32 &
  2 \\ \hline
\textbf{Wisconsin} &
  \textbf{X} &
  126 &
  0.001 &
  500 &
  30 &
  3.52 &
  32 &
  2 \\ \hline
\cellcolor[HTML]{FFFFFF} &
  \textbf{O} &
  42 &
  0.01 &
  300 &
  10 &
  5.39 &
  32 &
  2 \\ \cline{2-9} 
\multirow{-2}{*}{\cellcolor[HTML]{FFFFFF}\textbf{PATTERN}} &
  \textbf{X} &
  42 &
  0.01 &
  200 &
  30 &
  1.38 &
  32 &
  2 \\ \hline
\cellcolor[HTML]{FFFFFF} &
  \textbf{O} &
  47 &
  0.005 &
  500 &
  10 &
  8.07 &
  32 &
  2 \\ \cline{2-9} 
\multirow{-2}{*}{\cellcolor[HTML]{FFFFFF}\textbf{CLUSTER}} &
  \textbf{X} &
  47 &
  0.005 &
  500 &
  30 &
  4.36 &
  32 &
  2 \\ \hline
\cellcolor[HTML]{FFFFFF} &
  \textbf{O} &
  5851 &
  0.001 &
  300 &
  5 &
  916.48 &
  120 &
  2 \\ \cline{2-9} 
\multirow{-2}{*}{\cellcolor[HTML]{FFFFFF}\textbf{WikiCS}} &
  \textbf{X} &
  5851 &
  0.1 &
  200 &
  30 &
  124.96 &
  32 &
  2 \\ \hline
\textbf{OGBN-arxiv} &
  \textbf{X} &
  16124 &
  0.005 &
  500 &
  5 &
  7496.92 &
  64 &
  2 \\ \hline
\end{tabular}%
}

\caption{Hyperparameters and training details for all datasets. Training time(sec) is the training time for the first epoch applying neither normalization nor regularization.}\label{table:training-details}
\end{table*}


\xhdr{Hardware and Software Specifications}. Our models are implemented with Python 3.8.8, PyTorch Geometric 2.0.5 \citep{fey2019fast}, and PyTorch 1.10.0 \citep{pytorch2019}. We conduct experiments on a computer equipped with 2.3 GHz Quad-Core Intel Core i7 processor and Intel Iris Plus Graphics 1536 MB. 

\subsection{Experiment results}
\label{sec:app-results}

In this section, we highlight the results of our experiments on the various datasets aforementioned. Additional plots are provided in the folder of supplementary materials associated with this paper.

\subsubsection{Node Classification}
First, we want to investigate the impact of the choice of operators on the node classification task. We observe that the performance of the node classification task varies by choice of $\alpha$. We fix $\beta = 1$---in other words, we add self-loops, consistently with the standard GCN architecture. In general, we observe that performance is highly dependent on the choice of $\alpha$, especially for the symmetrized operator, whereas the node classification performance of the row-normalized operator is relatively robust to the choice of $\alpha$.


\begin{figure*}
     \centering
          \begin{subfigure}[t]{0.33\textwidth}
              \centering
    \includegraphics[width=1.0\textwidth, height=4cm]{Experiment_Figures_Fin/Accuracy/amazon_accuracy.png}
    \caption{Amazon Photos ($h_e=0.83$)}
     \end{subfigure}
          \begin{subfigure}[t]{0.32\textwidth}
              \centering
    \includegraphics[width=1.0\textwidth, height=4cm]{Experiment_Figures_Fin/Accuracy/cornell_accuracy.png}
    \caption{Cornell   ($h_e=0.30$)}
     \end{subfigure}
              \begin{subfigure}[t]{0.32\textwidth}
              \centering
    \includegraphics[width=1.0\textwidth, height=4cm]{Experiment_Figures_Fin/Accuracy/wisconsin_accuracy.png}
    \caption{Wisconsin  ($h_e=0.21$)}
     \end{subfigure}
    \caption{Effect of $\alpha$ on the performance of the algorithm for our family of convolutions defined in Eq.~\eqref{eq:normalized} and Eq.~\eqref{eq:regularized} (30 independent experiments, with random training and test sets). Here, $h_e$ denotes the edge homophily in the dataset (defined as the fraction of edges whose vertices share the same label). Note the strong dependency of the results on $\alpha$. See Appendix~\ref{appendix:experiments} for further details and results.}
    \label{fig:intro2}
\end{figure*}


\begin{figure*} %[H]
    \centering
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Citeseer_embedding_normal_loop_True_PCA_class.png}
            \caption{Citeseer, symmetrized, PCA, colored by node label}
        \end{subfigure}
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Citeseer_embedding_normal_loop_True_UMAP_class.png}
            \caption{Citeseer, symmetrized, UMAP, colored by node label}
        \end{subfigure}
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Citeseer_embedding_diffusion_loop_True_PCA_class.png}
            \caption{Citeseer, row-normalized, PCA, colored by node label}
        \end{subfigure}
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Citeseer_embedding_diffusion_loop_True_UMAP_class.png}
            \caption{Citeseer, row-normalized, UMAP, colored by node label}
        \end{subfigure}
    \caption{Citeseer. The plots are colored by node labels (academic topics). For the embedding spaces generated by the symmetric operator, (a) and (b), the level of distinction between the clusters of different node labels decreases as $\alpha$ increases. The embedding space generated by the row-normalized operator seems to be robust to the choice of $\alpha$---it gives a relatively constant level of clustering regardless of $\alpha$.}
    \label{fig:citeseer}
\end{figure*}

\begin{figure*}%[H]
    \centering
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Wisconsin_embedding_normal_loop_True_PCA_class.png}
            \caption{Wisconsin, symmetrized, PCA, colored by node label}
        \end{subfigure}
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Wisconsin_embedding_normal_loop_True_UMAP_class.png}
            \caption{Wisconsin, symmetrized, UMAP, colored by node label}
        \end{subfigure}
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Wisconsin_embedding_diffusion_loop_True_PCA_class.png}
            \caption{Wisconsin, row-normalized, PCA, colored by node label}
        \end{subfigure}
        \begin{subfigure}{1\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/classification/Wisconsin_embedding_diffusion_loop_True_UMAP_class.png}
            \caption{Wisconsin, row-normalized, UMAP, colored by node label}
        \end{subfigure}
    \caption{Wisconsin. The plots are colored by node label (categories of the webpages). Unlike in Figure~\ref{fig:citeseer}, it is hard to detect a change in the level of clustering or separation of each node class as $\alpha$ varies. The PCA-transformed embedding plot for the row-normalized operator (c) even shows that the clustering of node labels improves as $\alpha$ gets closer to 1.}
    \label{fig:wisc}
\end{figure*}


\xhdr{Analysis} For standard homophilic datasets such as Citeseer (Figure~\ref{fig:citeseer}), the clustering of each node class becomes less identifiable for the symmetric operator as $\alpha$ increases. On the other hand, the node classes remain well separated across values of $\alpha$ for the row-normalized operator---the row-normalized operator is robust to the choice of $\alpha$ when it comes to node classification performance. Conversely, for the datasets with low homophily shown in Table~\ref{tab:data-stats}, such as Wisconsin (Figure~\ref{fig:wisc}), the separation of the node labels does not change much depending on the choice of $\alpha$ or the choice of operator. The visual inspections of the embedding space transformed by PCA and UMAP are in line with the numerical results for test accuracy in Figure~\ref{fig:node_class}. 

Consistently with the results for standard benchmark sets, we observe that the performance of the model also depends on the choice of $\alpha$ for \textit{PATTERN}, \textit{CLUSTER}, \textit{WikiCS}, and \textit{OGBN-arxiv} (see Table~\ref{tab:experiments-combined}). Without batch normalization, the row-normalized convolutions are quite stable (the variation in $\alpha$ only induces gains in accuracy of 8\% for CLUSTER, and 3\% for WikiCS). By contrast, the tuning of $\alpha$ has a more dramatic effect on the performance of the symmetrized convolution, yielding increases of up to 81\% and 17\% for these two datasets. Not only does the performance differ by the choice of $\alpha$, but the resulting embedding space is also affected. As in Figure~\ref{fig:amazon-node-degree}, Figure~\ref{fig:wikics-node-degree} shows the analogous arrangement of embedded nodes by their node degree. In particular, the high-degree nodes are concentrated at the origin as $\alpha$ increases, and the lower-degree nodes are located at the margin of the embedding space for both symmetric and row-normalized operators. Overall, these experiments confirm the phenomena observed in the previous section.



\label{sec:app-results-classification}
\begin{figure*}%[H]
    \centering
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Accuracy/citeseer_accuracy.png}
            \caption{Citeseer}
        \end{subfigure}
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Accuracy/wisconsin_accuracy.png}
            \caption{Wisconsin}
        \end{subfigure}
    \caption{Test accuracy for node classification task on two datasets: Citeseer and Wisconsin.}
    \label{fig:node_class}
\end{figure*}


\begin{figure*}%[H]
    \centering
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/pattern_accuracy.png}
            \caption{PATTERN without Batch Normalization}
        \end{subfigure}
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/pattern_BN_accuracy.png}
            \caption{PATTERN with Batch Normalization}
        \end{subfigure}
    \caption{Test accuracy for node classification task. Only the first graph of PATTERN dataset is used, and the nodes within the graph are randomly split into training and test data per each training epoch.}
    \label{fig:pattern-node_class}
\end{figure*}

\begin{figure*}%[H]
    \centering
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/cluster_accuracy.png}
            \caption{CLUSTER without Batch Normalization}
        \end{subfigure}
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/cluster_BN_accuracy.png}
            \caption{CLUSTER with Batch Normalization}
        \end{subfigure}
    \caption{Test accuracy for node classification task. Only the first graph of CLUSTER dataset is used, and the nodes within the graph are randomly split into training and test data per each training epoch.}
    \label{fig:cluster-node_class}
\end{figure*}


\begin{figure*}%[H]
    \centering
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/wiki_accuracy.png}
            \caption{WikiCS without Batch Normalization}
        \end{subfigure}
        \begin{subfigure}{.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/wiki_BN_accuracy.png}
            \caption{WikiCS with Batch Normalization}
        \end{subfigure}
    \caption{Test accuracy for node classification task on WikiCS}
    \label{fig:wiki-node_class}
\end{figure*}




\subsubsection{Degree} \label{sec:app-results-degree}

In this subsection, we propose to investigate how basic topological characteristics (more specifically, here, the node degree) drive the organization of the embedding space. Consequently, to complement the analysis performed in the main text, we conduct visual inspection on the embedding plots of our benchmark datasets colored by node degree. 

Figures~\ref{fig:co-author-cs-node-degree} and~\ref{fig:amazon-node-degree} show the embedding spaces transformed by PCA, where the size and color of the points denote the node degree. We note that the high-degree nodes are pushed to the margin when $\alpha$ is close to 0, while lower-degree nodes tend to be located at the origin. As $\alpha$ gets closer to 1, this pattern seems to be reversed---the higher-degree nodes are located at the origin and the lower-degree nodes are pushed out to the periphery.

\begin{figure*}%[H]
     \centering
     \begin{subfigure}{1\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embeddings/CoauthorCS_embedding_normal_loop_True_PCA.png}
    \caption{Coauthor CS, symmetrized, colored by degree}
    \end{subfigure}
    \begin{subfigure}{1\textwidth}
        \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embeddings/CoauthorCS_embedding_diffusion_loop_True_PCA.png}
        \caption{Coauthor CS, row-normalized, colored by degree}
    \end{subfigure}    
    \caption{Coauthor CS. The point size and color denote the node degree. For both symmetrized and row-normalized operator, high degree nodes are located farther from the origin when $\alpha \approx 0$. As $\alpha$ increases, high degree nodes are concentrated on the origin, and low degree nodes are spread out instead.}
    %
    \label{fig:co-author-cs-node-degree}
\end{figure*}


\begin{figure*}%[H]
     \centering
     \begin{subfigure}{1\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embeddings/AmazonPhoto_embedding_normal_loop_True_PCA.png}
    \caption{Amazon Photo, symmetrized, colored by degree}
    \end{subfigure}
    \begin{subfigure}{1\textwidth}
        \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embeddings/AmazonPhoto_embedding_diffusion_loop_True_PCA.png}
        \caption{Amazon Photo, row-normalized, colored by degree}
    \end{subfigure}
    \caption{Amazon Photo. The point size and color denote the node degree. The Amazon Photo network has a few nodes with extremely high degree ($>500$). Even for such nodes, as $\alpha$ increases the effect of high degree vanishes and all points are clustered near the origin.}
    \label{fig:amazon-node-degree}
\end{figure*}


    
\begin{figure*}%[H]
     \centering
     \begin{subfigure}{1\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/WikiCS_embedding_normal_loop_True_PCA.png}
    \caption{WikiCS, symmetrized, colored by degree, without Batch Normalization}
    \end{subfigure}
    \begin{subfigure}{1\textwidth}
        \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/WikiCS_embedding_diffusion_loop_True_PCA.png}
        \caption{WikiCS, row-normalized, colored by degree, without Batch Normalization}
    \end{subfigure}
    \begin{subfigure}{1\textwidth}
        \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/WikiCS_BN_embedding_normal_loop_True_PCA.png}
        \caption{WikiCS, symmetrized, colored by degree, with Batch Normalization}
    \end{subfigure}
    \begin{subfigure}{1\textwidth}
        \includegraphics[width=\textwidth]{Experiment_Figures_Fin/NewBenchmark/WikiCS_BN_embedding_diffusion_loop_True_PCA.png}
        \caption{WikiCS, row-normalized, colored by degree, with Batch Normalization}
    \end{subfigure}
    \caption{WikiCS. The point size and color denote the node degree. As we have observed from the experiment with Cora or Citeseer, with and without Batch Normalization, as $\alpha$ gets close to 1, the high degree nodes are concentrated near the origin and the low degree nodes are spread out in the embedding space.}
    \label{fig:wikics-node-degree}
\end{figure*}




\subsubsection{Distance to the Original Space}\label{sec:app-results-distance}

In this subsection, we investigate the link between the relative distances between embedding points, and that of the original data.

\xhdr{Distance Calculation} The original dataset provides two separate views of the data, for which we can define two separate notions of distance: (1) a distance based on the graph structure (e.g., the adjacency matrix), and (2) a distance based on the node features. For the graph, we consider a distance based on the diffusion distance~\citep{coifman2006diffusion}, using a Gaussian kernel with $\epsilon = 0.5$:
\begin{align*}
    K(u,v) = \exp\left(-\frac{d_{\text{shortest path}}(u, v)_{\alpha}^{2}}{\epsilon}\right).
\end{align*} 

The shortest-path distance is computed with the built-in function of NetworkX~\citep{networkx}. The distance in the feature space is measured by the pairwise Euclidean distance between node features. Finally, the distance in the embedding space is the pairwise $\ell_2$ (Euclidean) distance.

\xhdr{Correlation Analysis} A natural question is how closely the embedding space resembles the original graph space or the feature space. The notion of closeness can be defined in several ways; in this experiment, we first examine the correlation between the distances in the original space and those in the embedding space. We use Spearman's rank correlation, which measures the monotonic relationship between the two.

A higher correlation can be interpreted as a larger amount of information regarding the graph structure or node features being retained in the embedding space. In Figure~\ref{fig:distance-corr-graph}, both datasets show a decreasing correlation as $\alpha$ increases; however, the correlation itself is close to 0 ($<0.05$). It is therefore reasonable to suspect that no information regarding the graph structure has been preserved in the embedding space; we come back to this question in Section~\ref{sec:app-results-curvature}.

Figure~\ref{fig:distance-corr-feature} shows a trend consistent with the test accuracy for the node classification task (Figure~\ref{fig:node_class}). That is, for the row-normalized operator, the amount of node feature information is relatively constant for both the standard homophilic dataset and a heterophilic dataset such as Wisconsin. On the other hand, for the symmetrized operator on the Cora dataset, the correlation between the embedding space and the original feature space drops from 0.15 (when $\alpha \approx 0.6$) to 0.005 (when $\alpha \approx 1$). For the Wisconsin dataset, the correlation still increases for the symmetrized operator, but its absolute value is lower than that of the row-normalized operator.



\begin{figure*}%[H]
     \centering
     \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/dist_corrCora_graph.png}
    \caption{Cora}
    \end{subfigure}
      \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/dist_corrWisconsin_graph.png}
    \caption{Wisconsin}
    \end{subfigure}
     \caption{Spearman's correlation between the pairwise distances in the graph space and pairwise distance in the embedding space. }
     \label{fig:distance-corr-graph}
\end{figure*}

\begin{figure*}%[H]
    \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/dist_corrCora_feature.png}
    \caption{Cora}
    \end{subfigure}
    \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/dist_corrWisconsin_featrue.png}
    \caption{Wisconsin}
    \end{subfigure}
    \caption{Spearman's correlation between the pairwise distances in the feature space and pairwise distance in the embedding space}
    \label{fig:distance-corr-feature}
\end{figure*}




\xhdr{Gromov-Wasserstein Distance} To this end, we use the Gromov-Wasserstein distance~\citep{Memoli2011GW}, which allows us to measure the distance between two probability spaces of different dimensions by comparing the within-space distances of each probability space. By estimating the Gromov-Wasserstein distance, we can evaluate how close our embedding space is to the original space depending on the choice of operator.

Based on the within-space distances calculated as described earlier, the Gromov-Wasserstein distance is computed using the Python function \verb|ot.gromov.gromov_wasserstein| from the \verb|ot| package~\citep{flamary2021pot} (\href{https://pythonot.github.io/quickstart.html}{\textit{https://pythonot.github.io}}). The detailed values from the computations are shown in Figure~\ref{fig:gw-graph} and Figure~\ref{fig:gw-feature}.

\xhdr{Analysis} The distance must be interpreted in the opposite way to the correlation from the earlier subsection: the lower the distance, the more information regarding the graph structure or the features is preserved in the embedding space. First, Cora shows opposite patterns for the distance to the graph space and the distance to the feature space. For Cora, the feature information is maximally preserved when $\alpha \approx 0.5$ for both the symmetrized and row-normalized operators, while the information regarding the graph structure is minimally preserved. When $\alpha \approx 0$ or $\alpha \approx 1$, the distance to the graph space is close to 0, while the distance to the feature space is close to its highest value.

On the contrary, Wisconsin seems to have a similar pattern of distance for both the graph and feature spaces. For the embedding space recovered by the symmetrized operator, the information on both the graph structure and the node features is minimally retained when $\alpha \approx 0.5$. With the row-normalized operator, the distances to the original spaces increase as $\alpha$ increases.

% How close is 0.01 in GW distance? 


\begin{figure*}%[H]
    \centering
        \begin{subfigure}[t]{0.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Gromov-Wasserstein/GW_plot forCora_graph.png}
            \caption{Cora}
        \end{subfigure}
        \begin{subfigure}[t]{0.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Gromov-Wasserstein/GW_plot forWisconsin_graph.png}
            \caption{Wisconsin}
        \end{subfigure}
    \caption{Gromov-Wasserstein distance between the graph space and embedding space. For both datasets, when $\alpha$ is close to 0 or 1, the distance between two spaces is small for symmetrized operator. } 
    \label{fig:gw-graph}
\end{figure*}




\begin{figure*}%[H]
    \centering
        \begin{subfigure}[t]{0.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Gromov-Wasserstein/GW_plot forCora_node.png}
            \caption{Cora}
        \end{subfigure}
        \begin{subfigure}[t]{0.49\textwidth}
            \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Gromov-Wasserstein/GW_plot forWisconsin_node.png}
            \caption{Wisconsin}
        \end{subfigure}
    \caption{Gromov-Wasserstein distance between the feature space and embedding space. Note that the variation of the distance across $\alpha$ is very similar to that of the node classification accuracy across $\alpha$ in Figure~\ref{fig:node_class}.}
    \label{fig:gw-feature}
\end{figure*}


\subsubsection{Curvature} \label{sec:app-results-curvature}

In this section, the embedding space is compared to the original space with regard to the geometry of the original space. We first narrow down the notion of geometry to graph curvature. Graph curvature can explain structural properties of the data that cannot be fully captured by the node degree. One might reasonably wonder how well this structural information, or geometry of the graph, is preserved from the original space to the embedding space. We use the augmented Forman curvature for graphs defined by \citet{Giovanni2022Curvature}.

\begin{table*}%[H]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}ccccccccc@{}}
\toprule
 &
  \textbf{Cora} &
  \textbf{Pubmed} &
  \textbf{Citeseer} &
  \textbf{\begin{tabular}[c]{@{}c@{}}Coauthor\\ CS\end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}c@{}}Amazon\\ Photo\end{tabular}} &
  \textbf{Actor} &
  \textbf{Cornell} &
  \textbf{Wisconsin} \\ \midrule
\textbf{Mean} &
  -9.6178 &
  -18.9898 &
  -3.2427 &
  -14.4801 &
  -99.8552 &
  -8.2625 &
  -41.9855 &
  -46.2206 \\
\textbf{SD} &
  16.0352 &
  15.9152 &
  8.5414 &
  11.7537 &
  110.3011 &
  21.2629 &
  43.2669 &
  52.3953 \\ \bottomrule
\end{tabular}%
}
\caption{Mean and standard deviation of the augmented Forman curvature~\citep{Giovanni2022Curvature} for the 8 datasets.}
\end{table*}

To calculate the graph curvature on the embedding space, we have to reconstruct the graph on the embedding space. First, based on the Euclidean distances between nodes in the embedding space, we connect the same number of edges as in the original graph. With this ``reconstructed graph'' on the embedding space, we calculate the graph curvature. Finally, we use Spearman's rank correlation to compare how much curvature is preserved under the different operators.

Figure~\ref{fig:curvature-corr} shows that, for (a) Cora and (b) Citeseer, standard datasets with high homophily (see Table 3), the Spearman's correlation between the original curvature and the embedding curvature is relatively constant around 0.3 with the row-normalized operator. On the other hand, the symmetrized operator has a stronger positive correlation when $\alpha \approx 1$. For datasets with low homophily (heterophilic graph datasets), such as (c) Cornell or (d) Wisconsin, not only is the absolute value of the Spearman's correlation much lower than for the homophilic datasets, but there is also a decreasing trend across $\alpha$ for both the symmetrized and row-normalized operators.

Based on these observations, the geometry in terms of curvature seems to be better preserved when $\alpha \approx 1$ for datasets with high homophily. When the graph has low homophily, the symmetrized operator works slightly better at preserving the curvature, but the absolute value of the correlation itself is fairly low compared with the results for high-homophily datasets, such as Figure~\ref{fig:curvature-corr} (a) or (b).


\begin{figure*}%[H]
     \centering
     \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/Curvature_corrCora.png}
    \caption{Cora}
    \end{subfigure}
    \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/Curvature_corrCiteseer.png}
    \caption{Citeseer}
    \end{subfigure}
     \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/Curvature_corrCornell.png}
    \caption{Cornell}
    \end{subfigure}
    \begin{subfigure}{.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Correlation/Curvature_corrWisconsin.png}
    \caption{Wisconsin}
    \end{subfigure}
    \caption{Spearman's correlation between the original and embedding curvature. }
    \label{fig:curvature-corr}
\end{figure*}


\begin{figure*}%[H]
     \centering
     \begin{subfigure}{1\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embedding_colored_by_curvature/Cora_curvature_normal_loop_True_PCA.png}
    \caption{Cora, symmetrized, colored by embedding curvature}
    \end{subfigure}
     \begin{subfigure}{1\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embedding_colored_by_curvature/Cora_curvature_diffusion_loop_True_PCA.png}
    \caption{Cora, row-normalized, colored by embedding curvature}
    \end{subfigure}
    \caption{Cora. The points are colored by embedding curvature and the size is proportional to the original curvature.}
    \label{fig:cora-curvature-embedding}
\end{figure*}




\begin{figure*}%[H]
     \centering
     \begin{subfigure}{1\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embedding_colored_by_curvature/Wisconsin_curvature_normal_loop_True_PCA.png}
    \caption{Wisconsin, symmetrized, colored by embedding curvature}
    \end{subfigure}
     \begin{subfigure}{1\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Embedding_colored_by_curvature/Wisconsin_curvature_diffusion_loop_True_PCA.png}
    \caption{Wisconsin, row-normalized, colored by embedding curvature}
    \end{subfigure}
    \caption{Wisconsin. The points are colored by embedding curvature and the size is proportional to the original curvature.}
    \label{fig:wisconsin-curvature-embedding}
\end{figure*}

\subsection{Node Homophily}
In this section, we focus on the effect of $\beta$ on the prediction performance. Graph Neural Networks implicitly assume that neighboring nodes share similar properties~\citep{McPherson2001BoF}. To overcome this shortcoming, there have been several attempts~\citep{Zhu2020BeyonHomophily,Pei2020GeomGCN,Jin2021UGCN} to improve the performance on datasets with low node homophily. In our experiments, we show that by simply adjusting $\beta$, we can obtain comparable empirical performance on low-homophily graphs without any architectural adjustment.

\xhdr{Analysis}
We use both the synthetic dataset (Synthetic-Cora) provided by \citet{Zhu2020BeyonHomophily}, with varying levels of node homophily, and real datasets (Actor, Cornell). We observe node classification accuracy on both the synthetic and real datasets that is competitive with models that rely on architectural adjustments.

For \textit{synthetic-Cora}, we observed that for both the symmetric and row-normalized operators, the node prediction accuracy increases as $\beta$ increases. However, the performance drops sharply if $\beta$ is increased too much ($\beta = 50$).

For the \textit{Actor} data, we observed that the prediction accuracy using both the symmetric and row-normalized operators increases monotonically as $\beta$ increases. Compared to the literature, 35.86\% ($H_2$GCN) and 31.63\% (Geom-GCN), our experiments showed reasonable performance of up to 36.3\% for the symmetric operator and 36.3\% for the row-normalized operator.

For the \textit{Cornell} data, we observed that the prediction accuracy using both the symmetric and row-normalized operators increases monotonically as $\beta$ increases. Compared to the literature, 82.16\% ($H_2$GCN), 60.81\% (Geom-GCN), and 69.77\% (UGCN), our experiments showed reasonable performance of up to 69.2\% for the symmetric operator and 70.6\% for the row-normalized operator.

\begin{figure*}%[H]
     \centering
     \begin{subfigure}{0.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Beta/exp-on-beta-symmetric.png}
    \caption{synthetic Cora, symmetric, $\alpha = 0.5$}
    \end{subfigure}
     \begin{subfigure}{0.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Beta/exp-on-beta-row-normalized.png}
    \caption{synthetic Cora, row-normalized, $\alpha = 0.5$}
    \end{subfigure}
    \begin{subfigure}{0.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Beta/exp-on-beta-symmetric-alpha1.png}
    \caption{synthetic Cora, symmetric, $\alpha = 1.0$}
    \end{subfigure}
    \begin{subfigure}{0.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Beta/exp-on-beta-row-normalized-alpha1.png}
    \caption{synthetic Cora, row-normalized, $\alpha = 1.0$}
    \end{subfigure}
    \caption{Synthetic Cora dataset provided by \citet{Zhu2020BeyonHomophily}. The node homophily index ranges from 0.1 to 1.0. The value of $\alpha$ is fixed to isolate the effect of varying $\beta$. Node classification accuracy is shown across the different levels of $\beta$.}
    \label{fig:synthetic-cora-beta}
\end{figure*}

\begin{figure*}%[H]
     \centering
     \begin{subfigure}{0.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Beta/actor-alpha5.png}
    \caption{Actor}
    \end{subfigure}
     \begin{subfigure}{0.49\textwidth}
    \includegraphics[width=\textwidth]{Experiment_Figures_Fin/Beta/cornell-alpha5.png}
    \caption{Cornell}
    \end{subfigure}
    \caption{Experiments on actual datasets with low node homophily. Node classification accuracy is given across the different levels of $\beta$ and fixed $\alpha = 0.5$.}
    \label{fig:actor-cornell-beta}
\end{figure*}


\bibliography{references}

\end{document}
