\section{Introduction to GAT and GCN} \label{appendix:GAT_GCN}
Here, we provide a brief introduction to GAT and GCN.

\paragraph{Graph Convolutional Neural Networks (GCNs)} The fundamental idea behind GCNs is to repeatedly apply the convolution operator on graphs~\citep{kipf2016semi}.
Define $h_i^{0} = z_i$ as the input feature of the $i$-th node and let $h_i^{\ell}$ be the feature of the $\ell$-th layer of the $i$-th node.
We have the following update rule for the features of $h_i^{\ell}$
\begin{align*}
    h_i^{\ell} 
    =  \sigma \left( \sum_{j \in \mathcal{N}_i}  \frac{1}{\sqrt{d_i d_j }} U^{\ell-1} h_j^{\ell-1}\right)
\end{align*}
where $d_i$ represents the degree of vertex $i$, $U^{\ell}$ represents the learnable weights in our model, $\mathcal{N}_i$ represents the neighbors of vertex $i$, and $\sigma(\cdot)$ is the activation function. 
% We assume the activation function $\sigma$ is $1$-Lipschitz, since most of the common activation functions, such as ReLU, eLU, sigmoid, etc. are all $1$-Lipschitz.

\paragraph{Graph Attention Neural Networks (GATs)}
GAT is a more recent architecture that leverages self-attention mechanisms to capture the importance of neighboring nodes when generating the features of the next layer~\citep{velivckovic2017graph}.
One of the advantages of GAT is its ability to capture long-range dependencies within the graph while giving more weight to influential nodes. This makes GAT particularly effective for tasks involving irregular graph structures and tasks where global context is essential.

Different from GCN, GAT uses the update rule for each layer
\[ h_i^{\ell} = \sigma \left(\sum_{j \in \mathcal{N}_i} e_{ij}^{\ell-1} U^{\ell-1}  h_j^{\ell-1}\right), \]
where 
\begin{align}
e_{ij}^{\ell} = \frac{\exp(\hat{e}_{ij}^{\ell})}{\sum_{j' \in \mathcal{N}_i} \exp(\hat{e}_{ij'}^{\ell})}, 
~~
\hat{e}_{ij}^{\ell} = \sigma\left(V^{\ell} [U^{\ell}h_i^{\ell}, U^{\ell}h_j^{\ell}]\right). \label{eqn:gat}
\end{align}
Here $e_{ij}^{\ell}$ is the attention score of node $j$ for node $i$ and $V^{\ell}$ and $U^{\ell}$ are learnable parameters.


\section{Proofs in Section~\ref{sec:gcn}}

We provide additional proof details from Section~\ref{sec:gcn} below. 

\subsection{Proof of Theorem~\ref{thm:rc of SGC}}\label{appendix:SGC}
\begin{lemma}
The $\ell_2$ norm $\Delta_{L, i}$ of the difference between the embedding vectors produced by $(\beta, \theta)$ and $(\beta^\prime, \theta^\prime)$ after they process the tree all the way from the leaf level to the root can be bounded as
\begin{align*}
    \Delta_{L, i} 
    \leq& \left(
    \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
    \right)\|T_{L-1, i}(\beta, \theta)\| \|\beta - \beta^\prime\| + \left(\frac{1}{C_{dl}+1} + \frac{C_{dh}}{C_{dl}}\right)\Delta_{L-1, i}
\end{align*}
\end{lemma}

\begin{proof}
    \begin{align*}
    \Delta_{L, i} 
    &= \|T_{L, i}(\beta, \theta) - T_{L, i}(\beta^\prime, \theta^\prime)\|\\
    &= \Bigg\|\left(\frac{\beta}{d_i + \beta}T_{L-1, i}(\beta, \theta) + \sum_{j=1}^n \frac{w_{ij} T_{L-1, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}}\right) \\
    &\qquad- \left(\frac{\beta^\prime}{d_i + \beta^\prime}T_{L-1, i}(\beta^\prime, \theta^\prime) + \sum_{j=1}^n \frac{w_{ij} T_{L-1, j}(\beta^\prime, \theta^\prime)}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right)\Bigg\|\\
    &\leq \left\|\left(\frac{\beta}{d_i + \beta}T_{L-1, i}(\beta, \theta)-\frac{\beta^\prime}{d_i + \beta^\prime}T_{L-1, i}(\beta^\prime, \theta^\prime)\right)\right\| \\
    &\qquad+ \sum_{j=1}^n \left(\|w_{ij}\| \left\|\left( \frac{T_{L-1, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}} - \frac{ T_{L-1, j}(\beta^\prime, \theta^\prime)}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right)\right\|\right) \tag{by triangle inequality}
\end{align*}
The first part can be bounded as
\begin{align*}
    & \left\|\frac{\beta}{d_i + \beta}T_{L-1, i}(\beta, \theta)-\frac{\beta^\prime}{d_i + \beta^\prime}T_{L-1, i}(\beta^\prime, \theta^\prime)\right\|\\
    &\qquad\leq  \left\| 
    \frac{\beta}{d_i + \beta}T_{L-1, i}(\beta, \theta)
    -\frac{\beta^\prime}{d_i + \beta^\prime}T_{L-1, i}(\beta, \theta)\right\| \\
    &\qquad\qquad+\left\|\frac{\beta^\prime}{d_i + \beta^\prime}T_{L-1, i}(\beta, \theta)
    - \frac{\beta^\prime}{d_i + \beta^\prime}T_{L-1, i}(\beta^\prime, \theta^\prime) 
    \right\| \tag{by triangle inequality}\\
    &\qquad\leq  \left\| \frac{\beta}{d_i + \beta} - \frac{\beta^\prime}{d_i + \beta^\prime} \right\|\left\|T_{L-1, i} (\beta,\theta)\right\|
    +\left\|\frac{\beta^\prime}{d_i + \beta^\prime}\right\|
    \Delta_{L-1, i}\tag{by Cauchy-Schwarz inequality}
\end{align*}
Since $\beta \in [0,1]$ and $d_i \in [C_{dl}, C_{dh}]$, we have
\begin{align*}
    \left\|\frac{\beta^\prime}{d_i + \beta^\prime}\right\|
    = \frac{\beta^\prime}{d_i + \beta^\prime}
    \leq \frac{1}{C_{dl}+1},
\end{align*}
and
\begin{align*}
    \left\| \frac{\beta}{d_i + \beta} - \frac{\beta^\prime}{d_i + \beta^\prime} \right\|
    = \left\|\frac{d_i(\beta - \beta^\prime)}{(d_i+\beta)(d_i + \beta^\prime)}\right\|
    \leq \|\beta-\beta^\prime\| \frac{1}{C_{dl}}.
\end{align*}

For the second term, let's consider each element in the summation. Using a similar method as above, we get
\begin{align*}
    &\left\|\frac{T_{L-1, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}} - \frac{ T_{L-1, j}(\beta^\prime, \theta^\prime)}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right\|\\
    &\qquad\leq\left\|\frac{T_{L-1, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}} 
    - \frac{ T_{L-1, j}(\beta, \theta)}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right\|\\
    &\qquad\qquad+ \left\|\frac{T_{L-1, j}(\beta, \theta)}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}} 
    - \frac{ T_{L-1, j}(\beta^\prime, \theta^\prime)}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right\| \tag{by triangle inequality}\\
    &\qquad\leq \left\|\frac{1}{\sqrt{(d_i+\beta)(d_j + \beta)}} 
    - \frac{1}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right\|\|T_{L-1, j}(\beta, \theta)\| \\
    &\qquad\qquad+ \left\|\frac{1}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right\|\Delta_{L-1, i} \tag{Cauchy-Schwarz inequality}
\end{align*}
Using the bounds on $\beta$ and $d_i$, we have 
\begin{align*}
    \left\|\frac{1}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right\| \leq \frac{1}{C_{dl}},
\end{align*}
and
\begin{align*}
     &\left\|\frac{1}{\sqrt{(d_i+\beta)(d_j + \beta)}} 
    - \frac{1}{\sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}}\right\|\\
    &\qquad=  \left\|  \frac{(d_i+\beta)(d_j + \beta) - (d_i+\beta^\prime)(d_j + \beta^\prime)}{\sqrt{(d_i+\beta)(d_j + \beta)(d_i+\beta^\prime)(d_j + \beta^\prime)}[\sqrt{(d_i+\beta)(d_j + \beta)} + \sqrt{(d_i+\beta^\prime)(d_j + \beta^\prime)}]} \right\|
    % \left\|\frac{1}{C_{dl}+ \beta} - \frac{1}{C_{dl} + \beta^\prime}\right\| 
    \\
    &\qquad\leq  \left\|  \frac{(d_i+d_j + \beta + \beta^\prime)(\beta-\beta^\prime)}{(C_{dl}+\beta)(C_{dl}+\beta^\prime)[(C_{dl}+\beta)+ (C_{dl}+\beta^\prime)]} \right\|
    \\
    &\qquad\leq   \frac{C_{dh}+1}{C_{dl}^3} \|\beta-\beta^\prime\|
    % \\
    % \leq & \left\|\frac{1}{C_{dl}+ \|\beta-\beta^\prime\|} - \frac{1}{C_{dl}}\right\| \\
    % \leq & \frac{\|\beta - \beta^\prime\|}{C_{dl}(C_{dl} + \|\beta - \beta^\prime\|)} \leq \frac{1}{C_{dl}^2}\|\beta - \beta^\prime\|
\end{align*}
Combining these results together, we get
\begin{align*}
    \Delta_{L, i} 
    &\leq \frac{1}{C_{dl}}\|\beta - \beta^\prime\|\|T_{L-1, i}(\beta, \theta)\| + \frac{1}{C_{dl}+1}\Delta_{L-1, i} \\
    &\qquad+ \sum_{j=1}^n \left(\|w_{ij}\|\left(
    \frac{(C_{dh}+1)\|T_{L-1, i}(\beta, \theta)\|}{C_{dl}^3}
    \|\beta - \beta^\prime\| 
    + \frac{1}{C_{dl}}\Delta_{L-1,i}\right)\right)\\
    &= \frac{1}{C_{dl}}\|\beta - \beta^\prime\|\|T_{L-1, i}(\beta, \theta)\| + \frac{1}{C_{dl}+1}\Delta_{L-1, i} \\
    &\qquad+ d_i\left(
    \frac{(C_{dh}+1)\|T_{L-1, i}(\beta, \theta)\|}{C_{dl}^3}
    \|\beta - \beta^\prime\| + \frac{1}{C_{dl}}\Delta_{L-1,i}\right)\\
    &\leq \left(\frac{1}{C_{dl}}+ 
    \frac{(C_{dh}+1)C_{dh}}{C_{dl}^3}
    \right)\|T_{L-1, i}(\beta, \theta)\| \|\beta - \beta^\prime\| + \left(\frac{1}{C_{dl}+1} + \frac{C_{dh}}{C_{dl}}\right)\Delta_{L-1, i}\\
    &\leq  \left(
    \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
    \right)\|T_{L-1, i}(\beta, \theta)\| \|\beta - \beta^\prime\| + \left(\frac{1 + C_{dh}}{C_{dl}}\right)\Delta_{L-1, i}
\end{align*}
\end{proof}

\begin{lemma}
    The term $\|T_{L-1, i}(\beta, \theta)\| $ satisfies
    \begin{align*}
        \|T_{L-1, i}(\beta, \theta)\| \leq \left(\frac{C_{dh}}{C_{dl}}\right)^{L-1} C_z C_\theta
    \end{align*}
\end{lemma}
\begin{proof}
\begin{align*}
    \|T_{L-1, i}(\beta, \theta)\|
    &= \left\|\frac{\beta}{d_i + \beta}T_{L-2, i}(\beta, \theta) + \sum_{j=1}^n \frac{w_{ij} T_{L-2, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}}\right\|\\
    &\leq \left\|\frac{\beta}{d_i + \beta}T_{L-2, i}(\beta, \theta)\right\| + \sum_{j=1}^n \left\|\frac{w_{ij} T_{L-2, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}}\right\| 
    \tag{by triangle inequality}\\
    &\leq \frac{\beta}{d_i + \beta}\|T_{L-2, i}(\beta, \theta)\| + \sum_{j=1}^n \|w_{ij}\|\left\|\frac{ T_{L-2, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}}\right\|
    \tag{by Cauchy-Schwarz}\\
    &\leq \frac{\beta}{d_i + \beta}\|T_{L-2, i}(\beta, \theta)\| + C_{dh} \max_{j}\left\|\frac{T_{L-2, j}(\beta, \theta)}{\sqrt{(d_i+\beta)(d_j + \beta)}}\right\| \\
    &\leq \frac{\beta}{C_{dl} + \beta}\|T_{L-2, i}(\beta, \theta)\| + \frac{C_{dh}}{C_{dl}+\beta} \max_j \|T_{L-2, j}(\beta, \theta)\|\\
    &\leq \left(\frac{C_{dh} + \beta}{C_{dl} + \beta}\right)^{L-1} \|z_i \theta_i\| 
    \tag{by recursively bounding $\|T_{l, i}(\beta, \theta)\|$}\\
    % &\leq \left(\frac{C_{dh} + \beta}{C_{dl} + \beta}\right)^{L-1} C_z C_\theta\\
    &\leq \left(\frac{C_{dh}}{C_{dl}}\right)^{L-1} C_z C_\theta
\end{align*}
\end{proof}

\begin{lemma}
    The change in margin loss for each node, due to change in parameters, after $L$ layers is 
    \begin{align*}
        \Lambda_i \leq \frac{2}{\gamma} \left(\left(
        \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
        \right)\left(\frac{C_{dh}}{C_{dl}}\right)^{L-1} C_z C_\theta \|\beta - \beta^\prime\| \cdot \frac{k_1 - k_1^L}{1-k_1} + k_1^L C_z \|\theta_i - \theta_i^\prime\|\right), 
    \end{align*}
    where $k_1 = (1+C_{dh})/C_{dl}$.
\end{lemma}
\begin{proof}
    From previous lemmas, we know how to recursively bound $\Delta_{L,i}$ using $\Delta_{L-1, i}$, but it remains for us to bound the base case $\Delta_{0, i}$. We have
    \begin{align*}
        \Delta_{0, i} 
        = \|T_{0, i}(\beta, \theta) - T_{0, i}(\beta^\prime, \theta^\prime)\|
        = \|z_i\theta_i - z_i\theta_i^\prime\|
        \leq \|z_i\| \|\theta_i - \theta_i^\prime\| \leq C_z \|\theta_i - \theta_i^\prime\|,
    \end{align*}
    where the inequality is by Cauchy-Schwarz.
    For the simplicity of notation, let $\bar T_L$ be the bound we derived for $\|T_{L-1, i}(\beta, \theta)\|$ from the previous lemma. We have
    \begin{align*}
        \Delta_{L, i} 
        &\leq \left(
        \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
        \right)\|T_{L-1, i}(\beta, \theta)\| \|\beta - \beta^\prime\| + \left(\frac{1 + C_{dh}}{C_{dl}}\right)\Delta_{L-1, i}\\
        &\leq \left(
        \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
        \right)\bar T_{L} \|\beta - \beta^\prime\| + \left(\frac{1 + C_{dh}}{C_{dl}}\right)\Delta_{L-1, i}\\
        &\leq \left(
        \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
        \right)\bar T_{L} \|\beta - \beta^\prime\| \cdot \sum_{l=0}^{L-1} \left(\frac{1 + C_{dh}}{C_{dl}}\right)^l + \left(\frac{1 + C_{dh}}{C_{dl}}\right)^L \cdot \Delta_{0, i}
        \tag{by recursively bounding the terms}\\
        &= \left(
        \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
        \right)\bar T_{L} \|\beta - \beta^\prime\| \cdot \frac{k_1 - k_1^L}{1-k_1} + k_1^L C_z \|\theta_i - \theta_i^\prime\| 
    \end{align*}
    where 
    \begin{align*}
        k_1 &= \frac{1 + C_{dh}}{C_{dl}}.
    \end{align*}

    The change in margin loss for each node after $L$ layers is then 
    \begin{align*}
        \Lambda_i 
        &= \left|g_\gamma(-\tau (f_{\beta, \theta}(x_i), y_i)) - g_\gamma(-\tau (f_{\beta^\prime, \theta^\prime}(x_i), y_i))\right|\\
        &\leq \frac{1}{\gamma}\left|\tau (f_{\beta, \theta}(x_i), y_i) - \tau (f_{\beta^\prime, \theta^\prime}(x_i), y_i)\right|
        \tag{since $g_\gamma$ is $1/\gamma$-Lipschitz}\\
        &= \frac{1}{\gamma}\left|(2f_{\beta, \theta}(x_i) - 1) y_i-(2f_{\beta^\prime, \theta^\prime}(x_i) - 1) y_i\right|\\
        &\leq \frac{2}{\gamma}\left|y_i\right|\left|f_{\beta, \theta}(x_i) - f_{\beta^\prime, \theta^\prime}(x_i)\right|
        \tag{by Cauchy-Schwarz inequality}\\
        &\leq \frac{2}{\gamma}\left|\sigma(T_{L, i}(\beta, \theta)) - \sigma(T_{L, i}(\beta^\prime, \theta^\prime))\right|
        \tag{since $y_i \in \{-1,1\}$}\\
        &\leq \frac{2}{\gamma}\left|T_{L, i}(\beta, \theta) - T_{L, i}(\beta^\prime, \theta^\prime)\right| 
        \tag{since sigmoid is $1$-Lipschitz}\\
        &= \frac{2}{\gamma} \Delta_{L, i}\\
        &\leq \frac{2}{\gamma} \left(\left(
        \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
        \right)\left(\frac{C_{dh}}{C_{dl}}\right)^{L-1} C_z C_\theta \left(\frac{k_1 - k_1^L}{1-k_1} \right) \|\beta - \beta^\prime\| + k_1^L C_z \|\theta_i - \theta_i^\prime\|\right) 
    \end{align*}
\end{proof}

\begin{lemma}
    The change in margin loss $\Lambda_i$ for each node can be bounded by $\epsilon$, using a covering of size $P$, where $P$ depends on $\epsilon$.
\end{lemma} 
\begin{proof}
    Let $k_2 = \frac{2}{\gamma}\left(
        \frac{C_{dl}^2 + C_{dh}^2 + C_{dh}}{C_{dl}^3}
    \right)\left(\frac{C_{dh}}{C_{dl}}\right)^{L-1} C_z C_\theta \left(\frac{k_1 - k_1^L}{1-k_1} \right)$ and $k_3 = \frac{2}{\gamma} k_1^L C_z$ for simplicity of notation. 
    
    We begin by noting that we can find a covering $\mathcal{C}\left(\beta, \frac{\epsilon}{4k_2}, |\cdot|\right)$ of size 
    \begin{align*}
        \mathcal{N}\left(\beta, \frac{\epsilon}{4k_2}, |\cdot|\right) \leq \frac{8k_2}{\epsilon}+1.
    \end{align*}
   Also, we can find a covering $\mathcal{C}\left(\theta, \frac{\epsilon}{4k_3}, \|\cdot\|\right)$ of size 
    \begin{align*}
        \mathcal{N}\left(\theta, \frac{\epsilon}{4k_3}, \|\cdot\|\right) \leq \left(\frac{8k_3}{\epsilon}+1\right)^{d}.
    \end{align*}
    Thus, for any specified $\epsilon$, we can ensure that $\Lambda_i$ is at most $\epsilon$ with a covering number 
    \begin{align*}
        P \leq \mathcal{N}\left(\beta, \frac{\epsilon}{4k_2}, |\cdot|\right)\mathcal{N}\left(\theta, \frac{\epsilon}{4k_3}, \|\cdot\|\right) 
        \leq \left(\frac{8\max\{k_2, k_3\}}{\epsilon} + 1\right)^{d+1}.
    \end{align*}
    When $\epsilon < 8\max\{k_2, k_3\}$, we have 
    \begin{align*}
        \log P \leq (d+1) \log \left(\frac{16\max\{k_2, k_3\}}{\epsilon}\right).
    \end{align*}
\end{proof}

We can now finish our proof for Theorem~\ref{thm:rc of SGC}.
\begin{proof}
    Using Lemma A.5 from \cite{bartlett2017spectrally}, we obtain that 
    \begin{align*}
        \hat R_{\mathcal{T}}(\mathcal{H}_{(\beta, \theta)}^\gamma)
        \leq \inf_{\alpha > 0}\left(\frac{4\alpha}{\sqrt{m}} + \frac{12}{m} \int_{\alpha}^{\sqrt{m}} \sqrt{\log \mathcal{N}(\mathcal{H}_{(\beta, \theta)}^\gamma, \epsilon, \|\cdot \|)} d\epsilon\right).
    \end{align*}
    Using the previous lemmas, we have
    \begin{align*}
        \int_{\alpha}^{\sqrt{m}} \sqrt{\log \mathcal{N}(\mathcal{H}_{(\beta, \theta)}^\gamma, \epsilon, \|\cdot \|)} d\epsilon
        &= \int_{\alpha}^{\sqrt{m}} \sqrt{\log P} d\epsilon\\
        &\leq \int_{\alpha}^{\sqrt{m}} \sqrt{(d+1)\log \left(\frac{16\max\{k_2, k_3\}}{\epsilon}\right)} d\epsilon\\
        &\leq \sqrt{m} \sqrt{(d+1) \log \left(\frac{16\max\{k_2, k_3\}}{\alpha}\right)}
    \end{align*}
    Plugging in $\alpha = \sqrt{\frac{1}{m}}$, we have
    \begin{align*}
        \hat R_{\mathcal{T}}(\mathcal{H}_{(\beta, \theta)}^\gamma)
        \leq \frac{4}{m} + \frac{12\sqrt{(d+1)\log(16\sqrt{m}\max\{k_2, k_3\})}}{\sqrt{m}}.
    \end{align*}
\end{proof}


\subsection{Proof of Theorem~\ref{thm:rc of GCAN}}\label{appendix:GCAN}
\begin{lemma}\label{lem:inequalities}
    For any $z, z^\prime \in \mathbb{R}^{d \times r}$ and $b, b^\prime \in \mathbb{R}^{r \times t}$ such that $\|z\|_F \leq C_z, \|z^\prime \|_F\leq C_z, \|b\|_F\leq C_b, \|b^\prime\|_F \leq C_b$, we have 
    \begin{align*}
        \|zb - z^\prime b^\prime\|_F 
        \leq C_z\|b-b^\prime\|_F + C_b\|z-z^\prime \|_F.
    \end{align*}
    The result also holds when $z, b, z^\prime, b^\prime$ are vectors or real numbers. The corresponding norms are $\|\cdot \|$ and $|\cdot |$.

    Also, by recursively using the inequality above, we may have that for any $z_1, \ldots, z_n$ and $z_1^\prime, \ldots, z_n^\prime$ such that $\|z_i \|\leq C_i, \|z_i^\prime \|\leq C_i$,
    \begin{align*}
        \|z_1 z_2 \ldots z_n - z_1^\prime z_2^\prime \ldots z_n^\prime\| \leq \sum_{i=1}^n \left(\|z_i - z_i^\prime\| \prod_{j\in [n], j \neq i} C_j\right).
    \end{align*}
    Here, for simplicity of notation, we used $\|\cdot \|$ to denote the type of norm that corresponds to the dimension of the $z_i$'s.
\end{lemma}
\begin{proof}
    \begin{align*}
        \|zb - z^\prime b^\prime\|_F 
        &= \|zb - z^\prime b^\prime + zb^\prime - zb^\prime\|_F\\
        &\leq \|zb - zb^\prime\|_F + \|zb^\prime - z^\prime b^\prime\|_F \tag{by triangle inequality}\\
        &\leq \|z\|_F\|b-b^\prime\|_F + \|b^\prime\|_F\|z-z^\prime \|_F \tag{by Cauchy-Schwarz inequality}\\
        &\leq C_z\|b-b^\prime\|_F + C_b\|z-z^\prime \|_F
    \end{align*}
\end{proof}

\begin{lemma}
The $\ell_2$ norm $\Delta_{i,L}$ of the difference between the embedding vectors at level $L$, $h_i^L$, produced by $(\eta, U, V)$ and $(\eta^\prime, U^\prime, V^\prime)$ after they process the tree all the way from the leaf level to the root can be bounded as
\begin{align*}
    \Delta_{i,L} 
    \leq & 
    C_U \left(\max_{j \in \mathcal{N}_i} \left\|h_j^{L-1}\right\|\right) |\eta - \eta^\prime|
    +  rC_U \left(\max_{j \in \mathcal{N}_i} \left\|h_j^{L-1}\right\|\right)
    + \left(\max_{j \in \mathcal{N}_i}\|h_j^{L-1}\|\right) \left\|U - U^{\prime}\right\| \\
    &+ C_U \left(\max_{j \in \mathcal{N}_i}\left\|h_j^{L-1} - h_j^{\prime (L-1)}\right\|\right)
    + \frac{2rC_U}{C_{dl}}\left\|h_i^{L-1} - h_i^{\prime (L-1)}\right\|\\
    &+ \frac{2r}{C_{dl}} \left\|h_i^{L-1}\right\| \left\|U - U^{\prime}\right\| + \frac{2rC_U}{C_{dl}}\left|\eta - \eta^\prime\right|
\end{align*}
\end{lemma}
\begin{proof}
    \begin{align*}
        \Delta_{i, L}=
        &\left\|h_i^{L}(\eta, U, V) - h_i^{L}(\eta^\prime, U^\prime, V^\prime)\right\| \\
        =& \Big\|\sigma \left(\sum_{j \in \mathcal{N}_i}\left(\eta \cdot e_{ij}^{L-1} + (1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }}\right)U  h_j^{L-1}\right)\\
        &- \sigma \left(\sum_{j \in \mathcal{N}_i}\left(\eta^\prime \cdot e_{ij}^{\prime(L-1)}  + (1 - \eta^\prime) \cdot \frac{1}{\sqrt{d_i d_j }} \right) U^{\prime}  h_j^{\prime (L-1)} \right)\Big\|
        \\
        \leq & \Big\| \sum_{j \in \mathcal{N}_i} \left((\eta \cdot e_{ij}^{L-1} U  h_j^{L-1}) - (\eta^\prime \cdot e_{ij}^{\prime(L-1)} U^{\prime}  h_j^{\prime(L-1)})\right)
        \\
        &+ \sum_{j \in \mathcal{N}_i} \left((1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }} U h_j^{L-1} - (1 - \eta^\prime) \cdot \frac{1}{\sqrt{d_i d_j }} U^{\prime}  h_j^{\prime (L-1)}\right)\Big\|
        \tag{since $\sigma$ is $1$-Lipschitz}
        \\
        \leq & \sum_{j \in \mathcal{N}_i} \left\|(\eta \cdot e_{ij}^{L-1} U  h_j^{L-1}) - (\eta^\prime \cdot e_{ij}^{\prime(L-1)} U^{\prime}  h_j^{\prime(L-1)})\right\|\\
        &+ \sum_{j \in \mathcal{N}_i} \left\|(1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }} U  h_i^{L-1} - (1 - \eta^\prime) \cdot \frac{1}{\sqrt{d_i d_j }} U^{\prime }  h_i^{\prime (L-1)}\right\|
        \tag{by triangle inequality}
    \end{align*}
    Using Lemma~\ref{lem:inequalities}, we can bound each term in the first summation as
    \begin{align*}
         &\left\|(\eta \cdot e_{ij}^{L-1} U  h_j^{L-1}) - (\eta^\prime \cdot e_{ij}^{\prime(L-1)} U^{\prime}  h_j^{\prime(L-1)})\right\|\\
         &\qquad\leq  C_U \bar e_{ij}^{L-1} \bar h_j^{L-1} \cdot |\eta - \eta^\prime|
         + C_U \bar h_j^{L-1}\cdot \left|e_{ij}^{L-1} - e_{ij}^{\prime (L-1)}\right|\\
         &\qquad\qquad+ \bar e_{ij}^{L-1} \bar h_j^{L-1} \left\|U - U^{\prime}\right\|
         + C_U \bar e_{ij}^{L-1} \left\|h_j^{L-1} - h_j^{\prime (L-1)}\right\|
    \end{align*}
    Here, $\bar h_j^{L-1}$ is an upper bound on $\|h_j^{L-1}\|$ and $\|h_j^{\prime (L-1)}\|$, and $\bar e_{ij}^{L-1}$ is an upper bound on $|e_{ij}^{L-1}|$ and $|e_{ij}^{\prime(L-1)}|$.
    
    Bounding each term in the second summation, we have 
    \begin{align*}
        &\left\|(1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }} U  h_i^{L-1} - (1 - \eta^\prime) \cdot \frac{1}{\sqrt{d_i d_j }} U^{\prime}  h_i^{\prime (L-1)}\right\|\\
        &\qquad\leq  \left\|\frac{1}{\sqrt{d_i d_j }} U  h_i^{L-1} - \frac{1}{\sqrt{d_i d_j }} U^{\prime}  h_i^{\prime (L-1)}\right\| 
        + \left\|\eta \cdot \frac{1}{\sqrt{d_i d_j }} U  h_i^{L-1} -\eta^\prime \cdot \frac{1}{\sqrt{d_i d_j }} U^{\prime }  h_i^{\prime (L-1)}\right\| \tag{by triangle inequality}\\
        &\qquad\leq  \frac{1}{C_{dl}} \|U  h_i^{L-1} - U^{\prime}  h_i^{\prime (L-1)}\| 
        + \frac{1}{C_{dl}} \|\eta \cdot U  h_i^{L-1} -\eta^\prime \cdot U^{\prime }  h_i^{\prime (L-1)}\|\\
        &\qquad\leq \frac{1}{C_{dl}}\left(C_U \|h_i^{L-1} - h_i^{\prime (L-1)}\| + \bar h_i^{L-1} \|U - U^{\prime }\|\right)\\
        &\qquad\qquad+ \frac{1}{C_{dl}}\left(C_U \|h_i^{L-1} - h_i^{\prime (L-1)}\| + \bar h_i^{L-1} \|U - U^{\prime}\|
        + C_U \bar h_i^{L-1} |\eta - \eta^\prime|\right)\tag{using Lemma~\ref{lem:inequalities}}\\
        &\qquad=  \frac{1}{C_{dl}}\left(2C_U \|h_i^{L-1} - h_i^{\prime (L-1)}\| + 2\bar h_i^{L-1} \|U - U^{\prime}\|
        + C_U \bar h_i^{L-1} |\eta - \eta^\prime|\right).
    \end{align*}
    
    Combining the above results, we have
    \begin{align*}
        \Delta_{i, L} 
        \leq & \sum_{j \in \mathcal{N}_i} \Big( C_U \bar e_{ij}^{L-1} \bar h_j^{L-1} \cdot |\eta - \eta^\prime|
        + C_U \bar h_j^{L-1}\cdot |e_{ij}^{L-1} - e_{ij}^{\prime (L-1)}|\\
        &+ \bar e_{ij}^{L-1} \bar h_j^{L-1} \|U - U^{\prime}\|
        + C_U \bar e_{ij}^{L-1} \|h_j^{L-1} - h_j^{\prime (L-1)}\|\\
        &+ \frac{1}{C_{dl}}\big(2C_U \|h_i^{L-1} - h_i^{\prime (L-1)}\| + 2\bar h_i^{L-1} \|U - U^{\prime}\|
        + C_U \bar h_i^{L-1} |\eta - \eta^\prime|\big) \Big)
        \\
        \leq& C_U (\max_{j \in \mathcal{N}_i}\bar h_j^{L-1}) |\eta - \eta^\prime|
        +  rC_U (\max_{j \in \mathcal{N}_i}\bar h_j^{L-1})
        + (\max_{j \in \mathcal{N}_i}\bar h_j^{L-1}) \|U - U^{\prime}\| \\
        &+ C_U (\max_{j \in \mathcal{N}_i}\|h_j^{L-1} - h_j^{\prime (L-1)}\|)
        + \frac{2rC_U}{C_{dl}}\|h_i^{L-1} - h_i^{\prime (L-1)}\|\\
        &+ \frac{2r}{C_{dl}} \bar h_i^{L-1} \|U - U^{\prime}\| + \frac{2rC_U}{C_{dl}}|\eta - \eta^\prime|
        \tag{since $e_{ij}^\ell \leq 1$, $\sum_{j \in \mathcal{N}_i} e_{ij}^\ell = 1$, and the branching factor is $r$}
    \end{align*}
    It remains for us to derive $\bar h_j^{L-1}$ for all $j$.
\end{proof}

\begin{lemma}
    We can upper bound the norm of the node feature embedding at level $\ell$ by
    \begin{align*}
         \|h_i^\ell\|
        \leq r^\ell C_U^{\ell+1} C_z \max\left(1, \frac{1}{C_{dl}}\right)^\ell.
    \end{align*}
\end{lemma}
\begin{proof}
    \begin{align*}
        \|h_i^{\ell+1}\|
        &= \left\|\sigma \left(\sum_{j \in \mathcal{N}_i} (\eta \cdot e_{ij}^{\ell} + (1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }} )  U  h_j^{\ell} \right)\right\|\\
        &\leq \left\|\sum_{j \in \mathcal{N}_i} (\eta \cdot e_{ij}^{\ell} + (1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }} )  U  h_j^{\ell} \right\|
        \tag{since $\|\sigma(x)\| \leq \|x\|$}\\
        &\leq \sum_{j \in \mathcal{N}_i} \left|\eta \cdot e_{ij}^{\ell} + (1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }} \right| \| U\|  \|h_j^{\ell} \|
        \tag{by triangle inequality and Cauchy-Schwarz inequality}\\
        &\leq C_U  \sum_{j \in \mathcal{N}_i} \left|\eta \cdot e_{ij}^{\ell} + (1 - \eta) \cdot \frac{1}{\sqrt{d_i d_j }} \right| \|h_j^{\ell} \|\\
        &\leq r C_U \max\left(1, \frac{1}{C_{dl}}\right)\left(\max_{j \in \mathcal{N}_i}\|h_j^{\ell}\|\right)
    \end{align*}
    Recursively bounding the terms, we have 
    \begin{align*}
        \|h_i^\ell\| \leq r^\ell C_U^\ell \max\left(1, \frac{1}{C_{dl}}\right)^\ell \max_{j \in [n]}\|h_j^0\|
        \leq r^\ell C_U^{\ell+1} C_z \max\left(1, \frac{1}{C_{dl}}\right)^\ell.
    \end{align*}
\end{proof}

\begin{lemma}
    The change in margin loss due to the change in parameter values after $L$ layers satisfies
    \begin{align*}
        \Lambda_{i} \leq  \frac{2}{\gamma} \left(k_1 + k_2|\eta-\eta^\prime| + k_3 \|U - U^{\prime}\|\right) \frac{k_4^L - k_4}{k_4 - 1} + k_4 C_z \|U - U^\prime\|,
    \end{align*}
    where 
    \begin{align*}
        k_1 &= r^L C_U^{L+1} C_z \max\left(1, \frac{1}{C_{dl}}\right)^{L-1}\\
        k_2 &= r^{L-1} C_U^{L+1} C_z \max\left(1, \frac{1}{C_{dl}}\right)^{L-1} + \frac{2rC_U}{C_{dl}}\\
        k_3 &= \left(1 + \frac{2r}{C_{dl}}\right)r^{L-1} C_U^{L} C_z \max(1, \frac{1}{C_{dl}})^{L-1}\\
        k_4 &= C_U + \frac{2rC_U}{C_{dl}}.
    \end{align*}
\end{lemma}
\begin{proof}
    Using the previous two lemmas, we know
    \begin{align*}
        &\|h_i^{L}(\eta, U, V) - h_i^{L}(\eta^\prime, U^\prime, V^\prime)\|\\
        &\qquad\leq C_U (\max_{j \in \mathcal{N}_i}\bar h_j^{L-1}) |\eta - \eta^\prime|
        +  rC_U (\max_{j \in \mathcal{N}_i}\bar h_j^{L-1})
        + (\max_{j \in \mathcal{N}_i}\bar h_j^{L-1}) \|U - U^{\prime}\| \\
        &\qquad\qquad+ C_U (\max_{j \in \mathcal{N}_i}\|h_j^{L-1} - h_j^{\prime (L-1)}\|)
        + \frac{2rC_U}{C_{dl}}\|h_i^{L-1} - h_i^{\prime (L-1)}\|
        + \frac{2r}{C_{dl}} \bar h_i^{L-1} \|U - U^{\prime}\| + \frac{2rC_U}{C_{dl}}|\eta - \eta^\prime|\\
        &\qquad\leq  k_1 + k_2|\eta-\eta^\prime| + k_3 \|U - U^{\prime}\| + k_4 (\max_{j \in [n]}\|h_j^{L-1} - h_j^{\prime (L-1)}\|)
        \\
        &\qquad= \left(k_1 + k_2|\eta-\eta^\prime| + k_3 \|U - U^{\prime}\|\right) \frac{k_4^L - k_4}{k_4 - 1} + k_4 (\max_{j \in [n]}\|h^0_j - h_j^{\prime 0}\|)\\
        &\qquad\leq  \left(k_1 + k_2|\eta-\eta^\prime| + k_3 \|U - U^{\prime}\|\right) \frac{k_4^L - k_4}{k_4 - 1} + k_4 C_z \|U - U^\prime\|
    \end{align*}
    where 
    \begin{align*}
        k_1 &= r^L C_U^{L+1} C_z \max(1, \frac{1}{C_{dl}})^{L-1}\\
        k_2 &= r^{L-1} C_U^{L+1} C_z \max(1, \frac{1}{C_{dl}})^{L-1} + \frac{2rC_U}{C_{dl}}\\
        k_3 &= \left(1 + \frac{2r}{C_{dl}}\right)r^{L-1} C_U^{L} C_z \max(1, \frac{1}{C_{dl}})^{L-1}\\
        k_4 &= C_U + \frac{2rC_U}{C_{dl}}.
    \end{align*}
    The change in margin loss for each node after $L$ layers is then
    \begin{align*}
        \Lambda_i 
        &= \left|g_\gamma(-\tau (f_{\eta, U,V}(x_i), y_i)) - g_\gamma(-\tau (f_{\eta^\prime, U^\prime,V^\prime}(x_i), y_i))\right|\\
        &\leq \frac{1}{\gamma}\left|\tau (f_{\eta, U,V}(x_i), y_i) - \tau (f_{\eta^\prime, U^\prime,V^\prime}(x_i), y_i)\right|
        \tag{since $g_\gamma$ is $1/\gamma$-Lipschitz}\\
        &= \frac{1}{\gamma}\left|(2f_{\eta, U,V}(x_i) - 1) y_i-(2f_{\eta^\prime, U^\prime,V^\prime}(x_i) - 1) y_i\right|\\
        &\leq \frac{2}{\gamma}\left|y_i\right|\left|f_{\eta, U,V}(x_i) - f_{\eta^\prime, U^\prime,V^\prime}(x_i)\right|
        \tag{by Cauchy-Schwarz inequality}\\
        &\leq \frac{2}{\gamma}\left|\sigma(h_i^{L}(\eta, U, V)[0]) - \sigma(h_i^{L}(\eta^\prime, U^\prime, V^\prime)[0])\right|
        \tag{since $y_i \in \{-1,1\}$}\\
        &\leq \frac{2}{\gamma}\left|h_i^{L}(\eta, U, V)[0] - h_i^{L}(\eta^\prime, U^\prime, V^\prime)[0]\right| 
        \tag{since $\sigma$ is $1$-Lipschitz}\\
        % &\leq \frac{2}{\gamma} \Delta_{L, i}\\
        &\leq \frac{2}{\gamma} \left(k_1 + k_2|\eta-\eta^\prime| + k_3 \|U - U^{\prime}\|\right) \frac{k_4^L - k_4}{k_4 - 1} + k_4 C_z \|U - U^\prime\|.
    \end{align*}
\end{proof}

\begin{lemma}
    The change in margin loss $\Lambda_i$ for each node can be bounded by $\epsilon$, using a covering of size $P$, where $P$ depends on $\epsilon$, with 
    \begin{align*}
        \log P \leq (d^2+1)\log \left(\frac{8\max\{A, BC_U\sqrt{d}\}}{\epsilon}\right).
    \end{align*}  
\end{lemma}
\begin{proof}
    We let $A = \frac{2k_2(k_4^L - k_4)}{\gamma (k_4 - 1)}$ and $B = \frac{2k_3(k_4^L-k_4) + \gamma(k_4^2-k_4)C_z}{\gamma(k_4-1)}$ for simplicity of notation. Note that we have $\Lambda_i \leq A|\eta - \eta^\prime | + B \|U - U^\prime \|$.
    
    We begin by noting that we can find a covering $\mathcal{C}(\eta, \frac{\epsilon}{2A}, |\cdot |)$ of size
    \begin{align*}
        \mathcal{N}(\eta, \frac{\epsilon}{2A}, |\cdot |) \leq 1 + \frac{4A}{\epsilon}.
    \end{align*}
    We can also find a covering $\mathcal{C}(U, \frac{\epsilon}{2B}, \|\cdot \|_F)$ with size 
    \begin{align*}
        \mathcal{N}(U, \frac{\epsilon}{2B}, \|\cdot \|_F) \leq \left(1 + \frac{4BC_U\sqrt{d}}{\epsilon} \right)^{d^2}.
    \end{align*}
    For any specified $\epsilon$, we can ensure that $\Lambda_i$ is at most $\epsilon$ with a covering number of 
    \begin{align*}
        P 
        \leq&
        \mathcal{N}(\eta, \frac{\epsilon}{2A}, |\cdot |) \cdot \mathcal{N}(U, \frac{\epsilon}{2B}, \|\cdot \|_F)\\
        \leq&
        \left( 1 + \frac{4A}{\epsilon}\right)\left(1 + \frac{4BC_U\sqrt{d}}{\epsilon} \right)^{d^2} 
        \leq
        \left(1 + \frac{4\max\{A, BC_U\sqrt{d}\}}{\epsilon}\right)^{d^2+1}
    \end{align*}
    Moreover, when $\epsilon \leq 4\max\{A, BC_U\sqrt{d}\}$, we have 
    \begin{align*}
        \log P \leq (d^2+1)\log \left(\frac{8\max\{A, BC_U\sqrt{d}\}}{\epsilon}\right).
    \end{align*}    
\end{proof}

Now we can finish our proof for Theorem~\ref{thm:rc of GCAN}.

\begin{proof}
    Using Lemma A.5 from \cite{bartlett2017spectrally}, we obtain that 
    \begin{align*}
        \hat R_{\mathcal{T}}(\mathcal{H}^\gamma_{(\eta, U, V)})
        \leq \inf_{\alpha > 0}\left(\frac{4\alpha}{\sqrt{m}} + \frac{12}{m} \int_{\alpha}^{\sqrt{m}} \sqrt{\log \mathcal{N}(\mathcal{H}^\gamma_{(\eta, U, V)}, \epsilon, \|\cdot \|)} d\epsilon\right).
    \end{align*}
    Using the previous lemmas, we have
    \begin{align*}
        \int_{\alpha}^{\sqrt{m}} \sqrt{\log \mathcal{N}(\mathcal{H}^\gamma_{(\eta, U, V)}, \epsilon, \|\cdot \|)} d\epsilon
        &= \int_{\alpha}^{\sqrt{m}} \sqrt{\log P} d\epsilon \\
        &\leq \int_{\alpha}^{\sqrt{m}} \sqrt{(d^2+1)\log \left(\frac{8\max\{A, BC_U\sqrt{d}\}}{\epsilon}\right)} d\epsilon\\
        &\leq \sqrt{m}\sqrt{(d^2+1)\log \left(\frac{8\max\{A, BC_U\sqrt{d}\}}{\alpha}\right)}
    \end{align*}
    Plugging in $\alpha = \sqrt{\frac{1}{m}}$, we have
    \begin{align*}
        \hat R_{\mathcal{T}}(\mathcal{H}^\gamma_{(\eta, U, V)})
        \leq \frac{4}{m} + \frac{12\sqrt{(d^2+1)\log \left(8\sqrt{m}\max\{A, BC_U\sqrt{d}\}\right)}}{\sqrt{m}}.
    \end{align*}
\end{proof}
