\section{Method}
\subsection{Problem Formulation}
\begin{figure*}[ht]
\centering
\includegraphics[width=0.8\textwidth]{sec/figure/ResGAT_pipeline_figure.pdf}
\caption{Overview of the ResGAT pipeline for WSI classification. The framework consists of three main components: (1) tissue segmentation and patch extraction, (2) patch-level feature encoding and graph construction, and (3) slide-level representation learning and prediction.}
\label{fig:pipeline}
\end{figure*}
Each whole slide image (WSI) is treated as a bag of patch embeddings in the multiple instance learning (MIL) setting. Given a slide $s$, the foreground tissue is segmented and tiled into patches at a fixed magnification. Each patch is then encoded into a feature vector $\mathbf{x}_i \in \mathbb{R}^D$ using a large-scale pre-trained pathology encoder. This yields a set $\mathcal{B}_s = \{\mathbf{x}_1,\dots,\mathbf{x}_N\}$ of $N$ patch embeddings with a slide-level label $y_s$ indicating the cancer subtype. Our goal is to learn a permutation-invariant function $f_\theta \colon \mathcal{B}_s \mapsto y_s$ for subtype classification.

Following previous graph-based MIL methods, we represent each slide as a patch graph $\mathcal{G}_s = (\mathcal{V}_s,\mathcal{E}_s)$. Each node $v_i \in \mathcal{V}_s$ corresponds to a patch embedding $\mathbf{x}_i$, and the edges in $\mathcal{E}_s$ are constructed based on both spatial proximity and feature similarity between the patches. ResGAT takes the graph as input, updates node features with stacked residual graph attention blocks, and aggregates them into a slide-level representation for classification. Fig.~\ref{fig:pipeline} shows the overall architecture of ResGAT.

\subsection{Graph Construction}\label{sec:graph-construction}
To establish the graph topology $\mathcal{E}_s$, we introduce a hybrid $k$-NN edge construction procedure. Each node $v_i$ is associated with a spatial coordinate $\mathbf{p}_i \in \mathbb{R}^2$ derived from the patch location on the WSI. Initially, we identify the $d_{spa}$ nearest spatial neighbors of $v_i$ measured by Euclidean distance between coordinates, denoted as the set $\mathcal{N}_{spa}(v_i)$, and its $d_{feat}$ nearest feature neighbors measured by cosine distance between node features, denoted as the set $\mathcal{N}_{feat}(v_i)$. We define the candidate pool as the intersection
\[
\mathcal{C}(v_i) = \mathcal{N}_{spa}(v_i) \cap \mathcal{N}_{feat}(v_i),
\]
which is subsequently ranked by node feature similarity. The top $k$ candidates are selected as the final connected neighbors of $v_i$. In cases of a sparse or empty intersection ($|\mathcal{C}(v_i)| < k$), the adjacency list is padded with up to three auxiliary nearest feature neighbors to maintain robust connectivity. The resulting patch graph $\mathcal{G}_s$ is treated as undirected. The hyperparameters $d_{spa}$, $d_{feat}$, and $k$ jointly determine the graph density and the node degree variance. We adopt a general configuration with $k=6$, $d_{feat}=50$, and $d_{spa}\in\{15, 24\}$ in our main evaluations. A comprehensive sensitivity analysis of these parameters is provided in Section~\ref{sec:ablation1}.

\subsection{ResGAT Architecture and Training Objective}

\paragraph{Node Updates.}
Given the graph $\mathcal{G}_s$, we initialize the node features as $\mathbf{h}_i^{(0)}=\mathbf{x}_i$ for $i=1,\dots,N$. Let $\mathbf{h}_i^{(\ell)}$ denote the representation of node $v_i$ at layer $\ell$. ResGAT applies a stack of $L=3$ residual blocks to obtain the updated node representations $\mathbf{h}_i^{(L)}$. Each residual block updates node features through a linear projection in parallel with a multi-head graph attention convolution (GATv2Conv~\cite{brody2021attentive}). Let $\mathcal{N}(i) = \{j \mid (i, j) \in \mathcal{E}_s\}$ denote the neighbors of node $v_i$. For each layer, the following combined update is applied to all nodes:
\begin{equation}
\label{eq:resgat-update}
\begin{aligned}
&e_{ij}^{(k)} = \mathbf{a}^{(k)\top} \mathrm{LeakyReLU}\!\big(\mathbf{W}_s^{(k)}\mathbf{h}_i^{(\ell)} + \mathbf{W}_t^{(k)}\mathbf{h}_j^{(\ell)}\big), 
    && j \in \mathcal{N}(i), \\[2pt]
&\alpha_{ij}^{(k)} = 
\frac{\exp(e_{ij}^{(k)})}{\sum_{u \in \mathcal{N}(i)} \exp(e_{iu}^{(k)})}, \\[2pt]
&\mathbf{m}_i^{(\ell)} = 
\bigg\|_{k=1}^{K} \sum_{j \in \mathcal{N}(i)} \alpha_{ij}^{(k)} \mathbf{W}^{(k)} \mathbf{h}_j^{(\ell)}, \\[2pt]
&\mathbf{h}_i^{(\ell+1)} = 
\phi\!\Big(
\mathrm{GN}\big(\mathbf{m}_i^{(\ell)}\big)
+
\mathrm{GN}\big(\mathbf{W}_{\text{res}}^{(\ell)} \mathbf{h}_i^{(\ell)}\big)
\Big),
\end{aligned}
\end{equation}
where $D_\ell$ denotes the dimension of the node representations $\mathbf{h}_i^{(\ell)}$ at layer $\ell$ (with $D_0 = D$), $d_h = D_{\ell+1}/K$ is the output dimension of each attention head, $\mathbf{W}_s^{(k)}, \mathbf{W}_t^{(k)}, \mathbf{W}^{(k)} \in \mathbb{R}^{d_h \times D_\ell}$ are learnable projections for head $k$, $\mathbf{a}^{(k)} \in \mathbb{R}^{d_h}$ is the corresponding attention vector, and $\mathbf{W}_{\text{res}}^{(\ell)} \in \mathbb{R}^{D_{\ell+1}\times D_\ell}$ is the learnable linear projection on the residual path. The operator \(\|\) denotes concatenation over the \(K\) heads. \(\mathrm{GN}(\cdot)\) denotes GraphNorm and is applied separately to the two branches, and \(\phi\) is the ELU non-linearity. This formulation accommodates progressively decreasing dimensions (e.g., $1024 \rightarrow 512 \rightarrow 256$).

\paragraph{Graph Normalization.}
Each residual block employs GraphNorm~\cite{cai2021graphnorm} to stabilize training against the severe variations in graph size and topological structure across different slides. Given the intermediate node representations $\mathbf{f}_i^{(\ell)}$ at layer $\ell$ (i.e., the output of either branch in Eq.~\eqref{eq:resgat-update} before normalization), GraphNorm computes
\begin{equation}
\mathbf{u}_i^{(\ell)} 
= \boldsymbol{\gamma} \odot 
\frac{\mathbf{f}_i^{(\ell)} - \boldsymbol{\alpha} \odot \boldsymbol{\mu}^{(\ell)}}%
     {\sqrt{\big(\boldsymbol{\sigma}^{(\ell)}\big)^2 + \epsilon}}
+ \boldsymbol{\beta},
\end{equation}
where $\boldsymbol{\mu}^{(\ell)}$ is the mean of $\{\mathbf{f}_i^{(\ell)}\}_{i=1}^N$ over the nodes in the graph,
$\big(\boldsymbol{\sigma}^{(\ell)}\big)^2$ is the variance of the shifted features $\{\mathbf{f}_i^{(\ell)} - \boldsymbol{\alpha} \odot \boldsymbol{\mu}^{(\ell)}\}_{i=1}^N$ computed over the same nodes, $\epsilon$ is a small constant for numerical stability, and
$\boldsymbol{\gamma}, \boldsymbol{\beta}, \boldsymbol{\alpha}$ are learnable parameters shared across nodes. The operator $\odot$ denotes element-wise multiplication, and the division and square root are likewise applied element-wise. Intuitively, $\boldsymbol{\gamma}$ and $\boldsymbol{\beta}$ provide a channel-wise affine re-parametrization of the normalized features,
while $\boldsymbol{\alpha}$ modulates the strength of graph-level centering on each feature dimension.

\paragraph{Pooling and Loss.}
Following the residual blocks, we apply global mean pooling over the updated node representations $\{\mathbf{h}_i^{(L)}\}_{i=1}^N$ to obtain the slide-level representation $\mathbf{z}_s \in \mathbb{R}^{D_L}$. This vector is fed into an MLP classifier to produce the logit vector $\hat{\mathbf{y}}_s \in \mathbb{R}^C$, where $C$ is the number of cancer subtypes. The predicted probabilities are obtained via a softmax function. Given the ground-truth label encoded as a one-hot vector $\mathbf{y}_s \in \{0,1\}^C$, we train the model using the standard cross-entropy loss over the set of training slides $\mathcal{S}$:
\begin{equation}
\mathcal{L}
= - \frac{1}{|\mathcal{S}|}
  \sum_{s\in\mathcal{S}} \sum_{c=1}^C
    y_{s,c} \log \left( \frac{\exp(\hat{y}_{s,c})}{\sum_{c'=1}^C \exp(\hat{y}_{s,c'})} \right).
\end{equation}

An ablation study of the two-branch residual block design is provided in Section~\ref{sec:ablation2}.

\subsection{Graph Class Activation Mapping}\label{sec:gradcam}
We adapt Grad-CAM++~\cite{chattopadhay2018grad} to our graph-based pipeline to generate heatmaps that highlight prediction-relevant regions. Given a target class $c$, let $h_{i,d}^{(L)}$ denote the $d$-th feature channel of the final node representation $\mathbf{h}_i^{(L)} \in \mathbb{R}^{D_L}$. We compute channel-wise importance weights $w_d^c$ from the gradients of the class logit $\hat{y}_c$ (the slide index $s$ is omitted for brevity):
\begin{equation}
\label{eq:graph-gradcampp-weights}
w_d^c
=
\alpha_d^c \cdot \frac{1}{N}\sum_{i=1}^{N}\mathrm{ReLU}\!\left(\frac{\partial \hat{y}_c}{\partial h_{i,d}^{(L)}}\right),
\qquad
\alpha_d^c
=
\frac{\sum_i \left(\frac{\partial \hat{y}_c}{\partial h_{i,d}^{(L)}}\right)^2}
{2\sum_i \left(\frac{\partial \hat{y}_c}{\partial h_{i,d}^{(L)}}\right)^2 + N\sum_i \left(\frac{\partial \hat{y}_c}{\partial h_{i,d}^{(L)}}\right)^3 + \epsilon}.
\end{equation}
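Here, all sums over $i$ run over the $N$ nodes of the graph, and $\epsilon$ is a small constant that prevents division by zero. The saliency score $M_i^c$ for each node is computed via a weighted combination: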
\begin{equation}
\label{eq:graph-gradcampp-map}
M_i^c = \mathrm{ReLU}\!\left(\sum_{d=1}^{D_L} w_d^c \, h_{i,d}^{(L)}\right).
\end{equation}
These scores are min-max normalized and mapped back to the corresponding patch locations on the WSI.

