% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage{adjustbox}
\usepackage[table,dvipsnames]{xcolor}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage[T1]{fontenc}
\usepackage{subfig}
\usepackage{float}
\usepackage{graphicx}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
%\usepackage{color}
%\renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\begin{document}
%
\title{Structured Spectral Graph Learning for Anomaly Classification in 3D Chest CT Scans}
%
\titlerunning{Spectral Graph for 3D CT Scans}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
\author{Theo Di Piazza\inst{1,2} \and Carole Lazarus\inst{3} \and Olivier Nempont\inst{3} \and Loic Boussel\inst{1,2}}
\authorrunning{Di Piazza et al.}
\institute{University of Lyon, INSA Lyon, CNRS, INSERM, CREATIS UMR 5220, U1294\\
\email{theo.dipiazza@creatis.insa-lyon.fr} \and
Hospices Civils de Lyon, Lyon, France \and
Philips Clinical Informatics, Innovation Paris, France}

\maketitle

\begin{abstract}
With the increasing number of CT scan examinations, there is a need for automated methods such as organ segmentation, anomaly detection and report generation to assist radiologists in managing their increasing workload. Multi-label classification of 3D CT scans remains a critical yet challenging task due to the complex spatial relationships within volumetric data and the variety of observed anomalies. Existing approaches based on 3D convolutional networks have limited abilities to model long-range dependencies while Vision Transformers suffer from high computational costs and often require extensive pre-training on large-scale datasets from the same domain to achieve competitive performance. In this work, we propose an alternative by introducing a new graph-based approach that models CT scans as structured graphs, leveraging axial slice triplet nodes processed through spectral domain convolution to enhance multi-label anomaly classification performance. Our method exhibits strong cross-dataset generalization and competitive performance while achieving robustness to z-axis translation. An ablation study evaluates the contribution of each proposed component.
\keywords{3D Medical Imaging \and Chest Computed Tomography \and Graph Neural Network \and Spectral domain \and Multi-label Anomaly Classification.}
\end{abstract}
%
%
%
\section{Introduction}
Computed Tomography (CT) is a fundamental modality in modern medical imaging, providing radiologists with detailed cross-sectional views of the human body to detect and characterize abnormalities. However, the increasing volume of CT scans has led to a strong demand for automated deep learning-based methods to assist radiologists with their growing workload~\cite{broder_increasing_2006}. Deep learning has already demonstrated success in various CT-related tasks~\cite{anaya-isaza_overview_2021}, including anomaly detection~\cite{hamamci_foundation_2024}, organ segmentation~\cite{ilesanmi_reviewing_2024}, report generation~\cite{hamamci_ct2rep_2024}, and synthetic volume reconstruction~\cite{hamamci_foundation_2024} for patient-specific modeling. Among these tasks, multi-label classification of anomalies in 3D CT volumes remains challenging due to the computational complexity of processing volumetric data and the diverse range of pathological patterns. Early deep learning approaches leverage 3D Convolutional Neural Networks (CNNs), effectively capturing local spatial features but suffering from limited capabilities to model long-range dependencies~\cite{ma_u-mamba_2024}. More recently, Vision Transformers (ViTs)~\cite{dosovitskiy_image_2021}, which build on the Transformer architecture initially designed for natural language processing~\cite{tucudean_natural_2024}, have been adapted to both 2D~\cite{halder_implementing_2024} and 3D~\cite{hamamci_generatect_2023} medical imaging. By enabling long-range spatial interactions through self-attention, ViTs have shown promise in various medical imaging tasks~\cite{azad_advances_2024} through their ability to capture global information. However, they remain computationally expensive, requiring large-scale pretraining to generalize effectively~\cite{hamamci_generatect_2023}.
Our work introduces CT-Graph, a new 2.5D GNN-based framework that models 3D chest CT scans as structured graphs, where each node represents a triplet of adjacent axial slices and edges are weighted by inter-slice spacing. This design enables efficient integration of local and global context while preserving spatial structure. Our approach offers the following key advantages:

\begin{itemize}[noitemsep,topsep=0pt,parsep=0pt,partopsep=0pt]
\item CT-Graph demonstrates strong cross-dataset generalization, maintaining consistent performance when trained on a public Turkish 3D chest CT dataset and evaluated on a separate dataset from the United States.
\item  Our edge weighting strategy based on z-axis distance spacing incorporates spatial awareness with no additional learnable parameters. Ablation studies confirm the effectiveness of GNN modules and graph connectivity patterns.
\item By leveraging spectral domain convolution, CT-Graph improves anomaly classification performance and achieves robustness to z-axis translation.

\end{itemize}

\section{Related Work}

%\cl{Feature aggregation in 3D medical imaging is crucial for balancing local and long-range dependencies while maintaining global spatial awareness. Effective aggregation methods must capture fine details and broader anatomical structures. This discussion explores different approaches and their development to better address these needs.}

\subsection{3D Visual Encoder}

Feature aggregation in 3D medical imaging is crucial for balancing local and long-range dependencies while maintaining global spatial awareness. Early deep learning architectures primarily relied on 3D CNNs~\cite{anaya-isaza_overview_2021}, which effectively capture local spatial dependencies. These models have been widely applied to tasks such as anomaly detection~\cite{ibrahim_deep-chest_2021} and segmentation~\cite{rayed_deep_2024}. However, their intrinsic locality limits their ability to model long-range dependencies, which can be crucial for capturing global anatomical structures~\cite{ma_u-mamba_2024}. The self-attention mechanism~\cite{vaswani_attention_2023}, initially introduced for natural language processing tasks, was rapidly adapted to the visual domain with ViTs~\cite{dosovitskiy_image_2021}. The extension of ViTs~\cite{hamamci_generatect_2023} and Swin Transformers~\cite{yang_swin3d_2023} to 3D tasks has shown promise in applications such as dense image captioning~\cite{chen_unit3d_2022} and video processing~\cite{liu_video_2021}. In the context of CT imaging, GenerateCT leverages CT-ViT, inspired by ViViT~\cite{arnab_vivit_2021}, to integrate spatial and causal attention but requires extensive pretraining, limiting its practical applicability~\cite{hamamci_generatect_2023}. To mitigate computational challenges in 3D volume processing, CT-Net~\cite{draelos_machine-learning-based_2021} proposes to group triplets of adjacent slices to replicate the three-channel structure of RGB images, extracting features using a pretrained 2D ResNet~\cite{he_deep_2015}. While CT-Net subsequently passes these representations through a lightweight 3D CNN for dimensionality reduction, CT-Scroll~\cite{di_piazza_imitating_2025} leverages an alternating global-local attention module to enable feature interactions, effectively reducing the number of parameters while improving classification performance.

\begin{figure}[t]
    \centering
    \includegraphics[width=1.0\textwidth]{LatexSourceFile-0011/fig1.pdf}
    \caption{CT-Graph introduces a structured graph-based architecture, where triplet axial slice features define nodes. Node interactions are modeled through spectral-domain convolutions, enabling contextual aggregation prior to classification.}
    \label{sph:fig:method_overview}
\end{figure}


\subsection{Graph Neural Networks}

In various application domains such as biology~\cite{reiser_graph_2022} or transportation~\cite{makarov_graph_2024}, graphs are a common representation of data found in nature~\cite{velickovic_everything_2023}. A graph, denoted as $\mathcal{G} = (\mathcal{V}, \mathcal{E})$, consists of a set of edges $\mathcal{E}$ which model the connections between a set of nodes $\mathcal{V}$. In deep learning, GNNs have become the main approach for tasks involving graph-structured data~\cite{bechler-speicher_intelligible_2024}, where each node is associated with a vector representation, which is iteratively updated through neighborhood aggregation during the forward message passing process. Representative models mainly include Convolutional GNNs, which aggregate neighboring node features through graph-based convolutions~\cite{defferrard_convolutional_2017}, and Attentional GNNs, which leverage attention mechanisms to weigh the importance of neighbors' contributions~\cite{brody_how_2022}. In medical imaging, GNNs have been used in tasks such as medical knowledge integration in radiology report generation~\cite{liu_exploring_2021} and Whole Slide Image analysis~\cite{guo_higt_2023}. Specifically for 3D medical imaging, recent approaches have explored multi-view modeling, where each node encodes a triplet of orthogonal slices with axial, coronal, and sagittal views to capture complementary anatomical information~\cite{kiechle_graph_2024}.


\section{Method}

As shown in Figure~\ref{sph:fig:method_overview}, CT-Graph models the 3D CT scan as a graph of \textit{triplet axial CT slices} connected by their \textit{physical z-axis distance}. Each node corresponds to a triplet of axial slices and is connected to its neighboring nodes by edges weighted by their physical distance. Node features interact through a GNN module before being summed and given to a classification head.\\

\noindent \textbf{Triplet Slices Feature Extraction.} Following a strategy similar to CT-Net~\cite{draelos_machine-learning-based_2021}, we partition the input volume $x \in \mathbb{R}^{240 \times 480 \times 480}$ into non-overlapping triplets of slices, denoted $\{x^{t}_{i}\}_{i=1}^{80}$, forming a tensor of dimension $80 \times 3 \times 480 \times 480$. Each triplet is processed by a ResNet~\cite{he_deep_2015} $\Phi_{\text{ResNet}}$ pretrained on ImageNet~\cite{russakovsky_imagenet_2015} to extract a corresponding feature map. The feature maps are then processed independently, with each one being passed through a Global Average Pooling (GAP) layer~\cite{di_piazza_imitating_2025} $\Phi_{\text{GAP}}$ to obtain a compact vector representation for each triplet, denoted $h_{i} \in \mathbb{R}^{512}$ ($i \ \in \{1, \ldots, 80\}$), such that:

\begin{equation}
    h_{i} = ( \Phi_{\text{GAP}} \circ \Phi_{\text{ResNet}}) ( x_{i}^{t} ), \quad \forall \ i \ \in \{1, \ldots, 80\} \, .
\end{equation}

\noindent \textbf{Graph Construction.} We define the volumetric representation as a graph $\mathcal{G} = (\mathcal{V}, \mathcal{E}, H, A)$, where:  
\begin{itemize}  
    \item $\mathcal{V} = \{ v_i \}_{i=1}^{N}$ is the set of nodes, where each node $v_i$ represents a triplet of consecutive slices. Hence, the number of nodes is $N=80$.
    \item $\mathcal{E} \subseteq \mathcal{V} \times \mathcal{V}$ is the set of edges, where an edge $(v_i, v_j) \in \mathcal{E}$ is weighted based on a function of inter-triplet distance and z-axis spacing. An undirected edge $(v_i, v_j)$ is established if and only if the positions of the corresponding triplets in the sequence differ by at most $q \in \mathbb{N}^{+}$, such that:
    \begin{equation}
    \mathcal{E} =  \{ (v_i, v_j) \ | \ |i-j| \leq q\}\, .
\end{equation}
    \item $H = \{h_{1}, \ldots, h_{N} \} \in \mathbb{R}^{N \times d}$ is the node feature matrix, where $h_{i} \in \mathbb{R}^{d}$ denotes the feature embedding of node $v_i$ ($\forall \ i \in \{1, \ldots, N\}$). We set $d=512$.
    \item $A \in \mathbb{R}^{N \times N}$ is the weighted adjacency matrix, where $A_{ij} = w_{i, j} \in \mathbb{R}^{+}$ encodes the connectivity and spatial relationship between triplets, $w_{i, j}$ being the edge weight such that:  
    \begin{equation}
        A_{ij} =
\begin{cases}
w_{i, j}, & \text{if } (v_i, v_j) \in \mathcal{E} \\
0, & \text{otherwise}
\end{cases} \, .
    \end{equation}
\end{itemize}  

\noindent \textbf{Graph Neural Network module.} A key challenge in this formulation is the variability in anatomical positioning across patients due to differences in scan length and body proportions. Traditional spatial graph convolutions, such as GraphConv~\cite{morris_weisfeiler_2021}, aggregate information from fixed local neighborhoods, which can be suboptimal in this context as anatomical structures do not consistently align across scans. Instead, we leverage Chebyshev convolutions~\cite{defferrard_convolutional_2017} to define graph convolutions in the spectral domain, each followed by a feedforward neural network. Unlike spatial approaches, which struggle with non-uniform neighborhood structures~\cite{bruna_spectral_2014}, ChebConv utilizes polynomial approximations of the graph Laplacian~\cite{belkin_laplacian_2001} to capture hierarchical feature representations while preserving spatial localization. This allows the model to adapt to variations in caudal-cranial slice positioning and effectively learn long-range anatomical relationships, making it more robust to inter-patient variability. Our GNN module, denoted as $\Phi_{\text{GNN}}$, consists of 3 Chebyshev Convolutional Layers~\cite{defferrard_convolutional_2017}, each denoted $f_{n}$ ($n \in \{0, 1, 2\}$) and followed by a feedforward neural network consisting of a linear layer followed by a ReLU, denoted as $g_n$, matching the depth of CT-Scroll~\cite{di_piazza_imitating_2025} for fair comparison. For each layer, the scaled and normalized Laplacian $\hat{L}$ is defined as:
\begin{equation}  
\hat{L} =\frac{2}{\lambda_{\text{max}}} (D - A) - I \, ,
\end{equation}  

where $\lambda_{\text{max}}$ is the largest eigenvalue of the graph Laplacian $L = D - A$. The degree matrix $D$ is a diagonal matrix where $D_{i, i} = \sum_{j=1}^{N} w_{i, j}$. $w_{i, j}$ denotes the edge weight from source node $i$ to target node $j$, defined such that:
\begin{equation} \label{eq:inverse_fun}
    w_{i, j} = 1 + \frac{1}{1 + \mathrm{dist}(i, j)} = 1 + \frac{1}{1 + 3 \times |i - j| \times s_{z}} \, ,
\end{equation}

where $s_{z}$ is the spacing along the z-axis in decimeters.
The convolution operation is parameterized using Chebyshev polynomials $T_{k}(\hat{L}) \in \mathbb{R}^{N \times N}$, resulting in a recurrence relation for the transformation of the node feature matrix. Let \( Z^0 = H \) be the initial node feature matrix, $\theta_k \in \mathbb{R}^{d \times d}$ be the learnable parameters, and \( K \) be the Chebyshev filter size fixed to 3 for all experiments, to align with common practice~\cite{defferrard_convolutional_2017}. The recurrence relation is given by:

\begin{equation}  
Z^{n+1} = (g_{n} \circ f_{n})(Z^{n}) = g_{n}\Bigl( \sum_{k=0}^{K-1} T_{k}(\hat{L}) Z^{n} \theta_{k} \Bigr), \quad \forall \ n \in \{0, 1, 2\} \, .
\end{equation}  

The GNN module \(\Phi_{\text{GNN}}\) produces the final output vector representation, which we denote as $Z = Z^{3} \in \mathbb{R}^{N \times d}$ and which is defined as:
\begin{equation}
    Z = \{z^{3}_{1}, \ldots, z^{3}_{N}\} = \Phi_{\text{GNN}}(H) \,.
\end{equation}

\noindent \textbf{Feature aggregation.} The obtained vector representations are aggregated through summation to derive a vector representation, denoted as \( \bar{z} \in \mathbb{R}^{d} \), which is subsequently passed to a classification head $\Psi$ implemented as a lightweight multilayer perceptron. $\Psi$ predicts the logit vector $\hat{y} \in \mathbb{R}^{18}$. The model is trained on a multi-label classification task using Binary Cross-Entropy as the loss function.

\section{Experimental results}

\subsection{Dataset preparation}
We train and evaluate our method on the public {\tt CT-RATE} dataset~\cite{hamamci_foundation_2024}, which consists of non-contrast chest CT scans with 18 annotated anomalies extracted from radiology reports. The training set includes 17,799 unique patients, while the validation and test sets both contain 1,314 unique patients. Additionally, we extend our evaluation to the publicly available {\tt Rad-ChestCT} dataset~\cite{draelos_machine-learning-based_2021}, comprising non-contrast chest CT scans from 1,344 unique patients, focusing on the 16 anomalies shared with {\tt CT-RATE}~\cite{hamamci_foundation_2024}. Consistent with prior work~\cite{hamamci_ct2rep_2024,di_piazza_imitating_2025}, volumes for both datasets are center-cropped or padded to a resolution of $240 \times 480 \times 480$, with a spacing of 0.75~mm along the x- and y-axes and 1.5~mm along the z-axis. Hounsfield Unit values are clipped to the range $[-1000, 200]$, reflecting practical diagnostic limits~\cite{hamamci_ct2rep_2024}.
%Following prior work~\cite{hamamci_ct2rep_2024,di_piazza_imitating_2025} and f
\subsection{Implementation Details}
CT-Graph and baseline methods are trained with a batch size of 4 using the AdamW optimizer with ($\beta_{1}$, $\beta_{2}$) = (0.9, 0.99) and a weight decay of 0.01. The learning schedule follows a cosine decay with a warm-up phase of 20,000 steps, a maximum learning rate of 0.0001, and training runs for 200,000 iterations.

\begin{table*}[t]
\centering
\caption{Quantitative evaluation on the {\tt CT-RATE} and {\tt Rad-ChestCT} test sets. Reported mean and standard deviation metrics were computed over 5 independent runs. \textbf{Best} results are in bold, \underline{second best} are underlined.}
\begin{adjustbox}{width=1.0\textwidth}
\begin{tabular}{c l c c c c}
\toprule
Dataset & Method & F1 & Recall & AUROC & Accuracy \\
\midrule
% CT-RATE
\multirow{7}{*}{\rotatebox[origin=c]{0}{\small \begin{tabular}{@{}c@{}}
\small {\tt CT-RATE} \\
%\small \worldflag[width=0.4cm]{TR}
\end{tabular}}}& 
% random predictions
\scriptsize Random Pred. & 
$27.78 \text{\scriptsize $\pm 0.51$}$ &
$50.42 \text{\scriptsize $\pm 1.05$}$ &
$49.88 \text{\scriptsize $\pm 0.62$}$ &
$49.89 \text{\scriptsize $\pm 0.31$}$ \\ 
% ViViT
& \scriptsize \textbf{ViViT}~\cite{arnab_vivit_2021} & 
$49.91 \text{\scriptsize $\pm 0.28$}$ & 
$66.39 \text{\scriptsize $\pm 1.48$}$ &
$79.19 \text{\scriptsize $\pm 0.28$}$ & 
$75.95 \text{\scriptsize $\pm 0.71$}$ \\
% Swin3D
& \scriptsize \textbf{Swin3D}~\cite{liu_video_2021} &
$50.64 \text{\scriptsize $\pm 0.25$}$ &
$\underline{67.96} \text{\scriptsize $\pm 0.58$}$ &
$79.94 \text{\scriptsize $\pm 0.15$}$ &
$75.95 \text{\scriptsize $\pm 0.25$}$ \\
% CT-Net
& \scriptsize  \textbf{CT-Net}~\cite{draelos_machine-learning-based_2021} &
$51.39 \text{\scriptsize $\pm 0.50$}$ &
$66.42 \text{\scriptsize $\pm 1.99$}$ &
$79.37 \text{\scriptsize $\pm 0.27$}$ &
$77.37 \text{\scriptsize $\pm 0.40$}$ \\
% CNN3D
& \scriptsize  \textbf{CNN3D}~\cite{anaya-isaza_overview_2021} &
$52.92 \text{\scriptsize $\pm 1.08$}$ &
$67.60 \text{\scriptsize $\pm 1.01$}$ &
$81.47 \text{\scriptsize $\pm 0.78$}$ &
$77.80 \text{\scriptsize $\pm 0.37$}$ \\
% CT-Scroll
& \scriptsize \textbf{CT-Scroll}~\cite{di_piazza_imitating_2025} &
$\underline{53.97} \text{\scriptsize $\pm 0.21$}$ &
$65.36 \text{\scriptsize $\pm 1.91$}$ &
$\underline{81.80} \text{\scriptsize $\pm 0.22$}$ &
$\mathbf{79.49} \text{\scriptsize $\pm 0.45$}$ \\
% CT-Graph
& \scriptsize \textbf{CT-Graph} &  
\cellcolor[gray]{0.95} $\mathbf{54.59} \text{\scriptsize $\pm 0.17$}$ &
\cellcolor[gray]{0.95} $\mathbf{68.77} \text{\scriptsize $\pm 0.92$}$ &
\cellcolor[gray]{0.95} $\mathbf{82.44} \text{\scriptsize $\pm 0.14$}$ &
\cellcolor[gray]{0.95} $\underline{78.66} \text{\scriptsize $\pm 0.36$}$ \\

% Rad-ChestCT
\midrule
% Random predictions
\multirow{7}{*}{\rotatebox[origin=c]{0}{\small \begin{tabular}{@{}c@{}}
\small {\tt Rad-ChestCT} \\
%\small \worldflag[width=0.4cm]{US}
\end{tabular}}} & 
\scriptsize Random Pred. &
$35.91 \text{\scriptsize $\pm 0.41$}$ &
$51.51 \text{\scriptsize $\pm 0.75$}$ &
$49.68 \text{\scriptsize $\pm 0.55$}$ &
$50.40 \text{\scriptsize $\pm 0.32$}$ \\
% ViViT
& \scriptsize \textbf{ViViT}~\cite{arnab_vivit_2021} &
$48.59 \text{\scriptsize $\pm 0.97$}$ & 
$69.27 \text{\scriptsize $\pm 1.64$}$ &
$67.83 \text{\scriptsize $\pm 0.38$}$ & 
$60.22 \text{\scriptsize $\pm 1.15$}$ \\ 
% Swin3D
& \scriptsize \textbf{Swin3D}~\cite{liu_video_2021} & 
$47.98 \text{\scriptsize $\pm 0.41$}$ & 
$66.76 \text{\scriptsize $\pm 0.63$}$ &
$67.29 \text{\scriptsize $\pm 0.23$}$ & 
$60.67 \text{\scriptsize $\pm 0.60$}$ \\ 
% CT-Net
& \scriptsize  \textbf{CT-Net}~\cite{draelos_machine-learning-based_2021} & 
$47.53 \text{\scriptsize $\pm 0.93$}$ &  
$68.45 \text{\scriptsize $\pm 1.18$}$ &
$67.71 \text{\scriptsize $\pm 0.83$}$ &
$60.05 \text{\scriptsize $\pm 1.93$}$ \\ 
% CNN3D
& \scriptsize  \textbf{CNN3D}~\cite{anaya-isaza_overview_2021} &
$\underline{49.28} \text{\scriptsize $\pm 0.93$}$ &
$\mathbf{70.47} \text{\scriptsize $\pm 0.73$}$ &
$71.13 \text{\scriptsize $\pm 0.62$}$ &
$61.08 \text{\scriptsize $\pm 0.60$}$ \\
% CT-Scroll
& \scriptsize \textbf{CT-Scroll}~\cite{di_piazza_imitating_2025} & 
$48.55 \text{\scriptsize $\pm 0.54$}$ &
$66.63 \text{\scriptsize $\pm 1.49$}$ &
$\underline{71.21} \text{\scriptsize $\pm 0.37$}$ & 
$\mathbf{63.02} \text{\scriptsize $\pm 0.93$}$ \\ 
% CT-Graph
& \scriptsize \textbf{CT-Graph} & 
$\mathbf{49.52} \text{\scriptsize $\pm 0.76$}$ & 
$\underline{69.30} \text{\scriptsize $\pm 1.48$}$ &
$\mathbf{72.18} \text{\scriptsize $\pm 0.29$}$ & 
$\underline{62.60} \text{\scriptsize $\pm 0.52$}$ \\
\bottomrule
\end{tabular}
\end{adjustbox}
%\vspace{-0.8em}
\label{table:quantitive_metrics}
\end{table*}

\begin{figure*}[t!]
    \centering
    \includegraphics[width=1.0\textwidth]{LatexSourceFile-0011/fig2.pdf}
    \caption{GradCAM activation maps extracted from the 2D ResNet module.}
    \label{sph:fig:gradcam}
\end{figure*}

\begin{figure}[t]
    \centering
    \includegraphics[width=1.0\textwidth]{LatexSourceFile-0011/fig3.pdf}
    \caption{(a) Per-anomaly F1-Score comparison for the 3 anomalies with highest improvement over baselines. (b) Model robustness to z-axis volume shift. F1-Scores are reported for volumes translated along the z-axis with minimum-value padding.}
    \label{sph:fig:top_7_f1}
\end{figure}

\subsection{Quantitative results}

For each method and each label, we select the threshold that maximizes F1-Score on the validation set and report all metrics on the test set. We compare our method against a 3D CNN, ViViT~\cite{arnab_vivit_2021}, a video-adapted Vision Transformer which also forms the architectural basis for CT-ViT, and Swin3D~\cite{yang_swin3d_2023}, an extension of Swin Transformer for volumetric data. We also include CT-Net~\cite{draelos_machine-learning-based_2021} and CT-Scroll~\cite{di_piazza_imitating_2025}, two 2.5D approaches that employ CNN-based feature extractors. CT-Net relies on convolutional layers for feature aggregation and dimensionality reduction, whereas CT-Scroll leverages an alternating attention mechanism to capture cross-slice dependencies. ResNet-based models used ImageNet pre-trained weights; others were initialized via weight inflation~\cite{zhang_adapting_2023} for comparability. Table~\ref{table:quantitive_metrics} shows that CT-Graph achieves the best F1-Score and AUROC on both datasets, as well as the best Recall on {\tt CT-RATE} and the second-best Recall on {\tt Rad-ChestCT}. On the {\tt CT-RATE} test set, our method achieves an F1-Score of $54.59$, representing a relative improvement of +$\Delta$1.15\% over CT-Scroll~\cite{di_piazza_imitating_2025} and +$\Delta$6.23\% over CT-Net~\cite{draelos_machine-learning-based_2021}. For the F1-Score, a paired t-test comparing the performance of CT-Graph against each baseline consistently yields a p-value $<0.01$, demonstrating statistical significance. As shown in Fig.~\ref{sph:fig:top_7_f1}.a, CT-Graph yields the largest improvements on diffuse anomalies such as bronchiectasis, mosaic attenuation, and lung opacity. Referring to Fig.~\ref{sph:fig:top_7_f1}.b, both attention and spectral convolution demonstrate robustness to z-axis translations, whereas standard convolution is sensitive to such shifts. To evaluate this property, we simulate patient body shifts by applying controlled translations along the z-axis with appropriate padding.
Fig.~\ref{sph:fig:gradcam} illustrates CT-Graph's ability to classify anomalies from relevant regions.
\subsection{Ablation study}

\begin{table*}[t]
\centering
\caption{Comparison of graph connectivity schemes and GNN modules, evaluated on the {\tt CT-RATE} test set. The neighborhood size is fixed to 16 for these runs.}
\begin{adjustbox}{width=1.0\textwidth}
\begin{tabular}{c l c c c}
\toprule
\scriptsize Connectivity & \scriptsize Module & \scriptsize F1 & \scriptsize AUROC & \scriptsize Accuracy\\
\midrule
% GATv2Conv
\multirow{3}{*}{\textit{\scriptsize Fully connected}} & 
\scriptsize GATv2Conv~\cite{brody_how_2022} & 
$\text{\scriptsize $53.72$} \text{\tiny $\pm 0.34$}$ &
$\text{\scriptsize $81.56$}\text{\tiny $\pm 0.03$}$ & 
$\text{\scriptsize $78.04$} \text{\tiny $\pm 0.31$}$ \\
% GraphConv
 & \scriptsize GraphConv~\cite{morris_weisfeiler_2021} &
$\text{\scriptsize $53.73$} \text{\tiny $\pm 0.36$}$ &
$\text{\scriptsize $81.99$} \text{\tiny $\pm 0.40$}$ & 
$\text{\scriptsize $78.15$} \text{\tiny $\pm 0.31$}$ \\
% ChebConv
 & \scriptsize ChebConv~\cite{defferrard_convolutional_2017} & 
 $\text{\scriptsize $\underline{54.40}$} \text{\tiny $\pm 0.15$}$ &
$\text{\scriptsize $\underline{82.34}$} \text{\tiny $\pm 0.12$}$ & 
$\text{\scriptsize $\underline{79.01}$} \text{\tiny $\pm 0.55$}$ \\
\midrule
% GATv2
\multirow{3}{*}{\textit{\scriptsize Neighbourhood}} & 
\scriptsize GATv2Conv~\cite{brody_how_2022} & 
$\text{\scriptsize $54.06$} \text{\tiny $\pm 0.19$}$ &
$\text{\scriptsize $82.22$} \text{\tiny $\pm 0.05$}$ & 
$\text{\scriptsize $78.59$} \text{\tiny $\pm 0.25$}$ \\
% GraphConv
& \scriptsize GraphConv~\cite{morris_weisfeiler_2021} &
$\text{\scriptsize $54.16$} \text{\tiny $\pm 0.24$}$ &
$\text{\scriptsize $82.33$} \text{\tiny $\pm 0.18$}$ & 
$\text{\scriptsize $78.68$} \text{\tiny $\pm 0.52$}$ \\
% ChebConv
 & \scriptsize ChebConv~\cite{defferrard_convolutional_2017} & 
$\text{\scriptsize $\mathbf{54.41}$}  \text{\tiny $\pm 0.12$}$ &
$\text{\scriptsize $\mathbf{82.47}$} \text{\tiny $\pm 0.26$}$ & 
$\text{\scriptsize $\mathbf{79.12}$} \text{\tiny $\pm 0.53$}$ \\
\bottomrule
\end{tabular}
\end{adjustbox}
\label{table:ablation_study}
\end{table*}

\noindent \textbf{Comparison of representative GNNs.} Table~\ref{table:ablation_study} highlights the performance gains achieved by incorporating Chebyshev Convolutions~\cite{defferrard_convolutional_2017} in our GNN module. Compared to a direct neighborhood aggregation approach~\cite{morris_weisfeiler_2021} in the fully connected setting, ChebConv improves AUROC by +$\Delta$0.42\% and F1-Score by +$\Delta$1.25\%, suggesting that spectral-domain convolutions may enhance feature aggregation while demonstrating robustness to variations in cranial-caudal slice positioning (Fig.~\ref{sph:fig:top_7_f1}). Inference takes approximately 70 milliseconds for all GNN variants.

\noindent \textbf{Graph construction.} Table~\ref{table:ablation_study} and Table~\ref{table:abla_window} demonstrate that neighborhood graph construction consistently improves AUROC and F1-score across all GNN variants, with particularly pronounced gains for GATv2Conv and GraphConv and only marginal gains for ChebConv.

\begin{table*}[t]
\centering
\caption{Impact of the neighborhood size, using GraphConv. Neighborhood size, noted as $q$, refers to the number of nodes each node is connected to.}
\begin{adjustbox}{width=1.0\textwidth}
\begin{tabular}{l c c c c c}
\toprule
Neighborhood size & F1 Score & Recall & Precision & AUROC & Accuracy\\
\midrule
% 4 neighbour
\textbf{4} & 
$\underline{53.76} \text{\scriptsize $\pm 0.24$}$ & 
$66.02 \text{\scriptsize $\pm 0.92$}$ & 
$\mathbf{47.84} \text{\scriptsize $\pm 0.22$}$ &
$\underline{82.22} \text{\scriptsize $\pm 0.05$}$ & 
$\textbf{78.97} \text{\scriptsize $\pm 0.58$}$ \\
% 16 neighbour
\textbf{16} &  
$\mathbf{54.14} \text{\scriptsize $\pm 0.24$}$ & 
$\underline{67.99} \text{\scriptsize $\pm 0.75$}$ & 
$\underline{47.34} \text{\scriptsize $\pm 0.30$}$ &
$\textbf{82.33} \text{\scriptsize $\pm 0.18$}$ & 
$\underline{78.68} \text{\scriptsize $\pm 0.52$}$ \\
% fully connected
\textbf{80} (Fully connected) & 
$53.73 \text{\scriptsize $\pm 0.36$}$ & 
$\mathbf{69.34} \text{\scriptsize $\pm 0.91$}$ & 
$45.80 \text{\scriptsize $\pm 0.59$}$ &
$81.99 \text{\scriptsize $\pm 0.40$}$ & 
$78.15 \text{\scriptsize $\pm 0.31$}$ \\
\bottomrule
\end{tabular}
\end{adjustbox}
%\vspace{-0.8em}
\label{table:abla_window}
\end{table*}

\noindent \textbf{Impact of the weight function.} Among the evaluated edge weighting functions, the inverse function (see Eq.~\ref{eq:inverse_fun}) with z-axis spacing measured in decimeters (dm) yields the best classification performance, as illustrated in Figure~\ref{fig:abla_weight}.

\begin{figure}[t!]
    \centering
    \subfloat[Edge Weighting vs. Node Distance]{%
        \includegraphics[height=3.0cm,width=0.50\textwidth]{LatexSourceFile-0011/fig4.pdf}%
        \label{fig:edge_weighting}%
    }
    \hfill
    \subfloat[Impact on F1-Score]{%
        \includegraphics[height=3.0cm,width=0.45\textwidth]{LatexSourceFile-0011/fig5.pdf}%
        \label{fig:edge_weighting_impact}%
    }
    \caption{Impact of the edge weighting functions, on the {\tt CT-RATE} test set. We use a GraphConv module and a fully connected graph for all experiments.}
    \label{fig:abla_weight}
\end{figure}

\section{Discussion and Conclusion}

In this work, we introduced CT-Graph, a new graph-based approach for multi-label anomaly classification from 3D Chest CT volumes. Each scan is represented as a structured graph, where nodes correspond to triplets of adjacent axial slices. To enable effective feature aggregation across this graph, we leverage a spectral approach based on Chebyshev convolution, which captures both short-range and long-range dependencies along the axial direction. Additionally, we show that incorporating spatially-aware graph structures, through both weighted edges and constrained neighborhood connectivity, enhances performance across multiple Graph Neural Network variants. CT-Graph demonstrates robustness to variations in patient body positioning along the z-axis and provides a flexible framework for modeling volumetric data. Future work may include anatomical segmentation-driven graph construction, transformer-based hybridization with patch representations, multi-view modeling extension, and exploration of architectural factors such as convolution depth and Chebyshev filter size.

\begin{credits}
\subsubsection{\discintname}
The authors have no competing interests to declare that are relevant to the content of this article.
\end{credits}
%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{Paper-0011}
\end{document}
