% This is samplepaper.tex, a sample chapter demonstrating the
% LLNCS macro package for Springer Computer Science proceedings;
% Version 2.21 of 2022/01/12
%
\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{hyperref}   
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
% (xcolor, loaded above, already provides \color, so the color package is not needed.)
\renewcommand\UrlFont{\color{blue}\rmfamily}
\urlstyle{rm}
%
\begin{document}
%
\title{Leveraging Multi-Representation Features from Diffusion Models for Unsupervised 3D Medical Image Segmentation}
\titlerunning{Multi-Representation Features for Unsupervised 3D Segmentation}

%
%\titlerunning{Abbreviated paper title}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
\author{
Tanatta Chaichakan\inst{1}\orcidID{0009-0004-5827-209X} \and
Fani Deligianni\inst{1}\orcidID{0000-0003-1306-5017} \and
Paul Henderson\inst{1}\orcidID{0000-0002-5198-7445}
}
\authorrunning{T. Chaichakan et al.}
\institute{
School of Computing Science, University of Glasgow, Glasgow, UK\\
\email{2831588c@student.gla.ac.uk}
}
\maketitle
\begin{abstract}
We propose a complementary feature integration framework for unsupervised 3D medical image segmentation that repurposes a pretrained 3D latent diffusion model as a feature extractor. The framework integrates raw image intensity with multiple representations extracted from different components of the diffusion model, including VAE latent representations, diffusion U-Net features, and self-attention features. These representations are integrated into a unified feature space together with spatial coordinates and subsequently segmented using recursive 3D normalized cuts. We evaluate the proposed framework on BraTS2020 brain tumour MRI and FLARE22 abdominal CT datasets using identical hyperparameters without dataset-specific tuning. Experimental results show that integrating multiple representations improves segmentation performance across both datasets compared with individual representations and baseline methods. The proposed framework achieves the best overall average performance across the evaluated datasets while demonstrating the ability to generalise across MRI and CT modalities. These findings suggest that representations derived from different components of the diffusion framework capture complementary anatomical information that benefits unsupervised 3D medical image segmentation. Code is publicly available at \url{https://doi.org/10.5281/zenodo.21375116}.
\keywords{Unsupervised medical image segmentation \and diffusion representations \and normalized cuts \and feature integration}
\end{abstract}
%
\section{Introduction}
Unsupervised segmentation in medical imaging has shown potential as an alternative to supervised approaches~\cite{acharjya2012edge,chen2018graphcuts,elnaqa2007multimodality} by eliminating the need for costly and time-consuming expert annotations~\cite{litjens2017survey}. However, accurate segmentation remains challenging because medical images often contain low-contrast regions, noise, and substantial anatomical variability~\cite{gao2025medical,bougourzi2025recent,chen2025pam}. Furthermore, the performance of unsupervised methods depends heavily on the underlying feature representations, and poorly discriminative features may lead to suboptimal segmentation results~\cite{fu2025sam}.

Existing studies on unsupervised segmentation have explored a wide range of techniques and demonstrated promising results across various biomedical imaging datasets~\cite{benkarim2017confidence,bhagwat2016manual,bijar2013contrast,brosch2016deep3d,hamrani2025self,liu2024cuts}. However, several recent approaches are designed primarily for 2D images, whereas medical imaging data are inherently three-dimensional and require volumetric consistency across slices. A further limitation is that many methods rely on a single feature representation, which may not fully capture the diverse characteristics of medical images. Relevant information exists at multiple levels, including local appearance, global context, multi-scale structures, and spatial relationships. Therefore, integrating complementary representations may improve unsupervised 3D medical image segmentation.

In this work, we propose an unsupervised framework for 3D medical image segmentation that repurposes a pretrained 3D latent diffusion model~\cite{guo2025maisi} as a feature extractor. Rather than using the model for medical image synthesis, we leverage representations extracted from different components of the diffusion framework as informative features for graph-based segmentation. The proposed framework combines image intensity (local appearance), latent representations from a VAE encoder (global contextual information), hierarchical features from the diffusion U-Net (multi-scale structural information), and self-attention features (spatial relationships). These features are augmented with spatial coordinates and fused to construct an affinity graph, which is subsequently partitioned using recursive 3D normalized cuts (N-Cuts) to produce the final segmentation. We evaluate the proposed framework on both MRI and CT datasets under consistent experimental settings. Our contributions are:

\begin{itemize}
\item An unsupervised graph-based framework for 3D medical image segmentation that leverages a pretrained 3D latent diffusion model for feature extraction and recursive 3D normalized cuts for segmentation.

\item An empirical analysis of complementary representations, including intensity, VAE, diffusion, and self-attention features, for unsupervised segmentation.

\item A unified evaluation across MRI and CT datasets using identical hyperparameter settings to assess generalisation across imaging modalities.
\end{itemize}

\section{Related Work}
\noindent\textbf{Classical Methods.}
Unsupervised segmentation has been studied using clustering-based, probabilistic, and graph-based approaches~\cite{benkarim2017confidence,bhagwat2016manual,bijar2013contrast,brosch2016deep3d}. Among these, normalized cuts~\cite{shi2000normalized} formulate segmentation as a graph partitioning problem based on pairwise affinities and form the basis of our approach.

\noindent\textbf{Diffusion-based Methods.}
DiffCut~\cite{couairon2024diffcut} extracts self-attention features from pretrained diffusion models and applies recursive normalized cuts for zero-shot segmentation. ADZUS~\cite{hamrani2025self} similarly utilizes diffusion self-attention representations for biomedical image segmentation. 

\noindent\textbf{Representation-based Methods.}
CUTS~\cite{liu2024cuts} learns patch-level representations using contrastive learning and diffusion condensation to generate hierarchical segmentations. Unlike our approach, it relies on learned representations and operates on 2D slices.

\noindent\textbf{Pretrained Generative Models.}
MAISI~\cite{guo2025maisi} is a 3D latent diffusion framework for medical image synthesis. Its volumetric design enables the learning of rich anatomical representations from CT and MRI data.

\section{Proposed Method}
The proposed method consists of two main components (Fig.~\ref{fig:figure1}): (1) multi-representation feature construction using a pretrained generative model~\cite{guo2025maisi}, and (2) segmentation via 3D recursive N-Cuts. Given a 3D input image $I$, a VAE is first used to encode the image into a latent representation $A$. Gaussian noise is then injected at a fixed noise level into the latent representation to obtain a noisy latent. This noisy latent is passed through a diffusion U-Net in a single forward pass, producing diffusion features $B$. In parallel, self-attention weights are extracted from the self-attention (SA) blocks of the network. These attention maps are averaged across attention heads, reshaped into spatial feature maps, and further reduced using principal component analysis (PCA) to obtain attention-derived features $C$. Finally, raw intensity $I$, latent representations $A$, diffusion features $B$, and attention-derived features $C$ are resampled to a common voxel grid. These feature representations, together with spatial coordinates, are then integrated to construct an affinity graph. Segmentation is subsequently performed using recursive N-Cuts to generate the final segmentation masks.
\begin{figure}[t]
    \centering
    \includegraphics[width=0.85\linewidth]{fig1_part1.pdf}
    \vspace{-0.8em}
    \includegraphics[width=0.85\linewidth]{fig1_part2.pdf}
    \vspace{-0.5em}
    \caption{Overview of the proposed framework. Latent representations (Sec.~\ref{sec:representation}), diffusion features (Sec.~\ref{sec:diffusion}), self-attention features (Sec.~\ref{sec:attention}), image intensity, and spatial coordinates are integrated to construct an affinity graph, followed by recursive normalized cuts for segmentation (Sec.~\ref{sec:ncuts}).}
    \label{fig:figure1}
\end{figure}
\subsection{Latent Representation using VAE}
\label{sec:representation}
We use the encoder of a pretrained VAE~\cite{guo2025maisi} to map an input volume
$I \in \mathbb{R}^{H \times W \times D}$ to a compact latent representation
$A \in \mathbb{R}^{C \times h \times w \times d}$.
The latent representation preserves anatomical information while reducing spatial resolution, and is used for both affinity graph construction and diffusion feature extraction.
\subsection{Diffusion-Based Feature Extraction}
\label{sec:diffusion}
Given the latent representation $A \in \mathbb{R}^{C \times h \times w \times d}$, we repurpose a pretrained MAISI latent diffusion model~\cite{guo2025maisi} as a feature extractor. Gaussian noise is injected at a fixed noise level $t \in [0,1]$ to obtain a noisy latent:
\begin{equation}
z_t = (1 - t)A + t\,\alpha \epsilon, \quad \epsilon \sim \mathcal{N}(0, \mathbf{I}),
\end{equation}
where $\alpha$ controls the noise magnitude and $\mathbf{I}$ denotes the identity covariance matrix.

The noisy latent $z_t$ is passed through the pretrained diffusion U-Net in a single forward pass. Instead of performing iterative denoising, we extract the feature map from the fourth decoder upsampling block. This feature map is used as the diffusion representation $B \in \mathbb{R}^{C' \times h' \times w' \times d'}$.

\subsection{Self-Attention Feature Extraction}
\label{sec:attention}
Using the pretrained diffusion U-Net, we derive self-attention features from a selected self-attention layer. Following the standard attention formulation, attention weights are computed as
\begin{equation}
S = \mathrm{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right),
\end{equation}
where $Q$ and $K$ are the query and key matrices of the selected layer, $d_k$ is the key dimension, and $S$ denotes the multi-head attention weights. The attention maps are averaged across heads and symmetrized to capture mutual relationships between tokens. The symmetrized attention matrix is then reshaped into token-wise spatial feature maps, reduced using PCA, and upsampled to the target spatial size to form the attention representation $C$.

\subsection{Segmentation via Recursive Normalized Cuts}
\label{sec:ncuts}
We formulate segmentation as a graph partitioning problem using N-Cuts on a 3D voxel-wise graph, where each voxel is treated as a node and pairwise affinities define edge weights. Affinities are constructed from the feature representations described in Secs.~\ref{sec:representation}--\ref{sec:attention}, including image intensity $I$, latent representation $A$, diffusion feature $B$, and attention feature $C$.

All feature representations are resampled to a common voxel grid. A foreground mask is applied to select the $M$ valid voxels, resulting in feature matrices $X_i \in \mathbb{R}^{C_i \times M}$ for each feature type $i \in \{I,A,B,C\}$, where $C_i$ is the feature dimension of type $i$. Spatial coordinates $X_{xyz} \in \mathbb{R}^{3 \times M}$ are incorporated as additional features. The augmented feature representation is defined as
\begin{equation}
\tilde{X}_i = \left[ X_i;\ \alpha_i X_{xyz} \right],
\end{equation}
where $\alpha_i$ controls the contribution of spatial information.

Affinity matrices are computed using cosine similarity:
\begin{equation}
W^{(i)}_{pq}=
\frac12\left(
\cos(\tilde{x}^{(i)}_p,\tilde{x}^{(i)}_q)+1
\right),
\end{equation}
where $\tilde{x}^{(i)}_p$ and $\tilde{x}^{(i)}_q$ denote the augmented feature vectors of voxels $p$ and $q$ in $\tilde{X}_i$, respectively.


The final affinity matrix is obtained by weighted fusion:
\begin{equation}
W = w_I W^{(I)} + w_A W^{(A)} + w_B W^{(B)} + w_C W^{(C)},
\end{equation}
where $w_I$, $w_A$, $w_B$, and $w_C$ are fusion weights.

The symmetric normalized graph Laplacian is then computed as
\begin{equation}
L_{\mathrm{sym}} = \mathbf{I} - D^{-1/2} W D^{-1/2},
\end{equation}
where $\mathbf{I}$ is the identity matrix and $D$ is the degree matrix with diagonal entries
$D_{pp}=\sum_q W_{pq}$ for voxel $p$.

Segmentation is obtained from the eigenvector corresponding to the second smallest eigenvalue of $L_{\mathrm{sym}}$~\cite{shi2000normalized}. Recursive bipartitioning is applied until a minimum region size or maximum recursion depth is reached, yielding the final segmentation mask.

\section{Experiments and Results}
\subsection{Datasets and Preprocessing}
We evaluate the proposed framework on two public medical imaging datasets: BraTS2020~\cite{menze2015brats,bakas2017tcga,bakas2018brats} and FLARE22~\cite{ma2024flare22}. For BraTS2020, we use the T2-FLAIR modality and perform binary segmentation by merging all tumour subregions into a single foreground class. For FLARE22, we use abdominal CT scans for multi-organ segmentation. All volumes are preprocessed following the MAISI protocol~\cite{guo2025maisi}, including orientation standardization, modality-specific intensity normalization, and spatial resampling. Volumes are resized to a fixed resolution of $256 \times 256 \times 256$ for BraTS2020 and $256 \times 256 \times 128$ for FLARE22.

\subsection{Implementation Details}
We use a pretrained MAISI model~\cite{guo2025maisi} for feature extraction, with the VAE encoder and diffusion U-Net kept frozen during inference. The diffusion noise level is set to $t=0.2$. Diffusion features are extracted from the fourth decoder upsampling block, while self-attention features are extracted from the third downsampling block. PCA dimensionality is set to 512 for BraTS2020 and 64 for FLARE22. Spatial coordinates are incorporated into all feature representations with a weighting factor of $\alpha_i=0.01$ for every feature type $i$. The fusion weights are set to $(w_I,w_A,w_B,w_C)=(1.0,0.5,1.0,0.2)$. All feature representations are resampled to a common spatial resolution of $64 \times 64 \times 64$ for graph construction. Segmentation is performed using recursive normalized cuts with a maximum recursion depth of 5 and a minimum region size of 50 voxels. All hyperparameters were selected based on ablation experiments and fixed throughout the experiments.

\subsection{Post-processing and Evaluation}
After recursive N-Cuts segmentation (Sec.~\ref{sec:ncuts}), post-processing is applied to suppress small fragmented regions and reduce segmentation noise. For BraTS2020, connected component filtering and a $3 \times 3 \times 3$ mode filter are applied. For FLARE22, a $2 \times 2 \times 2$ mode filter is used for lightweight smoothing. The resulting segmentation masks are then upsampled to the original image resolution. Ground-truth annotations are used exclusively for evaluation and cluster-to-label matching, and are not involved in any stage of the segmentation process. Since cluster identities are arbitrary in unsupervised segmentation, predicted segments are matched to reference labels for metric computation. For BraTS2020, all tumour subregions are merged into a binary lesion mask, and clusters are matched based on overlap. For FLARE22, connected components are matched to organ labels using a greedy Dice-based assignment strategy.

\subsection{Representation Analysis}
Table~\ref{tab:representation} shows that the contribution of each representation varies across datasets. On BraTS2020, image intensity alone achieves strong performance, reflecting the distinctive appearance of tumour regions in MRI. In contrast, no single representation performs consistently well on FLARE22, indicating that abdominal CT segmentation requires more diverse anatomical cues. Integrating multiple representations generally improves performance over individual representations. Full integration achieves the best results on FLARE22 while remaining competitive with Intensity + VAE + Diffusion on BraTS2020. Since our objective is to develop a unified framework that generalises across imaging modalities without dataset-specific tuning, the fully integrated configuration is adopted because it provides the most consistent performance across both datasets.


\subsection{Segmentation Results}
We evaluate segmentation performance on two datasets against three baselines using Dice score (Dice; unitless in $[0,1]$), Intersection over Union (IoU; unitless in $[0,1]$), and Hausdorff distance (HD; mm). Higher Dice and IoU indicate better overlap with the reference segmentation, whereas lower HD indicates more accurate boundary localisation. We consider both 2D and 3D evaluation protocols. For our method and the intensity-based baseline, cluster-to-ground-truth matching is performed in both slice-wise and volume-wise settings. The intensity-based baseline applies 3D normalized cuts directly to the input image intensity and spatial coordinates, without using VAE, diffusion, or self-attention features. For DiffCut and CUTS, slice-wise matching is first performed, followed by global relabeling across adjacent slices using Hungarian matching~\cite{kuhn1955hungarian} based on Dice overlap. Reconstructed volumes are then evaluated using both 2D and 3D metrics. All results are reported as mean $\pm$ standard deviation, where $\uparrow$ and $\downarrow$ denote higher-is-better and lower-is-better performance, respectively.

Table~\ref{tab:brats_results} and Fig.~\ref{fig:qualitative} present the quantitative and qualitative results on the BraTS2020 dataset. Our method achieves the best performance under the 3D evaluation setting across all metrics, indicating improved volumetric consistency compared with the baselines. Although the intensity-based baseline performs competitively under 2D evaluation, incorporating complementary representations improves consistency across slices, resulting in higher Dice and IoU scores together with lower Hausdorff distance under 3D evaluation. DiffCut and CUTS show consistently lower performance across all metrics, likely because their slice-wise formulation does not explicitly exploit 3D spatial information.

To assess generalisation, we further evaluate the proposed framework on the FLARE22 abdominal CT dataset using the same experimental setting. Table~\ref{tab:abdominal_3d} presents the quantitative results under the 3D evaluation protocol. Overall performance is lower than on BraTS2020, reflecting the greater difficulty of unsupervised abdominal CT segmentation, where organs often exhibit similar intensity distributions and ambiguous boundaries.

Our method achieves the best overall performance, obtaining the highest average Dice and IoU scores across organs. While DiffCut performs better on some large organs, such as the liver and stomach, our method shows stronger performance on smaller and anatomically complex structures, including the adrenal glands and spleen. This suggests that integrating multiple representations provides complementary anatomical cues that are beneficial for challenging multi-organ segmentation tasks. Qualitative examples are shown in Fig.~\ref{fig:qualitative}.

CUTS consistently achieves lower performance under our evaluation protocol, possibly because its representations are learned from a specific training dataset and applied in a zero-shot manner. Overall, the results across BraTS2020 and FLARE22 demonstrate that the proposed framework generalises across MRI and CT datasets under a unified unsupervised setting.

\begin{table}[t]
\centering
\scriptsize
\caption{Representation analysis. Segmentation performance is reported as the average Dice score over 10 patients for each dataset.}
\label{tab:representation}
\setlength{\tabcolsep}{2pt}
\begin{tabular}{lcc}
\toprule
\textbf{Representation} & \textbf{BraTS2020} $\uparrow$ & \textbf{FLARE22} $\uparrow$ \\
\midrule
Intensity & 0.709 & 0.048 \\
VAE & 0.288 & 0.077 \\
Diffusion & 0.270 & 0.072 \\
Attention & 0.431 & 0.046 \\
\midrule
Intensity + VAE & 0.715 & 0.091 \\
Intensity + VAE + Diffusion & \textbf{0.721} & 0.087 \\
Intensity + VAE + Diffusion + Attention (Full Integration) & 0.703 & \textbf{0.092} \\
\bottomrule
\end{tabular}
\end{table}

\begin{figure}[!t]
    \centering
    \includegraphics[width=0.60\linewidth]{vis_brats.pdf}
    \vspace{-0.5em}
    \includegraphics[width=0.60\linewidth]{abdominal_1.pdf}
    \vspace{-0.5em}
    \caption{Qualitative results on BraTS2020 and FLARE22. For BraTS2020, {\color{blue}blue} contours denote ground truth and {\color{red}red} contours denote matched predictions. For FLARE22, organ boundaries are shown in different colors. Our method produces more coherent segmentations than the baselines.}
    \label{fig:qualitative}
    \vspace{-0.8em}
\end{figure}

\begin{table}[t]
\centering
\caption{Segmentation performance on BraTS2020 (30 subjects).}
\label{tab:brats_results}

\setlength{\tabcolsep}{1.5pt}
\renewcommand{\arraystretch}{0.95}
\resizebox{\linewidth}{!}{%
\begin{tabular}{@{}lcccccc@{}}
\toprule
& \multicolumn{3}{c}{2D evaluation}
& \multicolumn{3}{c}{3D evaluation} \\
\cmidrule(lr){2-4}\cmidrule(l){5-7}

\textbf{Method}
& Dice $\uparrow$ & IoU $\uparrow$ & HD $\downarrow$
& Dice $\uparrow$ & IoU $\uparrow$ & HD $\downarrow$ \\
\midrule

\textbf{Ours}
& \textbf{0.597$\pm$0.137} & 0.442$\pm$0.135 & \textbf{22.79$\pm$5.58}
& \textbf{0.704$\pm$0.136} & \textbf{0.558$\pm$0.152} & \textbf{26.44$\pm$6.95} \\

Intensity
& \textbf{0.597$\pm$0.137} & \textbf{0.443$\pm$0.138} & 23.50$\pm$5.42
& 0.692$\pm$0.136 & 0.544$\pm$0.149 & 28.95$\pm$8.60 \\

DiffCut
& 0.339$\pm$0.189 & 0.269$\pm$0.182 & 69.77$\pm$21.91
& 0.259$\pm$0.143 & 0.157$\pm$0.098 & 82.77$\pm$13.94 \\

CUTS
& 0.311$\pm$0.211 & 0.241$\pm$0.190 & 69.45$\pm$22.44
& 0.281$\pm$0.176 & 0.176$\pm$0.131 & 84.21$\pm$13.97 \\

\bottomrule
\end{tabular}%
}%
\vspace{-0.8em}
\end{table}



% # table 3 ###
\begin{table}[t]
\centering
\caption{Per-organ segmentation performance on FLARE22 (3D evaluation) over 38 subjects.}
\label{tab:abdominal_3d}

\setlength{\tabcolsep}{1.5pt}
\renewcommand{\arraystretch}{0.95}

\resizebox{\textwidth}{!}{%
\begin{tabular}{@{}lccc ccc ccc ccc@{}}
\toprule
& \multicolumn{3}{c}{Ours}
& \multicolumn{3}{c}{Intensity}
& \multicolumn{3}{c}{DiffCut}
& \multicolumn{3}{c}{CUTS} \\
\cmidrule(lr){2-4}\cmidrule(lr){5-7}\cmidrule(lr){8-10}\cmidrule(l){11-13}

\textbf{Organ}
& Dice $\uparrow$ & IoU $\uparrow$ & HD $\downarrow$
& Dice $\uparrow$ & IoU $\uparrow$ & HD $\downarrow$
& Dice $\uparrow$ & IoU $\uparrow$ & HD $\downarrow$
& Dice $\uparrow$ & IoU $\uparrow$ & HD $\downarrow$ \\
\midrule

Liver
& 0.666 & 0.501 & 92.522
& 0.657 & 0.492 & 93.363
& \textbf{0.745} & \textbf{0.596} & \textbf{91.874}
& 0.264 & 0.155 & 177.752 \\

Right Kidney
& 0.287 & 0.182 & 54.418
& \textbf{0.318} & \textbf{0.202} & \textbf{54.346}
& 0.232 & 0.134 & 112.176
& 0.051 & 0.026 & 185.078 \\

Spleen
& \textbf{0.489} & \textbf{0.342} & \textbf{51.800}
& 0.426 & 0.283 & 59.835
& 0.157 & 0.089 & 109.293
& 0.035 & 0.018 & 204.861 \\

Pancreas
& 0.102 & 0.057 & \textbf{46.262}
& 0.100 & 0.057 & 47.559
& \textbf{0.160} & \textbf{0.089} & 69.222
& 0.053 & 0.028 & 106.982 \\

Aorta
& 0.125 & 0.070 & \textbf{58.202}
& 0.067 & 0.036 & 62.921
& \textbf{0.172} & \textbf{0.095} & 77.477
& 0.001 & 0.000 & 185.949 \\

IVC
& \textbf{0.058} & \textbf{0.031} & \textbf{55.354}
& 0.027 & 0.014 & 60.599
& 0.050 & 0.026 & 72.062
& 0.003 & 0.001 & 173.019 \\

Right Adrenal
& \textbf{0.031} & \textbf{0.017} & \textbf{16.848}
& 0.025 & 0.014 & 18.546
& 0.005 & 0.003 & 86.869
& 0.005 & 0.003 & 120.705 \\

Left Adrenal
& 0.039 & 0.022 & \textbf{14.026}
& \textbf{0.052} & \textbf{0.029} & 18.415
& 0.003 & 0.001 & 60.841
& 0.005 & 0.003 & 129.172 \\

Gallbladder
& 0.062 & 0.037 & 39.982
& 0.027 & 0.016 & \textbf{35.853}
& \textbf{0.076} & \textbf{0.044} & 109.441
& 0.060 & 0.032 & 142.078 \\

Esophagus
& 0.037 & 0.020 & 40.287
& 0.042 & 0.024 & \textbf{21.580}
& \textbf{0.071} & \textbf{0.038} & 64.933
& 0.002 & 0.001 & 175.393 \\

Stomach
& 0.222 & 0.130 & 54.069
& 0.219 & 0.127 & \textbf{52.200}
& \textbf{0.396} & \textbf{0.250} & 87.749
& 0.042 & 0.022 & 184.195 \\

Duodenum
& 0.051 & 0.027 & \textbf{46.794}
& 0.050 & 0.027 & 46.890
& \textbf{0.082} & \textbf{0.044} & 66.802
& 0.022 & 0.011 & 144.506 \\

Left Kidney
& 0.344 & 0.226 & \textbf{45.050}
& \textbf{0.388} & \textbf{0.261} & 57.369
& 0.242 & 0.139 & 107.723
& 0.076 & 0.040 & 160.929 \\

\midrule
\textbf{Average}
& \textbf{0.193} & \textbf{0.128} & \textbf{47.355}
& 0.184 & 0.122 & 48.421
& 0.184 & 0.119 & 85.882
& 0.048 & 0.026 & 160.817 \\

\bottomrule
\end{tabular}
}%
\vspace{-0.8em}
\end{table}

%------------------------------------------------------------------
% Conclusion
%------------------------------------------------------------------
\section{Conclusion}
In this work, we propose an unsupervised framework for 3D medical image segmentation that combines multiple feature representations with recursive normalized cuts. The proposed framework integrates image intensity, VAE latent representations, diffusion features, and self-attention features for unsupervised segmentation. Experiments demonstrate generalisation across imaging modalities and show that integrating complementary representations improves segmentation performance, particularly for anatomically complex structures. We further observe a trade-off between preserving global anatomical structures and capturing fine-grained details. Future work will investigate organ-aware adaptive partitioning strategies and evaluate the framework on additional medical imaging datasets.

\begin{credits}
\subsubsection{\discintname}
The authors have no competing interests to declare that are relevant to the content of this article.
\end{credits}
\bibliographystyle{splncs04}
\bibliography{references}
\end{document}
