\documentclass{midl}
\usepackage{subcaption}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{makecell}

\jmlryear{2025}\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 156}
\editors{Accepted for publication at MIDL 2025}

\title[Diffusion Models for Pathological Spine Assessment]{Enhancing Low Back Pain Assessment with Diffusion Models for Lumbar Spine MRI Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}


% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Maria Monzon\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \orcid{0000-0003-3152-0909} \Email{mmonzon@ethz.ch}\\
\addr $^{1}$  Biomedical Data Science Lab, ETH Z\"urich, Z\"urich, Switzerland \\
\addr $^{2}$ Swiss Institute of Bioinformatics (SIB), Lausanne, 1015, Switzerland
\AND
\Name{Thomas Iff\midlotherjointauthor\nametag{$^{1}$}} \Email{thomas.iff@inf.ethz.ch}\\
\AND
\Name{Ender Konukoglu\nametag{$^{3}$}} \Email{ender.konukoglu@vision.ee.ethz.ch}\\
\addr $^{3}$ Computer Vision Lab, ETH Z\"urich, Z\"urich, Switzerland
\AND
\Name{Catherine R. Jutzeler\nametag{$^{1,2}$}} \Email{catherine.jutzeler@hest.ethz.ch}%\\
}

\begin{document}

\maketitle

\begin{abstract}
This study introduces a diffusion-based framework for robust and accurate semantic segmentation of lumbar spine MRI scans from patients with low back pain (LBP), regardless of whether the scans are T1- or T2-weighted.
We compared the proposed model, SpineSegDiff, with advanced models for segmenting vertebrae, intervertebral discs (IVDs), and the spinal canal using the SPIDER dataset.
The results showed that SpineSegDiff achieved a segmentation performance comparable to that of the state-of-the-art non-diffusion nnU-Net, particularly in improving the identification of degenerated IVDs.
In addition, the uncertainty maps generated by our model provide valuable insights for clinical review, enhancing the robustness and reliability of the segmentation results. 
The potential of diffusion models to enhance the diagnosis and management of LBP through more precise analysis of pathological spine MRI is underscored by our findings.
\end{abstract}

\begin{keywords}
Diffusion Models, Lumbar Spine MRI, presegmentation
\end{keywords}

\section{Introduction and Diffusion models for Medical Image Segmentation}
\label{sec:intro}

Low Back Pain~(LBP) is a leading cause of global disability \citep{Dionne2006-bpp}, expected to affect 800 million people by 2050 \citep{lbp-GBD2021}, imposing a significant economic burden on individuals and society \citep{Kent2005-elbp,Marto2023GlobalMeta-analysis}.
Diagnosis of LBP is particularly challenging due to the various pathophysiological mechanisms involved~\citep{Fourney2011ChronicApproach}, including social, genetic, biophysical and psychological factors. 
The multifaceted nature of LBP requires a comprehensive assessment, where lumbar spine Magnetic Resonance Imaging~(MRI) is a crucial diagnostic tool.
However, manual MRI interpretation is time-consuming and subject to inter-rater variability, potentially compromising diagnostic precision and consistency.

Convolutional neural networks~(CNNs) have shown promise in overcoming these challenges \citep{midl-maier2019} and thus enhancing the diagnostic value of lumbar spine MRI for a more quantitative interpretation \citep{mllbpmr-Galbusera2019}. %,rlbp-Dantoni2021}. 
Recent advances include methods for the automatic location of intervertebral discs~(IVD) or vertebrae~\citep{Lotus2013-lbMRDL,mllbpmr-Windsor2020,mllbpmr-He2021,Lessmann2019-lbMRDL} to detect vertebral fractures \citep{mllbpmr-Zhang2022},
%vertebral segmentation on CT images \citep{You2022-Trans3DUNet,Zhang2022-Apsegmentor}, 
to create synthetic lumbar MRI data \citep{mllbpmr-Han2018}, 
and segment MRI of the lumbar spine in different anatomical structures \citep{mllbpmr-Zhou2022,mllbpmr-Lu2018,mllbpmr-Li2021,mllbpmr-Mushtaq2022,Zheng2022-BianqueNet,od-SpiderGraaf2023}. %mllbpmr-Hess2023,}
%achieving the state-of-the-art segmentation is by nnU-Net~\citep{Isensee2020NnU-Net:Segmentation,od-SpiderGraaf2023}.

However, automatic spine segmentation is challenging due to the high intraclass similarity between vertebrae \citep{Wang2022-SpinsegBC, Sekuboyina2021-verse} and the large variability in the morphology of the intervertebral disc at all levels. 
Additionally, degenerative pathologies such as disc herniation, spinal stenosis, and vertebral fractures can significantly distort the normal anatomical structure \citep{od-MRSegChallengePang2018,od-SpiderGraaf2023}. 

Such anatomical distortions present significant challenges to conventional segmentation methodologies and highlight the need for new techniques to effectively handle this variability.
While medical image segmentation is traditionally a pixel-wise classification problem ~\citep{yao2023-mis},  it can be conceptualized as an image generation task, with a generative model learning the conditional distribution %$p(\mathbf{x}|\mathbf{y})$ to produce the segmentation map $\mathbf{x}$ from the input image $\mathbf{y}$. 
to output the segmentation map. % from the input image. 
Denoising diffusion probabilistic models (DDPM)~\citep{Ho2020DenoisingModels}, traditionally used for image generation, can be adapted for image segmentation \citep{Wolleb2021DiffusionEnsembles} by formulating it as a conditional generation problem $p(\mathbf{x}|\mathbf{y})$, in which the mask is a generated sample $\mathbf{x}$ conditioned on the input image $\mathbf{y}$.
Recently, diffusion models~\citep{HoDenoisingModels} showed promising results in medical image analysis~\citep{Kazerouni2023DiffusionSurvey,chung2022-diffres} and also in medical image segmentation~\citep{Liu2024diffSurvey,Xing2023Diff-UNet:Segmentation,Wolleb2021DiffusionEnsembles,kim2023-diffadvrep,DS-Wu2022} due to their ability to effectively capture the underlying data distributions~\citep{DhariwalDiffusionSynthesis} and handle noise and variability in medical images~\citep{LiOnModels}.
The inherent ability of diffusion models to model complex and noisy data distributions ~\citep{LiOnModels} may prove advantageous in capturing the variability in signal intensity, anatomy, and pathological features present in MRI scans of LBP patients. 

Motivated by the potential of diffusion models to handle variability in LBP MRI scans, this study makes the following contributions: (i)
we explore diffusion models for unified semantic segmentation of lumbar spine MRI, focusing on their effectiveness with T1- and T2-weighted scans;
(ii) we develop a 2D diffusion-based segmentation model for lumbar spine segmentation that handles pathological cases;
and (iii) we adapt a presegmentation strategy that combines initial segmentation and diffusion models for efficient segmentation model training.

\section{Methods: Diffusion models for Medical Image Segmentation}
This study presents a 2D diffusion-based framework to segment the central slice of lumbar spine MRI scans, aligned with the clinical evaluation of LBP. 
% Given that the diagnosis of LBP involves primarily evaluating the central slice of the lumbar spine, this study introduces a 2D diffusion-based framework for semantic segmentation of lumbar spine MRI scans in LBP patients. 
% This approach builds upon DDPMs, which are generative models that reconstruct data by reversing a gradual noising process. 
% This study presents a novel diffusion-based framework for semantic segmentation of lumbar spine MRI scans of patients with LBP. 
% we focus on developing a 2D  model.
% DDPMs are generative models that learn to reconstruct data by reversing a gradual noising process.  
It leverages DDPMs, generative models that reconstruct data by reversing gradual noise addition.
The forward process iteratively adds Gaussian noise to the mask sample $\mathbf{x}_0$ over $T$ timesteps, producing $\mathbf{x}_1, \dots, \mathbf{x}_T$:
\begin{equation}
\mathbf{x}_t = \sqrt{\bar{\alpha}_t} \mathbf{x}_0 + \sqrt{1-\bar{\alpha}_t} \boldsymbol{\epsilon}
\end{equation}
where $\bar{\alpha}_t = \prod_{s=1}^{t}(1-\beta_s)$ is determined by the increasing variance schedule $\beta_t$, and $\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ is standard Gaussian noise.
As the time step $t$ increases towards $T$, the mask loses its distinctive features, and $\mathbf{x}_T$ approaches an isotropic Gaussian distribution.
%This approach enhances the robustness o f the model's predictions by leveraging the image information to guide the denoising process.

The reverse diffusion process aims to progressively denoise Gaussian noise $\mathbf{x}_T \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ to recover the segmentation mask $\mathbf{x}_0$, conditioned on the MRI scan $\mathbf{y}$.
By parameterizing the transition probability  $p_\theta(\mathbf{x}_{t-1} \vert \mathbf{x}_t)$ as a Gaussian distribution ~\citep{Sohl-DicksteinDeepThermodynamics},
we can train a diffusion model by minimizing a loss function that compares the estimated noise $\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t, \mathbf{y})$ and actual noise $\boldsymbol{\epsilon}$ at each timestep $t$~\citep{ottl2024diffseg}:
\begin{equation}
L_t = \mathbb{E}_{t \sim [1, T], \mathbf{x}_0, \boldsymbol{\epsilon}_t} \Big[\|\boldsymbol{\epsilon}_t - \boldsymbol{\epsilon}_\theta(\sqrt{\bar{\alpha}_t}\mathbf{x}_0 + \sqrt{1 - \bar{\alpha}_t}\boldsymbol{\epsilon}_t, t,\mathbf{y})\|^2 \Big]
\end{equation}

There are two primary approaches to diffusion-based segmentation in medical imaging:
an iterative approach that predicts and removes noise $\boldsymbol{\epsilon}_t$ sequentially \citep{Wolleb2021DiffusionEnsembles}, and a direct inference method that generates the final segmentation mask $\hat{\mathbf{x}}_0$ from a partially noised input $\mathbf{x}_t$ \citep{DS-Wu2022,Xing2023Diff-UNet:Segmentation}.
Although the iterative denoising process is computationally intensive, sampling efficiency can be optimized using Denoising Diffusion Implicit Models (DDIM)~\citep{Song2020DenoisingModels}. 
DDIM enhances sampling by enabling generation at set timesteps, substantially reducing iterations and computational resources.

% Due to the iterative nature of 
% the process, sampling is computationally expensive.
% An alternative method, known as Diffusion Denoising Implicit Model (DDIM)~\citep{Song2020DenoisingModels} optimizes the sampling process by generating a sample at a specific time step $t$ based on the initial data point and noise, effectively skipping denoising steps.


% \begin{equation}
% \hat{s}_i = f(x_i; \theta)
% \end{equation}
% where \( \hat{s}_i \) is the predicted label pixel, \( x_i \) is the image pixel data, and \( \theta \) represents the model parameters.
%$$ \hat{Y} = g(X; \phi) $$
%where \( \hat{Y} \) is the generated segmentation label map, \( X \) is the input image, and \( \phi \) denotes the parameters of the generative model. 


% Leveraging the strengths of diffusion models in handling noise and variability in medical images, our framework combines a 2D diffusion-based segmentation model, SpineSegDiff, with a UNet architecture and a presegmentation strategy \citep{Guo2022AcceleratingSegmentation} to enhance efficiency and precision.

% The 2D design of SpineSegDiff addresses the memory limitations encountered by patch-based approaches, eliminating the need for sliding-window inference. By integrating these components, our framework aims to provide robust and accurate segmentation of lumbar spine MRI scans, 
% facilitating improved diagnosis and treatment planning for LBP patients.



\subsection{SpineSegDiff} 
\begin{figure*}[tbh]
  \begin{center}
\includegraphics[width=\textwidth]{figures/SpineSegDiff-full.pdf}
\end{center}

\caption{\small SpineSegDiff architecture overview (top): the 2D MRI scan ($\mathbf{y}$) is concatenated with the partially noised mask to generate the segmentation image $\mathbf{x}_{T}$.
The framework combines a multi-scale image encoder (IE) with a UNet-based denoising model.
Segmentation inference (bottom) employs a multi-sample step-weighted sum, simultaneously generating uncertainty-based heatmaps.}
\label{fig:spinsegdiff-overview}

\end{figure*}

The SpineSegDiff model presents a novel two-dimensional dual-encoder architecture specifically designed for the semantic segmentation of lumbar spine MRI scans, functioning on $320 \times 320$ images without the need for sliding-window inference.
The model architecture (Fig.~\ref{fig:spinsegdiff-overview}) combines a U-shaped backbone with a dedicated image encoder for multiscale feature extraction.
These dual-encoder features enhance the Denoising UNet embedding, enriching the model's representation capacity during training \citep{Xing2023Diff-UNet:Segmentation}.

% In contrast to traditional diffusion models that learn to denoise patterns, SpineSegDiff directly generates the output mask $\mathbf{x}_0$.  To optimize segmentation accuracy and penalize inaccuracies, SpineSegDiff is trained using a composite loss function. 
% This loss function combines the MSE denoising loss for reconstruction with Dice Loss and Binary Cross-Entropy Loss, which are computed between the predicted segmentation mask $\hat{\mathbf{x}}_f$ and the ground truth segmentation mask $\mathbf{x}_0$.


SpineSegDiff directly generates the segmentation mask rather than iteratively denoising patterns.
To enhance segmentation accuracy, SpineSegDiff uses a composite loss, integrating MSE denoising for reconstruction, Dice Loss for boundary alignment, and Binary Cross-Entropy for calibrating probabilities between the predicted mask $\hat{\mathbf{x}}_0$ and the ground truth $\mathbf{x}_0$.
The sampling process leverages the stochastic nature of DDIM, generating $S = 15$ intermediate samples and computing their arithmetic mean $\bar{\mathbf{x}}_t$ at each time step $t$.
The final prediction $\hat{\mathbf{x}}_f$ is calculated as a weighted sum of these means over the last $T_s = 10$ timesteps, with weights exponentially scaled by time:
\begin{equation}
\hat{\mathbf{x}}_f = \sum_{t=1}^{T_s} e^{-\alpha\left(\frac{T_s - t}{T_s}\right)} \cdot \bar{\mathbf{x}}_t
\quad\quad \text{ where }   \quad\quad
\bar{\mathbf{x}}_t = \frac{1}{S} \sum_{s=1}^{S} \mathbf{x}_{t}^{(s)}
\end{equation}
%
where $\mathbf{x}_{t}^{(s)}$ denotes the $s$-th sample at time step $t$ and $\alpha=T_s/2$ sets the decay rate, assigning more weight to later timestep predictions.
% SpineSegDiff creates the uncertainty maps by computing the variance across the latest $T_S = 15$ timesteps and $S = 10$ samples. 



\subsubsection{Uncertainty Based Heatmaps}


Diffusion models offer a key advantage through their probabilistic nature, enabling uncertainty estimation in predictions ~\citep{Wolleb2021DiffusionEnsembles}.
This study introduces a novel approach for visualizing uncertainty in models that directly infer segmentation masks $\hat{\mathbf{x}}_0$. %, leveraging the stochastic properties of the diffusion process
These uncertainty-based heatmaps may be useful for clinical assessment of LBP, as they highlight regions where the model's predictions may be less reliable in identifying degenerated spinal structures. 
%These heatmaps are crucial for identification of regions where the model's predictions may be less reliable. and in cases of LBP, where accurately identifying degenerated spinal structure is crucial. 
We generate uncertainty-based heatmaps by computing the entropy $\hat{\mathbf{h}}_t$ at each time step $t$ during the DDIM sampling:
\begin{equation}
\hat{\mathbf{h}}_t = - \bar{\mathbf{p}}_t \cdot  \log \left(  \bar{\mathbf{p}}_t\right) \quad  \quad  \text{where} \quad \quad \bar{\mathbf{p}}_t = \frac{e^{\bar{\mathbf{x}}_t }}{\sum_{c=1}^{K} e^{\bar{\mathbf{x}}_{t,c}}}
\end{equation}
%
and $\bar{\mathbf{p}}_t$ represents the softmax probability map normalized over the number of classes ($K=4$) for each diffusion timestep \(t\). The final uncertainty-based heatmap is then computed as the maximum of the per-timestep entropy heatmaps for each spinal structure:
\begin{equation}
\hat{\mathbf{H}} = \max_{t \in \{1, \dots, T_s\}} \hat{\mathbf{h}}_t
\end{equation}


\subsubsection{Presegmentation Training with nnU-Net}


SpineSegDiff training is significantly accelerated through the implementation of a presegmentation strategy \citep{Guo2022AcceleratingSegmentation}.
Unlike the original presegmentation approach, where the diffusion model learns to denoise patterns, our SpineSegDiff with presegmentation directly estimates the final segmentation mask $\mathbf{x}_0$.
The complete system, SpineSegDiff with presegmentation, is composed of nnU-Net followed by the SpineSegDiff architecture (see Appendix Figure~\ref{fig:presegmnetation}).
The workflow consists of two main stages: first, the initial segmentation $\hat{\mathbf{x}}_\text{pre}$ is predicted with the pre-trained baseline nnU-Net model \citep{Isensee2020NnU-Net:Segmentation}; second, SpineSegDiff takes this partially noised presegmentation as input and learns to recover the original segmentation mask ($\mathbf{x}_0$) through a shortened diffusion process.
This presegmentation strategy significantly reduces the number of diffusion steps needed, as SpineSegDiff only needs to refine an already reasonable segmentation rather than starting from random noise. 
% By leveraging the initial segmentation produced by nnUNet to guide subsequent refinement stages and reduce the number of diffusion steps required for accurate segmentation, this approach improves efficiency while retaining the advantages of diffusion models, such as the generation of uncertainty maps.



\section{Experimental Results}


\subsection{Dataset and Implementation Details}

The analysis used sagittal MRI of the lumbar spine from a multicenter cohort of 218 patients (63\% female) from the SPIDER dataset~\citep{od-SpiderGraaf2023} (Appendix~\ref{apd:A}).
Scans were then realigned to the RAS+ coordinate system for consistent orientation.
MRI volume intensities were normalized (98th percentile, scaled to 255), followed by resampling to 1\,mm resolution and resizing to $320 \times 320$ pixels.
Ground truth labels for semantic segmentation were created by combining the vertebrae annotations (starting from L5) and one-hot encoding them into three structures: spinal canal (SC), vertebral bodies (VB), and IVD.
%More details about the pathology presents in the dataset can be found in the appendix  \ref{apd:A}. 


% \subsection{Implementation Details} 

The models were trained on a high-performance cluster using one RTX 4090 GPU for the 2D models and a single V100 GPU for the 3D models. The models were implemented with the PyTorch and MONAI \citep{JorgeCardoso2022MONAI:Healthcare} frameworks.
The 2D models were trained and evaluated only on the central slice of the data, whereas the 3D models were trained and evaluated on the entire volume. %with dimensions 64x320x320.
%To establish a baseline for the presegmentation strategy, we trained an nnU-Net model on the SPIDER dataset. 
%The nnU-Net framework was chosen due to its state-of-the-art performance in medical image segmentation tasks. 
The optimal epochs for diffusion models were determined by the segmentation precision \citep{Bertels2019-DICE} in the first-fold validation set, where 2500 epochs were used for SpineSegDiff training. 
The number of training time steps of the diffusion models was set to $T = 1000$, with a linear variance noise schedule from $\beta_1 = 10^{-4}$ to $\beta_T = 0.02$. % \citep{Wolleb2021DiffusionEnsembles,Song2020DenoisingModels}.
The remaining training hyperparameters for all the compared models are summarized in Appendix~\ref{apd:A}.



\subsection{Evaluating Diffusion Models for MRI Contrast-Independent Segmentation}

\begin{table*}[tbp]
\floatconts
  {tab:results-constrast}%
  {\caption{
Quantitative comparison of segmentation performance using mean DICE score for spinal structures (spinal canal, vertebrae, and IVDs)
trained on distinct contrast configurations: T1-weighted only (T1w), T2-weighted only (T2w), and combined T1w+T2w dataset.
}}%
{
\small
\begin{tabular}{lccccc}
\hline
\textbf{Model}  &  \textbf{Data} & \textbf{Spinal Canal} & \textbf{Vertebrae} & \textbf{IVD}& \textbf{mDICE} \\
\hline
% SpineSegDiff &  T1w& 0.93 $\pm$ 0.04 & 0.91 $\pm$ 0.03 & 0.89 $\pm$ 0.05 & 0.908 \\
% SpineSegDiff & T2w& \textbf{0.93 $\pm$ 0.04 }& 0.92 $\pm$ 0.04 & \textbf{0.90 $\pm$ 0.04 }& \textbf{0.917} \\
SpineSegDiff &  T1w + T2w& 0.92 $\pm$ 0.04 & 0.92 $\pm$ 0.02 & 0.90 $\pm$ 0.05 & 0.913\\
SpineSegDiff w/o IE &T1w + T2w& 0.92 $\pm$ 0.04 & 0.91 $\pm$ 0.03 & 0.89 $\pm$ 0.05& 0.909 \\
Diff-UNet 2D & T1w + T2w& 0.92 $\pm$ 0.05 & 0.91 $\pm$ 0.03 & 0.89 $\pm$ 0.05 & 0.906\\
% \hline
IISDM & T1w + T2w & 0.90 $\pm$ 0.03 & 0.92 $\pm$ 0.05 &0.89 $\pm$ 0.04 & 0.903\\
% \hline
nnU-Net & T1w + T2w & 0.91 $\pm$ 0.03 & 0.92 $\pm$ 0.03 & 0.84 $\pm$ 0.05 & 0.890\\
\hline
SpineSegDiff &  T1w& 0.93 $\pm$ 0.04 & 0.91 $\pm$ 0.03 & 0.89 $\pm$ 0.05 & 0.908 \\
SpineSegDiff w/o IE & T1w& 0.92 $\pm$ 0.03 & 0.90 $\pm$ 0.04 & 0.88 $\pm$ 0.06& 0.905 \\
Diff-UNet 2D & T1w & 0.90 $\pm$ 0.02 & 0.92 $\pm$ 0.02 & 0.89 $\pm$ 0.04 & 0.908\\
IISDM &  T1w & 0.87 $\pm$ 0.10 & 0.91 $\pm$ 0.04 & 0.89 $\pm$ 0.05 & 0.890 \\
nnU-Net & T1w & 0.91 $\pm$ 0.02 & 0.91 $\pm$ 0.03 & 0.84 $\pm$ 0.06 & 0.887 \\
\hline

SpineSegDiff & T2w& \textbf{0.93 $\pm$ 0.04 }& 0.92 $\pm$ 0.04 & \textbf{0.90 $\pm$ 0.04 }& \textbf{0.917} \\
SpineSegDiff w/o IE & T2w& 0.92 $\pm$ 0.04 & 0.92 $\pm$ 0.03 & 0.90 $\pm$ 0.05& 0.913 \\
Diff-UNet 2D & T2w & 0.92 $\pm$ 0.02 &\textbf{ 0.93 $\pm$ 0.02} & 0.89 $\pm$ 0.03 & 0.917\\
IISDM & T2w & 0.86 $\pm$ 0.12 & 0.91 $\pm$ 0.04 & 0.89 $\pm$ 0.05 & 0.887\\
nnU-Net & T2w & 0.91 $\pm$ 0.03 & 0.92 $\pm$ 0.03 & 0.85 $\pm$ 0.06  & 0.893\\

% \hline

%nnU-Net &2D & T1w \& T2w& \textbf{0.92 $\pm$ 0.03} & 0.91 $\pm$ 0.02 & 0.86 $\pm$ 0.06 & 0.897\\
\hline
% \label{tab:results-constrast}
\end{tabular}
}
\end{table*}

The performance of the model was evaluated using the Dice score with 5-fold cross-validation. 
The cross-validation split ensured that scans from the same patients were consistently assigned to the same split.
%The DICE score, a metric commonly used to measure the similarity between two sets, is employed to evaluate the model's performance. 
18 series oblique MRI scans were excluded from the evaluation set but retained for training. % purposes to increase model robustness. 
%The data includes T1- and T2-weighted images with spatial resolutions from 3.3 x 0.33 x 0.33 mm to 4.8 x 0.90 x 0.90 mm. The dataset also contains segmentation masks for the lumbar vertebrae, intervertebral discs (IVD), and spinal canal.
Diffusion models' capability to segment both T1- and T2-weighted MRI scans without contrast-specific training was evaluated. 
The models were trained on individual T1w and T2w contrasts, as well as a combined dataset (T1w + T2w). 

For  baseline comparison, we trained nnU-Net~\citep{Isensee2020NnU-Net:Segmentation}, which also served as our presegmentation model, to assess its performance on multi-contrast segmentation without specific optimization.
We also compared our approach (``SpineSegDiff'') with several diffusion models: a 2D adaptation of the diffusion U-Net architecture \citep{Xing2023Diff-UNet:Segmentation} (``Diff-UNet 2D''), the Implicit Image Segmentation Diffusion Model (``IISDM'') \citep{Wolleb2021DiffusionEnsembles}, and the SpineSegDiff architecture without the additional image encoder (``SpineSegDiff w/o IE''). The experiment was expanded to 3D lumbar spine segmentation to evaluate whether 2D diffusion models can match nnU-Net in 3D (Appendix Tab.~\ref{tab:results-3d-constrast}).


The results, summarized in Table~\ref{tab:results-constrast}, indicate that diffusion models that directly infer segmentation masks achieve comparable or slightly better results than non-diffusion approaches across different contrast configurations.
The performance improvements are most notable in the segmentation of IVDs, which may be particularly challenging due to their variable morphology in pathological conditions.
Figure~\ref{fig:results} shows qualitative comparisons between SpineSegDiff and baseline methods trained on both contrasts (T1w + T2w). The uncertainty-based heatmaps (right) highlight regions where segmentation predictions exhibit higher entropy.

\subsubsection{Statistical Evaluation of Performance on Pathologies}

We analyzed how different pathologies affect the segmentation performance of the SpineSegDiff model, trained using T1w+T2w data across spine structures. 
Pathologies that are prevalent in lumbar spine conditions were considered due to their potential impact on model performance: Modic changes (bone marrow alterations), disc herniation (displacement of IVD material), spondylolisthesis (forward displacement of a vertebra), disc narrowing, and overall disc degeneration evaluated through the Pfirrmann grading.
The pathology distribution of the study cohort is detailed in Appendix~\ref{apd:A}.
 
Figure~\ref{fig:results-statistics} illustrates the statistical analysis, showing box plots of the Dice scores of patients with and without these conditions, together with t-test results that highlight the relationship between these pathologies and model performance. To address the issue of multiple comparisons, we applied the Benjamini-Hochberg p-value correction to control the false discovery rate at $\alpha = 0.05$.
The figure indicates that pathologies like spondylolisthesis and disc narrowing significantly impact segmentation. Upper endplate changes affected IVD segmentation ($p=0.0310$), while lower endplate changes impacted both IVD ($p=0.0120$) and SC ($p=0.0337$). Spondylolisthesis had widespread effects on SC ($p=0.0048$), VB ($p=0.0039$), and IVD ($p<0.0001$) segmentation scores. Disc herniation only significantly affected SC segmentation ($p=0.0263$), and disc degeneration significantly affected IVD segmentation ($p = 0.0003$). 
\begin{figure*}[bth]
    \centering
\includegraphics[width=0.95\linewidth]
{figures/results/performance_pathology_analysis_SpineSegDiff}


\caption{
Statistical analysis of segmentation performance in the presence of specific spinal pathologies in each subplot, including modic changes, spondylolisthesis, disc herniation, disc narrowing, disc bulging, and disc degeneration.
% The boxplots depict Dice scores for spinal structures: spinal canal (SC), vertebral bodies (VB), and intervertebral discs (IVD).
 Significant differences ($p<0.05$) were identified via
t-tests with Benjamini-Hochberg correction.}
\label{fig:results-statistics}
\end{figure*} 

\subsection{Presegmentation Diffusion Time Steps}
\begin{table*}[thb]
\centering
\floatconts
{tab:presegmentation}{
\caption{Evaluation of the diffusion timesteps ($T$) on presegmentation, with $T=0$ representing the baseline non-diffusion segmentation approach.}}
{
\small
\begin{tabular}{lcccccc}
\textbf{} & \textbf{$T=0$} & \textbf{$T=30$} & \textbf{$T=100$} & \textbf{$T=300$} & \textbf{$T=500$} & \textbf{$T=1000$} \\
\hline
\textbf{SC} & 0.91 $\pm$ 0.03 & \textbf{0.92 $\pm$ 0.05} & 0.92 $\pm$ 0.06 & 0.92 $\pm$ 0.06 & 0.92 $\pm$ 0.06 & 0.92 $\pm$ 0.07 \\
\textbf{VB} & 0.92 $\pm$ 0.03 & \textbf{0.92 $\pm$ 0.04} & 0.91 $\pm$ 0.04 & 0.91 $\pm$ 0.04 & 0.91 $\pm$ 0.04 & 0.91 $\pm$ 0.03 \\
\textbf{IVD} & 0.84 $\pm$ 0.05 & \textbf{0.89 $\pm$ 0.05} & 0.89 $\pm$ 0.06 & \textbf{0.89 $\pm$ 0.05} & 0.89 $\pm$ 0.06 & \textbf{0.89 $\pm$ 0.05} \\
\hline
\end{tabular}
}
\end{table*}
To evaluate the effectiveness of the presegmentation strategy, we conducted an ablation study to determine the optimal number of timesteps $T$ that balances computational efficiency and segmentation accuracy.
Various time-step configurations were tested, and the results were compared to a baseline model using 1000 steps starting from the noised presegmentation, as summarized in Table~\ref{tab:presegmentation}.
The ablation study revealed that the presegmentation strategy significantly reduced the number of time steps needed while maintaining the 2D segmentation performance.

%This finding underscores the potential of preconditioning to enhance the efficiency of diffusion models in medical image segmentation tasks, particularly for low back pain assessment from lumbar spine MRI scans.



\section{Discussion} 

\begin{figure*}[tbt]
    \centering
\includegraphics[width=\textwidth]{figures/results-overview.pdf}


  \caption{Visual comparison of the segmentation results on the central slice produced by the evaluated baseline and diffusion models for the three anatomical structures: spinal canal (blue), vertebrae (green), and intervertebral discs (red), along with the uncertainty maps for SpineSegDiff, where regions of higher uncertainty are denoted by darker red hues. The examples highlight challenging pathological cases with advanced disc degeneration (Pfirrmann grades 4--5), endplate irregularities, and disc narrowing. The uncertainty heatmaps effectively highlight regions of anatomical ambiguity, particularly at the boundaries of spinal structures.}
  \label{fig:results}\end{figure*}



Our findings demonstrate the potential of diffusion models, particularly SpineSegDiff, for accurate and efficient segmentation of the lumbar spine in MRI scans.  
The strong performance of these models, comparable to the state-of-the-art nnU-Net, highlights their ability to capture the complex anatomical structures and variability present in patients with low back pain. 
The improved segmentation of IVD is particularly noteworthy, as disc degeneration is a common cause of low back pain and accurate delineation of these structures is crucial for diagnosis and treatment planning.
Despite the  similar numerical performance of nnUNet 3D models, in many clinical settings, only 2D MRI scans of the lumbar spine may be available. 

Furthermore, a key advantage of SpineSegDiff is its ability to generate uncertainty-based heatmaps through stochastic sampling, which may provide valuable insights for quality assurance.  This approach effectively highlights anatomical regions where the model exhibits variable predictions, particularly at the boundaries of pathological structures. While visually informative, these heatmaps do not currently provide calibrated statistical confidence intervals; the uncertainty is represented qualitatively rather than as precise probability distributions.
Figure \ref{fig:results} shows SpineSegDiff segmentation errors in low-confidence areas. 

These uncertainty maps may help clinicians identify regions needing closer examination, minimizing the risk of missing subtle abnormalities.

The statistical analysis reveals that certain degenerative pathologies, particularly spondylolisthesis and disc narrowing, can substantially reduce the accuracy of SpineSegDiff. 
Spondylolisthesis and disc narrowing exhibit the highest t-statistics and the lowest p-values, which underscores their profound impact on segmentation accuracy relative to other pathological conditions. The presence of these conditions correlates with substantially lower Dice scores.

By leveraging the initial segmentation produced by nnU-Net, SpineSegDiff requires fewer diffusion steps to achieve accurate segmentation. The study of the number of diffusion time steps $T$ (Table~\ref{tab:presegmentation}) shows that the presegmentation strategy maintains high accuracy while significantly reducing computational requirements, making SpineSegDiff a more practical and efficient solution for lumbar spine segmentation tasks.

Nonetheless, it is important to acknowledge the limitations of our study and the challenges that remain for clinical translation. Despite the multicenter nature of the dataset, with varied sequences and acquisition parameters, further validation is necessary on larger and more diverse populations to establish the generalizability of the models.
Additionally, the computational requirements of diffusion models, even with the presegmentation strategy, may still pose barriers to widespread adoption, particularly in resource-limited settings. Future work should focus on further optimizing the models for efficiency and integration into clinical workflows.

\section{Conclusion}
We present diffusion-based models for segmenting lumbar spine MRI scans from patients with LBP. 
Diffusion models demonstrate promising performance that approaches state-of-the-art results, particularly in the challenging task of identifying degenerated IVD. 
Uncertainty-based heatmaps offer valuable insights into the segmentation process, thereby improving the reliability of segmentation results.
Through the implementation of a presegmentation strategy, SpineSegDiff maintains high accuracy while reducing the number of diffusion time steps, addressing computational limitations. 

To fully realize the potential of SpineSegDiff, future research should focus on two key areas.
First, efforts should be made to further optimize the model's computational efficiency, making it suitable for clinical implementation. 
Second, the model should be validated on larger and more diverse datasets to ensure its generalizability across different patient populations and imaging protocols. 
The present study demonstrates substantial potential, although the training of diffusion models requires significant computational resources. Nevertheless, the superior ability to quantify uncertainties intrinsic to diffusion models offers a promising approach for the detection of degenerative changes in IVD among patients suffering from LBP-related pathologies.



\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)

\midlacknowledgments{This research study retrospectively analyzed open-access human subject data, exempt from ethical approval according to the open-access license of the SPIDER dataset~\citep{od-SpiderGraaf2023}.
This project was supported by grant \#380 of the Strategic Focus Area ``Personalized Health and Related Technologies (PHRT)'' of the ETH Domain (Swiss Federal Institutes of Technology). The SpineSegDiff model code, along with training and evaluation scripts, and reproducibility instructions, is available at \url{https://gitlab.ethz.ch/BMDSlab/publications/low-back/diffusion-models-for-lumbar-spine-mri-segmentation}.}


\bibliography{midl25_156}


\appendix

\section{Dataset and Implementation Details}

\subsection{Degenerative Pathologies}
\label{apd:A}
This work uses the publicly available SPIDER dataset~\citep{od-SpiderGraaf2023} for training and evaluation, which includes MRI scans of the lumbar spine from 218 subjects with low back pain. 
The data includes T1- and T2-weighted images with spatial resolutions from $3.3 \times 0.33 \times 0.33$ mm to $4.8 \times 0.90 \times 0.90$ mm. 
For the 3D analysis, scans were resampled to a uniform spatial resolution of 1 mm and resized to $64 \times 320 \times 320$ voxels. 
The dataset comprises a multicenter collection of sagittal lumbar MRI obtained from four different hospitals in the Netherlands, with pathological conditions such as spondylolisthesis, disc herniation, and Modic changes. 
In our study, a degenerative pathology was counted as present for a patient if it manifested at any vertebral level; the resulting prevalence is summarized in Table~\ref{tab:pathologies}.

\begin{table}[thb]
\caption{Prevalence of degenerative pathologies in the SPIDER dataset}
\centering
\label{tab:pathologies}
\begin{tabular}{lr}
\textbf{Pathology} & \textbf{Patients (\%)} \\
\hline
Spondylolisthesis & 42 (19.27\%) \\
Disc Herniation & 72 (33.03\%)  \\
Modic Changes & 149 (68.35\%) \\
Endplate Changes & 177 (81.19\%) \\
Disc Narrowing & 193 (88.53\%) \\
Disc Bulging & 200 (91.74\%)\\
\end{tabular}
\end{table}

\subsection{SpineSegDiff Training}  


The SpineSegDiff model is trained using a composite loss function that combines Mean Squared Error (MSE), Dice Loss, and Binary Cross-Entropy (BCE) Loss. The total loss is formulated as $L_{\text{total}} = L_{\text{MSE}} + L_{\text{Dice}} + L_{\text{BCE}}$,
where the individual terms are defined as
$L_{\text{MSE}} = \frac{1}{N} \sum_{i=1}^N (\hat{x}_i - x_i)^2$, 
$L_{\text{Dice}} = 1 - \frac{2|\hat{X} \cap X|}{|\hat{X}| + |X|}$, and
$L_{\text{BCE}} = -\frac{1}{N} \sum_{i=1}^N [x_i \log(\hat{x}_i) + (1-x_i) \log(1-\hat{x}_i)]$. 
This loss optimizes the model for pixel accuracy (MSE), segmentation quality (Dice), and probabilistic output (BCE). The training hyperparameters are summarized in the table below:
\vspace{-0.7 cm}
\begin{table}[h]
  \caption{Training hyperparameters for SpineSegDiff}% (T1w, T2w, T1w+T2w)}
  \centering
  \small
  \begin{tabular}{lccc}
    \hline
    \textbf{Parameter} & \textbf{T1w, T2w, T1w+T2w}\\ 
    \hline
    \textbf{Image Size} & 320x320 \\
     \textbf{Epochs} & 2500 \\
    \textbf{Batch} & 4  \\
    \textbf{Optimizer} & AdamW  \\
    \textbf{Learning Rate} & 0.0001\\
  \textbf{Training Loss} & MSE +  Dice +  Cross Entropy \\
    \hline
  \end{tabular}
\end{table}


\subsection{SpineSegDiff with Presegmentation}

The presegmentation strategy~\citep{Guo2022AcceleratingSegmentation} is adapted to augment the efficiency and precision of the diffusion model's sampling process by furnishing an initial segmentation that directs subsequent refinement stages. 
An initial segmentation $\hat{\mathbf{x}}_\text{pre}$ is produced utilizing a pre-trained baseline model. 
This initial segmentation acts as a prior for the diffusion model, thereby diminishing the number of diffusion steps necessary to attain accurate segmentation. 
The diffusion segmentation is trained using SpineSegDiff. $\hat{\mathbf{x}}_\text{pre}$ undergoes partial noising via a cosine noise scheduler, which introduces noise at a more gradual rate compared to a linear scheduler, thus preserving a greater extent of image features.


\begin{figure*}[htbp]
\begin{center}
\includegraphics[width=0.9\textwidth]{figures/nnUnetSpineSegDiff-overview.pdf}
\end{center}
\vspace{-0.55cm}
\caption{(a) Training pipeline with presegmentation where nnU-Net generates initial mask $\hat{\mathbf{x}}_{\text{pre}}$ from MRI input $\mathbf{y}$, followed by partial noising to obtain $\mathbf{x}_{T}$ for diffusion training.}
\label{fig:presegmnetation}
\end{figure*}


\subsection{SpineSegDiff Sampling and Uncertainty Maps}
The computation of uncertainty maps in SpineSegDiff involves several key steps. 
Initially, $S$ segmentation masks are generated by repeatedly sampling the diffusion model over the last $T_s$ timesteps. The detailed procedure is given in Algorithm~\ref{alg:heatmaps}.



\begin{algorithm2e}

\caption{Uncertainty-based Heatmaps}
\label{alg:heatmaps}
\KwIn{MRI $\mathbf{y}$, Batch $N$, Number of Samples $S$}
\KwOut{Final prediction $\hat{\mathbf{x}}_f$}

\SetAlgoLined
\DontPrintSemicolon

\#Extract embeddings from the input MRI\;

$\mathbf{e_t} \gets \text{image\_encoder}(\mathbf{y})$\;

\#Generate $S$ number of  samples using DDIM sampling\;
$\mathcal{S} \gets \emptyset$\;
\For{$i \gets 1$ \KwTo $S$}{
    $\mathcal{S}.\text{append}(\text{DDIM\_sample}(\text{model}, (1, N, P_x, P_y), \mathbf{y}, \mathbf{e_t}))$\;
}

$\hat{\mathbf{x}}_f \gets \text{zeros}((1, N, P_x, P_y))$\;

\For{$t \gets 0$ \KwTo $T_s$}{
    $\mathbf{\bar{x}}_{t}  \gets 0$\;
    \For{$i \gets 1$ \KwTo $S$}{
        $\mathbf{\bar{x}}_{t} \gets \mathbf{\bar{x}}_{t} + \mathcal{S}[i][t]$\;
    }
    $\mathbf{\bar{x}}_{t}  \gets \mathbf{\bar{x}}_{t}/ S$\;
    
    \# Compute the entropy for each timestep\;
    $\hat{\mathbf{h}}_t \gets \text{compute\_entropy}(\mathbf{\bar{x}}_{t} )$\;
    
    \# Compute timestep scaling  weight \;
    
    $w_t \gets \exp( -\alpha (  T_s - t ) / T_s)$\;


    \For{$i \gets 1$ \KwTo $S$}{
        \# Final prediction as the weighted sum\;
        $\hat{\mathbf{x}}_f \gets \hat{\mathbf{x}}_f + w_t \cdot\mathbf{\bar{x}}_{t} $\;
    }
}
\Return $\hat{\mathbf{x}}_f$\;
\end{algorithm2e}



\section{Extended Results}\label{apd:B}


\subsection{Impact of Spinal Pathologies on Segmentation Performance: Statistical Analysis}

We further detail the analysis of the impact of spinal pathologies on the segmentation performance of the diffusion baseline model (IISDM) compared with the nnU-Net baseline. The plots presented in this appendix show Dice scores for different spinal structures, such as the spinal canal (SC), vertebral bodies (VB), and intervertebral discs (IVD), in various pathological conditions. Each plot compares the segmentation performance between patients with and without specific pathologies. The t-test statistics and p-values provided in the plots indicate the statistical significance of the differences observed for each pathology.
\begin{figure}[htbp]
  \centering
    \centering
    \includegraphics[width=0.48\linewidth]{figures/results/performance_pathology_analysis_IISDM}
    \includegraphics[width=0.48\linewidth]{figures/results/performance_pathology_analysis_nnUnet}
    \vspace{-0.5cm}
      \caption{Dice scores boxplot for IISDM (left) and nnUnet (right)}
    \label{fig:app-IISDM}
\end{figure}

Modic changes, disc narrowing, and spondylolisthesis exhibit substantial influences on segmentation performance, particularly for intervertebral discs (IVDs) and the spinal canal, as evidenced by high t-statistics and low p-values. 


\subsection{Results of 3D Segmentation}

\begin{table*}[htb]
\centering
\floatconts
{tab:results-3d-constrast}{
% \caption{Performance of the segmentation of the 3D spine volume }
\caption{A quantitative analysis of Dice scores for 3D spinal volume segmentation of spinal structures (including the spinal canal, vertebrae, and intervertebral disks) using nnU-Net3D and Diff-UNet models across T1-weighted (T1w), T2-weighted (T2w), and combined T1w + T2w imaging modalities.}}
{
\begin{tabular}{lcccccc}
\hline
\textbf{Model}  & \textbf{Dim}  & \textbf{Modality} & \textbf{Spinal Canal} & \textbf{Vertebrae} & \textbf{IVD}& \textbf{mDICE} \\

\hline
nnU-Net & 3D & T1w & 0.92 $\pm$ 0.09 & 0.93 $\pm$ 0.02 & 0.84 $\pm$ 0.04 & 0.897\\
nnU-Net &3D & T2w & 0.93 $\pm$ 0.03 & 0.93 $\pm$ 0.02 & 0.89 $\pm$ 0.04 & 0.917\\
nnU-Net &3D & T1w + T2w & \textbf{0.93 $\pm$ 0.02}  & \textbf{0.93 $\pm$ 0.02} & 0.89 $\pm$ 0.04 & 0.917 \\
%nnU-Net &3D & T1w \& T2w  &  \textbf{0.93 $\pm$ 0.02} &\textbf{0.93 $\pm$ 0.02} & \textbf{0.89 $\pm$ 0.04} & 0.917 \\

\hline

Diff-UNet & 3D & T1w & 0.92 $\pm$ 0.04 & 0.93 $\pm$ 0.04 &\textbf{0.91 $\pm$ 0.03} & \textbf{0.920} \\
Diff-UNet & 3D & T2w & 0.92 $\pm$ 0.02 & 0.93 $\pm$ 0.02 & 0.90 $\pm$ 0.03 & 0.917 \\
Diff-UNet & 3D & T1w + T2w & 0.92 $\pm$ 0.02  & 0.93 $\pm$ 0.02 & 0.89 $\pm$ 0.04 & 0.913 \\
\hline
\end{tabular}
}
\end{table*}
We present a comprehensive analysis of the segmentation performance on full-sized 3D spine volumes.
The training was conducted using complete 3D MRI datasets, allowing for a detailed evaluation of model capabilities in capturing complex anatomical structures. 
The results, as summarized in Table \ref{tab:results-3d-constrast}, highlight the segmentation accuracy across different spinal components, including the spinal canal, vertebrae, and intervertebral discs (IVD). 

Notably, the Diff-UNet model demonstrates superior performance in segmenting IVDs, achieving an IVD Dice score of 0.91 and the highest mean Dice score (mDICE) of 0.920 in the T1-weighted modality. These findings underscore the potential of 3D models to enhance segmentation precision, particularly in the context of detailed volumetric analysis. 



\section{Baseline Comparison Experiment Details}\label{apd:C}

\subsection{nnUnet Baseline}

The nnU-Net model  \citep{Isensee2020NnU-Net:Segmentation}  is trained using a highly automated and adaptable framework designed for semantic segmentation tasks which informs the configuration of multiple U-Net architectures tailored to the dataset's specific characteristics. 
The model training involves a multi-step process that includes preprocessing, model configuration, and training. nnU-Net employs a five-fold cross-validation strategy to ensure robust performance evaluation. The training utilizes various configurations, such as 2D and 3D full resolution, to optimize segmentation performance across different data modalities. The hyperparameters that were used in the training are summarized in the following table: 

\begin{table}[h]
\caption{Training Hyperparameters for nnUnet 2D and 3D}
\centering\small
\begin{tabular}{l|ccc|ccc}

\hline
\textbf{Parameter} & \textbf{T1w} & \textbf{T2w} & \textbf{T1w+T2w} & \textbf{T1w} & \textbf{T2w} & \textbf{T1w+T2w} \\
\hline
Patch Size & 256x64 & 256x64 & 256x64 & \makecell{ 56x 224x192} & \makecell{ 56x224x192} & \makecell{ 56x224x192} \\
Epochs & 250 & 250 & 250 &  250 & 250 & 250 \\
Batch & 197 & 197 & 197 &  2 & 2 & 2 \\
Optimizer & SGD & SGD & SGD &  SGD & SGD & SGD \\
Learning Rate & 0.01 & 0.01 & 0.01 &0.01 & 0.01 & 0.01 \\
Training Loss & Dice & Dice & Dice &  Dice & Dice & Dice \\
\hline
\multicolumn{1}{c}{} & \multicolumn{3}{c}{\textbf{nnUnet 2D}} & \multicolumn{3}{c}{\textbf{nnUnet 3D}} \\

\end{tabular}
\end{table}




\subsection{Implicit Image Segmentation Diffusion Model (IISDM)}
IISDM~\citep{Wolleb2021DiffusionEnsembles} follows DDPM training, adding Gaussian noise $\boldsymbol{\epsilon}_t \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ to the segmentation mask $\mathbf{x}_0$ at each timestep $t \in \{1, \ldots, T\}$ using a linear noise scheduler $\{\alpha_t \in (0, 1)\}_{t=1}^T$. For denoising, a U-Net architecture $f_{\boldsymbol{\theta}}$ estimates the noise $\hat{\boldsymbol{\epsilon}}_t = f_{\boldsymbol{\theta}}(\mathbf{x}_t, \mathbf{y}, t)$ at each timestep, where the noisy mask $\mathbf{x}_t$ is concatenated with the MRI image $\mathbf{y}$ to guide the generation of the segmentation mask.  The parameters $\boldsymbol{\theta}$ are optimized by minimizing the Mean Squared Error (MSE) loss between the estimated noise $\hat{\boldsymbol{\epsilon}}_t$ and the true noise $\boldsymbol{\epsilon}_t$.

In the inference or sampling process, the model takes random noise concatenated with the MRI input image ($\mathbf{x}_{y}$)  and iteratively denoises the segmentation mask by estimating the noise $\hat{\boldsymbol{\epsilon}}_t$ at each timestep. 
% The denoised segmentation mask $\hat{\mathbf{x}}_{t-1}$ is obtained using the following equation:
% \begin{equation}
% \hat{\mathbf{x}}_{b,t-1} = \frac{1}{\sqrt{\alpha_t}} \left( \hat{\mathbf{x}}_{b,t} - \frac{1 - \alpha_t}{\sqrt{1 - \alpha_t}}  \hat{\boldsymbol{\epsilon}}_t \right) + \sigma_t\mathbf{z}_{t} 
% \end{equation}
% where $\mathbf{z}_{t} \sim \mathcal{N}(0, I)$. 
During the sampling procedure, uncertainty maps are synthesized by exploiting the inherent stochasticity present in DDPMs. Through repeated application of IISDM, multiple segmentation masks are produced for a given input image. The uncertainty map is derived by assessing the pixel-wise variance of the masks.

\begin{table}[h]
\caption{Training Hyperparameters for IISDM}
\centering
\small
    \begin{tabular}{lccc}
    \hline
    \textbf{Hyperparameter} & \textbf{T1w, T2w, T1w+T2w }  \\ 
    \hline
    \textbf{Image Size} & 320x320 \\  
    \textbf{Epochs} & 2600 \\  
    \textbf{Batch} & 10  \\
    \textbf{Optimizer} & AdamW  \\  
    \textbf{Learning Rate} & 0.0001 \\ 
    \textbf{Training Loss} & MSE \\ 
\hline
\end{tabular}
\end{table}




\subsection{Diff-UNet}

Diff-UNet~\citep{Xing2023Diff-UNet:Segmentation} is a diffusion-based framework for volumetric medical image segmentation that directly infers the segmentation mask $\hat{\mathbf{x}}_0$ from a partially noised input $\mathbf{x}_{t}$.
The architecture includes an additional encoder to extract features from MRI scans, which enhances the model during training. The training uses a composite loss function that combines cross-entropy, Dice, and MSE losses to penalize segmentation errors. During the inference phase, Diff-UNet employs the DDIM~\citep{Song2020DenoisingModels} sampling algorithm, which accelerates the process while maintaining a balance between speed and accuracy.  
To further improve robustness, Diff-UNet performs step-uncertainty-based fusion during sampling $\mathbf{u}_i = -\bar{p}_i \log(\bar{p}_i)$, applied to the step-wise predictions to compile the final fused result mask $\hat{\mathbf{x}}$.  

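Due to the computational load of the diffusion models, the volumetric segmentation for Diff-UNet was performed patch-wise with input size $32\times120\times120$ and sliding-window inference with 0.5 overlap. The training hyperparameters are summarized in the following table:
% NOTE(review): the patch size stated here (32x120x120) differs from the table below (32x128x128) -- confirm which value is correct.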

\begin{table}[h]
\caption{Hyperparameters for DiffU-Net}
\centering
\small
\begin{tabular}{lccc}
\hline
\textbf{} & \textbf{T1w} & \textbf{T2w} & \textbf{T1w+T2w} \\
\hline
\textbf{Patch Size} & \makecell{ 32x128x128 }
 & \makecell{ 32x128x128 } & \makecell{ 32x128x128 } \\
\textbf{Epochs} & 1350 & 1400 & 700 \\
\textbf{Batch} & 4 & 4 & 4 \\
\textbf{Optimizer} & AdamW & AdamW & AdamW \\
\textbf{Learning Rate} & 0.0001 & 0.0001 & 0.0001 \\
\textbf{Training Loss} & \makecell{MSE + Dice  + CE} & \makecell{MSE + Dice + CE} & \makecell{MSE + Dice + CE} \\
\hline
\end{tabular}
\end{table}

\end{document}
