\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{titlesec}
\titlespacing*{\section}{0pt}{1em}{0.5em}
\titlespacing*{\subsection}{0pt}{0.75em}{0.25em}



\usepackage[margin=1in]{geometry}
\usepackage{graphicx}
\usepackage{wrapfig}
\usepackage[font=small,labelfont=bf]{caption} % optional but often improves captions

% Globally reduce extra spacing around wrapped figures:
\setlength{\intextsep}{0pt}   % vertical gap above/below the wrapfigure
\setlength{\columnsep}{10pt}  % horizontal gap between figure and text
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage[table,xcdraw]{xcolor}
\usepackage{colortbl}


\usepackage{mwe} % to get dummy images

\jmlrvolume{-- 018}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[MedDelinea]{\textbf{\textit{MedDelinea}}: Scalable and Efficient Medical Image Segmentation via Controllable Diffusion Transformers}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
  % \midlauthor{\Name{Gayatri Deshmukh} \Email{dgayatri9850@gmail.com}\\
  %  \Name{Onkar Kishor Susladkar} \Email{onkarsus13@gmail.com}\\
  %  \Name{Debesh Jha} \Email{debesh.jha@usd.edu} \\
  %  \Name{Elif Keles} \Email{elif.keles@northwestern.edu}\\
  %  \Name{Ertugrul Aktas} \Email{h.ertugrulaktas@gmail.com}\\
  %  \Name{Alpay Medetalibeyoglu} \Email{alibeyoglualpay@gmail.com}\\
  %  \Name{Daniela P. Ladner} \Email{dladner@nm.org}\\
  %  \Name{Amir A. Borhani} \Email{amir.borhani@northwestern.edu}\\
  %  \Name{Gorkem Durak} \Email{gorkem.durak@northwestern.edu}\\
  %  \Name{Ulas Bagci} \Email{ulas.bagci@northwestern.edu}\\
  %  \addr Machine and Hybrid Intelligent Lab, Northwestern University, Chicago, IL \\
  %  \addr University of South Dakota
  %  }

  \midlauthor{
\Name{Gayatri Deshmukh\textsuperscript{†}} \Email{dgayatri9850@gmail.com}\\
\Name{Onkar Kishor Susladkar\textsuperscript{†}} \Email{onkarsus13@gmail.com}\\
\Name{Debesh Jha\textsuperscript{*}} \Email{debesh.jha@usd.edu} \\
\Name{Elif Keles\textsuperscript{†}} \Email{elif.keles@northwestern.edu}\\
\Name{Ertugrul Aktas\textsuperscript{†}} \Email{h.ertugrulaktas@gmail.com}\\
\Name{Alpay Medetalibeyoglu\textsuperscript{†}} \Email{alibeyoglualpay@gmail.com}\\
\Name{Daniela P. Ladner\textsuperscript{†}} \Email{dladner@nm.org}\\
\Name{Amir A. Borhani\textsuperscript{†}} \Email{amir.borhani@northwestern.edu}\\
\Name{Gorkem Durak\textsuperscript{†}} \Email{gorkem.durak@northwestern.edu}\\
\Name{Ulas Bagci\textsuperscript{†}} \Email{ulas.bagci@northwestern.edu}\\
\addr \textsuperscript{†}Machine and Hybrid Intelligence Lab, Northwestern University, Chicago, IL \\
\addr \textsuperscript{*}University of South Dakota
    }


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

% \author{
%     Gayatri Deshmukh \\
%     Northwestern University \\
%     {\tt\small gayatri.deshmukh@northwestern.edu}
%     \and
%     Onkar Kishor Susladkar \\
%     Northwestern University \\
%     {\tt\small onkar.susladkar@northwestern.edu}
%     \and
%     Vandan Gorade \\
%     Northwestern University \\
%     {\tt\small vandan.gorade@northwestern.edu}
%     \and
%     Debesh Jha \\
%     Northwestern University \\
%     {\tt\small debesh.jha@northwestern.edu}
%     \and
%     Ulas Bagci \\
%     Northwestern University \\
%     {\tt\small ulas.bagci@northwestern.edu}
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship


\begin{document}


\maketitle

% \begin{document}

%%%%%%%%% ABSTRACT
\begin{abstract}
   % In this paper, we introduce \textbf{\textit{MedDelinea}}, a novel architecture tailored for medical image segmentation that addresses key challenges such as generalization, efficiency, scalability, and adaptability. \textbf{\textit{MedDelinea}} introduces a fusion of Controllable Neural Networks, inspired by ControlNet, into the Diffusion Transformers (DiT) framework, offering a task adaptability. Leveraging pre-training on large-scale datasets, \textbf{\textit{MedDelinea}} shows transferability, requiring minimal fine-tuning to adapt to new datasets. It also achieves notable zero-shot performance, with results comparable to those of fully fine-tuned models, setting a new standard for adaptability with limited task-specific retraining.
   %  The modular and scalable design ensures efficiency while maintaining high segmentation accuracy across a wide variety of medical imaging challenges. Our extensive evaluations, both quantitative and qualitative, show that \textbf{\textit{MedDelinea}} consistently outperforms existing models in terms of adaptability and performance.

  %  This paper presents \textbf{\textit{MedDelinea}}, a new architecture for medical image segmentation that integrates a controllable module, inspired by \textit{ControlNet}, into the Diffusion Transformers (DiT) framework. This design addresses common challenges in segmentation tasks, including limited labeled data, varying image modalities, and the need for accurate boundary delineation. \textbf{\textit{MedDelinea}} is pre-trained on a large-scale medical dataset, reducing  overfitting risks and facilitating transfer across diverse imaging scenarios with minimal fine-tuning. Its modular structure allows efficient scaling and preserves segmentation quality in both supervised and zero-shot settings. Empirical evaluations on multiple datasets show improvements over existing segmentation approaches.


We introduce \textbf{\textit{MedDelinea}}, a novel medical image segmentation architecture that leverages a controllable module, drawing inspiration from \textit{ControlNet}, within the Diffusion Transformers (DiT) framework. By doing so, we effectively address three key challenges inherent to segmentation tasks: (1) limited availability of labeled data, (2) variability in image modalities, and (3) the need for precise boundary delineation. \textbf{\textit{MedDelinea}} is pre-trained on a large-scale medical dataset, thereby mitigating overfitting risks and enabling efficient transfer across diverse imaging scenarios with minimal fine-tuning requirements. The modular design of \textbf{\textit{MedDelinea}} facilitates scalable and efficient computation, while maintaining high-quality segmentation performance in both supervised and zero-shot settings. Through extensive empirical evaluations on multiple datasets, we demonstrate that \textbf{\textit{MedDelinea}} outperforms existing state-of-the-art segmentation approaches, showcasing its potential for robust and accurate medical image analysis. The code is publicly available at: \href{https://github.com/Onkarsus13/MedDelinea}{https://github.com/Onkarsus13/MedDelinea}.





\end{abstract}

%%%%%%%%% BODY TEXT
\section{Introduction}
\label{sec:intro}
   % Medical image segmentation, the automatic delineation of anatomical structures and lesions, is fundamental to numerous clinical applications, including image registration, quantification, and image-guided surgery. Despite advances in deep learning-based models, challenges remain, including limited generalization across tasks, task-specific architectures, inefficiency, scalability issues, and susceptibility to mode collapse.
   
    \begin{wrapfigure}{R}{0.4\textwidth}
     % \vspace{-1em}
      \centering
      \includegraphics[width=\linewidth]{imgs/loss_curve.png}
      \caption{Learning curve of different models across iterations. MedDelinea consistently demonstrates the lowest loss throughout the training process.}
      \label{fig:learning_curve}
   %   \vspace{1em} 
    \end{wrapfigure}
    
    Generalization is an unsolved problem in medical image segmentation, and various pre-training strategies have been explored to address this challenge. Self-pretraining \cite{tang2022self}, where a model is pre-trained on the same dataset used for downstream tasks, helps the model adapt to the specific characteristics of medical images but risks overfitting, limiting generalization to new data. In contrast, pre-training on large, diverse datasets with transformer-based or CNN models improves generalization but requires extensive fine-tuning for specific medical tasks, reducing efficiency. This trade-off between generalization and efficiency hinders the practical use of these models. To address these limitations, we design a model with four key attributes: Transferability, Efficiency, Modularity, and Scalability. A transferable model requires minimal fine-tuning across related tasks. As shown in Fig.~\ref{fig:learning_curve}, \textbf{\textit{MedDelinea}} demonstrates gradual loss reduction with fewer iterations compared to others. Efficiency focuses on high accuracy with minimal annotation. Modularity enables reuse across tasks without major modifications, while Scalability ensures performance remains stable as data size or complexity grows.

    %%%%%%%%%%%%%%%%%%%%%%%%%%%
    % \begin{figure}[htbp]
    %      % Caption and label go in the first argument and the figure contents
    %      % go in the second argument
    %     \floatconts
    %       {fig:loss_curve}
    %       {\caption{Learning curve of different models across iterations. \textbf{\textit{MedDelinea}} consistently demonstrates the lowest loss throughout the training process, indicating faster convergence and improved performance in comparison to the other models.}}
    %       {\includegraphics[width=0.5\linewidth]{imgs/loss_curve.png}}
    % \end{figure}

    % \begin{wrapfigure}{r}{0.45\textwidth}  % 'r' for right side, 'l' for left
    %     \vspace{-10pt}                     % Adjust vertical space if needed
    %     \centering
    %     \includegraphics[width=\linewidth]{imgs/loss_curve.png} % your image file
    %     \caption{Learning curve of different models across iterations. MedDelinea consistently demonstrates the lowest loss through the training process.}
    %     \label{fig:learning_curve}
    % \end{wrapfigure}


    %%%%%%%%%%%%%%%%%%%%%%%%%%%
    
    In this study, we introduce \textbf{\textit{MedDelinea}}, a Large-Scale Supervised Pre-training model with Diffusion Transformer, aimed at achieving our primary goals in abdominal segmentation. \textbf{\textit{MedDelinea}} features two key innovations: 1) Novel Architecture Design: Departing from the traditional UNet backbone, it incorporates elements from ControlNet, Latent Diffusion Models (LDMs), and Vision Transformers. This replaces UNet-style diffusion models with transformers and leverages pre-trained blocks from ControlNet to balance domain-agnostic and domain-specific learning, resulting in a modular, scalable, and efficient architecture. 2) Pretraining Strategy: By using a ControlNet-based framework and performing supervised pretraining on a large-scale dataset, \textbf{\textit{MedDelinea}} enhances transferability and generalization, providing precise control over segmentation outputs to meet the demands of complex anatomical structures. Our contributions are summarized as follows:
    \begin{itemize}\itemsep0pt
        \item \textbf{Innovative Model Architecture}: We propose a novel architecture, \textbf{\textit{MedDelinea}}, which integrates controllable neural networks inspired by \textit{ControlNet} with \textit{Diffusion Transformers (DiT)}. This hybrid model is specifically designed to address challenges in medical image segmentation such as generalization to other datasets and minimal fine-tuning requirements.

        %\item \textbf{Pretraining Strategy}: 

        \item \textbf{High Segmentation Accuracy with Zero-Shot and Few-Shot Learning}: \textbf{\textit{MedDelinea}} leverages a large-scale dataset (ATLAS-8k) for pretraining, enabling enhanced transferability and zero-shot and few-shot segmentation capabilities. This approach allows the model to perform effectively on new datasets without additional fine-tuning.
       Extensive evaluations demonstrate that \textbf{\textit{MedDelinea}} achieves state-of-the-art performance in segmenting complex anatomical structures with high precision, contributing to improved diagnostics and treatment planning in medical imaging.

        \item \textbf{Robustness, Modularity, and Scalability}: The architecture of \textbf{\textit{MedDelinea}} is both modular and scalable, making it adaptable to diverse medical imaging tasks without significant computational overhead or architectural modifications. \textbf{\textit{MedDelinea}} shows consistent and robust performance across various imaging modalities, including CT and MRI, and effectively handles multiple object segmentation within these modalities, highlighting its versatility and efficiency.
    \end{itemize}
    % \begin{itemize}\itemsep0pt
        % \item We introduce \textbf{\textit{MedDelinea}}, an innovative architecture that integrates controllable neural networks inspired by ControlNet with Diffusion Transformers (DiT) for medical image segmentation, providing a scalable and efficient solution.
        % \item \textbf{\textit{MedDelinea}} leverages large-scale pre-training, achieving impressive generalization across diverse datasets with minimal fine-tuning, and demonstrates strong zero-shot performance comparable to fine-tuned models.
        % \item Through extensive quantitative and qualitative evaluations, \textbf{\textit{MedDelinea}} outperforms state-of-the-art models in terms of segmentation accuracy, boundary precision, and overall robustness of both CT, MRI and multiple objects. 
        % multiple datasets.
    % \end{itemize}
%-------------------------------------------------------------------------

\subsection{Related Work}
\label{sec:related_work}

%\textbf{Traditional Approaches:} Medical image segmentation plays a critical role in clinical diagnostics, involving the classification of pixels in medical images (e.g., CT and MRI) to delineate anatomical structures or abnormalities. Convolutional neural network (CNN) based models, particularly UNet \cite{ronneberger2015u} and its variants \cite{oktay2018attention, zhang2019net, lou2021dc}, have been widely used due to their encoder-decoder architecture, which captures both local and global context. Despite their success, CNNs struggle with long-range dependencies, limiting their effectiveness in segmenting complex anatomical structures. This has led to a shift towards transformer-based methods.

%\textbf{Shift to Transformer-Based Models:} To address the limitations of CNNs, transformer-based models have been introduced. TransUNet \cite{chen2021transunet} combines the strengths of CNNs for local feature extraction with transformers for long-range dependency modeling. This hybrid architecture significantly improves segmentation accuracy. Other models like Swin-UNet \cite{cao2022swin} and DS-TransUNet \cite{lin2022ds} further refine this approach by incorporating Swin Transformers, which capture multi-scale features using shifted windows and hierarchical structures. While these models enhance the understanding of complex anatomical features, they introduce challenges like increased computational complexity and reliance on large-scale pre-training datasets.

\textbf{Emergence of Diffusion Models in Medical Image Segmentation:} Recently, diffusion models have emerged as a promising approach in medical image segmentation by framing the task as a generative modeling problem. Denoising Diffusion Probabilistic Models (DDPMs) \cite{ho2020denoising} have been adapted for segmentation, where they treat it as conditional image generation. PD-DDPM \cite{guo2023accelerating} improves efficiency by leveraging pre-segmentation results, while BerDiff \cite{chen2023berdiff} uses a Conditional Bernoulli Diffusion Model for binary segmentation tasks. MedSegDiff \cite{wu2024medsegdiff} utilizes diffusion processes to handle uncertain cases, such as tumor boundaries. However, many of these models still rely on UNet backbones, limiting their ability to capture global features. MedSegDiff-V2 \cite{wu2024medsegdiff} overcomes this by integrating transformers with diffusion models, enhancing global feature capture and improving segmentation quality.

\textbf{Pre-Training in Medical Imaging:} Pre-training methods have become increasingly important in medical image segmentation, especially due to the scarcity of large annotated datasets. These approaches improve model performance by learning meaningful representations from large, unlabeled datasets. Self-supervised learning (SSL) is a prominent pre-training strategy that helps models learn from unlabeled data through proxy tasks. SimCLR \cite{ali2021self}, for example, applies contrastive learning to discover underlying data structures, while Swin-UNETR \cite{tang2022self} uses proxy tasks like masked volume inpainting and rotation prediction to capture anatomical patterns in large CT datasets. Another model, UNetFormer \cite{hatamizadeh2022unetformer}, reconstructs masked tokens to learn from visible input regions. These methods offer significant improvements in performance when labeled data is scarce. Task-specific self-pre-training has also gained attention. Zhou et al. \cite{zhou2023self} utilize a Masked Autoencoder (MAE) for Vision Transformers (ViT), pre-training models by reconstructing masked images, enabling them to learn rich representations without extensive labeled datasets. This addresses a key challenge in medical imaging, where annotated data is often limited.

\textbf{Existing Gaps and Research Motivation:} Despite advances in transformer-based diffusion models like \textit{MedSegDiff-V2}, challenges persist. Traditional reliance on UNet backbones limits these models in capturing global anatomical structures. SSL and self-pre-training methods show promise, but they are prone to overfitting to proxy tasks, which limits the generalization power of the networks and makes them less adaptable across tasks.

To address these gaps, we propose a novel hybrid architecture combining \textit{Diffusion Transformers (DiT)}~\cite{peebles2023scalable} with \textit{ControlNet}~\cite{zhang2023adding}. This approach integrates the generative strengths of diffusion models with transformers' ability to capture global features. Pre-training on large datasets followed by fine-tuning for specific segmentation tasks optimizes accuracy, complexity, and adaptability, offering a more scalable and efficient solution for medical image segmentation across diverse clinical applications.

\section{Method}
\label{sec:methodology}
%Figure~\ref{fig:architecture} illustrates our proposed architecture of controllable Diffusion Transformers (DiT) for medical image segmentation. This architecture capitalizes on the robustness of pre-trained models, which have been trained on extensive datasets, and adapts them to the segmentation task with minimal fine-tuning or no training at all. By leveraging the ability of DiT to model complex data distributions and integrating a control mechanism via ControlNet, the model seamlessly incorporates external conditioning data to guide the segmentation process.
An overview of our proposed Controllable Diffusion Transformers (DiT) architecture for medical image segmentation is presented in Fig.~\ref{fig:architecture}. By leveraging the robust representations learned by pre-trained models on large-scale datasets, our approach enables efficient adaptation to the segmentation task with minimal fine-tuning or even zero-shot learning. The DiT framework is particularly well-suited for modeling complex data distributions inherent to medical images, while the integration of a control mechanism via \textit{ControlNet} allows for seamless incorporation of external conditioning data to inform the segmentation process.

%\textbf{\textbf{\textit{MedDelinea}} Pre-training:} We conducted pre-training on the ATLAS-8k \cite{qu2024abdomenatlas} dataset due to its diverse anatomical coverage, enabling robust feature extraction for medical image segmentation. Initially, both the Control DiT Module and Pre-trained DiT Module weights were initialized from a pre-trained model, as described in \cite{peebles2023scalable}. However during our pre-training, only the Control DiT Module and the final layer of the VAE Decoder were trained, while the rest of the model's weights were frozen to preserve generalization. The same strategy was applied during fine-tuning on other medical datasets, with the key difference being that the Control DiT Module and Decoder weights were initialized from the model pre-trained on the ATLAS-8k dataset, rather than directly from \cite{peebles2023scalable}. This selective fine-tuning approach ensures efficient transfer learning while preserving generalization capabilities. Detailed descriptions of each module are provided in subsequent sections.

\textbf{Pre-training:} Our pre-training strategy for \textbf{\textit{MedDelinea}} involves leveraging the diverse anatomical coverage of the ATLAS-8k dataset \cite{qu2024abdomenatlas} to learn robust features for medical image segmentation. We initialize the Control DiT Module and Pre-trained DiT Module weights from a pre-trained model, as described in \cite{peebles2023scalable}. During pre-training, we adopt a selective training approach, where only the Control DiT Module and the final layer of the VAE Decoder are updated, while the remaining model weights are frozen to preserve their generalization capabilities. This strategy is also employed during fine-tuning on other medical datasets, with the key distinction being that the Control DiT Module and Decoder weights are initialized from the pre-trained \textbf{\textit{MedDelinea}} model, rather than directly from \cite{peebles2023scalable}. By doing so, we facilitate efficient transfer learning while maintaining the model's ability to generalize. A detailed description of each module is provided in the subsequent sections. Additionally, further pre-training details are included in the supplementary section.

\begin{figure}[htbp]
         % Caption and label go in the first argument and the figure contents
         % go in the second argument
        \floatconts
          {fig:architecture}
          {\caption{Architecture Diagram of \textbf{\textit{MedDelinea}}. $\alpha_1$, $\alpha_2$, $\gamma_1$, $\gamma_2$, $\beta_1$ and $\beta_2$ represent scale and shift parameters.}}
          {\includegraphics[width=\linewidth]{imgs/arch_diagram.png}}
\end{figure}
    
% \begin{figure*}[]
%         \centering
%         \includegraphics[width=\textwidth]{imgs/arch_diagram.png}
%         \caption{Architecture Diagram of \textbf{\textit{MedDelinea}}. $\alpha_1$, $\alpha_2$, $\gamma_1$, $\gamma_2$, $\beta_1$ and $\beta_2$ represents scale and shift parameters.}
%         \label{fig:architecture}
% \end{figure*}
    
\textbf{Input Processing and Latent Space Transformation:} %The model takes as input a segmentation mask ($I_M$) and an organ image ($I_O$). These inputs are encoded into their respective latent representations, $Z_m$ and $Z_o$, using a frozen pretrained Variational Autoencoder (VAE) \cite{kingma2013auto}. The decision to freeze the VAE during this step ensures that the latent representations are consistent and invariant, reducing the risk of overfitting and ensuring stable performance across diverse inputs.
Our model processes two input streams: a segmentation mask ($I_M$) and an organ image ($I_O$). These inputs are encoded into their respective latent representations, $Z_m$ and $Z_o$, using a pre-trained Variational Autoencoder (VAE) \cite{kingma2013auto} with frozen weights. The VAE is chosen for its ability to learn compact and informative latent representations; by keeping its parameters fixed during this encoding step, we ensure that the resulting latent representations remain consistent and invariant, which in turn reduces the risk of overfitting and promotes stable performance across diverse input data.

The latent vector $Z_o$ corresponding to the CT/MRI image is further processed through a zero convolution layer, where the weights and biases are initialized to zero. This operation ensures that the initial influence of $Z_o$ on the subsequent diffusion process is neutral, allowing the model to adaptively learn the most relevant features from the condition image as the training progresses. On the other hand, $Z_m$ undergoes a forward diffusion process, resulting in a noisy latent vector $Z_{mt}$. 
Specifically, $Z_{mt} = \sqrt{\bar{\alpha}_t}\, Z_{m0} + \sqrt{1 - \bar{\alpha}_t}\, \epsilon_t$, where $\epsilon_t \sim \mathcal{N}(0, I)$. This noisy representation $Z_{mt}$ is then concatenated with $Z_o$, producing a conditional latent vector $Z_{ct}$. This combination allows the model to embed the condition-specific information into the diffusion process, effectively guiding the generation of segmentation masks.

\textbf{Control DiT Module ($P_\phi$):} During fine-tuning, the weights are initialized from the Control DiT Module pre-trained on the ATLAS-8k dataset. The Control DiT Module is designed to manipulate the diffusion process by injecting conditional information derived from $Z_{ct}$. This module first converts $Z_{ct}$ into a sequence of tokens using a patchification process, where the input is divided into patches, each linearly embedded into a token of dimension $d$. This transformation allows the model to process the spatial information within the input in a manner compatible with transformer architectures.

%Subsequently, as shown in Fig.~\ref{fig:architecture}, the tokens are passed through a series of $K$ transformer blocks. Each block incorporates a timestep embedding $t_e$, which conditions the scale and shift module within the block, ensuring that the model adapts its operations according to the diffusion stage. This conditioning mechanism is crucial for maintaining coherence in the generated images as the model progresses through the diffusion timesteps. The Control DiT Module facilitates interaction with the Pre-trained DiT Module by passing feature vectors at each residual connection point of DiT transformer blocks. These vectors act as guides, providing external conditioning that informs the denoising operations within the pre-trained DiT transformer blocks. Importantly, the parameters of the Control DiT Module remain trainable, enabling the model to learn and adapt to the specific segmentation task. 
As illustrated in Fig.~\ref{fig:architecture}, the tokens are then processed through a sequence of $K$ transformer blocks. Each block incorporates a timestep embedding $t_e$, which serves as a conditioning signal for the scale and shift module within the block. This enables the model to adapt its operations dynamically according to the diffusion stage, thereby maintaining coherence in the generated images throughout the diffusion process. Furthermore, the Control DiT Module interacts with the Pre-trained DiT Module by injecting feature vectors at each residual connection point of the DiT transformer blocks. These feature vectors act as external guides, providing conditional information that informs the denoising operations within the pre-trained DiT transformer blocks. Notably, the parameters of the Control DiT Module remain trainable, allowing the model to learn task-specific adaptations and refine its performance on the target segmentation task.

\textbf{Pre-trained DiT Module ($\epsilon_\theta$):} The Pre-trained DiT Module plays a critical role in refining the noisy latent representation $Z_{mt}$. This module, which consists of $N$ DiT transformer blocks, is entirely frozen during training. By freezing these blocks, the architecture preserves the pre-learned knowledge from the extensive training on large datasets \cite{peebles2023scalable}, ensuring that the model retains its ability to generalize while focusing on the new dataset.

Similar to the Control DiT Module, the Pre-trained DiT Module begins by patchifying the input $Z_{mt}$ into a sequence of tokens. These tokens are then processed by the frozen transformer blocks. The scaling and shifting within these blocks are guided by the feature vectors passed from the Control DiT Module. This design allows the model to fine-tune the diffusion process based on the condition-specific information, ensuring that the denoising operation aligns with the desired segmentation outcome. After processing through the transformer blocks, the latent vector is approximated to its initial state $Z_{m0}$ using Tweedie’s formula \cite{efron2011tweedie}, which is a well-established method in denoising processes for estimating the clean latent from noisy distribution. The resultant vector is then passed to the VAE Decoder. 
\begin{equation}
\hat{Z}_{m0}(Z_{mt}) = \frac{Z_{mt} - \sqrt{1 - \bar{\alpha}_t} \cdot \epsilon_\theta(Z_{mt}, t, P_\phi(Z_{mt}, Z_o, t))}{\sqrt{\bar{\alpha}_t}},
\end{equation}
where $\epsilon_\theta$ refers to the Pre-trained DiT module and $P_\phi$ refers to the Control DiT module.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table*}[t]
    \caption{Mean segmentation metrics for the ATLAS-8k dataset.}
    \label{tab:dice_mean_scores}
    \centering
    \tiny
    \begin{tabular}{lccc}
    \hline
    \textbf{Methods} & \textbf{mDSC (↑)} & \textbf{mHD95 (↓)} & \textbf{mASSD (↓)} \\
    \hline
    \textbf{UNet \cite{ronneberger2015u}}           & 75.32 & 33.42 & 14.32 \\
    \textbf{TransUNet \cite{chen2021transunet}}     & 78.92 & 31.78 & 12.11 \\
    \textbf{SynergyNet \cite{gorade2024synergynet}} & 80.32 & 30.28 & 10.43 \\
    \textbf{ControlNet \cite{zhang2023adding}}      & 83.77 & 28.73 & 10.01 \\
    \textbf{MedSegDiff \cite{wu2024medsegdiff}}    & 85.07 & 27.99 & 9.27  \\
    \textbf{VQDiffusion \cite{gu2022vector}}        & 84.92 & 28.01 & 9.33  \\
    \textbf{DiT \cite{peebles2023scalable}}         & 85.48 & 28.24 & 9.01  \\
    \textbf{\textit{MedDelinea}}                    & \textbf{87.98} & \textbf{25.92} & \textbf{7.98} \\
    \hline
    \end{tabular}
\end{table*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\textbf{Image Reconstruction and Loss Functions:} The VAE Decoder, which remains trainable, reconstructs the final image from the refined latent space. The trainability of the VAE Decoder is crucial as it allows the model to adapt to the specific characteristics of the dataset, ensuring that the segmentation masks generated are both accurate and relevant to the dataset.

Two primary loss functions are employed to guide the training process:

\textbf{\textit{a) Diffusion Loss ($L_{diff}$):}} This loss function is applied at the end of the Pre-trained DiT Module. It is designed to predict the noise present in the latent noisy vector $Z_{mt}$. The diffusion loss is typically defined as the mean squared error (MSE) between the predicted and actual noise, encouraging the model to accurately denoise the latent representation at each timestep:
$L_{diff} = \left\| \epsilon_t - \epsilon_\theta(Z_{mt}, t, P_\phi(Z_{mt}, Z_o, t)) \right\|_2^2$.

\textbf{\textit{b) Segmentation Loss ($L_{seg}$):}} This loss function is applied to the final output image to ensure that the generated segmentation masks not only look visually plausible but also precisely delineate the anatomical structures of interest. This is particularly critical in medical imaging, where segmentation accuracy is paramount for diagnostic and treatment planning purposes: $L_{seg} = L_{BCE} \left( D( \hat{Z}_{m0} ), I_M \right) + L_{Dice} \left( D( \hat{Z}_{m0} ), I_M \right)$, where $D$ is the VAE Decoder.

Combining these loss functions, the total loss on which the model is trained is: $L_{t} = L_{diff} + L_{seg}$. The model is trained to produce high-quality segmentation masks that meet both aesthetic and clinical standards. The architecture’s design, which integrates pre-trained knowledge with adaptive control mechanisms, ensures that the model is both robust and flexible, capable of performing complex segmentation tasks with minimal additional training.

%------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[]
        \centering
        \tiny
        \caption{Quantitative results on BTCV, AMOS, Cirr600+, and PanSegData datasets.}
        \label{tab:mean_quant_results_multi_dataset_table}
        \begin{tabular}{ccccccccc}
        \hline
                                           & \multicolumn{2}{c}{\textbf{BTCV}}      & \multicolumn{2}{c}{\textbf{AMOS}}      & \multicolumn{2}{c}{\textbf{Cirr600+}} & \multicolumn{2}{c}{\textbf{PanSegData}} \\
        \multirow{-2}{*}{\textbf{Methods}} & \textbf{mDSC (↑)} & \textbf{mHD95 (↓)} & \textbf{mDSC (↑)} & \textbf{mHD95 (↓)} & \textbf{Dice (↑)} & \textbf{HD95 (↓)} & \textbf{Dice (↑)}  & \textbf{HD95 (↓)}  \\ \hline
        \multicolumn{9}{c}{\cellcolor[HTML]{EFEFEF}Models fine-tuned after ATLAS-8k pre-training}                                                                                                              \\ \hline
        \textbf{UNet}                      & 70.88             & 30.98              & 69.01             & 33.21              & 56.78             & 39.77             & 54.18              & 31.45              \\
        \textbf{TransUnet}                 & 73.47             & 28.77              & 71.65             & 31.72              & 60.11             & 36.83             & 61.92              & 33.93              \\
        \textbf{SynergyNet}                & 79.65             & 23.29              & 77.67             & 31.28              & 64.76             & 33.47             & 67.32              & 31.77              \\
        \textbf{ControlNet}                & 82.96             & 21.15              & 78.19             & 30.76              & 66.84             & 31.55             & 69.92              & 29.98              \\
        \textbf{MedSegDiff}                & 83.55             & 20.19              & 80.77             & 28.75              & 78.92             & 29.57             & 70.56              & 29.12              \\
        \textbf{VQDiffusion}               & 82.71             & 24.11              & 80.13             & 28.82              & 77.88             & 31.07             & 70.12              & 29.91              \\
        \textbf{DiT}                       & 83.41             & 23.21              & 81.45             & 27.17              & 78.18             & 30.07             & 70.13              & 29.91              \\
        \textit{\textbf{MedDelinea}}       & \textbf{87.01}    & \textbf{19.00}     & \textbf{82.19}    & \textbf{26.56}     & \textbf{87.63}    & \textbf{26.95}    & \textbf{72.03}     & \textbf{28.01}     \\ \hline
        \multicolumn{9}{c}{\cellcolor[HTML]{EFEFEF}Zero-shot learning: Direct inference on test sets after ATLAS-8k pre-training}                                                                              \\ \hline
        \textbf{UNet}                      & 59.76             & 36.67              & 54.57             & 39.21              & 47.82             & 38.72             & 49.12              & 33.47              \\
        \textbf{TransUnet}                 & 61.19             & 34.22              & 60.01             & 36.54              & 48.77             & 37.14             & 52.77              & 32.88              \\
        \textbf{SynergyNet}                & 63.44             & 32.18              & 63.47             & 35.52              & 61.21             & 35.19             & 58.16              & 31.09              \\
        \textbf{ControlNet}                & 67.52             & 30.88              & 67.29             & 34.78              & 62.32             & 34.28             & 66.92              & 30.22              \\
        \textbf{MedSegDiff}                & 73.27             & 28.92              & 70.64             & 33.01              & 64.12             & 33.01             & 68.77              & 29.88              \\
        \textbf{VQDiffusion}               & 77.54             & 29.01              & 69.64             & 33.42              & 63.92             & 33.34             & 68.17              & 29.92              \\
        \textbf{DiT}                       & 77.79             & 26.81              & 76.47             & 31.42              & 64.01             & 33.19             & 68.17              & 28.52              \\
        \textit{\textbf{MedDelinea}}       & \textbf{80.01}    & \textbf{23.11}     & \textbf{78.67}    & \textbf{29.51}     & \textbf{65.17}    & \textbf{30.12}    & \textbf{70.52}     & \textbf{27.81}     \\ \hline
        \end{tabular}
    \end{table}



%-------------------------------------------------------------------------

\section{Experiments and Results}
\label{sec:experiments}
Implementation details (Section~\ref{sec:training_details}) and information about the datasets and evaluation metrics (Section~\ref{sec:dataset_and_metrics}) are provided in the Appendix.
%-------------------------------------------------------------------------

   %%%%%%%%%%%%%%%%%%%%%%%%%%%%
    \begin{figure}[htbp]
        \centering
        \includegraphics[width=\linewidth]{imgs/combined_figure_amos_btcv.pdf}
        % \includegraphics[width=\linewidth]{imgs/btcv_qualatitative_3.pdf}}
        \caption{Segmentation performance of various models on AMOS and BTCV datasets under fine-tuned and zero-shot settings, highlighting the region of interest (RoI) for detailed anatomical segmentation analysis.}
        \label{fig:amos_btcv}
    \end{figure}
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%
   %The quantitative analysis of \textbf{\textit{MedDelinea}}'s performance, as detailed in Table \ref{tab:mean_quant_results_multi_dataset_table} highlights the model's effectiveness in both fine-tuning and zero-shot learning scenarios. After fine-tuning on each dataset post ATLAS-8k pre-training, \textbf{\textit{MedDelinea}} consistently achieved the highest metrics, such as an mDSC of 87.01\% on BTCV and 82.19\% on AMOS. This strong performance can be attributed to \textbf{\textit{MedDelinea}}'s ability to capture detailed anatomical structures and maintain precise boundaries, which is crucial for accurate segmentation. In the zero-shot learning scenario, where direct inference was performed on test sets without additional fine-tuning, \textbf{\textit{MedDelinea}} continued to demonstrate robust generalization capabilities. It led the metrics with an mDSC of 80.01\% on BTCV and 78.67\% on AMOS, indicating its effective transfer of learned features from pre-training to new datasets. This suggests that \textbf{\textit{MedDelinea}}'s architecture is adaptable, allowing it to perform well even in unfamiliar conditions.
    %The reason \textbf{\textit{MedDelinea}} excels in zero-shot learning is likely due to its combination of ControlNet and Diffusion Transformers, which enables it to generalize complex features learned during pre-training. Other models like MedSegDiff and VQDiffusion, while competitive in fine-tuning, fall short in zero-shot scenarios. For example, MedSegDiff shows a drop in boundary accuracy, as indicated by higher mHD95 values, particularly on datasets with challenging conditions like Cirr600+. Similarly, VQDiffusion, although performing well in some metrics, struggles with maintaining surface accuracy across diverse datasets, as seen in higher mASSD values on PanSegData.
    \subsection{Quantitative Comparison}

    
    %%%%%%%%
    \textbf{Fine-tuning and Zero-shot Learning Scenarios:} Table~\ref{tab:mean_quant_results_multi_dataset_table} presents the quantitative analysis of \textbf{\textit{MedDelinea}}'s performance in fine-tuning and zero-shot learning scenarios. After fine-tuning on each dataset post ATLAS-8k pre-training, \textbf{\textit{MedDelinea}} achieves the best results, including an mDSC of 87.01\% on BTCV and 82.19\% on AMOS. In zero-shot learning, \textbf{\textit{MedDelinea}} demonstrates robust generalization capabilities, leading with an mDSC of 80.01\% on BTCV and 78.67\% on AMOS. This suggests that \textbf{\textit{MedDelinea}}'s architecture is adaptable and effective in transferring learned features from pre-training to new datasets.

The combination of ControlNet and Diffusion Transformers in \textbf{\textit{MedDelinea}} enables it to generalize complex features and excel in zero-shot learning. In contrast, other models like MedSegDiff and VQDiffusion, while competitive in fine-tuning, struggle with boundary accuracy and surface accuracy in zero-shot scenarios, as indicated by higher mHD95 and mASSD values on challenging datasets.

\textbf{Few-shot Comparison:} Table~\ref{tab:few_shot_learning} presents the few-shot learning performance of different models, including \textbf{\textit{MedDelinea}}, fine-tuned on 1\%, 10\%, and 25\% of the training data after ATLAS-8k pre-training. \textbf{\textit{MedDelinea}} consistently outperforms other models (ControlNet and DiT) across varying amounts of training data, achieving the highest mDSC scores and lowest mHD95 values on both BTCV and AMOS datasets. Notably, with only 1\% of the training data, \textbf{\textit{MedDelinea}} achieves mDSC scores of 77.52\% (BTCV) and 78.92\% (AMOS), demonstrating its robustness and efficiency in few-shot learning scenarios. As the training data increases, \textbf{\textit{MedDelinea}} further solidifies its lead, showcasing its superior scalability and effectiveness.
    
\textbf{ATLAS-8k Dataset:} Table~\ref{tab:dice_mean_scores} shows that our proposed method, \textbf{\textit{MedDelinea}}, outperforms previous baselines with superior scalability. The incorporation of the Control DiT Module in \textbf{\textit{MedDelinea}} mitigates the issue of weight distribution shift caused by learning new segmentation tasks, leading to improved scalability and adaptability. This confirms that \textbf{\textit{MedDelinea}} is a more reliable and efficient model for segmentation tasks. Organ-specific Dice scores for ATLAS-8k are provided in Appendix Table~\ref{tab:dice_organ_scores}.

Details of the zero-shot segmentation performance analysis of the models on MRI and CT are provided in the supplementary section.
    
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%
    \begin{table}[t]
        \centering
        \tiny
        \caption{Few-shot learning results: Models fine-tuned on 1\%, 10\%, and 25\% of training data after ATLAS-8k pre-training, followed by inference on test sets.}
        \label{tab:few_shot_learning}
        \begin{tabular}{lcccc}
        \hline
                                       & \multicolumn{2}{c}{\textbf{BTCV}}      & \multicolumn{2}{c}{\textbf{AMOS}}      \\
    \multirow{-2}{*}{\textbf{Methods}} & \textbf{mDSC (↑)} & \textbf{mHD95 (↓)} & \textbf{mDSC (↑)} & \textbf{mHD95 (↓)} \\ \hline
        \multicolumn{5}{c}{\cellcolor[HTML]{EFEFEF} \textit{Results on 1\% of Training Data}}                                          \\ \hline
        \textbf{ControlNet}                & 72.12             & 27.52              & 69.03             & 34.02              \\
        % \cite{zhang2023adding}
        \textbf{DiT}                       & 74.22             & 25.11              & 76.67             & 32.18              \\
        % \cite{peebles2023scalable}
        \textbf{\textbf{\textit{MedDelinea}}}                    & \textbf{77.52}    & \textbf{23.42}     & \textbf{78.92}    & \textbf{29.92}     \\ \hline
        \multicolumn{5}{c}{\cellcolor[HTML]{EFEFEF}\textit{Results on 10\% of Training Data}}                                         \\ \hline
        \textbf{ControlNet}                & 72.82             & 27.03              & 71.28             & 33.05              \\
        % \cite{zhang2023adding}
        \textbf{DiT}                       & 74.97             & 24.55              & 77.01             & 31.51              \\
        % \cite{peebles2023scalable}
        \textbf{\textbf{\textit{MedDelinea}}}                    & \textbf{78.59}    & \textbf{22.78}     & \textbf{79.23}    & \textbf{28.79}     \\ \hline
        \multicolumn{5}{c}{\cellcolor[HTML]{EFEFEF}\textit{Results on 25\% of Training Data}}                                         \\ \hline
        \textbf{ControlNet}                & 74.02             & 26.38              & 73.34             & 31.27              \\
        % \cite{zhang2023adding}
        \textbf{DiT}                       & 75.11             & 24.01              & 79.34             & 29.16              \\
        % \cite{peebles2023scalable}
        \textbf{\textbf{\textit{MedDelinea}}}                    & \textbf{81.16}    & \textbf{21.59}     & \textbf{80.46}    & \textbf{27.49}     \\ \hline
        \end{tabular}
    \end{table}
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %The results presented in Table \ref{tab:few_shot_learning} highlight the performance of different models, including \textbf{\textit{MedDelinea}}, in a few-shot learning scenario where models were fine-tuned on 1\%, 10\%, and 25\% of the training data after ATLAS-8k pre-training, followed by inference on the test sets of BTCV and AMOS datasets. \textbf{\textit{MedDelinea}} consistently outperforms other models, including ControlNet and DiT, across varying amounts of training data. When fine-tuned on just 1\% of the training data, \textbf{\textit{MedDelinea}} achieved the highest mDice scores of 77.52\% on BTCV and 78.92\% on AMOS, along with the lowest mHD95 values, indicating its strong performance even with minimal data. In contrast, DiT and ControlNet showed lower mDice scores and higher mHD95 values, particularly with ControlNet struggling more on AMOS, reflecting challenges in accurately segmenting boundaries with limited data. As the training data increased to 10\% and 25\%, \textbf{\textit{MedDelinea}} further solidified its lead, achieving mDice scores of 81.16\% on BTCV and 80.46\% on AMOS with the lowest mHD95 values, showcasing its superior scalability and effectiveness. DiT improved with more data but still fell short of \textbf{\textit{MedDelinea}}'s performance,while ControlNet, despite some improvement, continued to lag behind in both segmentation accuracy and boundary precision, as indicated by higher mHD95 values. These comparisons highlight \textbf{\textit{MedDelinea}}'s robustness and efficiency in few-shot learning scenarios, making it more reliable than other models for medical image segmentation, especially in data-constrained environments.
    
   \textbf{Inference Time and Model Parameter Comparison:} A detailed comparison of the number of parameters and inference time across baseline models and \textbf{\textit{MedDelinea}} is provided in Table~\ref{tab:parameters_comparison}. To ensure a fair comparison, all inference evaluations were conducted on a single NVIDIA A6000 GPU. \textbf{\textit{MedDelinea}} consists of 227 million parameters and achieves state-of-the-art (SOTA) performance with competitive inference time relative to other models. While \textbf{\textit{MedDelinea}} has a slightly larger parameter count compared to TransUNet and DiT, it maintains efficient inference time while significantly outperforming these models in segmentation accuracy. Notably, it is less computationally expensive than MedSegDiff, VQ-Diffusion, and ControlNet while delivering superior performance. This comparison underscores \textbf{\textit{MedDelinea}}'s ability to balance model complexity and inference efficiency, making it a highly effective solution for medical image segmentation.

\begin{table}[]
    \centering
    \scriptsize
    \caption{Comparison of model parameters (in millions) and inference time (in seconds) across various baseline methods and MedDelinea.}
    \label{tab:parameters_comparison}
    \begin{tabular}{ccc}
    \hline
    Methods     & Parameters (M) & Inference Time (s) \\ \hline
    UNet        & 33           & 0.25           \\
    TransUnet   & 182          & 0.76           \\
    SynergyNet  & 23           & 0.46           \\
    ControlNet  & 400          & 6.32           \\
    MedSegDiff  & 372          & 8.89           \\
    VQDiffusion & 340          & 3.11           \\
    DiT         & 187          & 2.72           \\
    MedDelinea  & 227          & 2.88           \\ \hline
    \end{tabular}
\end{table}
  
    
    \subsection{Qualitative Comparison}
    \label{sec:qualitative_results}

Qualitative results in Fig.~\ref{fig:amos_btcv} demonstrate \textbf{\textit{MedDelinea}}'s superior segmentation performance on BTCV and AMOS datasets in both fine-tuned and zero-shot scenarios. Compared to other models (e.g., MedSegDiff, ControlNet, VQ-Diffusion, and DiT), \textbf{\textit{MedDelinea}} consistently delivers accurate segmentations with fewer artifacts and mis-segmentations. Notably, \textbf{\textit{MedDelinea}} excels in delineating organ boundaries, particularly in the pancreas and spleen (BTCV) and liver, kidneys, and pancreas (AMOS). MedDelinea captures fine details and complex structures, achieving superior boundary delineation and segmentation performance. See Appendix Fig. 6 for attention maps.

% Attention maps in Fig.~\ref{fig:attention_map} further highlight \textbf{\textit{MedDelinea}}'s targeted attention mechanism, which focuses on fine-grained details and captures complex structures more effectively than other models. This superior attention enables \textbf{\textit{MedDelinea}} to achieve better boundary delineation and segmentation performance, making it a robust and generalizable model for clinical applications.



    % \begin{figure}[htbp] 
    %     \centering
    %     \includegraphics[width=\linewidth]{imgs/Attmap2.png} 
    %     \caption{Attention maps for BTCV and AMOS datasets.}
    %     \label{fig:attention_map}
    % \end{figure}
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %I SHORTEN THIS *ULAS* -->The qualitative results in Fig.~\ref{fig:amos_btcv} compare the segmentation performance of various models across both fine-tuned and zero-shot scenarios for the BTCV and AMOS datasets. In both scenarios, \textbf{\textit{MedDelinea}} consistently delivers highly accurate segmentations, closely aligning with the ground truth. For the BTCV dataset (Figure \ref{fig:amos_btcv}), \textbf{\textit{MedDelinea}} excels at delineating the liver, pancreas, and spleen in both fine-tuned and zero-shot scenarios. In contrast, other models, such as MedSegDiff and ControlNet, show inaccuracies in segmenting organ boundaries, particularly in the pancreas and spleen. \textbf{\textit{MedDelinea}} demonstrates fewer artifacts and mis-segmentations, providing a cleaner, more precise output.
    %I SHORTEN THIS--> ULAS --> Similarly, in the AMOS dataset, \textbf{\textit{MedDelinea}}'s fine-tuned results are nearly identical to the ground truth, especially in the segmentation of the liver, kidneys, and pancreas. In the zero-shot results, \textbf{\textit{MedDelinea}} maintains a high level of accuracy, where models like VQ-Diffusion and DiT struggle with boundary precision and miss details in more complex structures, as highlighted in the region of interest (ROI). \textbf{\textit{MedDelinea}} effectively captures fine details and maintains clear boundaries, demonstrating superior robustness and generalization capability compared to other models.


    %I SHORTEN THIS __> ULAS __>The attention maps presented in the Figure \ref{fig:attention_map} offer a qualitative comparison of \textbf{\textit{MedDelinea}} (ours) with other models, including ControlNet, VQ Diffusion, and DiT, across different input images. As shown in Figure \ref{fig:attention_map}, DiT shows improved attention compared to ControlNet and VQ Diffusion, but its focus is still less targeted than that of \textbf{\textit{MedDelinea}}. The attention maps of DiT tend to capture the overall organ boundaries but lack the fine-grained focus that is evident in \textbf{\textit{MedDelinea}}'s maps. This difference is particularly noticeable in complex regions where accurate boundary delineation is critical. In the third row, focusing on liver regions, \textbf{\textit{MedDelinea}} again outperforms the other models by showing a more concentrated and precise attention map around the liver and associated structures. This targeted attention likely contributes to \textbf{\textit{MedDelinea}}’s superior segmentation performance, as it can better delineate the liver boundaries and focus on pathological areas, which is crucial for clinical applications.

    % \begin{figure}[htbp]
    %     \floatconts
    %       {fig:attention_map}
    %       {\caption{Attention maps for BTCV and AMOS datasets.}}
    %       {\includegraphics[width=\linewidth]{imgs/Attmap2.png}}
    % \end{figure}

    % Tables~\ref{tab:quant_finetune} and \ref{tab:quant_zeroshot}
    % Fig.~\ref{fig:atlas_qualatitative}
    % Fig.~\ref{fig:3d}
    Additional class-wise quantitative metrics for the BTCV and AMOS datasets (see Tables 5 and 6), qualitative results from ATLAS-8k (see Fig. 5), and 3D segmentation results for BTCV and AMOS (see Fig. 7) are provided in the Appendix.

    

%-------------------------------------------------------------------------

\subsection{Ablation Study}
\label{sec:ablation}

    \begin{table}[]
        \caption{Ablation Study. Default settings indicate the architectural choices used for all experiments.}
        \label{tab:ablation}
        \tiny
        \centering
        \begin{tabular}{cccc}
        \hline
        \textbf{Experiments}                                             & mDSE (↑)                  & mHD95 (↓)                 & mASSD (↓)                \\ \hline
        \multicolumn{4}{c}{\cellcolor[HTML]{EFEFEF}Ablation Based on Training Strategy}                                                                     \\ \hline
        Training only Noise Predictor                                    & 86.21                     & 26.11                     & 08.03                    \\ \hline
        \multicolumn{4}{c}{\cellcolor[HTML]{EFEFEF}Ablation Based on Loss (Default: $L_{diff}$ and $L_{seg}$)}                                                \\ \hline
        Only $L_{diff}$                                                  & 86.92                     & 26.01                     & 08.01                    \\ \hline
        \multicolumn{4}{c}{\cellcolor[HTML]{EFEFEF}Ablation based on Number of Control-DiT Transformer Blocks (Default: K=15)}                              \\ \hline
        K=1                                                              & 86.57                     & 26.54                     & 08.33                    \\
        K=5                                                              & 86.72                     & 26.37                     & 08.27                    \\
        K=10                                                             & 87.37                     & 25.98                     & 08.01                    \\ \hline
        \multicolumn{4}{c}{\cellcolor[HTML]{EFEFEF}Ablation Based on Noise Schedulers (Default: EulerAncestralDiscreteScheduler \cite{karras2022elucidating})} \\ \hline
        DDIM \cite{song2020denoising}                                       & 87.51                     & 25.32                     & 08.11                    \\
        DPMSolver++ \cite{lu2022dpm}                                        & 87.22                     & 25.45                     & 08.11                    \\
        UniPCMultistepScheduler \cite{zhao2024unipc}                        & 87.92                     & 25.78                     & 07.97                    \\
        LMSDiscreteScheduler \cite{karras2022elucidating}                   & 87.76                     & 25.34                     & 07.96  \\ \hline                 
        \end{tabular}
    \end{table}
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%

    We conducted ablation studies to assess the impact of different components and configurations on the model’s performance, using the Atlas-8K dataset for these experiments. The results are summarized in Table \ref{tab:ablation}. All results for the default choices are presented in Table \ref{tab:dice_mean_scores}.
    
    \textbf{Ablation Based on Training Strategy:} We analyzed the impact of different training strategies for the decoder and noise predictor. Training only the noise predictor yielded lower performance, with an mDSE of 86.21, mHD95 of 26.11, and mASSD of 8.03. However, training both the noise predictor and the decoder together (default choice) improved results across all metrics (see Table~\ref{tab:dice_mean_scores}).

    \textbf{Ablation Based on Loss:} Using only the diffusion loss ($L_{diff}$) resulted in poorer boundary accuracy. Adding the segmentation loss ($L_{seg}$), which combines BCE and Dice terms, improved performance, raising the mDSE from 86.92 to 87.98, lowering mHD95 from 26.01 to 25.92, and improving mASSD from 8.01 to 7.98. This highlights the value of segmentation-specific losses for better boundary accuracy.

    \textbf{Ablation based on Number of Control-DiT Transformer Blocks:} Increasing the number of Control-DiT transformer blocks (K) led to better segmentation performance. With K rising from 1 to 15, the model reached its best scores—mDSE of 87.98, mHD95 of 25.92, and mASSD of 7.98. However, this improvement came with a substantial increase in trainable parameters, from 7.83 million at K=1 to 117.45 million at K=15.

    \textbf{Ablation Based on Noise Schedulers:} The results in Table \ref{tab:dice_mean_scores} show that the Euler Ancestral Discrete Scheduler outperforms others, with the highest mDSE score (87.98), fastest inference time (2.78s), and strong boundary accuracy. In comparison, the UniPC and LMSDiscreteSchedulers (Table \ref{tab:ablation}) have slightly lower mDSE scores (87.92 and 87.76) and longer inference times (8.54s and 5.32s). This indicates the Euler Ancestral Discrete Scheduler offers the best balance of speed and accuracy.
    
    % The results for the Euler Ancestral Discrete Scheduler, shown in Table \ref{tab:pre-training_results}, highlight its superior performance, achieving the highest mDSE score (87.98), fastest inference time (2.78 seconds), and strong boundary accuracy (mHD95: 25.92, mASSD: 7.98). In contrast, results for the UniPC Multistep and LMSDiscreteSchedulers, presented in Table \ref{tab:ablation}, show slightly lower mDSE scores of 87.92 and 87.76, with longer inference times of 8.54 and 5.32 seconds, respectively. These findings suggest that the Euler Ancestral Discrete Scheduler offers the best trade-off between speed and accuracy.

%-------------------------------------------------------------------------

\section{Conclusion}
\label{sec:conclusion}
    In conclusion, \textbf{\textit{MedDelinea}} presents a novel and scalable architecture that significantly enhances medical image segmentation by integrating controllable neural networks with Diffusion Transformers (DiT). The model demonstrates strong generalization capabilities, achieving state-of-the-art performance with minimal fine-tuning and excelling in zero-shot settings. Its modular design allows for efficient adaptation across various medical imaging tasks, ensuring both computational efficiency and high accuracy. \textbf{\textit{MedDelinea}}'s success highlights the potential of leveraging pre-trained models combined with adaptive control mechanisms, offering a robust solution for diverse and complex medical image segmentation challenges.

%-------------------------------------------------------------------------

%%%%%%%%% REFERENCES
{\small
\bibliographystyle{ieee_fullname}
\bibliography{midl25_018}
}

\clearpage

\appendix

\section{Training and Implementation Details} \label{sec:training_details}
    
    \textbf{Pre-training Details:} The \textbf{\textit{MedDelinea}} model was pre-trained over 35,000 steps using the AdamW optimizer with a learning rate of 2.0e-5. A Cosine annealing scheduler was applied for learning rate decay. The model was trained on 8 Nvidia A6000 GPUs (48 GB each), with a batch size of 32 per GPU. Image resolutions were dynamically adjusted per batch, alternating between 256 $\times$ 256 and 512 $\times$ 512. Gradient accumulation was set to 8 steps, and mixed precision training was employed to improve computational efficiency and reduce memory usage. \textcolor{black}{Pre-training was conducted using the Atlas 8K dataset, following a 90:10 split for training and validation.}

    \textbf{Fine-tuning Details:} Fine-tuning was conducted for 1,500 steps using the same AdamW optimizer with a learning rate of 2.0e-5. A Linear scheduler was used for learning rate adjustments. The batch size remained at 32 per GPU, with the same hardware configuration. During fine-tuning, the image resolution was fixed at 256 $\times$ 256. Similar to pre-training, gradient accumulation was set to 8 steps, and mixed precision training was used to optimize both speed and memory efficiency. This fine-tuning process was consistently applied across all datasets. \textcolor{black}{Fine-tuning and evaluation were performed using the BTCV, AMOS, and CirrMri600+ datasets, following a standardized 80:10:10 split for training, validation, and testing.}

    \textcolor{black}{\textbf{Data Augmentation:} To improve generalization and robustness to variations in medical imaging data, we applied the same data augmentation strategies during both pre-training and fine-tuning across \textbf{\textit{MedDelinea}} and all baseline methods. The augmentation techniques included Random Cropping, which extracts random patches to introduce spatial variability, and Random Affine Transformations, which apply scaling, shearing, and translation to enhance invariance to spatial deformations. Additionally, Elastic Distortion was employed to simulate realistic non-rigid deformations commonly seen in medical scans, while Random Rotation introduced angular perturbations to improve robustness to orientation differences. These augmentations were consistently applied throughout both training stages, ensuring that the learned representations remain invariant to common geometric transformations and distortions in medical imaging data.}

\section{Datasets and Metrics} \label{sec:dataset_and_metrics}

    \textbf{Datasets:} To evaluate \textbf{\textit{MedDelinea}}'s performance in medical image segmentation, we fine-tuned the model on four datasets after pre-training on ATLAS-8k \cite{qu2024abdomenatlas}, which contains 8,000 CT scans. For fine-tuning, we used BTCV \cite{fang2020multi}, AMOS \cite{ji2022amos}, Cirr600+ \cite{jha2024cirrmri600+}, and PanSegData \cite{zhang2024large}. BTCV includes 50 multi-organ abdominal CT scans for benchmarking. AMOS provides 600 volumes (500 CT, 100 MRI) for multi-modal segmentation of abdominal organs. Cirr600+ features 608 MRI scans focused on cirrhotic liver cases, and PanSegData comprises 767 MRI scans from 499 participants across five centers. This diverse dataset combination enhances \textbf{\textit{MedDelinea}}'s robustness and generalizability across medical imaging scenarios.

    \textbf{Metrics:} To assess model performance, we used mean Dice Similarity Coefficient (mDSC), mean Hausdorff Distance 95 (mHD95), and Dice. mDSC and Dice quantify segmentation overlap, where 1 indicates perfect agreement. mHD95 measures boundary errors and surface distances, with lower values indicating better accuracy and precision. These metrics offer a comprehensive evaluation of segmentation accuracy and boundary precision.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    \begin{table*}[b]
        \centering
        \tiny
        \caption{Classwise quantitative results on the BTCV and AMOS datasets, where models are fine-tuned on these datasets after pre-training on the ATLAS-8k dataset.}
        \label{tab:quant_finetune}
        \begin{tabular}{ccccccccc}
        \hline
                                           & \multicolumn{8}{c}{\textbf{Dice ($\uparrow$)}}                                                                                                        \\
        \multirow{-2}{*}{\textbf{Methods}} & \textbf{Aorta} & \textbf{Gallbladder} & \textbf{KidneyL} & \textbf{KidneyR} & \textbf{Liver} & \textbf{Pancreas} & \textbf{Spleen} & \textbf{Stomach} \\ \hline
        \multicolumn{9}{c}{\cellcolor[HTML]{EFEFEF}\textit{BTCV}}                                                                                                                                  \\ \hline
        \textbf{UNet}
                      & 75.12          & 59.77                & 72.31            & 70.88            & 86.67          & 47.78             & 80.19           & 73.21            \\
                      % \cite{ronneberger2015u}
        \textbf{TransUnet}                  & 77.05          & 61.92                & 74.11            & 73.08            & 88.79          & 49.97             & 81.37           & 73.98            \\
        % \cite{chen2021transunet}
        \textbf{SynergyNet}                & 86.10           & 65.49                & 82.78            & 78.24            & 95.06          & 58.28             & 88.95           & 81.30             \\
        % \cite{gorade2024synergynet}
        \textbf{ControlNet}                 & 89.19          & 68.78                & 85.92            & 83.19            & 95.98          & 61.09             & 90.28           & 85.09            \\
        % \cite{zhang2023adding}
        \textbf{MedSegDiff}                & 89.92          & 69.03                & 86.67            & 83.32            & 96.88          & 61.12             & 90.11           & 84.11            \\
        % \cite{wu2024medsegdiff}
        \textbf{VQDiffusion}                & 88.56          & 66.77                & 85.12            & 83.23            & 94.11          & 59.92             & 88.63           & 81.01            \\
        % \cite{gu2022vector}
        \textbf{DiT}                       & 88.88          & 67.42                & 86.15            & 84.11            & 95.02          & 61.21             & 88.91           & 84.21            \\
        % \cite{peebles2023scalable}
        \textbf{\textbf{\textit{MedDelinea}}}             & \textbf{91.77} & \textbf{72.78}       & \textbf{88.09}   & \textbf{85.18}   & \textbf{96.72} & \textbf{63.77}    & \textbf{92.96}  & \textbf{86.61}   \\ \hline
        \multicolumn{9}{c}{\cellcolor[HTML]{EFEFEF}\textit{AMOS}}                                                                                                                                  \\ \hline
        \textbf{UNet}
                      & 74.01          & 55.71                & 71.88            & 67.82            & 86.02          & 42.21             & 77.88           & 71.18            \\
                      % \cite{ronneberger2015u}
        \textbf{TransUnet}                  & 75.82          & 56.86                & 73.42            & 69.01            & 87.18          & 44.78             & 81.82           & 72.01            \\
        % \cite{chen2021transunet}
        \textbf{SynergyNet}                & 83.89          & 63.16                & 81.01            & 82.11            & 88.12          & 44.17             & 87.19           & 84.12            \\
        % \cite{gorade2024synergynet}
        \textbf{ControlNet}                 & 84.99          & 64.77                & 82.76            & 82.62            & 89.19          & 45.22             & 89.02           & 85.32            \\
        % \cite{zhang2023adding}
        \textbf{MedSegDiff}                & 86.08          & 65.92                & 83.44            & 83.19            & 91.78          & 46.04             & 90.65           & 86.78            \\
        % \cite{wu2024medsegdiff}
        \textbf{VQDiffusion}                & 85.19          & 65.01                & 84.17            & 83.02            & 90.72          & 45.99             & 89.77           & 85.27            \\
        % \cite{gu2022vector}
        \textbf{DiT}                       & 85.99          & 64.88                & 84.76            & 82.92            & 91.44          & 45.34             & 90.02           & 84.98            \\
        % \cite{peebles2023scalable}
        \textbf{\textbf{\textit{MedDelinea}}}             & \textbf{87.12} & \textbf{66.94}       & \textbf{87.12}   & \textbf{84.05}   & \textbf{93.14} & \textbf{48.19}    & \textbf{91.72}  & \textbf{87.67}   \\ \hline
        \end{tabular}
    \end{table*}
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%
    \begin{table*}[]
    \centering
    \tiny
    \caption{Classwise quantitative results on the BTCV and AMOS datasets, obtained through zero-shot inference on the test set following pre-training on the ATLAS-8k dataset.}
    \label{tab:quant_zeroshot}
    \begin{tabular}{ccccccccc}
    \hline
                                       & \multicolumn{8}{c}{\textbf{Dice ($\uparrow$)}}                                                                                                        \\
    \multirow{-2}{*}{\textbf{Methods}} & \textbf{Aorta} & \textbf{Gallbladder} & \textbf{KidneyL} & \textbf{KidneyR} & \textbf{Liver} & \textbf{Pancreas} & \textbf{Spleen} & \textbf{Stomach} \\ \hline
    \multicolumn{9}{c}{\cellcolor[HTML]{EFEFEF}\textit{BTCV}}                                                                                                                                  \\ \hline
    \textbf{UNet}
                      & 69.67          & 54.12                & 65.43            & 65.24            & 80.11          & 39.12             & 71.21           & 59.77            \\
                      % \cite{ronneberger2015u}
    \textbf{TransUnet}                  & 71.22          & 55.77                & 69.88            & 66.54            & 82.11          & 40.14             & 73.22           & 61.01            \\
    % \cite{chen2021transunet}
    \textbf{SynergyNet}                & 83.01          & 58.92                & 74.57            & 72.34            & 86.78          & 49.92             & 78.56           & 70.99            \\
     % \cite{gorade2024synergynet}
    \textbf{ControlNet}                 & 84.33          & 61.72                & 78.22            & 76.52            & 88.99          & 53.17             & 79.90            & 73.42            \\
    % \cite{zhang2023adding}
    \textbf{MedSegDiff}                & 85.19          & 62.23                & 80.01            & 78.12            & 90.11          & 51.02             & 81.27           & 76.77            \\
    % \cite{wu2024medsegdiff}
    \textbf{VQDiffusion}                & 83.19          & 59.12                & 79.11            & 79.88            & 90.02          & 54.92             & 84.22           & 78.71            \\
    % \cite{gu2022vector}
    \textbf{DiT}                       & 85.33          & 60.61                & 81.21            & 80.53            & 87.15          & 57.33             & 86.17           & 80.11            \\
    % \cite{peebles2023scalable}
    \textbf{\textbf{\textit{MedDelinea}}}            & \textbf{89.17} & \textbf{68.18}       & \textbf{85.19}   & \textbf{83.09}   & \textbf{94.21} & \textbf{60.88}    & \textbf{89.78}  & \textbf{83.10}    \\ \hline
    \multicolumn{9}{c}{\cellcolor[HTML]{EFEFEF}\textit{AMOS}}                                                                                                                                  \\ \hline
    \textbf{UNet}
                      & 63.11          & 46.44                & 63.17            & 54.72            & 68.34          & 27.55             & 60.75           & 60.53            \\
                      % \cite{ronneberger2015u}
    \textbf{TransUnet}                  & 65.12          & 48.34                & 64.77            & 59.11            & 70.57          & 31.51             & 65.32           & 63.21            \\
    % \cite{chen2021transunet}
    \textbf{SynergyNet}                & 68.92          & 50.02                & 69.24            & 71.02            & 73.76          & 36.28             & 71.54           & 66.54            \\
    % \cite{gorade2024synergynet}
    \textbf{ControlNet}                 & 71.12          & 56.72                & 73.77            & 74.57            & 78.91          & 38.91             & 77.33           & 74.58            \\
    % \cite{zhang2023adding}
    \textbf{MedSegDiff}                & 70.92          & 58.21                & 75.26            & 75.72            & 82.65          & 40.42             & 82.37           & 79.91            \\
    % \cite{wu2024medsegdiff}
    \textbf{VQDiffusion}                & 71.24          & 54.11                & 77.56            & 77.01            & 83.72          & 38.76             & 80.22           & 77.82            \\
    % \cite{gu2022vector}
    \textbf{DiT}                       & 75.12          & 60.21                & 81.56            & 77.01            & 87.46          & 41.42             & 80.22           & 77.82            \\
    % \cite{peebles2023scalable}
    \textbf{\textbf{\textit{MedDelinea}}}             & \textbf{78.44} & \textbf{62.01}       & \textbf{80.92}   & \textbf{81.11}   & \textbf{90.51} & \textbf{46.01}    & \textbf{88.52}  & \textbf{84.01}   \\ \hline
    \end{tabular}
    \end{table*}

    
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Detailed Pre-training Strategy}
\label{sec:supp_pretrain}

\textcolor{black}{Our pre-training strategy leverages the ATLAS-8K dataset, which contains 8,000 3D CT volumes annotated with 15 abdominal structures. The dataset provides comprehensive anatomical coverage, allowing models to learn robust feature representations for medical image segmentation.
During pre-training, we initialize the Control DiT Module and Pre-trained DiT Module weights from a model pre-trained as described in \cite{peebles2023scalable}. However, we employ selective training, wherein only the Control DiT Module and the final layer of the VAE Decoder are updated, while the remaining weights remain frozen. This preserves the generalization capability of the original model while allowing for effective adaptation to medical imaging.}

\textcolor{black}{
\textbf{Motivation for Pre-training:} While popular diffusion models such as DiT, Stable Diffusion, and VQ-Diffusion are pre-trained on natural images, they lack domain-specific knowledge crucial for medical image segmentation. Directly applying them to CT/MRI data leads to suboptimal generalization due to differences in contrast, texture, and domain-specific noise characteristics.}
\textcolor{black}{
Our pre-training on ATLAS-8K addresses this issue in the following ways:}
\begin{itemize}
    \item \textcolor{black}{Domain Alignment: The dataset's intensity distributions, anatomical priors, and noise characteristics ensure the model learns medical image representations rather than natural image statistics.}
    \item \textcolor{black}{Improved Generalization: Exposure to diverse anatomical variations improves the model’s ability to segment structures across different CT/MRI scans.}
    \item \textcolor{black}{Segmentation-Specific Guidance: We integrate segmentation loss at the decoder stage, reinforcing feature learning for anatomical edges and overlapping structures.}
\end{itemize}

\section{Quantitative Results}
\label{sec:quant}
Tables \ref{tab:quant_finetune} and \ref{tab:quant_zeroshot} present the classwise Dice scores for various models on the BTCV \cite{fang2020multi} and AMOS \cite{ji2022amos} datasets, highlighting both fine-tuned and zero-shot results after pre-training on the ATLAS-8k \cite{qu2024abdomenatlas} dataset.

\begin{figure}[htbp]
        \floatconts
          {fig:pan_liver_qualatitative}
          {\caption{Pancreas and Liver dataset results.}}
          {\includegraphics[width=\linewidth]{imgs/pan_liver_qualatitative.pdf}}
    \end{figure}
    
In Table~\ref{tab:quant_finetune}, the models were fine-tuned on the BTCV and AMOS datasets. \textbf{\textit{MedDelinea}} consistently achieves the highest Dice scores across nearly all organs in both datasets. On the BTCV dataset, \textbf{\textit{MedDelinea}} achieves the best score on challenging organs such as the pancreas (63.77\%) and remains on par with the strongest baseline on the liver (96.72\% versus 96.88\% for MedSegDiff), while outperforming models like MedSegDiff and DiT on all remaining organs. Similarly, in the AMOS dataset, \textbf{\textit{MedDelinea}} achieves the highest Dice scores for the liver (93.14\%) and spleen (91.72\%), maintaining its edge across most organ classes.

Table~\ref{tab:quant_zeroshot} presents the results from zero-shot inference on the BTCV and AMOS test sets without additional fine-tuning. Again, \textbf{\textit{MedDelinea}} outperforms other models, particularly in difficult organs like the liver (94.21\% in BTCV, 90.51\% in AMOS) and pancreas (60.88\% in BTCV, 46.01\% in AMOS). The reason for \textbf{\textit{MedDelinea}}'s strong performance is its architectural choice, which combines diffusion transformers (DiT) with a controllable module. This combination allows the model to effectively capture intricate anatomical structures. The use of extensive pre-training on the ATLAS-8k dataset helps \textbf{\textit{MedDelinea}} build a strong representation of organ structures, making it more adept at handling fine-tuning on target datasets like BTCV and AMOS. This pre-training also explains its strong zero-shot performance, as it generalizes well to new organs without requiring further training.

Additionally, Table~\ref{tab:dice_organ_scores} reports the per-organ Dice scores (\%) on the ATLAS-8k dataset. It is clear that our proposed method, MedDelinea, outperforms the other approaches across multiple organs, achieving higher Dice scores in most categories.

\textcolor{black}{\textbf{Zero-Shot Performance Analysis on MRI and CT:} To further evaluate the generalization capability of models pretrained solely on CT scans, we report the zero-shot segmentation performance on the AMOS MRI and CT subsets. All models were pretrained on ATLAS-8K (CT-only dataset) and tested on AMOS without any fine-tuning. The quantitative results are presented in Table~\ref{tab:mri_vs_ct}. While the pretraining dataset (ATLAS-8K) comprises only CT scans, we observe that models maintain relatively high segmentation performance on MRI scans under zero-shot settings. However, as evident from Table~\ref{tab:mri_vs_ct}, the performance on MRI remains consistently lower than on CT across all models, although it is still competitive. This can be attributed to several factors:}

\begin{wrapfigure}{r}{0.6\linewidth} % 'r' for right, 'l' for left
    \centering
    \includegraphics[width=\linewidth]{imgs/Dise_Meddelinea.png}
    \caption{Validation Dice score progression across training iterations.}
    \label{fig:Dice_Meddelinea}
\end{wrapfigure}

\textcolor{black}{1. Structural vs. Intensity-Based Representations in Diffusion Models: Our latent diffusion-based model, MedDelinea, prioritizes shape, spatial structure, and anatomical boundaries rather than absolute intensity values. Unlike CNN-based models, which may rely heavily on modality-specific intensity statistics, diffusion models focus on structural regularities, which are largely consistent across MRI and CT. This enables the model to achieve relatively robust MRI segmentation, despite having been pretrained exclusively on CT scans.} 

\textcolor{black}{2. Impact of Large-Scale Pretraining on Generalization: The diverse range of CT scans within ATLAS-8K allows the model to develop robust representations of organ morphology, which can be effectively leveraged for MRI segmentation. However, differences in contrast, intensity profiles, and noise distributions between CT and MRI still result in an inherent performance gap, as reflected in the lower mDSC and higher mHD95 for MRI.}

\begin{table}[]
\centering
\scriptsize
\caption{\textcolor{black}{Comparison of zero-shot performance on MRI and CT data.}}
\label{tab:mri_vs_ct}
\begin{tabular}{ccccc}
\hline
\multirow{2}{*}{\textcolor{black}{Models}} & \multicolumn{2}{c}{\textcolor{black}{MRI}}         & \multicolumn{2}{c}{\textcolor{black}{CT}}          \\
                        & \textcolor{black}{mDSC}           & \textcolor{black}{mHD95}          & \textcolor{black}{mDSC}           & \textcolor{black}{mHD95}          \\ \hline
\textcolor{black}{UNet}                    & \textcolor{black}{73.37}          & \textcolor{black}{35.45}          & \textcolor{black}{74.17}          & \textcolor{black}{34.01}          \\
\textcolor{black}{TransUnet}               & \textcolor{black}{74.89}          & \textcolor{black}{34.47}          & \textcolor{black}{75.09}          & \textcolor{black}{33.65}          \\
\textcolor{black}{SynergyNet}              & \textcolor{black}{76.22}          & \textcolor{black}{33.39}          & \textcolor{black}{77.38}          & \textcolor{black}{32.88}          \\
\textcolor{black}{ControlNet}              & \textcolor{black}{77.54}          & \textcolor{black}{32.46}          & \textcolor{black}{79.37}          & \textcolor{black}{32.05}          \\
\textcolor{black}{MedSegDiff}              & \textcolor{black}{77.28}          & \textcolor{black}{31.98}          & \textcolor{black}{79.54}          & \textcolor{black}{30.89}          \\
\textcolor{black}{VQDiffusion}             & \textcolor{black}{78.35}          & \textcolor{black}{32.16}          & \textcolor{black}{80.22}          & \textcolor{black}{30.19}          \\
\textcolor{black}{DiT}                     & \textcolor{black}{78.39}          & \textcolor{black}{32.44}          & \textcolor{black}{80.78}          & \textcolor{black}{29.87}          \\
\textcolor{black}{nnUnet}                  & \textcolor{black}{78.11}          & \textcolor{black}{31.05}          & \textcolor{black}{81.37}          & \textcolor{black}{29.56}          \\
\textcolor{black}{nnFormer}                & \textcolor{black}{79.35}          & \textcolor{black}{30.65}          & \textcolor{black}{81.28}          & \textcolor{black}{30.34}          \\
\textcolor{black}{UNITER++}                & \textcolor{black}{79.36}          & \textcolor{black}{29.09}          & \textcolor{black}{82.28}          & \textcolor{black}{28.89}          \\
\textcolor{black}{MedSAM}                  & \textcolor{black}{80.11}          & \textcolor{black}{28.98}          & \textcolor{black}{82.98}          & \textcolor{black}{28.21}          \\
\textcolor{black}{MedDelinea}              & \textbf{\textcolor{black}{81.27}} & \textbf{\textcolor{black}{28.23}} & \textbf{\textcolor{black}{85.32}} & \textbf{\textcolor{black}{26.62}} \\ \hline
\end{tabular}
\end{table}

\textcolor{black}{Additionally, to complement the training loss curves presented in Figure \ref{fig:learning_curve} of the main manuscript, we provide an evaluation of model performance on the validation set using the Dice Similarity Coefficient (DSC). The purpose of this analysis is to assess the segmentation accuracy of different models throughout training and compare their convergence behavior in terms of validation performance. Figure \ref{fig:Dice_Meddelinea} illustrates the mean Dice score computed on the validation set at different training iterations for MedDelinea and baseline models (DiT, ControlNet, and Stable Diffusion). MedDelinea consistently outperforms all baseline models across the training process, achieving the highest Dice score at convergence. On the other hand, DiT and ControlNet demonstrate steady improvement, with their Dice scores plateauing at lower values compared to MedDelinea.}

% \begin{figure}[htbp]
%           {\includegraphics[width=\linewidth]{imgs/Dise_Meddelinea.png}}
%           \caption{\textcolor{black}{Validation Dice score progression across training iterations.}}
%           \label{fig:Dice_Meddelinea}
% \end{figure}


    
%-------------------------------------------------------------------------

\section{Qualitative Results}
\label{sec:qual}
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % \begin{figure*}[]
    %     \centering
    %     \includegraphics[width=\textwidth]{imgs/atlas.png}
    %     \caption{ATLAS-8k dataset results.}
    %     \label{fig:atlas_qualatitative}
    % \end{figure*}

    \begin{figure}[htbp]
          {\includegraphics[width=\linewidth]{imgs/atlas.png}}
          \caption{ATLAS-8k dataset results.}
          \label{fig:atlas_qualatitative}
    \end{figure}
          
    \begin{figure}[htbp]
      \includegraphics[width=\linewidth]{imgs/Attmap2.png}
      \caption{Attention maps for BTCV and AMOS datasets.}
      \label{fig:attention_map}
    \end{figure}

    
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%
    \textbf{Additional External Data for Test: MRI Pancreas and Liver:} The qualitative results in Fig.~\ref{fig:pan_liver_qualatitative} highlight the segmentation performance of various models for the pancreas and liver. In the fine-tuned pancreas results (top row), \textbf{\textit{MedDelinea}} closely matches the ground truth, accurately capturing the pancreas boundaries, while models like MedSegDiff and VQ-Diffusion struggle with precision. In the zero-shot pancreas results (second row), \textbf{\textit{MedDelinea}} still outperforms other models, maintaining accurate segmentation without additional training, whereas DiT and MedSegDiff miss finer details. For the liver segmentation (third and fourth rows), \textbf{\textit{MedDelinea}} demonstrates superior performance in both fine-tuned and zero-shot scenarios. In the fine-tuned case (third row), \textbf{\textit{MedDelinea}} provides precise liver boundaries, surpassing models like SynergyNet and MedSegDiff, which miss key regions. In the zero-shot liver results (fourth row), \textbf{\textit{MedDelinea}} continues to lead, while models like ControlNet and VQ-Diffusion fail to fully capture the liver’s structure. Overall, \textbf{\textit{MedDelinea}} excels in both settings, particularly for complex organs like the pancreas and liver, where other models struggle.

    \textbf{ATLAS-8k Dataset Results:} The qualitative results in Fig.~\ref{fig:atlas_qualatitative} show the segmentation performance of various models on the ATLAS-8k dataset \cite{qu2024abdomenatlas}, including SynergyNet, MedSegDiff, ControlNet, VQ-Diffusion, DiT, and \textbf{\textit{MedDelinea}}. Across different input images, \textbf{\textit{MedDelinea}} consistently provides more accurate and precise segmentation results, closely matching the ground truth. For larger organs like the liver and spleen, \textbf{\textit{MedDelinea}} performs well, capturing their shape and boundaries with high fidelity. Similarly, for smaller and more challenging organs, \textbf{\textit{MedDelinea}} shows fewer segmentation errors compared to other models, which either over-segment or under-segment the regions.

    \textbf{Attention Maps:} Attention maps in Fig.~\ref{fig:attention_map} further highlight \textbf{\textit{MedDelinea}}'s targeted attention mechanism, which focuses on fine-grained details and captures complex structures more effectively than other models. This superior attention enables \textbf{\textit{MedDelinea}} to achieve better boundary delineation and segmentation performance, making it a robust and generalizable model for clinical applications.

\section{3D Visualization}
\label{sec:3dquant}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    \begin{figure}[htbp]
        \floatconts
          {fig:3d}
          {\caption{3D Visualisation of AMOS and BTCV Dataset Segmentation Map}}
          {\includegraphics[width=\linewidth]{imgs/3d_maps.pdf}}
    \end{figure}

Fig.~\ref{fig:3d} shows a clearer comparison of how different models---MedSegDiff, ControlNet, DiT, and \textbf{\textit{MedDelinea}} (ours)---capture the anatomical structures from the BTCV and AMOS datasets. These visualisations help to assess the overall quality and consistency of segmentation across various organs.

\textbf{\textit{MedDelinea}} consistently produces segmentation maps that closely align with the ground truth, demonstrating its ability to capture fine details and maintain accurate boundaries, particularly in complex and small anatomical regions. In comparison, MedSegDiff and ControlNet show more inconsistencies and less precision, often struggling with challenging areas where boundary delineation is critical. DiT performs better than MedSegDiff and ControlNet but still exhibits less refinement in capturing finer structures compared to \textbf{\textit{MedDelinea}}.

For instance, in the pancreas region of the AMOS dataset, as visualized in the third row of the maps, \textbf{\textit{MedDelinea}} produces a segmentation that closely aligns with the ground truth. The pancreas, a smaller and more complex organ to segment, is accurately captured by \textbf{\textit{MedDelinea}} with well-defined boundaries and minimal missing regions. In contrast, MedSegDiff and ControlNet show visible errors in this region, with fragmented or incomplete segmentations. MedSegDiff, for instance, misses part of the pancreas, leaving gaps, while ControlNet over-segments, blending boundaries with adjacent structures. DiT provides a better representation compared to MedSegDiff and ControlNet, but still lacks the sharp precision that \textbf{\textit{MedDelinea}} demonstrates in this area.


  

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Further Related Works}
\textbf{Traditional Approaches:} Medical image segmentation plays a critical role in clinical diagnostics, involving the classification of pixels in medical images (e.g., CT and MRI) to delineate anatomical structures or abnormalities. Convolutional neural network (CNN) based models, particularly UNet \cite{ronneberger2015u} and its variants \cite{oktay2018attention, zhang2019net, lou2021dc}, have been widely used due to their encoder-decoder architecture, which captures both local and global context. Despite their success, CNNs struggle with long-range dependencies, limiting their effectiveness in segmenting complex anatomical structures. This has led to a shift towards transformer-based methods.

\textbf{Shift to Transformer-Based Models:} To address the limitations of CNNs, transformer-based models have been introduced. TransUNet \cite{chen2021transunet} combines the strengths of CNNs for local feature extraction with transformers for long-range dependency modeling. This hybrid architecture significantly improves segmentation accuracy. Other models like Swin-UNet \cite{cao2022swin} and DS-TransUNet \cite{lin2022ds} further refine this approach by incorporating Swin Transformers, which capture multi-scale features using shifted windows and hierarchical structures. While these models enhance the understanding of complex anatomical features, they introduce challenges like increased computational complexity and reliance on large-scale pre-training datasets.


\begin{table*}[t]
    \caption{Per organ Dice scores (\%) on the ATLAS-8k dataset. Higher values indicate better segmentation performance.}
    \label{tab:dice_organ_scores}
    \centering
    \tiny
    \begin{tabular}{lcccccccc}
    \hline
    \textbf{Methods} & \textbf{Aorta} & \textbf{Gallbladder} & \textbf{KidneyL} & \textbf{KidneyR} & \textbf{Liver} & \textbf{Pancreas} & \textbf{Spleen} & \textbf{Stomach} \\
    \hline
    \textbf{UNet}           & 67.72 & 53.37 & 64.11 & 63.34 & 86.02 & 35.67 & 77.88 & 73.21 \\
    % \cite{ronneberger2015u}
    \textbf{TransUnet}     & 74.51 & 58.92 & 67.82 & 67.54 & 87.18 & 41.34 & 81.82 & 73.98 \\
    % \cite{chen2021transunet}
    \textbf{SynergyNet} & 78.11 & 60.11 & 70.11 & 71.52 & 88.12 & 46.89 & 87.19 & 81.30 \\
    % \cite{gorade2024synergynet}
    \textbf{ControlNet}      & 80.01 & 65.73 & 73.39 & 73.78 & 89.19 & 54.78 & 89.02 & 85.09 \\
    % \cite{zhang2023adding}
    \textbf{MedSegDiff}    & 79.92 & 65.27 & 75.11 & 76.01 & 91.78 & 58.94 & 90.65 & 84.11 \\
    % \cite{wu2024medsegdiff}
    \textbf{VQDiffusion}        & 79.17 & 65.34 & 76.23 & 77.02 & 90.72 & 58.55 & 89.77 & 81.01 \\
    % \cite{gu2022vector}
    \textbf{DiT}         & 79.17 & 65.34 & 76.23 & 77.02 & 90.72 & 58.55 & 89.77 & 81.01 \\
    % \cite{peebles2023scalable}
    \textbf{\textit{MedDelinea}}                    & \textbf{82.34} & \textbf{68.09} & \textbf{79.01} & \textbf{80.11} & \textbf{93.14} & \textbf{61.21} & \textbf{91.72} & \textbf{86.61} \\
    \hline
    \end{tabular}
    \normalsize
\end{table*}


\end{document}
