\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{listings}

\usepackage{algorithm}

\usepackage{algpseudocode}

\usepackage{amsmath}
 


\jmlrvolume{-- nnn}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

%\title{SegResMamba: A Mamba Based Efficient Architecture for 3D Medical Image Segmentation}
\title[SegResMamba for Medical Image Segmentation]{SegResMamba: An Efficient Architecture for 3D Medical Image Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% % Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% % % More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Badhan Kumar Das\nametag{$^{ 1,2}$}} \Email{badhan.das@fau.de}\\
% \addr $^{1}$ Siemens Healthineers AG \\
% \addr $^{2}$ FAU Erlangen Nuremberg \AND
% \Name{Ajay Singh\nametag{$^{2}$}} \Email{ajay.singh@fau.de}\\
% \Name{Saahil Islam\nametag{$^{1,2}$}} \Email{saahil.islam@fau.de}\\
% \Name{Gengyan Zhao\nametag{$^{3}$}} \Email{gengyan.zhao@siemens-healthineers.com}\\
% \addr $^{3}$ Siemens Medical Solutions USA, Inc.\AND
% \Name{Andreas Maier\nametag{$^{2}$}} \Email{andreas.maier@fau.de}\\
% }


% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Badhan Kumar Das\nametag{$^{ 1,2}$}} \Email{badhan.das@fau.de}\\
\Name{Ajay Singh\nametag{$^{2}$}} \Email{ajay.singh@fau.de}\\
\Name{Saahil Islam\nametag{$^{1,2}$}} \Email{saahil.islam@fau.de}\\
\Name{Gengyan Zhao\nametag{$^{3}$}} \Email{gengyan.zhao@siemens-healthineers.com}\\
\Name{Andreas Maier\nametag{$^{2}$}} \Email{andreas.maier@fau.de}\\
\addr $^{1}$ Siemens Healthineers AG\\
\addr $^{2}$ FAU Erlangen Nuremberg\\
\addr $^{3}$ Siemens Medical Solutions USA, Inc.
}



\begin{document}

\maketitle

\begin{abstract}
The Transformer architecture has opened a new paradigm in the domain of deep learning with its ability to model long-range dependencies and capture global context and has outpaced the traditional Convolutional Neural Networks (CNNs) in many aspects. However, applying Transformer models to 3D medical image datasets presents significant challenges due to their high training time and memory requirements, which not only hinder scalability but also contribute to an elevated CO$_2$ footprint. This has led to an exploration of alternative models that can maintain or even improve performance while being more efficient and environmentally sustainable. Recent advancements in Structured State Space Models (SSMs) effectively address some of the inherent limitations of Transformers, particularly their high memory and computational demands. Inspired by these advancements, we propose an efficient 3D segmentation model for medical imaging called SegResMamba, designed to reduce computational complexity, memory usage, training time, and environmental impact while maintaining high performance. Our model uses less than half the memory during training compared to other state-of-the-art (SOTA) architectures, achieving comparable performance with significantly reduced resource demands.
\end{abstract}

\begin{keywords}
Mamba, State Space Models, Vision Transformer, Medical Image Segmentation
\end{keywords}

\section{Introduction}

% Medical imaging domain is characterized by the constraint of limited data availability, juxtaposed with critical 
% % high diagnostic accuracy or highest standards of precision and reliablity
% requirement of high diagnostic accuracy. This dichotomy presents a significant challenge in utilizing the full scale of Transformer architecture. For instance, datasets such as Spleen Segmentation\cite{antonelli2022medical} and BTCV Segmentation\cite{Landman2015} contain very few samples, making them unsuitable for large Transformer-based models like UNETR\cite{hatamizadeh2021unetrtransformers3dmedical}, SwinUnetr\cite{hatamizadeh2021swin}. Furthermore, the complexity increases with 3D data, as it demands significant memory resources and meticulous parameter tuning to fit models on datasets like BraTS\cite{baid2021rsna}. This has prompted the exploration of alternative architectures, such as State Space Models.

The Transformer architecture has revolutionized deep learning by effectively modeling long-range dependencies and capturing global context. However, its application to 3D medical imaging datasets presents significant challenges, including high memory requirements, computational complexity, and prolonged training times. These challenges are particularly pronounced in tasks involving large datasets like BraTS \cite{baid2021rsna} and BTCV Segmentation \cite{Landman2015}, where training Transformer-based models such as UNETR \cite{hatamizadeh2021unetrtransformers3dmedical} and SwinUnetr \cite{hatamizadeh2021swin} demands substantial resources. Furthermore, Transformer models often struggle with smaller datasets, such as Spleen Segmentation \cite{antonelli2022medical}, where their performance is suboptimal. The environmental impact of Transformers, driven by their elevated training times, has also raised concerns. This has led to a growing interest in alternative architectures such as structured state space models (SSMs), which reduce computational demands and training time, offering a more efficient solution for medical image analysis.


State-space architectures like Mamba \cite{gu2024mambalineartimesequencemodeling}, S4 \cite{gu2022efficientlymodelinglongsequences}, and S4nd \cite{nguyen2022s4ndmodelingimagesvideos} have gained popularity due to their solid foundation in Kalman Filters \cite{Kalman1960}. In contrast, CNN-based models like U-Net \cite{ronneberger2015unetconvolutionalnetworksbiomedical} and SegResNet \cite{myronenko20183dmribraintumor} are effective but have a limited receptive field. Hybrid models like UNETR and SwinUnetr \cite{hatamizadeh2021swin} combine CNNs and Transformers \cite{vaswani2023attentionneed} to enhance performance, though Transformers remain computationally demanding, limiting their practicality in resource-constrained clinical settings. Numerous studies have adapted Mamba to address this issue by modeling long-range dependencies with innovative selection mechanisms \cite{zhu2024visionmambaefficientvisual, wang2024mambaunetunetlikepurevisual, liu2024vmambavisualstatespace, liao2024lightm, wang2024weak}.

3D image segmentation methods, such as U-Mamba \cite{ma2024u} and SegMamba \cite{xing2024segmambalongrangesequentialmodeling}, leverage hybrid CNN-SSM blocks to combine the local feature extraction capabilities of convolutions with the ability of SSMs to capture long-range dependencies. Inspired by these models, we propose SegResMamba, which uses the benefits of Mamba while further reducing memory consumption and computational requirements, thereby enhancing training efficiency. SegResMamba is a lightweight Mamba-based 3D image segmentation model that offers comparable performance to other state-of-the-art (SOTA) models while significantly increasing overall efficiency. Our approach employs Tri-oriented Mamba (ToM) to enhance long-range contextual understanding, combined with CNNs for effective local feature extraction. A convolution mamba mixed block (CMMB) efficiently captures both local and global features, starting with a convolutional bottleneck and leveraging Mamba's global modeling capabilities.


% \begin{figure}[h]
%     \centering
%     \includegraphics[width=0.9\linewidth]{new_curve.png}
%     \caption{Average Dice Scores for BTCV, Spleen, and BRATS datasets plotted against memory usage (in GB) for different models (using image size $128\times128\times128$ for BTCV and BRATS dataset and $96\times96\times96$ for Spleen dataset with batch size 1). }
%     \label{fig:enter-peakmemoryl}
% \end{figure}


% Our approach employs Tri-orientated Mamba (ToM) to enhance long-range contextual understanding, combined with CNNs for effective local feature extraction. A convolution mamba mixed block (CMMB) efficiently captures both local and global features, starting with a convolutional bottleneck and leveraging Mamba's global modeling capabilities. Unlike models like UNETR and SegMamba, our design eliminates the need for auxiliary encoders, integrating features directly into a streamlined decoder with skip connections to maintain spatial accuracy. The decoder uses non-trainable linear interpolation for efficient upsampling. SegResMamba optimizes 3D medical image segmentation, offering practical benefits for clinical diagnostics and treatment planning.


\section{Methodology}

\begin{figure*}[ht]
    \centering
    \includegraphics[width=1.0\linewidth]{arch_segresmamba.png}
    \caption{a) Overview of SegResMamba architecture, b) Convolution Mamba mixed block, and c) Tri-oriented Mamba}
    \label{fig:conv_mamba_block}
\end{figure*}


%Our model consists of an encoder, a decoder, and skip connections between the encoder and decoder \cite{ronneberger2015unetconvolutionalnetworksbiomedical} as shown in Figure \ref{fig:conv_mamba_block}. The encoder utilizes encoder blocks that consist of downsampling layers, convolution mamba mixed blocks consisting of convolution, and tri-oriented Mamba blocks \cite{xing2024segmambalongrangesequentialmodeling}.


Our model is designed to be computationally efficient while maintaining competitive performance in medical image segmentation. To achieve this, we developed a powerful encoder for efficient feature extraction, followed by a lightweight decoder to reconstruct the segmentation mask. The architecture consists of an encoder, a decoder, and skip connections between them \cite{ronneberger2015unetconvolutionalnetworksbiomedical}, as illustrated in Figure \ref{fig:conv_mamba_block}. %Following feature extraction of the encoder, an MLP block with Instance Normalization \cite{ulyanov2017instancenormalizationmissingingredient} is applied to normalize activations and stabilize training. Intermediate outputs are then directly passed to the decoder via skip connections, ensuring high-resolution feature retention for precise segmentation.


%The encoder utilizes encoder blocks that consist of downsampling layers, convolution mamba mixed blocks consisting of convolution, and tri-oriented Mamba blocks \cite{xing2024segmambalongrangesequentialmodeling}. Our lightweight decoder is then used to generate segmentation masks from encoder outputs.


% As modeling global features and multi-scale features is critically important for 3D medical image segmentation, the Tri-Orientated Mamba (ToM) module, as proposed by Xing et al. \cite{xing2024segmambalongrangesequentialmodeling}, addresses this requirement by computing feature dependencies in three distinct directions: forward (\(z_f\)), reverse (\(z_r\)), and inter-slice (\(z_s\)).

% The ToM module begins by flattening the 3D input features into sequences corresponding to each of these directions. This transformation enables the module to effectively capture sequential dependencies within the high-dimensional feature space. Subsequently, the Mamba layer is applied to each sequence to model global information comprehensively.

% %Specifically:

% \begin{itemize}
% \item \textbf{Forward Direction  (\(z_f\))}: Processes the features in sequential order, capturing dependencies from the beginning to the end of the sequence.
% \item \textbf{Reverse Direction (\(z_r\))}: Processes the features in reverse order, capturing dependencies from the end to the beginning.
% \item \textbf{Inter-Slice Direction (\(z_s\))}: Processes features across different slices, capturing dependencies between slices.
% \end{itemize}

% After independently processing each direction, the outputs are fused to obtain the final 3D feature representation. The fusion is achieved by summing the outputs of the Mamba layers applied to each direction:

% \[
% \text{ToM}(z) = \text{M}(z_f) + \text{M}(z_r) + \text{M}(z_s),
% \]

% where \( \mathbf{M}_f \), \( \mathbf{M}_r \), and \( \mathbf{M}_s \) denote the Mamba layers applied to the forward, reverse, and inter-slice directions, respectively.

% This approach ensures that comprehensive 3D feature interactions are captured, integrating global information from all three directions to enhance the segmentation performance of 3D medical images.

\subsection{Encoder}

To minimize the overall computational cost, particularly for the downstream decoder, we designed a powerful yet efficient encoder that extracts high-quality features. Our method integrates CNNs and Mamba blocks to effectively capture both local and global feature representations while maintaining computational efficiency. %Our approach combines CNNs and Mamba blocks to leverage both local and global feature representations. %In our Convolution Mamba  Mixed Block, at first features are extracted using convolution and then Mamba effectively processes low-level extracted features, enhancing feature representation. We incorporate skip connections to another Mamba block to capture global features directly. However, with this addition also, the computational complexity remains linear, which is still better than the quadratic complexity of the transformer.
The encoder is composed of four cascaded blocks, each designed to progressively downsample spatial dimensions while extracting multi-scale hierarchical features. Each block includes Downsampling Layers, convolution layers, and Convolution Mamba Mixed Blocks %, and Multi-Layer Perceptron (MLP) blocks \cite{haykin1994neural}
as shown in Figure \ref{fig:conv_mamba_block}a.

The encoder begins with an initial downsampling layer, which applies a Conv3D operation with a $7\times7\times7$ kernel, a stride of $2\times2\times2$, and padding of $3\times3\times3$. This larger receptive field allows for a comprehensive feature abstraction at the early stage. Next, a Convolution Mamba Mixed Block is applied to refine these features before further downsampling.
The remaining three downsampling layers use $2\times2\times2$ Conv3D kernels, maintaining a balance between feature granularity and computational efficiency. Additionally, an extra convolutional layer is introduced before these downsampling operations to preserve essential features.



\textbf{Convolution Mamba Mixed Block:}
We introduce the convolution mamba mixed block, which integrates convolutional operations and Tri-oriented Mamba (ToM) layers to achieve hierarchical feature extraction across multiple receptive fields as shown in Figure \ref{fig:conv_mamba_block}b.

It begins with a larger \( 5 \times 5 \times 5 \) convolution kernel that effectively reduces the spatial dimensions while extracting coarse-grained features. These features are further refined through a \( 3 \times 3 \times 3 \) convolution, which captures local contextual relationships.

A ToM Layer is then applied to this refined representation, enabling the abstraction of long-range dependencies and creating a more comprehensive understanding of the local context learned by the convolution filters. As shown in Figure \ref{fig:conv_mamba_block}c, the ToM module computes feature dependencies in
three distinct directions, forward (\(z_f\)), reverse (\(z_r\)), and inter-slice (\(z_s\)), obtained by flattening the 3D input features $F_2$, and sums the resulting outputs:
\[
\text{ToM}(z) = \text{Mamba}(z_f) + \text{Mamba}(z_r) + \text{Mamba}(z_s).
\]


To recover the original spatial resolution, we employ a \( 3 \times 3 \times 3 \) convolution and a \( 5 \times 5 \times 5 \) transposed convolution. A residual connection then generates the enhanced feature representation by summing the extracted feature map and the original feature map, which preserves the gradient flow and retains the original feature information. Finally, a second ToM layer further captures long-range dependencies across the enhanced feature representation. The complete flow of the Convolution Mamba Mixed Block is shown in Algorithm \ref{alg:net}.

% This feature processing paradigm enables our encoder to effectively model both local spatial correlations and global contextual relationships, addressing the fundamental limitations of purely convolutional or purely sequence-based approaches.





\begin{algorithm}
\caption{\textbf{Convolution Mamba Mixed Block}}
\begin{algorithmic}[1]
\State \textbf{Input:} Tensor \( X \in \mathbb{R}^{C \times D \times H \times W} \)
\State \textbf{Output:} Feature representation \( F_{\text{out}} \)

%\State \textbf{1. Feature Extraction:}
\State \( F_1 \gets \text{Conv}_{5 \times 5 \times 5}(X) \)
\State \( F_2 \gets \text{Conv}_{3 \times 3 \times 3}(F_1) \)

%\State \textbf{2. Long-range Dependency Modeling:}
\State \( F_3 \gets \text{ToM}(F_2) \)

%\State \textbf{3. Spatial Resolution Recovery:}
\State \( F_4 \gets \text{Conv}_{3 \times 3 \times 3}(F_3) \)
\State \( F_5 \gets \text{ConvT}_{5 \times 5 \times 5}(F_4) \)

%\State \textbf{4. Residual Connection and Enhancement:}
\State \( F_6 \gets F_5 + X \)
\State \( F_{\text{out}} \gets \text{ToM}(F_6) \)

\State \textbf{Return} \( F_{\text{out}} \)
\end{algorithmic}
\label{alg:net}
\end{algorithm}




%To reverse the spatial reduction process a $3\times3\times3$ convolution layer and a $5\times5\times5$ convolution layer are used which restore the feature map's spatial dimensions to their original size. Following that, the extracted features are added to the original features passed through a skip connection. The combined feature maps are then fed into another ToM layer to further capture long-range dependencies and refine the representation.


% The Convolution Mamba Mixed Block integrates convolutional layers and ToM layers to extract features across multiple scales. It starts with a $5\times5\times5$ convolution kernel, which effectively reduces spatial dimensions while capturing coarse-grained features. This is followed by a $3\times3\times3$ convolution layer, refining local details.

% To further enhance feature representation, a ToM layer is applied to the refined features, capturing long-range dependencies and enriching the local context learned by convolution filters. The ToM module computes feature dependencies in three distinct directions: forward (\(z_f\)), reverse (\(z_r\)), and inter-slice (\(z_s\)). This enables the model to capture sequential dependencies across high-dimensional feature space.

% After processing, a $3\times3\times3$ convolution layer and a $5\times5\times5$ convolution layer restore the original spatial dimensions. The output is then fused with the original features via a skip connection before passing through another ToM layer, further refining long-range feature dependencies.

% \textbf{Hierarchical Feature Extraction:}
% The encoder architecture comprises four cascaded blocks, each meticulously designed to extract hierarchical features from the input 3D medical image while progressively downsampling the spatial dimensions. Each block sequentially employs Downsampling Layers, convolution mamba mixed blocks, and Multi-Layer Perceptron (MLP) \cite{haykin1994neural} blocks.



% % As illustrated in Figure \ref{fig:Encoder Block}, 
% The encoder commences with a downsampling layer applied to the input 3D medical image. The initial downsampling layer employs a Conv3D layer with a larger kernel size of $7\times7\times7$, a stride of $2\times2\times2$, and padding of $3\times3\times3$. This expansive receptive field facilitated by the larger kernel aids in capturing more comprehensive contextual information during feature abstraction. Then onvolution mamba mixed block is used for feature extraction. 
% Subsequently, the remaining three downsampling layers utilize smaller Conv3D kernels with dimensions of $2\times2\times2$, a stride of $2\times2\times2$, and padding of $3\times3\times3$. This adjustment in kernel size balances feature extraction granularity with computational efficiency as the network progresses through deeper layers. We also added an additional convolutional layer before these downsampling operations to preserve essential features.
% After this, we apply an MLP block coupled with Instance Normalization \cite{ulyanov2017instancenormalizationmissingingredient} to normalize the activations and improve stability during training. Notably, these intermediate outputs are directly utilized in the decoder.

%\textbf{Hierarchical Feature Extraction:}
% The encoder is composed of four cascaded blocks, each designed to progressively downsample spatial dimensions while extracting multi-scale hierarchical features. Each block includes Downsampling Layers, Convolution Mamba Mixed Blocks, and Multi-Layer Perceptron (MLP) blocks \cite{haykin1994neural}.

% The encoder begins with an initial downsampling layer, which applies a Conv3D operation with a $7\times7\times7$ kernel, a stride of $2\times2\times2$, and padding of $3\times3\times3$. This larger receptive field allows for a comprehensive feature abstraction at the early stage. Next, a Convolution Mamba Mixed Block is applied to refine these features before further downsampling.
% The remaining three downsampling layers use $2\times2\times2$ Conv3D kernels, maintaining a balance between feature granularity and computational efficiency. Additionally, an extra convolutional layer is introduced before these downsampling operations to preserve essential features.

% Following feature extraction, an MLP block with Instance Normalization \cite{ulyanov2017instancenormalizationmissingingredient} is applied to normalize activations and stabilize training. Intermediate outputs are then directly passed to the decoder via skip connections, ensuring high-resolution feature retention for precise segmentation.

\subsection{Decoder}
% Our decoder is designed to be lightweight compared to other SOTA models like UNETR, SwinUNETR and SegMamba because the extracted features from the encoder is good. This lightweight decoder helps us to reduce computational complexity and memory usage.

Our decoder is intentionally designed to be lightweight compared to state-of-the-art models like UNETR, SwinUnetr, and SegMamba, to reduce the computational cost of the model. The responsibility of maintaining the model's performance rests on the uniquely designed, powerful yet efficient encoder. This efficient design reduces both computational complexity and memory usage while maintaining strong segmentation performance.

The decoder leverages both the encoded features from the encoder and the intermediate results from the encoding process. At each level, the decoder connects to the corresponding encoder layer through an MLP \cite{haykin1994neural} with Instance Normalization \cite{ulyanov2017instancenormalizationmissingingredient}, which normalizes activations and stabilizes training. To ensure high-resolution feature retention for precise segmentation, intermediate outputs after the MLP are directly passed to the decoder. 



%At each level the decoder is connected with encoder layer through an MLP layer. Following feature extraction of the encoder, an MLP block with Instance Normalization \cite{ulyanov2017instancenormalizationmissingingredient} is applied to normalize activations and stabilize training. Intermediate outputs are then directly passed to the decoder via skip connections, ensuring high-resolution feature retention for precise segmentation.



The decoder is structured with three distinct upsampling stages, designed to progressively refine and expand the spatial resolution of the features. The main input to the decoder has 768 channels.
At each stage, the feature map is upsampled and its channel count is halved. This process uses a $1\times1\times1$ convolution operation followed by an upsampling layer. Inside the upsampling layer, we use non-trainable linear interpolation from MONAI \cite{cardoso2022monai}.

Upsampled features are combined with the corresponding intermediate features received during the encoding process. Instead of concatenation, these intermediate features are summed with the corresponding upsampled features at each level. In this way, the computational complexity of the decoder is further reduced. The combined features are processed through a sequence of residual blocks. The residual block consists of the ReLU activation function, Group Norm, and convolution kernel of $3\times3\times3$. We use two of these blocks and a skip connection from the input of these residual blocks to get an output. This architecture combines efficient upsampling with skip connections and residual learning, allowing it to reconstruct detailed spatial information while maintaining the ability to learn complex features at multiple scales.
After getting the output from three decoder blocks we use a transposed convolution layer to get the final segmented output. This design is lightweight, being both memory and computation-efficient.

%\subsection{Skip Connections}
%Following feature extraction of the encoder, an MLP block with Instance Normalization \cite{ulyanov2017instancenormalizationmissingingredient} is applied to normalize activations and stabilize training. Intermediate outputs are then directly passed to the decoder via skip connections, ensuring high-resolution feature retention for precise segmentation.


\section{Experiments \& Results}

\subsection{Dataset and Implementation Details}





\textbf{3D Multi-organ Segmentation (BTCV Challenge):} The 3D Multi-organ Segmentation dataset from the BTCV Challenge \cite{Landman2015} focuses on the segmentation of 13 abdominal organs. The dataset comprises 30 volumetric images, with 24 volumes allocated for training and the remaining 6 reserved for testing and evaluation. Each volumetric image provides detailed 3D representations of abdominal structures, essential for medical imaging and diagnosis. The task involves accurately delineating each of the 13 specified organs within these scans.

\textbf{BraTS 2021:} The BraTS 2021 dataset \cite{baid2021rsna} comprises 1,251 multi-parametric magnetic resonance imaging (mpMRI) brain scans, each annotated with segmentation masks delineating tumorous regions. Each scan includes four modalities: Fluid Attenuated Inversion Recovery (FLAIR), native T1-weighted (T1), post-contrast T1-weighted (T1Gd), and T2-weighted (T2) images. Three recombined regions---the tumor core, the entire tumor, and the enhancing tumor---are used to quantify performance using 5-fold cross-validation.


\textbf{Spleen 3D Segmentation:} The Spleen 3D Segmentation dataset \cite{antonelli2022medical} focuses on segmenting spleens within portal-venous phase CT scans from patients undergoing chemotherapy treatment for liver metastases. The dataset consists of 61 volumetric CT scans, with 41 scans designated for training and the remaining 20 reserved for testing and evaluation. Each scan provides detailed 3D representations of abdominal anatomy, emphasizing the spleen and its surrounding structures during the portal-venous phase. The segmentation task involves accurately delineating the spleen, which is critical for assessing spleen-related conditions and treatment responses in oncology patients.



We used the Dice loss and the weighted ADAM optimizer for training. The Dice similarity coefficient was used for quantitative evaluation.
Our experiments used PyTorch with the MONAI \cite{cardoso2022monai} framework for model implementation. All experiments were conducted on a single NVIDIA A100 GPU (40GB).

\subsection{Results}

\begin{figure}[ht]
    \centering
    \includegraphics[width=.70\linewidth]{new_curve.png}
    \caption{Average Dice scores for the BTCV, Spleen, and BraTS 2021 datasets plotted against training memory (in GB) for different models, using an image size of $128\times128\times128$ for the BTCV and BraTS 2021 datasets and $96\times96\times96$ for the Spleen dataset, with batch size 1.}
    \label{fig:enter-peakmemoryl}
\end{figure}


Figure \ref{fig:enter-peakmemoryl} illustrates the relationship between the peak memory consumption during training and segmentation accuracy, measured by the average Dice score, across various models. It can be observed that our method uses comparatively less memory than other large models like SwinUnetr and SegMamba while still maintaining comparable performance.



% \begin{table}[ht]
%     \centering
%     \small % Reduce the font size
%     \begin{tabular}{ccc}
%     \hline
%     \textbf{Model} & \textbf{Num of Params (in million)}&\textbf{FLOP Count (in GMac)} \\ \hline
%     UNET         & 4.81 & 30.13 \\    
%     UNETR           & 102.45 & 203.29  \\  
%     SegMamba       & 67.4  & 1575.13 \\
%     SwinUnetr      & 62.19  & 792.08 \\  
%     SegResMamba    & 120.12  & 340.52 \\ \hline
%     \end{tabular}
%     \caption{Num of parameters (in million) and  FLOP count of different models (in GMac) for the BraTS dataset.}
%     \label{tab:flop_count}
% \end{table}




%\subsection{Multi Organ Segmentation}
Table \ref{tab:results_BTCV} shows that SegResMamba, while having a smaller memory footprint and fewer multiply-accumulate operations (MACs), achieves better or comparable performance on the BTCV dataset compared with other more memory-intensive and computationally expensive models. SegResMamba outperforms nnFormer \cite{zhou2021nnformer}, 3D UX-Net \cite{lee20223d}, and nnUNet \cite{isensee2021nnu,shaker2024unetr++} in computational cost, inference time, and segmentation performance.

% \begin{table}[ht]
%     \centering
%     \begin{tabular}{c c c}
%         \hline
%         \textbf{Model} & \textbf{Avg Dice (BTCV)} & \textbf{Avg Dice (Spleen)} \\ \hline
    
%         UNETR      & 0.8027 & 0.8642\\  
%         SegMamba   & 0.8430 & 0.9004\\  
%        % Segresnet  & 0.8155 & 0.7023\\  
%         UNET       & 0.8192 & 0.8195\\  
%         SwinUnetr  & 0.8389 & 0.9126\\ 
%         SegResMamba  & 0.8361 & 0.9147\\ \hline
%     \end{tabular}
%     \caption{Average Dice scores of different models on the BTCV and Spleen segmentation dataset.}
%     \label{tab:results_BTCV}
% \end{table}


% \begin{table}[ht]
%     \centering
%     \small % Reduce the font size
%     \begin{tabular}{ccc}
%         % First subtable
%         \begin{minipage}[t]{0.45\textwidth}
%         \centering
%         \begin{tabular}{ccc}
%             \hline
%             \textbf{Model} & \textbf{MACs} &\textbf{Avg Dice} \\ \hline
%             UNETR         & 196.03G & 0.8027 \\  
%             nnUNeT          & 1078.08   & 0.8316 \\
%             SegMamba      & 1554.86G & 0.8430 \\  
%             UNET          & 60.10G & 0.8192 \\  
%             SwinUnetr     & 784.46G & 0.8389 \\
%             nnFormer & - & 0.8162 \\
%             UNETR ++  & - & 0.8328 \\
%             SegResMamba   & 336.45G & 0.8361 \\ \hline
%         \end{tabular}
%         \caption{Average Dice scores of models on the BTCV dataset.}
%         \label{tab:results_BTCV}
%         \end{minipage}
%         &
%         % Second subtable
%         \begin{minipage}[t]{0.45\textwidth}
%         \centering
%         \begin{tabular}{ccc}
%             \hline
%             \textbf{Model} & \textbf{MACs} &\textbf{Avg Dice} \\ \hline
%             UNETR         & 82.52G & 0.8642 \\  
%             SegMamba      & 655.32G & 0.9004 \\  
%             UNET          & 11.53G & 0.8195 \\  
%             SwinUnetr     & 328.68G & 0.9126 \\  
%             SegResMamba   & 137.84G & 0.9147 \\ \hline
%         \end{tabular}
%         \caption{Average Dice scores of models on the Spleen dataset.}
%         \label{tab:results_Spleen}
%         \end{minipage}
%     \end{tabular}
% \end{table}



\begin{table}[ht]
    \centering
    %\small % Reduce the font size
    \begin{tabular}{cccc}
        \hline
        \textbf{Model} & \textbf{MACs} & \textbf{Inference Time (sec)} &\textbf{Avg Dice} \\ \hline
        UNETR         & 196.03G & 0.0531 & 0.8027 \\  
        SegMamba      & 1554.86G & 0.1693 & 0.8430 \\  
        UNET          & 60.10G & 0.0273 & 0.8192 \\  
        SwinUnetr     & 784.46G & 0.1343 & 0.8389 \\
        nnFormer      & 648.10G & 0.0958 & 0.8239 \\
        %UNETR ++      & - & 0.8328 \\
        nnUNet        & 1067.97G & 0.1668 & 0.8316 \\
        3D UX-Net & 1498.66G & 0.1338 & 0.8326 \\
        SegResMamba   & 336.45G & 0.0841 & 0.8361 \\ \hline
    \end{tabular}
    \caption{MACs, inference time, and average Dice scores of models on the BTCV dataset.}
    \label{tab:results_BTCV}
\end{table}


% \begin{table}[ht]
%     \centering
%     %\small % Reduce the font size
%     \begin{tabular}{|c|cccc|c|c|}
%     \hline
%     \textbf{Model} & \multicolumn{4}{c|}{\textbf{BraTS21 (Dice Score)}} & \textbf{BTCV} & \textbf{Spleen} \\ \cline{2-5}
%                    & \textbf{TC} & \textbf{WT} & \textbf{ET} & \textbf{Mean} & \textbf{Avg Dice} & \textbf{Avg Dice} \\ \hline
%     UNETR         & 0.8653 & 0.8708 & 0.8490 & 0.8617 & 0.8027 & 0.8642 \\  
%     SegMamba      & 0.8943 & 0.8962 & 0.8685 & 0.8863 & 0.8430 & 0.9004 \\  
%     UNET          & 0.8435 & 0.8637 & 0.8260 & 0.8444 & 0.8192 & 0.8195 \\  
%     SwinUnetr     & 0.8907 & 0.8970 & 0.8707 & 0.8861 & 0.8389 & 0.9126 \\  
%     SegResMamba   & 0.8953 & 0.8958 & 0.8605 & 0.8839 & 0.8361 & 0.9147 \\ \hline
%     \end{tabular}
%     \caption{Comparison of Dice scores for different models on the BraTS21, BTCV, and Spleen segmentation datasets. BraTS21 dataset scores include Dice scores for Tumor Core (TC), Whole Tumor (WT), Enhancing Tumor (ET), and Mean Dice.}
%     \label{tab:merged_results}
% \end{table}



Brain tumor segmentation performance of different SOTA models is shown in Table \ref{tab:results_brats21}. SegResMamba achieves a competitive mean Dice score of 0.8839, which is comparable to models like SwinUNETR (0.8861) and SegMamba (0.8863). At the same time, SegResMamba operates with significantly fewer MACs, making it a more computationally efficient model. This lower computational cost, combined with its strong performance, highlights SegResMamba as an excellent choice for scenarios requiring a balance between accuracy and resource efficiency.

\begin{table}[ht]
    \centering
    \small % Reduce the font size
    \begin{tabular}{cccccc}
    \hline
        \textbf{Model} & \textbf{MACs} & \textbf{Mean Dice} & \textbf{Dice TC} & \textbf{Dice WT} & \textbf{Dice ET} \\ \hline
        UNETR      & 203.29G & 0.8617 & 0.8653 & 0.8708 & 0.8490 \\  
        SegMamba   & \textbf{1575.13G} & 0.8863 & 0.8943 & 0.8962 & 0.8685 \\ 
       % Segresnet  & 0.8783 & 0.8930 & 0.8825 & 0.8594 \\  
        UNET      & 30.13G & 0.8444 & 0.8435 & 0.8637 & 0.8260 \\  
        SwinUnetr  & 792.08G & 0.8861 & 0.8907 & 0.8970 & 0.8707 \\ 
        SegResMamba & 340.52G & 0.8839 & 0.8953 & 0.8958 & 0.8605 \\  \hline
    \end{tabular}
    \caption{Mean Dice scores of different models on the BraTS21 dataset for 5-fold cross-validation. Dice TC, Dice WT, and Dice ET represent the Dice scores for Tumor Core, Whole Tumor, and Enhancing Tumor, respectively.}
    \label{tab:results_brats21}
\end{table}



\begin{table}[ht]
    \centering
    %\small % Reduce the font size
    \begin{tabular}{ccc}
        \hline
        \textbf{Model} & \textbf{MACs} &\textbf{Avg Dice} \\ \hline
        UNETR         & 82.52G & 0.8642 \\  
        SegMamba      & 655.32G & 0.9004 \\  
        UNET          & 11.53G & 0.8195 \\  
        SwinUnetr     & 328.68G & 0.9126 \\  
        SegResMamba   & 137.84G & 0.9147 \\ \hline
    \end{tabular}
    \caption{Average Dice scores of models on the Spleen dataset.}
    \label{tab:results_Spleen}
\end{table}


In the spleen segmentation task, as shown in Table \ref{tab:results_Spleen}, the SegResMamba network achieved the highest average Dice score of \(\mathbf{0.9147}\), outperforming UNETR (0.8642), SegMamba (0.9004), UNET (0.8195), and SwinUNETR (0.9126). This highlights SegResMamba's superior performance compared to transformer-based models on a small dataset.



\begin{table}[ht]
    \centering
    \begin{tabular}{c c c}
        \hline
        \textbf{Exp} & \textbf{Model}& \textbf{Avg Dice (BTCV)} \\ \hline
        
        1 & SegMamba Encoder + ResNet-based Decoder & 0.8164\\ 
        2 & Exp. 1 + Convolution Mamba Mixed Block & 0.8279\\ 
        3 & Exp. 2 + Additional Conv before downsampling & 0.8361 \\ \hline
        
    \end{tabular}
    \caption{Average Dice scores of different setups on the BTCV dataset.}
    \label{tab:experiment_results}
\end{table}

%While the Mamba architecture effectively abstracts information from longer contexts, convolution remains key to the success of segmentation methods.
To investigate the contribution of various components in our model, we conducted an ablation study on the BTCV dataset, with results shown in Table \ref{tab:experiment_results}. In the first experiment, a Mamba encoder proposed by SegMamba \cite{xing2024segmambalongrangesequentialmodeling} was paired with a lightweight ResNet-based decoder. This helps us to reduce computational complexity and improve memory efficiency. Next, we replaced the TSMamba block (Global Spatial Context (GSC) and ToM) used in the SegMamba encoder with our convolution mamba mixed block to enhance feature extraction of the encoder. This modification leverages both local representation through convolution and global representation via the mamba layer and improves the segmentation performance by 1.15\%. Finally, we added a convolutional layer before the downsampling operation to preserve essential features. When combined with the improvements from Experiments 1 and 2, this experiment led to significant performance gains, increasing the Dice score on the BTCV dataset from 0.8164 to 0.8361.
To investigate the effectiveness of the ToM layer, we performed an ablation study with and without the ToM layer; the results are shown in Table \ref{tab:ablation2}. The 1.27\% improvement on the BTCV dataset with the ToM layer shows the effectiveness of the Mamba module.

\begin{table}[ht]
    \centering
    \begin{tabular}{c c}
        \hline
        \textbf{Model}& \textbf{Avg Dice (BTCV)} \\ \hline
        
         Ours without ToM & 0.8234\\ 
         Ours with ToM (SegResMamba) & 0.8361\\ 
          \hline
        
    \end{tabular}
    \caption{Average Dice scores of different setups with and without ToM on the BTCV dataset.}
    \label{tab:ablation2}
\end{table}


 %This significant reduction in memory usage can help train and deploy these methods on ordinary hardware. %Memory usage for our model can be further reduced depending on the \texttt{num\_slices} parameter used in Convolution Mamba block. In our experiments, reducing it does not affect the performance much. For the above graphs we used  \texttt{num\_slices} as 32 for all four appearances of the ToM layer. 

\begin{figure}[h]
    \centering
    \includegraphics[width=.70\linewidth]{new_curve_colored.png}
    \caption{Mean Dice score on the BraTS dataset against CO$_2$ emissions for different models under 5-fold cross-validation.}
    \label{fig:co2emission}
\end{figure}
 

Furthermore, Figure \ref{fig:co2emission} illustrates the relationship between CO$_2$ emission and segmentation accuracy for brain tumor segmentation with 5-fold cross-validation across various models. These estimations are based on training time and were conducted using Amazon Web Services in region eu-central-1, which has a carbon efficiency of 0.61 kgCO$_2$eq/kWh. The cumulative training computation was performed on hardware of type A100 PCIe 40GB (TDP of 250W). Estimations were conducted using the Machine Learning Impact calculator presented in \cite{lacoste2019quantifying}.
Among the high-performing models, SegResMamba demonstrates a notable advantage by achieving a balance between environmental efficiency and segmentation performance. Specifically, SegResMamba exhibits significantly lower CO$_2$ emissions compared to other high-performing models such as SwinUNETR and SegMamba, while maintaining a comparable dice score. Furthermore, when compared to UNET and UNETR, SegResMamba achieves superior segmentation accuracy without a substantial increase in CO$_2$ emissions, highlighting its efficiency and effectiveness.





\section{Discussion}

The experimental results demonstrate that SegResMamba is a robust and efficient model for 3D medical image segmentation tasks. It consistently delivers competitive performance across datasets while significantly reducing memory consumption and computational costs compared to state-of-the-art models like SwinUNETR and SegMamba. 
%The reduced training memory requirements make this model an excellent choice for efficient operation on lower-specification hardware, particularly in clinical settings.
The model's design prioritizes memory efficiency without compromising segmentation accuracy. The reduced training memory requirements make this model an excellent choice for training and deployment on less resource-intensive hardware. 

%To enhance feature extraction in the encoder, we introduced the Convolution Mamba Mixed Block as a replacement for the TSMamba block in SegMamba. Our ablation study (Table \ref{tab:experiment_results}) shows that using an encoder with this block improves segmentation performance. Additionally, we designed a lightweight decoder compared to SegMamba, further reducing computational cost and memory usage.




In terms of computational complexity, SegResMamba requires only 340.52 GMACs for the BraTS21 dataset (Table \ref{tab:results_brats21}), a significant improvement over SegMamba (1575.13 GMACs) and SwinUNETR (792.08 GMACs). Despite its lightweight design, SegResMamba maintains a competitive mean Dice score of 0.8839, only 0.24\% and 0.22\% less than SegMamba and SwinUNETR, respectively. This demonstrates the model's ability to achieve high segmentation accuracy while remaining computationally efficient, which makes it more suitable for deployment in energy-sensitive situations.
% SegResMamba also performed similarly on the BTCV dataset. In the case of spleen segmentation, SegResMamba outperformed all other models and achieved an average dice score of 0.9147.

SegResMamba's performance across datasets further highlights its versatility. On the BTCV dataset, the model achieves Dice scores comparable to memory-intensive counterparts like SegMamba and SwinUNETR (Table \ref{tab:results_BTCV}), while attaining the highest Dice score of 0.9147 on the spleen segmentation task (Table \ref{tab:results_Spleen}). These results emphasize its effectiveness in addressing diverse segmentation challenges.

 
% Environmental efficiency is another key aspect of the proposed model. SegResMamba exhibits significantly lower CO$_2$ emissions compared to other SOTA models during training. This further underscores its practicality and alignment with environmentally conscious AI practices.


Environmental efficiency is another key aspect of the proposed model. SegResMamba demonstrates significantly lower CO$_2$ emissions compared to other SOTA models during training due to reduced memory and computational requirements. This aligns with sustainable AI practices, promoting the development of energy-efficient models that minimize environmental impact without compromising performance.

While SegResMamba demonstrates substantial advantages, there are a few limitations to consider. First, its segmentation performance, although competitive, is marginally lower than that of other high-performing models like SwinUNETR and SegMamba, as observed on the BraTS and BTCV datasets. This slight trade-off may be a consideration for applications where peak accuracy is critical. Another limitation is that the training and evaluation were performed on three datasets with well-defined segmentation tasks; performance on more challenging, larger, or less-structured datasets remains to be explored.
% A key advantage of our model is its architectural efficiency, leading to a smaller memory footprint compared to models like SegMamba and SwinUNETR. While SwinUNETR requires more advanced GPUs (e.g., NVIDIA A100) due to memory constraints, our model runs efficiently on the V100. %Despite being lightweight, our model delivers competitive performance, evaluated using the Dice similarity coefficient for various medical image segmentation tasks. Detailed comparisons with other state-of-the-art models follow.


\section{Conclusion}
% The development of SegResMamba represents a significant stride towards enhancing the segmentation of 3D medical images. Addressing challenges posed by traditional Transformer-based models, SegResMamba strikes a balance between training efficiency and high performance. By integrating Mamba's capabilities with convolutional layers, the model effectively manages global context and local features, crucial for accurate medical diagnostics segmentation tasks.

% SegResMamba's efficiency gains are particularly notable in real-world applications where memory limitations are critical. It achieves comparable or superior performance to state-of-the-art models while reducing memory overhead, making it highly suitable for deployment in clinical settings. This capability ensures that medical professionals can access advanced segmentation technology without compromising practical implementation resources. For future work, we can further enhance computational efficiency and scalability, thereby broadening its applicability across different modalities and datasets. Additionally, ongoing exploration into novel training strategies and data augmentation techniques will continue to push the boundaries of segmentation accuracy and generalization capabilities.

SegResMamba marks a significant advancement in 3D medical image segmentation, balancing efficiency and performance by combining Mamba's global context modeling with convolutional layers for local feature extraction. Its reduced memory overhead,  along with improved computational and training efficiency, makes it well-suited for real-world clinical applications, delivering excellent results while remaining resource-efficient. Future work will focus on exploring new training strategies and data augmentation to further enhance segmentation accuracy and generalization across various datasets.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{We thank a bunch of people.}


\bibliography{midl25_111}


\appendix


% For comparison of memory usage, we used images of shape $128\times128\times128$ and batch size 1 with input channel 1  and output channel 14. We used Adam optimizer for the training. Figure \ref{fig:enter-peakmemorylapp} shows the comparison of memory usages during training of our model vs other state-of-the-art models. 
% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=.8\linewidth]{memory_usage_training.png}
%     \caption{Comparison of Memory usage during Training for SOTA models}
%     \label{fig:enter-peakmemorylapp}
% \end{figure}

% From Figure \ref{fig:enter-peakmemoryl} it can be understood that our method use comparatively less memory than other large models like Swin Unetr and SegMamba while still maintaining comparable performance.
% This is also reflected in the inference graph shown in Figure \ref{fig:inference} where our model requires less than half memory the memory usage of SegMamba and almost one third of SwinUnetr.
% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=.8\linewidth]{memory_usage_inference.png}
%     \caption{Comparison of Memory usage during Inference for SOTA models}
%     \label{fig:inference}
% \end{figure}

\clearpage

\section{Additional Implementation Details}

\subsection{Brain Tumor Segmentation}
The BraTS2021 dataset \cite{baid2021rsna} was used for brain tumor segmentation to compare the performance across multiple folds for the SOTA models. We trained with 5-fold cross-validation for 200 epochs, utilizing strategies like learning rate scheduling with CosineAnnealing, the Adam optimizer with a weight decay of 1e-5, and gradient scaling. We used the Dice metric for evaluation and Dice loss as the loss function. Various dataset transforms like foreground cropping, random spatial cropping, random flip with probability 0.5 in each direction, and random intensity scaling were used.

\subsection{Multi-organ Segmentation}
We conducted experiments on the BTCV dataset for multi-organ segmentation \cite{Landman2015}. The training process ran for 25,000 steps. We utilized the Adam optimizer with a learning rate of 1e-4 for our experiments. Our data transformations included scaling intensity range, orientation adjustment (Orientationd), foreground cropping (CropForegroundd), and spacing adjustment (Spacingd). To optimize the model's performance, we employed DiceLoss as the loss function and evaluated using the dice metric for validation.

\subsection{Spleen Segmentation}
For the spleen segmentation task, we used the spleen 3D segmentation dataset \cite{antonelli2022medical} and we limited training to 100 epochs. Extending the training to larger epoch numbers, such as 200, results in overfitting due to the relatively small size of the dataset compared to larger datasets like BraTS. Following a similar approach to the aforementioned tasks, we used the Adam optimizer with a learning rate of 1e-4. Transformations like scaling intensity range, normalizing the orientations of images, foreground cropping, and spacing adjustment were used. DiceLoss was used as the loss function and the Dice metric as the metric for validation.

\section{Training Time and CO$_2$ Footprint}

% Figure \ref{fig:co2emission} experiments were conducted using Amazon Web Services in region eu-central-1, which has a carbon efficiency of 0.61 kgCO$_2$eq/kWh. A cumulative training hours of computation was performed on hardware of type A100 PCIe 40/80GB (TDP of 250W).

% Estimations were conducted using the Machine Learning Impact calculator presented in \cite{lacoste2019quantifying}.

\begin{table}[ht!]
\centering
\begin{tabular}{ccccccc}
\hline
\textbf{Model} & \textbf{Epoch Time} & \textbf{Total Time} & \textbf{5-Fold Time} & \multicolumn{3}{c}{\textbf{CO$_2$ Emissions (kg)}} \\ 
& (in sec) & (in hours) & (in hours) & Azure &  Google & Amazon\\ \hline

UNETR       & 262.83 & 14.60 & 73.01 & 10.40 & 11.32 & 11.13 \\ 
SegMamba    & 321.50 & 17.86 & 89.31 & \textbf{12.73} & \textbf{13.84} & \textbf{13.62} \\ 
UNET        & 255.80 & 14.21 & 71.06 & 10.13 & 11.01 & 10.84 \\
SwinUNETR   & 321.39 & 17.85 & 89.28 & 12.72 & 13.84 & 13.61 \\ 
%Segresnet   & 258.94 & 14.39 & 71.93 & 10.25 & 11.15 & 10.97 \\ \hline
SegResMamba & 267.83 & 14.88 & 74.40 & 10.60 & 11.53 & 11.35 \\ 
\hline
\end{tabular}
\caption{Comparison of models in terms of training time and CO$_2$ emissions across different cloud providers for 5-fold cross-validation training on the BraTS dataset.}
\label{tab:co2}
\end{table}


A detailed comparison of CO$_2$ emissions across different cloud providers, including Amazon Web Services, Google Cloud, and Azure, for 5-fold training of the BraTS dataset, is presented in Table \ref{tab:co2}. It is important to note that these values represent only the emissions from 5-fold training; incorporating hyperparameter optimization would result in significantly higher CO$_2$ emissions. These estimations were conducted using the Machine Learning Impact calculator presented in \cite{lacoste2019quantifying}.


\section{Models Configuration}
Table \ref{tab:params_details} presents the model configurations, including the number of parameters (in millions) and FLOPs for each method for the BTCV dataset. Our proposed model, SegResMamba, has 188.42G FLOPs, striking a balance between computational efficiency and performance compared to more complex architectures like SwinUNETR, SegMamba and 3D UX-Net.

\begin{table}[ht]
    \centering
    %\small % Reduce the font size
    \begin{tabular}{ccc}
        \hline
        \textbf{Model} & \textbf{Num of Params (in million)} & \textbf{FLOPs} \\ \hline
        UNETR         & 93.01 & 177.44G  \\  
        SegMamba      & 67.36 & 1443.96G  \\  
        UNET          & 4.89 & 25.93G  \\  
        SwinUnetr     & 62.19 & 767.23G  \\
        nnFormer      & 149.32 & 426.74G  \\
        %UNETR ++      & - & 0.8328 \\
        nnUNet        & 31.19 & 480.06G  \\
        3D UX-Net & 53.01 & 1373.75G  \\
        SegResMamba   & 119.98 & 188.42G \\ \hline
    \end{tabular}
    \caption{Number of parameters (in millions) and FLOPs of each method on the BTCV dataset.}
    \label{tab:params_details}
\end{table}

\section{Memory Efficiency}

A comparison of the training memory of different models is shown in Table \ref{table:models_memory} (using image size $128 \times 128 \times 128$ for the BTCV and BraTS datasets and $96 \times 96 \times 96$ for the Spleen dataset, with batch size 1).

\begin{table}[ht]
\centering
\begin{tabular}{cccc}
\hline
\textbf{Model} & \textbf{BTCV (GB)} & \textbf{Spleen (GB)} & \textbf{BraTS (GB)} \\ \hline
\textbf{UNETR}  & 3.08               & 0.14                 & 3.02               \\  
\textbf{SegMamba} & 13.51             & 5.68                 & 13.44              \\ 
\textbf{UNET}    & 1.42               & 0.48                 & 1.13 
\\
\textbf{SwinUNETR} & 7.77             & 3.21                 & 7.68               \\ 
\textbf{SegResMamba} & 5.10           & 2.22                 & 4.78               
\\ \hline
\end{tabular}
\caption{Training memory (in GB) for different models on BTCV, Spleen, and BraTS datasets.}
\label{table:models_memory}
\end{table}





% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
