\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{caption}
\usepackage{lipsum}
\usepackage{wrapfig}


\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2025}
\title[MAIS: Memory-Attention for Interactive Segmentation]{MAIS: Memory-Attention for Interactive Segmentation}

\midlauthor{\Name{Mauricio Orbes-Arteaga \nametag{$^{1}$}}  \Email{henry.m.orbes$\_$arteaga@kcl.ac.uk}\AND
\Name{Oeslle Lucena \nametag{$^{1}$}} \Email{oeslle.lucena@kcl.ac.uk}\AND
\Name{Sebastien Ourselin \nametag{$^{1}$}} \Email{sebastien.ourselin@kcl.ac.uk}\AND
\Name{M. Jorge Cardoso \nametag{$^{1}$}} \Email{m.jorge.cardoso@kcl.ac.uk}\\
\addr $^{1}$ King's College London, London, UK
}
\begin{document}
\maketitle
\begin{abstract}
Interactive medical segmentation reduces annotation effort by refining predictions through user feedback. Vision Transformer (ViT)-based models, such as the Segment Anything Model (SAM), achieve state-of-the-art performance using user clicks and prior masks as prompts. However, existing methods treat interactions as independent events, leading to redundant corrections and limited refinement gains. We address this by introducing \textbf{MAIS}, a \textbf{M}emory-\textbf{A}ttention mechanism for \textbf{I}nteractive \textbf{S}egmentation that stores past user inputs and segmentation states, enabling temporal context integration. Our approach enhances ViT-based segmentation across diverse imaging modalities, achieving more efficient and accurate refinements.
\end{abstract}
\begin{keywords}
 Interactive Segmentation, Vision Transformers, Foundation Models, SAM.
\end{keywords}

\section{Introduction}
\begin{wrapfigure}{r}{0.5\textwidth}
    \centering
    \includegraphics[trim={2mm 3mm 2mm 2mm},clip,width=0.5\textwidth]{images/improved_introduction_plot.png }  % Adjust width or height as needed
    % \captionsetup{justification=justified, singlelinecheck=false} % Removes indentation
    \captionsetup{font=scriptsize, justification=justified, singlelinecheck=false} 
    % \captionsetup{font=tiny, justification=justified, singlelinecheck=false} 
    \caption{Segmentation accuracy (Dice $\%$) vs.\ number of user interactions on \textbf{HaN-Seg}~\cite{Han-seg}. SAM-Med3D (3D medical-specific) shows poor zero-shot performance and, even when fine-tuned, plateaus after a few clicks, revealing limited task specificity. Our method (MAIS:Ft), leveraging memory of past interactions, sustains improvement with increasing clicks, approaching Oracle performance even with only 10$\%$ of the training data.}
    \label{fig:itroduction_fig}
\end{wrapfigure}
Automated segmentation has transformed medical imaging, enabling fast delineation of anatomical structures and pathologies. Deep learning models \cite{isensee2021nnu,totalsegmentator,hatamizadeh2022unetr,hatamizadeh2021swin} perform well on specialized tasks with large, annotated datasets but struggle with anomalies, where high variability in shape, texture, and location demands robust generalization \cite{diaz2024monai}. Additionally, their reliance on extensive labeled data makes them costly and labor-intensive in clinical settings.

To address these limitations, interactive segmentation frameworks integrate human feedback—such as corrective clicks or scribbles—to iteratively refine predictions \cite{du2023segvol,diaz2024monai}. Vision Transformer (ViT)-based architectures, such as the Segment Anything Model (SAM) \cite{kirillov2023segment}, have emerged as powerful tools for this purpose due to their prompt-driven design and zero-shot capability. Recent efforts, such as fine-tuning SAM on medical images \cite{ma2024segment,li2024promise,gong20233dsam,wang2024sam}, demonstrate promising results. However, a critical bottleneck persists: most implementations process 3D volumes slice-by-slice in 2D, significantly increasing clinician effort during inference \cite{mazurowski2023segment,cheng2023sam}.

Efforts to extend SAM to 3D medical imaging face inherent challenges. For instance, SAM2 \cite{ravi2024sam}, originally designed for video segmentation, treats 3D scans as stacks of 2D slices, akin to video frames. This approach assumes temporal consistency between adjacent slices—a flawed premise in medical imaging, where anatomical cross-sections often exhibit abrupt spatial variations despite representing contiguous structures. Subsequent adaptations like Sam3D \cite{bui2024sam3dsegmentmodelvolumetric} attempt to mitigate this by extracting 3D features slice-wise using SAM encoders and aggregating them via lightweight 3D decoders. While this improves efficiency, segmentation quality remains suboptimal, particularly in heterogeneous regions like tumors. Domain-specific architectures such as SAM-Med3D \cite{wang2024sammed3dgeneralpurposesegmentationmodels}, trained on large-scale medical datasets, enhance zero-shot capabilities but still lack task-specific precision, often requiring excessive user prompts to achieve clinically acceptable results (see Figure~\ref{fig:itroduction_fig}).

A key limitation of existing approaches is their treatment of user interactions as isolated events. Models like SAM-Med3D handle each correction independently, disregarding past interactions once the mask is updated. This lack of memory leads to redundant corrections, diminishing returns in refinement, and eventual performance plateaus—even with fine-tuning. Our preliminary experiments (see Figure~\ref{fig:itroduction_fig}) with off-the-shelf SAM-Med3D show that while early interactions enhance segmentation, improvements taper off after a certain point, highlighting the need for models that more effectively incorporate historical context. We hypothesize that incorporating temporal context from past user interactions and segmentation states can overcome these limitations. To this end, we propose a memory-attention mechanism that dynamically stores and retrieves embeddings from sparse user prompts (clicks) and dense segmentation masks. By conditioning predictions on both current inputs and a memory bank of prior interactions, our model enables coherent, incremental refinements across user sessions. Built on top of  SAM-Med3D, our method retains the benefits of foundation models—including zero-shot adaptability—while addressing their shortcomings in interactive settings.

The contributions of this work are threefold: (1) A memory-attention mechanism that integrates temporal context into interactive segmentation, enabling robust refinement across user interactions; (2) A lightweight, modular architecture compatible with existing ViT-based frameworks, requiring minimal computational overhead; and (3) Comprehensive validation across multiple modalities and anatomical regions, highlighting the model’s adaptability and superiority in low-data scenarios.    

\section{Methodology}

\subsection{Memory-attention for interactive segmentation (MAIS)}
Our method uses the ViT-based SAM-Med3D model as the backbone for 3D interactive segmentation \cite{wang2024sammed3dgeneralpurposesegmentationmodels}. In this framework, a mask decoder processes image embeddings and user prompts to generate 3D segmentation masks. Inspired by SAM2 \cite{ravi2024sam}, we introduce a memory attention mechanism that conditions the output not only on the current prompts but also on memories of past predictions and interactions. Note that in this work, we use the same architecture as SAM-Med3D for the image encoder, prompt encoders, and mask decoder. However, the proposed memory attention mechanism is designed to work with various architectures for these components. Figure~\ref{fig:Method scheme} illustrates the proposed model, with key components described as follows.
% \begin{figure}[htbp]
\begin{figure}[!htb]
    \centering
    \includegraphics[trim={4mm 8mm 3mm 25mm},clip,width=.9\textwidth]{images/Method_scheme.png}  % Adjust width or height as needed
    % \todo{maybe to include this description in the text above}
    \caption{Overview of the proposed segmentation model incorporating memory attention: A 3D image encoder processes the input image generating a 3D embedding. Prompt encoders transform user interactions—positive or negative 3D coordinates—and previous mask predictions into prompt embeddings, which are stored in a memory bank for future interactions. The memory attention mechanism conditions the image embedding on the stored memory before passing it to the mask decoder, which produces the final segmentation output.}
    \label{fig:Method scheme}
\end{figure}

\subsubsection{3D Image Encoder and Prompt Encoders} 
The image encoder is a 3D adaptation of the SAM encoder, originally based on ViT. It replaces the 2D convolutional layers with 3D convolutions to process input images and extends the 2D positional encoding by adding an extra dimension. Additionally, a 3D relative positional encoding (PE) is integrated into the 3D attention blocks. During training and inference, the image encoder processes each image once before prompting the model.
 
 As with most interactive segmentation architectures, we use two types of prompts:
\textbf{Sparse prompts:} These consist of 3D coordinates derived from point clicks, which can be positive (indicating addition) or negative (indicating deletion). \textbf{Dense prompts:} These are 3D masks that represent the current state of the segmentation. Sparse prompts are represented by positional encodings combined with learned embeddings, whereas dense prompts are embedded using convolutional layers.

\subsubsection{Memory Bank} 
\label{sec:memory_bank}
The memory bank is designed to store information about past predictions and user interactions. After each user interaction, a new memory, composed of sparse and dense embeddings, is created and added to the memory bank. The bank operates as a first-in, first-out (FIFO) queue, retaining the latest \textit{N} memories. Unlike SAM2, we omit the use of a memory encoder for memory creation. Preliminary experiments show that omitting the memory encoder reduces computational complexity while maintaining performance. The memory bank can take one of three forms: \textbf{Sparse Memory:} this memory bank consists only of click embeddings, meaning that the image embedding is conditioned solely on these sparse prompts through cross-attention; \textbf{Dense Memory:} this memory bank consists only of previous mask embeddings, with the image embedding conditioned on these dense prompts by incorporating the self-attended output of the dense memories; or \textbf{Sparse + Dense:} the proposed method (Figure~\ref{fig:Method scheme}), where the memory bank integrates both click and previous mask embeddings. This is the final memory configuration used in MAIS.

\subsubsection{Memory Attention} 
The memory attention block conditions the image embedding based on both the memory bank and the current interaction. It first performs self-attention on the sparse and dense memory stacks, generating a dense output, which is then added to the image embedding. Next, a cross-attention is computed between both memory types. To perform self-attention on the dense prompt stack, we employ a Convolutional Transformer block, ensuring computational efficiency while maintaining effective attention processing.


\subsubsection{Mask Decoder} 
The mask decoder receives memory-conditioned image embeddings along with the prompt encodings and outputs the segmentation mask. It consists of a stack of ``Two-way'' Transformer blocks that apply both self-attention and cross-attention to contextualize the prompt tokens with the memory-conditioned image embeddings. (Note that during the first interaction, the image embedding is unconditioned on memory and functions as in the original SAM-Med3D model.) Finally, transposed 3D convolutional layers are used for 3D upscaling, and an MLP is employed at the end to output the segmentation mask.


\subsection{Datasets}
\label{sec:dataset}
We conducted experiments on publicly available medical imaging datasets, including Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) scans. These datasets contain annotations for multiple anatomical regions, such as the abdomen, head, neck, and heart. Each dataset was split into training (70$\%$) and testing (30$\%$) sets. Notably, none of the images in these datasets were used during the pre-training of SAM-Med3D.

Four medical imaging datasets were used: ACDC~\cite{ACDC} (MR, $70$ training, $30$ testing, $4$ heart classes), HaN-Seg~\cite{Han-seg} (CT, $28$ training, $13$ testing, $30$ head and neck classes),  and the validation set of AMOS~\cite{ji2022amos}, which we split into AMOS-CT (CT, $70$ training, $30$ testing, $15$ abdomen classes) and AMOS-MR (MR, $14$ training, $6$ testing, $15$ abdomen classes).

\subsection{Experimental Design}


We designed experiments to evaluate the impact of the proposed memory-attention module on interactive segmentation performance and its effect on the few-shot learning potential of foundation models. To investigate this, we adopted a two-stage approach. First, we analyzed the effect of memory bank size and memory embedding types (sparse vs. dense) on segmentation efficacy (Subsection~\ref{sec:ablation_studies}). Second, we conducted extensive external validation across multiple datasets and tasks of varying complexity to assess the robustness and generalizability of our approach (Subsection~\ref{sec:segmentation_performance}).

In these experiments, MAIS leverages pretrained weights from SAM-Med3D \cite{wang2024sammed3dgeneralpurposesegmentationmodels}, with the image encoder frozen to preserve foundational feature extraction capabilities. During training, only the prompt encoder and mask decoder are fine-tuned, while the memory attention module is trained from scratch in parallel. We simulate sparse visual prompts by sampling them from regions where previous predictions were incorrect. This approach mirrors human correction behavior, enabling the model to focus on areas requiring refinement.

Training models from scratch on large datasets is outside the scope of this work, as it requires substantial computational resources. Furthermore, while such models can gain representational capabilities, they may lose specificity for certain applications. Instead, we are interested in improving the ability of generic interactive models to adapt to unseen datasets where data is limited.

\subsubsection{Memory Bank and Prompt Type Analysis}
\label{sec:ablation_studies}
To investigate key design choices, we conducted experiments on the \textbf{HaN-Seg} dataset, focusing on the ``Cavity Oral'' class. These experiments assessed how the memory bank size and the type of memory embeddings (sparse vs.\ dense) influence segmentation performance.

\textbf{Impact of Memory Bank Size:}
We examined whether increasing the memory bank size improves the model’s ability to leverage past interactions for better segmentation and how performance scales with additional memory capacity. To this end, we trained \textbf{MAIS} with varying memory bank sizes, storing N = 10, 20, 30, 40, 50, 60 and 80 embeddings. The model refined its predictions iteratively using user-provided clicks, accumulating information at each step. Segmentation accuracy was measured using the Dice score ($\%$) at each iteration. Additionally, we included an oracle baseline (nn-UNet) to establish the upper bound of achievable performance.

\textbf{Sparse vs.\ Dense Memory Embeddings:}
To evaluate the influence of different prompt types in constructing the memory bank, we compared three variations of our method, as described in Section~\ref{sec:memory_bank}. Specifically, we assessed the impact of using sparse (click-based) memory, dense (mask-based) memory, and their combination. A version of \textbf{MAIS} removing the memory attention block (no memory) was included as a baseline for comparison. The models were   trained on varying numbers of images (1, 2, 4, and 28) and evaluated using different numbers of user clicks (5, 10, 20, and 50).

\subsubsection{Segmentation Performance}
\label{sec:segmentation_performance}
We evaluated segmentation performance across the four datasets (Section~\ref{sec:dataset}) by fine-tuning models with varying amounts of training samples and assessing their performance on validation sets. The following models were compared:
\textbf{MAIS:} The proposed model (Figure~\ref{fig:Method scheme}) with a memory size of N = 60, using both sparse and dense embeddings.  
\textbf{Ft-SAM3D:} A variant of the model in Figure~\ref{fig:Method scheme} with the memory attention module removed, effectively reducing it to a fine-tuned version of vanilla \textbf{SAM-Med3D}. Similar to the \textbf{MAIS} model, only the mask decoder and prompt encoder parameters were updated during fine-tuning. Furthermore, we test two baseline models for performance comparison: \textbf{Oracle:} \textbf{nn-UNet}~\cite{isensee2021nnu} trained from scratch on target data, providing a reference for state-of-the-art performance in task-specific segmentation. \textbf{SAM3D (Zero-shot):} The off-the-shelf \textbf{SAM-Med3D} model was used to segment the validation sets directly, serving as a benchmark for zero-shot performance of foundation segmentation models on unseen data.

\section{Results}
\subsection{Memory Bank and Prompt Type Analysis}

\textbf{Impact of memory bank size:}
Figure~\ref{fig:ablation_study}-a illustrates the effect of memory size on segmentation performance. The results demonstrate that increasing memory size generally improves performance, with larger memory bank sizes (over $50$) approaching or exceeding oracle performance. In contrast, smaller memory sizes ($10$ and $20$) show lower Dice scores, indicating that limited memory restricts the model’s ability to refine predictions effectively. The performance gap is most prominent in the early stages (fewer clicks), where larger memory sizes exhibit steeper improvements. However, as the number of clicks increases, the performance differences among larger memory sizes diminish, suggesting a saturation effect. These findings highlight the critical role of the size of the memory bank in achieving accurate and efficient segmentation. It is important to note that larger memory sizes require more computational resources due to increased attention computations, making the model more resource-intensive. Thus, a tradeoff between memory size and efficiency is necessary to balance performance and resource usage.
\begin{figure}[ht]
    \centering
    \includegraphics[width=\textwidth]{images/hr_merged_ablation_horizontal_clicks.png}  % Adjust width or height as needed
    \caption{Memory bank and prompt type analysis. \textbf{(a)} Impact of memory bank size on segmentation performance: Dice score ($\%$) is shown as a function of the number of clicks for different memory sizes. \textbf{(b)} Comparison of sparse vs.\ dense memory embeddings for different numbers of clicks and training images.}
    \label{fig:ablation_study}
\end{figure}

\textbf{Sparse vs.\ dense memory embeddings:}
The results are presented in Figure~\ref{fig:ablation_study}-b. The Sparse Memory approach consistently underperforms compared to Dense Memory, indicating that click embeddings alone are less effective for refining segmentations. The Sparse + Dense configuration achieves the highest Dice scores, especially at higher click counts (Figure~\ref{fig:ablation_study}-b4, $50$ clicks), demonstrating the advantage of combining both memory types. As the number of clicks increases, the performance gap widens, showing that more interactions help the model better utilize stored information. The baseline without memory attention performs worst, highlighting the importance of an effective memory mechanism.

\subsection{Segmentation Performance}
Table~\ref{tab:performance_results} summarizes the segmentation performance of the evaluated methods across the four datasets described in Section~\ref{sec:dataset} (see qualitative results in Appendices~\ref{sec:qualitative_ct} and~\ref{sec:qualitative_mr}). The memory bank configuration (Sparse + Dense) was selected based on its superior performance in the previous analysis.
\input{tables/results}
% Ours Vs Ft-SAM3D
MAIS was found to consistently outperform Ft-SAM3D fine-tuning, particularly as the number of interactions increases. The performance gap is most pronounced in low-data regimes ($10\%$ and $50\%$ of the data), where our model achieves significant improvements over Ft-SAM3D, demonstrating its effectiveness when training data is limited. Additionally, in the one-shot scenario, our method significantly outperforms Ft-SAM3D, except on the ACDC dataset.
% Number of interaction
Regarding the impact of interaction count, we observe that increasing user interactions generally improves segmentation performance across all interactive methods. Notably, the performance gap between MAIS and Ft-SAM3D widens as more interactions are allowed, with Ft-SAM3D showing diminishing gains after 10 interactions, whereas our method continues improving up to 50 interactions. This suggests that MAIS effectively leverages a large memory capacity to enhance refinement.

% Against the oracle
Comparing interactive methods against the Oracle nn-UNet, we find that with sufficient training data, the Oracle remains the best performer across all datasets. However, in several cases (e.g., HaN-SEG and AMOS-MR), MAIS outperforms or closely matches the Oracle’s performance, particularly when a higher number of interactions is allowed. Conversely, the Oracle struggles when trained on limited data, often underperforming compared to interactive methods.
% Against zero-shot SAM3D
Finally, both Ft-SAM3D and MAIS consistently outperform the zero-shot SAM3D baseline, demonstrating the effectiveness of fine-tuning foundation models for medical image segmentation.

\section{Conclusions}
In this work, we introduced Memory-Attention for Interactive Segmentation (MAIS) to address the limitations of traditional Vision Transformer (ViT)-based approaches in interactive segmentation. By incorporating temporal context through a memory bank that stores past user interactions and segmentation masks, our method significantly enhances segmentation performance across various medical imaging datasets, including MRI and CT scans. The lightweight architecture of the proposed attention module enables effective training even with limited data, achieving competitive performance against state-of-the-art task-specific models such as nn-UNet. Our experiments demonstrate that increasing memory capacity leads to more effective segmentation refinement as user interactions grow. These capabilities are crucial for developing interactive segmentation tools that enhance clinical workflows, allowing clinicians to improve labeling efficiency and accuracy in medical imaging applications. Future work will integrate the proposed attention mechanism with additional pretrained backbone architectures.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research was supported by Wellcome/EPSRC Centre for Medical Engineering
(WT203148/Z/16/Z, 213038/Z/18/Z), Research capability fund (PJ11439), The London AI centre for Value-based Healthcare and GE Healthcare.}
\bibliography{midl25_126}

\newpage
\appendix
\section{Implementation and Computational Details}\label{sec:apendix_implement}
Our training settings follow those used in \cite{wang2024sammed3dgeneralpurposesegmentationmodels}, with some modifications. We employ different learning rates for different parts of our model: the prompt encoders and mask decoder are optimized with an initial learning rate of $8\times10^{-5}$, while the memory attention parameters use an initial learning rate of $8\times10^{-4}$. Both learning rates follow a multi-step scheduler, decreasing by a factor of 0.1 at epochs 129 and 180. We use the Dice loss function and the AdamW optimizer, training our models for 200 epochs. Input images are cropped into $128\times128\times128$ 3D patches centered on a foreground voxel.
Training was conducted on an NVIDIA GeForce RTX 4090 (24\,GB).

\section{Model Complexity MAIS vs SAM2}
We compute the number of parameters for each component in the MAIS and SAM2 models to analyze their relative complexity. In MAIS, the memory module consists solely of the memory attention block, whereas SAM2 incorporates an additional memory encoder that processes both image embeddings and output masks. Additionally, we calculate the number of parameters being fine-tuned, which includes both the memory attention module and the mask decoder. Notably, the image encoder is excluded from fine-tuning, as it constitutes the majority of the total model parameters—90\% in MAIS and 94\% in SAM2—making its optimization computationally impractical.  The parameter breakdown in Table \ref{tab:my_label} highlights the efficiency of the proposed method in terms of model complexity. The memory attention module in our approach (MAIS) comprises only 2.84M parameters, which represents 29.79\% of the fine-tuned parameters and merely 2.77\% of the total model parameters. In contrast, SAM2’s memory module is significantly heavier, with 7.31M parameters, corresponding to 63.42\% of the fine-tuned parameters and 3.26\% of the total model. This difference underscores the efficiency of our method in maintaining a lightweight memory mechanism  while achieving comparable functionality. Additionally, our model does not require a memory encoder, further reducing computational complexity compared to SAM2.  

\begin{table}[!htb]
    \centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{lllllll}
\toprule
Component & \multicolumn{2}{l}{ $\#$ Parameters} & \multicolumn{2}{l}{\% of FT Parameters} & \multicolumn{2}{l}{\%  of total Parameters} \\
 &MAIS &     SAM2 &                   MAIS &     SAM2 &                                    MAIS &     SAM2 \\
\midrule    
    Memory Attention &      2.84M &    5.92M &                 29.79\% &   51.40\% &                                   2.77\% &    2.64\% \\
    Memory Encoder &          - &    1.38M &                      - &   12.02\% &                                       - &    0.62\% \\
    \textbf{Total Memory module} &      2.84M &    7.31M &                 29.79\% &   63.42\% &                                   2.77\% &    3.26\% \\
    \midrule
    Mask decoder &      6.69M &    4.22M &                 70.21\% &   36.58\% &                                   6.53\% &    1.88\% \\
    \textbf{Total parameters finetuned} &      9.53M &   11.52M &                - &  - &                                   9.30\% &    5.13\% \\ 
    \midrule
    Image encoder &     92.92M &  212.70M &                      - &        - &                                  90.69\% &   94.77\% \\
    \textbf{Total parameters} &    102.46M &  224.43M &                      - &        - &                                 - &  - \\
    
\bottomrule
\end{tabular}
}
    \caption{Breakdown of total parameters, finetuned parameter share, and overall model parameter distribution for SAM2 and MAIS across different components, highlighting the differences in memory and mask-decoder contributions.}
    \label{tab:my_label}
\end{table}

\section{MAIS training and inference cost}
To evaluate the computational impact of the proposed memory attention mechanism, we measured the GPU memory usage and run time of MAIS during training and inference on a single-image, single-label dataset while simulating 150 user interactions. We compared different memory configurations, including No Memory (equivalent to FT-SAM3D), Sparse, Dense, and a combination of Sparse + Dense memory banks.

The results presented in Table \ref{tab:training_inference_cost} demonstrate that the additional computational burden introduced by the proposed memory attention mechanism remains minimal, particularly during inference. During training, GPU memory consumption increases gradually with larger memory sizes but remains within a reasonable range. For instance, in the most demanding configuration (Sparse + Dense with 60 stored embeddings), GPU usage reaches 7164 MB, representing a 32.7\% increase over the No Memory baseline (5398 MB), while training time per image rises by approximately 62\% (from 14.65 min to 23.75 min). However, for more moderate configurations (e.g., Sparse 10 or Sparse + Dense 10), the increase is minimal, suggesting that efficient memory management can significantly reduce overhead.

Inference results indicate that the added memory mechanism has an even smaller impact on computational cost. The No Memory baseline requires 2756 MB and 27 seconds per image, while the heaviest memory configuration (Sparse + Dense 60) increases memory usage by only 16.5\% (3210 MB) and inference time by just 6 seconds (from 27s to 33s). This suggests that, unlike training, where memory size plays a more significant role, inference benefits from the efficiency of the proposed attention module, maintaining near-real-time performance.

In summary, the proposed memory mechanism introduces a modest increase in computational cost during training while remaining nearly imperceptible during inference. Moreover, the parameter analysis confirms that our attention module is significantly more efficient than SAM2, enabling improved memory integration without excessive resource demands. These findings demonstrate that our approach achieves a favorable balance between efficiency and performance, making it well-suited for real-world applications. 

\begin{table}[!htb]
    \centering
\resizebox{\textwidth}{!}{
\begin{tabular}{lccccc}
\toprule
  & & \multicolumn{2}{c}{Training} & \multicolumn{2}{c}{Inference} \\
 \cmidrule(lr){3-4} \cmidrule(lr){5-6}
Method conf & Memory Size & GPU Memory (MB) & Time (min) & GPU Memory (MB) & Time (s) \\
\midrule
No Memory      & 0  & 5398  & 14.65  & 2756  & 27  \\
Sparse         & 10 & 5578  & 14.82  & 2774  & 28  \\
Sparse         & 20 & 5940  & 14.98  & 2846  & 28  \\
Sparse         & 60 & 6260  & 16.08  & 3100  & 32  \\
Dense          & 10 & 5690  & 15.72  & 2864  & 28  \\
Dense          & 20 & 5954  & 17.80  & 2904  & 31  \\
Dense          & 60 & 6552  & 21.40  & 3208  & 32  \\
Sparse + Dense & 10 & 5930  & 16.00  & 2836  & 28  \\
Sparse + Dense & 20 & 6370  & 18.55  & 2906  & 29  \\
Sparse + Dense & 60 & 7164  & 23.75  & 3210  & 33  \\
\bottomrule
\end{tabular}
}
\caption{Comparison of GPU memory usage and computation time for training and inference under different memory configurations.}
\label{tab:training_inference_cost}
\end{table}

\section{Qualitative results-MR}\label{sec:qualitative_mr}
\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.95\textwidth]{images/grid_plot_amosmrval_liver_61_spring.png}  % Adjust width or height as needed
    % \todo{maybe to include this description in the text above}
    \caption{Qualitative result on the AMOS-MR dataset for \textbf{Liver} segmentation with varying numbers of user clicks. The first column shows the original MRI images, and the second column presents the ground-truth segmentations. The remaining columns compare segmentations produced by SAM3D, fine-tuned SAM3D (FT-SAM3D), and MAIS.}
    \label{fig:Qualitative-MR}
\end{figure}
\newpage
\section{Qualitative results-CT}\label{sec:qualitative_ct}
\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.95\textwidth]{images/grid_plot_hanseg_Esophagus_S_61.png}  % Adjust width or height as needed
    % \todo{maybe to include this description in the text above}
    \caption{Qualitative result on the HaN-SEG dataset (CT images) for \textbf{Esophagus} segmentation with different numbers of user clicks. The first column shows the original CT images, while the second column presents the ground-truth segmentations. The remaining columns compare segmentations produced by SAM3D, fine-tuned SAM3D (FT-SAM3D), and MAIS. It can be observed that SAM3D tends to oversegment the esophagus, while FT-SAM3D undersegments it. MAIS provides more balanced results as the number of clicks increases.}
    \label{fig:Qualitative-CT}
\end{figure}
\end{document}
