\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% NOTE: Fetching these two packages for check-marks in ablation
\usepackage{xcolor}
\usepackage{amssymb}

\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 324}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[Automatic Prompt Generation (APG)]{Revisiting foundation models for cell instance segmentation}

\midlauthor{
  \Name{Anwai Archit\nametag{$^{1}$}} \orcid{0009-0002-9533-8620} \Email{anwai.archit@uni-goettingen.de}\\
  \Name{Constantin Pape\nametag{$^{1,2,3}$}} \orcid{0000-0001-6562-7187} \Email{constantin.pape@informatik.uni-goettingen.de}\\
  \addr $^{1}$ Georg-August-University Göttingen, Institute of Computer Science\\
  \addr $^{2}$ CAIMed - Lower Saxony Center for AI \& Causal Methods in Medicine, Göttingen\\
  \addr $^{3}$ Cluster of Excellence Multiscale Bioimaging (MBExC), Georg-August-University Göttingen\\
}

\begin{document}

\maketitle

\begin{abstract}
Cell segmentation is a fundamental task in microscopy image analysis. Several foundation models for cell segmentation have been introduced; virtually all of them are extensions of the Segment Anything Model (SAM), improving it for microscopy data. Recently, SAM2 and SAM3 have been published, further improving and extending the capabilities of general-purpose segmentation foundation models. Here, we comprehensively evaluate foundation models for cell segmentation (CellPoseSAM, CellSAM, $\mu$SAM) and for general-purpose segmentation (SAM, SAM2, SAM3) on a diverse set of (light) microscopy datasets, for tasks including cell, nucleus and organoid segmentation. Furthermore, we introduce a new instance segmentation strategy called automatic prompt generation (APG) that can be used to further improve SAM-based microscopy foundation models. APG consistently improves segmentation results for $\mu$SAM, which is used as the base model, and is competitive with the state-of-the-art model CellPoseSAM.
Moreover, our work provides important lessons for adaptation strategies of SAM-style models to microscopy and provides a strategy for creating even more powerful microscopy foundation models.
\end{abstract}

\begin{keywords}
vision foundation models, segment anything, microscopy, instance segmentation, cell segmentation
\end{keywords}

\section{Introduction}

Instance segmentation is one of the most important image analysis tasks in microscopy, enabling phenotypic drug screening in high-content imaging \cite{chandrasekaran2024three}, analysis of embryogenesis at the cellular level \cite{lange2024multimodal}, and many other applications.
Virtually all current methods for microscopy instance segmentation are based on deep learning, such as dedicated tools addressing cell \cite{cellpose} and nucleus \cite{stardist} segmentation in light microscopy, nucleus segmentation in histopathology \cite{hovernet}, or segmentation of mitochondria \cite{mitonet} and other organelles \cite{synapsenet} in electron microscopy.
A repository \cite{bioimageio} collects trained models for such segmentation tasks, compatible with tools for model inference \cite{deepimagej, ilastik}.

These dedicated models can accelerate many analyses, yet the large number of tools and the potential lack of pretrained models for specific tasks put a burden on users without substantial computational expertise. Hence, foundation models have been introduced in this domain, enabling a wider range of tasks with a single model \cite{micro-sam, cellpose-sam, cell-sam, cell-vit, patho-sam}.
They are predominantly based on the Segment Anything Model (SAM) \cite{SAM}, a general-purpose segmentation foundation model. SAM itself has been extended to video data by SAM2 \cite{SAM2} and, recently, to text- and example-based segmentation by SAM3 \cite{SAM3}.
The latter also included microscopy data in its training set.

These developments open up the following questions:
(i) What is the best strategy for adapting a SAM-style model to microscopy?
(ii) Are specific (foundation) models for microscopy segmentation still needed or do general-purpose segmentation models, in particular SAM3, make them obsolete?
(iii) What influence does the training data (modalities, size, data diversity) have on model performance?

\begin{figure}[ht]
\floatconts
  {fig:overview}
  {\caption{a) Overview of our new instance segmentation method, automatic prompt generation (APG), which re-purposes the trained $\mu$SAM (or PathoSAM) models by deriving point prompts from decoder predictions, predicting masks based on these prompts, and then filtering overlapping masks via NMS. Note that the model is not retrained. APG replaces the prior instance segmentation logic. b) Overview of segmentation results for four different microscopy modalities. We report the averaged rank over the 9 datasets per domain in parentheses; the top three methods are colored. c) Example label-free cell segmentation with different methods. Only APG correctly segments the large central cell, highlighting its advantage for complex cell morphologies.}}
  {\includegraphics[width=\linewidth]{figures/fig_1.png}}
\end{figure}

To address these questions we:
(i) Extensively benchmark recent foundation models for general-purpose and microscopy segmentation.
(ii) Develop a new instance segmentation algorithm that operates on top of (fine-tuned) SAMs without the need for additional training.
Our results show substantial improvements thanks to our segmentation approach, competitive with the state-of-the-art. Further, they show that foundation models for microscopy have an edge over SAM3, and show a substantial influence of the training data on model performance. See Fig.~\ref{fig:overview} for an overview of our methodology and contributions. 

\section{Related Work} \label{rel-work}

Vision foundation models (VFMs) have emerged after the success of large language models (LLMs) as generally capable language processors \cite{gpt3}. VFMs can be divided into three categories: (i) vision and text models, such as CLIP \cite{clip} and SigLIP \cite{siglip} that are trained via contrastive learning on image-text-pairs. They are a key ingredient of multi-modal LLMs.
(ii) general-purpose vision encoders such as Dino V2 \cite{dino-v2} and V3 \cite{dino-v3} that are trained via self-supervised learning, enabling diverse downstream tasks by fine-tuning small decoders.
And (iii) foundation models for segmentation. Among these, SAM \cite{SAM} is the most popular. It supports interactive segmentation based on point or box prompts, and also supports automatic segmentation. SAM2 \cite{SAM2} extends its capabilities to video segmentation. SAM3 \cite{SAM3} introduces ``concept-based'' segmentation, enabling prompting with short text descriptions or example images with object annotations.
Other examples of segmentation foundation models include SegGPT \cite{seg-gpt}, which supports example-based learning, LISA \cite{LISA}, which extends an LLM with segmentation capabilities, and SEEM \cite{SEEM}, which offers a wide range of prompting options.

Many segmentation foundation models for biomedical images have emerged. They are predominantly based on SAM(2).
Their exact adaptation strategies vary and can be divided into three categories and combinations thereof:
\begin{enumerate}
    \item By automatically deriving prompts (points or bounding boxes) for objects in the image and then using SAM to predict the corresponding masks \cite{un-sam}. 
    \item By using SAM's encoder as backbone of a model for automatic segmentation that is trained on domain-specific data \cite{vista-3d}.
    \item By fine-tuning SAM's architecture for promptable segmentation on domain-specific data \cite{MedSAM, MedicoSAM, SAMed2d}.
\end{enumerate}

Several segmentation foundation models have been proposed for medical imaging, e.g.\ CT, MRI, or X-ray (see citations in the previous paragraph). Instance segmentation in microscopy is another important application. The most popular models are CellSAM \cite{cell-sam} for light microscopy segmentation, $\mu$SAM \cite{micro-sam} for light and electron microscopy segmentation, its extension PathoSAM \cite{patho-sam} for histopathology, CellPoseSAM \cite{cellpose-sam} for light microscopy and histopathology, and CellViT \cite{cell-vit} for histopathology. See also Sec.~\ref{methods:overview}.

\section{Methods} \label{sec:methods}

We first review foundation models for microscopy segmentation (Sec.~\ref{methods:overview}), then explain our new instance segmentation method (Sec.~\ref{methods:apg}), and the evaluation methodology (Sec.~\ref{sec:metrics}).

\subsection{Instance segmentation with SAM-based models} \label{methods:overview}

The models of the SAM family all support automatic instance segmentation. SAM \cite{SAM} and SAM2 \cite{SAM2} are trained with an objective for prompt-based segmentation.
They support automatic segmentation by covering the input image with a grid of point prompts, segmenting each prompt, and then merging the predicted masks via non-maximum suppression (NMS).
This mode is called automatic mask generation (AMG).
In contrast, SAM3 \cite{SAM3} predicts instances directly with a DETR-style approach \cite{DETR}.
All three models were primarily trained on natural images with segmentation annotations, 11 million images with 1 billion annotations for SAM, an additional 50k videos with 642k object track annotations for SAM2, and an additional 5 million images and 50k videos for SAM3. 

SAM has been studied extensively for microscopy data (see e.g. \cite{micro-sam}, Sec.~\ref{sec:res}). It yields good segmentation results for easy tasks (e.g. well separated nuclei) via AMG, but fails for more difficult tasks.
AMG with SAM2 performs worse than AMG with SAM for microscopy (see Sec.~\ref{sec:res}). SAM3 has been published very recently. To our knowledge, we are the first to evaluate it for microscopy.

While SAM itself fails at difficult microscopy segmentation tasks, the state-of-the-art models for microscopy are built on top of it, following one of the strategies 1.-3. outlined in Sec.~\ref{rel-work}.
The simplest strategy (2) is to train a new decoder on top of SAM's image encoder (initialized with its pretrained weights) that outputs an intermediate prediction, followed by (non-differentiable) post-processing to obtain instances. This approach is implemented by CellPoseSAM \cite{cellpose-sam}, which chose the CellPose instance segmentation method \cite{cellpose} and was trained on 22,826 light microscopy and histopathology images with 3.34 million annotated cells and nuclei, and CellViT \cite{cell-vit}, which chose the HoverNet semantic instance segmentation method \cite{hovernet} and was trained on the PanNuke \cite{pannuke} dataset, consisting of 200,000 annotated nuclei.

CellSAM \cite{cell-sam} takes a more complex approach: it trains a bounding box detection decoder (CellFinder) on top of SAM's image encoder and then uses its predictions as box prompts for SAM's prompt encoder to segment instance masks. The mask decoder is also finetuned, i.e., corresponding to a combination of strategies 1 and 2. CellSAM was trained on ten datasets with annotated cells and nuclei.

$\mu$SAM \cite{micro-sam} finetunes the entire SAM architecture for promptable segmentation while adding a decoder for instance segmentation that predicts foreground probabilities as well as normalized distances to object centers and boundaries. These predictions serve as input to a watershed for instance segmentation. The procedure is called automatic instance segmentation (AIS).
$\mu$SAM was trained on ca. 17,000 light microscopy images with over 2 million annotated cells and nuclei; a different version of the model for electron microscopy also exists. PathoSAM \cite{patho-sam} replicates this effort for histopathology. It was trained on ca. 5,000 images with over 400,000 annotated nuclei. This approach corresponds to a combination of strategies 2 and 3.

\subsection{Automatic prompt generation} \label{methods:apg}

We observe that none of the SAM-based microscopy foundation models combine all three adaptation strategies, i.e. none of them combine automatically derived prompts (1) with a custom decoder (2), and finetuning for promptable segmentation (3).
While CellSAM comes close to this combination, it does not finetune for promptable segmentation and thus relies heavily on the box predictions, which are translated one-to-one to masks. Hence, if an object is not correctly detected, it cannot be recovered by SAM's promptable segmentation. This becomes more likely under a domain shift, leading to diminished generalization, which could be avoided with a more flexible prompting strategy.
On the other hand, segmentation via a dedicated decoder as implemented by CellPoseSAM and $\mu$SAM foregoes potential improvements due to prompt-based segmentation. Empirically, we observed that segmentation based on prompts derived from annotated objects performs significantly better than fully automated segmentation \cite{micro-sam}. Hence, a suitable automatic prompting strategy should be able to improve upon automatic instance segmentation without prompting, exemplified for a cell with complex morphology in Fig.~\ref{fig:overview}. 

To overcome the limitations discussed in the previous paragraph, we implement a new instance segmentation method called automatic prompt generation (APG).
It operates on top of the $\mu$SAM model, which was \textbf{not retrained} by us. APG uses the predictions of $\mu$SAM's segmentation decoder to derive point prompts, uses the prompt encoder and mask decoder to predict masks based on these prompts, and merges the predicted masks via NMS to obtain an instance segmentation. The procedure is illustrated in Fig.~\ref{fig:overview} a).
Note that this approach does not require us to derive exactly one prompt per object (as in CellSAM) since multiple predicted masks per object can be filtered by NMS.
In detail, APG works as follows:
\begin{enumerate}
    \item Apply the image encoder and segmentation decoder to predict foreground probabilities $fg$ and normalized boundary as well as center distances, $d_b$ and $d_c$.
    \item Apply the thresholds $t_{fg}$, $t_{b}$ and $t_{c}$ to $fg$, $d_b$, and $d_c$, respectively, to obtain binary masks. 
    \item Apply connected components to the intersection of the three binary masks from 2.
    \item Derive a point prompt for each component by computing the maximum of the boundary distance transform per component.
    \item Apply prompt encoder and mask decoder to these prompts to obtain mask and IoU predictions. The latter give a quality estimate for each predicted mask.
    \item Apply a size filter $s$ to the masks.
    \item Compute the pairwise overlap of predicted masks and apply NMS with threshold $t_{nms}$ based on predicted IoUs to filter overlapping masks.
\end{enumerate}
The parameters of APG are $t_{fg}$, $t_b$, $t_c$, $s$, and $t_{nms}$. Their default values are: $t_{fg} = 0.5$, $t_b = 0.5$, $t_c = 0.5$, $s = 25$, and $t_{nms} = 0.9$. We run all experiments with these default values.

Note that steps 2 and 3 are the same as in the AIS method of $\mu$SAM, which then uses the components as seeds for a watershed. However, in AIS $t_b$ and $t_c$ have to be determined such that each object is covered with a single component, leading to a trade-off between over- and under-segmentation. In APG, we can choose these values so that multiple prompts are derived for one object, then filtered in step 7 by NMS.
APG can be applied to the $\mu$SAM (and PathoSAM) model as is without retraining. APG is implemented as part of the $\mu$SAM code base at \url{https://github.com/computational-cell-analytics/micro-sam}. The use of APG is documented at \url{https://computational-cell-analytics.github.io/micro-sam/micro_sam.html\#apg}.

\subsection{Datasets \& metrics} \label{sec:metrics}

We evaluate APG and other foundation models on 36 datasets from four different tasks and domains:
nucleus segmentation in fluorescence microscopy \cite{dynamicnuclearnet, u20s, arvidsson, ifnuclei, gonuclear, nis3d, paryhale_regen, dsb, bitseg}, cell segmentation in fluorescence microscopy \cite{cellpose, cellbindb, tissuenet, plantseg, covid-if, hpa, pnas_arabidopsis, mouse-embryo}, cell segmentation in label-free microscopy \cite{livecell, omnipose, deepbacs, usiigaci, vicar, deepsea, toiam, segpc, yeaz}, and nucleus segmentation in histopathology \cite{ihc-tma, lynsec, monuseg, pannuke, tnbc, nuinsseg, puma, cytodark0, cryonuseg}. We use the test splits for all of these datasets. For volumetric datasets we run and evaluate the segmentation in 2D and sub-sample the test sets for efficiency reasons. Note that some of the methods we evaluate were trained on some of these datasets (though only on the train splits, not on the test splits). See Fig.~\ref{fig:results} for details.
A detailed overview of all datasets can be found in Tab.~\ref{tab:datasets}.

We use the mean segmentation accuracy, following \cite{dsb}, to evaluate instance segmentation results.
The mean segmentation accuracy (mSA) is based on true positives ($TP$), false negatives ($FN$), and false positives ($FP$), which are derived from the intersection over union (IoU) of predicted and true objects.
Specifically, $TP(t)$ is defined as the number of matches between predicted and true objects with an IoU above the threshold $t$, $FP(t)$ corresponds to the number of predicted objects minus $TP(t)$, and $FN(t)$ to the number of true objects minus $TP(t)$.
The mean segmentation accuracy is computed over multiple thresholds:
\begin{equation*}
    \text{Mean Segmentation Accuracy} = \frac{1}{\text{\# thresholds}} \sum_{t} \frac{TP(t)}{TP(t) + FP(t) + FN(t)}\,.
\end{equation*}
Here, we use thresholds $t \in \{0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95\}$. For each dataset, we report the average mean segmentation accuracy over images in the test set. This metric is commonly used to evaluate instance segmentation in microscopy, see \cite{msa} for an in-depth discussion.

\section{Results} \label{sec:res}

We evaluate SAM (w/ AMG), SAM3, $\mu$SAM (w/ AIS, APG), PathoSAM (w/ AIS, APG), CellPoseSAM, and CellSAM (see Sec.~\ref{sec:methods}) on the data described in Sec.~\ref{sec:metrics}. We also evaluate CellPose3 \cite{cellpose3}, which uses a convolutional architecture and a smaller training set but is otherwise similar to CellPoseSAM.
PathoSAM is only evaluated on histopathology data and $\mu$SAM is not evaluated in this domain to account for the specific focus of these models.
Note that we do not evaluate SAM2 (w/ AMG) as we found it to be inferior to SAM in this setting. It did not segment any objects for several datasets in initial experiments we ran. SAM3 is prompted with the single short noun phrase ``cell'' for all images. See Sec.~\ref{sec:res-sam3} for a detailed analysis on the choice of text prompt.

\begin{figure}[h]
\floatconts
  {fig:results}
  {\caption{Results for 36 microscopy segmentation datasets in four different modalities: cells (fluorescence, a), cells (label-free, b), nuclei (fluorescence, c), and nuclei (histopathology, d). We indicate the top-3 ranked methods with blue colors and methods that were trained on the corresponding training split with textured bars. For our method (APG) we indicate the absolute performance difference with respect to the reference method (AIS).}}
  {\includegraphics[width=\linewidth]{figures/fig_2.png}}
\end{figure}

The result summary is shown in Fig.~\ref{fig:overview}, the results for all datasets in Fig.~\ref{fig:results}, reported separately across the four imaging modalities. We highlight the three top performing methods and indicate whether the model was trained on the respective data's train split.

APG improves the $\mu$SAM model compared to AIS for all label-free microscopy cases, including a very substantial improvement for TOIAM. It improves 3 / 9 datasets for cell segmentation in fluorescence microscopy, with only modest differences in segmentation quality, improves 7 / 9 datasets for nucleus segmentation in fluorescence, and 7 / 9 for nucleus segmentation in histopathology.
Overall, we find that CellPoseSAM and APG are consistently among the best approaches, ranking among the top three for all four modalities.
AIS is the third best model overall, followed by CellPose3.
CellSAM performs worse than the other microscopy foundation models, but consistently better than SAM and SAM3. Tables with all numerical results can be found in App.~\ref{app:res-quantitative}. Qualitative results for selected methods and one dataset per domain are shown in Fig.~\ref{fig:res-qualitative} and for all datasets in Figs.~\ref{fig:app-quali-labelfree}--\ref{fig:app-quali-histo}. We further evaluate the statistical significance of differences between the models in App.~\ref{app:statistical-results}.

\begin{figure}[h]
\floatconts
  {fig:res-qualitative}
  {\caption{Qualitative segmentation results for all microscopy foundation models. We show examples for one dataset per domain / task: cell segmentation in label-free microscopy, cell segmentation in fluorescence microscopy, nucleus segmentation in fluorescence microscopy, and nucleus segmentation in histopathology (top to bottom). Examples for all datasets can be found in Figs.~\ref{fig:app-quali-labelfree}--\ref{fig:app-quali-histo}.}}
  {\includegraphics[width=\linewidth]{figures/fig_4.png}}
\end{figure}

\subsection{Comparison of APG strategies}

We also compare an alternative strategy for deriving prompts in APG, using the foreground-restricted maxima of the boundary distance predictions. The corresponding results are shown in Fig.~\ref{fig:res-ablation}. 
The strategy using connected components, as described in Sec.~\ref{methods:apg}, is superior.

\begin{figure}[h]
\floatconts
  {fig:res-ablation}
  {\caption{Comparison of our connected component-based strategy APG (APG (Components)) with an alternative deriving prompts from distance map maxima (APG (Boundary)) for the four different modalities (a-d), reporting the relative mean segmentation accuracy difference w.r.t.\ AIS.}}
  {\includegraphics[width=\linewidth]{figures/fig_3.png}}
\end{figure}

\FloatBarrier
\subsection{Prompting strategies for SAM3} \label{sec:res-sam3}

The results for SAM3 in Fig.~\ref{fig:results} were obtained by prompting the model with the phrase ``cell''.
We perform an additional experiment to determine the sensitivity of SAM3 to the text prompt on four datasets (one per modality).
We evaluate the segmentation quality with prompts corresponding to the correct biological term (``cell'', ``nucleus''), single nouns describing the object's shapes (``blob'', ``dot''), and combinations of adjectives and nouns describing the shapes. We evaluate both singular and plural for all cases.
The results are shown in Tab.~\ref{tab:sam3-ablation}. They show that SAM3 lacks the knowledge of biological terms (``nucleus'' is not recognized) and that it is overall fairly sensitive to the choice of prompts. For example, prompts describing the shape perform substantially better than the correct biological term ``cell'' in multiple cases.
We did not yet prompt SAM3 with an example image and representative segmented objects. 

\begin{table}[h]
\centering
\resizebox{\columnwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|} \hline
\textbf{Dataset} & \textit{LIVECell} & \textbf{CellPose} & \textbf{DSB} & \textit{PanNuke} \\ \hline
\textbf{Modality} & Label Free & Fluorescence & Fluorescence & Histopathology \\ \hline
\textbf{Task} & Cell & Cell & Nucleus & Nucleus \\ \hline
\noalign{\vskip 3pt}
\cline{1-1}
\multicolumn{1}{|c|}{\textbf{Text Prompts}} \\ \hline

% Biological NPs

\textit{cell(s)} &
\textcolor{green}{\checkmark} (\textbf{0.331}) $\mid$ \textcolor{green}{\checkmark} (0.311) & 
\textcolor{green}{\checkmark} (0.299) $\mid$ \textcolor{green}{\checkmark} (0.231) &
\textcolor{green}{\checkmark} (0.366) $\mid$ \textcolor{green}{\checkmark} (0.386) &
\textcolor{green}{\checkmark} (0.341) $\mid$ \textcolor{green}{\checkmark} (0.159)
\\ \hline

\textit{nucleus (nuclei)} &
$-\mid-$ &
$-\mid-$ &
$\textcolor{red}{\times}\mid$ \textcolor{green}{\checkmark} (0.085) &
$\textcolor{red}{\times}\mid\textcolor{red}{\times}$
\\ \hline

% Shorter NPs

\textit{blob(s)} &
$\textcolor{red}{\times}\mid\textcolor{red}{\times}$ &
$\textcolor{red}{\times}\mid$ \textcolor{green}{\checkmark} (0.175) &
\textcolor{green}{\checkmark} (0.103) $\mid$ \textcolor{green}{\checkmark} (0.416) &
$\textcolor{red}{\times}\mid$ \textcolor{green}{\checkmark} (0.179)
\\ \hline

\textit{dot(s)} &
$\textcolor{red}{\times}\mid$ \textcolor{green}{\checkmark} (0.014) &
\textcolor{green}{\checkmark} (0.015) $\mid$ \textcolor{green}{\checkmark} (0.096) &
\textcolor{green}{\checkmark} (0.047) $\mid$ \textcolor{green}{\checkmark} (0.387) &
\textcolor{green}{\checkmark} (0.007) $\mid$ \textcolor{green}{\checkmark} (0.021)
\\ \hline

% Longer NPs

\textit{bright spot(s)} &
$\textcolor{red}{\times}\mid\textcolor{red}{\times}$ &
$\textcolor{red}{\times}\mid$ \textcolor{green}{\checkmark} (0.027) &
\textcolor{green}{\checkmark} (0.465) $\mid$ \textcolor{green}{\checkmark} (\textbf{0.509}) &
$\textcolor{red}{\times}\mid\textcolor{red}{\times}$
\\ \hline

\textit{irregular shape(s)} &
\textcolor{green}{\checkmark} (0.282) $\mid$ \textcolor{green}{\checkmark} (0.119)&
\textcolor{green}{\checkmark} (\textbf{0.301}) $\mid$ \textcolor{green}{\checkmark} (0.247) &
\textcolor{green}{\checkmark} (0.489) $\mid$ \textcolor{green}{\checkmark} (0.427) &
\textcolor{green}{\checkmark} (\textbf{0.379}) $\mid$ \textcolor{green}{\checkmark} (0.319)
\\ \hline

\textit{large particle(s)} &
$\textcolor{red}{\times}\mid\textcolor{red}{\times}$ &
\textcolor{green}{\checkmark} (0.005) $\mid$ \textcolor{green}{\checkmark} (0.002) &
\textcolor{green}{\checkmark} (0.199) $\mid$ \textcolor{green}{\checkmark} (0.076) &
$\textcolor{red}{\times}\mid\textcolor{red}{\times}$
\\ \hline

\end{tabular}%
}
\caption{{SAM3 results for text prompt-based microscopy segmentation. Datasets marked in italic font are part of SAM3's training data, datasets marked in bold font are not. The results for best performing text prompts per dataset are marked in \textbf{bold}. All reported scores are mean segmentation accuracy for the entire dataset. Prompts resulting in a score $\leq 0.001$ are marked as ``\textcolor{red}{$\times$}''. Prompts that are not applicable to the given segmentation task are marked as ``$-$''.}}
\label{tab:sam3-ablation}
\end{table}

\FloatBarrier
\section{Discussion}

We introduced APG, a new method for instance segmentation with SAM-based foundation models. It consistently and substantially improved the segmentation results of $\mu$SAM and PathoSAM compared to the prior AIS approach and is competitive with the state-of-the-art CellPoseSAM.
APG was applied without the need for re-training. It could also be applied to other SAM-based models, e.g. CellPoseSAM. However, in this case it would either rely on another model for prompt-based segmentation or joint training of CellPoseSAM with the objective for interactive segmentation.

Furthermore, we evaluated SAM3, the latest model of the SAM family, for microscopy. It performed well---though not yet competitively with domain-specific foundation models---despite being trained on only two relevant datasets \cite{livecell, pannuke}. Moreover, it showed a sensitivity to the choice of text prompt.
Finetuning SAM3 on microscopy data promises to yield further improvements in this domain.
We did not yet test prompting SAM3 with examples of annotated images, which could lead to substantial improvements without further training.

Overall, we saw a clear impact of model training data on performance, with models typically performing similarly when trained on the respective data's training split, and with more pronounced differences for ``out-of-domain'' datasets. Yet, we found that APG can lead to substantial improvements for both in-domain (e.g. OmniPose, DeepBacs) and out-of-domain (e.g. TOIAM) cases.
Overall, the performance of models seems to correlate with the size of their domain-specific training data, with CellPoseSAM and $\mu$SAM (w/ AIS \& APG) having the largest training sets and overall best performances.
This observation also indicates that further finetuning to improve for specific applications remains relevant, as already shown in prior work \cite{peft-sam, cellseg1} that demonstrated substantial improvements through training on small datasets -- even a single image. These approaches would also directly translate to specifically improving APG.

Furthermore, APG could likely be improved by incorporating box prompts, which generally perform better than point prompts \cite{micro-sam}. We performed initial experiments to derive candidate box prompts from the $\mu$SAM decoder predictions. However, we found that deriving a set of high-quality prompts that over-sample objects was challenging and did not yet find a strategy competitive with the simpler point prompt derivation. Future work, such as iterative prompt derivation, may enable such improvements.

Finally, a limitation of our study is the restriction to 2D evaluation, also for 3D datasets. CellPoseSAM, $\mu$SAM, and SAM3 support 3D segmentation, so this evaluation would provide a further valuable comparison and would further inform how to improve microscopy foundation models.
We plan to address this problem in future work, potentially including adaptations of SAM2 and/or SAM3 to microscopy.

% \clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{
The work of Anwai Archit was funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) - PA 4341/2-1. Constantin Pape is supported by the German Research Foundation (Deutsche Forschungsgemeinschaft, DFG) under Germany’s Excellence Strategy - EXC 2067/1-390729940. This work is supported by the Ministry of Science and Culture of Lower Saxony through funds from the program zukunft.niedersachsen of the Volkswagen Foundation for the ’CAIMed – Lower Saxony Center for Artificial Intelligence and Causal Methods in Medicine’ project (grant no. ZN4257). It was also supported by the Google Research Scholarship “Vision Foundation Models for Bioimage Segmentation”. We gratefully acknowledge the computing time granted by the Resource Allocation Board and provided on the supercomputer Emmy at NHR@G\"{o}ttingen as part of the NHR infrastructure, under the project nim00007. We would like to thank Sebastian von Haaren for suggestions on data visualizations, Carolin Teuber and Titus Griebel for data processing scripts, and Julia Jeremias for post-processing scripts.}

\bibliography{midl26_324}

\newpage

\appendix

\section{Dataset Details}

\begin{table}[h]
\centering
\footnotesize
\begin{tabular}{|l|p{5cm}|l|}
\hline
\textbf{Dataset} & \textbf{Imaging Modality} & \textbf{Input Dimensions} \\   
\hline

% Label-free datasets
LIVECell \cite{livecell} & Phase Contrast & 2D \\
\hline
OmniPose \cite{omnipose} & Phase Contrast \& Brightfield  & 2D \\  
\hline
DeepBacs \cite{deepbacs} & Brightfield \& Fluorescence & 2D\\  
\hline
Usiigaci \cite{usiigaci} & Phase Contrast & 2D \\  
\hline
Vicar \cite{vicar} & Quantitative Phase & 2D \\  
\hline
TOIAM \cite{toiam} & Phase Contrast & 2D+T\\  
\hline
DeepSea \cite{deepsea} & Phase Contrast & 2D \\  
\hline
YeaZ \cite{yeaz} & Brightfield \& Phase Contrast & 2D \& 2D+T \\  
\hline
SegPC \cite{segpc} & Brightfield & 2D \\  
\hline

% Fluorescence cell segmentation datasets
TissueNet \cite{tissuenet} & Fluorescence & 2D \\  
\hline
CellPose \cite{cellpose} & Fluorescence & 2D \\  
\hline
PlantSeg (Root) \cite{plantseg} & Light-Sheet Fluorescence & 3D \\  
\hline
PlantSeg (Ovules) \cite{plantseg} & Confocal & 3D \\  
\hline
PNAS Arabidopsis \cite{pnas_arabidopsis} & Confocal & 3D \\  
\hline
Covid-IF \cite{covid-if} & Immunofluorescence & 2D \\  
\hline
HPA \cite{hpa} & Confocal & 2D \\  
\hline
CellBinDB \cite{cellbindb} & Multiple & 2D \\  
\hline
Mouse Embryo \cite{mouse-embryo} & Confocal & 3D \\  
\hline

% Fluorescence nucleus segmentation datasets
DSB \cite{dsb} & Fluorescence & 2D \\  
\hline
U20S \cite{u20s} & Fluorescence & 2D \\  
\hline
Arvidsson \cite{arvidsson} & High-Content Fluorescence & 2D \\  
\hline
BitDepth NucSeg \cite{bitseg} & Fluorescence & 2D \\  
\hline
IFNuclei \cite{ifnuclei} & (Immuno)Fluorescence & 2D \\  
\hline
DynamicNuclearNet \cite{dynamicnuclearnet} & Fluorescence & 2D+T \\  
\hline
GoNuclear \cite{gonuclear} & Fluorescence & 3D \\  
\hline
NIS3D \cite{nis3d} & Light-Sheet Fluorescence & 3D \\  
\hline
Parhyale Regen \cite{paryhale_regen} & Confocal & 3D \\  
\hline

% Histopathology datasets
PanNuke \cite{pannuke} & \textit{H\&E staining} & 2D \\  
\hline
IHC TMA \cite{ihc-tma} & \textit{IHC staining} & 2D \\  
\hline
LynSec \cite{lynsec} & \textit{IHC staining} & 2D \\  
\hline
MoNuSeg \cite{monuseg} & \textit{H\&E staining} & 2D \\  
\hline
NuInsSeg \cite{nuinsseg} & \textit{H\&E staining} & 2D \\  
\hline
PUMA \cite{puma} & \textit{H\&E staining} & 2D \\  
\hline
TNBC \cite{tnbc} & \textit{H\&E staining} & 2D \\  
\hline
CryoNuSeg \cite{cryonuseg} & \textit{(Cryo-Sectioned) H\&E staining} & 2D \\ 
\hline
CytoDark0 \cite{cytodark0} & \textit{Nissl staining} & 2D \\  
\hline

\end{tabular}
\caption{Description of the different datasets used in our study. For the 3D and 2D+T datasets, we evaluate over individual slices and frames, respectively. For the histopathology datasets, the imaging modality column describes the staining protocol (in italics) for the images.}
\label{tab:datasets}
\end{table}

\FloatBarrier
\section{Extended Qualitative Results} \label{app:quali-plots}

\begin{figure}[h]
\floatconts
  {fig:app-quali-labelfree}
  {\caption{Qualitative results for all label-free microscopy datasets for cell instance segmentation.}}
  {\includegraphics[width=0.8\linewidth]{figures/supp_fig_quali_labelfree.png}}
\end{figure}

\begin{figure}[h]
\floatconts
  {fig:app-quali-fluocells}
  {\caption{Qualitative results for all fluorescence microscopy datasets for cell instance segmentation.}}
  {\includegraphics[width=0.8\linewidth]{figures/supp_fig_quali_fluocells.png}}
\end{figure}

\begin{figure}[h]
\floatconts
  {fig:app-quali-fluonuc}
  {\caption{Qualitative results for all fluorescence microscopy datasets for nucleus instance segmentation.}}
  {\includegraphics[width=0.8\linewidth]{figures/supp_fig_quali_fluonuc.png}}
\end{figure}

\begin{figure}[h]
\floatconts
  {fig:app-quali-histo}
  {\caption{Qualitative results for all histopathology datasets for nucleus instance segmentation.}}
  {\includegraphics[width=0.8\linewidth]{figures/supp_fig_quali_histo.png}}
\end{figure}

\FloatBarrier
\section{Quantitative Results} \label{app:res-quantitative}

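This section reports the detailed quantitative results: the mean over all datasets of each domain (Table~\ref{tab:res-quantitative-average}) and the per-dataset results for each domain (Tables~\ref{tab:res-quantitative-labelfree}--\ref{tab:res-quantitative-histopatho}).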

\begin{table}[h]
\centering
\begin{tabular}{|l|c|c|c|c|}
\hline
\textbf{Method} &
\textbf{\shortstack{Label-Free\\(Cell)}} &
\textbf{\shortstack{Fluorescence\\(Cell)}} &
\textbf{\shortstack{Fluorescence\\(Nucleus)}} &
\textbf{\shortstack{Histopathology\\(Nucleus)}} \\
\hline
AMG (SAM) & 0.106 & 0.091 & 0.283 & 0.200 \\ \hline
AIS & \textit{0.480} & \underline{0.347} & \underline{0.513} & \textit{0.390} \\ \hline
SAM3 & 0.269 & 0.143 & 0.255 & 0.331 \\ \hline
CellPose 3 & 0.424 & 0.218 & 0.438 & 0.155 \\ \hline
CellPoseSAM & \textbf{0.544} & \textbf{0.363} & \textit{0.483} & \textbf{0.418} \\ \hline
CellSAM & 0.380 & 0.272 & 0.386 & 0.277 \\ \hline
\textbf{APG} & \underline{0.541} & \textit{0.344} & \textbf{0.518} & \underline{0.398} \\ \hline
\end{tabular}
\caption{Mean segmentation accuracy (mSA) averaged over all datasets for each modality. The best / second / third ranking methods are shown in \textbf{bold} / \underline{underline} / \textit{italics}. For histopathology, AIS / APG use PathoSAM as the base model; for all other modalities, they use $\mu$SAM.}
\label{tab:res-quantitative-average}
\end{table}

\begin{table}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|p{2cm}|c|c|c|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{\shortstack{AMG\\(SAM)}} & \textbf{\shortstack{AIS\\($\mu$SAM)}} & \textbf{SAM3} & \textbf{CellPose 3} & \textbf{CellPoseSAM} & \textbf{CellSAM} & \textbf{\shortstack{APG\\($\mu$SAM)}} \\
\hline
LIVECell & 0.075 & \textit{0.415} & 0.331 & 0.414 & \textbf{0.444} & 0.098 & \underline{0.437} \\ \hline
OmniPose & 0.137 & \textit{0.599} & 0.356 & 0.468 & \underline{0.644} & 0.531 & \textbf{0.651} \\ \hline
DeepBacs & 0.057 & \textit{0.497} & 0.157 & 0.455 & \textbf{0.612} & 0.441 & \underline{0.586} \\ \hline
Usiigaci & 0.051 & 0.353 & \textit{0.362} & 0.291 & \textbf{0.445} & 0.167 & \underline{0.383} \\ \hline
Vicar & 0.115 & 0.411 & 0.086 & 0.338 & \textbf{0.458} & \underline{0.426} & \textit{0.417} \\ \hline
TOIAM & 0.009 & 0.387 & 0.027 & \underline{0.837} & \textbf{0.898} & 0.631 & \textit{0.701} \\ \hline
DeepSeas & 0.098 & \textit{0.287} & 0.277 & 0.191 & \textbf{0.345} & 0.203 & \underline{0.293} \\ \hline
YeaZ & 0.382 & 0.841 & 0.723 & 0.817 & \textbf{0.873} & \underline{0.853} & \textit{0.849} \\ \hline
SegPC & 0.027 & \underline{0.529} & 0.106 & 0.001 & \textit{0.178} & 0.069 & \textbf{0.554} \\ \hline
\end{tabular}%
}
\caption{Quantitative results on label-free microscopy datasets for cell segmentation: mean segmentation accuracy (mSA) per dataset and method. For each dataset, the best / second / third ranking methods are shown in \textbf{bold} / \underline{underline} / \textit{italics}.}
\label{tab:res-quantitative-labelfree}
\end{table}

\begin{table}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|p{2cm}|c|c|c|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{\shortstack{AMG\\(SAM)}} & \textbf{\shortstack{AIS\\($\mu$SAM)}} & \textbf{SAM3} & \textbf{CellPose 3} & \textbf{CellPoseSAM} & \textbf{CellSAM} & \textbf{\shortstack{APG\\($\mu$SAM)}} \\
\hline
TissueNet & 0.069 & \textit{0.329} & 0.121 & 0.154 & \textbf{0.475} & \underline{0.345} & 0.324 \\ \hline
CellPose & 0.147 & 0.383 & 0.299 & \underline{0.431} & \textbf{0.566} & 0.413 & \textit{0.416} \\ \hline
PlantSeg (Root) & 0.091 & \textbf{0.507} & 0.067 & 0.076 & \textit{0.161} & 0.096 & \underline{0.489} \\ \hline
PlantSeg (Ovules) & 0.135 & \textbf{0.341} & 0.184 & 0.266 & \textit{0.331} & \underline{0.333} & 0.325 \\ \hline
PNAS \newline Arabidopsis & 0.145 & \textit{0.461} & 0.241 & 0.411 & \textbf{0.471} & 0.459 & \underline{0.462} \\ \hline
Covid-IF & 0.007 & \underline{0.317} & 0.005 & 0.161 & \textbf{0.333} & 0.154 & \textit{0.296} \\ \hline
HPA & 0.043 & \underline{0.301} & 0.155 & 0.078 & \textbf{0.431} & \underline{0.301} & \textit{0.298} \\ \hline
CellBinDB & 0.177 & \textit{0.316} & 0.137 & 0.279 & \textbf{0.342} & 0.264 & \underline{0.321} \\ \hline
Mouse \newline Embryo & 0.003 & \textbf{0.164} & 0.081 & 0.109 & \textit{0.155} & 0.083 & \underline{0.161} \\ \hline
\end{tabular}%
}
\caption{Quantitative results on fluorescence microscopy datasets for cell segmentation: mean segmentation accuracy per dataset and method. For each dataset, the best / second / third ranking methods are shown in \textbf{bold} / \underline{underline} / \textit{italics}.}
\label{tab:res-quantitative-fluocells}
\end{table}

\begin{table}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|p{2cm}|c|c|c|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{\shortstack{AMG\\(SAM)}} & \textbf{\shortstack{AIS\\($\mu$SAM)}} & \textbf{SAM3} & \textbf{CellPose 3} & \textbf{CellPoseSAM} & \textbf{CellSAM} & \textbf{\shortstack{APG\\($\mu$SAM)}} \\
\hline
DSB & 0.331 & \textit{0.654} & 0.367 & 0.484 & \underline{0.656} & 0.634 & \textbf{0.665} \\ \hline
U20S & 0.258 & \underline{0.786} & \textit{0.674} & \textbf{0.787} & \textbf{0.787} & 0.673 & \textbf{0.787} \\ \hline
Arvidsson & 0.416 & \underline{0.594} & 0.297 & \textbf{0.611} & 0.484 & 0.434 & \textit{0.567} \\ \hline
BitDepth \newline NucSeg & 0.224 & \textit{0.323} & 0.182 & 0.302 & \textbf{0.377} & 0.168 & \underline{0.346} \\ \hline
IFNuclei & 0.293 & \underline{0.729} & 0.301 & 0.404 & \textit{0.728} & 0.589 & \textbf{0.749} \\ \hline
Dynamic- \newline NuclearNet & 0.298 & \textbf{0.592} & 0.346 & \textit{0.512} & 0.379 & 0.455 & \underline{0.584} \\ \hline
GoNuclear & 0.339 & \underline{0.452} & 0.034 & \textit{0.447} & 0.415 & 0.112 & \textbf{0.454} \\ \hline
NIS3D & 0.216 & \underline{0.268} & 0.031 & 0.255 & 0.246 & \textit{0.264} & \textbf{0.269} \\ \hline
Parhyale Regen & 0.173 & \textit{0.215} & 0.063 & 0.138 & \textbf{0.272} & 0.144 & \underline{0.242} \\ \hline
\end{tabular}%
}
\caption{Quantitative results on fluorescence microscopy datasets for nucleus segmentation: mean segmentation accuracy per dataset and method. For each dataset, the best / second / third ranking methods are shown in \textbf{bold} / \underline{underline} / \textit{italics}.}
\label{tab:res-quantitative-fluonuc}
\end{table}

\begin{table}[h]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|p{2cm}|c|c|c|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{\shortstack{AMG\\(SAM)}} & \textbf{\shortstack{AIS\\(PathoSAM)}} & \textbf{SAM3} & \textbf{CellPose 3} & \textbf{CellPoseSAM} & \textbf{CellSAM} & \textbf{\shortstack{APG\\(PathoSAM)}} \\
\hline
PanNuke & 0.199 & \underline{0.467} & 0.341 & 0.152 & \textit{0.342} & 0.244 & \textbf{0.478} \\ \hline
IHC TMA & 0.236 & 0.264 & \underline{0.435} & 0.297 & \textbf{0.452} & \textit{0.333} & 0.272 \\ \hline
LynSec & 0.233 & \textit{0.291} & 0.157 & 0.163 & \textbf{0.561} & 0.213 & \underline{0.314} \\ \hline
MoNuSeg & 0.182 & \textbf{0.395} & 0.345 & 0.125 & \textit{0.373} & 0.302 & \underline{0.394} \\ \hline
NuInsSeg & 0.161 & 0.238 & \underline{0.312} & 0.144 & \textbf{0.349} & 0.229 & \textit{0.239} \\ \hline
PUMA & 0.232 & \underline{0.707} & 0.415 & 0.101 & \textit{0.501} & 0.294 & \textbf{0.712} \\ \hline
TNBC & 0.209 & \textbf{0.481} & 0.443 & 0.075 & \textit{0.451} & 0.383 & \underline{0.471} \\ \hline
CryoNuSeg & 0.165 & \textit{0.275} & 0.118 & 0.113 & \textbf{0.295} & 0.177 & \underline{0.293} \\ \hline
CytoDark0 & 0.182 & 0.393 & \underline{0.414} & 0.222 & \textbf{0.441} & 0.315 & \textit{0.409} \\ \hline
\end{tabular}%
}
\caption{Quantitative results on histopathology datasets for nucleus segmentation: mean segmentation accuracy per dataset and method. For each dataset, the best / second / third ranking methods are shown in \textbf{bold} / \underline{underline} / \textit{italics}.}
\label{tab:res-quantitative-histopatho}
\end{table}

\FloatBarrier
\section{Statistical Evaluation} \label{app:statistical-results}

We additionally performed paired Wilcoxon signed-rank tests on per-image mSA differences for all method pairs. Figs.~\ref{fig:app-stats-labelfree}--\ref{fig:app-stats-histo} summarize the resulting wins, losses, and draws across the four imaging modalities. Overall, the statistical analysis confirms the main ranking trends, with large score differences being consistently significant and the relative ordering of the strongest methods remaining broadly unchanged.

\begin{figure}[h]
\floatconts
  {fig:app-stats-labelfree}
  {\caption{Statistical evaluation results for all label-free microscopy datasets for cell instance segmentation.}}
  {\includegraphics[width=\linewidth]{figures/comparison_heatmap_label_free.png}}
\end{figure}

\begin{figure}[h]
\floatconts
  {fig:app-stats-fluocells}
  {\caption{Statistical evaluation results for all fluorescence microscopy datasets for cell instance segmentation.}}
  {\includegraphics[width=\linewidth]{figures/comparison_heatmap_fluo_cells.png}}
\end{figure}

\begin{figure}[h]
\floatconts
  {fig:app-stats-fluonuc}
  {\caption{Statistical evaluation results for all fluorescence microscopy datasets for nucleus instance segmentation.}}
  {\includegraphics[width=\linewidth]{figures/comparison_heatmap_fluo_nuclei.png}}
\end{figure}

\begin{figure}[h]
\floatconts
  {fig:app-stats-histo}
  {\caption{Statistical evaluation results for all histopathology datasets for nucleus instance segmentation.}}
  {\includegraphics[width=\linewidth]{figures/comparison_heatmap_histopatho.png}}
\end{figure}

\end{document}
