\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{booktabs}
% \usepackage[
% backend=biber,
% sorting=ynt
% ]{biblatex}


\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- Accepted}
\editors{Accepted for publication at MIDL 2024}


\title[Auto-Generating Weak Labels]{Auto-Generating Weak Labels for Real $\&$ Synthetic Data to Improve Label-Scarce Medical Image Segmentation}


\midlauthor{\Name{Tanvi Deshpande\nametag{$^{1}$}} \Email{tanvimd@stanford.edu}\\
\addr $^{1}$ Stanford University \AND
\Name{Eva Prakash\nametag{$^{1}$}} \Email{eprakash@stanford.edu}\\
\Name{Elsie Gyang Ross\nametag{$^{1}$}} \Email{elsie.ross@stanford.edu}\\
\Name{Curtis Langlotz\nametag{$^{1}$}} \Email{langlotz@stanford.edu}\\
\Name{Andrew Ng\nametag{$^{1}$}} \Email{ang@stanford.edu}\\
\Name{Jeya Maria Jose Valanarasu\nametag{$^{1}$}} \Email{jmjose@stanford.edu}\\
}

\begin{document}

\maketitle

\begin{abstract}
The high cost of creating pixel-by-pixel gold-standard labels, limited expert availability, and presence of diverse tasks make it challenging to generate segmentation labels to train deep learning models for medical imaging tasks. In this work, we present a new approach to overcome the hurdle of costly medical image labeling by leveraging foundation models like the Segment Anything Model (SAM) and its medical alternative, MedSAM. Our pipeline has the ability to generate \emph{weak labels} for any unlabeled medical image and subsequently use them to augment label-scarce datasets. We do this by leveraging a model trained on a few gold-standard labels and using it to intelligently prompt MedSAM for weak label generation. This automation eliminates the manual prompting step in MedSAM, creating a streamlined process for generating labels for both real and synthetic images, regardless of quantity. We conduct experiments in label-scarce settings for multiple tasks spanning ultrasound, dermoscopy, and X-ray modalities to demonstrate the usefulness of our pipeline. The code is available at \href{https://github.com/stanfordmlgroup/Auto-Generate-WLs/}{github.com/stanfordmlgroup/Auto-Generate-WLs/}.

\end{abstract}

\begin{keywords}
Label Scarcity, Weak Labeling, Segmentation, Synthetic Data\end{keywords}

\section{Introduction}

The process of automatically identifying and delineating specific structures within medical images, i.e., segmentation, has many use cases across various tasks, including diagnosis, treatment planning, and surgical procedures. Deep learning-based methods~\cite{10.1007/978-3-319-24574-4_28, nestedunet, 7785132, kiu-net, medtrans, chen2021transunet, tang2022self, ma2024u} have recently driven advances in medical image segmentation. Most of these methods are fully supervised networks and need a large amount of data and labels to perform well.


While it is typical in computer vision tasks to obtain many segmentation labels through crowd workers, medical imaging tasks require experts to  annotate the images. Due to the high cost of obtaining gold-standard labels from qualified medical professionals, practitioners often encounter the issue of \emph{label scarcity} in many medical imaging tasks. Annotating medical images for segmentation is also particularly difficult due to the fine-grained nature of the label. In contrast, however, unlabeled data is more freely available. In addition, generative models like GANs \cite{goodfellow2020generative} and diffusion models \cite{diffusion} can help generate synthetic medical images, further boosting the data size. However, this does not help augment datasets in a fully supervised pipeline, as we also need paired labels with the images to train the model. 

Recently, \citet{SAM} released Segment Anything Model (SAM), a vision foundation model trained on over 11 million images and 1 billion masks. It has the unique ability to segment any image out-of-the-box in a zero-shot setting. Segmentation using SAM can be done automatically without any inputs. However, this typically does not lead to optimal performance, and so segmentation is usually done with the help of prompting techniques on the image like points or boxes \cite{SAM}.  As SAM was trained on natural images, it is often found to be suboptimal when applied to medical imaging tasks \cite{Oliveira2023ZeroshotPO}. To address this gap, \citet{MedSAM} introduced MedSAM, a fine-tuned version of SAM created specifically for the medical imaging domain, trained on over 1 million images across 15 modalities. MedSAM was found to be superior in terms of performance on a variety of medical image segmentation tasks when compared to SAM. 


\begin{figure}[htbp]
    \centering
    \vspace{-1em}
    \includegraphics[width=\textwidth]{weak_2_2.jpg}
    \vspace{-2em}
    \caption{Labels obtained from different configurations of SAM and MedSAM.  Our method auto-generates effective input prompts (bounding boxes) using only very few annotations to generate high quality weak labels while SAM and MedSAM fail in auto options and are sensitive to input prompts in manual option.}
    \label{fig:motivation}
\end{figure}

Intuitively, MedSAM should help us annotate the unlabeled images out-of-the-box and obtain labels to augment our training dataset. However, one major limitation of SAM and MedSAM is their sensitivity to input prompts. Both models are only as good as the point or box inputs they receive on a given image. Specifically, for medical images, the area of interest to be segmented may be particularly subtle within the image with thin fine-grained boundaries. Also, inputs that include too much of the target or background result in over- or undersegmentation, requiring precise input prompts to get a good segmentation. This can be seen in Fig.~\ref{fig:motivation}, where we show labels generated from SAM and MedSAM. MedSAM clearly gives us better labels than SAM, but it is very sensitive to the bounding box prompt we use and is still constrained by the manual prompting part. We also note that auto-prompting MedSAM performs poorly and often leads to blank segmentation predictions. Therefore, selecting an appropriate input prompt is key to ensuring success. Also, eliminating the need for manual prompting could allow practitioners to auto-generate labels for any number of unlabeled real or synthetic data, which would be very useful in label-scarce scenarios.


% \begin{figure}[htbp]
%     \centering
%     \includegraphics[width=0.5\textwidth]{fig1-method.png}
%     \caption{Methodology}
%     \label{fig:methodology}

% \end{figure}


To this end, we present a new pipeline that tackles the challenge of limited labeled data by harnessing the power of foundation models like MedSAM. Our approach leverages coarse labels, generated by training segmentation models on few labels (ranging from 25 to 50), to guide the selection of inputs for unlabeled data fed into MedSAM. This process effectively creates a richer dataset, enabling the training of a significantly more accurate model while using only a few gold-standard labels and eliminating the need for time-consuming, expensive manual labeling. This enriched dataset fuels the training of deep learning models with improvements in the Dice score ranging from 6.6\% to 73.3\% for medical image segmentation datasets like BUSI, ISIC, and CANDID-PTX.

In summary, our contributions are as follows: 
\begin{itemize}
    \item We introduce a pipeline that automatically generates efficient weak labels for any unlabeled data using MedSAM, eliminating the need for manual prompts.
    \item We show that when using these weak labels to augment training datasets, we observe notable performance gains in deep learning models. This is especially impactful in scenarios with limited labeled data.
    \item We conduct thorough validation of the method and ablation studies across ultrasound (BUSI), dermoscopy (ISIC), and X-ray (CANDID-PTX) datasets.
\end{itemize}


\section{Related Works}
After the introduction of SAM \cite{SAM} and MedSAM \cite{MedSAM}, these vision foundation models have been directly applied to a myriad of medical imaging tasks, such as brain tumor segmentation \cite{braintumor}, eye feature segmentation \cite{eyesegmentation}, liver tumor segmentation, and lung nodule segmentation \cite{he2023computervision}, in both fine-tuning and zero-shot settings. 

Several approaches have also been introduced tailoring SAM for specific medical imaging tasks. For example, \citet{samus} tailors SAM specifically for ultrasound segmentation by injecting features into SAM's encoder through a parallel CNN branch. \citet{heterogeneous} jointly trains a network on heterogeneous ultrasound datasets using condition embedding blocks along with SAM, to allow SAM to adapt to each dataset separately. \citet{chen2023masam} adapts SAM for 3D medical image segmentation, by incorporating 3D adapters into SAM's encoder.

%In general, SAM and MedSAM have been used in various medical imaging settings, using text-based, image-based, and multimodal methods \cite{azad2023foundational, zhang2024segment, lee2024foundation}. 

There has also been work involving self-prompting SAM for medical image segmentation. One branch of such work involves separately learning prompts for SAM and combining them with SAM's existing architecture. One such approach includes learning a pixel-wise classifier from SAM's own embedding space and encoder to prompt SAM in few-shot settings \cite{selfprompt}. In addition, \citet{lei2023medlsam} use extreme points from 3D medical images to generate 2D bounding-box prompts for SAM. \citet{yolov8} first train YOLOv8 on several image-mask pairs to detect bounding-boxes for the regions to segment, which are then fed to SAM. Lastly, \citet{anand2023oneshot} use localization to a template image to prompt SAM in one-shot settings. The other branch of such work involves altering SAM's architecture to incorporate learning prompts.  \citet{shaharabany2023autosam} learn prompts for SAM by training a separate prompt encoder to automatically generate prompts, rather than conditioning on manual prompts, as SAM does. \citet{cui2023allinsam} use SAM to generate weak segmentation labels, which are then used to fine-tune SAM. 

Annotations generated from SAM can also be used to augment existing segmentation pipelines, such as U-Nets \cite{zhang2023input}. 
%Lastly, \citet{deng2023samu} use outputs from multiple prompts to estimate SAM's uncertainty on medical images. 
In addition, there has been work regarding sampling prompts from SAM \cite{qi2023selfguided}, and SAM has been used for a variety of non-medical tasks, such as inpainting \cite{yu2023inpaint} and captioning \cite{wang2023caption}.
\section{Method}
% \subsection{Problem Statement}

We propose a pipeline that automatically generates weak labels for both unlabeled real and synthetic data using MedSAM, eliminating the need for manual prompting. 
%This approach addresses the challenge of overfitting and poor performance often encountered when training segmentation models on limited labeled datasets. By augmenting such label-scarce datasets with weak labels generated through our pipeline, we demonstrate significant improvements in segmentation performance. 
Our pipeline includes training a small model on the available gold-standard labels, using predictions from this model to generate prompts for MedSAM, and retraining using a larger dataset consisting of the gold-standard and weak-labeled data. Our method is illustrated in Fig. \ref{fig:method}. 

\begin{figure}[htbp]
    \centering
    \vspace{-1em}
    \includegraphics[width=0.8\textwidth]{weak_1_1.png}
    \vspace{-1.5em}
    \caption{An illustration of our pipeline to auto-generate weak labels for unlabeled data. We use the limited annotations to train an initial model that generates low-quality coarse labels on unlabeled data. We then select inputs from these coarse labels as prompts to MedSAM to create higher-quality weak labels. These weak labels are used to train a stronger segmentation model.}
    \label{fig:method}
    \vspace{-1.5em}
\end{figure}

\paragraph{Preliminaries}
A label-scarce scenario in a medical imaging setting can be defined as a task for which there exists a small amount of labeled (gold-standard) data, and a larger pool of unlabeled data. Let the small labeled dataset be denoted by $\mathcal{D}_{labeled}$, where we assume no more than 50 gold-standard labels from medical practitioners. We chose this number after considering some real clinical scenarios, in which we encountered situations where it was difficult to obtain more than 50 gold-standard labels. We acknowledge that this number may vary depending on the specific task, and therefore, we conduct ablation studies using different label quantities in Section~4.4. Notably, acquiring 50 high-quality labels is typically more feasible than the hundreds to thousands often required for traditional segmentation models. We define the larger pool of unlabeled data as $\mathcal{D}_{unlabeled}$, which may comprise real medical scans or synthetic images generated by models like diffusion models. The evaluation set, denoted as $\mathcal{D}_{test}$, remains separate for performance assessment.

We also define \textit{weak labels} as labels that are not gold-standard (as they are not created by experts), but are still useful in improving the model performance.

\paragraph{Coarse Label Generation}
To make use of the full potential of limited annotations, we first train an initial model $\theta$ on the labeled dataset $\mathcal{D}_{labeled}$. This model is used to generate coarse labels for the unlabeled dataset $\mathcal{D}_{unlabeled}$.  These coarse labels, despite potentially limited accuracy due to the model's training on a smaller number of labels, serve as valuable inputs for automatically prompting MedSAM to produce higher-quality weak labels.

\paragraph{Input Selection}
%We use MedSAM to generate weak labels for the images in $\mathcal{D}_{unlabeled}$.

The prompts to MedSAM can be either points or bounding-boxes. Both ``positive points'' (corresponding to portions of the image to segment) and ``negative points'' (corresponding to portions of the image \emph{not} to segment) can be given as input point prompts. For bounding-boxes, one needs to draw a box around their region of interest and feed that as the prompt input to MedSAM. Although these prompts are an easier alternative to pixel-by-pixel annotations, they still require manual effort and a good knowledge about the segmentation task.

To auto-generate labels using MedSAM without any manual intervention, we first take the coarse label prediction from the initial model $\theta$. If the coarse label is empty, we filter out the sample, since there is no basis for a prompt to MedSAM. If not, we pick the largest contiguous blocks from the coarse label and filter out small blocks, as those are likely to be noise. For point inputs, we use the middle of the largest contiguous region(s) of the coarse label as the prompt to MedSAM; in addition, low-probability points from the original prediction mask are used as ``negative points'' for the MedSAM model. For bounding-box inputs, we compute the minimum and maximum indices of the largest contiguous region(s) of the coarse label as the prompt to MedSAM. An example of the input-selection process is shown in Fig.~\ref{fig:busi-example}. We fixed these techniques after thorough experimentation on various input selection methods (App.~\ref{input-selection}). While the input selection technique we describe works well for multiple tasks, we do acknowledge that for some tasks, there could be more intelligent prompting techniques that could perform better. However, we would like to point out that our input selection technique is much more generic and can be helpful in obtaining useful weak labels. It should also be noted that these weak labels are far better than the ones generated by SAM and MedSAM through their automatic options (App.~\ref{autoseg}).
\begin{figure}
    \centering
    \includegraphics[width=0.65\textwidth]{wl-examples.png}
    \vspace{-2em}
    \caption{Example of prompting and labeling from coarse label.}
    \label{fig:busi-example}
    \vspace{-0.5em}
\end{figure}
\paragraph{Labeling and Filtering}

To ensure the quality of weak labels generated from unlabeled images, we implement a filtering process that eliminates extreme cases. Specifically, masks containing a disproportionately high percentage of pixels ($>97\%$) belonging to either the background or the segmented class are excluded. This filtering step safeguards against misleading or uninformative masks that could potentially hinder model performance.


\paragraph{Augmenting the training data}

Finally, the weak labels generated for unlabeled images are combined with $\mathcal{D}_{labeled}$ to form the final \emph{augmented} dataset $\mathcal{D}_{augmented}$, consisting of images, where a small number of labels (25-50) are gold-standard, and the rest are weak labels. This augmented dataset will be used in the final training of the model.

% For training and evaluation, we train a model $\theta$ on binary segmentation tasks, using a combination of $\mathcal{D}_{labeled}$ and the weak labels for $\mathcal{D}_{unlabeled}$. A model $\theta_{labeled}$ trained on $\mathcal{D}_{labeled}$ and evaluated on $\mathcal{D}_{test}$ serves as the baseline to evaluate models trained on gold-standard data and weak-labeled data. 


\paragraph{Synthetic Data}
In cases where data scarcity might further exacerbate label scarcity, we explore the potential of synthetic image generation as a means to potentially increase dataset size. Thus, we additionally generate synthetic images from each dataset in order to evaluate the performance of our pipeline on such data. We train a denoising diffusion probabilistic model (DDPM) with a UNet backbone following \cite{ho2020denoising}. We set the hyperparameters following \cite{wolleb2021diffusion}, notably with a learning rate of $10^{-4}$, image size of 256, and batch size of 4.

\section{Experiments and Results}

In this section, we give details about the datasets we use, experimental setup, and the results that validate the usefulness of our method.

\subsection{Datasets} 
We utilize three datasets featuring 2D binary medical image segmentation tasks. Within each dataset, we randomly select $N$ samples from the training set to be the ``gold-standard'' labels. Note that we experiment on different values of $N$ to mimic different label-scarce settings. The rest of the training set is considered unlabeled data (i.e., we do not use the ground-truth labels in our experiments, but generate weak labels for these images). This approach enables us to train and evaluate our model without relying on the full set of ground-truth annotations.

\noindent \textbf{BUSI:} 
 The Breast Ultrasound Images Dataset (BUSI) consists of ultrasound images in three classes: benign, malignant, and normal \cite{busi}. We focus on segmentation of benign and malignant lesions from the ultrasound images. The number of samples with segmentation masks is 650. We use a randomized train-test split of 80--20, resulting in 130 samples in the test set. 

\noindent  \textbf{ISIC:} 
The International Skin Imaging Collaboration (ISIC) dataset consists of dermoscopic lesion segmentation tasks \cite{isic}. It comprises 1279 dermoscopic lesion images, divided into a training set of 900 images and a test set of 379 images.

\noindent \textbf{CANDID:} We use a subset of the CANDID-PTX dataset \cite{lungseg} consisting of 500 chest X-ray images, where the task is binary segmentation of the lungs. We use a randomized 80--20 train-test split.

% \noindent \textbf{CHASE:} CHASE is a dataset consisting of 28 retinal scans \cite{CHASE}, in which the task is retinal vessel segmentation. We use 20 random images as the training set and 8 as the test set.

\subsection{Implementation Details}
We use UNet++ \cite{nestedunet} as our base segmentation network. Please note that our method is agnostic to the choice of network; we picked UNet++ as it is stable in training on label-scarce conditions. We use a combination of DICE loss and binary cross-entropy loss with a scaling ratio of 1 and 0.5 respectively to train our models. We use the SGD optimizer with a learning rate of $10^{-3}$, and a cosine annealing learning rate scheduler with minimum learning rate $10^{-5}$. All images were also reshaped to a resolution of $256 \times 256$. We use a batch size of 4 and train for 100 epochs on NVIDIA A4000 GPUs.

\subsection{Results}
We provide the quantitative results for 3 datasets in Table \ref{results} in label-scarce settings. The performance of models trained for datasets augmented with weak labels is compared against the performance of a model trained on just the base dataset $\mathcal{D}_{labeled}$, which consists of gold-standard (GS) labels. In addition, we compare against SAM's automatic option as well as UniverSeg \cite{butoi2023universeg}, using $\mathcal{D}_{labeled}$ as the support dataset. We provide results for both bounding-box and point prompts (automatically generated by our pipeline), observing that different prompts are more suited to different datasets. We achieve improvements of up to 73.3\%, with more dramatic improvements where the initial DICE score was lower; though the weak labels may not be an exact match with the gold-standard labels, they are accurate enough to provide a boost in performance. More qualitative results can be seen in the appendix (App. \ref{qualitative}).

\begin{table}[htbp]
    \centering
    \caption{Results using our method in label-scarce settings. GS = Gold Standard.}
    \label{results}
    \vspace{4pt}
    \resizebox{\linewidth}{!}{
    \begin{tabular}{ccccccccccc}
        \toprule
         \# GS Labels & \# Weak Labels & \multicolumn{9}{c}{DICE} \\
         \midrule
         & & \multicolumn{3}{c}{BUSI} & \multicolumn{3}{c}{ISIC} & \multicolumn{3}{c}{CANDID-PTX} \\
         \cmidrule(l){3-11}
         \multicolumn{2}{c}{UniverSeg} & \multicolumn{3}{c}{0.3681} & \multicolumn{3}{c}{0.5257} & \multicolumn{3}{c}{0.7700} \\
         \cmidrule(l){3-11}
         &  & Auto & Box & Points  & Auto & Box & Points  & Auto & Box & Points\\
         \cmidrule(l){3-5} \cmidrule(l){6-8} \cmidrule(l){9-11}
         25 & 0 & 0.3059 & 0.3059 & 0.3059 & 0.6123 & 0.6123 & 0.6123 & 0.8182 & 0.8182 & 0.8182 \\
         25 & 25 & 0.2629 & 0.3613 & 0.3777 & 0.5810 & 0.7367 & 0.7884 & 0.6396 & 0.8879 & 0.8726\\
         25 & 50 & 0.2401 & 0.4326 & 0.4661 & 0.5907& 0.7587 & 0.8087 & 0.5519 & 0.9044 & 0.8872\\
         25 & 100 & 0.2124 & 0.4661 & \textbf{0.5302} & 0.5893 & 0.7424 & \textbf{0.8483} & 0.4115 & \textbf{0.9096} & 0.8443\\
        \bottomrule
    \end{tabular}
    }
\end{table}

\paragraph{Synthetic Data}
In addition to experiments using real data, we conduct experiments using data from a diffusion model trained on unlabeled samples from each of the datasets (Table \ref{synthetic}). We train the diffusion model on the train splits of BUSI, ISIC, and CXR-COVID \cite{fraiwan2023dataset} datasets respectively and sample it multiple times to generate the required number of synthetic images. As diffusion models can produce large amounts of synthetic data after being trained on just a few samples, the number of possible weak labels is higher.

\begin{table}[htbp]
    \centering
    \caption{Synthetic Data Experiments}
    \label{synthetic}
    \vspace{4pt}
\resizebox{0.6\linewidth}{!}{
    \begin{tabular}{ccccc}
        \toprule
         \# GS Labels & \# Weak Labels & \multicolumn{3}{c}{DICE} \\
         \midrule
         & & BUSI & ISIC & CANDID-PTX \\
        \cmidrule(l){3-5}
        25 & 0 & 0.3059 & 0.6123 & 0.8182 \\
        25 & 50 & 0.4177 & 0.7762 & 0.8997 \\
        25 & 250 & 0.4312 & 0.7228 & 0.9073 \\
        25 & 500 & 0.4301 & 0.7247 & 0.9022 \\
        \bottomrule
    \end{tabular}
    }
\end{table}

\subsection{Ablation Studies}
In addition to our main experiments, we perform ablation studies examining various elements of our pipeline and also discuss some limitations in this section. 
\paragraph{Label scarce settings} We vary the number of images in the base dataset to observe the effect on the performance boost from weak labeling and present the results in Table \ref{ablation-scarce}. Though increasing gold-standard labels is expected to reduce performance gains from weak labels, when starting with only 10 base images, the quality of coarse labels plummets, rendering MedSAM prompting ineffective in such extreme label scarcity. However, it can be noted that we always gain performance while using our pipeline.

\begin{table}[htbp]
    \centering
    \caption{Ablation on label-scarce settings. Experiments are conducted on BUSI dataset.}
    \label{ablation-scarce}
    \vspace{4pt} % Add space after the caption
\resizebox{0.73\linewidth}{!}{
    \begin{tabular}{cccc}
        \toprule
         \# GS Labels & \# Weak Labels (Syn. Data) & DICE (GS) & 
         DICE (GS + WL) \\
         \midrule
         10 & 100 & 0.2327 & 0.2637  \\
         25 & 100 & 0.3059 & 0.4661 \\
         50 & 100 & 0.4379 & 0.5416  \\
         100 & 100 & 0.5575 & 0.6155  \\
        \bottomrule
    \end{tabular}
    }
\end{table}
\paragraph{SAM vs. MedSAM}
We ablate on the use of the base SAM model compared to the fine-tuned MedSAM model (Table \ref{ablation-sam}) and observe that MedSAM outperforms SAM. 
\begin{table}[htbp]
    \centering
    \caption{SAM vs. MedSAM. Experiments are conducted on BUSI dataset.}
    \label{ablation-sam}
    \vspace{4pt} % Add space after the caption
    \resizebox{0.5\linewidth}{!}{
    \begin{tabular}{cccc}
        \toprule
        Model & \# GS Labels & \# Weak Labels & DICE \\
        \midrule
         ---& 25 & 0 & 0.3059 \\
        SAM & 25 & 100 & 0.3886 \\
        MedSAM & 25 & 100 & 0.4661 \\
        \bottomrule
    \end{tabular}
    }
\end{table}
% \paragraph{Type of Prompt}
% We perform an ablation on the usage of bounding boxes compared to point prompts and report it in Table \ref{ablation-prompt}. We observe that bounding boxes are largely more effective than point prompts.
%\begin{table}[htbp]
%    \centering
%    \caption{Ablation on the type of prompt. Experiments are %conducted on CANDID-PTX dataset.}
%    \label{ablation-prompt}
%    \vspace{4pt} % Add space after the caption
%    \begin{tabular}{cccc}
%        \toprule
%        Prompt & \# GS Labels & \# Weak Labels & DICE \\
%        \midrule
%        --- & 25 & 0 & 0.8182 \\
%        Bounding Box & 25 & 100 & 0.9096 \\
%        Points & 25 & 100 & 0.8443 \\
%        \bottomrule
%    \end{tabular}
%\end{table}

\paragraph{Limitations}


    Our work focuses on 2D ultrasound, X-ray, 
    and dermoscopic data. We recognize that tasks involving highly intricate structures may require different input selection approaches due to potential sensitivities in our method (App. \ref{app:chase}). We also did not touch on 3D segmentation tasks; future work could investigate extensions to address such scenarios.
 
%We provide results on the CHASE dataset \cite{CHASE}, which consists of 28 images for the task of retinal vessel segmentation.
%\begin{table}[htbp]
%\label{limitations}
%    \centering
%    \caption{CHASE}
%    \vspace{4pt} % Add space after the caption
%    \begin{tabular}{cccc}
%        \toprule
%        \# GS Labels & \# Weak %Labels & DICE \\
%        \midrule
%         25 & 0 & \\
%         25 & 25 & \\
%         25 & 50 & \\
%        25 & 100 & \\
%        \bottomrule
%    \end{tabular}
%\end{table}


\section{Conclusion}
We introduce a new method for addressing label-scarce scenarios in medical image segmentation using recent advancements in vision foundation models. By selecting inputs to MedSAM from coarse labels produced by a model trained on a small gold-standard dataset, we create augmented datasets with weak labels that can be auto-generated for any number of unlabeled data. Using these augmented datasets, we train models that obtain significant boosts in performance on label-scarce settings. Weak labels generated through our method can also be used to improve human-in-the-loop annotation processes.

\bibliography{midl24_117}

\appendix

\newpage

\section{Qualitative Results}
\label{qualitative}
In this section, we provide some examples comparing predictions from the base model trained on 25 gold-standard labels, compared to the final model trained on augmented datasets consisting of 25 gold-standard and 100 weak labels. We observe that the final prediction is far more accurate than the base model predictions.

\begin{figure}[htbp]
    \centering
    \vspace{-1em}
    \includegraphics[width=\textwidth]{outputs.png}
    \vspace{-2em}
    \caption{Example outputs from each dataset}
\end{figure}


\section{Comparison to SAM and MedSAM Automatic Segmentation}
In this section, we compare our weak labels to the outputs from SAM and MedSAM in their automatic options (Fig.~\ref{auto}). SAM tends to segment many regions, making it unclear which one is the target. In contrast, MedSAM tends to generate blank masks in its auto option. The weak labels generated by our method are better and closer to the ground truth (GT), while also being auto-generated.
\label{autoseg}
\begin{figure}[h]
    \centering
    \vspace{-1em}
    \includegraphics[width=\textwidth]{autoseg-comparisons.png}
    \vspace{-2em}
    \caption{Comparison to SAM and MedSAM automatic segmentation for each dataset}
    \label{auto}
\end{figure}
\section{Input Selection}
\label{input-selection}
We provide comparisons of various input selection methods to our method in Fig. \ref{inputs}. In the third column, for example, we choose inputs based on the darkest pixels present in the image. In the fourth column, we use a bounding box based solely on the image size rather than the image content. In the fifth and sixth columns, we include outputs from our pipeline, using point and box prompts that were automatically generated from the base model.

Our pipeline outperforms both of these alternative methods. In addition, we note that our pipeline is dataset-agnostic; it may not be the case that the target region contains the darkest pixels in an image, for example, or that it can be captured with a bounding box encompassing most of the image.
\begin{figure}[htbp]
    \centering
    \vspace{-1em}
    \includegraphics[width=\textwidth]{selection-comparisons.png}
    \vspace{-2em}
    \caption{Comparison of input selection methods}
    \label{inputs}
\end{figure}

\section{Limitations: Results on CHASE Dataset}
\label{app:chase}
We provide results on the CHASE dataset \cite{CHASE}, which consists of 28 images for the task of retinal vessel segmentation, using a train-test split of 20 and 8 images, respectively.

We note that our pipeline does not improve performance on the CHASE dataset; segmentation of particularly fine-grained vessels may require alternate input selection methods due to the increased sensitivity MedSAM may have to inputs on them. Furthermore, we note that due to the extremely small size of the CHASE dataset (28 images), the label-scarcity of the gold-standard dataset may be too extreme to provide informative coarse labels to help us generate high-quality weak labels. 
\begin{table}[htbp]
    \centering
    \caption{CHASE}
    \label{limitations}
    \vspace{4pt} % Add space after the caption
    \begin{tabular}{ccc}
        \toprule
        \# GS Labels & \# Weak Labels & DICE \\
        \midrule
         5 & 0 & 0.2406\\
         5 & 5 & 0.2557\\
         5 & 10 & 0.218\\
        5 & 15 & 0.2081\\
       \bottomrule
    \end{tabular}
\end{table}

\end{document}

