\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{nicematrix}
% \usepackage{caption}
\usepackage{arydshln}
\usepackage{tabularray}
% \usepackage{hyperref}
\usepackage{siunitx}        % new, for nicer aligning of numbers in table
\usepackage{etoolbox} 
\usepackage[export]{adjustbox}
\usepackage{graphicx}
% \usepackage{amsthm}
\usepackage{booktabs}
\usepackage{bbm}
\usepackage{bm}
\usepackage{multirow}
\usepackage{verbatim}
\usepackage[ruled]{algorithm2e}
\usepackage{colortbl}
\PassOptionsToPackage{table, dvipsnames}{xcolor}
\usepackage[dvipsnames]{xcolor}
% \usepackage[colorlinks,linkcolor=blue,citecolor=blue]{hyperref}
\usepackage{booktabs}
% \usepackage{subcaption}
\usepackage{amsfonts}
\usepackage{dcolumn}
\providecommand{\zkreffig}[1]{Figure~\ref{#1}} 
\providecommand{\zkreftb}[1]{Table~\ref{#1}}

\usepackage{booktabs}
\usepackage{bbm}
\usepackage{threeparttable}
%\usepackage{ulem}
\usepackage{color,graphicx}
\usepackage{amsmath}
\usepackage[ruled]{algorithm2e}
%\usepackage{appendix}
\usepackage{bm}
\usepackage{multirow}
\usepackage{arydshln}
\usepackage{booktabs}
\usepackage{amssymb}
%\usepackage[colorlinks]{hyperref}
\usepackage{hyperref}
%\usepackage{changes}
\definecolor{babyblue}{rgb}{0.54, 0.81, 0.94}
\definecolor{babypink}{rgb}{0.96, 0.76, 0.76}

\definecolor{navy}{RGB}{9,123,248}
\definecolor{red}{RGB}{215,59,46}
\definecolor{teal}{RGB}{0, 128, 128}
% \definechangesauthor[name={Review 1}, color=navy]{R1}
% \definechangesauthor[name={Review 3}, color=orange]{R3}
% \definechangesauthor[name={Review 4}, color=red]{R4}
% \definechangesauthor[name={Review 1+ Review 2}, color=teal]{R1+R4}

% \newcommand[1]{\added[id=R1]{\textit{#1}}}

% \newcommand[1]{\added[id=R3]{\textit{#1}}}

% \newcommand[1]{\added[id=R4]{\textit{#1}}}

% \newcommand[1]{\added[id=R1+R4]{\textit{#1}}}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.

% If you use the hyperref package, please uncomment the following line
% to display URLs in blue roman font according to Springer's eBook style:
\renewcommand\UrlFont{\color{blue}\rmfamily}
\usepackage{mwe} % to get dummy images
% \jmlrvolume{}
% \jmlryear{2025}
% \jmlrworkshop{Full Paper -- MIDL 2025 submission}
% % \editors{Under Review for MIDL 2025}

% \documentclass{midl}  % Do NOT use the 'anon' option

\jmlryear{2025}

\jmlrworkshop{Full Paper -- MIDL 2025}

\jmlrvolume{-- 21}

\editors{Accepted for publication at MIDL 2025}
\title[MedCL]{MedCL: Learning Consistent Anatomy Distribution for Scribble-supervised Medical Image Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
\midlauthor{
  \Name{Ke Zhang} \Email{kzhang99@jhu.edu} \\
  \Name{Vishal M. Patel} \Email{vpatel36@jhu.edu} \\
  \addr Department of Electrical and Computer Engineering, Johns Hopkins University, USA}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship

% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }

\begin{document}

\maketitle

\begin{abstract}
Curating large-scale fully annotated datasets is expensive, laborious, and cumbersome, especially for medical images.  Several methods have been proposed in the literature that make use of weak annotations in the form of scribbles.   
However, these approaches require large amounts of scribble annotations, which are often unavailable for the disease species that fall in the long-tailed distribution, and have only been applied to the segmentation of regular organs. 
Motivated by the fact that the medical labels have anatomy distribution priors, we propose a scribble-supervised clustering-based framework, called MedCL, to learn the inherent anatomy distribution of medical labels. Our approach consists of two steps:
i) Mix the features with intra- and inter-image mix operations, and
ii) Perform feature clustering and regularize the anatomy distribution at both local and global levels.
Combined with a small amount of weak supervision,  the proposed MedCL is able to segment both regular organs and challenging irregular pathologies.
We implement MedCL based on SAM and UNet backbones, and evaluate the performance on three open datasets of regular structure (MSCMRseg), multiple organs (BTCV) and irregular pathology (MyoPS). It is shown that even with less scribble supervision, MedCL substantially outperforms the conventional segmentation methods. Our code is available at \url{https://github.com/BWGZK-keke/MedCL}.
\end{abstract}

\begin{keywords}
Weakly supervised learning, Segmentation, Scribble, Data augmentation
\end{keywords}

\section{Introduction}
Manually labeling medical images is an arduous task that requires remarkable efforts from clinical experts. To alleviate this challenge, many recent approaches use sparsely annotated data for model training, termed weakly supervised learning (WSL)~\cite{l2016inscribblesup,bai2018recurrent,ji2019scribble,luo2022scribble, han2024dmsps}.
%SSL typically requires precise annotations for a portion of images in the dataset. 
WSL leverages weak supervision such as scribbles, points, bounding boxes, and image-level labels~\cite{tajbakhsh2020embracing}, {by modeling shape priors~\cite{kervadec2021beyond} and developing novel loss functions~\cite{kervadec2019constrained}}. Existing WSL medical image segmentation methods focus mainly on scribbles, which are suitable to annotate nested structures~\cite{Can2018LearningTS}. These methods make use of large amounts of scribble annotations to compensate for the lack of fully supervised data. 
The existing scribble-supervised segmentation approaches can be divided into two categories. The first group of works aims to generate pseudo-labels that are then used for supervised training~\cite{luo2022scribble,bai2018recurrent,l2016inscribblesup,ji2019scribble,han2024dmsps}. However, these models are susceptible to noise introduced by inaccurate segmentation results.
%These methods are susceptible to the quality of the generated labels, which can be of poor quality as the annotation amount decreases and structural complexity increases.
The second line of approaches focuses on regularization techniques, to constrain the prediction with size priors~\cite{zhang2022shapepu} and the consistency of the augmented versions~\cite{zhang2022cyclemix,zhang2024modelmix}.
These techniques work for regular organs, but sometimes fail in capturing the characteristics of irregular pathologies.
Unlike existing methods, we propose to investigate the inherent anatomy distribution priors and take them as the principle to guide the image segmentation. 
\\
\indent There exists a group of mix-based augmentation techniques~\cite{zhang2018mixup,devries2017cutout,yun2019cutmix,kimICML20,kim2021comixup}, termed as mixup. The mixup operation could lead to unrealistic results and change the shape of features significantly.
To overcome this, several techniques~\cite{zhang2022shapepu,zhang2022cyclemix,zhang2024modelmix} have been proposed to leverage mix-invariant properties for regularization. 
However, these works treat mix-up as a regularization strategy and fail to increase the diversity of the original training samples.
To tackle the above mentioned problems, we propose MedCL to learn anatomy distribution in an unsupervised manner.
Existing unsupervised representation learning methods~\cite{caron2018deep,caron2019unsupervised,caron2020unsupervised,chen2021exploring,chen2020simclr,wu2018unsupervised,chaitanya2020contrastive,wu2024voco} are primarily designed for pre-training and often rely on large batch sizes to achieve optimal performance. In contrast, MedCL adopts a two-stage process of {feature mixing} and clustering, enabling models to be trained from scratch while effectively capturing the anatomical patterns inherent in medical semantics.
Firstly, we propose a weak-to-strong mix strategy to thoroughly mix image features at both intra- and inter- image levels.
Secondly, we perform the feature clustering and require the compactness within center, discriminability between centers, and the consistent anatomy distribution across all centers. \\
\indent Our contributions are summarized as follows: 1) We propose a novel unsupervised clustering-based framework, \emph{i.e.}, MedCL, to learn the anatomy distribution priors for medical image segmentation, with two implementations using SAM \cite{kirillov2023segment} and UNet \cite{ronneberger2015u} backbones.
2) A feature-shuffling mechanism has been derived to generate a variety of image-prompt pairs for clustering. %We propose weak-to-strong mix approaches to shuffle features completely and increase the diversity of training samples at both intra- and inter- image levels.
3) We introduce the feature clustering approach to learn the inherent anatomy relationships among semantics. %and regularize the consistent distribution of feature centers at both global and local level. 
Specifically, we apply constraints to obtain compact, distinguishable, and regularly distributed feature clusters.
4) %MedCL is evaluated on two open datasets of regular structure and irregular pathology segmentation, \textit{i.e.}, MSCMRseg and MyoPS. Combined with fewer scribbles, the proposed MedCL demonstrates promising performance significantly better than existing methods.
Evaluated on three open datasets, the proposed MedCL demonstrates promising performance significantly better than existing methods with fewer scribbles.
\begin{figure*}[!t]
\centering
         \includegraphics[width=\textwidth]{figures/pipeline.png}
\caption{{An overview of the proposed MedCL based on the SAM architecture.}}
\label{fig:pipeline}
\end{figure*}

\section{Method}
As shown in \zkreffig{fig:pipeline}, MedCL is composed of two steps, including feature {mixing} and clustering.
Firstly, we mix image features at intra- and inter- image levels. 
Then, the feature clustering is conducted with the online mapping and is regularized with anatomy properties. 
Finally, we apply MedCL to the medical image segmentation task, and combine it with the weak supervision, \textit{i.e.}, scribble and image-level labels, to achieve better performance.
\subsection{{Mixing features}}
%We perform feature shuffling via two-stages of weak-to-strong mix augmentation. In the first stage, we apply a gentle shuffling of the image features by blending the image with its rotated version.
%In the second stage, we shuffle the images throughout the batch via inter-image mix and prompt fusion.
%Finally, we sample the augmented pairs to increase the size of the image set for clustering.
\noindent\textbf{Intra-mix:}
We surmise that an image rotated by a small angle resembles an artifact. Therefore, we propose to mix the image $x$ of dimension $h \times w$ with its rotated version $R(x,\theta)$. We sample the intra-image mix ratio $\beta'$ from the beta distribution and obtain the mixed image $x'$ as $x' = \beta' x + (1-\beta') R(x,\theta)$. {Correspondingly, we define $y'$ as the segmentation of the mixed image $x'$.}
We further introduce bounding boxes to enable the multi-scale mix while preserving the shape characteristics within the region of interest.  We first randomly sample bounding boxes from the image, and train the model to predict the segmentation results of the bounding boxes. Then, we perform mixup outside of the bounding boxes. Denoting the bounding box with the binary mask $I_b$, we modify the mix operation as $x' = I_b x + (1-I_b)[\beta' x+ (1-\beta') R(x,\theta)]$.

% To facilitate model training, we also randomly sample text prompts for each class, and generate their combinations randomly. For $m$ classes, we first sample text prompts $\phi_i$ for each class $\omega_i$, i.e., $\Phi_1 = \{\phi_i\}_{i=1}^m$ to instruct model to generate segmentation for each class. Then, we sample the class set $\Omega_k$ of size $k$, and its text prompt combination $\Phi'_k=\{\phi_i|i\in\Omega_k\}$.
% For simplicity, we increase $k$ from 2 to $m$, so that $\Omega_k$ gradually expands to the entire set.
% Then, the generated text prompts are represented as
% $\bm{\Phi} = \{\Phi_1,\Phi'_2, \cdots, \Phi'_m\}$ of dimension $m-1$. These prompt combinations instruct model to generate the combined segmentation results of multiple classes. Let $(x',I_b,\Phi)$ be the pair of intra-mixed image, bounding box mask and text prompts, the corresponding ground truth segmentation is denoted as $\bm{y} = \{y_i\}_{i=1}^m \cup\{y'_k\}_{k=2}^m$, which is of dimension $h\times w \times (2m-1)$, with both $y_i$ and $y'_k$ of dimension $h \times w \times 1$
% The element $y_i$ denotes the segmentation vector for class $\omega_i$, and $\bm{y}_k$ refers to the combination of segmentations for class set $\Omega_k$, \emph{i.e.}, $y'_k = \bigcup_{i \in \Omega_k} y_i $.

To facilitate model training, we randomly sample text prompts for each class and generate their combinations. For \( m \) classes, we begin by sampling text prompts \( \phi_i \) for each class \( \omega_i \), resulting in \( \Phi_1 = \{\phi_i\}_{i=1}^m \), which instructs the model to generate segmentation for individual classes. Next, we sample class subsets \( \Omega_k \) of size \( k \) and derive the corresponding text prompt combinations \( \Phi'_k = \{\phi_i \mid i \in \Omega_k\} \).
For simplicity, we progressively increase \( k \) from 2 to \( m \), allowing \( \Omega_k \) to gradually expand until it covers the entire class set. The resulting text prompt combinations are denoted as:  
$
\bm{\Phi} = \{\Phi_1, \Phi'_2, \ldots, \Phi'_m\}.
$  
These combinations instruct the model to generate segmentation results for multiple combined classes. Let \( (x', I_b, \bm{\Phi}) \) represent the mixed pairs, including the intra-mixed image, the bounding box mask, and the text prompts. The corresponding ground truth segmentation is denoted by \( \bm{y} = \{y_i\}_{i=1}^m \cup \{y'_k\}_{k=2}^m \), where \( \bm{y} \) has a dimension of \( h \times w \times (2m-1) \). Each component \( y_i \) and \( y'_k \) has a dimension of \( h \times w \times 1 \).
Here, \( y_i \) represents the segmentation probability map for class \( \omega_i \), while \( y'_k \) is the combined segmentation of class subset \( \Omega_k \), defined as $y'_k = \bigcup_{i \in \Omega_k} y_i$. {The union operation defines the total area covered by the segmentation labels.}
\\
\indent\textbf{Inter-mix:}
To perform the mix across images, we simply mix two images, fuse the bounding boxes, and interpolate the text tokens accordingly. Let $(x'_1,I_{b_1}, \bm{\Phi}_1)$ and $(x'_2, I_{b_2}, \bm{\Phi}_2)$ be the two training sample pairs, we derive the mixed samples $(x_{12}, I_{b_{12}}, \bm{\Phi}_{12})$ as $x_{12} = \beta x'_1 + (1-\beta) x'_2$, $I_{b_{12}} = I_{b_1} \cup I_{b_2}$, $\bm{\Phi}_{12} = \beta e(\bm{\Phi}_1) + (1-\beta) e(\bm{\Phi}_2)$, respectively.
The text tokens $e(\bm{\Phi})$ are extracted with a prompt encoder $e(\cdot)$, and the inter-mix ratio $\beta$ is sampled from the beta distribution. The mixture of the two predictions is denoted as $M(\hat{\bm{y}}'_1,\hat{\bm{y}}'_2) = \beta \hat{\bm{y}}'_1 + (1-\beta)\hat{\bm{y}}'_2$, {where the addition refers to pixel-wise addition of the probability maps}. We thereby apply the mix consistency loss ($\mathcal{L}_{\text{mix}}$) to the segmentation of the mixed image, $\hat{\bm{y}}_{12} = f(x_{12},I_{b_{12}},\bm{\Phi}_{12})$:
\begin{gather}
\mathcal{L}_{\text{mix}} = \text{sim}(\hat{\bm{y}}_{12},M(\hat{\bm{y}}'_1,\hat{\bm{y}}'_2)),
\label{eqmix}
\end{gather}
where $\text{sim}(z_1,z_2)=-\frac{z_1\cdot z_2}{\|z_1\|_2\cdot\|z_2\|_2}$. We aim to minimize the negative cosine similarity between the mixed segmentation $M(\hat{\bm{y}}'_1,\hat{\bm{y}}'_2)$ and the segmentation of the mixed image $\hat{\bm{y}}_{12}$.
\\
\indent\textbf{Sampling:}
%drop out
We sample the augmented image multiple times to increase the number of features for clustering. 
Inspired by previous work~\cite{caron2020unsupervised}, we first randomly crop regions of an image from a range of resolutions.  Secondly, the intra- and inter- image mix operations are performed to obtain the mixed pairs of \( (x', I_b, \bm{\Phi}) \). Finally, we repeatedly sample training images from the mixed pairs to completely blend the features within the entire database, and achieve about 40 times amplification of training samples for each epoch.


\subsection{Cluster features}
%We perform feature clustering through online mapping and anatomy regularization. Firstly, we map the predicted segmentation to the prototype space, and then regularize it with anatomy properties at both global and local levels.

\noindent\textbf{Online mapping:}
For prediction \( \hat{\bm{y}} \) with dimensions {\((2m-1) \times h \times w \)}, we aim to map it to a set of anatomical prototypes \( \bm{a} = [a_1, \ldots, a_d] \) of size {$(2m-1) \times d$}. By flattening the predictions, {the multi-label probability map} \( \hat{\bm{y}} \) is reshaped to dimension {\((2m-1)\times n\)}, where $n = h \times w$. A mapping \( P \) of size \( d \times n \) is defined to maximize the similarity between the prediction \( \hat{\bm{y}} \) and the prototypes \( \bm{a} \).
The optimization objective is formulated as follows:
\begin{equation}
\max_{P\in \mathbb{R}^{d\times n}} \text{Tr}(P^T\bm{a}^T\hat{\bm{y}}) - w\sum_{i=1}^{d\times n}P_{i}\log P_{i},
\label{eq5}
\end{equation}
where the second term with weight $w$ is taken for regularization, aimed to control the smoothness of mapping $P$. The solution of Eq.~(\ref{eq5}) is denoted as $P^*$, which is derived as the normalized exponential matrix~\cite{cuturi2013sinkhorn}:
\begin{equation}
P^* = \text{Diag}(U) \exp\left(\frac{\bm{a}^T\hat{\bm{y}}}{w}\right)\text{Diag}(V),
\label{eq6}
\end{equation}
where $U\in \mathbb{R}^d$ and $V\in\mathbb{R}^n$ indicate the re-normalization vectors, which are efficiently determined using the Sinkhorn-Knopp algorithm~\cite{cuturi2013sinkhorn,caron2020unsupervised}. {We optimize the algorithm on a per-batch basis; the corresponding pseudocode is provided in Appendix Section 7.}
\\
\indent\textbf{Anatomy regularization:}
We assume the anatomy prototype clusters should meet the following criteria: (1) Compactness: the density of prototype distribution within clusters. (2) Discriminability: Clear boundaries between clusters. (3) Anatomy consistency: Consistent distribution priors across all clusters. The cluster loss $\mathcal{L}_{\text{cluster}}$ is defined as:
\begin{equation}
\mathcal{L}_{\text{cluster}} =  -\log \left[\frac{\sum_{b,i}\exp(
\frac{1}{\tau}\text{sim}(\bm{a}^b_i, \bm{a}_i^*))}{\sum_{b,i}\exp(
\frac{1}{\tau}\text{sim}(\bm{a}^b_i, \bm{a}_i^*))+\sum_{i,j}\mathbbm{1}_{i\neq j}\exp(\frac{1}{\tau}\text{sim}(\bm{a}^*_i,\bm{a}^*_j))}\right] ,
\label{cluster_eq}
\end{equation}
where $\bm{a}^*_i$ and $\bm{a}^*_j$ ($i,j\in[1,m]$) denote the cluster centers; the center of the prototypes $\{\bm{a}_i^b\}_{b=1}^B$ for class $\omega_i$ is calculated by $\bm{a}^*_i=\frac{1}{B}\sum_{b=1}^B\bm{a}_i^b$; $b$ refers to the index of samples within the batches of size $B$; $\tau$ is the temperature parameter controlling sharpness~\cite{wu2018unsupervised}. Then,  $\sum_{b,i}\exp(
\frac{1}{\tau}\text{sim}(\bm{a}^b_i, \bm{a}_i^*))$ controls the compactness and $\sum_{i,j}\mathbbm{1}_{i\neq j}\exp(\frac{1}{\tau}\text{sim}(\bm{a}^*_i,\bm{a}^*_j))$ represents the discriminability term.
%According to~\cite{chen2020simsiam}, we introduce the stop gradient operation to make the gradient does not back-propagate to $P$, which is the mapping in the sub-problem of Eq.(~\ref{eq5}).\\


For the third principle, \emph{i.e.}, consistent anatomy distribution, we apply the consistency constraint to segmentation and prototypes at both global and local levels. By manipulating the text prompts, we construct the multi-scale regularization for model prediction. For the class set $\Omega_k$ of size from 2 to $m$, we define the anatomy consistency loss $\mathcal{L}_{\text{ac}}$:
\begin{equation}
    \mathcal{L}_{\text{ac}} = \sum_{j= m+1}^{2m-1}[\text{sim}(\hat{\bm{y}}_j, \sum_{i\in\Omega_{k}}\hat{\bm{y}}_i) + \text{sim}(\hat{\bm{a}}^*_j, \sum_{i\in\Omega_{k}}\hat{\bm{a}}^*_i)],
    \label{ac_eq}
\end{equation}
where $k = j-m+1$, indicating that the number of categories within $\Omega_k$ increases along with $j$, thereby achieving the multi-scale constraint of distribution from local to global level. 
% It shows that the number of categories within $\Omega_k$ increases as $j$ increases from 2 to $m$, thereby achieving multi-scale distribution constraints from local to global.
The first term applies the consistency of segmentation and the second term regularizes the anatomy distribution across prototype clusters.
\\
\noindent\textbf{Weak supervision:}
Although MedCL is conducted in an unsupervised setting, it can be easily combined with weak supervision forms of scribbles and image-level labels. For scribble annotations, we calculate the cross-entropy loss and dice loss for the annotated pixels, and thereby define the scribble-supervised loss as $\mathcal{L}_{\text{scribble}} = -\sum_{i=1}^m\left[\bm{y}_i\log(\hat{\bm{y}}_i) + 2\bm{y}_i \hat{\bm{y}}_i/(\bm{y}_i+\hat{\bm{y}}_i)\right]$,
where $\bm{y}_i$ denotes the scribble annotations.
For image-level labels, we exploit the given set of categories ($\Psi$) presented in the image, and require the sum of their predicted probabilities to be equal to 1. The weakly-supervised loss of image-level labels is formulated accordingly as:
$\mathcal{L}_{\text{category}} = -\log\left(\sum_{i\in\Psi} \hat{\bm{y}}_i\right)$,
which also minimizes the probability of non-existent classes.
Then, the training objective $\mathcal{L}$ is derived as:
\begin{equation}
\mathcal{L} =  \underbrace{\mathcal{L}_{\text{mix}} + \mathcal{L}_{\text{cluster}} + \mathcal{L}_{\text{ac}}}_{\text{unsupervised}} + \underbrace{\mathcal{L}_{\text{scribble}} + \mathcal{L}_{\text{category}}}_{\text{supervised}}.
\end{equation}
%where $\mathcal{L}_{\text{mix}}$ leverages the mix equivalence; $\mathcal{L}_{\text{cluster}}$ controls the compactness and discriminability of clusters; $\mathcal{L}_{\text{ac}}$ regularizes the anatomy pattern of segmentation and prototype clusters.

% \begin{table*}[!t]
% 	\caption{Regular structure segmentation (Dice): The results in Dice of MedCL compared with scribble-supervised segmentation methods on MSCMRseg using 5 scribbles.}\label{tab:tab1}
% 	\centering
% 		\resizebox{0.75\linewidth}{!}{
% 			\begin{NiceTabular}{fcccccccccc}
% 				\midrule
% 				\multirow{2}{*}{Methods}&\multirow{2}{*}{Backbone}&\multicolumn{4}{c}{5/25 scribbles}& \multicolumn{4}{c}{25/25 scribbles}\\
% 				\cmidrule(lr){3-6}\cmidrule(lr){7-10}
% 				&&{LV} & MYO & RV & {Avg}&LV & MYO & RV&{Avg}\\
% 				\midrule
% 				\multicolumn{1}{l|}{PCE}&UNet&.261$\pm$.106&.193$\pm$.095&.018$\pm$.013&{.157$\pm$.132}&.514$\pm$.078&.582$\pm$.067&.058$\pm$.023&.385$\pm$.243\\ 
%                 \multicolumn{1}{l|}{MixUp~\cite{zhang2018mixup}}&UNet&.440$\pm$.102&.310$\pm$.127&.021$\pm$.013&{.257$\pm$.200}&.483$\pm$.09&.466$\pm$.080&.455$\pm$.134&.468$\pm$.102\\ 
% 				\multicolumn{1}{l|}{Cutout~\cite{devries2017cutout}}&UNet&.315$\pm$.103&.307$\pm$.153&.166$\pm$.110&{.263$\pm$.139}&.468$\pm$.076&.642$\pm$.132&.694$\pm$.146&.602$\pm$.154\\ 
% 				\multicolumn{1}{l|}{CutMix~\cite{yun2019cutmix}}&UNet&.335$\pm$.119&.282$\pm$.099&.017$\pm$.013&{.211$\pm$.166}&.588$\pm$.062&.624$\pm$.118&.760$\pm$.106&.657$\pm$.122\\ 
% 				\multicolumn{1}{l|}{Puzzle Mix~\cite{kimICML20}}&UNet&.084$\pm$.029&.351$\pm$.104&.010$\pm$.008&{.148$\pm$.160}&.061$\pm$.021&.630$\pm$.082&.029$\pm$.012&.240$\pm$.283\\ 
% 				\multicolumn{1}{l|}{Co-mixup~\cite{kim2021comixup}}&UNet&.322$\pm$.169&.221$\pm$.097&.034$\pm$.010&{.192$\pm$.163}&.435$\pm$.072&.300$\pm$.075&.436$\pm$.116&.390$\pm$.109\\ 
% 			    \multicolumn{1}{l|}{CycleMix~\cite{zhang2022cyclemix}}&UNet&.517$\pm$.086&.421$\pm$.108&.007$\pm$.007&{.315$\pm$.237}&.872$\pm$.060&.734$\pm$.048&.787$\pm$.073&.798$\pm$.083\\ 
%           \multicolumn{1}{l|}
% {ShapePU~\cite{zhang2022shapepu}}&UNet&.758$\pm$.191&.567$\pm$.168&.059$\pm$.026&{.461$\pm$.331}&.880$\pm$.046&.785$\pm$.080&\underline{.833$\pm$.087}&.833$\pm$.082\\ 
%           \multicolumn{1}{l|}{WSL4~\cite{luo2022scribble, han2024dmsps}}&UNet&.809$\pm$.079&.653$\pm$.109&.599$\pm$.261&{.687$\pm$.191}&.902$\pm$.040&.815$\pm$.033&.828$\pm$.101&.848$\pm$.076\\
%            \multicolumn{1}{l|}{ModelMix~\cite{zhang2024modelmix}}&UNet&\underline{.875$\pm$.077}&\underline{.754$\pm$.079}&\underline{.722$\pm$.201}&{\underline{.784$\pm$.145}}&-&-&-&-\\
%         	\multirow{2}{*}{MedCL}&{SAM}&.882$\pm$.065&.745$\pm$.065&.789$\pm$.103&{.805$\pm$.065}&.886$\pm$.074&.758$\pm$.070&.803$\pm$.091&.816$\pm$.067\\
%          &{UNet}&\textbf{.904$\pm$.045}&\textbf{.787$\pm$.047}&\textbf{.804$\pm$.101}&{\textbf{.832$\pm$.086}}&\textbf{.930$\pm$.034}&\textbf{.858$\pm$.037}&\textbf{.885$\pm$.046}&\textbf{.891$\pm$.049}\\
%   		   \midrule
% 			\multicolumn{1}{l|}{FullSup-UNet}&UNet&.775$\pm$.158&.604$\pm$.147&.572$\pm$.207&{.651$\pm$.191}&.917$\pm$.046&.813$\pm$.058&.750$\pm$.162&.827$\pm$.122\\ 
% 			\multicolumn{1}{l|}{FullSup-nnUNet}&nnUNet&.885$\pm$.085&.757$\pm$.147&.757$\pm$.201&{.799$\pm$.160}&.909$\pm$.049&.880$\pm$.027&.902$\pm$.047&.907$\pm$.044\\ 
% 				\midrule
% 		\end{NiceTabular}}
% \end{table*}

% \begin{figure}[!t]
% \centering
%          \includegraphics[width= 0.8\textwidth]{figures/MSCMR_visual_original.png}
% \caption{Qualitative results of typical cases from MSCMR dataset.} 
% \label{fig:MSCMR_visual}
% \end{figure}
\section{Experiments}
%In this section, we first validate the performance of MedCL on regular organ (MSCMRseg) and irregular pathology (MyoPS) segmentation.
%Then, we compare two implementations of MedCL based on SAM and UNet.
%Afterward, we empirically study the behaviours of MedCL, including the ablations of components, supervision sensitivity, the robustness against text prompts and noisy bounding boxes.
% \subsection{Experiment Setup}
\textbf{Datasets:}
\textbf{MSCMRseg}~\cite{gao2023bayeseg,zhuang2018multivariate} dataset is released by the MICCAI'19 multi-sequence cardiac MR Segmentation challenge. It comprises late gadolinium enhancement (LGE) MRI images obtained from 45 patients with cardiomyopathy.
The organizers provided annotations for the left ventricle (LV), myocardium (MYO), and the right ventricle (RV) in these images. Following~\cite{yue2019cardiac}, we randomly partition the images from the 45 patients into three sets: 25 training images, 5 validation images, and 15 test cases.  We adopt the manual scribble annotations released by~\cite{zhang2022cyclemix}.
\textbf{MyoPS}~\cite{luo2022mathcal,qiu2023myops} was released in the MICCAI'20 myocardial pathology segmentation challenge, which contains 45 paired multisequence CMR images of BSSFP, LGE and T2 CMR. 
MyoPS is a more challenging task compared to MSCMR structure segmentation, due to the heterogeneous representation of pathology across different patients.
We use scribble annotations released by~\cite{zhang2024modelmix,zhang2023zscribbleseg}.
Following Li \textit{et al.}~\cite{li2023myops}, we split the dataset into 20 pairs for training, 5 for validation, and 20 for testing.
\textbf{BTCV}~\cite{landman2015miccai} dataset contains 3D abdominal CT scans from 30 subjects, with annotations provided for 13 organs. Each scan comprises 80 to 225 slices at a resolution of 512$\times$512 pixels. {We employ complete annotations and identical data splits as those used in previous studies to ensure consistency and enable direct comparisons~\cite{tang2022self,wu2024voco}}, with 24 images for training and 6 images for validation. Additionally, we include the results of a 5-fold cross-validation in the appendix (Sec.~\ref{appendix_5}) for reference.
\begin{table*}[!t]
	\caption{Regular structure segmentation: Dice and HD comparison of MedCL on the MSCMRseg test set with 5 training scribbles.}\label{tab:tab1}
	\centering
		\resizebox{0.8\linewidth}{!}{
			\begin{NiceTabular}{lcccccccccc}
                      \CodeBefore
                    \rowcolor{babypink!30}{13}
                    \rowcolor{babypink!30}{14}
                    \Body
				\midrule
				\multirow{2}{*}{Methods}&\multirow{2}{*}{Backbone}&\multicolumn{4}{c}{Dice}& \multicolumn{4}{c}{HD(mm)}\\
				\cmidrule(lr){3-6}\cmidrule(lr){7-10}
				&&{LV} & MYO & RV & {Avg}&LV & MYO & RV&{Avg}\\
				\midrule
				\multicolumn{1}{l|}{PCE}&UNet&.261$\pm$.106&.193$\pm$.095&.018$\pm$.013&{.157$\pm$.132}&259.43$\pm$14.19&240.58$\pm$13.41&254.20$\pm$12.66&251.40$\pm$15.39\\ 
                \multicolumn{1}{l|}{MixUp~\cite{zhang2018mixup}}&UNet&.440$\pm$.102&.310$\pm$.127&.021$\pm$.013&{.257$\pm$.200}&259.42$\pm$14.18&210.00$\pm$12.37&251.98$\pm$15.6&240.47$\pm$25.96\\
				\multicolumn{1}{l|}{Cutout~\cite{devries2017cutout}}&UNet&.315$\pm$.103&.307$\pm$.153&.166$\pm$.110&{.263$\pm$.139}&259.42$\pm$14.18&240.06$\pm$16.38&252.18$\pm$15.42&250.56$\pm$17.04\\ 
				\multicolumn{1}{l|}{CutMix~\cite{yun2019cutmix}}&UNet&.335$\pm$.119&.282$\pm$.099&.017$\pm$.013&{.211$\pm$.166}&259.43$\pm$14.19&241.30$\pm$13.94&258.51$\pm$12.91&253.08$\pm$15.81\\ 
				\multicolumn{1}{l|}{Puzzle Mix~\cite{kimICML20}}&UNet&.084$\pm$.029&.351$\pm$.104&.010$\pm$.008&{.148$\pm$.160}&259.43$\pm$14.19&223.22$\pm$13.02&256.37$\pm$12.56&246.34$\pm$21.05\\ 
				\multicolumn{1}{l|}{Co-mixup~\cite{kim2021comixup}}&UNet&.322$\pm$.169&.221$\pm$.097&.034$\pm$.010&{.192$\pm$.163}&259.43$\pm$14.19&239.02$\pm$13.25&257.04$\pm$12.60&251.83$\pm$15.98\\ 
			    \multicolumn{1}{l|}{CycleMix~\cite{zhang2022cyclemix}}&UNet&.517$\pm$.086&.421$\pm$.108&.007$\pm$.007&{.315$\pm$.237}&213.20$\pm$35.65&151.36$\pm$55.12&260.56$\pm$12.66&208.37$\pm$58.88\\
          \multicolumn{1}{l|}
{ShapePU~\cite{zhang2022shapepu}}&UNet&.758$\pm$.191&.567$\pm$.168&.059$\pm$.026&{.461$\pm$.331}&209.04$\pm$16.09&234.08$\pm$18.15&237.86$\pm$14.13&226.99$\pm$20.45\\ 
          \multicolumn{1}{l|}{WSL4~\cite{luo2022scribble, han2024dmsps}}&UNet&.809$\pm$.079&.653$\pm$.109&.599$\pm$.261&{.687$\pm$.191}&140.95$\pm$69.06&147.74$\pm$59.93&95.07$\pm$60.53&127.92$\pm$67.49\\
           \multicolumn{1}{l|}{ModelMix~\cite{zhang2024modelmix}}&UNet&.875$\pm$.077&\underline{.754$\pm$.079}&.722$\pm$.201&{.784$\pm$.145}&78.05$\pm$16.11&69.85$\pm$30.45&99.20$\pm$46.81&82.36$\pm$35.09\\
        	\multirow{2}{*}{\textbf{MedCL}}&{SAM}&\underline{.882$\pm$.065}&.745$\pm$.065&\underline{.789$\pm$.103}&\underline{.805$\pm$.065}&\textbf{7.50$\pm$4.54}&\textbf{10.85$\pm$6.56}&\textbf{26.47$\pm$12.48}&\textbf{14.94$\pm$5.23}\\
         &{UNet}&\textbf{.904$\pm$.045}&\textbf{.787$\pm$.047}&\textbf{.804$\pm$.101}&{\textbf{.832$\pm$.086}}&\underline{56.99$\pm$42.85}&\underline{52.92$\pm$42.81}&\underline{55.41$\pm$38.58}&\underline{55.10$\pm$40.54}\\
  		   \midrule
			\multicolumn{1}{l|}{FullSup-UNet}&UNet&.775$\pm$.158&.604$\pm$.147&.572$\pm$.207&{.651$\pm$.191}&23.50$\pm$21.79&34.03$\pm$19.25&81.29$\pm$11.29&46.27$\pm$30.91\\ 
			\multicolumn{1}{l|}{FullSup-nnUNet}&nnUNet&.885$\pm$.085&.757$\pm$.147&.757$\pm$.201&{.799$\pm$.160}&21.48$\pm$29.68&13.5$\pm$12.99&18.27$\pm$12.51&17.75$\pm$19.87\\
				\midrule
		\end{NiceTabular}}
\end{table*}
\textbf{Preprocessing:} 
%For MyoPS, we concatenate the paired slices from multi-sequences of BSSFP, LGE, and T2 CMR. Then, 
For MSCMRseg and MyoPS, we extract a 256$\times$256 central region for experiments with the UNet~\cite{ronneberger2015u} backbone, while for BTCV, we use nnU-Net~\cite{isensee2021nnu}. For SAM-based models, the region is resized to 1024$\times$1024, and slice intensities are normalized to [0,1]. Models are trained with a learning rate of $1\times10^{-4}$ on eight NVIDIA RTX A5000 GPUs.
\textbf{Implementation:} The two versions of MedCL, based on SAM~\cite{kirillov2023segment} and UNet~\cite{ronneberger2015u} architecture, are termed as MedCL-SAM and MedCL-UNet, respectively. 
For MedCL-SAM, we leverage the pretrained encoder of MedSAM~\cite{ma2023segment} and fine-tune its decoder. The text prompts are encoded using the pre-trained CLIP~\cite{radford2021learning}.
For MedCL-UNet, we disable the input of bounding box and text prompts, and train the model from scratch.
\textbf{Evaluation Metrics}:  Following the practice of medical image segmentation, we report the Dice score and the Hausdorff distance (HD) for each foreground class of MSCMRseg and MyoPS segmentation tasks separately.

\begin{figure*}[!t]
\centering
         \includegraphics[width=0.8\textwidth]{figures/MSCMR_visual.png}
\caption{The visualization of regular organ segmentation from the MSCMRseg dataset.}
\label{fig:MSCMR_visual}
\end{figure*}
\subsection{Results}
\noindent\textbf{Regular structure segmentation:}
\zkreftb{tab:tab1} presents the Dice and HD results for cardiac ventricle segmentation using \emph{twelve} methods on the MSCMRseg dataset. Results for ModelMix~\cite{zhang2024modelmix} are borrowed from the original publication, while other methods are implemented by us.
{Note that ModelMix trains models on complementary datasets, while our approach focuses on a single task without requiring additional data sources.}
Both MedCL-SAM and MedCL-UNet outperform the compared approaches by large margins in terms of Dice and HD metrics. This is further affirmed by the qualitative results in \zkreffig{fig:MSCMR_visual}, which visualizes the worst and median cases selected based on the average Dice score.
{The poor performance of PuzzleMix and Co-Mixup is caused by their patch transportation strategy, a limitation noted in previous work~\cite{zhang2022cyclemix}.}
%As the amount of scribble increases to 25, MedCL still achieves higher Dice scores than the other state-of-the-art methods, by an average of 4.3\%. 
%The results demonstrate that the proposed MedCL can exploit substantial supervision and learn anatomy distribution effectively.

\indent\textbf{Multi-organ segmentation:}
Table~\ref{tab:organ_results} presents the multi-organ segmentation results on the BTCV dataset. The results of the compared methods are sourced from VoCo~\cite{wu2024voco}, which was pre-trained on 1.6K CT scans (including BTCV) and further fine-tuned using the BTCV dataset. Remarkably, even without pre-training, our model trained from scratch outperforms VoCo by an average Dice of 1.72\%. This highlights the effectiveness of our proposed method in capturing the anatomical distribution of multiple organs.


\indent\textbf{Irregular pathology segmentation:} We further evaluate MedCL on the challenging task of myocardial pathology segmentation (MyoPS) with heterogeneous shape features. 
%Given that the pseudo-label based methods like WSL4, are developed for regular structures and fail to converge for this challenging task, we instead 
We compare the proposed MedCL with scribble-supervised (nnPU~\cite{NIPS2017_7cce53cf}, CVIR~\cite{garg2021mixture}, ModelMix~\cite{zhang2024modelmix}) and fully-supervised (FullSup-UNet, FullSup-nnUNet) benchmarks.
We borrow the results of ModelMix from the original paper, without incorporating any additional data sources.
\zkreftb{tab:tab3} summarizes the results. The advantages of MedCL are evident in this difficult task: with both SAM and UNet backbones, it achieves performance comparable to the fully supervised benchmarks.
\zkreffig{fig:MyoPS_visual} presents three typical cases.
%, \emph{i.e.}, the worst, median, and best cases according to the average Dice by all compared methods. One can observe that scribble-supervised MedCL performs robustly and produces realistic results with less noise. %particularly compared with the other scribble-supervised approaches.
%\subsection{Ablation study}
\begin{table}[!t]
\centering
\caption{Multi-organ segmentation on the BTCV dataset (Dice).}
\begin{adjustbox}{width=0.9\textwidth}
\begin{NiceTabular}{llcccccccccccccccc}
                 \CodeBefore
                    \rowcolor{babypink!30}{20}
                    \Body
\toprule
\textbf{Type}&\textbf{Method} & \textbf{Spl} & \textbf{RKid} & \textbf{LKid} & \textbf{Gall} & \textbf{Eso} & \textbf{Liv} & \textbf{Sto} & \textbf{Aor} & \textbf{IVC} & \textbf{Veins} & \textbf{Pan} & \textbf{RAG} & \textbf{LAG} & \textbf{AVG} \\
\midrule
%\multicolumn{15}{l}{\textbf{With General pretraining}} \\
\multirow{6}{*}{w/ general pretraining}&MAE3D~\cite{chen2023masked,he2022masked} & 93.98 & 94.37 & 94.18 & 69.86 & 74.65 & 96.66 & 80.40 & 90.30 & 83.10 & 72.65 & 77.11 & 71.34 & 60.54 & 81.33 \\
&SimCLR~\cite{chen2020simple} & 92.79 & 93.52 & 93.36 & 60.24 & 60.64 & 95.90 & 79.92 & 85.56 & 80.58 & 63.47 & 67.77 & 55.99 & 50.45 & 75.14 \\
&SimMIM~\cite{xie2022simmim} & 95.56 & 95.56 & \underline{95.08} & 63.56 & 53.52 & \textbf{98.98} & \underline{90.42} & \textbf{92.71} & 85.82 & 58.63 & 71.16 & 60.55 & 47.73 & 78.88 \\
&MoCo v3~\cite{he2020momentum,chen2021empirical} & 94.92 & 93.85 & 92.42 & 65.28 & 62.77 & 96.89 & 78.64 & 88.66 & 82.21 & 71.15 & 75.09 & 66.48 & 58.81 & 79.54 \\
&Jigsaw~\cite{chen2021jigsaw} & 94.62 & 93.45 & 93.23 & 75.63 & 73.23 & 95.03 & 85.61 & 90.65 & 83.58 & 71.71 & 79.57 & 65.68 & 58.05 & 81.35 \\
&PositionLabel~\cite{zhang2023positional} & 94.35 & 93.15 & 93.21 & 75.39 & 72.34 & 95.55 & 87.94 & 90.34 & 84.41 & 71.18 & 79.02 & 65.11 & 60.12 & 81.09 \\
\midrule
%\multicolumn{15}{l}{\textbf{With Medical pretraining}} \\
\multirow{10}{*}{w/ medical pretraining}&MG~\cite{zhou2021models} & 91.99 & 93.52 & 91.81 & 65.11 & 76.14 & 95.98 & 86.88 & 89.65 & 83.59 & 71.79 & 81.50 & 67.97 & 63.18 & 81.45 \\
&ROT~\cite{taleb20203d} & 91.75 & 93.18 & 91.62 & 65.09 & \underline{76.55} & 95.85 & 86.16 & 89.74 & 83.03 & 71.73 & \underline{81.51} & 67.07 & 62.90 & 81.25 \\
&Vicreg~\cite{bardes2022vicregl} & 92.03 & 92.50 & 91.62 & 75.24 & 74.96 & 96.07 & 85.50 & 89.43 & 83.08 & 74.74 & 78.35 & 71.14 & 63.44 & 81.81 \\
&Rubik++~\cite{tao2020revisiting} & \underline{96.21} & 91.36 & 92.68 & 75.22 & 75.52 & \underline{97.44} & 85.94 & 89.76 & 82.96 & 74.47 & 79.25 & 71.13 & 62.10 & 82.39 \\
&PCRL~\cite{zhou2023unified} & 95.30 & 91.43 & 89.62 & \underline{76.15} & 72.58 & 95.88 & 86.15 & 89.08 & 83.42 & 75.13 & 80.17 & 67.50 & 62.73 & 81.85 \\
&Swin-UNETR~\cite{tang2022self} & 95.21 & 92.03 & 92.22 & 74.27 & 73.39 & 96.32 & 84.62 & 90.78 & 83.03 & 75.51 & 79.87 & 68.99 & 61.59 & 82.11 \\
&SwinMIM~\cite{wang2023swinmm} & 95.44 & 92.43 & 94.37 & 75.29 & 73.06 & 96.44 & 84.20 & 90.76 & 83.10 & 70.91 & 79.78 & 70.11 & 62.44 & 82.07 \\
&GL-MAE~\cite{zhuang2023advancing} & 95.21 & 91.22 & 92.37 & \textbf{76.19} & 73.66 & 96.09 & 86.23 & 89.80 & 81.65 & 75.71 & 79.68 & 70.36 & 60.98 & 81.92 \\
&GVSL~\cite{he2023geometric} & 95.27 & 91.22 & 92.37 & 74.92 & 74.20 & 96.64 & 86.02 & 90.48 & 82.14 & 72.42 & 78.67 & 67.44 & 62.73 & 81.93 \\
&VoCo~\cite{wu2024voco} & 95.73 & \textbf{96.53} & 94.48 & 76.02 & 76.50 & 97.41 & 78.43 & 91.21 & \underline{86.12} & \underline{78.19} & 80.88 & \underline{71.47} & \underline{67.88} & \underline{83.85} \\
\midrule
%\multicolumn{15}{l}{\textbf{From Scratch}} \\
\multirow{3}{*}{from scratch}&UNETR~\cite{hatamizadeh2022unetr} & 93.02 & 94.13 & 94.12 & 66.99 & 70.87 & 96.11 & 77.27 & 89.22 & 82.10 & 70.16 & 76.65 & 65.32 & 59.21 & 79.82 \\
&Swin-UNETR~\cite{hatamizadeh2021swin} & 94.06 & 93.54 & 93.80 & 65.51 & 74.60 & 97.09 & 75.94 & \underline{91.80} & 82.36 & 73.63 & 75.19 & 68.00 & 61.11 & 80.53 \\
&\textbf{{MedCL-nnUNet}} &\textbf{96.77}&\underline{95.29}&\textbf{95.32}&62.95&\textbf{76.71}&97.30&\textbf{90.43}&90.48&\textbf{88.91}&\textbf{79.47}&\textbf{86.53}&\textbf{74.52}&\textbf{73.05}&\textbf{85.21}\\
\bottomrule
\end{NiceTabular}
\end{adjustbox}
\label{tab:organ_results}
\end{table}
\begin{table*}[!t]
	\caption{Irregular pathology segmentation: comparison on MyoPS test set using 5 scribbles.}\label{tab:tab3}
	\centering
		\resizebox{0.75\linewidth}{!}{
			\begin{NiceTabular}{cccccccc}
                             \CodeBefore
                    \rowcolor{babypink!30}{7}
                    \rowcolor{babypink!30}{8}
                    \Body
				\midrule
				\multirow{2}{*}{Methods}&\multirow{2}{*}{Backbone}&\multicolumn{3}{c}{Dice}& \multicolumn{3}{c}{HD}\\
				\cmidrule(lr){3-5}\cmidrule(lr){6-8}
				&&{Scar} & Edema & {Avg} &Scar & Edema &{Avg}\\
				\midrule
				PCE&{UNet}&.242$\pm$.170&.122$\pm$.077&{.182$\pm$.144}&76.22$\pm$37.24&124.89$\pm$21.27&{100.55$\pm$38.77}\\ 
                  {nnPU~\cite{NIPS2017_7cce53cf}}&{UNet}&.290$\pm$.166&.236$\pm$.078&{.263$\pm$.131}&126.51$\pm$35.27&125.05$\pm$20.69&{125.78$\pm$28.55}\\
                {CVIR~\cite{garg2021mixture}}&{UNet}&.288$\pm$.191&.085$\pm$.034&{.186$\pm$.170}&\underline{45.01$\pm$18.44}&125.27$\pm$20.83&{85.14$\pm$45.04}\\	
                \multicolumn{1}{l|}{ModelMix~\cite{zhang2024modelmix}}&{UNet}&.348$\pm$.189&\underline{.531$\pm$.106}&{.440$\pm$.177}&-&-&-\\	
			    \multirow{2}{*}{\textbf{MedCL}}&{SAM}&\textbf{.467$\pm$.222}&.505$\pm$.113&{\underline{.486$\pm$.155}}&\textbf{28.06$\pm$13.01}&\textbf{31.88$\pm$10.57}&\textbf{29.97$\pm$6.76}\\ 
			&{UNet}&\underline{.458$\pm$.229}&\textbf{.536$\pm$.152}&{\textbf{.497$\pm$.196}}&47.84$\pm$18.40&\underline{42.91$\pm$17.64}&{\underline{45.37$\pm$17.97}}\\
                \midrule
			    {FullSup-UNet}&{UNet}&.423$\pm$.253&.445$\pm$.149&{.434$\pm$.205}&117.61$\pm$35.08&119.13$\pm$22.7&{118.37$\pm$29.17}\\ 
			    {FullSup-nnUNet}&{nnUNet}&.496$\pm$.252&.563$\pm$.141&{.529$\pm$.204}&43.86$\pm$37.27&45.14$\pm$33.86&{44.50$\pm$35.15}\\
				\midrule
		\end{NiceTabular}}
\end{table*}
\begin{figure*}[!t]
\centering
         \includegraphics[width= 0.8\textwidth]{figures/MyoPS_visual.png}
\caption{Qualitative results of typical pathologies from MyoPS dataset.} %The three slices are the worst, median, and best cases by the average Dice of the compared methods.}
\label{fig:MyoPS_visual}
\end{figure*}

\begin{table}[!t]
\centering
\caption{Results on BTCV. The best results are bolded. ‘From Scratch’ denotes the supervised baseline without self-supervised pretraining.}
\begin{adjustbox}{width=0.9\textwidth}
\begin{NiceTabular}{llcccccccccccccccc}
\toprule
\textbf{Type}&\textbf{Method} & \textbf{Spl} & \textbf{RKid} & \textbf{LKid} & \textbf{Gall} & \textbf{Eso} & \textbf{Liv} & \textbf{Sto} & \textbf{Aor} & \textbf{IVC} & \textbf{Veins} & \textbf{Pan} & \textbf{RAG} & \textbf{LAG} & \textbf{AVG} \\
\midrule
%\multicolumn{15}{l}{\textbf{With General pretraining}} \\
\multirow{6}{*}{w/ general pretraining}&MAE3D~\cite{chen2023masked,he2022masked} & 93.98 & 94.37 & 94.18 & 69.86 & 74.65 & 96.66 & 80.40 & 90.30 & 83.10 & 72.65 & 77.11 & 71.34 & 60.54 & 81.33 \\
&SimCLR~\cite{chen2020simple} & 92.79 & 93.52 & 93.36 & 60.24 & 60.64 & 95.90 & 79.92 & 85.56 & 80.58 & 63.47 & 67.77 & 55.99 & 50.45 & 75.14 \\
&SimMIM~\cite{xie2022simmim} & 95.56 & 95.56 & \underline{95.08} & 63.56 & 53.52 & \textbf{98.98} & \underline{90.42} & \underline{92.71} & 85.82 & 58.63 & 71.16 & 60.55 & 47.73 & 78.88 \\
&MoCo v3~\cite{he2020momentum,chen2021empirical} & 94.92 & 93.85 & 92.42 & 65.28 & 62.77 & 96.89 & 78.64 & 88.66 & 82.21 & 71.15 & 75.09 & 66.48 & 58.81 & 79.54 \\
&Jigsaw~\cite{chen2021jigsaw} & 94.62 & 93.45 & 93.23 & 75.63 & 73.23 & 95.03 & 85.61 & 90.65 & 83.58 & 71.71 & 79.57 & 65.68 & 58.05 & 81.35 \\
&PositionLabel~\cite{zhang2023positional} & 94.35 & 93.15 & 93.21 & 75.39 & 72.34 & 95.55 & 87.94 & 90.34 & 84.41 & 71.18 & 79.02 & 65.11 & 60.12 & 81.09 \\
\midrule
%\multicolumn{15}{l}{\textbf{With Medical pretraining}} \\
\multirow{10}{*}{w/ medical pretraining}&MG~\cite{zhou2021models} & 91.99 & 93.52 & 91.81 & 65.11 & 76.14 & 95.98 & 86.88 & 89.65 & 83.59 & 71.79 & 81.50 & 67.97 & 63.18 & 81.45 \\
&ROT~\cite{taleb20203d} & 91.75 & 93.18 & 91.62 & 65.09 & \underline{76.55} & 95.85 & 86.16 & 89.74 & 83.03 & 71.73 & 81.51 & 67.07 & 62.90 & 81.25 \\
&Vicreg~\cite{bardes2022vicregl} & 92.03 & 92.50 & 91.62 & 75.24 & 74.96 & 96.07 & 85.50 & 89.43 & 83.08 & 74.74 & 78.35 & 71.14 & 63.44 & 81.81 \\
&Rubik++~\cite{tao2020revisiting} & \textbf{96.21} & 91.36 & 92.68 & 75.22 & 75.52 & 97.44 & 85.94 & 89.76 & 82.96 & 74.47 & 79.25 & 71.13 & 62.10 & 82.39 \\
&PCRL~\cite{zhou2023unified} & 95.30 & 91.43 & 89.62 & 76.15 & 72.58 & 95.88 & 86.15 & 89.08 & 83.42 & 75.13 & 80.17 & 67.50 & 62.73 & 81.85 \\
&Swin-UNETR~\cite{tang2022self} & 95.21 & 92.03 & 92.22 & 74.27 & 73.39 & 96.32 & 84.62 & 90.78 & 83.03 & 75.51 & 79.87 & 68.99 & 61.59 & 82.11 \\
&SwinMIM~\cite{wang2023swinmm} & 95.44 & 92.43 & 94.37 & 75.29 & 73.06 & 96.44 & 84.20 & 90.76 & 83.10 & 70.91 & 79.78 & 70.11 & 62.44 & 82.07 \\
&GL-MAE~\cite{zhuang2023advancing} & 95.21 & 91.22 & 92.37 & \underline{76.19} & 73.66 & 96.09 & 86.23 & 89.80 & 81.65 & 75.71 & 79.68 & 70.36 & 60.98 & 81.92 \\
&GVSL$\dagger$~\cite{he2023geometric} & 95.27 & 91.22 & 92.37 & 74.92 & 74.20 & 96.64 & 86.02 & 90.48 & 82.14 & 72.42 & 78.67 & 67.44 & 62.73 & 81.93 \\
&\textbf{VoCo~\cite{wu2024voco}} & 95.73 & \textbf{96.53} & 94.48 & 76.02 & 76.50 & 97.41 & 78.43 & 91.21 & \underline{86.12} & \textbf{78.19} & \underline{80.88} & \underline{71.47} & \textbf{67.88} & \underline{83.85} \\
\midrule
%\multicolumn{15}{l}{\textbf{From Scratch}} \\
\multirow{3}{*}{from scratch}&UNETR~\cite{hatamizadeh2022unetr} & 93.02 & 94.13 & 94.12 & 66.99 & 70.87 & 96.11 & 77.27 & 89.22 & 82.10 & 70.16 & 76.65 & 65.32 & 59.21 & 79.82 \\
&Swin-UNETR~\cite{hatamizadeh2021swin} & 94.06 & 93.54 & 93.80 & 65.51 & 74.60 & 97.09 & 75.94 & 91.80 & 82.36 & 73.63 & 75.19 & 68.00 & 61.11 & 80.53 \\
&\textbf{ours} & \underline{95.69} & \underline{95.04} & \textbf{96.60} & \textbf{84.24} & \textbf{78.65} & \underline{97.57} & \textbf{90.60} & \textbf{93.78} & \textbf{87.22} & \underline{75.87} & \textbf{80.99} & \textbf{72.64} & \underline{63.47} & \textbf{85.57} \\
\bottomrule
\end{NiceTabular}
\end{adjustbox}
\label{tab:organ_results}
\end{table}
\indent\textbf{Supervision sensitivity:} By varying the number of scribble annotations, we validate the supervision sensitivity of MedCL on MSCMRseg and MyoPS, and compare it to FullSup-nnUNet and scribble-supervised models, \emph{i.e.}, WSL4 and nnPU.
As shown in \zkreffig{fig:exp:supervision_sensitivity} \textcolor{red}{(a)} and \textcolor{red}{(b)}, our MedCL surpasses the scribble-supervised benchmarks by large margins on all experiments. Interestingly, one can observe that when the number of annotations is particularly small (less than 5), scribble-supervised MedCL achieves comparable or even slightly better performance than FullSup-nnUNet. 
This indicates the superiority of MedCL in the situation of extremely weak supervision.
\\
\indent\textbf{Comparison to SAM-based methods:} By changing the shift of the bounding box prompts, we evaluate the robustness of MedCL-SAM against noisy bounding boxes on MSCMRseg and MyoPS datasets. 
As shown in  \zkreffig{fig:exp:supervision_sensitivity} \textcolor{red}{(c)} and  \textcolor{red}{(d)}, the performance of SAM and MedSAM clearly decreases as the bounding box shift increases.
In contrast, MedCL-SAM is robust to bounding box shifts.
Thanks to feature shuffling and anatomy-guided clustering, MedCL achieves stable performance against noisy prompts.
\\
\indent\textbf{Ablations:} We validate the effectiveness of MedCL components with SAM and UNet backbones. The models are trained with 5 scribbles and evaluated on the validation set. We verify the key components of MedCL, including feature mixing (Mix/{$\mathcal{L}_{\text{mix}}$}), cluster loss ($\mathcal{L}_{\text{cluster}}$, Eq.~\ref{cluster_eq}), and anatomy consistency loss ($\mathcal{L}_{\text{ac}}$, Eq.~\ref{ac_eq}). Details are summarized in \zkreftb{tab:tab7}.
Incorporating feature shuffling, Model \#2 shows substantial improvement over Model \#1, with an average Dice increase of 17.1\% (0.521 vs. 0.350) for SAM and 34.1\% (0.563 vs. 0.222) for UNet, highlighting the benefits of our mix augmentation. Model \#3, enhanced with cluster loss ($\mathcal{L}_{\text{cluster}}$), further improves Dice by 20.8\% (0.558 vs. 0.350) for SAM and 7.5\% (0.638 vs. 0.563) for UNet. Finally, with the addition of anatomy consistency loss ($\mathcal{L}_{\text{ac}}$), our MedCL model achieves the best performance for SAM and UNet, respectively. We provide the ablations of text prompts (Sec.~\ref{appendix_3}), batch size (Sec.~\ref{appendix_2}), and investigate the effectiveness of feature shuffling components (Sec.~\ref{appendix_4}), category label (Sec.~\ref{appendix_1}) in Appendix.


\begin{table}[!t]
	\caption{Ablation on MSCMRseg validation {(* p $\leq$ 0.05, Wilcoxon test).}}\label{tab:tab7}
	\centering
		\resizebox{0.85\linewidth}{!}{
			\begin{NiceTabular}{ccccccccccccc}
				\midrule
				\multirow{2}{*}{Methods}&\multirow{2}{*}{{Mix/$\mathcal{L}_{\text{mix}}$}}&\multirow{2}{*}{$\mathcal{L}_{\text{cluster}}$}&\multirow{2}{*}{$\mathcal{L}_{\text{ac}}$} &\multicolumn{4}{c}{{MedCL-SAM (w/ text prompt)}}& \multicolumn{4}{c}{{MedCL-UNet (w/o text prompt)}}\\
				\cmidrule(lr){5-8}\cmidrule(lr){9-12}
				&&&&{LV} & MYO & RV & {Avg} &LV & MYO & RV&{Avg}\\
				\midrule
        	{\#1}&$\times$&$\times$&{$\times$}&.567$\pm$.315&.317$\pm$.254&.166$\pm$.112&{.350$\pm$.165}&.139$\pm$.131&.275$\pm$.225&.251$\pm$.194&.222$\pm$.184\\
        	{\#2}&$\checkmark$&$\times$&{$\times$}&.806$\pm$.147*&\underline{.724$\pm$.060*}&.032$\pm$.029&{.521$\pm$.070*}&.762$\pm$.198*&.443$\pm$.119*&.484$\pm$.442*&.563$\pm$.304*\\
        	{\#3}&$\times$&$\checkmark$&{$\times$}&\underline{.885$\pm$.051}*&.606$\pm$.066&.183$\pm$.300*&{.558$\pm$.130} &.654$\pm$.202&.581$\pm$.173*&\underline{.678$\pm$.304*}&.638$\pm$.220*\\
            {\#4}&$\checkmark$&$\checkmark$&{$\times$}&.882$\pm$.052&.718$\pm$.067*&\underline{.520$\pm$.286*}&{\underline{.707$\pm$.110}*}&\underline{.899$\pm$.087*}&\underline{.693$\pm$.219*}&.641$\pm$.379&\underline{.744$\pm$.265*}\\
            {MedCL}&$\checkmark$&$\checkmark$&{$\checkmark$}&\textbf{.894$\pm$.067}&\textbf{.784$\pm$.065*}&\textbf{.821$\pm$.122*}&{\textbf{.833$\pm$.083*}}&\textbf{.914$\pm$.059}&\textbf{.779$\pm$.076*}&\textbf{.791$\pm$.198*}&\textbf{.828$\pm$.133*}\\
          \midrule
		\end{NiceTabular}}
\end{table}
\begin{figure*}[!t]
    \centering
        \includegraphics[width=0.85\linewidth]{figures/ablation_img.png}
    \caption{The impact of supervision amount (a,b) and bounding box shift (c,d).}
    \label{fig:exp:supervision_sensitivity}
\end{figure*}


\section{Conclusion}
In this work, we have presented MedCL, a novel framework to learn anatomy distribution for medical image segmentation, with two implementations based on SAM and UNet architectures.
MedCL exploits supervision via {feature mixing}, and effectively learns anatomy priors with regularizations of compactness, discriminability, and distribution consistency.
Evaluated on three challenging segmentation tasks, MedCL demonstrates state-of-the-art performance with robust and general applicability.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}


\bibliography{midl25_21}

\clearpage  

\appendix
We evaluate the effect of loss calculations for category labels (Sec.~\ref{appendix_1}), batch size influence (Sec.~\ref{appendix_2}), text prompt types (Sec.~\ref{appendix_3}), and components of feature shuffling (Sec.~\ref{appendix_4}). Cross-validation results on BTCV are detailed in Sec.~\ref{appendix_5}, and typical scribbles from MSCMRseg and MyoPS are shown in Sec.~\ref{appendix_6}.
\subsection{Category labels}
\label{appendix_1}
\begin{table}[!h]
\renewcommand{\thetable}{\uppercase\expandafter{\romannumeral1}}
	\caption{The effect of label categories on MyoPS dataset.}\label{tab:category}
	\centering
		\resizebox{\linewidth}{!}{
			\begin{NiceTabular}{c|c|ccc|ccc|ccc|ccc}
				\midrule
  \multirow{2}{*}{Method}&\multirow{2}{*}{$\mathcal{L}_{\text{sup-category}}$}&\multicolumn{3}{c|}{1 scribble}&\multicolumn{3}{c|}{3 scribbles}&\multicolumn{3}{c|}{5 scribbles}&\multicolumn{3}{c}{10 scribbles}\\
  %\cmidrule(lr){3-5}\cmidrule(lr){6-8}\cmidrule(lr){9-12}\cmidrule(lr){13-15}
  &&Scar& Edema & AVG&Scar& Edema & AVG&Scar& Edema & AVG&Scar& Edema & AVG\\
	\midrule
   nnPU&-&\underline{.230}&.192&.231&\underline{.397}&.080&.238&.290&.236&.263&\underline{.477}&.084&.280\\
   MedCL-UNet* &$\times$&\textbf{.302}&\underline{.262}&\underline{.282}&\textbf{.402}&\underline{.426}&\underline{.414}&\underline{.449}&
   \underline{.522}&\underline{.486}&.470&\underline{.527}&\underline{.498}\\
   MedCL-UNet&$\checkmark$&.218&\textbf{.508}&\textbf{.363}&.391&\textbf{.495}&\textbf{.443}&\textbf{.458}&\textbf{.536}&\textbf{.497}&\textbf{.517}&\textbf{.647}&\textbf{.582}\\
   \cdashline{1-14}
   FullSup-nnUNet&-&.121&.314&.218&.230&.460&.350&.496&.563&.529&.590&.643&.617\\
   \midrule
		\end{NiceTabular}}
\end{table}
We compare MedCL-UNet* (without $\mathcal{L}_{\text{sup-category}}$) with MedCL-UNet, nnPU, and FullSup-nnUNet on the MyoPS dataset. We vary the number of training scribbles from 1 to 10, and summarize the detailed results in \zkreftb{tab:category}.
One can observe that even without $\mathcal{L}_{\text{sup-category}}$, MedCL-UNet* still evidently surpasses the scribble-supervised method nnPU, demonstrating the effectiveness of the proposed feature shuffling and clustering strategies.

\subsection{Batch size}
\label{appendix_2}
\begin{table}[!h]
\renewcommand{\thetable}{\uppercase\expandafter{\romannumeral2}}
	\caption{The effect of batch sizes on MSCMR dataset.}\label{tab:tab2}
	\centering
		\resizebox{0.6\linewidth}{!}{
			\begin{NiceTabular}{c|cccc}
				\midrule
    Batch Size & LV & MYO & RV & Avg\\
	\midrule
    16&.890$\pm$.076&.773$\pm$.058&\underline{.778$\pm$.087}&.814$\pm$.091\\
    32&\underline{.899$\pm$.048}&\underline{.784$\pm$.047}&.768$\pm$.163&\underline{.817$\pm$.116}\\
	64&\textbf{.907$\pm$.045}&\textbf{.787$\pm$.047}&\textbf{.804$\pm$.047}&\textbf{.832$\pm$.086}\\
   \midrule
		\end{NiceTabular}}
\end{table}
\noindent\zkreftb{tab:tab2} reports the performance of MedCL-UNet with batch sizes ranging from 16 to 64.
All models are trained with 5 scribbles on the MSCMRseg dataset.
Our MedCL-UNet works consistently well over the range of batch sizes, with a slight drop of 1.8\% (0.814 vs. 0.832) for batch size 16 and 1.5\% (0.817 vs. 0.832) for batch size 32.
This demonstrates that our MedCL performs robustly even on small batch sizes.
\begin{table}[!t]
\centering
\renewcommand{\thetable}{\uppercase\expandafter{\romannumeral5}}
\caption{Multi-organ segmentation on BTCV (Dice): ``From Scratch'' denotes the supervised baseline without self-supervised pretraining.}
\begin{adjustbox}{width=\textwidth}
\begin{NiceTabular}{llcccccccccccccccc}
                 \CodeBefore
                    \rowcolor{babypink!30}{21}
                    \Body
\toprule
\textbf{Type}&\textbf{Method} & \textbf{Spl} & \textbf{RKid} & \textbf{LKid} & \textbf{Gall} & \textbf{Eso} & \textbf{Liv} & \textbf{Sto} & \textbf{Aor} & \textbf{IVC} & \textbf{Veins} & \textbf{Pan} & \textbf{RAG} & \textbf{LAG} & \textbf{AVG} \\
\midrule
\multirow{6}{*}{w/ general pretraining}&MAE3D~\cite{chen2023masked,he2022masked} & 93.98 & 94.37 & 94.18 & 69.86 & 74.65 & 96.66 & 80.40 & 90.30 & 83.10 & 72.65 & 77.11 & 71.34 & 60.54 & 81.33 \\
&SimCLR~\cite{chen2020simple} & 92.79 & 93.52 & 93.36 & 60.24 & 60.64 & 95.90 & 79.92 & 85.56 & 80.58 & 63.47 & 67.77 & 55.99 & 50.45 & 75.14 \\
&SimMIM~\cite{xie2022simmim} & 95.56 & 95.56 & 95.08 & 63.56 & 53.52 & \textbf{98.98} & 90.42 & \underline{92.71} & 85.82 & 58.63 & 71.16 & 60.55 & 47.73 & 78.88 \\
&MoCo v3~\cite{he2020momentum,chen2021empirical} & 94.92 & 93.85 & 92.42 & 65.28 & 62.77 & 96.89 & 78.64 & 88.66 & 82.21 & 71.15 & 75.09 & 66.48 & 58.81 & 79.54 \\
&Jigsaw~\cite{chen2021jigsaw} & 94.62 & 93.45 & 93.23 & 75.63 & 73.23 & 95.03 & 85.61 & 90.65 & 83.58 & 71.71 & 79.57 & 65.68 & 58.05 & 81.35 \\
&PositionLabel~\cite{zhang2023positional} & 94.35 & 93.15 & 93.21 & 75.39 & 72.34 & 95.55 & 87.94 & 90.34 & 84.41 & 71.18 & 79.02 & 65.11 & 60.12 & 81.09 \\
\midrule
%\multicolumn{15}{l}{\textbf{With Medical pretraining}} \\
\multirow{10}{*}{w/ medical pretraining}&MG~\cite{zhou2021models} & 91.99 & 93.52 & 91.81 & 65.11 & 76.14 & 95.98 & 86.88 & 89.65 & 83.59 & 71.79 & 81.50 & 67.97 & 63.18 & 81.45 \\
&ROT~\cite{taleb20203d} & 91.75 & 93.18 & 91.62 & 65.09 & 76.55 & 95.85 & 86.16 & 89.74 & 83.03 & 71.73 & \underline{81.51} & 67.07 & 62.90 & 81.25 \\
&Vicreg~\cite{bardes2022vicregl} & 92.03 & 92.50 & 91.62 & 75.24 & 74.96 & 96.07 & 85.50 & 89.43 & 83.08 & 74.74 & 78.35 & 71.14 & 63.44 & 81.81 \\
&Rubik++~\cite{tao2020revisiting} & \underline{96.21} & 91.36 & 92.68 & 75.22 & 75.52 & 97.44 & 85.94 & 89.76 & 82.96 & 74.47 & 79.25 & 71.13 & 62.10 & 82.39 \\
&PCRL~\cite{zhou2023unified} & 95.30 & 91.43 & 89.62 & 76.15 & 72.58 & 95.88 & 86.15 & 89.08 & 83.42 & 75.13 & 80.17 & 67.50 & 62.73 & 81.85 \\
&Swin-UNETR~\cite{tang2022self} & 95.21 & 92.03 & 92.22 & 74.27 & 73.39 & 96.32 & 84.62 & 90.78 & 83.03 & 75.51 & 79.87 & 68.99 & 61.59 & 82.11 \\
&SwinMIM~\cite{wang2023swinmm} & 95.44 & 92.43 & 94.37 & 75.29 & 73.06 & 96.44 & 84.20 & 90.76 & 83.10 & 70.91 & 79.78 & 70.11 & 62.44 & 82.07 \\
&GL-MAE~\cite{zhuang2023advancing} & 95.21 & 91.22 & 92.37 & \underline{76.19} & 73.66 & 96.09 & 86.23 & 89.80 & 81.65 & 75.71 & 79.68 & 70.36 & 60.98 & 81.92 \\
&GVSL~\cite{he2023geometric} & 95.27 & 91.22 & 92.37 & 74.92 & 74.20 & 96.64 & 86.02 & 90.48 & 82.14 & 72.42 & 78.67 & 67.44 & 62.73 & 81.93 \\
&VoCo~\cite{wu2024voco} & 95.73 & \textbf{96.53} & 94.48 & 76.02 & 76.50 & 97.41 & 78.43 & 91.21 & 86.12 & 78.19 & 80.88 & 71.47 & \underline{67.88} & 83.85 \\
\midrule
%\multicolumn{15}{l}{\textbf{From Scratch}} \\
\multirow{4}{*}{from scratch}&UNETR~\cite{hatamizadeh2022unetr} & 93.02 & 94.13 & 94.12 & 66.99 & 70.87 & 96.11 & 77.27 & 89.22 & 82.10 & 70.16 & 76.65 & 65.32 & 59.21 & 79.82 \\
&Swin-UNETR~\cite{hatamizadeh2021swin} & 94.06 & 93.54 & 93.80 & 65.51 & 74.60 & 97.09 & 75.94 & 91.80 & 82.36 & 73.63 & 75.19 & 68.00 & 61.11 & 80.53 \\
&\textbf{MedCL}(Same split with compared methods) &\textbf{96.77}&\underline{95.29}&\underline{95.32}&62.95&\underline{76.71}&97.30&\underline{90.43}&90.48&\textbf{88.91}&\textbf{79.47}&\textbf{86.53}&\textbf{74.52}&\textbf{73.05}&\underline{85.21}\\
&\textbf{MedCL}(Cross-validation)  & 95.69 & 95.04 & \textbf{96.60} & \textbf{84.24} & \textbf{78.65} & \underline{97.57} & \textbf{90.60} & \textbf{93.78} & \underline{87.22} & \underline{75.87} & 80.99 & \underline{72.64} & 63.47 & \textbf{85.57} \\
\midrule
\multirow{5}{*}{MedCL (Cross-validation)}&Fold1 & 95.66 & 95.72 & 96.70 & 74.39 & 81.18 & 96.77 & 73.93 & 92.33 & 93.64 & 84.09 & 74.20 & 73.46 & 64.62 & 84.36 \\
&Fold2 & 97.35 & 92.33 & 96.66 & 72.87 & 82.78 & 97.87 & 94.83 & 93.92 & 87.51 & 88.25 & 83.12 & 84.50 & 80.58 & 88.66 \\
&Fold3 & 92.93 & 95.44 & 96.31 & 89.47 & 75.30 & 97.67 & 94.07 & 94.21 & 75.06 & 65.68 & 78.01 & 72.69 & 27.73 & 81.12 \\
&Fold4 & 96.81 & 96.11 & 96.66 & 93.89 & 77.28 & 97.86 & 95.17 & 94.52 & 89.72 & 56.65 & 83.23 & 61.95 & 87.54 & 86.72 \\
&Fold5 & 95.70 & 95.60 & 96.68 & 90.57 & 76.72 & 97.66 & 95.00 & 93.91 & 90.15 & 84.68 & 86.38 & 70.60 & 56.90 & 86.97 \\ 
\bottomrule
\end{NiceTabular}
\end{adjustbox}
\label{tab:cross_val}
\end{table}
\subsection{Types of text prompts}
\label{appendix_3}
\begin{table}[!t]
\renewcommand{\thetable}{\uppercase\expandafter{\romannumeral3}}
	\caption{The influence of text prompt types on MSCMRseg dataset.}\label{tab:tab_text}
	\centering
		\resizebox{0.66\linewidth}{!}{
			\begin{NiceTabular}{lccccccc}
				\midrule
    Methods & Text prompt & LV& MYO & RV & Avg\\
		\midrule
        \multicolumn{1}{l|}{SAM}&\multicolumn{1}{c|}{Deterministic} &.037$\pm$.013&.042$\pm$.011&.039$\pm$.011&\multicolumn{1}{c}{.040$\pm$.010}\\
        \multicolumn{1}{l|}{MedSAM}&\multicolumn{1}{c|}{Deterministic}&.041$\pm$.034&.024$\pm$.006&.026$\pm$.009&\multicolumn{1}{c}{.030$\pm$.012}\\
		\multicolumn{1}{l|}{MedCL-SAM}&\multicolumn{1}{c|}{Deterministic}&\textbf{.879$\pm$.078}&\textbf{.744$\pm$.066}&\textbf{.793$\pm$.087}&\multicolumn{1}{c}{\textbf{.805$\pm$.062}}\\
   \cdashline{1-6}
    \multicolumn{1}{l|}{SAM}&\multicolumn{1}{c|}{Ambiguous}&.040$\pm$.010&.037$\pm$.013&.042$\pm$.011&\multicolumn{1}{c}{.039$\pm$.011}\\
    \multicolumn{1}{l|}{MedSAM}&\multicolumn{1}{c|}{Ambiguous}&.023$\pm$.007&.026$\pm$.006&.024$\pm$.008&\multicolumn{1}{c}{.024$\pm$.005}\\
    \multicolumn{1}{l|}{MedCL-SAM}&\multicolumn{1}{c|}{Ambiguous}&\textbf{.633$\pm$.206}&\textbf{.563$\pm$.099}&\textbf{.399$\pm$.159}&\multicolumn{1}{c}{\textbf{.532$\pm$.116}}\\
   \midrule
		\end{NiceTabular}}
\end{table}
\zkreftb{tab:tab_text} compares MedCL-SAM to MedSAM and SAM with deterministic and ambiguous text prompts.  The deterministic prompts are nouns, \emph{e.g.}, ``Myocardium'' and ``Left Ventricle''. The ambiguous prompts are sentence descriptions, such as ``Myocardium typically appears dark or black in LGE images, and has a circular shape''. SAM and MedSAM are the latest segmentation benchmarks pre-trained with large-scale natural and medical image datasets, respectively, while MedCL-SAM is initialized with the weights of MedCL and finetuned with 5 scribbles. One can find that our MedCL-SAM achieves promising results with various text prompts, although the performance decreases on ambiguous long-sentence descriptions. By contrast, SAM and MedSAM fail to tackle text prompts for these tasks, indicating the necessity of our proposed anatomy prior guided fine-tuning. 


{We provide the list of text prompts in \zkreftb{tab:tab_text_examples}. Using the MSCMRseg dataset as an example, there are three classes: RV, Myo, and LV. We utilize two groups of text prompts: the noun group and the sentence description group.}
\begin{table}[!t]
\renewcommand{\thetable}{\uppercase\expandafter{\romannumeral4}}
	\caption{{Examples of text prompts on the MSCMRseg dataset.}}\label{tab:tab_text_examples}
	\centering
		\resizebox{0.9\linewidth}{!}{
			\begin{NiceTabular}{ll}
				\midrule
    Noun (Deterministic) & Description (Ambiguous) \\
		\midrule
RV (Right Ventricle) & Right ventricle has complex shape, triangular from the frontal aspect and crescentic from the apex. \\
Myo (Myocardium) & Myocardium typically appears dark or black in LGE images, and has a circular shape. \\
LV (Left Ventricle) & Left ventricle is typically observed as a roughly elliptical or oblong structure. \\
    \midrule
\end{NiceTabular}}
\end{table}

{We sample class combinations following the pattern below. As described in the manuscript, we first sample text prompts for each class. Taking MSCMRseg as an example, there are three foreground classes: RV, Myo, and LV. The initial sampled prompt is therefore [RV, Myo, LV]. Next, we sample combinations of these classes, with the number of classes in each combination ranging from 2 to $m$ (where $m$ is the total number of foreground classes). For MSCMRseg, the possible combinations are: [RV and LV, RV and Myo, Myo and LV, RV and Myo and LV]. For example, when sampling two-class combinations, we might select [RV and LV]. For three-class combinations, we get [RV and Myo and LV]. The final sampled set, in this case, could be [RV, Myo, LV, RV and LV, RV and Myo and LV], which has a dimension of 5. In general, for $m$ foreground classes, the total number of sampled prompts is $2m-1$ (e.g., $2\times 3-1=5$ for $m=3$). This progressive sampling strategy allows us to gradually expand the set of class combinations until it covers all possible subsets of the class set.}

{Additionally, we apply data augmentation during the sampling process. For example, the conjunction ``and'' can be replaced with synonyms (e.g., ``with'', ``along with''), and class names such as RV can be substituted with equivalent terms like Right Ventricle. }

% \subsection{Scribbles}
% \zkreffig{fig:scribble} visualizes typical scribble annotations of MSCMRseg and MyoPS datasets. The first four cases present typical cases for scribbles from MSCMRseg dataset. The last four cases show scribble annotations from MyoPS dataset.
\subsection{Components of feature {mixing}}
\label{appendix_4}
\begin{table}[!t]
  \centering
      \renewcommand{\thetable}{\uppercase\expandafter{\romannumeral5}}
	\caption{Ablation study of intra-mix components on MSCMRseg.}\label{tab:tab_mix}
  \resizebox{0.7\linewidth}{!}{
  \begin{NiceTabular}{lccccccc}
    \toprule
    \multirow{2}{*}{Methods}&\multicolumn{2}{c|}{Intra-Mix}&\multirow{2}{*}{Inter-Mix}&\multicolumn{4}{c}{Dice}\\ 
    \cmidrule(lr){2-3}\cmidrule(lr){5-8}
    &Bounding box & Rotation && LV & MYO & RV & Avg \\
    \midrule
    \#1 &$\times$ &$\times$ & $\times$ &.139&.275&.251&.222\\
    \#2 &$\checkmark$ &$\times$ & $\times$ &\underline{.700}&.431&.296& .476\\
    \#3 &$\checkmark$ &$\checkmark$ & $\times$ &.661&\textbf{.527}&.325& .505\\
    \#4 &$\times$&$\times$ & $\checkmark$ &\underline{.700}&.416&\textbf{.501}&\underline{.539}\\
    \#5 &$\checkmark$&$\checkmark$ & $\checkmark$ &\textbf{.762}&\underline{.443}&\underline{.484}&\textbf{.563}\\
    \bottomrule
  \end{NiceTabular}}
\end{table}
For intra-mix, the bounding boxes control the range, while the rotated angles determine the intensity. The two operations work in a complementary way. Following the settings of ablations in Table~\textcolor{red}{6}, we provide the results of ablations in \zkreftb{tab:tab_mix}.

\subsection{Cross-validation on BTCV dataset}
\label{appendix_5}
For a fair comparison, we report results using the same dataset split provided by VoCo~\cite{wu2024voco}, consistent with prior studies~\cite{chen2023masked,zhou2021models,zhuang2023advancing,tang2022self}. To ensure a more comprehensive evaluation, we also perform five-fold cross-validation on MedCL, with the results presented in \zkreftb{tab:cross_val}. Notably, our cross-validation results also significantly outperform the methods in comparison.

\subsection{Scribble visualization}
\label{appendix_6}
\begin{figure}[!h]
\renewcommand{\thefigure}{\uppercase\expandafter{\romannumeral1}}
     \centering
         \includegraphics[width=\textwidth]{figures/scribble_visuals.png}
\caption{Visualization of scribbles from the MSCMRseg and MyoPS datasets.}
\label{fig:scribble}
\end{figure}
We visualize the scribble examples from the MSCMRseg and MyoPS datasets. For the MSCMRseg dataset, we utilize the manual scribbles provided by~\cite{zhang2022cyclemix}, while for the MyoPS dataset, we adopt the scribbles released by~\cite{zhang2024modelmix,zhang2023zscribbleseg}. Note that for the BTCV dataset, we use full annotations instead of scribbles, aligning with previous studies~\cite{wu2024voco,chen2023masked,zhou2021models,zhuang2023advancing,tang2022self}.

\subsection{{Pseudo-code to optimize mapping \texorpdfstring{$P$}{P}}}
{We use a linear layer to implement the mapping and optimize the parameters of the linear layers to maximize the similarity between the segmentation probability map and the prototype. The optimization procedure is described in the following pseudo-code.}
\begin{verbatim}
# a_t: the transpose of prototypes, d x (2m-1)
# y_hat: flattened model prediction, (2m-1) x n
# model: convnet + Mapping head
# w: the weight for regularization for smoothness

scores = torch.mm(a_t, y_hat) # prototype scores: (d x n)

with torch.no_grad():
  q = sinkhorn(scores)

p = Softmax(scores / w)
loss = - mean(q * log(p))

# Sinkhorn algorithm to compute optimal transport matrix
function sinkhorn(scores, eps=0.05, niters=3):
    P = exp(scores / eps).T            # Exponentiate and transpose the scores
    P /= sum(P)                        # Normalize P so that all entries sum to 1
    d, n = P.shape                     # Get the dimensions of P
    u, r, c = zeros(d), ones(d) / d, ones(n) / n  # Initialize scaling vectors
    
    for _ in range(niters):            # Iterate for a fixed number of iterations
        u = sum(P, dim=1)              # Update u as row sum of P
        P *= (r / u).unsqueeze(1)      # Scale P by row scaling factor
        P *= (c / sum(P, dim=0)).unsqueeze(0)  # Scale P by column scaling factor
    
    return (P / sum(P, dim=0, keepdim=True)).T  # Normalize and return the final result
\end{verbatim}
{For regularization, $w$ is a parameter that controls the smoothness of the mapping. We have observed that a high value of $w$, which enforces strong entropy regularization, often leads to a trivial solution where all samples collapse into a single representation and are uniformly assigned to all prototypes. Therefore, in practice, we maintain a low value for $w$. We solve the optimization using the Sinkhorn-Knopp algorithm~\cite{cuturi2013sinkhorn}. The parameter optimization during the clustering process follows the approach outlined in previous work~\cite{caron2020unsupervised}. }

\subsection{{The contribution of the unsupervised and supervised loss}}
{We evaluate the contribution of the unsupervised and supervised loss components through two experimental setups as follows:
(a) Fixed supervised loss with added unsupervised losses: As detailed in Table 4 of the manuscript, when the unsupervised losses were incorporated while keeping the supervised loss unchanged, the average Dice score improved significantly from 0.350 to 0.833. This result demonstrates the effectiveness of the proposed unsupervised approaches.
(b) Fixed unsupervised loss with varying supervised losses: As shown in \zkreftb{tab:category}, the impact of the supervised loss was further examined by varying the number of scribble annotations (from 1 to 10), with and without category information. In all cases, the model incorporating unsupervised losses (denoted as MedCL-UNet) consistently outperformed the compared approaches. Meanwhile, as the number of scribble annotations increased from 1 to 10, the average Dice score on a challenging pathology segmentation task improved from 36.3\% to 58.2\%.
Based on the results of the two experimental setups, we clarify that our proposed unsupervised losses, which constitute the primary contribution of our method, yield significant performance improvements across diverse scenarios. Furthermore, our findings indicate that when annotations are limited, the supervised losses provide essential guidance to the model in identifying the target regions of interest.}


\subsection{{The effect of each loss term}}
{We provided extensive ablation studies to assess the contributions of the various loss terms in Table 4 of the manuscript (Ablations of unsupervised losses), Table I of Appendix Sec.~1 (Ablations of supervised loss), and \zkreftb{tab:tab_mix} (detailed ablations for $L_{mix}$). We clarify the ablation details in the descriptions below:
For unsupervised Loss:  This component consists of $L_{mix}$, $L_{cluster}$, and $L_{ac}$, as shown in Table 4 of the manuscript. Detailed ablation results on the effects of intra-mix and inter-mix can be found in \zkreftb{tab:tab_mix}.
For supervised Loss:  The supervised component is comprised of $L_{scribble}$ and $L_{category}$. \zkreftb{tab:category} presents the baseline performance when using the combined supervised loss, while Table 1 in Appendix Section 3 provides a detailed analysis of the individual contributions of $L_{scribble}$ and $L_{category}$. Even though the contribution of $L_{category}$ is significant, the model trained without $L_{category}$ still outperforms other scribble-supervised baselines such as nnPU and nnUNet.
For $L_{scribble}$, it calculates the cross-entropy and Dice losses on annotated pixels and serves as the baseline.
Our ablation studies highlight the crucial contribution of each loss term to the overall performance improvements.}

\subsection{Rationale of MedCL}
{We clarify that the main goal of our work is to model the anatomy distribution for better image segmentation. We use feature clustering to capture this distribution. Since medical datasets are often small and sparsely annotated, we introduce feature shuffling to generate augmented features, improving clustering quality. Regarding text prompting, this is specific to cases where we apply our method to SAM-based architectures. In such cases, feature-level shuffling involves generating diverse segmentation masks, and we use prompt sampling (via text prompts) to guide the model in producing augmented outputs that cover different classes. Thus, text prompting is used as a practical way to perform feature shuffling by generating diverse segmentation masks through varied prompts, but it is not a core conceptual contribution on its own.}
% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}
% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
