\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{graphicx}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{array}
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{pifont}
\usepackage{color}
\usepackage{caption}
\usepackage{lineno}
\usepackage{mathrsfs}
\usepackage{bbding}
\jmlrvolume{-- 070}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

\title[Boundary-aware Contrastive Learning for Nuclei Segmentation]{Boundary-aware Contrastive Learning for Semi-supervised Nuclei Instance Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Ye Zhang\nametag{$^{1}$}}\Email{zhangye94@stu.hit.edu.cn} \\
		\Name{Ziyue Wang{$^{1}$}}\Email{200111326@stu.hit.edu.cn}\\
		\Name{Yifeng Wang{$^{2}$}} \Email{wangyifeng@stu.hit.edu.cn} \\
		\Name{Hao Bian{$^{3}$}} \Email{h2495067728@gmail.com}\\
		\Name{Linghan Cai{$^{1}$}} \Email{ceilinghans@gmail.com}\\
		\Name{Hengrui Li{$^{4}$}} \Email{23B903028@stu.hit.edu.cn}\\
		\Name{Lingbo Zhang{$^{3}$}}\Email{zhang-lb23@mails.tsinghua.edu.cn}\\
		\Name{Yongbing Zhang{$^{1,}$}\textsuperscript{\Envelope}} \Email{ybzhang@hit.edu.cn}\\
		\addr $^{1}$ School of Computer Science and Technology, Harbin Institute of Technology, 518055, China.\\
		\addr $^{2}$ School of Science, Harbin Institute of Technology, 518055, China.\\
  	\addr $^{3}$ Tsinghua Shenzhen International Graduate School, Tsinghua University, 518071, China.\\
		\addr $^{4}$ Faculty of Computing, Harbin Institute of Technology, 150001, China.\\
}

\begin{document}

\maketitle

\begin{abstract}
Semi-supervised segmentation methods have demonstrated promising results in natural scenarios, providing a solution to reduce dependency on manual annotation. However, these methods face significant challenges when directly applied to pathological images due to the subtle color differences between nuclei and tissues, as well as the significant morphological variations among nuclei. Consequently, the generated pseudo-labels often contain much noise, especially at the nuclei boundaries. To address the above problem, this paper proposes a boundary-aware contrastive learning network to denoise the boundary noise in a semi-supervised nuclei segmentation task. The model has two key designs: a low-resolution denoising (LRD) module and a cross-RoI contrastive learning (CRC) module. The LRD improves the smoothness of the nuclei boundary by pseudo-labels denoising, and the CRC enhances the discrimination between foreground and background by boundary feature contrastive learning. We conduct extensive experiments to demonstrate the superiority of our proposed method over existing semi-supervised instance segmentation methods.
\end{abstract}

\begin{keywords}
Semi-supervised learning, Nuclei instance segmentation, Edge denoising.
\end{keywords}


\section{Introduction}
Nuclei instance segmentation is essential in the quantitative analysis of pathological images. The characteristics of nuclei, including their size, morphology, and distribution, can provide valuable insights into the tumor microenvironment, thereby offering crucial support for cancer diagnosis, staging, and grading processes \cite{khened2021generalized,hollandi2022nucleus}. In recent years, deep learning techniques have made remarkable advancements in nuclei segmentation~\cite{zhang2024seine, zhang2024dawn}. DCAN \cite{chen2016dcan} adopts a dual-branch decoder architecture to predict semantics and contours simultaneously to enhance instance discrimination. HoverNet \cite{graham2019hover} incorporates distance and gradient constraints to split individual instances effectively. Similar methods such as CDNet \cite{he2021cdnet} and CellPose \cite{stringer2021cellpose} have also been proposed to address overlapping nuclei challenges. However, these supervised methods typically rely on pixel-level annotations, which are time-consuming and labor-intensive and need professional guidance, hindering the development of models. Therefore, developing a technique that can effectively address the dependency on manual annotation for nuclei instance segmentation is crucial.

A common approach to address the problem of scarce labeled data is semi-supervised learning \cite{reddy2018semi, van2020survey}. During the training process, abundant unlabeled and insufficient labeled data are used to train the network. The existing semi-supervised methods mainly leverage prior information to improve the pseudo-label quality. For example, ShapeProp \cite{zhou2020learning} combines the information from bounding boxes and partially annotated masks to improve the segmentation accuracy of target regions based on Mask R-CNN \cite{he2017mask}. PAIS \cite{hu2023pseudo} uses a dynamic alignment loss to address the misalignment problem between classification and segmentation results, and then a new threshold filtering method for pseudo-labels is proposed. PointWSSIS \cite{kim2023devil} balances false negative and false positive errors by utilizing point supervision prior information. However, due to the low color contrast differences between the nuclei and tissues, these methods still have defects in generating nuclear pseudo-labels, limiting the application of semi-supervised instance segmentation in pathological images.

Some methods use pseudo-label optimization strategies to enhance nuclei segmentation accuracy in semi-supervised scenarios. MMT-PSM \cite{zhou2020deep} integrates multiple data-augmented segmentation results to construct reliable predictions and enhance pseudo-labels' confidence. CDCL \cite{wu2022cross} uses feature contrastive learning to promote feature consistency between the teacher and student networks, thus improving the quality of pseudo-labels. PG-FANet \cite{jin2024inter} employs a pseudo-label guided module that aggregates multi-scale, multi-stage features to enhance segmentation performance. However, nuclei exhibit diversity in morphology and size, and in cases with limited annotations, it is challenging for the teacher network to capture the complete range of nuclei shape features. Consequently, the generated pseudo-labels often contain edge noise because existing pseudo-label optimization methods lack specific designs for denoising nuclei boundaries, which always leads to inaccurate nuclei boundary predictions.

In this paper, to address the issue of boundary noise in nuclei segmentation, we propose a coarse-to-fine \textbf{b}oundary-\textbf{a}ware contrastive learning network for \textbf{s}emi-supervised nuclei \textbf{s}egmentation (BASS\footnote[1]{Our code is available at \href{https://github.com/zhangye-zoe/BASS}{https://github.com/zhangye-zoe/BASS}.}). Firstly, we design a low-resolution denoising (LRD) segmentation head that promotes boundary smoothness. Additionally, within this segmentation head, we use a low-weight loss for the nuclei boundary region optimization, which reduces the impact of uncertain boundary prediction during training. Secondly, to minimize boundary noise further, we design a cross-RoI contrastive learning (CRC) module that finely partitions the internal, external, and boundary regions of nuclei, enhancing the discriminative capability of nuclei boundary features. To demonstrate the effectiveness of our proposed method, we conduct comparative experiments and ablation studies on two public datasets. The experimental results show that our proposed method outperforms existing semi-supervised methods, and the ablation studies demonstrate the effectiveness of the proposed modules.

\begin{figure*}[t!]
	\centering
	\includegraphics[width=5.5in]{"Fig1.pdf"}
	\caption{The framework of our semi-supervised nuclei segmentation method. (a) The training flow of our BASS. First, the teacher model generates pseudo-labels, and then the student model is used to train the nuclei segmentation network. (b) and (c) are the proposed low-resolution denoising module and cross-RoI contrastive learning module, respectively.}
	\label{fig: flowchart}
\end{figure*}


\section{Methodology}
\subsection{Framework Overview}

To address the boundary noise problem of nuclei segmentation under a semi-supervised scenario, we propose a coarse-to-fine boundary-aware denoising model, as shown in Fig. \ref{fig: flowchart}. 
Our whole training process can be divided into three stages. 
First, the labeled data $D_L = \{ {(x_i, y_i)}\}_{i=1}^{N}$ is used to train a teacher model. In this step, we employ Mask R-CNN \cite{he2017mask} as the baseline, and the loss function of the teacher network is defined as follows:
\begin{equation}
	Loss^{t}= L_{seg}^{t} + L_{det}^{t},
\end{equation}
where $L_{seg}^{t}$ is the loss of the segmentation head, and $L_{det}^{t}$ is the loss of the detection head, which consists of the classification loss and regression loss. Then, the trained teacher network is employed to generate pseudo-labels $y_j^p$ for input $x_j$. To reduce the uncertainty of pseudo-labels, we employ box and pixel threshold filtering to generate high-confidence pseudo-labels.
Finally, we combine the labeled data $D_L$ and the generated pseudo-labeled data $D_U = \{(x_j, y_j^p)\}_{j=1}^M$ to train the student network. 

In the student network, our developed denoising methods are shown in the green box of Fig.\ref{fig: flowchart}, which consists of a low-resolution denoising (LRD) module and a cross-RoI contrastive learning (CRC) module. The LRD employs low-resolution pseudo-labels as supervision information to promote the smoothness of nuclei contours. Meanwhile, the CRC utilizes boundary-aware contrastive learning to enhance the discriminative capability of contour features. In the training process, the overall loss is designed as follows:
\begin{equation}
	Loss^s = L_{det}^{s} + L_{nmh} + L_{lrd} + L_{cl},
\end{equation}
where $L_{nmh}$ represents the naive high-resolution segmentation loss, $L_{lrd}$ represents the low-resolution segmentation loss, and $L_{cl}$ represents the contrastive learning loss. 

\subsection{Threshold Filtering}
Before training the student network, we choose high-confidence instances as pseudo-labels to reduce the uncertainty of the samples during student network training. In the pseudo-label generation stage, the teacher network outputs probability values for instances (box value) and mask probabilities (pixel value). We consider the pixel-level threshold $v_p$ to be a hyperparameter. As for the box-level threshold $v_b$, we assume the nuclear number distribution is consistent between labeled and unlabeled data. Based on this assumption, we uniformly sample 91 values between 0.1 and 1.0. Then, we iteratively apply these values as thresholds to filter the instances and calculate the number distribution of nuclei in the unlabeled data. Finally, we select the threshold whose resulting distribution is closest to that of the labeled data. We validate the effectiveness of the threshold filtering method in ablation experiments.

\subsection{Low-resolution Denoising Module}
In the naive Mask R-CNN \cite{he2017mask}, the RoI head outputs a $14 \times 14$ feature map containing boundary noise. In the subsequent convolution process, Mask R-CNN increases the size of the feature map to capture more semantic information, but the boundary noise is also amplified. To avoid amplified noise effects, we design a low-resolution denoising module as shown in Fig.\ref{fig: flowchart}(b), which utilizes the low-resolution pseudo-labels as supervision for model training. In the LRD, BASS directly performs segmentation in the $14 \times 14$ feature map. This approach effectively smooths the boundaries and initially reduces the noise in nuclei boundaries. Furthermore, to minimize the impact of boundary uncertainty on segmentation, we apply a weighted loss to the low-resolution segmentation head. Specifically, pixels in the boundary region are assigned a lower weight, and other areas are set to a high weight.

According to previous studies \cite{wang2022noisy}, although low-resolution images can reduce the boundary noise, they lose some detailed information. To preserve the details, we parallel the original segmentation head and low-resolution prediction head to perform the segmentation task simultaneously, as shown in Fig.\ref{fig: flowchart}(b). In this manner, the output mask head decreases the influence of the original feature noise and keeps more details. 


\subsection{Cross-RoI Contrastive Learning}

In this subsection, we propose an elaborate denoising method named cross-RoI contrastive learning. It leverages labeled data to train a boundary feature extraction module, and then the module is applied to learn the embedding of unlabeled data, which can mitigate the impact of boundary noise caused by pseudo-labels and enhance the feature discrimination ability of foreground and background. In general, object boundaries typically correspond to hard-to-classify samples, and their embeddings are highly unstable. To avoid the impact of features from difficult samples on the representation of easy-to-classify samples, we employ region-based contrastive learning; our proposed CRC is shown in Fig.~\ref{fig: flowchart}(c). 

First, the input image $x$ is fed into the network for feature extraction and alignment, then we randomly sample two aligned RoI features $f_i$ and $f_j$. 
%To address the challenges of differentiating nuclei from tissues due to the issue of boundary noise, we employ pixel contrastive learning to optimize the foreground and background features. However, object boundaries typically correspond to hard-to-classify samples, and their embeddings are highly unstable. To avoid the impact of features from difficult samples on the representation of easy-to-classify samples, we employ the approach of region-based contrastive learning. 

Based on the contour (plotted in black line) shown in Fig.~\ref{fig: flowchart}(c), we split the feature maps into foreground region $\mathcal{F}$ and background region $\mathcal{B}$. By shrinking and expanding the contour by a distance $d$, we obtain the inner contour (plotted in red dotted line) and outer contour (plotted in blue dotted line). The regions between them and the true contour are referred to as the inner boundary and outer boundary, respectively. These boundary regions correspond to challenging pixels for classification and can be expressed using the following formula:
\begin{equation}
\begin{split}
    \mathcal{R}_{i} = \{p_i \mid p_i \in \mathcal{F} \text{ and } \lVert p_i - c_i \rVert_{2} \leq d\}, \\
    \mathcal{R}_{o} = \{p_i \mid p_i \in \mathcal{B} \text{ and } \lVert p_i - c_i \rVert_{2} \leq d\},
\end{split}
\end{equation}
where $c_i$ represents the contour pixel closest to pixel $p_i$.

At the same time, we also sample other foreground and background pixels, which can be expressed as the following equations:
\begin{equation}
\begin{split}
    \mathcal{R}_{f} = \{ p_i \mid p_i \in \mathcal{F} \text{ and } p_i \notin \mathcal{R}_{i} \},\\
    \mathcal{R}_{b} = \{ p_i \mid p_i \in \mathcal{B} \text{ and } p_i \notin \mathcal{R}_{o} \},
\end{split}	
\end{equation}
where $\mathcal{R}_{f}$ represents the set of pixels obtained by excluding $\mathcal{R}_{i}$ from $\mathcal{F}$ and $\mathcal{R}_{b}$ represents the set of pixels obtained by excluding $\mathcal{R}_{o}$ from $\mathcal{B}$.

Next, we sample pixel features from the sets $\mathcal{R}_{i}$, $\mathcal{R}_{o}$, $\mathcal{R}_{f}$ and $\mathcal{R}_{b}$. The sampling ratio is set to $\alpha$. For feature $f_i$, the sampled features are denoted as $k_{i}^i$, $k_{o}^i$, $k_{f}^i$ and $k_{b}^i$, respectively. Similarly, for feature $f_j$, the sampled features are denoted as $k_{i}^j$, $k_{o}^j$, $k_{f}^j$ and $k_{b}^j$, respectively.

Then, we calculate the query features of background and foreground across RoIs through: 
\begin{equation}
    q^b = M(k_{b}^i, k_{o}^i, k_{b}^j, k_{o}^j), \quad q^f = M(k_{f}^i, k_{i}^i, k_{f}^j, k_{i}^j),
\end{equation}
where $M$ represents the averaged operation of vectors.

Finally, to narrow the feature distance within the same category and expand the feature distance between foreground and background, we calculate four pairs of contrastive losses as follows:
\begin{equation}
    L_{cl} = CL(q^b, k_{b}, k_{f}) + CL(q^b, k_{o}, k_{i}) + CL(q^f, k_{f}, k_{b}) + CL(q^f, k_{i}, k_{o}),
\end{equation}
where $CL$ represents the contrastive learning loss, and $k_{b}$ represents the concatenation of $k_{b}^{i}$ and $k_{b}^{j}$; $k_{o}$, $k_{f}$ and $k_{i}$ are defined analogously. The calculation of $CL$ is described below:
\begin{equation}
    CL(q^+, k^+, k^-) = -\log{\frac{e^{\cos(q^+, k^+)/\tau}}{e^{\cos(q^+,k^+)/\tau}+\sum_{i=1}^{N}e^{\cos(q^+, k^-_{i})/\tau}}},
\end{equation}
where $q^+$ and $k^+$ represent a pair of positive instances, $k^-_{i}$ represents the $i$-th of the $N$ negative instances in $k^-$, and $\tau$ is the temperature hyper-parameter.

Previous methods such as PC$^2$Seg \cite{zhong2021pixel} extract the positive instance pairs for contrastive learning from a single perspective. In contrast, our proposed CRC performs contrastive learning across RoIs, which enhances the feature generality. 

\section{Experiments}
\subsection{Datasets}
Our method is evaluated on the Cryosectioned Nuclei Segmentation (CryoNuSeg) dataset \cite{mahbod2021cryonuseg}, the Digestive-System Pathological Segmentation (DigestPath) dataset \cite{da2022digestpath}, and the Multiple Organs Nuclei Segmentation (MoNuSeg) dataset \cite{kumar2017dataset}. CryoNuSeg contains 30 images from 10 organs, each with a size of $512\times512$. DigestPath contains 69 images of the digestive system, each with a size of approximately $1500\times1200$. MoNuSeg contains 30 images from 7 organs, each with a size of $1000\times1000$.

% We randomly select 41 images for training, 14 for validation, and 14 for testing.
% We randomly select 18 images for training, 6 for validation, and 6 for testing.

\begin{table*}[ht]\footnotesize
% \footnotesize
\centering
	\renewcommand{\arraystretch}{1.1}
	\caption{Performance comparisons on CryoNuSeg, DigestPath and MoNuSeg Datasets. The best performance is highlighted in \textbf{bold}, and the second-best is \underline{underlined}. $\dag$ represents a p-value of AJI $<$ 0.001 and $\ddag$ represents a p-value of AJI $<$ 0.05.}
	\resizebox{\linewidth}{!}{
		\begin{tabular}{c|c|ccc|ccc|ccc}
			\hline
			  \multirow{2}{*}{\textbf{Ratio}} &\multirow{2}{*}{\textbf{Methods}} & \multicolumn{3}{c|}{\textbf{CryoNuSeg}} & \multicolumn{3}{c|}{\textbf{DigestPath}} & \multicolumn{3}{c}{\textbf{MoNuSeg}} \\
			\cline{3-11}
			& & \textbf{Dice} &\textbf{AJI} & \textbf{PQ}  & \textbf{Dice} &\textbf{AJI} & \textbf{PQ} & \textbf{Dice} &\textbf{AJI} & \textbf{PQ}\\
			\hline
                & \textcolor{gray}{Mask R-CNN}$^{\dag}$   \cite{he2017mask} & 50.28 & 26.43 & 27.17 & 52.58 & 29.12 & 30.87 & 70.03 & 48.76 & 45.29 \\
               % \cline{2-11}
			& MMT-PSM$^{\ddag}$   \cite{zhou2020deep} & 54.83 & 30.17 & 29.81 & 55.68 & 32.34 & 35.94 & 73.28 & 50.14 & 47.27 \\
			& PointWSSIS$^{\dag}$  \cite{kim2023devil} & 58.66 & 35.41 & 33.61 & \underline{59.90}& 40.06 & 44.10 & \underline{76.12} & \underline{50.80} & 49.34\\
			\textbf{1/8} & ShapeProp$^{\ddag}$ \cite{zhou2020learning} & 57.42 &\underline{35.53} & \underline{33.68} & 58.18& 39.94 & 43.49 & 75.33 & 49.89 & 50.02\\
			& NoisyBoundary$^{\dag}$  \cite{wang2022noisy} & 55.14 & 29.57 & 30.96 & 58.34 & 36.75 & 37.94 & 75.27 & 48.14 & 49.37\\
                & PG-FANet $^{\ddag}$   \cite{jin2024inter} & \underline{59.06} & 35.47 & 32.96 & 58.20 & \underline{40.32} & \underline{44.12} & 75.17 & 50.44 & \underline{51.05} \\
			& BASS$^{\dag}$  (Ours) & \textbf{59.26}&\textbf{36.32} &\textbf{35.09} & \textbf{61.00}& \textbf{41.33} & \textbf{45.07} & \textbf{77.43} & \textbf{51.80} & \textbf{53.05} \\
			\cline{1-11}
               & \textcolor{gray}{Mask R-CNN}$^{\ddag}$   \cite{he2017mask} & 62.89 & 34.17 & 32.96 & 53.44 & 35.12 & 38.79 & 72.30 & 49.30 & 47.21 \\
               % \cline{2-11}
			& MMT-PSM$^{\ddag}$ \cite{zhou2020deep} & 67.24 & 37.60 & 34.67 & 58.23 & 37.64 & 41.93 & 73.14 & 51.08 & 49.17 \\
			& PointWSSIS$^{\dag}$ \cite{kim2023devil} & \textbf{75.01} & 47.12 & 49.83 & \textbf{64.93}& 43.16 & 47.86 & \underline{75.21} & 51.11 & 52.06\\
			  \textbf{1/4} & ShapeProp$^{\ddag}$   \cite{zhou2020learning} & 73.37 &\underline{48.70} & 48.72 & 63.31 & 43.35 & 48.44 & 74.86 & 51.29 & 52.44 \\
			& NoisyBoundary$^{\dag}$ \cite{wang2022noisy} & 69.34 &38.85 & 35.91& 61.15 & 40.77& 45.74 & 73.13 & 50.77 & 51.94\\
                & PG-FANet $^{\dag}$ \cite{jin2024inter} & 74.54 & 47.80 & \underline{49.93} & 63.24 & \underline{43.71} & \underline{48.76} & \underline{75.21} & \underline{52.19} & \underline{53.33} \\
			& BASS$^{\dag}$ (Ours) &\underline{74.79} &\textbf{48.96} & \textbf{50.36} & \underline{63.41} & \textbf{44.72} & \textbf{49.14} & \textbf{76.34} & \textbf{53.39} & \textbf{55.85} \\
			\cline{1-11}
                & \textcolor{gray}{Mask R-CNN}$^{\ddag}$  \cite{he2017mask} & 69.31 & 43.34 & 42.10 & 57.17 & 38.01 & 42.44 & 74.92 & 50.28 & 50.26 \\
               % \cline{2-11}
			& MMT-PSM$^{\ddag}$ \cite{zhou2020deep} & 72.85 & 45.06 & 44.47 & 59.11 & 39.97 & 45.58 & 75.12 & 51.05 & 51.17\\
			& PointWSSIS$^{\dag}$ \cite{kim2023devil} &  \underline{74.67}& \underline{49.91} & 49.29 &  63.87&\underline{45.45} & 51.64 & 75.89 & 52.14 & 52.30 \\
			\textbf{1/2} & ShapeProp$^{\ddag}$ \cite{zhou2020learning} & 74.40 & 48.24 & 47.55 & 64.15 & 45.02 & \underline{52.93} & 76.01 & 51.88 & 52.94\\
			& NoisyBoundary$^{\dag}$ \cite{wang2022noisy} & 73.71 &46.58 & 46.13 &  61.35& 44.41& 50.65 & 77.10 & 53.99 & 55.20 \\
                & PG-FANet $^{\ddag}$  \cite{jin2024inter} & 72.17 & 49.86 & \underline{49.37} & \underline{64.49} & 45.14& 51.15 & \textbf{78.77} & \textbf{54.91} & \underline{56.04} \\
			& BASS$^{\dag}$ (Ours) & \textbf{76.76}& \textbf{51.09} & \textbf{49.66} &  \textbf{65.72} & \textbf{46.14} & \textbf{53.96} & \underline{77.80} & \underline{54.82} & \textbf{56.59} \\
			\hline
    		\end{tabular}}
		% \end{center}
	\label{tab1}
\end{table*}



\subsection{Implementation Details and Evaluation Metrics}
Following the previous method \cite{graham2019hover}, we crop all the images to patches of $256 \times 256$ pixels with an overlap of 128 pixels for data preprocessing. All experiments are carried out with an RTX 3090 GPU. SGD is used as the optimizer. The learning rate, momentum, and weight decay are set to 0.02, 0.9, and 0.001, respectively. Besides, we evaluate the segmentation performance in terms of Dice \cite{vu2019methods}, aggregated Jaccard index (AJI) \cite{kumar2017dataset}, and panoptic quality (PQ) \cite{kirillov2019panoptic}. 

\subsection{Comparison with the State of the Art Methods}
We compare our proposed BASS against several state-of-the-art methods, including MMT-PSM \cite{zhou2020deep}, PointWSSIS \cite{kim2023devil}, ShapeProp \cite{zhou2020learning}, NoisyBoundary \cite{wang2022noisy} and PG-FANet \cite{jin2024inter}. Besides, to validate the improvement of our semi-supervised model, we also compare our method with supervised Mask R-CNN. We trained the models using 1/8, 1/4, and 1/2 of the labeled data on ResNet-50. A more detailed data split is presented in the appendix. 

\begin{figure*}[h!]
    \centering
    \includegraphics[width=5.0in]{"Fig2.pdf"}
    \caption{The semi-supervised instance segmentation visualization comparisons.}
    \label{fig: visulization}
\end{figure*}

Quantitative comparison results on three datasets are displayed in Table~\ref{tab1}, which shows that our method achieves the best performance on most metrics at all three annotation ratios. Even with only 1/8 of the annotations, our BASS exceeds the second-best method by approximately 1\% in PQ. Fig.~\ref{fig: visulization} displays the visual comparison results. We can see that MMT-PSM and NoisyBoundary mistakenly identify nuclei as tissue due to the lack of semantic discrimination between nuclei and tissues. Although ShapeProp and PointWSSIS employ weak labels to enhance the localization of nuclei, they still have nuclear shape errors. %These observations demonstrate the effectiveness of our boundary-aware denoising method in the semi-supervised nuclei segmentation task.



\begin{table}[ht]\footnotesize
\centering
\caption{The segmentation head ablation experiments on CryoNuSeg and DigestPath.}
    \begin{tabular}{ccc|ccc|ccc}
        \hline
        \multirow{2}{*}{\textbf{NMH}} & \multirow{2}{*}{\textbf{LRD}} & \multirow{2}{*}{\textbf{CRC}} & \multicolumn{3}{c|}{\textbf{CryoNuSeg}} & \multicolumn{3}{c}{\textbf{DigestPath}} \\
        \cline{4-9}
        & & & \textbf{Dice} &\textbf{AJI} & \textbf{PQ} & \textbf{Dice} &\textbf{AJI} & \textbf{PQ}\\
        \hline
        \ding{51} & & & 65.49 & 40.23  & 36.94 & 60.80 & 44.73 & 48.54  \\
        & \ding{51} &  & 64.37 & 39.24 & 39.91 & 61.22 & 43.09 & 47.80 \\
        \ding{51} & \ding{51} & & \underline{72.17}& \underline{46.44} & \underline{45.72} & \underline{63.19}& \underline{45.86} & \underline{51.22} \\
        \ding{51} & \ding{51} & \ding{51} & \textbf{76.76}& \textbf{51.09} & \textbf{49.66} & \textbf{65.72} & \textbf{46.14} & \textbf{53.96} \\
        \hline
\end{tabular}\label{tab2}
\end{table}


\vspace{-0.4cm}
\subsection{Ablation Studies}
\noindent\textbf{Ablation Studies for Segmentation Head.} 
In the student network, we employ three prediction heads, namely, the naive mask head (NMH), low-resolution denoising mask head (LRD), and cross-RoI contrastive learning mask head (CRC), to jointly supervise the segmentation predictions. To evaluate the effectiveness of these heads, we conducted a series of ablation experiments to assess the impact of different designs.
Specifically, we compared four designs: NMH, LRD, NMH+LRD, and NMH+LRD+CRC. The experimental results are listed in Table \ref{tab2}. From the table, we can observe the NMH+LRD+CRC outperforms the other methods, indicating that incorporating multiple segmentation constraints is effective. 


\noindent
\textbf{Ablation Studies for Box and Pixel Thresholds.}
We conduct threshold filtering experiments on the MoNuSeg dataset with a 1/2 annotation ratio, and the experimental results are shown in Tables \ref{tab3} and \ref{tab4}. Table \ref{tab3} uses nuclear count statistics to determine the optimal box threshold. We can find that the model performs best when the threshold is set to 0.38. When changing the value, the model's performance deteriorates, indicating the effectiveness of using nucleus count for box threshold selection. In addition, Table \ref{tab4} shows the experimental results for the pixel threshold. The table shows that when choosing 0.5 as the threshold, the model achieves optimal performance in terms of Dice and PQ scores.

\vspace{5pt}
% \noindent
\begin{minipage}[c]{0.43\textwidth}
\centering
\tiny
\captionof{table}{The box threshold setting experiments.}
\resizebox{\linewidth}{27pt}{
\begin{tabular}{c|ccc}
\hline
\textbf{Box Thr}& \textbf{Dice} &\textbf{AJI} & \textbf{PQ} \\
\hline
    0.3 & \underline{76.37} & \underline{52.20} & \underline{54.07} \\
    \textbf{0.38(opt)} & \textbf{77.80} & \textbf{54.82} & \textbf{56.59} \\
    0.5 & 75.68 & 50.83 & 53.26 \\
    0.7 & 72.28 & 48.01 & 51.58\\
    \hline
    \end{tabular}}\label{tab3}
\end{minipage}
\hspace{16pt}
\begin{minipage}[c]{0.43\textwidth}
\centering
\tiny

\captionof{table}{The pixel threshold setting experiments.}
\resizebox{\linewidth}{27pt}{
\begin{tabular}{c|ccc}
\hline
\textbf{Pixel Thr}& \textbf{Dice} &\textbf{AJI} & \textbf{PQ} \\
\hline
    0.3 & 76.12 & 54.01 & 55.23 \\
    0.4 & \underline{77.04} & \textbf{54.96} & 55.37 \\
    \textbf{0.5(opt)} & \textbf{77.80} & \underline{54.82} & \textbf{56.59} \\
    0.6 & 76.97 & 54.04 & \underline{56.21} \\
\hline
\end{tabular}}\label{tab4}
\end{minipage}
\vspace{0.3cm}



\noindent\textbf{Ablation Studies for Sampling Ratio $\alpha$.} 
We conduct sampling ratio experiments and set four sampling ratios of 0.1, 0.3, 0.5, and 0.7 in the CRC. The experimental results are shown in Fig.~\ref{fig3}. The figure shows that as the sampling ratio increases, the performance gradually improves, indicating that the sampling ratio indeed influences the performance. When the sampling ratio is large, the model obtains more sampled pixels, resulting in better contrastive learning performance. However, as the sampling ratio increases, the computational cost of the model also increases. Therefore, we select 0.7 as the final sampling ratio, which achieves the best balance between model performance and computational cost.

\vspace{0.1cm}
\noindent
\begin{minipage}[c]{0.45\textwidth}
\centering
% \tiny
\includegraphics[width=0.9\textwidth]{Fig3.pdf}
\vspace{-0.2cm}
\captionof{figure}{The sampling ratio ablation experiments on CryoNuSeg dataset.}\label{fig3}
\end{minipage}
\hspace{15pt}
\begin{minipage}[c]{0.45\textwidth}
\centering
% \tiny
\includegraphics[width=0.9\textwidth]{Fig4.pdf}
\vspace{-0.2cm}
\captionof{figure}{The distance comparison experiments on CryoNuSeg dataset.}\label{fig4}
\end{minipage}
\vspace{0.2cm}

\noindent\textbf{Ablation Studies for Distance $d$.}
We investigate the effect of distance $d$, which represents the distance from the inner (outer) contour to the accurate nuclei contour. Specifically, we set $d$ to 0, 2, 4, and 6. It is worth noting that when $d=0$, we do not sample between the actual and inner (outer) contour. From Fig.~\ref{fig4}, we can see that when $d=4$, the model performs best. However, the performance drops as $d$ decreases. This is because when reducing the sampling range, the boundary information obtained by the model also decreases. On the contrary, when $d$ increases to 6, the sampling area becomes more extensive, leading to a mixture of boundary and non-boundary features, ultimately decreasing performance. 

\vspace{-0.4cm}
\section{Conclusions}
This paper proposes a boundary-aware contrastive learning model based on the teacher-student framework for semi-supervised nuclei segmentation. The model utilizes a low-resolution feature supervision head and a cross-RoI contrastive learning module to achieve the nuclei boundary denoising. However, the model trains the teacher and student networks in separate stages, which hinders the student network from effectively utilizing the features extracted by the teacher network. Therefore, in the future, we will adopt an end-to-end training approach for both the teacher and student networks to enhance the information interaction between the teacher and student networks.
\section{Acknowledgements}
This work was supported in part by the National Natural Science Foundation of China under 62031023 \& 62331011; in part by the Shenzhen Science and Technology Project under GXWD20220818170353009, and in part by the Fundamental Research Funds for the Central Universities under No.HIT.OCEF.2023050.

\bibliography{midl24_070}

% \newpage
\section{Appendix}

In the main body, we used 1/8, 1/4 and 1/2 labeled data to conduct experiments on CryoNuSeg \cite{mahbod2021cryonuseg}, DigestPath \cite{da2022digestpath} and MoNuSeg \cite{kumar2017dataset} datasets respectively. 

In this section, we provide the data split details as shown in Table \ref{tab5}. 
First, these three datasets are divided into the training set, validation set and testing set according to the proportion of 6:2:2. Then, we re-divide the training set into labeled and unlabeled data sets according to 1/8, 1/4 and 1/2. In the whole training process, we keep the validation and testing sets unchanged.

\begin{table}[thp]\footnotesize
\begin{center}
\caption{The data split on CryoNuSeg, DigestPath and MoNuSeg datasets.}
        \begin{tabular}{cc|cc|cc}
            \hline
            \multirow{2}{*}{\textbf{Dataset}} & \multirow{2}{*}{\textbf{Ratio}} & \multicolumn{2}{c|}{\textbf{Training}} & \multirow{2}{*}{\textbf{Validation}} & \multirow{2}{*}{\textbf{Testing}} \\
            \cline{3-4}
            & & \textbf{Labeled} & \textbf{Unlabeled} & &  \\
            \hline
            & \textbf{1/8} & 20 &142 & 54 & 54  \\
            \textbf{CryoNuSeg} & \textbf{1/4} & 40 & 122 & 54 & 54 \\
            & \textbf{1/2} & 81 & 81 & 54& 54 \\
            \hline
            & \textbf{1/8} & 631 & 2653 & 835 & 994  \\
            \textbf{DigestPath} & \textbf{1/4} & 930 &2354 & 835 & 994 \\
            & \textbf{1/2} & 1740 & 1554 & 835 & 994 \\
            \hline
            & \textbf{1/8} & 98 & 686  & 392 & 294   \\
            \textbf{MoNuSeg} & \textbf{1/4} & 196 & 588 & 392 & 294\\
            & \textbf{1/2} & 392 & 392 & 392 & 294 \\
            \hline
        \end{tabular}\label{tab5}
    \end{center}
\end{table}



\end{document}
