\documentclass[10pt]{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{adjustbox}
\usepackage{mwe} % to get dummy images

\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 035}
\editors{Accepted for publication at MIDL 2025}


\title[Self-corrective Cell Segmentation]{CASC-AI: Consensus-aware Self-corrective Learning for Cell Segmentation with Noisy Labels}

%\title[Short Title]{Self-Correcting AI for Robust Cell Segmentation in Noisy Annotations}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{
\Name{Ruining Deng \nametag{$^{1,2}$}} \Email{rud4004@med.cornell.edu}\\
\Name{Yihe Yang \nametag{$^{1}$}} \Email{yiy4007@med.cornell.edu}\\
\Name{David J. Pisapia \nametag{$^{1}$}} \Email{djp2002@med.cornell.edu}\\
\Name{Benjamin Liechty \nametag{$^{1}$}} \Email{bel9057@med.cornell.edu}\\
\Name{Junchao Zhu \nametag{$^{2}$}} \Email{junchao.zhu@vanderbilt.edu}\\
\Name{Juming Xiong \nametag{$^{2}$}} \Email{juming.xiong@vanderbilt.edu}\\
\Name{Junlin Guo \nametag{$^{2}$}} \Email{junlin.guo@vanderbilt.edu}\\
\Name{Zhengyi Lu \nametag{$^{2}$}} \Email{zhengyi.lu@vanderbilt.edu}\\
\Name{Jiacheng Wang \nametag{$^{2}$}} \Email{jiacheng.wang.1@vanderbilt.edu}\\
\Name{Xing Yao \nametag{$^{2}$}} \Email{xing.yao@vanderbilt.edu}\\
\Name{Runxuan Yu \nametag{$^{2}$}} \Email{runxuan.yu@vanderbilt.edu}\\
\Name{Rendong Zhang \nametag{$^{2}$}} \Email{rendong.zhang@vanderbilt.edu}\\
\Name{Gaurav Rudravaram \nametag{$^{2}$}} \Email{gaurav.rudravaram@vanderbilt.edu}\\
\Name{Mengmeng Yin \nametag{$^{3}$}} \Email{mengmeng.yin.1@vumc.org}\\
\Name{Pinaki Sarder \nametag{$^{4}$}} \Email{pinaki.sarder@ufl.edu}\\
\Name{Haichun Yang \nametag{$^{3}$}} \Email{haichun.yang@vumc.org}\\
\Name{Yuankai Huo \nametag{$^{2,3}$}} \Email{yuankai.huo@vanderbilt.edu}\\
\Name{Mert R. Sabuncu \nametag{$^{1,5}$}} \Email{msabuncu@cornell.edu}\\
\addr $^{1}$ Weill Cornell Medicine, New York, NY 10021 \\
\addr $^{2}$ Vanderbilt University, Nashville, TN, USA 37215 \\
\addr $^{3}$ Vanderbilt University Medical Center, Nashville, TN, USA 37232 \\
\addr $^{4}$ University of Florida, Gainesville, FL, USA 32611\\
\addr $^{5}$ Cornell Tech, New York, NY, USA 10044 \\
}

\begin{document}

\maketitle
\begin{abstract}
Multi-class cell segmentation in high-resolution gigapixel whole slide images (WSIs) is crucial for various clinical applications. However, training such models typically requires labor-intensive, pixel-wise annotations by domain experts. Recent efforts have democratized this process by involving lay annotators without medical expertise. However, conventional non-corrective approaches struggle to handle annotation noise adaptively because they lack mechanisms to mitigate false positives (FP) and false negatives (FN) at both the image-feature and pixel levels. In this paper, we propose a consensus-aware self-corrective learning that leverages the Consensus Matrix to guide its learning process. The Consensus Matrix defines regions where both the AI and annotators agree on cell and non-cell annotations, which are prioritized with stronger supervision. Conversely, areas of disagreement are adaptively weighted based on their feature similarity to high-confidence consensus regions, with more similar regions receiving greater attention. Additionally, contrastive learning is employed to separate features of noisy regions from those of reliable consensus regions by maximizing their dissimilarity. This paradigm enables the model to iteratively refine noisy labels, enhancing its robustness. Validated on one real-world lay-annotated cell dataset and two reasoning-guided simulated noisy datasets, our method demonstrates improved segmentation performance, effectively correcting FP and FN errors and showcasing its potential for training robust models on noisy datasets. The official implementation and cell annotations are publicly available at~\url{https://github.com/ddrrnn123/CASC-AI}.

\end{abstract}
\begin{keywords}
Consensus matrix, Corrective Learning, Noisy label learning, Cell Segmentation
\end{keywords}

\section{Introduction} 
Multi-class cell segmentation is essential for analyzing tissue samples in digital pathology, often serving as the initial step in extracting biological signals crucial for accurate disease diagnosis and treatment planning~\cite{caicedo2017data,deng2020deep,keren2018structured,pratapa2021image,litjens2017survey,border2024investigating,ke2023clusterseg,zhu2023anti,zhu2024cross}. Accurate cell quantification aids pathologists in diagnosing diseases~\cite{comaniciu2002cell, xing2016robust}, determining disease progression~\cite{olindo2005htlv}, assessing severity~\cite{wijeratne2018quantification}, and evaluating treatment efficacy~\cite{jimenez2006mast}. For instance, the distribution and density of cells in the glomerulus (e.g., podocytes, mesangial cells, endothelial cells, and epithelial cells) can serve as indicators of functional injury in renal pathology~\cite{imig2022interactions}. However, cell-level characterization is challenging even for experienced pathologists due to the long annotation time, extensive labor required, significant variability in cell morphology~\cite{zheng2021deep}, and the potential for human error. Needless to mention the rigorous medical training required for a pathologist. 


\begin{figure*}[t]
\begin{center}
 \includegraphics[width=0.9\linewidth]{Figures/Scope.pdf}
\end{center}
  \caption{
  \textbf{Consensus-aware self-corrective learning.} We propose a Consensus-Aware Self-Corrective Learning for robust cell segmentation with noisy training data. The model leverages the CM to guide learning, prioritizing CP and CN regions with stronger supervision, while adaptively weighting DM and DH regions based on their similarity to reliable CP regions by contrastive learning.}
\label{fig:introduction}
\end{figure*}  

Previous efforts have democratized the annotation process by involving lay annotators without medical expertise and integrating pair-wise molecular images with pathological images, resulting in a substantial number of accurate cell annotations for training AI models~\cite{deng2023democratizing}. However, this approach inevitably introduces noise and errors, necessitating correction by experienced pathologists. Directly training models on such noisy labels often leads to suboptimal performance. This highlights the urgent need for a corrective learning paradigm that effectively addresses label noise during cell segmentation model training~\cite{vuadineanu2022analysis,karimi2020deep}. Previous research on noisy-label learning has focused on defining efficient loss functions~\cite{zhang2018generalized,wang2020noise,ma2020normalized} and leveraging multi-network strategies~\cite{zhang2020robust,han2018co,lu2023uncertainty,guo2023sac}. However, these approaches largely overlook the integration of feature-level analysis with pixel-level analysis to effectively identify annotation errors at the pixel level.

In this work, we propose Consensus-Aware Self-Corrective Learning (CASC-AI), which incorporates insights from the Consensus Matrix (CM) to guide its learning process (as shown in Fig.~\ref{fig:introduction}). Unlike conventional heuristic-based correction methods, CASC-AI actively learns from noisy annotations by leveraging both pixel-wise and feature-wise information to iteratively refine its predictions. The self-corrective learning mechanism autonomously detects patterns in annotation errors and adapts its training by distinguishing noisy labels from high-confidence regions through maximizing feature dissimilarity, thereby enhancing its robustness against annotation errors. The contributions of this paper is threefold:


\noindent(1) A Consensus-Aware Self-Corrective Learning is designed to provide robust cell segmentation when training data contains noise.

\noindent(2) A reasoning-guided noise-generation process is introduced for pathological cell images to simulate realistic noise for label analysis.

\noindent(3) By integrating Consensus Matrix insights at both the pixel and feature levels, the proposed method demonstrates improved segmentation performance, effectively addressing FP and FN errors, showcasing its potential for training robust models on noisy datasets.



\section{Method}
Introducing lay annotators into the labeling process significantly increases the volume of annotations available for training deep learning models. However, it also introduces noise and errors due to human visual limitations and variability among annotators.
There are several types of annotation errors introduced by humans, including contour-wise boundary errors~\cite{zhang2020disentangling,dang2024singr} and instance-wise location errors~\cite{vuadineanu2022analysis,goldsborough2024instanseg}. In this study, we mainly focus on instance-wise location errors, where false positive and false negative cells are introduced during the molecular-empowered lay annotation process (shown in Fig.~\ref{fig:introduction}).

With the rapid development of deep learning, AI has demonstrated its capability in representing images~\cite{oquab2023dinov2,huang2021gloria}, providing reliable and stable latent features for image understanding. Therefore, the proposed CASC-AI aims to combine the strengths of human expertise and AI capability during the training phase, guiding the model to capture accurate information from lay annotations while distinguishing potential noise at the pixel level. The overall learning paradigm consists of three components: (1) Consensus Matrix, (2) Consensus-aware Supervision, and (3) Contrastive Noise Separation.

\subsection{Consensus Matrix} 


\begin{figure*}[t]
\begin{center}
 \includegraphics[width=0.9\linewidth]{Figures/Method.pdf}
\end{center}
  \caption{
  \textbf{Overview of the Consensus-Aware Supervision Framework.} The architecture integrates AI-derived confidence maps ($c$) and lay annotations ($y_l$) to identify consensus-positive (CP), consensus-negative (CN), and disagreement regions (DM, DH). This framework emphasizes robust training by focusing on regions of consensus and leveraging disagreement as informative cues for improved cell segmentation accuracy.}
\label{fig:Method}
\end{figure*}  


To capture the agreement between lay annotators and the AI model, we define a Consensus Matrix (in Fig.~\ref{fig:introduction}), inspired by the confusion matrix, to guide pixel-level image understanding. The matrix is composed of the following components:

\noindent\textbf{Consensus Positives (CP):} Regions where both the AI and annotators agree on a ``cell” annotation. These regions represent strong consensus for action, where both parties confidently identify cells.

\noindent\textbf{Consensus Negatives (CN):} Regions where both the AI and annotators agree on a ``non-cell” annotation. These regions reflect mutual consensus to abstain from action, ensuring non-cell regions are left unannotated.

\noindent\textbf{Disagreement Model-positives (DM):} Regions where the AI identifies a ``cell,” but annotators label it as ``non-cell.” These regions highlight potential false negatives in the lay annotations, where cells may have been missed.

\noindent\textbf{Disagreement Human-positives (DH):} Regions where the AI labels a region as ``non-cell,” but annotators identify it as a ``cell.” These regions represent potential false positives in the lay annotations, where cells may have been overannotated.

\subsection{Consensus-aware Supervision}

Building on our previous works~\cite{deng2023democratizing,deng2024hats}, we select a token-based residual U-Net from~\cite{deng2024prpseg} as the backbone for cell segmentation tasks. This backbone demonstrates superior performance in multi-class cell segmentation using partially labeled datasets, compared to two other cell segmentation backbones~\cite{horst2024cellvit,israel2024foundation} as shown in Table~\ref{tab:ablationstudy}. As illustrated in Fig.~\ref{fig:Method}, the model outputs the final prediction logits \( p \in \mathbb{R}^{2 \times W \times H} \), the pixel-level feature map of the decoder's last layer \( f_D \in \mathbb{R}^{Ch \times W \times H} \), and a confidence map \( c \in \mathbb{R}^{1 \times W \times H} \), which represents the foreground channel of \( p \) after applying the channel-wise softmax function. \( W \) and \( H \) are the width and height of the input image, while \( Ch \) represents the number of channels in the decoder's last layer. The confidence map \( c \in (0, 1) \) indicates the confidence level of predictions: values closer to 1 suggest stronger confidence in identifying a region as a cell, while values closer to 0 suggest a higher likelihood of non-cell regions.



%\subsubsection{Consensus Cell Feature Distillation}

\noindent\textbf{Consensus Cell Feature Distillation: }Using the confidence map \( c \) from the AI model, we combine it with lay annotations \( y_l \) to identify pixel locations with the highest agreement scores \( a_{\text{CP}} \) in CP regions. These regions are used to distill features \( f_{\text{cell}} \) that best represent cell types.  The computation for \( a_{\text{CP}} \) and \( f_{\text{cell}} \) is defined in Eq.~\ref{eq:topk} (annotated as \(\phi_1\) in Fig.~\ref{fig:Method}).

{
\begin{equation}
\begin{aligned}
a_{\text{CP}} &= c \cdot y_l \\
\text{Ind}_{\text{CP}} &= \text{argsort}(-a_{\text{CP}})[:k] \\
f_{\text{cell}} &= \frac{1}{k} \sum_{i=1}^k f_D(\text{Ind}_{\text{CP}}[i])
\end{aligned}
\label{eq:topk}
\end{equation}
}



\noindent\textbf{Disagreement Noise Feature Distillation: }In DH and DM regions, where the AI model and lay annotators disagree, we identify top pixel locations with the highest disagreement scores \( a_{\text{DH}} \) and \( a_{\text{DM}} \). Features from these regions \( fm_{\text{noise}} \) potentially contain both real cells and noise, as represented in Eq.~\ref{eq:do_mc} (annotated as \(\phi_2\) in Fig.~\ref{fig:Method}).

{
\begin{equation}
\begin{aligned}
a_{\text{DM}} &= c \cdot (1 - y_l) \\
\text{Ind}_{\text{DM}} &= \text{argsort}(-a_{\text{DM}})[:k/2] \\
a_{\text{DH}} &= (1 - c) \cdot y_l \\
\text{Ind}_{\text{DH}} &= \text{argsort}(-a_{\text{DH}})[:k/2]
\end{aligned}
\label{eq:do_mc}
\end{equation}
}

When aggregating potential noise features \( fm_{\text{noise}} \) into the distilled noise feature \( f_{\text{noise}} \), we calculate the similarity \( s_\text{cell} \) between the potential noise features \( fm_{\text{noise}} \) and the cell feature \( f_{\text{cell}} \). Using a weighted sum, we derive the final noise feature \( f_{\text{noise}} \), based on the assumption that noise features in these regions are dissimilar to cell features. The process is defined in Eq.~\ref{eq:btmk2} (highlighted as \(\phi_3\) in Fig.~\ref{fig:Method}).

{
\begin{equation}
\begin{aligned}
fm_{\text{noise}} &= f_D([\text{Ind}_{\text{DM}}, \text{Ind}_{\text{DH}}]) \\
s_\text{cell} &= \frac{fm_{\text{noise}} \cdot f_{\text{cell}}}{\|fm_{\text{noise}}\| \|f_{\text{cell}}\|} \\
w &= \text{softmax}(1 - \text{norm}(s_\text{cell})) \\
f_{\text{noise}} &= w \cdot fm_{\text{noise}}
\end{aligned}
\label{eq:btmk2}
\end{equation}
}

We compute the similarity between the feature map \( f_D \) and the top cell and noise feature \( f_{\text{cell}} \) and \( f_{\text{noise}} \), obtaining similarity maps \( {sim}_{\text{cell}} \) and \( {sim}_{\text{noise}} \). The computation are provided in Eq.~\ref{eq:sim} (labeled as \(\phi_4\) and \(\phi_5\) in Fig.~\ref{fig:Method}).

{
\begin{equation}
\begin{aligned}
{sim}_{\text{cell}} = \frac{f_D \cdot f_{\text{cell}}}{\|f_D\| \|f_{\text{cell}}\|} \\
{sim}_{\text{noise}} = \frac{f_D \cdot f_{\text{noise}}}{\|f_D\| \|f_{\text{noise}}\|}
\end{aligned}
\label{eq:sim}
\end{equation}
}


\noindent\textbf{Consensus-aware Loss Function: }During training, the model is guided to focus on regions where both the AI model and lay annotators agree (CP and CN) while ignoring regions likely to contain noise. By combining the confidence map \( c \) and lay annotations \( y_l \), CP and CN regions are highlighted, and \( \text{sim}_{\text{cell}} \) and \( \text{sim}_{\text{noise}} \) further refine the focus on cell-like regions within DM and DH areas. The final supervised loss is defined in Eq.~\ref{eq:supervise}:


\begin{equation}
\begin{aligned}
\omega_c = \exp(c \cdot y_l + (1 - c) \cdot (1 - y_l)) \quad   \omega_{\text{sim}} = \exp(\text{sim}_{\text{cell}} - \text{sim}_{\text{noise}}) \\
\mathcal{L}_{\text{supervise}}(y_l, f(x; \theta)) = (\mathcal{L}_{\text{Dice}} + \mathcal{L}_{\text{BCE}})(y_l, f(x; \theta)) \cdot \omega_c \cdot \omega_{\text{sim}} \\
\end{aligned}
\label{eq:supervise}
\end{equation}

\noindent Where \( f \) is the segmentation model, \( \theta \) are the trainable parameters, and \( \mathcal{L}_{\text{Dice}} \) and \( \mathcal{L}_{\text{BCE}} \) are the Dice efficiency loss and Binary Cross-Entropy loss, respectively.

\subsection{Contrastive Noise Separation}

Using the final cell feature \( f_{\text{cell}} \) and noise feature \( f_{\text{noise}} \), we aim to maximize their separation using a contrastive learning loss function in Eq.~\ref{eq:contrastive}:

\begin{equation}
\mathcal{L}_{\text{contrastive}}(f_{\text{cell}}, f_{\text{noise}}) = 
(\mathcal{L}_{\text{KL}} + \mathcal{L}_{\text{MSE}})(\text{norm}(f_{\text{cell}}), \text{norm}(f_{\text{noise}}))
\label{eq:contrastive}
\end{equation}

\noindent where \(\mathcal{L}_{\text{KL}} \) is the KL Divergence loss, and \(\mathcal{L}_{\text{MSE}} \) is the Mean Squared Error loss.

The final consensus-aware self-corrective learning loss combines \( \mathcal{L}_{\text{supervise}} \) and \( \mathcal{L}_{\text{contrastive}} \) to achieve robust training shown in Eq.~\ref{eq:finalloss}:

\begin{equation}
\mathcal{L}_{\text{consensus-aware}}(y_l, f(x; \theta)) = \mathcal{L}_{\text{supervise}}(y_l, f(x; \theta)) + \mathcal{L}_{\text{contrastive}}(f_{\text{cell}}, f_{\text{noise}})
\label{eq:finalloss}
\end{equation}


\section{Data and Experiment}
\subsection{Data}
To evaluate the performance of the consensus-aware self-corrective learning framework, we collected a glomerular cell segmentation dataset. We utilized 21 whole slide images (WSIs) from normal adult cases in the nephrectomy dataset and HuBMAP. These slides were stained with Periodic Acid-Schiff (PAS), and  were scanned at \( 20\times \) magnification. The WSIs were cropped into \( 512 \times 512 \)-pixel segments to facilitate cell labeling. The cell labels are confined within glomeruli. The labeled cells included mesangial cells (Mes.), endothelial cells (Endo.), podocytes (Pod.), and parietal epithelial cells (Pecs.). Labeling was performed in a partial-label manner, where each image contained a single class label with binary masks. The details of data collection are shown in Table~\ref{tab:dataset} (In Appendix~\ref{ap:dataset}).

\noindent \textbf{Real Lay Annotation Dataset:} Following the annotation process described in~\cite{deng2023democratizing}, two sets of annotations were obtained (1) directly from lay annotators and (2) underwent a quality assurance process conducted by experienced pathologists.

\noindent \textbf{Reasoning-Generated Noise Datasets:} To further explore the capabilities of the proposed method, we designed two reasoning-based noise generation pipelines to create FP and FN datasets: (1) The \textbf{FP data generation pipeline} adds plausible noise labels by following these principles: a. annotating nuclei regions indicated by PAS staining; b. providing annotations for glomeruli that are near to the correct cells; and c. creating annotations with sizes that do not exceed the acceptable range for cells, where such annotations are more likely to contain human errors; (2) \textbf{The FN data generation pipeline} randomly removes parts of the ground truth labels annotated by pathologists.


The visualizations of the three datasets are shown in Fig.~\ref{fig:Dataset}, and the labeling accuracy for each dataset, the detailed pipelines are presented in Table~\ref{tab:data_acc}, Algorithm~\ref{alg:add}, and Algorithm~\ref{alg:remove} in the Appendix~\ref{ap:dataset}. 


\begin{figure*}[t]
\begin{center}
 \includegraphics[width=0.75\linewidth]{Figures/Adding_noise.pdf}
\end{center}
  \caption{
    \textbf{Illustration of the Noisy Dataset.} The figure depicts a real lay annotation dataset and two reasonable noise generation pipelines used to create FP and FN datasets with plausible noise. These processes are applied to evaluate the proposed method under challenging scenarios.}

\label{fig:Dataset}
\end{figure*}  


\subsection{Experimental details}

The dataset was split into training, validation, and testing sets at the WSI level in a 6:1:3 ratio, ensuring balanced distributions of injured and normal glomeruli across splits. All experiments used the same hyperparameter settings, which were determined from an ablation study (see Table~\ref{tab:ablationstudy}) on a non-error dataset using supervised learning. Model selection was based on the mean Dice score across the four cell classes in the validation set. All experiments were conducted on an NVIDIA RTX A6000 GPU for uniformity.

\subsection{Evaluation Metrics}
We evaluate performance using Dice similarity coefficient scores, with the binary mask for each image serving as the ground truth. We also provide F1-score results by converting the binary segmentation labels into instance segmentation labels following the method in~\cite{deng2025kpis}. Standard deviations are provided for the results in the tables, and a Wilcoxon t-test is performed to assess the significance of differences between methods.

\section{Results}
\subsection{Testing Set Segmentation Performance}
We evaluate the proposed CASC-AI framework alongside other loss correction noisy label learning methods on three datasets. All methods were implemented with the same backbone and hyperparameters to ensure fair comparisons. We conducted an ablation study to identify the optimal backbone and hyperparameter settings for cell segmentation, using error-free ground-truth labels that were corrected and verified by pathologists under supervised learning, shown in the Appendix~\ref{ap:ablation}.

Table~\ref{tab:testing_dice} and Fig.~\ref{fig:qualitative} demonstrate that the proposed method achieves improvements compared to direct supervised learning and other baseline methods. This indicates that CASC-AI effectively leverages lay annotations while mitigating noise for enhanced segmentation performance.


\begin{table*}[htbp]
\centering
\caption{Performance of various noisy label learning methods. Dice similarity coefficient scores (\%) and F1-scores (\%) are reported. The top two performing methods are highlighted in \textcolor{red}{red} and \textcolor{blue}{blue}. The Wilcoxon signed-rank test was performed using CASI-AI as the reference method to compare with other methods. All results are statistically significant ($p < 0.001$) compared to the proposed method.}
%--------------------- Real Dataset ---------------------%
\adjustbox{max width=\textwidth}{
\begin{tabular}{l|ccccc|ccccc}
\hline
\multicolumn{11}{c}{\textbf{Real Dataset}} \\
\hline
\textbf{Method} & \multicolumn{5}{c|}{Dice (\%)} & \multicolumn{5}{c}{F1-score (\%)} \\
\hline
                & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean}  & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean} \\
\hline
Supervised      & 71.18  $\pm$ 10.08      & \textcolor{blue}{68.33} $\pm$ 06.87       & 51.99 $\pm$ 02.99      & \textcolor{red}{76.09} $\pm$ 10.60 & 66.90             & 43.79 $\pm$ 22.19       & \textcolor{blue}{42.42} $\pm$ 16.78       & 04.66 $\pm$ 06.61       & \textcolor{blue}{53.34} $\pm$ 24.87       & 36.06       \\
GCE~\cite{zhang2018generalized}      & 66.94  $\pm$   07.97
     & 51.25   $\pm$  02.47  
    & 49.71  $\pm$   00.27
     & 55.56    $\pm$  06.83  
   & 55.86           & 30.96 $\pm$ 18.98       & 02.53 $\pm$ 06.86       & 00.00 $\pm$ 00.00       & 09.67 $\pm$ 17.67       & 10.79       \\
NCE+NMAE~\cite{ma2020normalized}      & 49.92 $\pm$  00.05      & 49.86 $\pm$   00.12      & 49.71   $\pm$  00.27    & 49.91  $\pm$  00.07     & 49.85            & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00        \\
NRDice~\cite{wang2020noise}      & 71.00  $\pm$    08.20     & 52.75  $\pm$   04.42     & 49.72  $\pm$   00.27     & 65.62  $\pm$  11.74     & 59.77              & 44.35 $\pm$ 18.48       & 06.29 $\pm$ 11.69       & 00.00 $\pm$ 00.00       & 33.97 $\pm$ 27.68       & 21.15        \\
CL~\cite{deng2023democratizing}       & \textcolor{blue}{74.00} $\pm$  09.18  & 67.26   $\pm$  06.37    & \textcolor{blue}{69.53} $\pm$  07.91  & 73.89 $\pm$ 11.07     & \textcolor{blue}{71.17}        & \textcolor{blue}{50.62} $\pm$ 21.58       & 38.66 $\pm$ 16.50       & \textcolor{blue}{42.01} $\pm$ 18.48       & 49.65 $\pm$ 25.49       & \textcolor{blue}{45.23}       \\
CASC-AI (Ours)  & \textcolor{red}{74.93} $\pm$   07.38 & \textcolor{red}{68.88} $\pm$ 05.32 & \textcolor{red}{72.24} $\pm$  07.29 & \textcolor{blue}{75.94} $\pm$ 10.71 & \textcolor{red}{73.00}      & \textcolor{red}{52.25} $\pm$ 19.90       & \textcolor{red}{43.00} $\pm$ 13.93       & \textcolor{red}{46.22} $\pm$ 18.83       & \textcolor{red}{55.60} $\pm$ 26.32       & \textcolor{red}{49.27}      \\
\hline
\end{tabular}
}
%--------------------- FP Dataset ---------------------%
\adjustbox{max width=\textwidth}{
\begin{tabular}{l|ccccc|ccccc}
\multicolumn{11}{c}{\textbf{FP Dataset}} \\
\hline
\textbf{Method} & \multicolumn{5}{c|}{Dice (\%)} & \multicolumn{5}{c}{F1-score (\%)} \\
\hline
                & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean}  & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean} \\
\hline
Supervised      & \textcolor{red}{71.12} $\pm$   05.45 & 64.24  $\pm$ 05.35      & 64.56   $\pm$  07.12     & 70.87  $\pm$  10.62     & 67.70             & 21.16 $\pm$ 10.67       & 35.09 $\pm$ 12.60       & 33.97 $\pm$ 15.68       & 37.12 $\pm$ 21.30       & 31.84        \\
GCE~\cite{zhang2018generalized}      & 62.71   $\pm$ 08.27    & 62.27  $\pm$ 04.99     & 66.16    $\pm$ 07.27    & 66.96   $\pm$  10.97    & 64.52              & 24.01 $\pm$ 17.75       & 22.68 $\pm$ 11.48       & 31.69 $\pm$ 15.98       & 31.68 $\pm$ 25.01       & 27.52        \\
NCE+NMAE~\cite{ma2020normalized}       & 49.92 $\pm$  00.05      & 49.86 $\pm$   00.12      & 49.71   $\pm$  00.27    & 49.91  $\pm$  00.07     & 49.85              & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00        \\
RDice~\cite{wang2020noise}      & 67.66 $\pm$   07.48     & 60.10 $\pm$ 06.23     & 67.97   $\pm$  07.82    & 73.16 $\pm$  11.60     & 67.22              & \textcolor{blue}{30.98} $\pm$ 15.61       & 22.33 $\pm$ 13.90       & 39.24 $\pm$ 18.17       & 44.78 $\pm$ 25.33       & 34.34        \\
CL~\cite{deng2023democratizing}       & 65.24 $\pm$  07.38      & \textcolor{blue}{65.89} $\pm$ 05.32     & \textcolor{blue}{69.04} $\pm$   07.29    & \textcolor{blue}{73.78} $\pm$ 10.71      & \textcolor{blue}{68.49}              & 30.16 $\pm$ 15.89       & \textcolor{blue}{35.23} $\pm$ 12.34       & \textcolor{blue}{42.90} $\pm$ 16.87       & \textcolor{blue}{46.59} $\pm$ 24.08       & \textcolor{blue}{38.72}        \\
CASC-AI (Ours)  & \textcolor{blue}{68.49} $\pm$    06.98 & \textcolor{red}{66.24} $\pm$    05.55 & \textcolor{red}{70.64} $\pm$  07.38     & \textcolor{red}{74.75} $\pm$  10.41     & \textcolor{red}{70.03}              & \textcolor{red}{33.23} $\pm$ 14.57       & \textcolor{red}{35.35} $\pm$ 12.47       & \textcolor{red}{43.00} $\pm$ 16.87       & \textcolor{red}{48.34} $\pm$ 25.35       & \textcolor{red}{39.98}        \\
\hline
\end{tabular}
}
%--------------------- FN Dataset ---------------------%
\adjustbox{max width=\textwidth}{
\begin{tabular}{l|ccccc|ccccc}
\multicolumn{11}{c}{\textbf{FN Dataset}} \\
\hline
\textbf{Method} & \multicolumn{5}{c|}{Dice (\%)} & \multicolumn{5}{c}{F1-score (\%)} \\
\hline
                & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean} &  \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean} \\
\hline
Supervised      & 61.51 $\pm$ 08.26       & 66.60 $\pm$ 07.52       & 66.02 $\pm$ 08.85      & 71.13 $\pm$ 11.12      & 66.32              & 45.49 $\pm$ 19.26       & 34.93 $\pm$ 19.05       & 32.69 $\pm$ 19.64       & 46.33 $\pm$ 27.06       & 39.86       \\
GCE~\cite{zhang2018generalized}      & 56.17 $\pm$  05.43      & 49.86  $\pm$ 00.12       & 49.71 $\pm$ 00.26      & 49.91  $\pm$ 00.07       & 51.41              & 12.47 $\pm$ 15.13       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 03.12       \\
NCE+NMAE~\cite{ma2020normalized}       & 49.92 $\pm$  00.05      & 49.86 $\pm$   00.12      & 49.71   $\pm$  00.27    & 49.91  $\pm$  00.07     & 49.85              & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00 $\pm$ 00.00       & 00.00        \\
NRDice~\cite{wang2020noise}      & 52.48  $\pm$ 04.48     & 49.86    $\pm$ 00.12    & 52.85   $\pm$ 03.81    & 49.91   $\pm$ 00.07     & 51.28             & 06.62 $\pm$ 13.02       & 00.00 $\pm$ 00.00       & 07.32 $\pm$ 10.78       & 00.00 $\pm$ 00.00       & 03.48        \\
CL~\cite{deng2023democratizing}       & \textcolor{blue}{71.92} $\pm$ 09.18      & \textcolor{blue}{67.40} $\pm$ 06.37      & \textcolor{blue}{70.94} $\pm$ 07.91      & \textcolor{blue}{73.05} $\pm$ 11.07      & \textcolor{blue}{70.83}              & \textcolor{blue}{46.07} $\pm$ 21.83       & \textcolor{blue}{39.19} $\pm$ 15.21       & \textcolor{blue}{45.98} $\pm$ 17.61       & \textcolor{blue}{50.54} $\pm$ 26.40       & \textcolor{blue}{45.44}        \\
CASC-AI (Ours)  & \textcolor{red}{72.85} $\pm$ 08.47    & \textcolor{red}{70.04} $\pm$ 04.98     & \textcolor{red}{72.63} $\pm$ 07.78      & \textcolor{red}{74.90} $\pm$ 11.07      & \textcolor{red}{72.60}             & \textcolor{red}{53.01} $\pm$ 21.36       & \textcolor{red}{43.17} $\pm$ 14.98       & \textcolor{red}{47.31} $\pm$ 18.60       & \textcolor{red}{53.00} $\pm$ 26.16       & \textcolor{red}{49.12}        \\
\hline
\end{tabular}
}
\label{tab:testing_dice}
\end{table*}




\begin{figure*}[t]
\begin{center}
 \includegraphics[width=1.0\linewidth]{Figures/Qualitative.pdf}
\end{center}
  \caption{
    \textbf{Qualitative Results.} The figure presents qualitative results on real dataset obtained using the supervised method and the proposed CASC-AI method. The results demonstrate that the proposed approach enhances segmentation performance on noisy labels by reducing false positives and false negatives.
}
\label{fig:qualitative}
\end{figure*}


\subsection{Training Set Segmentation Performance}
To evaluate the hypothesis that CASC-AI recognizes FP and FN during training, Table~\ref{tab:training_dice} presents Dice scores and F1-score for TP predictions and Intersection over Union (IoU) scores for FP and FN predictions. These results highlight that CASC-AI reduces predictions in FP regions while increasing predictions in FN regions, leading to corrections of the imperfect labels for accurate segmentation during the training phase.

\begin{table*}[htbp]
\centering
\caption{Performance on training dataset on TP, FP, and FN regions of the label. Dice similarity coefficient scores (\%) and F1-score results (\%) are reported on TP, while IoU (\%) are reported on FP and FN.}
\adjustbox{width=\textwidth}{
\begin{tabular}{l|cccc|ccc|ccc}
\hline
\textbf{Method} & \textbf{TP(Dice) $\uparrow$} & \textbf{TP(F1) $\uparrow$} & \textbf{FP(IoU) $\downarrow$} & \textbf{FN(IoU) $\uparrow$} 
                & \textbf{TP(Dice) $\uparrow$} & \textbf{TP(F1) $\uparrow$} & \textbf{FP(IoU) $\downarrow$} 
                & \textbf{TP(Dice) $\uparrow$} & \textbf{TP(F1) $\uparrow$} & \textbf{FN(IoU) $\uparrow$} \\
\hline
Supervised      & 67.99       & 39.25       & \textbf{2.86}       & 8.20       
                & 67.35       & 52.63       & 20.67       
                & 66.52       & 35.58       & 17.01 \\
CASC-AI (Ours)  & \textbf{73.25}       & \textbf{48.17}       & 3.48       & \textbf{9.89}       
                & \textbf{69.83}       & \textbf{56.28}       & \textbf{18.22}       
                & \textbf{68.20}       & \textbf{38.66}       & \textbf{18.73} \\
\hline
\end{tabular}
}
\label{tab:training_dice}
\end{table*}


\section{Conclusion}
In this work, we present the CASC-AI framework, a consensus-aware self-corrective learning designed to address the challenges of cell segmentation in noisy datasets. By leveraging the Consensus Matrix to identify and prioritize consensus regions between human annotators and the AI model, while adaptively weighting disagreement areas, the framework enhances segmentation reliability even in the presence of noisy annotations. This approach highlights the potential of incorporating an AI model to correct human errors in the labels, paving the way for scalable and robust solutions in medical imaging and digital pathology. The limitations and future work of this study can be found in Appendix~\ref{limitation}.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research was supported by the WCM Radiology AIMI Fellowship, NIH R01DK135597 (Huo), DoD HT9425-23-1-0003 (HCY), and the KPMP Glue Grant. This work was also supported by the Vanderbilt Seed Success Grant, the Vanderbilt Discovery Grant, and the VISE Seed Grant. }

\bibliography{midl25_035}

\clearpage
\appendix


\section{Data Collection and Experiments}
\label{ap:dataset}

\subsection{Data Information}
The details of the patch-level data collection are provided in Table~\ref{tab:dataset}.

\begin{table*}[htbp]
\centering
\caption{Summary of data collection for different cell classes.}
\begin{tabular}{ll|cccc}
\hline
\textbf{Class Name} & \textbf{Abbreviation} & \textbf{Patch \#} & \textbf{Size} & \textbf{Scale} & \textbf{Stain} \\
\hline
Podocytes & Pod. & 1147 & $512^{2}$ & $20 \times$ & PAS \\
Mesangial cells & Mes. & 789 & $512^{2}$ & $20 \times$ & PAS \\
Glomerular endothelial cells & Endo. & 715 & $512^{2}$ & $20 \times$ & PAS \\
Parietal epithelial cells & Pecs & 2014 & $512^{2}$ & $20 \times$ & PAS \\
\hline
\end{tabular}
\begin{flushleft}
\end{flushleft}
\label{tab:dataset}
\end{table*}

\subsection{Reasoning-Generated Noise Pipeline}
The detailed processes of Reasoning-Guided Noise Injection for FP data and Generating Masks with Missing Contours for FN data are illustrated in Algorithm~\ref{alg:add} and Algorithm~\ref{alg:remove}.


\subsection{Label Accuracy}
To illustrate the accuracy of the training data, we provide Dice scores and F1-scores for label accuracy in Table~\ref{tab:data_acc}, compared with noise-free ground truth confirmed by two pathologists.

\begin{table*}[htbp]
\centering
\caption{Label accuracy of each dataset. Dice similarity coefficient scores (\%) and F1-score results (\%) are reported.}
\adjustbox{max width=\textwidth}{
\begin{tabular}{l|ccccc|ccccc}
\hline
\textbf{Dataset} & \multicolumn{5}{c|}{Dice (\%)} & \multicolumn{5}{c}{F1-score (\%)} \\
\hline
                 & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean} & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean} \\
\hline
Real data        & 83.13       & 76.03       & 57.38        & 57.95        & 66.93      & 84.18        & 81.34       & 58.96         & 59.53       & 68.89       \\
FP data          & 57.18       & 57.62       & 66.85        & 78.94        & 68.34      & 66.43        & 66.72       & 66.93         & 76.26       & 70.85       \\
FN data          & 69.02       & 69.54       & 71.59        & 73.59        & 71.52      & 69.44        & 69.29       & 71.85         & 74.32       & 71.95       \\
\hline
\end{tabular}
}
\label{tab:data_acc}
\end{table*}


\section{Ablation Study}
\label{ap:ablation}
We conducted an ablation study to identify the optimal backbone and hyperparameter settings for cell segmentation, using non-error ground-truth labels that were corrected and verified by pathologists under supervised learning. Results shown in Table~\ref{tab:ablationstudy} indicate that reducing the learning rate to \(10^{-4}\) provides the best performance. Increasing the loss weights for the cell class during loss calculations and extending the training epochs did not lead to further performance gains. Our proposed backbone outperformed alternative approaches on our dataset.


\section{Limitations \& Future Work}
\label{limitation}
This study has several limitations. We restricted the design to a \textbf{loss-correction approach}. Exploring additional paradigms of corrective learning, such as multi-network architectures or co-training, could further enhance performance. \textbf{Exploring additional backbones including instance segmentation models} represents a promising direction for better capturing subtle patterns between cells and noise at the latent level, which could improve overall cell segmentation performance and feature embedding quality. Furthermore, \textbf{analyzing noise distributions and patterns, learning the variances among different raters, and incorporating annotator confidence within datasets as conditional information during model training} could provide valuable insights and improve overall noise-label learning, addressing issues such as boundary errors and label ambiguity. Eventually, molecular-empowered cell quantification could be fully automated, from data annotation to AI model training, without any human intervention.

\begin{table}[ht]
\centering
\caption{Performance on different hyperparameter settings. Dice similarity coefficient scores (\%) are reported.}
\adjustbox{max width=1.0\textwidth}{
\begin{tabular}{llccc|ccccc}
\hline
\textbf{Backbone} & \textbf{Freeze} & \textbf{Max Epoch} & \textbf{Learning Rate} & \textbf{Loss Weights} & \textbf{Pod.} & \textbf{Mes.} & \textbf{Endo.} & \textbf{Pecs.} & \textbf{Mean} \\ 
\hline
PrPSeg~\cite{deng2024prpseg} && 100 & $10^{-3}$ & 1:1  & 73.65 & 68.99 & 70.06 & 73.73 & 71.61 \\ 
PrPSeg~\cite{deng2024prpseg} && 100 & $10^{-3}$ & 10:1 & 73.06 & \textbf{71.24} & 70.05 & 72.39 & 71.69 \\ 
PrPSeg~\cite{deng2024prpseg} && 200 & $10^{-3}$ & 1:1  & 74.22 & 70.33 & 69.88 & 74.93 & 72.34 \\ 
PrPSeg~\cite{deng2024prpseg} (Ours) && 100 & $10^{-4}$ & 1:1  & 73.92 & 69.19 & \textbf{74.52} & \textbf{77.30} & \textbf{73.73} \\ 
PrPSeg~\cite{deng2024prpseg} && 200 & $10^{-4}$ & 1:1  & \textbf{75.01} & 67.79 & 74.33 & 76.70 & 73.46 \\ 
PrPSeg~\cite{deng2024prpseg} && 100 & $10^{-5}$ & 1:1  & 68.52 & 64.09 & 69.44 & 75.17 & 69.31 \\ 
\hline
CellViT~\cite{horst2024cellvit} && 100 & $10^{-4}$ & 1:1  & 57.52 & 51.61 & 49.70 & 58.39 & 54.31 \\ 
CellViT~\cite{horst2024cellvit} & Encoder & 100 & $10^{-4}$ & 1:1  & 50.93 & 59.14 & 52.37 & 54.47 & 54.23 \\ 
CellSAM~\cite{israel2024foundation} && 100 & $10^{-4}$ & 1:1  & 49.91 & 49.86 & 49.71 & 49.91 & 49.85\\  
CellSAM~\cite{israel2024foundation} & Encoder & 100 & $10^{-4}$ & 1:1  & 49.91 & 49.86 & 49.71 & 49.91 & 49.85\\ 
\hline
\end{tabular}
}
\label{tab:ablationstudy}
\end{table}


\begin{algorithm2e}
\caption{Reasoning-Guided Noise Injection (FP data)}
\label{alg:add}
\KwIn{Pathological image \( X \), Manual label \( Y \), Intensity threshold \( T \), Noise limit \texttt{limit}}
\KwOut{Processed image and noise mask}
Load the pathological image \( X \) and corresponding manual label \( Y \)\;
Perform color deconvolution on \( X \) to compute stain-specific masks\;
Select the PAS channel image and generate a binary mask \( M \) using intensity threshold \( T \)\;
Extract contours from \( M \) and sort them by proximity to existing annotations in \( Y \)\;
Determine the noise addition limit based on the number of cells in \( Y \)\;
\ForEach{\texttt{new\_contour} in sorted contours}{
    \eIf{\texttt{new\_contour} overlaps with existing annotations or violates spatial constraints}{
        \textbf{continue}\;
    }{
        \eIf{\texttt{new\_contour} size is outside the acceptable range for cells}{
            \textbf{continue}\;
        }{
            Add \texttt{new\_contour} to the final noise mask\;
        }
    }
    \If{number of added contours reaches \texttt{limit}}{
        \textbf{break}\;
    }
}
Save the processed image and the generated noise mask\;
\end{algorithm2e}

\begin{algorithm2e}
\caption{Generating Masks with Missing Contours (FN data)}
\label{alg:remove}
\KwIn{Image \( X \), Binary mask \( M \), Missing ratio \texttt{missing\_ratio}}
\KwOut{Processed image and modified mask}
Load the image \( X \) and the binary mask \( M \)\;
Extract contours from \( M \)\;
Shuffle the contours randomly\;
Set \( \texttt{limit} \leftarrow (1 - \texttt{missing\_ratio}) \times \texttt{len(contours)} \)\;
Initialize \( \texttt{new\_mask} \leftarrow 0 \)\;
Initialize \( \texttt{cnt} \leftarrow 0 \)\;
\ForEach{\texttt{contour} in \texttt{contours}}{
    Draw \texttt{contour} on \( \texttt{new\_mask} \)\;
    Increment \( \texttt{cnt} \)\;
    \If{\texttt{cnt} reaches \texttt{limit}}{
        \textbf{break}\;
    }
}
Save the processed image and the generated noise mask\;
\end{algorithm2e}


\end{document}
