\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

%\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 152}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

%\title{Nuclei Segmentation in Histopathology Images using Gated-U-Net3+}
%\title{Feature Sensitive U-Net Model for Nuclei Segmentation of Histopathological Images}
\title{Nuclei Segmentation in Histopathological Images with  Enhanced U-Net3+}
 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
  \midlauthor{\Name{Bishal Ranjan Swain}\nametag{$^{1}$} \Email{bishalswain@kumoh.ac.kr}\\
  	\Name{Kyung Joo Cheoi} \nametag{$^{2}$} \Email{kjcheoi@chungbuk.ac.kr}\\
   \Name{Jaepil Ko} \nametag{$^{1,*}$} \Email{nonezero@kumoh.ac.kr}\\
   \addr \nametag{$^{1}$} Dept. of Computer Eng., Kumoh National Institute of Technology, Gumi, Korea\\
   \addr \nametag{$^{2}$} Dept. of Computer Science, Chungbuk National University, Cheongju, Korea\\
   \addr \nametag{$^{*}$} Author to whom correspondence should be addressed.
   }

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
%\midlauthor{\Name{Bishal Ranjan Swain\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{bishalswain@kumoh.ac.kr}\\
%\addr $^{1}$ 61 Daehak-ro, Gumi-si \\
%\addr $^{2}$ Republic of Korea \AND
%\Name{Jaepil Ko\midlotherjointauthor\nametag{$^{1}$}} \Email{nonezero@kumoh.ac.kr}\\
%\Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
%\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
%\addr $^{3}$ Address 3 \AND
%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
%\addr $^{4}$ Address 4
%}

\begin{document}

\maketitle

\begin{abstract}
 In the rapidly evolving field of nuclei segmentation, there is an increasing trend towards developing a universal segmentation model capable of delivering top-tier results across diverse datasets. While achieving this is the ultimate goal, we argue that such a model should also outperform dataset-specific specialized models. To this end, we propose a task-specific, feature-sensitive U-Net model that sets a baseline standard for the segmentation of nuclei in histopathological images. We meticulously select and optimize the underlying U-Net3+ model, using adaptive feature selection to capture both short- and long-range dependencies. Max blur pooling is included to achieve scale and position invariance, while DropBlock is utilized to mitigate overfitting by selectively obscuring feature map regions. Additionally, a Guided Filter Block is employed to delineate fine-grained details in nuclei structures. Furthermore, we apply various data augmentation techniques, along with stain normalization, to reduce inconsistencies. As a result, our model significantly outperforms state-of-the-art methods, paving the way for precise nuclear segmentation essential for cancer diagnosis and possible treatment strategies.
 \end{abstract}

\begin{keywords}
Nuclei segmentation, Histopathological images, Segmentation
\end{keywords}

\section{Introduction}

Nuclei segmentation of histopathological images plays an important role in medical diagnostics of Hematoxylin and Eosin (H\&E) stained tissues, as it aids pathologists in understanding the morphology of cellular structures \cite{Clayton1991,Elston1991}. This not only helps in cancer grading but also in predicting the effectiveness of various treatments \cite{monuseg_ds}. Finding and segmenting individual nuclei in histopathological images is one of the crucial steps involved in numerous analytical procedures of cancer diagnosis \cite{Cui_2019, IMTIAZ2023107378}. However, segmentation of nuclei in histopathological images is challenging as there are large variations in color, texture, and shape. Moreover, the variability in staining procedures used during the image procurement processes further increases inconsistencies in the images \cite{Sampias_Rolls}.

\iffalse
% removed to adhere to page limit
Traditionally, the segmentation was performed manually which is not only time consuming and labor intensive but also heavily depended on the personal experience of the pathologist \cite{Davey2006}. A variety of traditional image processing algorithms were also proposed, such as morphological operations, contour models, k-means clustering, thresholding, etc. However, the performance was far away from being comparable. The density distribution and overlapping edges of nuclei in the predicted images often resulted in either over-segmentation or under-segmentation \cite{Irshad2014}. Moreover, complex factors such as variable size and shape of nuclei, uneven staining, overlapping cells contributed to a high error rate in nucleus segmentation tasks \cite{Chow2012}. 
\fi

% Explaination - Introduction on Unet and its advantages. why does it perform better in semantic segmentation and what does it lack or disadvantages.
Recent advances have seen the rise of convolutional neural network (CNN)-based deep learning models that have started to gain attention due to their robust and consistent performance in semantic segmentation \cite{ZHANG2020}. Among these, U-Net \cite{ronneberger2015unet} has emerged as the most influential architecture, introducing encoder-decoder structure that has been highly effective for medical image segmentation tasks \cite{Liuetal}. It is a fully convolutional network (FCNN) that captures details through deep layers and upscales images, merging simple and complex features using skip connections for precise segmentation \cite{li2023fusionunet}. The introduction of U-Net++ \cite{zhou2018unet} aimed to improve upon U-Net by implementing nested and dense skip pathways, enhancing feature propagation and reducing the semantic gap between encoder and decoder layers. MBUTransNet \cite{Qiao2023} introduced a fusion of multi-branch U-shaped networks with transformer architecture to leverage long-range dependencies. CFNet \cite{ZHAN2023104112} proposed a cross-scale feature fusion method to enhance feature selection across different scales, but it may not fully address the challenge of capturing highly detailed local features alongside global context.
U-Net3+ \cite{huang2020unet} was introduced with full-scale skip connections and deep supervision, seeking to capture multi-scale features more effectively. However, variations in nuclear morphology, staining quality, and image resolution inherent in different datasets can notably impact the performance of U-Net based models \cite{li_semi_supervised_2023}.

% Simplex noise justification
Data augmentation plays a crucial role in generating the data required for proper training of a deep learning model. Among many data augmentation techniques, adding Gaussian noise has been a common choice to simulate random perturbations and variation in images \cite{Wyatt_2022_CVPR}. However, Gaussian noise adds perturbations that are statistically uniform across the image and lack the spatial coherence that is often present in real-world artifacts and variations \cite{Bae2018}. Moreover, the random fluctuations introduced by Gaussian noise do not adequately represent the artifacts or noise variations in medical images \cite{KASCENAS2023102963}. To overcome the shortage of human-labeled data, we implement a Perlin-noise-based data augmentation strategy on pathological images. Perlin noise provides random but natural-looking patches or textures at little computational cost \cite{perlinNoise}.

% Introduction on universal models, their advantages and why is it better than UNet in generalization and how it might have inferior performance compared to specialized models. establish the need for the work that is need for specialized model to be set as baseline. We how do we plan on achieving it.
Recently, there has been a rise in the concept of universal or foundational models in segmentation, aiming to develop adaptable and dataset-agnostic models. These models can be trained once and then applied to a wide range of segmentation tasks \cite{ma2023segment}. 
%Such models would not only exhibit heightened versatility in terms of model capacity, but also potentially lead to more consistent results across different tasks, benefiting from a shared underlying architecture and training process \cite{ma2023segment}. 
The Segment Anything Model (SAM) \cite{kirillov2023segany}, in particular, has been trained with a massive dataset of mask labels, making it highly adaptable to a wide range of tasks, and various other models such as AutoSAM \cite{shaharabany2023autosam} build on the SAM architecture to obtain more robust, higher-performing models. However, the broad adaptability of such models can lead to compromised performance in highly specialized tasks. A previous study showed that SAM-based models do not consistently achieve satisfactory performance for dense segmentation tasks such as those in pathology images \cite{deng2023segment}. Moreover, we also performed initial experiments with a generalized U-Net-based model, nnUNet \cite{isensee2018nnunet}, but it did not perform better than U-Net3+, as shown in the ablation studies.
To this end, our study proposes a nuclei segmentation-specific enhanced U-Net3+ model that captures the nuclear morphology and staining variations to surpass the state-of-the-art segmentation results. Inspired by the skip connections of U-Net3+ \cite{huang2020unet}, we design our model to simultaneously capture both local and global representations. Specifically, we (i) introduce Gated Linear Units (GLU) \cite{dauphin2017language} within the convolution layers for adaptive feature selection, thereby allowing the model to selectively focus on both local and global features; (ii) incorporate max blur pooling \cite{zhang2019making} to improve scale and position invariance and to address the aliasing problems that are inherent in encoder-decoder architectures; (iii) utilize DropBlock \cite{ghiasi2018dropblock} to mitigate overfitting by selectively obscuring feature map regions; and (iv) use a Guided Filter Block \cite{wu2019fast} to delineate fine-grained details in nuclei structures. Furthermore, we perform several pre-processing steps on the data, including stain normalization to account for staining inconsistencies and data augmentation techniques that are meticulously tailored to address the inherent variability in pathological H\&E stained images. Through this specialized approach, our model outperforms the current state-of-the-art models by some margin in nuclei segmentation of histopathological images.

\section{Materials and Methods}
\subsection{Dataset}
This study utilizes the MoNuSeg2018 dataset \cite{monuseg_ds} for training and evaluating our model. The dataset includes 30 training images and 14 test images, encompassing a total of 21,623 manually annotated nuclei for training and 7,223 for testing. These images were derived from H\&E-stained whole slide images, featuring tissues from breast, kidney, liver, prostate, bladder, colon, and stomach in the training set, and an additional inclusion of lung and brain tissues in the testing set. All images were acquired at 40$\times$ magnification, offering high-resolution insights into the cellular structures. In addition to MoNuSeg2018, the study further uses the CPM-17 \cite{vu2018methods} and CoNSeP \cite{graham2019hovernet} datasets for comparative and comprehensive evaluation.
\iffalse
% removed to adhere to page limit
CPM-17 Dataset is a collection specifically designed for cellular process modeling and encompasses a wide range of tissue types and staining variations. It is instrumental in assessing the model's adaptability and performance across diverse histopathological conditions. This dataset contains 40 pathological images with pixel-level annotations, of which 32 are in the training set, and eight are in the test set. Each image, scanned at 40x magnification, has 500 × 500 pixels. In addition, all images in the train set CoNSEP Dataset is focused on the segmentation of nuclei in complex and densely packed tissue environments, the CoNSEP dataset provides high-quality, annotated images across various tissue types. It serves as a challenging benchmark for evaluating the model's precision in segmenting closely situated or overlapping nuclei. This dataset contains 41 H\&E stained images with 1000 × 1000 pixels at 40x magnification extracted from 16 CRA WSIs. CoNSeP dataset is split into train set (n = 27) and test set (n = 14) as employed in the original work \cite{graham2019hovernet}, and each image is cropped into 256 × 256 pixels in the experiment.
\fi


\subsection{Data Preprocessing}
%Data preprocessing is an important stage for deep learning architectures since it ensures the diversity, relevancy, consistency, and quality of the dataset in order to build a robust model. The steps of the designed data preprocessing stage are explained as follows -

\subsubsection{Stain Normalization}
Stain normalization addresses the variability in histopathological images due to differences in H\&E staining procedures, which affects the appearance of cellular structures. Variations in hues and intensities arise from disparate staining protocols, lighting, and imaging equipment, posing challenges for models trained on specific staining conditions. To mitigate these issues, we adopted a computationally efficient stain normalization approach, following the method described by \cite{mahbodetal}. A reference image from the training set was selected for Reinhard normalization, aligning all dataset images to this standard \cite{reinhardetal}. Reinhard normalization was chosen for its proven effectiveness in previous related research \cite{patiletal}.

\begin{figure}[htbp]
	% Caption and label go in the first argument and the figure contents
	% go in the second argument
	\floatconts
	{fig:example2}
	{\caption{The flowchart of the proposed methodology. 1. Stain normalization: (a) original images, (b) stain normalized images; 2. Simplex noise: (c) simplex noise and stain image, (d) noise added image; 3. Image augmentation: (e) noise added image, (f) augmented images; 4. Training and Evaluation: (g) pre-processed image, (h) proposed model, (i) predicted segmentation mask.}}
	{\includegraphics[width=0.9\linewidth]{figs/flow.png}}
\end{figure}

\subsubsection{Data Augmentation}
Histopathology images inherently contain a high degree of variability, not only in the stains but also in the positions and morphological characteristics of cellular structures. Keeping this in consideration, various photometric and geometric augmentations were performed to handle the limited number of samples in the datasets.

\begin{itemize}
	\item Geometric and Photometric Augmentations:
	Performing augmentations that alter the angular positions and shapes of the nuclei can diversify the training samples significantly. Basic geometric augmentations such as rotations, scaling, and flips were performed to obtain samples from different perspectives and scales. Additionally, elastic deformations that mimic the natural deformations in biological tissues were performed, which helped in enhancing the model's ability to handle non-rigid transformations and complex variations. Photometric augmentations such as gamma and intensity-level transformations, along with contrast limited adaptive histogram equalization, were also performed.
	\item Perlin Simplex Noise:
	The Perlin noise algorithm generates a pseudo-random gradient vector at each corner of a given grid. Next, it calculates the distance vectors from a given position to the surrounding corners of the grid. Then, it takes the dot product between each distance vector and the corresponding gradient vector, thereby obtaining influence values. The dot product is positive if the two vectors point in the same direction and negative if they point in opposite directions. In the final step, interpolation is performed between these influence values to construct smooth patterns within the grid. In our experiments, we generated 2D Perlin noise of the same size as the image patches and concatenated it with them to produce noisy images.
\end{itemize}

\iffalse
% removed to adhere to page limit
\subsection{Problem Definition}
Given a H\&E stained pathological image set S, our motive is to assign each pixel of an image to a class belonging to either nuclei regions or non-nuclei regions. We have $S = \{I_j, G_j\}^N_{j=1} $, where $I_j$ represents original RGB images, and $G_j = \{ g^{(j)}_i , i= 1, 2, ..., |I_j|, g^{(j)}_i \epsilon \{0,1\} \}$ denotes their corresponding ground truth mask results. Each mask has each pixel labeled as $g^{(j)}_i = 1$ for representing nuclei regions and $g^{(j)}_i = 0$ for representing non-nuclei regions. The aim is to learn the following mapping function:
\begin{equation}
	\label{eq:sedov}
	S_j = NucleiOptimisedNet(I_j)
\end{equation}
where $NucleiOptimisedNet$ represents our proposed model and $S_j$ is the segmented output result.
\fi

\subsection{Proposed Methodology}

\subsubsection{Model Overview}
We implement a modified and refined U-Net3+ model for nuclei segmentation, inspired by its full-scale skip connections. We selected U-Net3+ as the baseline as it performed better in our initial experiments, as detailed in the Appendix. Key adjustments to U-Net3+ \cite{huang2020unet} include reducing the model's depth to mitigate overfitting on small datasets, optimizing feature channels to capture essential details without excessive complexity, and using GLU for gated feature selection.

\iffalse
% Modified to above to adhere to page limit
In our study, we modify the U-Net3+ model for the given nuclei segmentation task. U-Net3+ model is a variation of U-Net architecture, designed for semantic and instance segmentation tasks in medical and natural images \cite{huang2020unet}. U-Net3+ incorporates a nested and dense skip pathway, enhancing the model's ability to capture and utilize multi-scale features. It uses multi-level feature aggregation mechanism that allows the model to make full use of hierarchical features from both encoding and decoding pathways and offers multi-level feature aggregation and enhanced localization. But the model is computationally complex and can potentially lead to over-fitting on smaller datasets. 
we modify U-Net3+ model to reduce the computational complexity and adapt it for nuclei segmentation. It is to be modified in such a way that it strikes a balance between computational demands and effectiveness. The task is performed in two ways – optimizing the capacity of the model and using appropriate feature channels. 
\fi

\begin{figure}[htbp]
	\floatconts
	{fig:example3}
	{\caption{Structural overview of the proposed enhanced U-Net3+ model.}}
	{\includegraphics[width=0.5\linewidth]{figs/model.png}}
\end{figure}

\begin{itemize}
	\item Optimization of Network Capacity:
	To prevent the model from memorizing rather than learning, we reduced the depth of U-Net3+ by eliminating one layer. Reducing the capacity of the model compels it to focus on the most important features, such as cell boundaries and tissue types, and prevents it from focusing on irrelevant details.
	\item Appropriate Feature Channels:
	Feature channels help in capturing specific characteristics of the input, such as edges, corners, or textures. The initial layer of the encoder captures the low-level features like edges and corners and the deeper layers capture high-level abstract features. The specific arrangement of 32, 64, 128 and 128 was chosen as it empirically performed better than other configurations as in \tableref{app:table1}.
\end{itemize}
\subsubsection{Adaptive Feature Selection}
The importance of features can vary significantly across different regions in a histopathological image. Certain areas may contain more nuclei and might require more nuanced feature extraction, while others, such as cytoplasm, may be relatively homogeneous and might require less detail. While traditional activation functions like ReLU \cite{agarap2019deep} apply the same transformation across all features, GLUs can learn to `gate' certain features selectively. A GLU takes its input and splits it into two halves. One half is transformed linearly (much like a standard unit), and the other is transformed via a sigmoid activation function. The output of the sigmoid is used as a gate to control the information flow from the linear half. Mathematically, for an input $x$,
\begin{equation}
	\label{eq:glu}
	\mathrm{GLU}(x) = \mathrm{sigmoid}(W_a \cdot x + b_a) \odot (W_l \cdot x + b_l)
\end{equation}
Here, $\odot$ denotes the element wise multiplication, $W_a, b_a$ are the weights and biases for the gated mechanism, and $W_l, b_l$ are the weights and biases for the linear transformation.

\iffalse
% removed to adhere to page limit
The sigmoid gating mechanism can learn to turn features on and off adaptively to learn the features that are important and focus on them. It makes the feature maps sparse by not focusing on less important features \cite{LIU2022593}. In the encoder-decoder like architecture in U-Net3+, the earlier layers capture local features and the deeper layer capture global features. GLUs are placed across multiple layers to adaptively learn to gate features at different scales and balance the need between local and global details. It helps the model learn local features like edges of the nuclei, texture or tissue segment and long-range global dependencies like shape and arrangement of cluster of nuclei.
\fi

\iffalse
% removed to adhere to page limit
\subsubsection{Shift Invariant Convolutions}
Max pooling is often used in U-Net based models for down-sampling feature maps. It selects the maximum value from each pooling window, discarding all other information. This abrupt down-sampling of feature maps can introduce aliasing artifacts. These artifacts can distort high-frequency components of the image, which could affect the model's learning ability, particularly in capturing fine details. This can be solved by using max blur pooling, where a blurring operation is implemented after the pooling step to smooth out the feature map. By smoothing the feature map, max-blur pooling can not only reduce the aliasing artifacts that are commonly introduced by abrupt down-sampling in max-pooling but also increase the receptive field and enhance scale invariance.
\fi

\subsubsection{Further Refinements}
\begin{itemize}
	\item Shift Invariant Convolutions:
	To address aliasing artifacts from max pooling, we implemented max blur pooling. This approach, involving a blurring step post-pooling, smooths feature maps to reduce aliasing and enhance the model's ability to capture fine details.
	\item DropBlock Regularization:
	DropBlock is a structured form of dropout that removes contiguous regions from the feature maps. By enforcing the model to fill in missing regions based on surrounding context, DropBlock encourages the network to learn more robust and generalized feature representations. 
	\item Trainable Guided Filter:
	The trainable guided filter \cite{wu2019fast} offers a sophisticated mechanism for refining segmentation masks by leveraging edge-preserving filtering. It takes the coarse segmentation map produced by the network and refines it using the original input image as a guide. The guided filter performs smoothing within regions while preserving edges, effectively capturing both the global structure and local details of the targeted nuclei without introducing the halo artifacts.
\end{itemize}

\subsubsection{Combined Loss Function}

The model is trained using a combined loss function that integrates the weighted average dice loss and the focal loss \cite{lin2018focal}. Weighted dice loss is commonly used in biomedical imaging applications where certain regions or features occur infrequently. The focal loss, on the other hand, focuses more on the hard examples and down-weights the easy ones, thereby allowing the model to focus on more challenging regions. $L_{seg}$ is the loss used in the experiments, where $\alpha$ and $\beta$ are the weighting parameters of the respective loss components.
%The mathematical formulation of dice loss is computed using \equationref{eq:lossDice}. \equationref{eq:lossFocal1,eq:lossFocal2} are used to compute the focal loss when the ground truth is 1 and 0 respectively. \equationref{eq:lossSeg} denotes the hybrid loss used for training the model. Here, N is the total number of pixels and $y_i, p(y_i)$ represents ground truth and confidence predicted level for $i^{th}$ pixel. $\gamma$ is the focusing parameter and $\alpha_i$ is the balancing factor. The value of $\alpha=\beta=0.5$ and $\gamma=2$.

\begin{equation}
	\label{eq:lossSeg}
	L_{seg} = \alpha L_{dice} (y, p(y)) + \beta L_{focal} (y, p(y))
\end{equation}

\iffalse
\begin{equation}
	\label{eq:lossDice}
	L_{dice}=1-\ \frac{2\sum_{i}^{N}{y_i.p(y_i)}}{\sum_{i}^{N}{\left|y_i\right|^2+\ }\sum_{i}^{N}{\left|{p(y}_i)\right|^2\ }}
\end{equation}

\begin{equation}
	\label{eq:lossFocal1}
	L_{Focal}=-\sum_{i}^{N}{a_i.\left(1-p\left(y_i\right)\right)^\gamma\times\log{\left(p\left(y_i\right)\right)}\ \ if\ y_i=\ 1}
\end{equation}

\begin{equation}
	\label{eq:lossFocal2}
	L_{Focal}=-\sum_{i}^{N}{a_i.\left(p\left(y_i\right)\right)^\gamma\times\log{\left(1-p\left(y_i\right)\right)}\ \ if\ y_i=0}
\end{equation}
\fi

\section{Experiments and Results}
The experiments were conducted on an NVIDIA A6000 GPU and an Intel i7 6700 CPU, running the Ubuntu 22.10 operating system and using the PyTorch framework. The images in the training set were split into training and validation sets with a ratio of 8:2 and were then augmented. The batch size was set to 16 and the number of epochs to 50. Several experiments were conducted, and the optimal learning rate was empirically found to be 1e-6.
For the performance evaluation of the model, the widely used evaluation criteria of Intersection-over-Union (IoU) \cite{rezatofighi2019generalized} and Dice Score \cite{Eelbode_2020} were used across experiments. The Dice Score computes the overlap between the predicted segmentation mask (A) and the ground truth mask (B). The IoU metric evaluates the quality of the object-level segmentation by calculating the overlap between the predicted and ground truth masks for each class and then averaging these overlaps.

\iffalse
\begin{figure}[htbp]
	% Caption and label go in the first argument and the figure contents
	% go in the second argument
	\floatconts
	{fig:example4}
	{\caption{Example Image}}
	{\includegraphics[width=0.9\linewidth]{example-image}}
\end{figure}
\figureref{fig:example4} provides a performance comparison between the stock U-Net3+ model and an optimized U-Net3+ variant with tailored network capacity and feature channels. While the stock U-Net3+ struggled with training inefficiency due to its inherent complexity, the optimized version demonstrated improved performance and training efficiency, despite retaining the same input and hyperparameter settings. This highlights the effectiveness of capacity optimization and feature channel selection validating the adaptation of U-Net3+ for nuclei segmentation.
\fi

\iffalse
\tableref{tab:table2} shows the advantage of employing a hybrid loss function over using Dice loss alone. The hybrid loss, which combines weighted Dice and focal loss, yielded superior performance metrics: a Dice coefficient of 0.8794 and an mIoU of 0.7879, compared to 0.8494 and 0.7257 respectively when using Dice loss alone. This improvement demonstrates the contribution of focal loss in enhancing the model's ability to focus on challenging regions, thereby boosting overall segmentation performance.
\fi

Our experiments demonstrate a significant improvement in nuclei segmentation performance across various datasets, as evidenced by \tableref{tab:table3} and \tableref{tab:table5}. Our optimized model shows superior results compared to previous and current state-of-the-art methods, including U-Net, U-Net++, CFNet, MBUTransNet, nnUNet, SAM, and AutoSAM. Specifically, on the MoNuSeg test images, our model achieves the highest Dice coefficient of 0.8902 and mIoU of 0.7924, outperforming other methods. This performance is not isolated to the MoNuSeg dataset but extends to the CPM-17 and CoNSeP datasets, where our model outperforms other approaches with Dice scores of 0.9325 and 0.8172, and mIoU scores of 0.8776 and 0.7257, respectively. The model was trained separately on each dataset.

\begin{table}[htbp]
	\floatconts
	{tab:table3}%
	{\caption{Comparison on results obtained from MoNuSeg test images using previous methods. Underlined values denote the baseline results.}}%
	{\begin{tabular}{ccc}
			\bfseries Method & \bfseries Dice & \bfseries mIoU\\
			\hline
			U-Net \cite{ronneberger2015unet} & 0.7943 & 0.6599\\
			U-Net++ \cite{zhou2018unet} & 0.7949 & 0.6604\\
			CFNet \cite{ZHAN2023104112} & 0.7987 & 0.6668\\
			MBUTransNet \cite{Qiao2023} & 0.8160 & 0.6902\\
			U-Net3+ \cite{huang2020unet} & \underline{0.8260} & \underline{0.7039}\\
			nnUNet \cite{isensee2018nnunet} & 0.8031 & 0.6781 \\
			SAM \cite{kirillov2023segany} & 0.6950 & 0.6187\\
			AutoSAM \cite{shaharabany2023autosam} & 0.8242 & 0.7017\\
			\bfseries Ours & \bfseries 0.8902 & \bfseries 0.7924\\ \hline
			
	\end{tabular}}
\end{table}



\begin{table}[htbp]
	\floatconts
	{tab:table5}%
	{\caption{Comparison of proposed model performances across datasets. Underlined values denote the baseline results.}}%
	{\begin{tabular}{ccccccc}
			\bfseries Models & \multicolumn{2}{c}{\bfseries MoNuSeg} & \multicolumn{2}{c}{\bfseries CPM-17} & \multicolumn{2}{c}{\bfseries CoNSeP} \\
			\hline
			& \bfseries Dice & \bfseries mIoU & \bfseries Dice & \bfseries mIoU & \bfseries Dice & \bfseries mIoU \\
			U-Net & 0.7943 & 0.6599 & 0.8312 & 0.7759 & 0.7192 & 0.6260 \\
			U-Net++ & 0.7949 & 0.6604 & 0.8471 & 0.7891 & 0.7416 & 0.6433 \\
			U-Net3+ & \underline{0.8260} & \underline{0.7039} & \underline{0.8619} & \underline{0.8042} & \underline{0.7784} & \underline{0.6829} \\
			\bfseries Ours & \bfseries 0.8902 & \bfseries 0.7924 & \bfseries 0.9325 & \bfseries 0.8776 & \bfseries 0.8172 & \bfseries 0.7257 \\ \hline
	\end{tabular}}
\end{table}

\begin{figure}[htbp]
	\floatconts
	{fig:qualitative}
	{\caption{Comparison of segmentation results of our proposed model on MoNuSeg dataset with U-Net3+.}}
	{\includegraphics[width=0.8\linewidth]{figs/qualitativeImage3.png}}
\end{figure}

The results underscore the effectiveness of our approach, which incorporates techniques such as stain normalization, simplex noise, GLU, max blur pooling, DropBlock and a trainable guided filter. These enhancements contribute to the model's ability to more accurately distinguish between nuclei and surrounding tissue, even in the presence of staining variability and complex tissue morphology. \tableref{tab:table3} and \tableref{tab:table5} present the comparative quantitative analysis, while \figureref{fig:qualitative} illustrates the qualitative superiority of our approach. We also performed a statistical analysis using a two-sample t-test (Welch's t-test for unequal variances) and obtained a p-value of approximately 0.0134. A p-value $<$ 0.05 suggests that the performance difference observed between our model and the base model is unlikely to have occurred by chance and that the improvement of our model is statistically significant \cite{Fu2024TSCANetTB}.

\section{Conclusion}


In this study, we introduced an enhanced U-Net3+ model tailored to nuclei segmentation in H\&E stained histopathology images. We enhanced the performance of the U-Net3+ model through a series of optimizations, including adaptive feature selection through Gated Linear Units (GLUs), max blur pooling for scale and position invariance, DropBlock regularization to mitigate overfitting and a fast trainable guided filter for efficient learning. Furthermore, we applied stain normalization to achieve consistency across images and utilized advanced data augmentation techniques to expand the training dataset. Our methodology significantly outperformed previous methods, including the existing state-of-the-art models. This was achieved by addressing key challenges such as data imbalance with a hybrid combined loss function that enhanced the model's sensitivity to varying object sizes and class imbalances. Despite the promising results, the segmentation of overlapping and clumped nuclei in H\&E stained images remains a challenge. Future work will focus on developing a more intricate model architecture that takes advantage of foundational and task-specific models using instance segmentation. Additionally, we plan to explore the potential of transformers and promptable segmentation for further advancements in nuclei segmentation. Our study demonstrates that strategic optimizations can lead to significant improvements in histopathological image analysis, thereby laying a critical groundwork for more accurate cancer diagnosis and informing potential treatment pathways.

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research was supported by BL Science grants (202301630001).}


\bibliography{midl23_152}


\appendix

\section{Ablation Studies}
\begin{figure}[htbp]
	\floatconts
	{app:figablation}
	{\caption{An overview of the ablation study which details the enhancements performed on the base model.}}
	{\includegraphics[width=0.9\linewidth]{figs/ablationFlow3.png}}
\end{figure}

This section meticulously evaluates the incremental contribution of each model design choice proposed as an enhancement to the baseline. Subsection A.1 details the experiments conducted for the selection of the baseline; similarly, Subsections A.2, A.3 and A.4 detail the effectiveness of the feature channel configuration, DropBlock with max blur pooling, and stain normalization, respectively. Subsections A.5 and A.6 detail the experiments and verify the effectiveness of the activation and loss functions implemented. Finally, Subsection A.7 shows the effectiveness of the fast trainable guided filter in the model.


\subsection{Baseline Model Performance}
The baseline performance comparison between U-Net3+, ELU-Net \cite{elu_net} and nnUNet \cite{isensee2018nnunet} on MoNuSeg test images establishes U-Net3+ as superior, with an average Dice score of 0.826 compared to ELU-Net's 0.808 and nnUNet's 0.803. \tableref{app:table0} reports the performance of the models across all the test image cases. This highlights U-Net3+'s efficiency in capturing nuanced features across various tissue types compared to ELU-Net and nnUNet \cite{isensee2018nnunet}, setting a robust foundation as the baseline for further enhancements.

\begin{table}[htbp]
	\floatconts
	{app:table0}%
	{\caption{Performance in terms of Dice scores of MoNuSeg test images on U-Net3+, ELU-Net and nnUNet for creating baseline}}%
	{\begin{tabular}{ccccccccccc}
			\bfseries{Model} & \bfseries{0} & \bfseries{1} & \bfseries{2} & \bfseries{3} & \bfseries{4} & \bfseries{5} & \bfseries{6} & \bfseries{7} & \bfseries{8} & \bfseries{9} \\ \hline
			U-Net3+ & 0.825 & 0.905 & 0.885 & 0.917 & 0.751 & 0.786 & 0.786 & 0.886 & 0.887 & 0.849\\
			ELU-Net & 0.804 & 0.896 & 0.881 & 0.898 & 0.712 & 0.777 & 0.79  & 0.868 & 0.874 & 0.811\\ 
			nnUNet & 0.796 & 0.900 & 0.891 & 0.892 & 0.686 & 0.782 & 0.807  & 0.863 & 0.874 & 0.787\\ 	\hline
			\end{tabular}}
		
	{\begin{tabular}{ccccc}
			 \bfseries{10} & \bfseries{11} & \bfseries{12} & \bfseries{13} & \bfseries{AVG} \\ \hline
			 0.904 & 0.578 & 0.839 & 0.758 & \bfseries 0.826\\
			 0.859 & 0.51  & 0.842 & 0.784 & \bfseries 0.808\\ 
			  0.828 & 0.455  & 0.858 & 0.823 & \bfseries 0.803\\ \hline
			\end{tabular}}
\end{table}




\subsection{Feature Channel Configurations}
\tableref{app:table1} reports the performance of various feature channel configurations in the U-Net3+ model adapted for MoNuSeg data. The table compares the number of parameters (`\#params') and the Dice coefficient (`Dice') for each configuration. The configuration [32, 64, 128, 128] was found to be the most effective, which means that the first layer has 32 feature channels, the second layer has 64, the third layer has 128 and the fourth layer has 128. The optimal configuration demonstrates that a strategic increase in channel depth at later stages can enhance segmentation accuracy without excessively inflating the model's parameter count. This suggests an effective method for maximizing performance while maintaining computational efficiency.

\begin{table}[htbp]
	\floatconts
	{app:table1}%
	{\caption{Model performance on different configurations of feature channel combinations}}%
	{\begin{tabular}{ccc}
			\bfseries Configuration & \bfseries \#params & \bfseries Dice\\
			\hline
			16, 32, 64, 128 & 1.02M & 0.8483\\
			32, 32, 64, 128 & 1.26M & 0.8372\\
			32, 64, 64, 128 & 1.34M & 0.8443\\
			32, 32, 128, 128 & 1.50M & 0.8493\\
			32, 64, 128, 128 & 1.60M & 0.8494\\ 
			32, 64, 128, 256 & 2.2M & 0.8239 \\ \hline
	\end{tabular}}
\end{table}


\subsection{DropBlock and Max Blur Pooling Implementations}
The comparison of dropout strategies underscores DropBlock's effectiveness over traditional dropout in spatially structured data like histopathological images. Integrating DropBlock, along with Max Blur Pooling (MBP), leads to the highest Dice and mIoU scores in \tableref{app:table2}, indicating its pivotal role in enhancing model generalization and mitigating overfitting by encouraging spatially distributed feature learning.
\begin{figure}[htbp]
	\floatconts
	{app:fig1}
	{\caption{Visualization of DropBlock. (a) Input image to the network. The blue regions in (b) and (c) include the activation units which contain semantic information in the input image. Dropping out activations at random is not effective in removing semantic information because nearby activations contain closely related information. Instead, dropping contiguous regions can remove semantic information (that is, the whole nucleus or the area surrounding a nucleus) and consequently forces the remaining units to learn features for classifying the input image.}}
	{\includegraphics[width=0.7\linewidth]{figs/dropblock.png}}
\end{figure}

\begin{table}[htbp]
	\floatconts
	{app:table2}%
	{\caption{Model performance on different dropout strategies}}%
	{\begin{tabular}{ccc}
			\bfseries Feature & \bfseries Dice & \bfseries mIoU\\
			\hline
			None & 0.8188 & 0.6946 \\
			Dropout & 0.8361 & 0.7164\\
			DropBlock & 0.8488 & 0.7246\\
			Dropout+MBP & 0.8379 & 0.7168\\
			DropBlock+MBP & 0.8494 & 0.7251\\ \hline
			\end{tabular}}
\end{table}

\subsection{Stain Normalization and its Need}


The analysis of stain normalization techniques underscores the critical role of selecting an appropriate target image for both Macenko \cite{macenko} and Reinhard normalization, as shown in \figureref{app:fig2a}. The target image was chosen after meticulous evaluation of the dataset to ensure compatibility with the majority of images. \figureref{app:fig2b} illustrates the process, showing the original, target, and Reinhard-normalized images. This approach emphasizes the importance of a careful selection process to maintain consistency across the dataset. The normalization techniques are quantitatively compared in \tableref{app:table3}, which demonstrates the effectiveness of Reinhard normalization and highlights its suitability for preserving histological details essential for accurate segmentation.


\begin{figure}[htbp]
	\floatconts
	{app:fig2a}
	{\caption{Visualization of normalized images and the need for selecting the correct target image. (a) The original image, (b) Macenko-normalized image, (c) Reinhard-normalized image. The green circle highlights the ambiguous region in the image where the normalization can make it more difficult to segment the image. Therefore, the target image for normalization must be selected carefully.}}
	{\includegraphics[width=0.75\linewidth]{figs/stain_0.png}}
\end{figure}

\begin{table}[htbp]
	\floatconts
	{app:table3}%
	{\caption{Model performance on stain normalization}}%
	{\begin{tabular}{ccc}
			\bfseries Feature & \bfseries Dice & \bfseries mIoU\\
			\hline
			None & 0.8263 & 0.7059 \\
			Macenko & 0.8361 & 0.7164\\
			Reinhard & 0.8494 & 0.7251\\ \hline
			\end{tabular}}
\end{table}
\begin{figure}[htbp]
	\floatconts
	{app:fig2b}
	{\caption{Stain normalization. (a) Unnormalized images, (b) target image, (c) Reinhard-normalized images.}}
	{\includegraphics[width=0.8\linewidth]{figs/stain.png}}
\end{figure}


\subsection{Activation Function Selection}
We perform several experiments to compare the effectiveness of the GLU activation against ReLU, Leaky ReLU, and Swish in terms of Dice and mIoU scores, as reported in \tableref{app:table4}. The superior performance of GLU shows its ability to adaptively filter and propagate relevant features through the network and underlines its utility in complex segmentation tasks, suggesting a promising direction for enhancing model sensitivity and specificity.

\begin{table}[htbp]
\floatconts
{app:table4}%
{\caption{Model performance comparison with ReLU, Leaky ReLU, Swish and GLU activation functions}}%
{\begin{tabular}{ccc}
		\bfseries Activation Function & \bfseries Dice & \bfseries mIoU\\
		\hline
		ReLU & 0.8488 & 0.7246 \\
		Leaky ReLU & 0.8494 & 0.7251 \\
		Swish & 0.8486 & 0.7244\\ 
		GLU & 0.8794 & 0.7879\\ \hline
		\end{tabular}}
\end{table}
The sigmoid gating mechanism in GLU can learn to turn features on and off adaptively, so that the network focuses on the features that are important. It makes the feature maps sparse by not focusing on less important features \cite{LIU2022593}. In an encoder-decoder architecture such as U-Net3+, the earlier layers capture local features and the deeper layers capture global features. GLUs are placed across multiple layers to adaptively learn to gate features at different scales and to balance the need between local and global details. This helps the model learn local features, such as the edges of nuclei, texture or tissue segments, as well as long-range global dependencies, such as the shape and arrangement of clusters of nuclei.
		
\subsection{Loss Function Comparisons}
The assessment of various loss functions, including BCE, Dice, Focal and combined Focal and Dice, illustrates the superior performance of the combined Focal and Dice loss, which achieves the highest Dice and mIoU scores (\tableref{app:table5}). This combination effectively balances the model's attention between prevalent and rare segmentation targets, optimizing the learning process towards challenging regions and improving overall segmentation accuracy.
\begin{table}[htbp]
	\floatconts
	{app:table5}%
	{\caption{Model performance on various loss functions}}%
	{\begin{tabular}{ccc}
			\bfseries Loss Functions & \bfseries Dice & \bfseries mIoU\\
			\hline
			BCE & 0.8436 & 0.7243 \\
			Dice & 0.8541 & 0.7289 \\
			Focal & 0.8694 & 0.7779\\ 
			Focal + Dice & 0.8807 & 0.7897\\ \hline
			\end{tabular}}
\end{table}

\subsection{Fast Trainable Guided Filter Block}
A fast trainable guided filter \cite{wu2019fast} was added at the end of the network, before the sigmoid layer. As indicated in \tableref{app:tableguided}, the inclusion of the guided filter increased both Dice and mIoU scores. A visual comparison of the predictions with and without the fast guided filter is shown in \figureref{app:figGuided}.
\begin{figure}[htbp]
	\floatconts
	{app:figGuided}
	{\caption{Visualization for the effects of guided filter. (a) input image, (b) overlayed target image, (c) predicted image before guided filter and (d) predicted image after using guided filter.}}
	{\includegraphics[width=\linewidth]{figs/guidedFilter2.png}}
\end{figure}
\begin{table}[htbp]
	\floatconts
	{app:tableguided}%
	{\caption{Model performance with and without the guided filter}}%
	{\begin{tabular}{ccc}
			\bfseries Feature & \bfseries Dice & \bfseries mIoU\\
			\hline
			w/o Guided Filter & 0.8807 & 0.7897\\
			w/ Guided Filter & 0.8902 & 0.7924\\ \hline
			\end{tabular}}
\end{table}




\end{document}
