\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{xcolor}
\definecolor{cr}{RGB}{0, 0, 0}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- nnn}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

\title[HoVer-NeXt]{HoVer-NeXt: A Fast Nuclei Segmentation and Classification Pipeline for Next Generation Histopathology}

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship

\midlauthor{\Name{Elias Baumann\nametag{$^{1}$}}
\Email{elias.baumann@unibe.ch} \\
\Name{Bastian Dislich\nametag{$^{1}$}} \\
\Name{Josef Lorenz Rumberger\nametag{$^{2,3}$}}
\Email{JosefLorenz.Rumberger@mdc-berlin.de} \\
\Name{Iris D. Nagtegaal\nametag{$^{4}$}} \\
\Name{Mar\'ia Rodr\'iguez Mart\'inez\nametag{$^{5}$}} \\
\Name{Inti Zlobec\nametag{$^{1}$}} \\
\addr $^{1}$ Institute of Tissue Medicine and Pathology, University of Bern, Bern, Switzerland \\
\addr $^{2}$ Max-Delbrück-Center for Molecular Medicine in the Helmholtz Association, Berlin, Germany\\
\addr $^{3}$ Humboldt-Universität zu Berlin, Faculty of Mathematics and Natural Sciences, Berlin, Germany\\
\addr $^{4}$ Department of Pathology, Radboud Institute for Molecular Life Sciences, Radboud University Medical Center, Nijmegen, The Netherlands \\
\addr $^{5}$ Yale School of Medicine \\
\vspace{-8mm}
}
%
\begin{document}
\maketitle
\vspace{-6mm}
\begin{abstract}
In cancer, a variety of cell types, along with their local density and spatial organization within tissues, play a key role in driving cancer progression and modulating patient outcomes. At the basis of cancer diagnosis is the histopathological assessment of tissues, stained by hematoxylin \& eosin (H\&E), which gives the nuclei of cells a dark purple appearance, making them particularly distinguishable and quantifiable. The identification of individual nuclei, whether in a proliferating (mitosis) or resting state, and their further phenotyping (e.g. immune cells) is the foundation on which histopathology images can be used for further investigations into cellular interaction, prognosis or response prediction. To this end, we develop an H\&E-based nuclei segmentation and classification model that is both fast (1.8s/mm$^2$ at 0.5mpp, 3.2s/mm$^2$ at 0.25mpp) and accurate (0.84 binary F1, 0.758 mean balanced accuracy), which allows us to investigate the cellular composition of large-scale colorectal cancer (CRC) cohorts. We extend the publicly available Lizard CRC nuclei dataset with a mitosis class and publish further validation data for the rarest classes: mitosis and eosinophils. Moreover, our pipeline \textcolor{cr}{is 5$\times$ faster than the CellViT pipeline, 17$\times$ faster than the HoVer-Net pipeline, and performs competitively on the PanNuke pan-cancer nuclei dataset (47.7 mPQ$_{Tiss}$, +3\% over HoVer-Net)}. Our work paves the way towards extensive single-cell information directly from H\&E slides, leading to a quantitative view of whole slide images. Code, model weights, as well as all additional training and validation data are publicly available on \href{https://github.com/digitalpathologybern/hover_next_inference}{GitHub}. 
\end{abstract}

\begin{keywords}
Panoptic segmentation, Nuclei segmentation and classification, Deep Learning, Histopathology, Colorectal Cancer
\end{keywords}

\section{Introduction}

% cancer is important 
Histopathological assessment of tissues is a cornerstone for the diagnosis and prognosis of diseases, including cancer. Among the most frequent and deadly is colorectal cancer (CRC), for which the overall 5-year survival rate is only around 65\% \cite{Siegel2023CancerS2}. 
% why do we care about cells
Tissue biomarkers play a crucial role in improving prognostication and designing more personalized treatments for individual patients. %Such biomarkers may include counts, densities, or even the spatial relationships between different cell types composing the diseased tissue and in most cases, are still being performed manually by pathologists using a microscope. For example, eosinophils (a particular type of immune cell) are counted multiple times in eosinophilic esophagitis  \cite{Dellon2018UpdatedIC}, proliferating cells, namely mitoses are quantified in breast cancer  \cite{ukasiewicz2021BreastCR}, the number of intraepithelial lymphocytes is being assessed in celiac disease \cite{Ludvigsson2014DiagnosisAM} and in CRC, single tumor cells or small tumor cell clusters called “tumor buds” are scored as additional prognostic factors \cite{Lugli2017RecommendationsFR}.
% spatial biology
%With technologies such as spatial omics or multiplex immunofluorescence imaging, more detailed descriptions of cells, their neighborhoods, and interactions can be made. These methods facilitate new discoveries and are also responsible for increased interest in spatial analysis of tissue  \cite{Page2023SpatialAO}, yet usually cohorts are small as individual slides are expensive to produce. \newline
% something about medical images and deep learning
In recent years, deep neural networks have shown promising results in biomarker prediction~\cite{kather2019deep,Bulten2019AutomatedDS} and discovery~\cite{zheng2023machine}, as well as other tasks such as segmentation and classification of biological structures~\cite{Ronneberger2015UNetCN,Havaei2015BrainTS}.
%In recent years, deep neural networks have shown promising results in segmentation, classification, and other image analysis tasks in the biomedical domain \cite{Ronneberger2015UNetCN,Havaei2015BrainTS,Bulten2019AutomatedDS}, particularly in radiology and pathology where data can be easily digitized. % and made available for training of machine learning models.
% Nuclei segmentation
One of the emerging tasks is the identification, classification, and segmentation (i.e. panoptic segmentation) of cells or more commonly their nuclei directly on routine diagnostic slides \cite{gamper2019pannuke,Graham2021LizardAL}. 
%Accurate nuclei segmentation and classification, combined with tissue type segmentation leads to a holistic view of the tissue, exact measurement of counts and areas, and assist in estimating and discovering novel biomarkers.%many of the currently implemented biomarkers.
% why is this challenging
%While some cell types can likely not be differentiated on H\&E images with the current digitization strategies, both pathologists and deep learning models can correctly identify many types of particular importantance for diseases. To recognize nuclear morphology and texture, this process can only be done at high magnification.
Most cell types can only be differentiated on H\&E images at high magnifications (i.e. 20x or 40x), because identification is based on nuclear morphology, texture and tissue context. 
% downside 1: long inference time
However, % compared to many other image analysis problems, 
Whole Slide Images (WSI) at high magnification are large with sizes above $100000\times100000$ pixels, and inference runtimes of currently available models make it infeasible to run them for clinical routine or investigations on large-scale cohorts. 
%currently available models often require a long time to process even a single WSI. This makes current models running at high magnification less feasible for the clinical routine and for large-scale cohorts to investigate new biomarkers. \newline
% related work 
Methods for panoptic segmentation include HoVer-Net~\cite{Graham2018HoverNetSS}, which uses an encoder-decoder architecture with three decoders, one for semantic and two for watershed-based instance segmentation. In comparison, the panoptic segmentation version of StarDist \cite{Weigert2022NucleiIS} has a similar architecture, but only one decoder for instance segmentation which predicts star-shaped polygon mask proposals for the individual cells and post-processes them to instances by means of a non-maximum suppression algorithm.
%HoVer-Net \cite{Graham2018HoverNetSS} is one of the most popular models for panoptic nuclei segmentation. It uses an encoder-decoder architecture with three decoders for binary segmentation, horizontal and vertical (HV) center point vector prediction, and class prediction. HV maps are post-processed using watershed and classes are assigned based on per nucleus class distribution. HV maps particularly help the model differentiate between instances in crowded areas.
%Another approach to nuclei segmentation, independent of modality, is StarDist which uses star-shaped polygon proposals refined with non-maximum suppression to detect and segment nuclei \cite{Schmidt2018CellDW}. As the method by default does not assign classes, Weigert and Schmidt \cite{Weigert2022NucleiIS} extend the method with a class prediction decoder and assign classes the same way as HoVer-Net.
CellViT~\cite{hörst2023cellvit} then further improves the state of the art by using a SAM~\cite{Kirillov2023SegmentA} encoder combined with HoVer-Net's decoders. On the other hand, \citet{Tommasino2023HoVerUNetAH} propose a simplified HoVer-UNet for a 3$\times$ speedup over HoVer-Net. 
Recently, the CoNiC challenge tried to find new best practices in nuclei segmentation and classification \cite{Graham2023CoNICCP,Graham2021CoNICCN}. They found that the top three submissions, which included our own, are based on newer encoders such as EfficientNet-v2 \cite{Tan2021EfficientNetV2SM}, and tackled class imbalance via class distribution-based importance sampling and loss weighting~\cite{Weigert2022NucleiIS,rumberger2022panoptic,Zhang2022AugHoverNetAH}.
The CoNiC challenge dataset (Lizard) \cite{Graham2021LizardAL}, which is based on H\&E CRC images at $\sim$0.5mpp, includes six classes: lymphocytes, neutrophils, plasma cells, eosinophils, epithelial cells and connective-tissue cells. The post-challenge analysis by \citet{Graham2023CoNICCP} demonstrates the value of the dataset and such methods by successfully applying them to tumor grading and patient survival prediction tasks.
% Downside 2: relevant things present in the data but not in the annotations
However, Lizard does %not differentiate between neoplastic and non-neoplastic epithelium; also importantly, it does 
not consider mitoses as separate objects of interest. Rather, they are classified as epithelium, lymphocyte, neutrophil, or not at all (cf.~\figureref{fig:model_overview}~D). Mitoses are indicators of the proliferative activity of tumors and have an impact on treatment decisions for, e.g., breast or pancreatic neuroendocrine tumors~\cite{ukasiewicz2021BreastCR,Kim2016RecentUO}.\newline
%and in cancers, such as breast or pancreatic neuroendocrine tumors, have an impact on treatment decisions\cite{ukasiewicz2021BreastCR,Kim2016RecentUO}. \newline 
In summary, long inference times of publicly available models, incompatible with large-scale investigations, coupled with the absence of biologically relevant class annotations such as mitosis, motivate the subsequent contributions:
\vspace{-4mm}
\begin{enumerate}
\item We developed HoVer-NeXt (HN), an updated model based on our CoNiC challenge submission, which retains high performance on the Lizard cell types while also predicting mitoses.
\item We provide an additional mitosis training dataset, modify Lizard to include mitosis, and publish both together with additional validation sets for mitoses and eosinophils.
\item We provide a model trained on the PanNuke~\cite{gamper2019pannuke} pan-cancer panoptic segmentation dataset, which shows competitive performance.
\item We construct a highly efficient WSI inference pipeline, which achieves a 17$\times$ speedup over \textcolor{cr}{the HoVer-Net pipeline and a 5$\times$ speedup over the CellViT pipeline on whole slide inference}.
\end{enumerate} 
\vspace{-4mm}
Code for training, inference, weights for all models as well as links to data can be found here: \href{https://github.com/digitalpathologybern/hover_next_train}{github.com/digitalpathologybern/hover\_next\_train}, and here \href{https://github.com/digitalpathologybern/hover_next_inference}{/hover\_next\_inference}.
\vspace{-3mm}
\section{Methods}\label{sec:methods}
% \vspace{-3mm}
\begin{figure}[!ht]
\floatconts
  {fig:model_overview}
  {\caption{Our proposed pipeline consists of the Model (HoVer-NeXt) and a separate Stitcher that post-processes the output (A). HoVer-NeXt is trained with random sampling from Lizard-Mitosis and Mitosis, and uses a U-Net architecture with two decoders to produce raw class predictions, center-point vectors and a boundary, nuclei center and background map (B). For fast inference, tiles are pre-stitched, then post-processed and then overlaps are resolved (C). Lizard-Mitosis and Mitosis have differing distributions and strong class imbalance (D).}}
  {\includegraphics{model_overview_data.pdf}\vspace{-8mm}}%[width=0.5\linewidth]
  \vspace{-6mm}
\end{figure}
\vspace{-3mm}
\subsection{Summary of the CoNiC 2022 Submission}
Starting from the HoVer-Net model setup, we propose several simplifications and extensions.
Firstly, we combine the two instance segmentation decoders into a single decoder. The binary nuclei segmentation map is replaced by a 3-class nuclei boundary, nuclei center, background prediction map (BCB-map), which showed improved results in other modalities \cite{Caicedo2018EvaluationOD}, and can directly be used for watershed-based instance segmentation, reducing the need for additional post-processing steps. The HoVer-Net HV maps or center-point vectors are thus only used as an auxiliary task~\cite{hirsch2020auxiliary}. As the architecture, we use a U-Net~\cite{Ronneberger2015UNetCN} with an EfficientNet-V2 encoder~\cite{Tan2021EfficientNetV2SM}. To tackle the class imbalance in Lizard, we employ class-based importance sampling using per-pixel class statistics as weights and use focal loss with class weighting proportional to the inverse of the exponential moving average class prior \cite{Araslanov2021SelfsupervisedAC}.
%Per-pixel class statistics are computed over all training tiles. Then each tile receives a sampling probability based on the inverse tile-specific class distribution relative to the total class distribution. 
%In the same line, w
Finally, model outputs are post-processed with class-specific hyperparameters, as the classes differ in average object size and shape. For more details
%, and specifics on the original challenge submission, 
we refer to our previous publication for the CoNiC Challenge \cite{rumberger2022panoptic}. 
While the setup and processing steps were feasible within the scope of optimizing for the challenge metrics, they do not scale well to WSI.
%The focus of the challenge submission was to maximize the leaderboard metrics, class-averaged panoptic quality and R$^2$, within one hour processing time on the test-set tiles. 
% The task was to optimize class-averaged panoptic quality and R$^2$ within one hour processing time on the test-set tiles. Therefore, we used 16 views for test-time augmentations (TTA) and expensive post-processing steps such as convex hull-based shape filtering.% step for individual nuclei. 
% While feasible in the limited scope of the challenge, it is not scalable to large WSI cohorts, where fast inference times are key. 
Therefore, we optimize the model and embed it in a pipeline for efficient WSI inference.
\vspace{-3mm}
\subsection{HoVer-NeXt}
We updated the model with a ConvNeXt-v2~\cite{Woo2023ConvNeXtVC} encoder, which shows competitive results on a variety of benchmarks~\cite{Roy2023MedNeXtTS}.
ConvNeXt-v2 uses a larger pooling operation which we accommodate for by adding an additional upsampling step to maintain the same U-Net depth. In our experiments, we use ConvNeXt-v2 Tiny, Base and Large. 
\textcolor{cr}{We further simplify the model by replacing class based loss weighting with a standard focal loss~\cite{Lin2017FocalLF}, since data sampling is already sufficient to treat the label imbalance (Ablation: See Supp.~\ref{app:ablation}).} 
A convex-hull-based post-processing step is also removed as individual convex hull computations are computationally expensive. %, limiting the per-nuclei operations to connected components, removing small holes and area-based filtering. 
Tile-based normalization leads to artifacts in out-of-distribution tiles and is removed as well (See Supp.~\figureref{fig:app:ln_comp}). The training setup can be found in Supp.~\ref{app:model_trainig}.
Beyond these changes, we set up an easy-to-use WSI inference pipeline. 
\vspace{-3mm}
\subsection{Inference Pipeline}\label{sec:inferencepipe}
Relevant foreground area on the WSI is first identified using
%one of the first steps should be to limit the relevant area. Similar to HoVer-Net, we therefore include 
a threshold on the gray scale representation of \textcolor{cr}{the WSI thumbnail (Details: Supp.~\ref{app:fgbg_est})}. % or on the thumbnail if available. 
The model then processes tiles with overlap (8px/0.5mpp, 16px/0.25mpp) with classmap and BCB-map being compressed and stored on disk. For test-time-augmentations (TTAs), we only include HED color augmentation, mirroring and 90° rotation to avoid negative effects of augmentations \textcolor{cr}{(Details: Supp.~\ref{app:negative_aug}, Augmentation Parameters: Supp.~\ref{app:aug_params})}. Tiles are center-cropped and stitched to large regions for parallel processing. Then, based on individual class thresholds, foreground area and seed points %for each class 
are generated in the BCB map and then processed with a watershed algorithm to get nuclei instances. 
Small holes in instances are removed and false merges are resolved. Classes are assigned based on majority vote and instances are filtered based on class-specific size thresholds, which are determined via hyperparameter search on the validation set.
%Instances are then processed to remove small holes %(e.g. single pixel holes not passing the thresholds) 
%and re-analyzed with connected components to check for false instance merges. 
%Each nucleus is assigned a class by pixel majority vote
%obtaining the majority class within its boundaries
%, and then further processed with  % where too small or too big nuclei are removed.
%Class-specific thresholds are defined using hyperparameter search on the validation set. 
Finally, overlaps between large ROIs are resolved (Details: Supp.~\ref{app:resolving_overlaps}).
% Class assignments are stored in a lookup dictionary together with the center of mass of the nucleus and the instance map is stored as a compressed array.

\paragraph{Optimizing for speed:} To further optimize the inference time, storage, and memory requirements of the pipeline, the model runs at half-precision and the class output is quantized by mapping softmax outputs to integer values in the range 0--255.
Outputs are written to disk without post-processing by a separate process to ensure high GPU utilization.
Data is stored as LZ4 compressed Zarr arrays, allowing for fast compression and concurrent writes and reads. 
% The compression algorithm was selected based on a benchmark \cite{Miles2016CompressionBench} and adapted to our datatypes. Zarr arrays also allow multiple concurrent writes and reads. % check if there are citing options, otherwise put their websites.
Finally, the pre-stitching of raw inference tiles to large regions allows us to avoid resolving large numbers of overlaps, retain the option to parallelize watershed, and keep the peak memory consumption low.%er compared to having the entire raw output uncompressed in memory. 
% We also optionally split inference and post-processing to parallelize large WSI Cohort inference to maximize GPU utilization time and run post-processing on separate machines with more CPU cores and memory.
\vspace{-3pt}
\subsection{Datasets}
\paragraph{Lizard and PanNuke}
Lizard and PanNuke are publicly available H\&E panoptic nuclei segmentation datasets, one CRC specific and one pan-cancer (More details: Supp.~\ref{app:liz_pan}). To be able to compare HoVer-NeXt to our own challenge submission, we use the same 80\% train, 20\% validation split with the ``GlaS'' subset as an out-of-distribution test set. Conclusions drawn in the challenge evaluation \cite{Graham2023CoNICCP} are therefore likely to be translatable to our new models. 
%PanNuke has been used as a benchmark for other panoptic segmentation approaches so we also 
To compare our model with HoVer-UNet~\cite{Tommasino2023HoVerUNetAH}, CellViT~\cite{hörst2023cellvit} as well as HoVer-Net~\cite{Graham2018HoverNetSS}, we also include the PanNuke dataset as a benchmark. 
%Moreover, a pan-cancer model can be a good starting point for investigations in large H\&E cohorts.

\paragraph{Mitosis and Lizard-Mitosis}
We create our own dataset specifically for mitoses and extend the Lizard dataset with mitoses. To achieve this, we select 48 ROIs ($8192\times8192$px) from 11 H\&E stained CRC WSI, create a mitosis specific pHH3 immunohistochemistry restain,\textcolor{cr}{ register the images and generate ground truth by thresholds on the stain deconvolved DAB channel (see Supp.~\ref{app:regphh3})}. 
%register them using SimpleElastix. Mitosis annotations are produced using ROI-specific thresholds on the DAB channel from stain deconvolved (Hematoxylin+DAB) WSI.
To generate panoptic segmentation labels for this dataset beyond mitoses, as well as re-labeling Lizard for mitoses, we adapt a self-training routine proposed by \citet{Yang2021STMS} (see Supp.~\ref{app:self_train}).
%As we find many mitoses in Lizard, both unlabelled and wrongly labeled (\figureref{fig:model_overview} D), we also let the model predict mitoses on Lizard and use those as ground truth labels whenever there is no overlap with original labels. 
%The full class distribution for both the mitosis augmented Lizard as well as our own mitosis dataset are shown in . 
%Naturally, 
%Compared to Lizard, the mitosis dataset includes an even higher percentage of epithelial cells, as mitoses are primarily observable within neoplastic epithelial tissue \figureref{fig:model_overview} D. 
%The dataset created with the described strategy is based on 11 WSI with 48 ROIs ($8192\times8192$px). %scanned on a 3DHistech P1000.

\paragraph{Further validation: MitEval and EosEval}
Additionally, we create two holdout test sets, \textcolor{cr}{one with eosinophils manually annotated by a board-certified pathologist in 11 ROIs of CRC resection WSI, and one with 3 board-certified pathologists annotating 13 ROIs for mitosis (Supp.~\ref{app:additonal_validation}). For both datasets, we report WSI-level performance.}
% models trained on pannuke are relevant in real-world applications, to scan through large cohorts H&E cohorts to find interesting slides
\vspace{-3pt}
\subsection{Evaluation Metrics}
% Comparison to other methods binds us to use the same evaluation criteria and aggregation strategies and we therefore always include panoptic quality. 
\citet{Foucart2023PanopticQS} show that panoptic quality should be avoided for the evaluation of nuclei segmentation and classification. Panoptic quality is defined as the product of the detection F1 score and Intersection over Union (IoU) of true positives. However, the small size of nuclei makes IoU 
%-based matching 
%and IoU as a metric
too sensitive for coarse annotations. Moreover, the aggregation of IoU and F1 score 
%means that a high IoU while missing many nuclei is equal to worse segmentation but detecting all nuclei and it 
incentivizes not detecting an instance at all over misclassifying it. 
We therefore employ their guidelines, yet also report panoptic quality for comparison. For binary segmentation, we use F1-Score and Matthews correlation coefficient (MCC). For detection, we use distance-based matching (6$\mu$m@0.5mpp, 12$\mu$m@0.25mpp \cite{Sirinukunwattana2016LocalitySD}) and evaluate the detections using balanced accuracy and F1 Score. Detection metrics for Lizard are evaluated on $248\times248$ center crops to avoid having to detect nuclei with their center outside of the tile. % and are computed over all tiles compared to the per-tile approach of the PanNuke evaluation. 
For segmentation, we use Hausdorff distance, which has the advantage of also considering shape irregularities that IoU would miss~\cite{Foucart2023PanopticQS}.

% \paragraph{Binary segmentation quality}
% % more details
% Binary pixel F1 is used to evaluate the overall segmentation quality of the model irrespective of unique instances or classes. 
% \paragraph{Detection metrics}
% Matching previous papers, we also use distance based nuclei matching with 6um and 12um respectively \cite{Sirinukunwattana2016LocalitySD}. 
% With the exception of PanNuke, we evaluate detection metrics on center crops (248 for Lizard) matching the same center crop parameters afterwards used for inference. Since nuclei outside of this area are discarded during inference, and cannot be properly assessed by the model, this evaluation sets more realistic expectations on WSI performance. Furthermore, as the evaluation is based on tiles, some nuclei are only visible as few pixels precisely at the image border and are unlikely to be detected by the model. 
% Detection metrics are computed over all tiles, as compared to a per tile approach that was used in the evaluation for pannuke. This approach avoids having to compute metrics on empty tiles or with tiles that do not contain all classes. We include balanced accuracy to also include true negatives, but additionally report F1 score.
% \paragraph{Segmentation metrics}
% Again we follow the recommendations of \cite{Foucart2023PanopticQS} and use hausdorff distance for per class segmentation quality evaluation.
% The Hausdorff distance measures the longest distance from any point within one set to points in another set and thereby can be used as a measure of overlap that also takes outlier protrusions or other irregularities into account that a dice or IOU would not. However, the hausdorff distance is different at different magnifications as it is an absolute distance measure and should therefore not be used to compare between datasets.
% \paragraph{Panoptic quality}
% Lastly, we employ panoptic quality, both as per tile aggregation on PanNuke and the exact evaluation script that was used during the CoNiC challenge to enable fair comparisons. 
% we completely ignore per tile R2 because thats the dumbest idea ever.

\section{Results}
\subsection{Lizard-Mitosis}
\vspace{-4mm}
\begin{figure}[!ht]
\floatconts
  {fig:results_lizard}
  {\caption{Results on the GlaS test-set, including comparison against HN$_{CoNiC}$ for detection and segmentation metrics (A), binary segmentation (B), and F1 Score differences when using varying numbers of test-time augmentations (C). Detection results on EosEval and MitEval (D).}}
  {\includegraphics[width=\textwidth]{results.pdf}\vspace{-8mm}}%[width=0.5\linewidth]
  \vspace{-6mm}
\end{figure}
In the first experiment we compare our new HoVer-NeXt model with our CoNiC submission (referred to as HN$_{CoNiC}$) on the GlaS test set. 
All results shown in \figureref{fig:results_lizard} are obtained with 16TTA.
For binary segmentation, all three HoVer-NeXt models achieve higher F1 score and MCC, with HN$_{Large}$ having +0.005 F1-Score over HN$_{CoNiC}$ and HN$_{Tiny}$ being on par with HN$_{CoNiC}$ with 0.814 binary pixel F1 score. These improvements are reflected in an increase of +0.009 for HN$_{Large}$ over the 0.832 baseline for binary detection F1.
Considering class-specific classification, the results are more diverse, with HN$_{CoNiC}$ achieving the highest balanced accuracy in neutrophil (0.706) and plasma cell (0.65) classification. 
HN$_{Base}$ achieves the highest balanced accuracy on lymphocytes (0.867) and is on par with HN$_{Large}$ and HN$_{CoNiC}$ on connective tissue cells (0.794).
On eosinophils, we observe the largest change with HN$_{Large}$ at 0.785 (+0.074) and even HN$_{Tiny}$ (+0.064) largely increases in accuracy over HN$_{CoNiC}$ (0.711).
Epithelial cells remain more consistent, but HN$_{Large}$ achieves the highest balanced accuracy with 0.796, HN$_{CoNiC}$ the lowest with 0.789.
% Additionally, with increasing model size, the average balanced classification accuracy increases from 0.749 to 0.758 (+0.009).
For segmentation quality, HN$_{CoNiC}$ achieves the lowest Hausdorff distances across all cell types except eosinophils % with the biggest differences occuring between HN$_{Tiny}$ and HN$_{CoNiC}$ for neutrophil and eosinophil segmentation.
(\figureref{fig:results_lizard} A). 
%We also observe that ranking based on F1 score vs balanced accuracy can lead to different results, e.g. for neutrophils where HN$_{Large}$ has the highest F1 (0.313, +0.013 over HN$_{CoNiC}$) but is second in balanced Accuracy (0.627, -0.023 compared to HN$_{CoNiC}$).
Investigating EosEval, we find an increase in performance across all model sizes compared to GlaS. HN$_{Large}$ has 0.553 F1 score on GlaS, but a per region average F1 score of 0.668 on EosEval. % On EosEval, results are computed over the entire ROI and averaged across patient samples. 
%The standard deviation of all models is higher on precision (+-0.12 for HN$_{Large}$) than recall (+-0.06).
On MitEval, HN$_{Tiny}$ performs best with 0.553 F1 compared to 0.521 for HN$_{Large}$ and 0.517 for HN$_{Base}$. % HN$_{Tiny}$ in particular detects the most mitoses, while false positives are on par with HN$_{Base}$.%leading to a Recall of 0.72 and a precision of 0.545.
%Qualitatively, some false positives occur due to mitotic figures in anaphase being detected as two separate entities.%, though they are annotated as one figure. 
Evaluating the effect of TTA, we observe an initial F1 score bump when using TTAs (\figureref{fig:results_lizard} C), but less of an increase with additional views. HN$_{Base}$ has an increase of 2.4\% in F1 from zero to four TTA views, but less from eight to sixteen (+0.0004) with standard deviation decreasing slightly ($\pm$0.0039 to $\pm$0.0036). Additionally, we find that the rare cell types, eosinophils, neutrophils, and mitoses, gain the most from TTA (+2.16, +4.16, +4.87\% F1 Score with 4 TTA), whereas more common cell types only show increases of less than one percent or even a slight negative impact (Plasma cells: $-$0.35\%).
\vspace{-3mm}
\subsection{PanNuke}
We train HoVer-NeXt on PanNuke, optimize hyperparameters, and evaluate it with the PanNuke evaluation script \cite{gamper2020pannuke}. HN$_{Tiny}$ with 16 TTA has a tissue average mPQ (mPQ$_{Tiss}$) of 0.477, achieves the highest PQ for inflammatory (0.418) and dead (0.154), and improves on HoVer-Net in the epithelium (+0.024) and connective (+0.027) class, yet only reaches 0.536PQ on neoplastic (\figureref{fig:results_pannuke}).
%We also include HoVer-UNet, as \citet{Tommasino2023HoVerUNetAH} report 3 times speedup over HoVer-Net with comparable results.
\begin{figure}[!ht]
\floatconts
  {fig:results_pannuke}
  {\caption{Average results over 3-fold cross-validation on PanNuke for different models}}
  {\includegraphics[width=\linewidth]{results_pannuke.pdf}\vspace{-8mm}}%[width=0.5\linewidth]
  
\end{figure}
CellViT \textcolor{cr}{sets the current state of the art with 0.498 mPQ$_{Tiss}$ and large improvements in neoplastic and epithelial panoptic quality (+0.03, +0.09) over HoVer-Net. However, \citet{hörst2023cellvit} note that their model was only performing on par with HoVer-Net without pretraining. Moreover, we observe qualitatively that the performance does not seem to translate to WSI (Supp.~\figureref{fig:app:qualitativ_comp}). More thorough evaluation metrics for HN$_{Tiny}$ can be found in Supp.~\ref{app:additional_pan}.}
%We also qualitatively compare HoVer-Net, CellViT, and HoVer-NeXt and find that CellViT's PanNuke performance does not seem to translate to actual WSI (See Supp. \figureref{fig:app:qualitativ_comp}).
\vspace{-5mm}
\subsection{Inference Time}

We utilize five publicly available TCGA WSI (Supp.~\tableref{tab:app:tcga_speed_cases}) to evaluate inference timings at $\sim$0.24mpp and $\sim$0.5mpp for PanNuke and Lizard, respectively (for specifications, see Supp.~\ref{app:inf_specs}).
\begin{figure}[!h]
\floatconts
  {fig:speed}
  {\caption{Inference timings across different images using HN$_{Large}$ (A), for different encoder sizes at 4 TTA (B), and comparing HoVer-Net, CellViT and HoVer-NeXt (C).}}
  {\includegraphics[width=\linewidth]{speed.pdf}\vspace{-8mm}}%[width=0.5\linewidth]
  \vspace{-6mm}
\end{figure}
% Using the same method, investigate the median foreground area for the TCGA COAD/READ cohort (median=109mm2, N=576) and an internal cohort (median=332mm2, N=571), indicating that for resection specimen cohorts, the larger selected slides are likely more representative for expected runtime.
HN$_{Large}$ trained on Lizard-mitosis at 0.5mpp takes 45s to run the smallest WSI and 7:26m for the largest WSI at 0 TTA. It takes $\sim$2$\times$ longer per 4 TTA views, with the largest WSI taking 14:52m at 4 TTA, and 25:12m at 8 TTA (\figureref{fig:speed}). Using the largest slide with 4 TTA as a reference, 
%we compute seconds per square millimeter for total time which is 1.04s/mm$^2$ for HN$_{Tiny}$, 1.40s/mm$^2$ for Base and 
HN$_{Large}$ runs inference at 1.78s/mm$^2$.
For PanNuke at 0.25mpp, HN$_{Tiny}$ takes 26:52m for the largest WSI, whereas CellViT and HoVer-Net take \textcolor{cr}{02:37:39h} and 8:08:28h, respectively. Based on the largest WSI, HN$_{Tiny}$ (at 4 TTA) processes WSI at 3.22s/mm$^2$. The entire WSI test set takes 50:53m on HN$_{Tiny}$, \textcolor{cr}{04:42:31h} on CellViT, and 14:38:44h on HoVer-Net, making our pipeline \textcolor{cr}{5.6$\times$} and 17.2$\times$ faster, respectively. At the time of writing, HoVer-UNet had no WSI pipeline available, but with the reported $3\times$ speedup over HoVer-Net, it would take 4:52:54h.
\vspace{-5mm}
\section{Discussion}
To make large-scale investigations into the cellular composition of CRCs feasible, we develop a pipeline for nuclei segmentation and classification. Our model retains the predictive performance of our original CoNiC challenge submission, improves upon the detection metrics, and successfully learns the additional mitosis class using the mitosis and Lizard-mitosis datasets. 
Differences in eosinophils and plasma cells \textcolor{cr}{between HN$_{CoNiC}$ and HN$_{Large}$ and the ablation results indicate that the loss function and sampling strategy have a varied impact on rare cell types, albeit with no clear best configuration.} We also find that particularly eosinophils and neutrophils are more sensitive to color changes, but reduce this problem with TTA. 
Also, the removal of convex-hull-based post-processing likely leads to more segmentation outliers, increasing the Hausdorff distance.
Major opportunities to further increase model accuracy on small datasets such as PanNuke are large-scale pretraining~\cite{Chen2022SelfSupervisedVT, hörst2023cellvit} or semi-supervised learning approaches such as \citet{Rumberger2023ACTISID}. 
%CellViT achieves the highest mPQ$_{Tiss}$ using such large scale pretraining, so it would also be a natural next step for HoVer-NeXt development. 
Also, other published implementations of ConvNeXt-based U-Net variants such as \citet{Roy2023MedNeXtTS} could further improve results. Larger context sizes could also lead to more robust classification, in particular in the healthy vs.\ malignant case~\cite{frei2023local}. 
As demonstrated by our adaptation of ST++~\cite{Yang2021STMS}, automatic labeling of objects in histopathology by re-staining is a straightforward way of generating large labeled datasets, and even single-institute data provides sufficient variety for learning mitosis.
Finally, HoVer-NeXt is 5$\times$ faster than the state of the art and runs inference on TCGA COAD/READ (N=576) at 0.5mpp in 50 hours.
%, speeding up research and taking large steps towards feasibility in a routine diagnostic workflow.
%and improves upon it when not using large scale pretraining. 
%Running HoVer-NeXt completely without TTA would reduce inference time even more, however since out-of-distribution data is rather common in histopathology, TTA might help recover accuracy even on that data.
\vspace{-5mm}
\section{Limitations}
%Self-training via ST++ might not be the best approach to integrating a new label. However, \citet{Yang2021STMS} did not design the method for this purpose. 
Lizard contains wrongly annotated mitoses, and not all pHH3-positive objects are visibly mitoses, thereby creating a noisy dataset. Moreover, perfect annotations on H\&E are difficult, and cell types are only estimated by pathologists. Therefore, reported results will never entirely reflect the true model performance. \textcolor{cr}{Additionally, a more accurate approach for H\&E mitosis annotations on Mit-Eval could have been chosen~\cite{Aubreville2023ACM}.}
Finally, we did not use the 3-fold evaluation split for Lizard to maximize available training data and compare with our own HN$_{CoNiC}$. 
% HoVer-NeXt also does not scale linearly with TTA which indicates that IO could be further improved on. We rely on OpenSlide to directly read tiles from slides and support a variety of formats, but e.g. for the MIRAX format this creates overhead of actually stitching the tiles on the fly \cite{Gilbert2012IntroMirax}.
% Finally, the HoVer-Net speed comparison is flawed, as HoVer-Net seems to be strongly dependent on IO with any other storage operations happening simulatenously slowing it down. 
\vspace{-4mm}
\section{Conclusion}
We publish HoVer-NeXt, a fast and efficient H\&E-based nuclei segmentation and classification pipeline, allowing for investigations into cellular compositions, spatial relationships, and morphological parameters of nuclei directly on large cohorts. 
While much of current research focuses on spatial technologies such as multiplex-immunofluorescence, generating such data is still expensive. Here, HoVer-NeXt can be used for hypothesis generation and for finding interesting WSI to be further investigated using spatial technologies. Our work facilitates the next generation of histopathology and provides an important building block towards a quantitative view of WSI in clinical routine.
\clearpage
\midlacknowledgments{The results published here are in whole or part based upon data generated by the TCGA Research Network: \href{https://www.cancer.gov/tcga}{https://www.cancer.gov/tcga}. We would like to thank Sophie Lechner for her support in annotating eosinophils, and Lucine Constance Christe and Philipp Zens for annotating mitoses as additional observers. Calculations were performed on UBELIX (\href{https://www.id.unibe.ch/hpc}{https://www.id.unibe.ch/hpc}), the HPC cluster at the University of Bern.}

% This is where the content of your paper goes.  Some random
% notes\footnote{Random footnote are discouraged}:
% \begin{itemize}
% \item You should use \LaTeX \cite{Lamport:Book:1989}.
% \item JMLR/PMLR uses natbib for references. For simplicity, here, \verb|\cite|  defaults to
%   parenthetical citations, i.e. \verb|\citep|. You can of course also
%   use \verb|\citet| for textual citations.
% \item Eprints such as arXiv papers can of course be cited \cite{Hinton:arXiv:2015:Distilling}. We recomend using a \verb|@misc| bibtex entry for these as shown in the sample bibliography.
% \item You should follow the guidelines provided by the conference.
% \item Read through the JMLR template documentation for specific \LaTeX
%   usage questions.
% \item Note that the JMLR template provides many handy functionalities
% such as \verb|\figureref| to refer to a figure,
% e.g. \figureref{fig:example},  \verb|\tableref| to refer to a table,
% e.g. \tableref{tab:example} and \verb|\equationref| to refer to an equation,
% e.g. \equationref{eq:example}.
% \end{itemize}

% \begin{table}[htbp]
%  % The first argument is the label.
%  % The caption goes in the second argument, and the table contents
%  % go in the third argument.
% \floatconts
%   {tab:example}%
%   {\caption{An Example Table}}%
%   {\begin{tabular}{ll}
%   \bfseries Dataset & \bfseries Result\\
%   Data1 & 0.12345\\
%   Data2 & 0.67890\\
%   Data3 & 0.54321\\
%   Data4 & 0.09876
%   \end{tabular}}
% \end{table}

% \begin{figure}[htbp]
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{Example Image}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}

% \begin{algorithm2e}
% \caption{Computing Net Activation}
% \label{alg:net}
%  % older versions of algorithm2e have \dontprintsemicolon instead
%  % of the following:
%  %\DontPrintSemicolon
%  % older versions of algorithm2e have \linesnumbered instead of the
%  % following:
%  %\LinesNumbered
% \KwIn{$x_1, \ldots, x_n, w_1, \ldots, w_n$}
% \KwOut{$y$, the net activation}
% $y\leftarrow 0$\;
% \For{$i\leftarrow 1$ \KwTo $n$}{
%   $y \leftarrow y + w_i*x_i$\;
% }
% \end{algorithm2e}

% % Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people.}

\bibliography{midl24_256}
\include{Appendix}
\end{document}
