%\documentclass{midl} % Include author names
\documentclass{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{multirow}
\usepackage[nolist]{acronym}

\jmlryear{2024}\jmlrworkshop{Full Paper -- MIDL 2024}\jmlrvolume{-- nnn}\editors{Accepted for publication at MIDL 2024}

\title[Improved Mitosis Detection Through Rescanning and Mitosis Subtyping]{Improving CNN-Based Mitosis Detection through Rescanning Annotated Glass Slides and Atypical Mitosis Subtyping}

% {Atypical Mitosis Subtyping Improves Performance in Deep Learning-Based Mitosis Detection: Application to MIDOG22 and TCGA-BRCA Outcome Prediction}



 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
\midlauthor{\Name{Rutger H.J. Fick} \Email{fick.rutger@gmail.com}\\
\addr Tribun Health
\AND
\Name{Christof A. Bertram} \Email{christof.bertram@vetmeduni.ac.at}\\
\addr University of Veterinary Medicine Vienna
\AND
\Name{Marc Aubreville\midljointauthortext{corresponding author}} \Email{marc.aubreville@thi.de}\\
\addr Technische Hochschule Ingolstadt 
}

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }

\begin{acronym}
\acro{ROI}[ROI]{regions of interest}
\acro{MIDOG}[MIDOG]{MItosis DOmain Generalization}
\acro{HE}[H\&E]{hematoxylin and eosin}
\acro{ICPR}[ICPR]{the International Conference on Pattern Recognition}
\acro{MICCAI}[MICCAI]{Medical Image Computing and Computer Assisted Intervention}
\acro{MF}[MF]{mitotic figure}
\acro{AMF}[AMF]{atypical mitotic figure}
\acro{NMF}[NMF]{normal MF}
\acro{PHH3}[PHH3]{Phospho-Histone H3}
\acro{IHC}[IHC]{immunohistochemical}
\acro{CMC}[CMC]{canine mammary carcinoma}
\acro{MC}[MC]{mitotic count}
\acro{DA}[DA]{diagnostic archive}
\acro{TTA}[TTA]{test-time augmentation}
\acro{DETR}[DETR]{Detection Transformer}
\acro{UMC}[UMC]{University Medical Center}
\acro{FUB}[FUB]{Freie Universität Berlin}
\acro{CNN}[CNN]{convolutional neural network}
\acro{WSI}[WSI]{whole slide image}
\acro{VMU}[VMU]{University of Veterinary Medicine Vienna}
\acro{AP}[AP]{average precision}
\acro{BIC}[BIC]{Bayesian information criterion}
\acro{OOF}[OOF]{Out-Of-Focus}
\end{acronym}

\begin{document}

\maketitle


\begin{abstract}
The identification of mitotic figures (MFs) is a routine task in the histopathological assessment of tumor malignancy with known limitations for human observers.
For a machine learning pipeline to robustly detect MFs, it must overcome a variety of conditions such as different scanners, staining protocols, tissue configurations, and organ types.
In order to develop a deep learning-based algorithm that can cope with these challenges, there are two obstacles that need to be overcome: obtaining a large-scale dataset of MF annotations spread across different domains of interest, including whole slide images (WSIs) exhaustively annotated for MFs, and using the annotated MFs in an efficient training process to extract the most relevant features for classification.
Our work attempts to address both of these challenges and establishes an MF detection pipeline trained solely on animal data, yet competitive on the mixed human/animal MIDOG22 dataset, and, in particular, on human breast cancer.
First, we propose a processing pipeline that allows us to strengthen the \textit{true} scanner robustness of our dataset by physically rescanning the glass slides of annotated WSIs and registering MF positions. To enable the use of such rescans for training, we propose a novel learning paradigm tailored for labels that match partially, which allows us to account for ambiguous MF positions in the rescans caused by spurious, suboptimal fine-focus on potential MFs by the scanner. Second, we demonstrate how a multi-task learning approach for MF subtypes, including the prediction of atypical mitotic figures (AMFs), can significantly enhance a model's ability to distinguish MFs from imposters. Our algorithm, using a standard object detection pipeline, performs very competitively with an average test set F1 value across five runs of 0.80 on the MIDOG22 training set. We also demonstrate its ability to stratify overall survival on the TCGA-BRCA dataset based on mitotic density, though it falls short of reaching significance in stratifying survival based on AMFs.
\end{abstract}

\begin{keywords}
Whole Slide Imaging, Mitosis Detection, Atypical Mitosis Subtyping, Deep Learning
\end{keywords}

\section{Introduction}

%\begin{figure}[t]
%    \centering
%    \includegraphics[width=\textwidth]{atypical_mitosis.png}
%    \caption{Illustrations of normal and atypical mitotic figures. All types are equally important to identify, despite the varying appearance of the dark chromosome pattern.}
%    \label{fig:mitotic_illustrations}
%\end{figure}

The identification of morphological structures of dividing cells, known as \acp{MF}, is a routine task in the histopathologic assessment of tumor malignancy \cite{donovan2021mitotic}. \Acp{MF} are morphologically heterogeneous and can be either normal or atypical. Due to their morphological complexity, \acp{MF} are prone to be missed, or mistaken for other apoptotic or necrotic cell structures during manual assessment, resulting in significant inter-rater disagreement \cite{Meyer:2005cl,malon2012mitotic,veta2016mitosis}.
%, partially caused by a high object-level disagreement \cite{veta2016mitosis}, but also linked to a non-reproducible selection of the \ac{ROI} where pathologists count \acp{MF} \cite{Bertram2021VetPathol}.
For these reasons, and also to reduce the workload of pathologists, automatic \ac{MF} identification is a well-established computer vision task. The success of recent methods, which show an accurate and reproducible detection of \acp{MF} on \acp{ROI} was, to a large degree, fueled by the availability of highly diverse and sufficiently large datasets, such as AMIDA13 \cite{veta2015assessment}, TUPAC16 \cite{veta2019predicting}, MIDOG21 \cite{aubreville2023mitosis} and MIDOG22 \cite{aubreville2023domain}. %The TUPAC16 \cite{veta2019predicting} challenge dataset was the first to contain specimens from multiple labs that were digitized with two different \ac{WSI} scanners. Building on this idea, the dataset of the MIDOG21 challenge \cite{aubreville2023mitosis} provided \acp{ROI} digitized with four different scanners. The MIDOG22 challenge dataset \cite{aubreville2023domain} extended this diversity by including specimen from different species and organs, sampled from different labs, digitized using different scanners. 
Common to those datasets is the limitation that annotations exist only for a pre-selected \ac{ROI}, which is typically selected from the area of highest \ac{MF} density within the tumor. This restricts the data diversity of those datasets, in particular by not including areas that may contain cells, such as apoptotic or necrotic cells, that can be easily mistaken for \acp{MF} and thus can be considered hard examples for the pattern recognition problem. Practically, this means that the application to the complete \ac{WSI} can become an out-of-distribution problem for detectors solely trained on \acp{ROI}. However, annotation of whole tumor sections is a labor-intensive task. %Additionally, it is a task prone to cognitive biases (such as satisfaction of search or confirmation bias), which can lead to semantic shifts in label distribution. This warrants the use of multiple experts in the annotation process, which further increases labeling costs. 
In the field of canine histopathology, two notable datasets have been made publicly available, that, combined, provide annotations for more than fifty thousand mitotic figures, collected from 53 tumor specimens \citep{bertram2019large,aubreville2020completely}. These annotations, however, have been provided for slides scanned with a single scanner only, limiting domain generalization across scanners. Annotation of a wide range of \acp{WSI} acquired on multiple scanners is infeasible due to the high cost of skilled labor for this task. As a cost-effective alternative, our first contribution in this study is a training paradigm that allows us to re-use the efforts to exhaustively annotate these WSIs by rescanning their physical glass slides and filtering spurious ambiguous \acp{MF} in the rescans appropriately.

Recently, it has also been reported that subtyping of \acp{MF} into normal and \acp{AMF}, which indicate an aberration of the normal chromosome separation process resulting in genetic alterations, might be relevant for the calculation of additional, prognostically relevant criteria in the assessment of breast cancer \cite{ohashi2018prognostic,lashen2022characteristics}. Initial work on the automatic subtyping of \acp{MF} \cite{aubreville2023deep} has found this to be a challenging task, additionally restricted by a low inter-rater agreement. 

In this study, we introduce a pipeline for detecting and subclassifying MFs in multi-organ, multi-scanner, multi-species, and fully WSI-based settings, demonstrating that subclassifying mitoses in multi-task learning significantly enhances the performance of \ac{MF} detection. Our approach, which was trained on a diverse set of fully annotated canine samples scanned with seven different systems, uses a robust training objective that is unaffected by out-of-focus mitoses in rescans. 
Our main contributions can be summarized as: 

%In this work, we demonstrate a deep learning approach that can detect and subclassify mitotic instances in multi-organ, multi-scanner, multi-species and fully WSI-based setting. Moreover, We show that subclassifying mitoses is more than just a separable addition to classific mitotis detection pipelines: Adding \acp{MF} subclassification to \acp{MF} classification in a multi-task learning setting significantly improves the performance of the \acp{MF} detection itself. To enable detection on \acp{WSI}, we trained our pipeline on a diverse data set of fully annotated cases of canine mammary carcinoma, which were scanned using seven different scanners, and for which we derive a robust training objective that is invariant to mitoses being out of focus in a rescan.
%Our detection pipeline consists of a well-tested two-step construction. First, a detector model is used to detect likely mitotic candidates from incoming HE-stained sections. These candidates are then fed in an ensemble of classifiers, which simultaneously classify an object into 1) \acp{MF} or non-mitotic, and 2) subclassify it as normal or atypical.

\begin{itemize}
    \item We introduce a tailored training objective to counteract focusing artifacts commonly occurring when rescanning slides, allowing for the training of \ac{MF} detectors with registered slides digitized by multiple scanners. 
    \item We show that adding an atypical classification subtask regularizes the mitotic figure detection task, leading to consistently better results.
    %\item  We show that by training a SOTA \acp{MF} detector on fully-annotated \acp{WSI} of canine mammary carcinoma, we yield an algorithm with competitive performance on the MIDOG22 multi-organ challenge data set, in particular on the MIDOG21 subset of human breast cancer.
    \item We show that our detector delivers both SOTA performance on the MIDOG22 challenge set and also stratifies survival on the TCGA-BRCA breast cancer WSI dataset.
    % \item We propose an algorithmic solution for uncertainty-aware derivation of the \acp{AMF} as a prognostic marker, which, as we show, is predictive for survival on the TCGA-BRCA set.
\end{itemize}


%were the
%- mitosis detection is important
%- lots and lots of literature that makes WSI-based mitosis detection viable.
% - A great dataset is midog22 that shows generalizability for organs, scanners and species.
% - However, mitosis subtyping, in which identified mitotic object are subclassified as "normal" or "atypical", is only rarely explored in AI research [us], though it is well known to be relevant for patient prognosis[some refs].
% - In our previous work, in which we pioneered AI-based mitotic subtyping, we showed it was possible to classify mitoses as either normal or atypical with some accuracy, but did not go beyond a limited proof of concept on the TUPAC and MIDOG21 datasets.
% - In this work, we demonstrate a deep learning approach that can detect and subclassify mitotic instances in multi-organ, multi-scanner, multi-species and fully WSI-based setting.
% - Moreover, We show that subclassifying mitoses is not just a separable addition to classific mitotis detection pipelines: Adding mitosis subclassification to mitosis classification in a multi-task learning setting significantly improves the performance of mitosis detection itself.
%- To create a robust WSI-level model in breast cancer setting, we have used the previously annotated CMC dataset [cite], and rescanned the slides in 7 scanners, and registered the annotations between the rescanned WSI.
%- To further enhance robustness to a multi-organ setting, we have acquired mitosis annotations from 17 different cancer types, whose slides were also rescanned.
%- All of the mitoses in these datasets were subtypes for being normal or atypical, along with the respective phases for normal mitosies (A, B, C) and the different types of atypia can be be presented in a mitosis (E, F, G, E).
%- Our detection pipeline consists of a well-tested two-step construction. First a detector model is used to to detect likely mitotic candidates from incoming HE-stained sections. These candidates are then fed in an ensemble of classifiers, which simultaneously classify an object to be a mitosis or not, and subclassify the object into normal or atypical. 

\begin{figure}
\centering
\includegraphics[width=0.95\textwidth]{MIDL_Paper_VisualAbstract_3.pdf}
\caption{Overview: We register rescanned glass slides of canine breast cancer and employ a novel filtering paradigm to disregard unrecognizable mitotic figures (MF). We additionally employ multi-task learning with atypical MF classification. }
\end{figure}

\section{Materials and Methods}

In this section we first describe our \ac{MF} dataset in section \ref{sec:MF dataset}, followed by our WSI rescan filtering paradigm in section \ref{sec:filtering}, and finally our model training approach in section \ref{sec:pipeline}.


\subsection{Mitosis Dataset}\label{sec:MF dataset}
A clinically usable \ac{MF} detection algorithm must be robust to the varying tissue composition and quality conditions found in \acp{WSI}, as well as to domain shifts due to different scanner and organ domains. Our dataset is designed to provide robustness to these challenges:
\begin{description}
    \item[WSI Robustness] We include the \ac{CMC} dataset \cite{aubreville2020completely}, which consists of 21 \acp{WSI} exhaustively annotated for \acp{MF}.
    \item[Scanner Robustness] We rescanned a selection of the original CMC glass slides with 6 other scanners. We then transferred the \ac{MF} annotations from the original WSI to the rescans using a WSI-level registration algorithm \citep{marzahl2021robust}. To remove blurred, out-of-focus or otherwise missing \acp{MF} in the rescans we designed a custom filtering approach, which we detail in section \ref{sec:filtering}.
    \item[Organ Robustness] We acquired a secondary dataset of 159 \acp{WSI}, covering 17 different cancer types throughout different organs and animal species. As WSI and scanner robustness is obtained from the previous two datasets, here we only selected and annotated one ROI per WSI for \acp{MF}.
\end{description}

% \begin{itemize}
%     \item To create robustness to varying WSI: we include the \ac{CMC} dataset \cite{aubreville2020completely}, which consists of 21 \acp{WSI} exhaustively annotated for mitotic figures by an ensemble vote of three experts.
%     \item To enhance robustness to different scanners, we retrieved the original glass slides for the CMC dataset from the authors and rescanned a selection of them with 6 other scanners. We registered the rescans using a registration algorithm for \acp{WSI} \citep{marzahl2021robust}. for which we employed a custom heuristic filtering approach, which we detail in section \ref{sec:filtering}, to remove blurred, out-of-focus or otherwise missing mitoses in the rescans.
%     \item To enhance robustness to different organs, we acquired a secondary dataset where we obtained \acp{WSI} of 17 different cancer types throughout different organs and animal species. As scanner and WSI robustness is obtained from the previous two datasets, here we only select and annotate selected regions of interest for mitotic figures and lookalikes, using AI-assisted annotation.
% \end{itemize}
We provide an overview of our composite dataset in Table \ref{tab:dataset}. Our hold-out test set for \ac{MF} detection is the MIDOG22 training dataset \cite{aubreville2023domain}, which consists of mitotic figure annotations in regions of interest originating from multiple species, organs and scanners. 
To allow for our secondary \ac{MF} subtyping strategy, a trained pathologist also subtyped all annotated \acp{MF} into normal \acp{MF} or \acp{AMF}. We find that \acp{AMF} are \textit{rare}, representing only about 10\% of all mitotic figures. Our atypical \ac{MF} test set consists of the MIDOG21 training set \cite{aubreville2023mitosis}, whose mitotic figures were similarly subtyped into atypical and normal mitoses in previous work \cite{aubreville2023deep}.


\begin{table}[t]
    \centering
    \resizebox{\textwidth}{!}{
    \begin{tabular}{llrrrr}
        Dataset & Scanner & Resolution & No. cases & No. mitotic figures  & No. atypical mitotic figures \\\cline{1-6}
        CMC original & Aperio Scanscope & 0.25 $\frac{\mu m}{px}$ & 21 WSI & 14154& 1533\\\cline{1-6}
        \multirow{6}{*}{CMC re-scanned glass slides} & Hamamatsu HT2.0 & 0.23 $\frac{\mu m}{px}$ & 18 WSI &9724/12694& 1039\\
        & Hamamatsu S360 & 0.23 $\frac{\mu m}{px}$ & 18 WSI & 8987/12694& 940\\
        & 3DHistech Scan II & 0.25 $\frac{\mu m}{px}$ & 18 WSI &9840/12694& 1092\\
        & 3DHistech Flash III & 0.12 $\frac{\mu m}{px}$ & 18 WSI &8660/12694& 950\\
        & Philips SG360 & 0.25 $\frac{\mu m}{px}$ &  4 WSI &4393/5605& 663\\
        & Olympus VS200 & 0.25 $\frac{\mu m}{px}$ & 1 WSI & 1088/1343& 204\\\cline{1-6}
        multi tumor & 3DHistech Scan II &  0.25 $\frac{\mu m}{px}$ &  159 ROI &4670& 400
    \end{tabular}}
    \caption{Overview of our training dataset, indicating the number of all and atypical MFs. For the rescans, we show the number of MFs remaining \textit{after} our filtering scheme.}\label{tab:dataset}
\end{table}

%\subsection{Atypical Mitosis Dataset}
%\acp{MF} describe cells in the process of dividing, and can be either normal or atypical \cite{donovan2021mitotic}, which we illustrate in Figure \ref{fig:mitotic_illustrations}. A normal mitotic figure indicates it is following the natural cell division process, going through Prometaphase, Metaphase, Anaphase and finally telophase until two mature daughter cells are created. In our annotations we merge Anaphase and telophase into one group "ana/telophase". An atypical \acp{MF} indicates an irregular event, morphologically visible by abnormal chromatid distributions. We further subtype atypical mitoses into bipolar asymmetry, multipolar asymmetry, segregation and grouped together any other atypical morphologies.

% To create a dataset that is able to subclassify detected mitotic figures into their subtypes, we further subtype all annotated mitotic figures in the CMC dataset into either normal or atypical, and their respective subtypes. We show the distributions in Table \ref{tab:dataset}, where we demonstrate quantitatively that atypical mitotic figures are \textit{rare}, representing only about 10\% of all mitotic figures. To acquire atypical \acp{MF} annotations more efficiently than that base-rate, we trained a simple classifier model on these initial annotations and let this model sort the mitotic figure candidates in decreasing order of atypical probability for the annotation process in our multi-cancer dataset and the CCMCT dataset \cite{bertram2019large}, followed by a fine-grained annotation by a \ac{MF} expert. Our final training dataset statistics are shown in Table XXX.



\subsection{Training paradigm for rescanned slides}\label{sec:filtering}
%\subsection{Heuristic Filtering of Non-Interpretable Mitotic Figures in Registered Rescans}
\label{lab:filtering}
Rescanning the original CMC glass slides and transferring the \ac{MF} annotations to different scanners allows us to re-use the massive effort that was done to exhaustively annotate the over 14K \acp{MF} in the original scanner domain by 2+1 experts \cite{aubreville2020completely}. However, while the rescanned WSIs \textit{should} represent the same tissue as the original, the scanning process itself is not perfect and can result in \ac{MF} annotation errors in the rescanned scanner domains.
False annotations occur for various reasons: 1) \acp{MF} that were in focus in the original WSI are out of focus (OOF) in the rescan; 2) the scanning area has device-dependent limitations, leading to parts of the tissue not being scanned in areas of registered \acp{MF}; 3) the scanner's stitching algorithm can locally cut out mitotic figures if they lie at the intersection between two stitched patches. Missing annotations in the rescan occur when \acp{MF} that were not visible in the original scanner domain are now in focus in the rescan, but are not annotated.
We show examples of false annotations in Figure \ref{fig:filtering}. Given the scale of the \ac{MF} annotations in our CMC rescan dataset, it is infeasible to manually verify whether each registered \ac{MF} is \textit{still} interpretable as such in the rescan. To clean our dataset for such errors without expert intervention, we propose a filtering approach based on the premise that false \ac{MF} annotations in rescanned images are caused by spurious scanner artifacts that have a random distribution. We trained 10 different classification architectures on all scanner domains and merged them into one big ensemble. Knowing that this ensemble will be reasonably robust to different scanners \textit{and label noise}, we apply this ensemble to the rescanned WSI and apply a conservative threshold to mask any uncertain \ac{MF}. During the detector training, when such masked objects reside in the (larger) patch that is sampled, we opt to draw white circles over the suppressed \acp{MF} so the training ignores uncertain objects. We chose to not use this approach to pseudo-label possible missed \ac{MF} annotations in the rescans as we felt we might end up validating \ac{MF} lookalikes. 

\begin{figure}[t]
    \centering
    \includegraphics[width=0.95\textwidth]{scanner_mitosis_filtering.png}
    \caption{The first row illustrates the original and rescanned WSIs. Subsequent rows showcase examples of MFs from the original slide, followed by registered MFs from the rescans. The filtering process acceptance or rejection for training is represented by a green checkmark or red cross, respectively. MFs were rejected due to stitching artifact, being out of focus, and borderline morphology.}
    \label{fig:filtering}
\end{figure}

\subsection{Mitosis Detection Pipeline}\label{sec:pipeline}
Our detection pipeline for \ac{MF} detection is a combination of a detector network to propose \ac{MF} candidates, and an ensemble of classifiers that refines the selected candidates, which has proven to be successful in many \ac{MF} approaches \cite[e.g.][]{li2018deepmitosis, piansaddhayanaon2023recasnet}. Our detector network is a YOLOR-D6 \cite{wang2021you} and our ensemble of two classifiers consists of a DenseNet201 \cite{huang2017densely} and an EfficientNetB4 \cite{tan2019efficientnet}. We chose these architectures heuristically as those contributing most to the \ac{MF} detection F1 score. For the classifier decoder heads, in addition to the standard binary \ac{MF}/non-\ac{MF} head we add a secondary head for binary normal/atypical \ac{MF} classification, realized by a DenseNet201 network. We resampled all \acp{WSI} to $0.25 \frac{\mu m}{px}$ and determined all thresholds on the validation set.

\paragraph{Detector Training and Data Sampling}
Since we have such a large-scale, diverse and \textit{unbalanced} dataset for organ types (see Table \ref{tab:dataset} and Supplementary Table \ref{tab:multi-tumor}), it is important to guide the model training to not focus only on the majority groups. 
Therefore, we adjusted the sampling probabilities to 50\% from the base CMC dataset, and 25\% each from the rescanned CMC and multi-organ datasets, and sampled with equal probability between the subgroups of those sets.
%Within the rescanned CMC dataset, we sample equally between different scanners. Within the multi-organ dataset, we sample equally between all organs. 
We maintained a 50-50 split between \ac{MF} annotations and negative annotations while maintaining the natural atypical/normal mitosis distribution. This strategy encourages model robustness w.r.t. different scanners in the rescanned dataset and to varying organs/cancer types in the multi-organ \acp{ROI} while leveraging the trustworthy annotations of the CMC base dataset.
We trained on $1024\times1024$ px images until convergence was observed using the F1 score on the validation set, for which we assigned two cases from the CMC sets and 17 \acp{ROI} (one per tumor type) from the multi tumor set. 
%With standard YOLOR training hyperparameters, we train the detector until the F1 no longer improves for 10 epochs, where we set each epoch to consist of 4000 patches of patch size of 1024x1024 at magnification 40x. 
We used an open-source library for HE-based data augmentation \cite{faryna21}. 
%We track a composite validation set from our three datasets, where we split 2 slides that we have in common in the CMC and its rescans. We found that running the whole 2 slides per scanner was too much for just a validation step, so we chose to sample the same number of \textit{fixed} patches centered around positive an negative mitotic figure annotations per slide. 
%Aggregating the results per scanner, we can track WSI and scanner robustness. %As for the multi-organ dataset, since for some organs we only have 5 ROIs annotated, we split off one ROI per organ in the validation set.

\paragraph{Classifier Training and Data Sampling}
We adopted a similar domain sampling distribution for our classifier networks as for the detector. To train these networks, we used false positives generated by the trained detector model as negative examples, effectively utilizing the network as a refining model~\cite{li2018deepmitosis}. We optimized the ensemble weighting and decision threshold for each validation step using grid search, using the best performing model per architecture (on the validation set) out of five runs.
%We use the same data domain sampling distribution for our classifier networks as our detector network. However, while we train these networks with the positive pathologist annotations in our datasets, we only use the false positive detections generated by the trained detector model as negative. In this way, the classifiers act as refining networks for the candidates the detector network provides, and do not need to know the negatives that the detector has already discarded.
%Previous work suggests the classifier dataloader should sample the generated data distribution adversarially to the detector probablity, so that we classifier samples patches more often where the detector is most wrong \cite{piansaddhayanaon2023recasnet}. However, we did not see this had any effect on the optimal F1 score. However, what we did do is tune our early stopping technique to take into account the detector probabilities in an ensemble. This means we calculate the optimal F1 score by optimizing the ensemble weighting $\alpha$ and decision threshold $\tau$ for every validation step using gridsearch.
The second classifier head only handles the normal or atypical \ac{MF} subtyping. For this reason, we created a separate dataloader that samples \textit{only} \acp{MF} and samples normal and atypical mitoses equally. Within each category, we sampled the subtypes equally. Given that the primary head is used for early stopping, it is not expected that the second head is optimal at the same time. For this reason, after convergence of the \ac{MF} classification head, we froze the encoder and trained the secondary head until convergence. 

%For each ensemble, we find separate ensemble weights for maximizing the primary task and secondary task (if applicable) F1 scores. 

\subsection{Survival prediction on TCGA-BRCA}
To investigate the \ac{WSI}-based performance, we evaluated our detection pipeline on the breast cancer cohort (BRCA) of The Cancer Genome Atlas (TCGA) project. 
%A pathologist screened all 1062 cases to ensure sufficient tissue, staining and scanning quality, and retained 1035 cases. 
Following the diagnostic grading process of breast cancer \citep{fitzgibbons2023protocol}, we selected the area with the highest \ac{MC}, using a circular field of view spanning $2\,mm^2$.
Since the classification head for atypical/normal shows only average accuracy, we selected only the predictions with the highest confidence ($p>0.9$) for either class, reducing the number of detected \acp{MF} utilized for the next step by 42.84\%. We then calculated the \ac{AMF} to \ac{NMF} ratio per case, similar to \cite{lashen2022characteristics}. 
Due to the expected high level of inaccuracies in the \ac{AMF}/\ac{NMF} decision, we elected to calculate the ratio not only per \ac{ROI} but also per \ac{WSI} in an effort to increase the signal-to-noise ratio (SNR) of the metric.
We then fitted the survival data provided in the dataset to find the threshold which provided the strongest predictive value, as indicated by the $p$-value of the Cox proportional hazards model, and fitted a Kaplan--Meier estimator on this optimal threshold.% (both using the python lifelines package v.0.28.0). 

\section{Results}

Comparing the pure classification performance (second stage of the approach) for different network architectures, we find that adding the atypical subtyping task consistently improves the performance (see Table \ref{tab:classifier_only}).
%In Table \ref{tab:classifier_only} we compare F1-scores on the MIDOG22 test set for different classifier architectures with and without atypical \acp{MF} subtyping as a secondary task. %Aggregating the results from 5 runs per architecture, we show that adding the subtyping task improves performance on the primary task every time.
\begin{table}[]
    \centering
    \resizebox{1\textwidth}{!}{
    \begin{tabular}{l|r|r|r}
     & \multicolumn{3}{|c}{Mitosis Detection F1}\\
   Classifier Model & w/o filtering, w/o MF subtyping & w/ filtering, w/o MF subtyping & w/ filtering, w/ MF subtyping\\\cline{1-4}
         DenseNet201~\cite{huang2017densely} & $0.72\pm0.022$&$0.74\pm0.011$ & $\mathbf{0.77}\pm0.025$ \\
         EfficientNet B4~\cite{tan2019efficientnet} & $0.69\pm0.030$&$0.72\pm0.013$ & $\mathbf{0.75}\pm0.020$\\
         ResNet50~\cite{he2016deep} & $0.70\pm0.016$ &$0.71\pm0.020$ & $\mathbf{0.74}\pm0.014$
    \end{tabular}}
    \caption{Ablation study of both contributions on the second stage ($\mu\pm\sigma$ over five runs). }
%    \caption{We train different classifier model architectures with and without MF subtyping and evaluate optimal F1 performance on the validation set. The F1 scores are averaged over 5 training runs. It can be seen that the primary task \acp{MF} detection task consistently improves when mitotic subtyping is included in the training.}
    \label{tab:classifier_only}
\end{table}
%From the five runs for each of these classifier architectures, we took the ones with the highest validation set performance and ensembled them as. 
%We do this once for the models without secondary task and once with, to show the impact on an end-to-end pipeline. 
We find this to also hold true in the end-to-end performance on CMC, CMC Rescan, our multi-tumor dataset and MIDOG22 (see Table \ref{tab:trainval}).
Moreover, we find that \ac{AMF} classification is significantly more challenging for our model, as shown in the third and fourth column of Table \ref{tab:trainval}. Note that the ``Perfect Primary'' column of this table assumes perfect recognition of \ac{MF} and only evaluates the subtask of \ac{AMF} classification, whereas the ``End2End'' column reports the true end-to-end performance. Our evaluation of the precision-recall curves in the supplementary Figure \ref{fig: prauc} reveals a similar performance across scanners for breast cancer, demonstrating the scanner-robustness of our scheme, a notably high performance for the \ac{MF} recognition in mast cell tumor, and a deterioration of performance on the neuroendocrine and lung cancer tumors. Lastly, as shown in the supplementary Figure~\ref{fig:corr}, there is a strong correlation between the predicted \ac{MC} and the respective ground truth, and a less significant correlation for the \ac{AMF}/\ac{NMF} ratio.

\begin{table}[t]
    \centering
    \resizebox{\textwidth}{!}{
    \begin{tabular}{l|llcc|cc}
        \multicolumn{1}{c}{} & & & \multicolumn{2}{c}{Primary Task F1} & \multicolumn{2}{c}{Secondary Task F1}\\
        \multicolumn{1}{c}{} & & & \multicolumn{2}{c}{Mitosis Detection} & \multicolumn{2}{c}{Mitosis Subtyping }\\
        & Dataset & Scanner & w/o MF subtyping & w/ MF subtyping & Perfect Primary & End2End\\\cline{1-7}
        & Canine CMC Orig. & Aperio Scanscope & 0.774 & \textbf{0.811} & 0.815 & 0.696\\\cline{2-7}
        & \multirow{6}{*}{Canine CMC Rescan} & Hamamatsu HT2.0 & 0.783 & \textbf{0.808} &0.771& 0.649\\
        \multirow{2}{*}{\rotatebox{90}{validation}}& & Hamamatsu S360 & 0.816 & \textbf{0.848} & 0.789& 0.666\\
         & & 3DHistech Scan II & 0.781 & \textbf{0.837} &0.808& 0.697\\
        & & 3DHistech Flash III & 0.808 & \textbf{0.832} &0.806& 0.650\\
        & & Philips SG360 & 0.805 &\textbf{0.850} &0.796& 0.671\\
        & & Olympus VS200 & 0.792 &\textbf{0.840} & 0.774& 0.647\\\cline{2-7}
        & Animal MultiTumor & 3DHistech Scan II & 0.853 & \textbf{0.914} &0.742& 0.707\\\cline{1-7}
        & \multirow{5}{*}{Human MIDOG22} & Breast/Ham XR & 0.755 & \textbf{0.760} & 0.64 & 0.463 \\
         & & Breast/Ham S360 & 0.741  & \textbf{0.742} & 0.582 & 0.438 \\
        \multirow{2}{*}{\rotatebox{90}{\,\, hold-out test}} & & Breast/Aperio CS2 & 0.764 & \textbf{0.783} & 0.623 & 0.458 \\
         & & Neuroendocrine/ Ham XR & 0.626 & \textbf{0.699} & N/A & N/A\\\cline{2-7}
        & \multirow{3}{*}{Canine MIDOG22}& Lymphoma/3DHist Scan II & 0.753 & \textbf{0.804} & N/A & N/A\\
        & & Cutaneous Mast Cell/ Aperio CS2 & 0.824 & \textbf{0.859} & N/A & N/A\\
        & & Lung / 3DHist Scan II & 0.684 & \textbf{0.708} & N/A & N/A  \\ \cline{2-7}      
        & Aggregate MIDOG22 & All & 0.763 & \textbf{0.801} & 0.615 & 0.45 
    \end{tabular}}
    \caption{Detection performance of our pipeline for mitotic figure (MF) detection and subtyping across different subgroups. Subtyping performance is reported both assuming perfect MF recognition (Perfect Primary) and for the full end-to-end pipeline (End2End). Subtyping helps MF recognition in each given condition. }
%    We compare end-to-end performance for our detection pipeline for the primary MF detection task and secondary MF subtyping task for different dataset subgroups. For the CMC and MultiTumor datasets we show the results on the validation set, and MIDOG 22 is the test set. All results show the average performance over 5 training runs. In the first results column we show the F1 score of the primary classification task, without and with mitotic subtyping multi-task training for the classifier ensemble. We see that in each instance the the primary task improves when the second training task is added. In the second and third columns, we show the F1 score of the \acp{MF} subtyping. For clarity, in the first of the two columns we assume the primary task is "perfect", i.e. no non-MF objects are being fed to the secondary task. In the second of the two columns, we show the true end-to-end performance.}
    \label{tab:trainval}
\end{table}

%For the ensemble that included mitotic subtyping, we also show the sub-cohort-wise PR-AUC curves and F1 scores over varying decision threshold in Figure \ref{fig: prauc}. Note that performance for breast with different scanners is very similar, demonstrating the scanner-robustness effect of including CMC-Rescan in training. Furthermore, together with the inclusion of the smaller, ROI-wise multi-tumor dataset, performance over different organs is relatively robust, with mast cell tumors performing best and neuro-endocrine being the hardest.




%Lastly, to link object-wise prediction to biomarker quantification at ROI level, we show ROI-wise correlation of the ground truth mitotic density versus the predicted one in the supplemantary Figure~\ref{fig:corr}-left, and the ground truth atypical-to-normal ratio to the predicted one in supplemantary Figure~\ref{fig:corr}-right. We show the correlations for three decision thresholds (low-optimal-high) to illustrate the correlation remains significant even when the threshold is suboptimal on the applied dataset.




% Performance on CMC

% Performance on MIDOG22

% Surivival
When predicting on the TCGA-BRCA dataset, we found an average count of mitoses per \ac{ROI} of 26.67 and an average count of atypical mitoses within the \ac{ROI} of 1.01. The \ac{AMF}/\ac{NMF} ratio on WSIs had a mean value of 0.10, whereas the same metric, when evaluated on the \acp{ROI} of the \ac{MC}, was 0.13, indicating a slightly higher rate of \acp{AMF} within the \acp{ROI}. The survival prediction, shown in Figure \ref{fig:kaplanmeier}, shows that the \ac{MC} stratified survival into two groups. The Chi-squared test indicates significant difference ($p<0.01$) between the groups. On the other hand, even for an optimized cutoff value of 0.11, the test did not indicate significant differences between the groups stratified by the \ac{AMF}/\ac{NMF} ratio. We found the \ac{AMF}/\ac{NMF} ratio calculated on the hotspot \ac{ROI} did not stratify survival at all. 




%\subsection{Results on MIDOG22 test set}
%\begin{itemize}
%    \item maybe correlation plots of atypical/normal ratio per hotspot
%\end{itemize}

%\subsection{Prognostic results on TCGA-BRCA}


%\begin{itemize}
%    \item Binary classification atypical
%    \item Kaplan Maier Curve
%    \item AUC curve for multiple threshold values
%\end{itemize}

\begin{figure}[t]
    \centering
    \begin{minipage}{0.49\textwidth}
        \includegraphics[width=\textwidth]{MC_overall_survival_cutoff_11.pdf}
    \end{minipage}
    \hfill
    \begin{minipage}{0.49\textwidth}
        \includegraphics[width=\textwidth]{amf_os_stratification_thres_0_11.pdf}
    \end{minipage}
    \caption{Kaplan--Meier curves for the 10-year overall survival of breast cancer patients of the TCGA-BRCA dataset, for the most discriminating cutoff value. MC (left plot) discriminates subgroups significantly ($p<0.01$), while AMF/NMF ratio-based stratification (right plot) is non-significant ($p=0.10$).}
    
    \label{fig:kaplanmeier}
\end{figure}

\section{Discussion and Conclusion}
Our work shows that by using animal histopathology data, we can train a highly competitive \ac{MF} detector for human breast cancer. The training scheme, utilizing registered rescans of the glass slides, incorporated a high degree of scanner robustness, as our results show.  
While the \ac{AMF} detection itself yielded only mediocre classification rates, likely caused by the high inter-rater variability and the difficulty of the problem, it consistently regularized the model and improved performance on the primary \ac{MF} detection task. 

We note that although the training paradigm introduced in our work reduced the number of false annotations in rescanned slides, it did not aim to recover \acp{MF} that were not annotated in the primary dataset, thus increasing the risk of false negatives in the rescanned slides. Moreover, the measurable effect of rescan filtering depends on how often the rescans are sampled during training. The use of immunohistochemistry in a restaining procedure could help in the identification of OOF mitoses in future work~\cite{tellez2018whole}.

The prediction of survival on the external BRCA dataset underscores the robustness of our pipeline. While we chose the threshold for subgroup separation in the survival analysis post-hoc, the value of 11 mitoses per $2\,\mathrm{mm}^2$ is well within expectations given the current CAP guidelines~\cite{fitzgibbons2023protocol}.
In contrast to the findings of Lashen et al. \cite{lashen2022characteristics}, we did not find the \ac{AMF}/\ac{NMF} ratio to be significantly stratifying for overall survival, even for an optimized threshold value. While this could be linked to the performance of our classifier, we also observe a striking difference in the apparent perception of mitoses being atypical, which is expressed by a mean \ac{AMF}/\ac{NMF} ratio of 0.2 on TCGA-BRCA and a mean count of 2 \acp{AMF} per \ac{ROI} in the original work \cite{lashen2022characteristics}, where our overall estimates are significantly lower.

In conclusion, we demonstrated that our dual-pronged approach of using rescanned slides and MF subtyping allowed us to train a highly competitive MF detection approach using animal data only, as benchmarked on the multi-species MIDOG22 dataset. We envision that future improvements to the AMF subtyping task will allow us to find statistically significant and clinically meaningful uses for detecting atypical mitoses.  
%within expectations given the distribution of \acp{AMF} in our dataset. 

%\begin{itemize}
%    \item atypical mitosis subtyping somehow increases performance on primary task
%    \item automatic paradigm filtering thing allows utilization of rescans for high volume, high resolution objects. Refer to appendix image for blur detection on original + a rescan at different locations (FP + FN). point is that original CMC \textit{is} exhaustive VISUALLY, but in reality it is a snapshot of reality.
%    \item it is important to realize that we need BOTH the atypical subtyping strategy and the rescan filtering to achieve SOTA on midog22. only rescans does not give good results. only atypical does not give scanner robustness.
%    \item limiation is is that we define threshold on the same dataset, which is not very robust.
%    \item while atypical is not significantly stratifying on TCGA, we do find promising patterns. Atypical mitoses have differnet subtypes, of which the other/tripolar subtypes are considered \textit{more} malignant than atypical or segregation, due to the more catastrophic nature of the mitotic process failure. In future work we will explore specifically occurrences of these subtypes for patient stratification.
%\end{itemize}
%\section{Conclusion}

%Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{ M.A. acknowledges support by the German Research Foundation (project number 520330054). C.A.B. acknowledges funding by the Austrian Science Fund (FWF, project number: I 6555). }

\bibliography{midl24_138}

\appendix
%\renewcommand{\figurename}{Supplementary Figure}
%\renewcommand{\tablename}{Supplementary Table}
\renewcommand{\thefigure}{S\arabic{figure}}
\renewcommand{\thetable}{S\arabic{table}}


\setcounter{figure}{0}
\setcounter{table}{0}

\clearpage
\section{Additional performance evaluations}
In Figure \ref{fig:corr} we show correlations and scatter plots of the ground truth mitotic density and AMF/NMF ratios versus our predicted ones on the MIDOG22 dataset. For both, we show the correlation for three different decision thresholds to demonstrate the overall robustness of the correlation. In each case, the second label, corresponding to the orange scatter plot, shows the performance for the optimal threshold on the validation set, while the blue and green show a lower and a higher threshold, respectively.
\begin{figure}[h!]
    \centering
    \begin{minipage}{0.49\textwidth}
         \includegraphics[width=\textwidth]{corr_mitotic.png}
    \end{minipage}
    \hfill
    \begin{minipage}{0.49\textwidth}
        \includegraphics[width=\textwidth]{correlation_atyp.png}
    \end{minipage}
    \caption{Scatter plots showing correlation between ground truth and estimated mitotic density (left) and AMF/NMF ratios (right) for each ROI of the MIDOG22 dataset.}
    \label{fig:corr}
\end{figure}


\begin{figure}[ht]
    \centering
    \begin{minipage}{0.45\textwidth}
         \includegraphics[width=\textwidth]{prauc.png}
    \end{minipage}
    \hfill
    \begin{minipage}{0.45\textwidth}
        \includegraphics[width=\textwidth]{f1_scores.png}
    \end{minipage}
    \caption{Quantitative results of best model on MIDOG22 dataset by scanner/organ subgroups. Left: PR-AUC. Right: F1-scores over decision thresholds.}\label{fig: prauc}
\end{figure}

\clearpage

\section{Multi-Tumor Dataset Composition}
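Our in-house multi-tumor dataset consists of 17 different cancer types. We selected one ROI per WSI, for a total of 156 WSIs distributed across the cancer types. We show the precise dataset composition in Table \ref{tab:multi-tumor}.
% NOTE(review): the \#ROI column of Table \ref{tab:multi-tumor} sums to 150, not 156 -- please confirm which number is correct.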

\begin{table}[h!]
    \centering
    \begin{tabular}{llcc}
        Tumor Type & \#ROI & \#MF & \#AMF\\\cline{1-4}
AdrenocorticalTumors & 2 & 5 & 2 \\
ColonCarcinoma & 10 & 365 & 12 \\
GastrointestinalStromalTumors & 9 & 139 & 26 \\
HemangioSarcoma & 9 & 290 & 9 \\
HepaticCarcinoma & 8 & 33 & 3 \\
Lymphoma & 10 & 672 & 5 \\
MammaryCarcinoma & 8 & 363 & 12 \\
MastCellTumor & 12 & 392 & 31 \\
Melanoma & 11 & 593 & 61 \\
Meningioma & 6 & 28 & 4 \\
OsteoSarcoma & 15 & 434 & 21 \\
Pheochromocytoma & 7 & 104 & 3 \\
ProstateCarcinoma & 4 & 74 & 13 \\
PulmonaryCarcinoma & 10 & 329 & 67 \\
RenalCarcinoma & 7 & 233 & 10 \\
SoftTissueSarcoma & 13 & 170 & 10 \\
UrothelialCellCarcinoma & 9 & 371 & 35 \\
    \end{tabular}
    \caption{Overview of the number of ROIs, MFs, and AMFs annotated per tumor type in our multi-tumor training dataset.}\label{tab:multi-tumor}
\end{table}


\section*{Author contributions}
R.H.J.F. led the study design and data acquisition, trained and evaluated the models, and spearheaded the algorithm development.
M.A. performed the survival analysis and contributed to algorithm development.
C.A.B. provided annotations for the multi-tumor dataset and the mitotic phases.
All authors jointly wrote the manuscript. 


\end{document}
