\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{tcolorbox}
\makeatletter
% Hack the footer of the title page, as it doesn't seem to automatically use
% the "et al." version when the title block spans across multiple pages.
%
% The redefined footer is typeset in \scriptsize and always starts with the
% copyright sign followed by the proceedings year (\@jmlryear). What follows
% depends on the \ifanonsubmission switch:
%   - anonymous submission: a fixed "Author(s) names withheld." notice;
%   - otherwise: the first author (\@firstauthor) followed by "et al." and
%     then \@reprint.
% NOTE(review): \@jmlryear, \@firstauthor, \@reprint and \ifanonsubmission are
% defined by the document class, not in this file -- confirm their exact
% output against the class source.
\renewcommand*{\@titlefoot}{\scriptsize\copyright\space\@jmlryear
    \ifanonsubmission
    \space Author(s) names withheld.\hfill
    \else
    \space \@firstauthor\space \emph{et al}.\hfill
    \@reprint
    \fi
  }
  
% Replacement for the class-internal title builder. It typesets the title and
% the author block inside a plain group (the comments below indicate that this
% replaces a vbox-based layout), presumably so that a long author block can
% break across pages -- TODO confirm against the class's original definition.
\renewcommand{\@jmlrmaketitle}{
  % specify the page style in case it spans over several pages. Not sure why this is needed
  \thispagestyle{jmlrtps}%
  % define a new label for the first page as otherwise it starts from 2 in the header
  \label{jmlrstartnew}%
  % hack the spacing to mimic what happens with a vbox:
  % an empty box followed by negative vertical space (-\parindent, then -2pt)
  \null\vspace{-\parindent}\vspace{-2pt}{
  % inside the group, make the text block span the full text width
  \hsize\textwidth
  \linewidth\hsize
  % title, wrapped in the class's pre/post title hooks
 \jmlrpretitle
  {%
    % locally make \titletag print its argument unchanged
    \def\titletag##1{##1}%
    \@title
  }%
 \jmlrposttitle
  % author block, wrapped in the class's pre/post author hooks
 \jmlrpreauthor \@author \jmlrpostauthor
}
% hack the spacing to mimic what happens with a vbox
\vspace{2pt}
}  
  
\makeatother
\usepackage{float}
\usepackage{url}
\usepackage{mwe} % to get dummy images

% Header for extended abstracts
\jmlrproceedings{MIDL}{Medical Imaging with Deep Learning}
\jmlrpages{}
\jmlryear{2021}

% to be uncommented for submissions under review
\jmlrworkshop{Short Paper -- MIDL 2021}
%\jmlrvolume{-- Under Review}
%\editors{Under Review for MIDL 2021}

\title[Common limitations of performance metrics in biomedical image analysis]{Common limitations of performance metrics \\ in biomedical image analysis}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}
\midlauthor{\Name{Annika Reinke\nametag{$^{1,2,3}$}} \Email{a.reinke@dkfz-heidelberg.de}\\
\Name{Delphi consortium on metrics}$ $\footnote{\scriptsize \textbf{Full author list:} A. Reinke, M. Eisenmann, M.D. Tizabi, C.H. Sudre, T. Rädsch, M. Antonelli, T. Arbel, S. Bakas, M.J. Cardoso, V. Cheplygina, K. Farahani, B. Glocker, D. Heckmann-Nötzel, F. Isensee, P. Jannin, C.E. Kahn, J. Kleesiek, T. Kurc, M. Kozubek, B.A. Landman, G. Litjens, K. Maier-Hein, A.L. Martel, B. Menze, H. Müller, J. Petersen, M. Reyes, N. Rieke, B. Stieltjes, R. Summers, S.A. Tsaftaris, B. van Ginneken, A. Kopp-Schneider, P. Jäger, L. Maier-Hein.\\
\textbf{Full paper and affiliations:} \cite{reinkeMetrics2021}: \url{https://arxiv.org/abs/2104.05642}}\\
\Name{Lena Maier-Hein\nametag{$^{1,2,3,4}$}} \Email{l.maier-hein@dkfz-heidelberg.de}\\
\addr $^{1}$\footnotesize Div. Computer Assisted Medical Interventions, German Cancer Research Center (DKFZ), Germany \\
\addr $^{2}$\footnotesize HIP Helmholtz Imaging Platform, German Cancer Research Center (DKFZ), Heidelberg, Germany \\
\addr $^{3}$\footnotesize Faculty of Mathematics and Computer Science, Heidelberg University, Germany \\
\addr $^{4}$\footnotesize Medical Faculty, Heidelberg University, Germany}


\begin{document}

\maketitle

\begin{abstract}
While the importance of automatic biomedical image analysis is increasing at an enormous pace, recent meta-research revealed major flaws with respect to algorithm validation. Performance metrics are key for objective, transparent and comparative performance assessment, but little attention has been given to their pitfalls. Under the umbrella of the Helmholtz Imaging Platform (HIP), three international initiatives -- the MICCAI Society's challenge working group, the Biomedical Image Analysis Challenges (BIAS) initiative, as well as the benchmarking working group of the MONAI framework -- have now joined forces with the mission to generate best practice recommendations with respect to metrics in medical image analysis. Consensus building is achieved via a Delphi process, a popular tool for integrating opinions in large international consortia. The current document serves as a teaser for the results presentation and focuses on the pitfalls of the most commonly used metric in biomedical image analysis, the Dice Similarity Coefficient (\textit{DSC}), in the categories of (1) mathematical properties/edge cases, (2) task/metric fit and (3) metric aggregation. As the framework is being compiled by a large group of experts from more than 30 institutes worldwide, we believe that it could be of general interest to the MIDL community and will improve the quality of biomedical image analysis algorithm validation.

\end{abstract}

\begin{keywords}
Segmentation, Validation, Metrics, Challenges, Good Scientific Practice.
\end{keywords}

\section{Common limitations of segmentation metrics}
Image segmentation is one of the most popular image processing tasks. An international meta-analysis showed that the chosen metrics in segmentation challenges radically influence the resulting rankings~\citep{maier2018rankings}. Although work on clinical relevance of metrics~\citep{vaassen2020evaluation} or data biases~\citep{badgeley2019deep} exists, % Although several papers highlight specific strengths and weaknesses of common metrics \citep{kofler2021DICE, vaassen2020evaluation, margolin2014evaluate},
researchers are missing guidelines for choosing the right metric for a given problem~\citep{maier2018rankings}. To address this community request, this document summarizes common pitfalls related to the most frequently used metric in medical image segmentation, namely the Dice Similarity Coefficient (\textit{DSC}) \citep{dice1945measures}. A longer version of this teaser document is available at~\citep{reinkeMetrics2021}.

\subsection{Fundamental mathematical properties}
\label{subsec:mathematics}
Awareness of a metric's mathematical properties is crucial when determining its suitability for a given task. Segmentation of small structures, such as brain lesions, is essential for many image processing applications; however, the \textit{DSC} may be inappropriate here (Fig.~\ref{fig:DSC-small}).

\begin{figure}[H]
    \floatconts
      {fig:DSC-small}
      {\vspace{-0.9cm}\caption{Effect of the \textbf{structure size} on the Dice Similarity Coefficient (\textit{DSC}). The predictions of two algorithms (\textit{Prediction 1/2}) differ in only a single pixel. In the case of a small structure (b), this has a substantial effect on the associated metric value.}}
      {\includegraphics[width=1\linewidth]{images/SmallLarge.png}}
\end{figure}

\subsection{Suitability for underlying image processing task}
\label{subsec:underlying-task}
While performance metrics are typically expected to reflect a domain-specific validation goal, segmentation metrics such as the \textit{DSC} are commonly also applied to \textit{detection and localization} tasks \citep{jager2020challengesphd}. From a clinical perspective, an algorithm covering all structures of interest (e.g.\ tumors) would be of much higher value than one producing a highly accurate segmentation for one structure but missing the others. This, however, is not reflected in the \textit{DSC} metric values, as shown in Fig.~\ref{fig:DSC-detection}.

\begin{figure}[H]
    \floatconts
    {fig:DSC-detection}
    {\vspace{-0.9cm}\caption{Effect of using a \textbf{segmentation metric for object detection}. The prediction of an algorithm only detecting one of three structures (\textit{Prediction~1}) leads to a substantially higher \textit{DSC} compared to that of another algorithm (\textit{Prediction~2}) detecting all structures.}}
    {\includegraphics[trim=0 0 0 0, clip, width=0.75\linewidth]{images/Detection.png}}
\end{figure}

\subsection{Metric aggregation}
\label{subsec:aggregation}
In international competitions, metric values are often aggregated over all test cases to produce a ranking~\citep{maier2018rankings}. Fig.~\ref{fig:na-missings} illustrates the effect of missing values.

\begin{figure}[H]
    \floatconts
    {fig:na-missings}
    {\vspace{-1.1cm}\caption{Effect of \textbf{missing values} when aggregating metric scores. Ignoring missing values can lead to a substantially higher \textit{DSC} compared to setting them to the worst possible value (here: 0).}}
    {\includegraphics[trim=0 0 0 0, clip, width=0.69\linewidth]{images/NA_DSC.png}}
\end{figure}
\vspace{-0.9cm}
\section{Conclusion}
Choosing the right metric for a specific image processing task is non-trivial. Our MIDL presentation raises awareness about some common flaws of the most frequently used segmentation metric in the biomedical image analysis community and gives best practice recommendations for choosing the most appropriate metric(s) in an application-specific manner. Details regarding the Delphi consortium compiling the recommendations will be presented in a follow-up publication soon.

\midlacknowledgments{This work was initiated by the Helmholtz Imaging Platform (HIP). It was further supported by the NIH Clinical Center Intramural Research Program, the NIH National Cancer Institute (NCI: U01CA242871) and the NIH National Institute of Neurological Disorders and Stroke (NINDS: R01NS042645).}

\bibliography{midl-samplebibliography}
\end{document}
