\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images

% Header for extended abstracts
\jmlrproceedings{MIDL}{Medical Imaging with Deep Learning}
\jmlrpages{}
\jmlryear{2021}

% to be uncommented for submissions under review
% \jmlrworkshop{Short Paper -- MIDL 2021 submission}
% \jmlrvolume{-- Under Review}
% \editors{Under Review for MIDL 2021}

\title[LSR for Optimal Lesion Detection and Segmentation]{Optimizing Operating Points for High Performance Lesion Detection and Segmentation Using Lesion Size Reweighting}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Brennan Nichyporuk\nametag{$^{1}$}} \Email{brennan.nichyporuk@mail.mcgill.ca}\\
\Name{Justin Szeto\nametag{$^{1}$}} \Email{justin.szeto@mail.mcgill.ca}\\
\Name{Douglas L. Arnold\nametag{$^{2}$}} \Email{douglas.arnold@mcgill.ca}\\
\Name{Tal Arbel\nametag{$^{1}$}} \Email{arbel@cim.mcgill.ca}\\
\addr $^{1}$ Centre for Intelligent Machines, McGill University, MILA, Canada \\
\addr $^{2}$ Montreal Neurological Institute, McGill University, Montreal, Canada
}

% ADD MILA ENTRY TO ASSOCIATIONS?
% SLW --> LSR (Lesion Size Reweighing)
% ADD IPMSA ACKNOWLEDGEMENT 

\begin{document}
\maketitle
\begin{abstract}
There are many clinical contexts which require accurate detection and segmentation of all focal pathologies (e.g. lesions, tumours) in patient images. In cases where there is a mix of small and large lesions, standard binary cross entropy loss will result in better segmentation of large lesions at the expense of missing small ones. Adjusting the operating point to accurately detect all lesions generally leads to oversegmentation of large lesions. In this work, we propose a novel reweighting strategy to eliminate this performance gap, increasing small pathology detection performance while maintaining segmentation accuracy. We show that our reweighting strategy vastly outperforms competing strategies based on experiments on a large-scale, multi-scanner, multi-center dataset of Multiple Sclerosis patient images.
\end{abstract}

\begin{keywords}
lesion, segmentation, CNN
\end{keywords}

% Segmentation border uncertainty plays a role... 
\section{Introduction}
Many clinical contexts require accurate detection and segmentation of multiple lesions of varying sizes within a single patient image, either to diagnose or stage a disease or determine treatment efficacy \cite{doyle2017lesion}. Methods based on the UNet architecture \cite{unet} use voxel-wise loss functions to learn the appropriate segmentation output given an input MRI and a target. Although voxel-wise loss functions have proven effective to train models to produce accurate segmentations as measured by voxel-wise metrics such as DICE, they suffer from an inherent bias towards larger lesions that contain more voxels. As a result, voxel-wise loss functions typically miss smaller lesions at operating points that are favorable to voxel-wise metrics such as DICE \cite{nair2020exploring}. Reducing the detection threshold to an operating point that is more suited for detection is a feasible workaround, but this comes at the cost of over-segmenting larger lesions. Given that the optimal operating points for detection and segmentation are different, simultaneously achieving both objectives is not possible with standard loss functions. Recent research \cite{inverse-lesion-weighing} suggests that re-weighting the voxels of each lesion in a manner that is inversely proportional to that lesion's size can be an effective way to improve small lesion detection performance. Although this approach directly deals with the size imbalance between multiple lesions, it assigns equal weight to each lesion, which can be problematic in contexts such as cancer and Multiple Sclerosis, where lesions span a wide range of sizes (and can be quite small). In this work, we propose a novel weighting function that is much less prone to the training instability caused by assigning a high weight to smaller lesions that are typically much more uncertain.
Our approach closes the detection/segmentation performance gap, showing that with the right lesion reweighting strategy, high overall simultaneous detection and segmentation accuracies are possible. Through large-scale experiments on a large proprietary dataset of Multiple Sclerosis patient images, we show that the proposed method outperforms the competing baseline and several other common loss functions.

\section{Lesion Size Reweighting}
We propose a lesion weighting function, where the objective is to have the optimal detection and segmentation operating points converge by assigning more weight to small lesions than would otherwise be assigned by binary cross entropy. Although small lesions can be weighted more, they should still be assigned less weight than larger lesions, which are typically much more certain. Our conjecture is that assigning too much weight to small lesions can produce suboptimal results.

Formally, each lesion $L_j$ is assigned a weight $W_j$ that is a function of the number of voxels $|L_j|$ that comprise that lesion. In practice, weights must be assigned to individual voxels rather than individual lesions, so we also define the voxel weight $w_j$, related to $W_j$ via $w_j = \frac{W_j}{\vert L_j \vert}$. 
\begin{align}
    \label{eq:lsr-eqs}
    W_j = \vert L_j \vert + \alpha e^{-\frac{1}{\beta} (\vert L_j \vert - 1)}
    \qquad
    w_j = 1 + \frac{\alpha}{\vert L_j \vert} e^{-\frac{1}{\beta} (\vert L_j \vert - 1)}
\end{align}
where $\alpha$ and $\beta$ are hyperparameters such that $\alpha \leq \beta$ to ensure monotonicity in the weight with respect to lesion size. Background (i.e., non-lesion) voxels retain a weight of $1$.

\section{Experiments and Results}
We train a UNet architecture \cite{unet} to segment T2 lesions with binary cross entropy (BCE), weighted BCE (WBCE), focal loss (FL) \cite{focal-loss}, BCE with the proposed lesion size reweighting (BCE+LSR), and BCE with inverse weighting (BCE+IW) \cite{inverse-lesion-weighing}. Hyperparameters common to all methods (augmentation, dropout, etc.) were first tuned for our baseline BCE model. We then freeze these hyperparameters for all subsequent experiments, modifying only the loss function and learning rate. Hyperparameters for the proposed BCE+LSR loss function were tuned on a $\log_2$ scale ($\alpha=4$ and $\beta=4$ performed best in our experiments). Our dataset (train/validation/test) contains 1350/175/175 MRI scans from 575/175/175 subjects obtained over the course of a 2-year clinical trial. The train split contains 1--3 scans per subject, each taken 1 year apart. MRI sequences used include FLAIR, PDW, T2, T1, and gadolinium enhanced T1.

Figure~\ref{fig:tradeoff-curves} shows the TPR vs FDR curves and compares overall segmentation performance with detection performance for small (3--10 voxels), medium (11--50 voxels), and large (51+ voxels) lesions for the proposed BCE+LSR, as compared to BCE and BCE+IW. In the case of BCE+LSR, the optimal operating points for segmentation and detection (red and blue dots) overlap and the method performs well on both tasks. This is in contrast to BCE, for which the optimal operating points are comparatively far apart, and which shows a degree of over-segmentation at the optimal detection operating points (and under-detection at the optimal segmentation operating point, particularly for small lesions). WBCE and FL exhibited performance characteristics similar to BCE. For BCE+IW, the distance between the optimal detection and segmentation operating points is even larger, and the method significantly underperforms all others. Given the significant decrease in performance for BCE+IW relative to both BCE and BCE+LSR, we performed further analysis, which revealed that BCE+IW applied substantial weight to extremely small lesions. Since the lesion weights computed by BCE+IW ranged over several orders of magnitude, training was extremely unstable. On the other hand, using the proposed BCE+LSR, the weights remain in a reasonable range, upper bounded by $1 + \frac{\alpha}{\vert L_j \vert}$. Since smaller lesions are considerably more uncertain, using a weighting scheme with a reasonable upper bound prevented training instability.
\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
 \floatconts
  {fig:tradeoff-curves}
  {\vspace*{-6mm}\caption{TPR vs FDR curves: voxel-level segmentation and lesion-level detection. The best detection F1 operating point (blue dot) is based on the \textit{lesion detect - all sizes} curve. The best segmentation F1 operating point (red dot) is based on the \textit{voxel seg - all} curve. The closer the operating points the better. The operating points overlap for the proposed BCE+LSR (i.e. BCE+LSR achieves the highest simultaneous detection and segmentation F1).}\vspace*{-9mm}}
  {
    \subfigure[BCE+LSR]{%
        \includegraphics[width=0.31\textwidth]{figures/bce+slw-size-legend2.png}
    } % space out the images a bit
    \subfigure[BCE]{%
        \includegraphics[width=0.31\textwidth]{figures/bce-size-legend2.png}
    }
    \subfigure[BCE+IW]{%
        \includegraphics[width=0.31\textwidth]{figures/bce+iw-size-legend2.png}
    }
    \label{fig:performance-curves}
  }
\end{figure}
% Acknowledgments --- Will not appear in anonymized version
\midlacknowledgments{This work was supported by an award from the International Progressive MS Alliance (PA-1603-08175)
and by funding from the Canada CIFAR AI Chairs Program.}
\bibliography{midl-samplebibliography}


% \appendix

% \section{Proof of Theorem 1}

% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
