\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{float}
\usepackage{stmaryrd}
\usepackage{graphicx} % for \includegraphics
\usepackage{dsfont}
\usepackage{booktabs}


%\usepackage{mwe} % to get dummy images
%\jmlrvolume{-- Under Review}
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}
%\editors{Under Review for MIDL 2021}

\title[Balanced sampling for object detection]{Balanced sampling for an object detection problem - application to fetal anatomies detection}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Antoine Olivier\nametag{$^{1}$}} \Email{antoine.olivier@philips.com}\\
% \Name{Laurence Rouet\nametag{$^{1}$}} \Email{laurence.rouet@philips.com}\\
% \Name{Cybèle Ciofolo-Veit\nametag{$^{1}$}} \Email{cybele.ciofolo-veit@philips.com}\\
\and
\Name{Caroline Raynaud\nametag{$^{1}$}} \Email{caroline.raynaud@philips.com}\\
\addr $^{1}$ Philips Research, France
}

\newtheorem{rk}{Remark}[section]
% \newtheorem{proposition}{Proposition}

\DeclareMathOperator{\IoU}{IoU}

% \newcommand\overmat[2]{%
%   \makebox[0pt][l]{$\smash{\color{white}\overbrace{\phantom{%
%     \begin{matrix}#2\end{matrix}}}^{\text{\color{black}#1}}}$}#2}
% \newcommand\bovermat[2]{%
%   \makebox[0pt][l]{$\smash{\overbrace{\phantom{%
%     \begin{matrix}#2\end{matrix}}}^{\text{#1}}}$}#2}
% \newcommand\partialphantom{\vphantom{\frac{\partial e_{P,M}}{\partial w_{1,1}}}}
    
\begin{document}

\maketitle


\begin{abstract}
In this paper, we propose a novel approach to overcome the problem of imbalanced datasets for object detection tasks, when the distribution is not uniform over all classes. The general idea is to compute a probability vector, encoding the probability for each image to be fed to the network during the training phase. This probability vector is computed by solving a quadratic optimization problem and ensures that all classes are seen with similar frequency. We apply this method to a fetal anatomies detection problem, and conduct a statistical analysis of the resulting performance to show that it performs significantly better than two baseline models: one with images sampled uniformly and one implementing oversampling.

\begin{keywords}
Object detection, Imbalanced dataset, Quadratic optimization, Fetal anatomy detection, Ultrasound.
\end{keywords}

\end{abstract}

\section{Introduction and problem statement}

In this paper, we tackle the issue of imbalanced datasets in object detection problems. Object detection consists in simultaneous identification and localization of objects in an image. Current state-of-the-art methods for object detection rely on deep learning algorithms, which require training a model on a given dataset. For example, let us mention the classical two-stage networks Fast R-CNN and Faster R-CNN \cite{Girshick2015,Ren2015}, as well as the one-stage architectures SSD \cite{Wei2016,fu2017dssd} and YOLO \cite{Redmon2015,Redmon2016}. However, balance between different classes is often hard to achieve, in particular in the field of medical imaging where data and annotations are costly and difficult to obtain.

% In this paper, we address the problem of class imbalance, and show how it can be tackled by solving some Quadratic Programming (QP) problem. It can be applied to any type of model, requires no changes to the network architecture, only to the data generation phase. 

\paragraph{Imbalanced dataset for object detection.} An imbalanced dataset may affect the performance of a neural network, leading to poorer behavior of a model on under-represented classes.
A common approach, consisting in oversampling all images containing the less frequent classes, could introduce unexpected and unwanted behavior, as it may also oversample examples of the most represented classes if they are jointly present in the images, and therefore not improve the overall performance of a model. We will elaborate more on this issue in Section \ref{subsec:data}.

The general issues with imbalance in object detection have been covered in the survey paper \cite{Oksuz2019}, dividing them into \emph{class imbalance}, \emph{scale imbalance}, \emph{spatial imbalance} and \emph{objective imbalance}. The problem of interest in this paper falls into the category of \emph{class imbalance}, and more precisely in the so-called \emph{foreground-foreground class imbalance}. Although objects appear at different frequencies in nature, and therefore class imbalance is most likely to happen, it is stated in the survey \cite{Oksuz2019} that `imbalance amongst the foreground classes has not attracted as much interest as foreground-background imbalance', for which the works \cite{Shrivastava2016,Lin2017,Pang2019,li2019,Chen2019} can be cited.

For foreground-foreground imbalance, let us mention the papers \cite{Ouyang2016, Oksuz2020}. In \cite{Ouyang2016} the authors investigate fine-tuning of a model on a dataset with \emph{long-tail distribution} and 200 classes. They show that it is better to have a (pseudo-)uniform number of samples per class, but sampling is done at bounding box level, before training the detector to classify each region, which is not easily generalizable to one-stage detectors. In \cite{Oksuz2020} the authors present an online foreground balancing (OFB) method, aiming at making the classes balanced in a batch. Although their approach would apply to the same type of problems as ours, we point out that it is more suitable for two-stage detectors, as they generate positive bounding boxes after the region proposal network, whereas ours is agnostic to the network's architecture. The experiments we will present are conducted using the YOLO model, a one-stage detector. Besides, OFB makes the classes balanced at batch level, whereas our approach plays a role at dataset level. Note that this is a notable difference since, especially for small batch sizes, the distribution within a batch may differ from the distribution within the full dataset. \citet{Oksuz2019} lists as an open question whether OFB might induce a bias in the learning process.
% Their approach consists in clustering objects of different classes into groups, in view of compensating the foreground class imbalance. 

Generative methods \cite{goodfellow2014} can also be used to produce artificial images (see \cite{Tripathi2019,Wang2019}) for which special attention can be given to under-represented classes. Another notable approach is the one described in \cite{Dwibedi2017}, where object instances are simply `cut' and then `pasted' on random backgrounds. Of course, this can result in unrealistic images.

We emphasize that the aforementioned papers only deal with natural images, for which the datasets available are usually larger than the medical datasets. The approach we suggest in this paper involves no generation of artificial images and requires no changes to the network architecture (and could indifferently be combined with one-stage or two-stage detectors), only a balanced way to sample images so that the distribution is uniform.

\paragraph{Paper outline.} In what follows, we will start in Section \ref{sec:qp} by explaining how the problem of imbalanced classes can be tackled using quadratic optimization, and how it can be solved in practice. 
Then, we will show in Section \ref{sec:results} how it was applied to a specific fetal anatomies detection problem, and the influence we observed on the training results, compared to several baselines.

\section{Balanced sampling as quadratic programming}
\label{sec:qp}

\subsection{Introducing problem \eqref{eq:P0} for sampling data}
\paragraph{Problem statement.} 
We now state the problem in its general form. Even if we aim at applying it to ultrasonic medical images, our approach is general and could be applied to any object detection problem.

Let $N$ be the number of images in the dataset, and $(X_1, X_2, \ldots, X_N)$ denote the collection of images. We also denote by $C$ the number of classes present in the dataset, and we consider that for any given image $X_i$ (with $i \in \llbracket 1, N \rrbracket$), any label can be present in the image. More precisely, we denote by $E$ the matrix encoding the distribution of labels within the image collection: $E := (\varepsilon_{i,l})_{1\leq i \leq N, 1\leq l \leq C} \in \mathcal{M}_{N,C}(\mathbb{R})$ with $\varepsilon_{i,l} = 1$ if label $l$ is present in image $i$, $0$ otherwise. 

Consider now a probability vector $p = (p_1, \ldots, p_N) \in \mathbb{R}^N$ (\emph{i.e.}, $p_i \geq 0$ for all $i \in \llbracket 1, N \rrbracket$ and $p_1 + \ldots + p_N = 1$). If we randomly pick images amongst the collection $(X_i)_{1 \leq i \leq N}$ under the probability distribution $p$, the expected frequency at which class $l$ is observed is $f_l := \sum_{i=1}^N{p_i \varepsilon_{i,l}} / N$. In order to have a balanced dataset, we aim at finding a discrete probability vector (when possible) such that all these expected frequencies are equal. Therefore, the problem of sampling the images in a balanced fashion reads as follows:
find a vector $p \in \mathbb{R}^N$, such that $p_i \geq 0$ for all $i \in \llbracket 1, N \rrbracket$ and $p_1 + \ldots + p_N = 1$, and $f_l = f_k$ for all $l, k \in \llbracket 1, C \rrbracket $, where $f_l := \sum_{i=1}^N{p_i \varepsilon_{i,l}} / N$. 
In what follows we will, with a slight abuse of notation, drop the normalizing factor $1/N$, and still denote by $f_l$ the quantity $\sum_{i=1}^N{p_i \varepsilon_{i,l}}$, or in other words, $f = E^T p$.

Note that it may not always be possible to find such a vector $p$. To circumvent this issue, we consider instead the following optimization problem:
\begin{equation*}
\begin{array}{lc}
\text{Minimize} & \frac{1}{2}\sum_{l=1}^C \sum_{k=1}^C {(f_l - f_k)^2},   \\
\text{subject to} & \left\{ 
                    \begin{array}{c}
                    p_i\geq \alpha, \\
                    p_1 + \ldots + p_N = 1. 
                    \end{array}
                    \right.\\
\end{array}
\tag{$\mathcal{P}_{\alpha}$}
\end{equation*}
The term $p_i\geq \alpha$ aims at ensuring that every image has a minimal probability to be picked in the sampling process, \emph{i.e.} that no image is left unseen during training. (Note also that in order to have at least one solution, $\alpha \leq 1 / N$ is required.)

% The cost function is continuous, and the set of feasible points $\left\{ p \in \mathbb{R}_+^N \quad | \quad p_1 + \ldots + p_N = 1 \right\}$ is compact. Following classical optimization results, problem \eqref{eq:P0} therefore always admits a solution.

Before going further, we perform some algebra on the cost function in \eqref{eq:P0}:
\begin{equation}
\begin{split}
\sum_{l=1}^C \sum_{k=1}^C {(f_l - f_k)^2} 	&= \sum_{l=1}^C \sum_{k=1}^C {f_l^2 + f_k^2 - 2 f_l f_k},\\
											&= 2 C \sum_{l=1}^C {f_l^2 } - 2 \sum_{l=1}^C \sum_{k=1}^C {f_l f_k},\\
											&= 2 C f^T f - 2 f^T J_C f,
\end{split}
\end{equation}
where $J_C$ denotes the matrix in $\mathcal{M}_C(\mathbb{R})$ where every element is equal to $1$. Finally, the cost can be written as $f^T (2C I_C - 2 J_C) f / 2$ and we denote by $A'$ the matrix $2C I_C - 2 J_C \in \mathcal{M}_C(\mathbb{R})$ ($I_C$ is the identity matrix of size $C$).
Introducing $A = E A' E^T \in \mathcal{M}_N(\mathbb{R})$, problem \eqref{eq:P0} can be written as
\begin{equation*}
\begin{array}{lc}
\text{Minimize} & \frac{1}{2}p^T A p,   \\
\text{subject to} & \left\{ 
                    \begin{array}{c}
                    p_i\geq \alpha, \\
                    p_1 + \ldots + p_N = 1,
                    \end{array}
                    \right.\\
\end{array}
\tag{$\mathcal{P}_{\alpha}$}
\label{eq:P0}
\end{equation*}
which is a standard form for a quadratic optimization problem.

\subsection{Enforcing uniqueness}
\label{subsec:uniqueness}
The solution to problem \eqref{eq:P0} is (in general) not unique (see Appendix \ref{appendix:conv} for more details). In order to enforce uniqueness of the solution, we add to the initial problem a regularization term $\lambda \left\| p \right\|^2 / 2$ where $\lambda > 0$ is a penalization parameter. This yields the following optimization problem, which admits a unique solution (as long as $\alpha \leq 1/N$),

\begin{equation*}
\begin{array}{lc}
\text{Minimize} & \frac{1}{2} p^T A p + \lambda \frac{\left\| p \right\|^2}{2},   \\
\text{subject to} & \left\{ 
                    \begin{array}{c}
                    p_i\geq \alpha, \\
                    p_1 + \ldots + p_N = 1,
                    \end{array}
                    \right.\\
\end{array}
\tag{$\mathcal{P}_{\lambda, \alpha}$}
\label{eq:Ppen}
\end{equation*}
where $\alpha$ and $\lambda$ are two hyper-parameters that can be set by the user for training. We refer to Appendix \ref{appendix:conv} for mathematical results on \eqref{eq:Ppen}, as well as some details on how it can be efficiently solved in practice.

%For all $\lambda > 0$ and $0 \leq \alpha \leq 1/N$, the following result holds on the structure of the solution to \eqref{eq:Ppen}:
% \begin{proposition}
% \label{prop:solution}
% For $\lambda > 0$ and $0 \leq \alpha \leq 1/N$, let $p^{(\lambda, \alpha)}$ denote the (unique) solution to \eqref{eq:Ppen}. If $X_{i_1}$ and $X_{i_2}$ are two images containing the same objects, \emph{i.e.}, the $i_1$-th and $i_2$-th lines of the matrix $E$ are the same, then the $i_1$-th and $i_2$-th components of vector $p^{(\lambda, \alpha)}$ are the same, $p^{(\lambda, \alpha)}_{i_1} = p^{(\lambda, \alpha)}_{i_2}$.
% \end{proposition}
% The proof is detailed in Appendix \ref{appendix:conv}.
% This result shows that our sampling technique is consistent for images containing exactly the same objects: if two (or more) images have the same object distribution (\emph{i.e.}, same corresponding lines in the matrix $E$), they will be assigned the same sampling probability, ensuring that none is favored over the other.

% Problem \eqref{eq:Ppen} can be easily and efficiently solved in practice, as explained in Appendix \ref{appendix:conv}.

% \subsubsection{Link between \eqref{eq:Ppen} and oversampling for image classification.} In this section, we make a comment on the special case when for each image $X_i$, exactly one object is present in the image (it means that each line of the matrix $E$ contains exactly one non-zero entry), and the parameter $\alpha$ is chosen to be $0$. In this case, the object detection problem at hand resembles a classification problem, in the sense that part of the task is to correctly identify the object present in the image, which can be seen as a label of the image.

% For each $l \in \llbracket 1, C \rrbracket$, let $n_l$ denote the number of images containing an object of class $l$. Besides, we define the function $\sigma : \llbracket 1, N \rrbracket \rightarrow \llbracket 1, C \rrbracket$ that assigns the index of the images to the class of the object present in the image: for all $i \in \llbracket 1, N \rrbracket$, the class $\sigma(i)$ is present in image $X_i$. The following result analyses the behaviour of the solution to $(\mathcal{P}_{\lambda, \alpha=0})$.

% \begin{proposition}
% \label{prop:conv}
% Assume that for each image $X_i$, exactly one object is present in the image.
% Let $p^* \in \mathbb{R}^N$ be the vector defined by $p^*_i = \frac{1}{n_{\sigma(i)}} \cdot \frac{1}{C}$ for all $i \in \llbracket 1, N \rrbracket$. Then $p^*$ is a solution to problem \eqref{eq:P0}. 

% Besides, for all $\lambda > 0$, we denote by $p^{(\lambda)} \in \mathbb{R}^N$ the (unique) solution to problem $(\mathcal{P}_{\lambda, \alpha=0}$. Then the following convergence result holds:
% \begin{equation*}
%     p^{(\lambda)} \underset{\lambda \rightarrow 0^+}{\longrightarrow} p^*.
% \end{equation*}
% \end{proposition}
% $p^*$ is in fact the only vector in $\mathbb{R}^N$ satisfying the constraints, such that $E^T p = (1/C, \ldots, 1/C)$, and such that two images $X_{i_1}$ and $X_{i_2}$ with the same label have the same probability: $p_{i_1} = p_{i_2}$.
% The proof of this result is elementary and we provide it in the Appendix \ref{appendix:conv}.

% Therefore, our sampling technique can be seen as a generalization of oversampling in image classification problems: Sampling the data with the probability $p^*$ leads to all classes having the same prior probability $1/C$. 
% Besides, it has been shown that in image classification, neural networks estimate Bayesian a posteriori class probabilities whenever using squared error or cross-entropy cost functions, see \cite{Richard1991NeuralNC} and \cite{buda2017}. In this case, thresholding \cite{buda2017} can be used, coupled with oversampling, to adjust the network's output, and reflect the actual class probabilities.  

\section{Application to fetal anatomy detection}
\label{sec:results}

% After presenting the data over which we performed our evaluations in Subsection \ref{subsec:data}, we describe the process of selecting the hyperparameters values in Subsection \ref{subsec:setup}. We then focus on a prefered hyperparameter set $(\lambda, \alpha)_{best}$, that we thoroughly evaluate, using the Kolmogorov-Smirnov (KS) test, for which we give more details in Subsection \ref{subsec:results}.

%After performing preliminary trainings and networks' evaluations, using solutions to \eqref{eq:Ppen} based on multiple hyper parameters values ($\lambda$, $\alpha$), we decided to perform a thorough validation on a value $(\lambda, \alpha)_{best}$ that seemed to show the best performance. The statistical evaluation was performed using the Kolmogorov-Smirnov (KS) test, for which we give more details and show how we applied it in Subsection \ref{subsec:results}.

\subsection{Data}
\label{subsec:data}
We apply the method to fetal anatomy detection. The dataset comprises 2D frames from ultrasonic acquisitions of the head, abdomen and upper leg of a fetus. The aim is to detect and localize in the images some predefined anatomies, that correspond to our classes.

\paragraph{Training dataset.} 
Our training dataset consists of 1237 2D images.
The target anatomies can be split into two categories that we will call \emph{main fetal anatomies} and \emph{sub-anatomies}. They are summarized in Table \ref{tab:fetal_labels}. Here, sub-anatomies are defined as structures that are part of a main anatomy, \emph{e.g.}, a cerebellum is \emph{always} included in a head (see also \figureref{fig:image_examples} for some examples).

\begin{table}
    \floatconts{tab:fetal_labels}
    {\caption{Summary of bounding boxes anatomies and sub-anatomies, and the number of occurrences in the training dataset. CSP stands for Cavum Septum Pellucidum.}}
    {
    \resizebox{\textwidth}{!}{%
    \begin{tabular}{l c l c l c l c l}
    \toprule
        main anatomies & ~ & \multicolumn{7}{c}{sub-anatomies} \\
        \midrule
        head (579) & & falx cerebri (523) & ~ & cerebellum (114) & ~ & thalamus (408) & ~ & CSP (284) \\
        
        abdomen (327) & & umbilical vein (239) & ~ & stomach (279) & ~ & spine (349) & ~ & heart (36) \\
        
        femur (303) & & ~ & & & ~ & ~ & \\
    \bottomrule
    \end{tabular}
    }
    }
\end{table}

Given the definition of labels (`main anatomies' and `sub-anatomies'), any fetal anatomy dataset is intrinsically imbalanced. Indeed, sub-anatomies are not present in all frames where their corresponding main anatomy is present (\emph{e.g.}, a stomach may be missing, even though we visualize the abdomen), whereas the corresponding main anatomy is necessarily present when a sub-anatomy is present (\emph{e.g.}, if we visualize the stomach, then the abdomen must be visible too). This leads to datasets in which heads and abdomens are over-represented compared to inner structures, as shown in Table \ref{tab:fetal_labels}.

Besides, a common strategy consisting in duplicating images containing the less represented anatomies (for instance in our case, all the images containing a heart or a cerebellum), would also lead to an over-representation of the corresponding `main anatomies', which may in turn introduce a new bias in the dataset. This will be supported by the experiments presented in Subsection \ref{subsec:results}, where we will compare our strategy to oversampling.

\begin{figure}[htbp]
\floatconts{fig:image_examples}
{\caption{Examples of dataset samples and bounding boxes}}
{
\centering
\subfigure[Head]{
\includegraphics[width=0.29\textwidth]{figures/head2_cropped}
}
\subfigure[Abdomen]{
\includegraphics[width=0.265\textwidth]{figures/abdomen_cropped}
}
\subfigure[Femur]{
\includegraphics[width=0.29\textwidth]{figures/femur_cropped}
}
}
% \label{fig:image_examples}
% \caption{Examples of dataset samples and bounding boxes}
\end{figure}

\paragraph{Validation and test datasets.} We also use a small validation dataset (315 images) to monitor the loss and metrics during training and avoid over-fitting on the training dataset, and a final test dataset of 517 images on which all the statistical evaluation is performed.

% \begin{figure}
%     \floatconts{fig:dataset_ori_distribution}
%     {\caption{Original training dataset number of bounding boxes per anatomy}}
%     {
%     \includegraphics[scale=0.6]{figures/class_imbalance.pdf}
%     }
% \end{figure}

\subsection{Setup}
\label{subsec:setup}

\paragraph{Parameters.}
In order to evaluate our method, we need to set the hyper-parameters $\alpha$ and $\lambda$, defined in Section \ref{subsec:uniqueness}. To do so, we simulated draws after solving the optimization problem for $\lambda \in \left[0, 1000\right]$ and $\alpha \in \left[0, 1/ N \right]$. 

We chose to set $\alpha = 0.5 \times 1/N$ as this ensures that each image is seen, at worst, half as often as in the original training dataset distribution. We found it to be a good compromise between high data variety and class balance.
% Indeed if $\alpha = 0$ and $\lambda = 0$, a solution to \eqref{eq:Ppen} could be to see only a very small portion of the images that contain all the classes and discard the rest of the dataset. 
% On the contrary, setting $\alpha = 1/N$ would lead to the original image distribution without balanced sampling, as the only feasible point would be $(1/N, \ldots, 1/N)$.

% In order to set $\lambda$, we simulate draws after solving \eqref{eq:Ppen} for $\lambda \in \left[0, 1000\right]$. that we illustrate in Figure \ref{fig:lambda_impact_img} for $\lambda = 0$ and $\lambda = 500$. 
% We report as well the corresponding frequencies of apparition per anatomy, in Figure \ref{fig:lambda_impact_freq} (for $\lambda \in \left\{0, 500\right\}$). 
When $\lambda$ is too small, the anatomies are better balanced, but this is achieved by over-representing only a very small number of images in the training set.
% On the contrary, classes are slightly more imbalanced with $\lambda = 500$ but with a higher number of images over-represented, with a maximal oversampling factor of $5$ compared to the average image frequency.  
We found $\lambda = 500$ to offer a good compromise: high enough for regularization to become effective, but low enough to actually achieve a more balanced dataset in terms of anatomy distribution, as illustrated in \figureref{fig:lambda_impact_freq}.
%It is thus important to find a balance between a high enough value of $\lambda$ for regularization to be effective, but low enough to actually achieve a more balanced dataset in terms of anatomy distribution. In our experiments, we set $\lambda = 500$.

% \begin{figure}
% \floatconts
% {fig:lambda_impact_img}
% {\caption{Probability per image based on $\lambda$}}
% {
% \subfigure[$\lambda = 0$]{
% \includegraphics[width=0.38\textwidth]{figures/prob_per_image_nopen.pdf}
% }
% \subfigure[$\lambda = 500$]{
% \includegraphics[width=0.38\textwidth]{figures/prob_per_image_pen500.pdf}
% }
% }
% \end{figure} 

\begin{figure}[h]
\floatconts
{fig:lambda_impact_freq}
{\caption{Impact of our sampling method (with $\alpha = 0.5 \times 1 / N$ and $\lambda = 500$) on the frequencies of the different anatomies in the training dataset, compared to uniform sampling. Colors indicate anatomies that share the same main anatomies.}}
{
\subfigure[Uniform sampling]{
\includegraphics[width=0.42\textwidth]{figures/histogram_training}
}
\subfigure[``Balanced'' sampling]{
\includegraphics[width=0.42\textwidth]{figures/frequencies_pen500}
}
}
\end{figure} 

\paragraph{Model.}
One of the state-of-the-art deep learning models for object detection is the YOLO model \cite{Redmon2015,Redmon2016}. Because of its high speed, and in view of real-time usage, this is the baseline model that we use. The network's layers are initialized using weights pretrained on the VOC 2007 dataset \cite{Everingham15}. 
We start from a learning rate of $10^{-5}$, then increase it by a factor of 10 after 50 epochs, and then decrease it by a factor of 5 regularly during training (approx.\ every 500 epochs).
All networks are trained using the Adam algorithm \cite{KingmaB14}, and various data augmentation operations (scaling, translation, rotation, Gaussian noise, flips) are also applied.

\subsection{Results}
\label{subsec:results}

To evaluate the impact of balanced sampling, we use three different training methods:
\begin{itemize}
    \item \textbf{Image generator with \emph{uniform sampling}}: the original training set is uniformly sampled. This will be our first baseline strategy.
    \item \textbf{Image generator with \emph{oversampling}}: images containing our two minority classes, namely heart and cerebellum, are duplicated in order to artificially produce more training images, with a factor of 2 for cerebellum and 5 for heart. These factors were chosen to obtain a frequency comparable to that of the remaining classes. This will be our second baseline.
    \item \textbf{Image generator with \emph{balanced sampling}}: a different probability factor is applied to each image of the training set, in order to obtain a more balanced dataset. This factor is determined by finding a solution to \eqref{eq:Ppen} with $\alpha = 0.5 \times 1/N$ and $\lambda = 500$.
\end{itemize}

\subsubsection{Evaluation method.}
We evaluate each of the trained models with the mean Average Precision (mAP) metric.  Due to the inherent stochastic nature of neural network's training (even with a fixed weight initialization), we trained several models for each method (30 times with \emph{uniform sampling}, 12 times with \emph{oversampling} and 11 times with \emph{balanced sampling}) aiming to obtain a statistically significant comparison of the various training methods. Due to long training times (around half a day to train one model, on a GTX 1080 Ti, with input image size $416 \times 416$), we make the choice of evaluating only one setup of $\alpha$ and $\lambda$ in order to be able to conduct a thorough statistical evaluation and comparison with the baselines. 

In order to compute the mAP, we need to set an Intersection over Union (IoU) threshold $\theta$ that separates false-positive ($\IoU < \theta$) from true-positive ($\IoU \geq \theta$) bounding box detections. In what follows, we will use the values $\theta = 0.2$ and $\theta = 0.4$.

We used the Kolmogorov-Smirnov (KS) test (for which we give more details in Appendix \ref{appendix:KS}) to compare the performance distribution of balanced sampling with that of each baseline.

\subsubsection{Detailed results and statistical analysis.}
In Table \ref{tab:map_results} and Table \ref{tab:map_results_oversampling}, we provide average mAP results over all our trained models with  $\theta \in \{ 0.2, 0.4 \}$, as well as their respective standard deviations. For instance, for $\theta = 0.2$, the KS test between uniform sampling and balanced sampling provides us with a statistic $D = 0.476$ and p-value $P = 0.035$. 
Our strategy improves the overall performance of the models, when compared to either uniform sampling or oversampling.

\paragraph{Balanced sampling vs. uniform sampling.}
As displayed in Table \ref{tab:map_results}, the improvement appears when looking at specific sub-anatomies, such as the heart or cerebellum, which are under-represented in the original dataset. With uniform sampling, the trained models perform very poorly on these structures, whereas the AP performance is greatly improved using balanced sampling.

\paragraph{Balanced sampling vs. oversampling.} Duplicating images of the minority classes has boosted the performance of the networks on those classes, compared to the uniform strategy. As displayed in Table \ref{tab:map_results_oversampling}, our strategy and oversampling perform similarly on under-represented anatomies. However, our method has a better overall performance, showing that it is efficient in boosting the performance on under-represented classes, while not deteriorating the performance on the remaining classes.

\begin{table}
    \floatconts{tab:map_results} 
    {\caption{mAP based on IoU threshold $\theta \in \{0.2, 0.4\}$ with a focus on under-represented anatomies such as the heart and cerebellum. The average, std and best mAP values over all trained models are presented, as well as the p-value for the statistical test comparing the distribution of performances for \emph{uniform sampling} and our \emph{balanced sampling} strategy.}}
    {
        \begin{tabular}{r c rrr c rrr c r}
        \toprule
         & ~ &  \multicolumn{3}{c}{Uniform sampling} & ~ & \multicolumn{3}{c}{Bal. sampling (ours)} & ~ & \\
        \cmidrule{3-5} \cmidrule{7-9}  
         & & average & std & best & & average & std & best & & p-value \\
        \midrule
        all anatomies & & & & & & & & & & \\
        mAP @ $\theta = 0.2$ & & $58.97$ & $3.96$ & $63.4$ & & \bf{62.35} & $1.53$ & \bf{64.3} & & .035 \\
        %\cline{2-8} 
        mAP @ $\theta = 0.4$ & & 54.87 & 3.98  & \bf{59.6} & & \bf{57.75} & 1.48 & 59.3 & & .087 \\
        
        Heart & & & & & & & & & & \\
        AP @ $\theta = 0.2$ & & $1.03$ & $3.08$ & $7.7$ & & \bf{6.95} & $7.34$ & \bf{21.2} & & .038 \\
        %\cline{2-8} 
        AP @ $\theta = 0.4$ & & 0.5 & 2.31 & 7.7 & & \bf{5.7} & 5.8 & \bf{15.4} & & .031 \\
        
        Cerebellum & & & & & & & & & &  \\
        AP @ $\theta = 0.2$ & & $4.84$ & $12.01$ & $33.8$ & & \bf{17.13} & $9.91$ & \bf{36.3}& & .0002 \\
        %\cline{2-8} 
        AP @ $\theta = 0.4$ & & 3.72 & 9.4 & 28.6 & & \bf{14.9} & 9.4 & \bf{36.3} & & .0001 \\
        \bottomrule
    \end{tabular}
    }
\end{table}
 
\begin{table}   
    \floatconts{tab:map_results_oversampling}
    {
    \caption{Comparison between \emph{oversampling} and our \emph{balanced sampling} strategy.}
    }
    {
    \begin{tabular}{r c rrr c rrr c r}
        \toprule
         & ~ &  \multicolumn{3}{c}{oversampling} & ~ & \multicolumn{3}{c}{Bal. sampling (ours)} & ~ & \\
        \cmidrule{3-5} \cmidrule{7-9}  
         & & average & std & best & & average & std & best & & p-value \\
        \midrule
        all anatomies & & & & & & & & & & \\
        mAP @ $\theta = 0.2$ & & $56.8$ & $3.5$ & $62.3$ & & \bf{62.35} & $1.53$ & \bf{64.3} & & .002 \\
        %\cline{2-8} 
        mAP @ $\theta = 0.4$ & & $51.9$ & $3.5$ & $57.2$ & & \bf{57.75} & 1.48 & \bf{59.3} & & .004 \\
        
        Heart & & & & & & & & & & \\
        AP @ $\theta = 0.2$ & & $3.4$ & $4.5$ & $15.3$ & & \bf{6.95} & $7.34$ & \bf{21.2} & & .86 \\
        %\cline{2-8} 
        AP @ $\theta = 0.4$ & & $0.0$ & $0.0$ & $0.0$ & & \bf{5.7} & 5.8 & \bf{15.4} & & .04 \\
        
        Cerebellum & & & & & & & & & &  \\
        AP @ $\theta = 0.2$ & & \bf{19.1} & $12.0$ & \bf{44.1} & & $17.13$ & $9.91$ & $36.3$& & .89 \\
        %\cline{2-8} 
        AP @ $\theta = 0.4$ & & $14.6$ & $10.7$ & \bf{37.2} & & \bf{14.9} & $9.4$ & $36.3$ & & .85 \\
        \bottomrule
    \end{tabular}
    }
\end{table}

%% Joint table
% \begin{table}
%     \floatconts{tab:map_results_all}
%     {\caption{mAP based on IoU threshold $\theta \in \{0.2, 0.4\}$ with a focus on under-represented anatomies such as the heart and cerebellum. The average, std and best mAP values over all trained models are presented, as well as the p-value for the statistical test comparing the distribution of performances for \emph{uniform sampling} and our \emph{balanced sampling} strategy.}}
%     {  
%         \begin{tabular}{\textwidth}{r c rrr c rrr c rrr c rr}
%         \toprule
%          & ~ &  \multicolumn{3}{c}{Uniform sampling} & ~ & \multicolumn{3}{c}{Oversampling} & ~ & \multicolumn{3}{c}{Bal. sampling (ours)} & ~ & \multicolumn{2}{c}{p-values}\\
%         \cmidrule{3-5} \cmidrule{7-9} \cmidrule{11-14} \cmidrule{16-17}
%          & & average & std & best & & average & std & best & & average & std & best & & ours vs. uniform & ours vs.oversampling \\
%         \midrule
%         all anatomies & & & & & & & & & & & & & & & & & & \\
%         mAP @ $\theta = 0.2$ & & $58.97$ & $3.96$ & $63.4$ & & $56.8$ & $3.5$ & $62.3$ & & \bf{62.35} & $1.53$ & \bf{64.3} & & .035 & 0.002 \\
%         %\cline{2-8} 
%         mAP @ $\theta = 0.4$ & & 54.87 & 3.98  & \bf{59.6} & & $51.9$ & $3.5$ & $57.2$ & & \bf{57.75} & 1.48 & 59.3 & & .087 & 0.004\\
        
%         Heart & & & & & & & & & & & & & &  \\
%         AP @ $\theta = 0.2$ & & $1.03$ & $3.08$ & $7.7$ & & $3.4$ & $4.5$ & $15.3$ & & \bf{6.95} & $7.34$ & \bf{21.2} & & .038 & 0.86\\
%         %\cline{2-8} 
%         AP @ $\theta = 0.4$ & & 0.5 & 2.31 & 7.7 & & $0.0$ & $0.0$ & $0.0$ & & \bf{5.7} & 5.8 & \bf{15.4} & & .031 & 0.04\\
        
%         Cerebellum & & & & & & & & & & & & & &   \\
%         AP @ $\theta = 0.2$ & & $4.84$ & $12.01$ & $33.8$ & & \bf{19.1} & $12.0$ & \bf{44.1} & & \bf{17.13} & $9.91$ & \bf{36.3}& & .0002 & 0.89\\
%         %\cline{2-8} 
%         AP @ $\theta = 0.4$ & & 3.72 & 9.4 & 28.6 & & $14.6$ & $10.7$ & \bf{37.2} & &  \bf{14.9} & 9.4 & \bf{36.3} & & .0001 & 0.85\\
%         \bottomrule
%     \end{tabular}
%     }
% \end{table}

\subsubsection{Discussion}
It is also interesting to notice that, even if the average mAP over all trained models is improved with \emph{balanced sampling}, the improvement is more limited when focusing on the best-performing trained model. In fact, depending on the $\IoU$-threshold that is set for the mAP evaluation, the best model can be obtained with the strategy consisting in picking images uniformly (although this observation should be tempered by recalling that we trained the uniform baseline strategy three times more than the two others).

We interpret this difference as evidence that a balanced training dataset reduces the stochastic impact of data feeding to the network. The lower std values obtained with \emph{balanced sampling} support this interpretation. Balanced sampling tends to make the training more \emph{robust} and \emph{reproducible}, and makes it possible to reach the best level of performance in a more systematic way, and with fewer tries. We believe this can be of great interest in practice, given the time and resources required to train deep neural networks.

\section{Conclusion}
\label{sec:conclusion}
In this paper, we suggest a new approach to deal with imbalanced datasets for object detection problems. During training, images are sampled following a probability distribution that helps bring balance between various classes. This probability distribution is computed beforehand by solving a quadratic optimization problem. Besides, the method is systematic, and can be applied to potentially any object detection problem. However, it requires tuning of the two hyperparameters $\alpha$ and $\lambda$.
% , for which visual inspection of the effects of these parameters, as displayed in Figures \ref{fig:lambda_impact_img} and \ref{fig:lambda_impact_freq}, can be of valuable help.

We also showed how this sampling strategy impacted the performance of models: under-represented structures become better detected, while it does not deteriorate the performance of the network on other structures. In fact, the average mAP performance increased by around $3$ percentage points compared to uniform sampling (while the standard deviation of the performance was reduced from $\approx 4$ to $\approx 1.5$), and by around $5.5$ percentage points compared to oversampling (while the standard deviation was reduced from $\approx 3.5$ to $\approx 1.5$).

A natural perspective would be to apply the technique to other object detection challenges, for instance on the COCO dataset (for natural images) or other medical imaging datasets, and further evaluate what it brings to the models' performance. Another interesting perspective would be to extend our evaluation to two-stage detectors, and combine our method with existing methods for two-stage detectors, for instance OFB, as they would act at two different levels of the training pipeline: before constitution of the batch (ours), and at the level of ROI proposals by the region proposal network (OFB).

\midlacknowledgments{Both authors would like to thank Cybèle Ciofolo-Veit and Laurence Rouet for their valuable insight into both the clinical application and the scientific contribution of this work.}
\bibliography{olivier21} 

\appendix

\section{Mathematical properties of \eqref{eq:P0} and \eqref{eq:Ppen}.}
\label{appendix:conv}

\paragraph{Non-uniqueness of a solution for \eqref{eq:P0}.}

Depending on the matrix $A$, the solutions to problem \eqref{eq:P0} are not necessarily unique. Let us for instance consider a toy problem with 2 classes and 3 images, one of which contains an occurrence of the first class, the two others containing an occurrence of the second class. That is, the matrix $E$ encoding the distribution of classes in the dataset is:
\begin{equation}
    E = 
    \begin{pmatrix}
    1 & 0 \\
    0 & 1 \\
    0 & 1
    \end{pmatrix}
\end{equation}
Then, any vector $p \in \mathbb{R}^3$ of the form $p = (0.5, 1/x, 0.5 - 1/x)$ with $1/\alpha \geq x \geq 2/(1 - 2 \alpha)$ (so that both $1/x \geq \alpha$ and $0.5 - 1/x \geq \alpha$ hold) is a solution to the problem (indeed, $E^T p$ is in this case an eigenvector of the matrix $A'$, associated to the eigenvalue $0$).

\paragraph{Structure of the solution to \eqref{eq:Ppen}.}
For all $\lambda > 0$ and $0 \leq \alpha \leq 1/N$, the following result holds on the structure of the solution to \eqref{eq:Ppen}:
\begin{proposition}
\label{prop:solution}
For $\lambda > 0$ and $0 \leq \alpha \leq 1/N$, let $p^{(\lambda, \alpha)}$ denote the (unique) solution to \eqref{eq:Ppen}. If $X_{i_1}$ and $X_{i_2}$ are two images containing the same objects, \emph{i.e.}, the $i_1$-th and $i_2$-th lines of the matrix $E$ are the same, then the $i_1$-th and $i_2$-th components of vector $p^{(\lambda, \alpha)}$ are the same, $p^{(\lambda, \alpha)}_{i_1} = p^{(\lambda, \alpha)}_{i_2}$.
\end{proposition}

This result shows that our sampling technique is consistent for images containing exactly the same objects: if two (or more) images have the same object distribution (\emph{i.e.}, same corresponding lines in the matrix $E$), they will be assigned the same sampling probability, ensuring that none is favored over the other.

\begin{proof}
For $\lambda > 0$ and $\alpha \leq 1/N$, we denote by $p^{(\lambda, \alpha)}$ the solution to \eqref{eq:Ppen}.
Let $i_1$ and $i_2$ be two indices such that the $i_1$-th and $i_2$-th lines of $E$ are the same. We also denote by $(L_1, L_2, \ldots, L_N)$ the lines of the matrix $E$, and we therefore have $L_{i_1} = L_{i_2}$.

We show by contradiction that it implies that $p^{(\lambda, \alpha)}_{i_1} = p^{(\lambda, \alpha)}_{i_2}$. Assuming that $p^{(\lambda, \alpha)}_{i_1} \neq p^{(\lambda, \alpha)}_{i_2}$, we define the vector $q$ by:
\begin{equation}
\label{eq:qdef}
    q_i = \left\{
    \begin{array}{cc}
        p^{(\lambda, \alpha)}_{i} & \text{ if } i \neq i_1 \text{ and } i\neq i_2, \\
        \frac{p^{(\lambda, \alpha)}_{i_1} + p^{(\lambda, \alpha)}_{i_2}}{2} & \text{ if } i = i_1 \text{ or } i = i_2.
    \end{array}
    \right.
\end{equation}
First, it is obvious that $q_1 + \ldots + q_N = p^{(\lambda, \alpha)}_1 + \ldots + p^{(\lambda, \alpha)}_N = 1$, and that $q_i \geq \alpha$ for all $i \in \llbracket 1, N \rrbracket$.

Then, we verify that the probability vector $q$ yields the same frequency distribution as $p^{(\lambda, \alpha)}$:
\begin{align}
\label{eq:Eq}
\begin{split}
    E^T p^{(\lambda, \alpha)}   &= \sum_{i=1}^N{p^{(\lambda, \alpha)}_i L_i^T} \\
            &= p^{(\lambda, \alpha)}_{i_1} L_{i_1}^T + p^{(\lambda, \alpha)}_{i_2} L_{i_2}^T + \sum_{i \neq i_1, i_2}{p^{(\lambda, \alpha)}_i L_i^T} \\
            &= 2 \cdot \frac{p^{(\lambda, \alpha)}_{i_1} + p^{(\lambda, \alpha)}_{i_2}}{2} L_{i_1}^T + \sum_{i \neq i_1, i_2}{q_i L_i^T} \\
            &= q_{i_1} L_{i_1}^T + q_{i_2} L_{i_2}^T + \sum_{i \neq i_1, i_2}{q_i L_i^T} \\
            &= E^T q.
\end{split}
\end{align}

We deduce from the equality \eqref{eq:Eq} that 
\begin{align}
\label{eq:cost_equal}
    \begin{split}
        q^T A q &= (E^T q)^T A' E^T q \\
        &= (E^T p^{(\lambda, \alpha)})^T A' E^T p^{(\lambda, \alpha)} \\
        &= {p^{(\lambda, \alpha)}}^T A p^{(\lambda, \alpha)}.
    \end{split}
\end{align}

Finally, we compute the Euclidean norm of $q$:
\begin{align}
\label{eq:norm_lower}
\begin{split}
    \left\| q \right\|_2^2  &= \sum_{i=1}^N q_i^2 \\
                            &= q_{i_1}^2 + q_{i_2}^2 + \sum_{i \neq i_1, i_2} q_i^2 \\
                            &= \frac{\left(p^{(\lambda, \alpha)}_{i_1} + p^{(\lambda, \alpha)}_{i_2}\right)^2}{2} + \sum_{i \neq i_1, i_2} \left(p^{(\lambda, \alpha)}_i\right)^2  \\
                            &< \left(p^{(\lambda, \alpha)}_{i_1}\right)^2 + \left(p^{(\lambda, \alpha)}_{i_2}\right)^2 + \sum_{i \neq i_1, i_2} \left(p^{(\lambda, \alpha)}_i\right)^2   \\
                            &= \left\| p^{(\lambda, \alpha)} \right\|_2^2.
\end{split}
\end{align}
Therefore, combining Equations \eqref{eq:cost_equal} and \eqref{eq:norm_lower}, we get the following inequality on the cost of the optimization problem: 
\begin{equation}
    \frac{1}{2} q^T A q + \lambda \frac{\left\| q \right\|_2^2}{2} < \frac{1}{2} {p^{(\lambda, \alpha)}}^T A p^{(\lambda, \alpha)} + \lambda \frac{\left\| p^{(\lambda, \alpha)} \right\|_2^2}{2},
\end{equation}
which contradicts the optimality of the solution $p^{(\lambda, \alpha)}$.
\end{proof}

\paragraph{Solving \eqref{eq:Ppen}.}
The optimization problem \eqref{eq:Ppen} is a quadratic optimization problem, with linear constraints. This class of problems admits a wide variety of solving methods, among which are \emph{interior-point methods}. For this paper, we used the Python package \texttt{CVXOPT} \cite{Andersen2011} which offers a convenient setting to define any optimization problem under the form 
\begin{equation}
\begin{array}{cc}
\text{Minimize} & \frac{1}{2} x^T Q x + q^T x,   \\
\text{s.t.} & \left\{ 
                    \begin{array}{c}
                    G x \preceq h, \\
                    A x = b. \\
                    \end{array}
                    \right.
\end{array}
\end{equation}
Let us also mention the interior point optimizer \texttt{IPOPT} \cite{Wachter2006} (for general non-linear programming), which offers an interface with many programming languages, and can also be coupled with the modeling language \texttt{AMPL} \cite{fourer1993ampl}.

\section{Kolmogorov-Smirnov test} 
\label{appendix:KS}
In this section, we make a general comment on our methodology to statistically evaluate the performance of two models. Assume that we want to compare the performance of a ``new'' model with a baseline model. The ``new'' model is trained $n$ times, with performance $x = (x_1, \ldots, x_n)$ following an (unknown) distribution $F$, and the baseline model is trained $m$ times, with performance $y = (y_1, \ldots, y_m)$ following a distribution $G$.  

First, the empirical distribution functions need to be defined:
\begin{align*}
    F_n(u) &= \frac{1}{n}\sum_{i=1}^n{\mathds{1}_{\left] - \infty, u \right]}(x_i)}, \\
    G_m(u) &= \frac{1}{m}\sum_{j=1}^m{\mathds{1}_{\left] - \infty, u \right]}(y_j)}.
\end{align*}
The Kolmogorov statistic is then defined as 
\begin{equation*}
    D_{n,m} = \sup_{u\in \mathbb{R}} {\left| F_n(u) - G_m(u)\right|} 
\end{equation*}

We consider the null-hypothesis $\mathcal{H}_0$ that the samples $x$ and $y$ follow the same distribution, \emph{i.e.}, $F = G$.
Given an observed value $d$ of the Kolmogorov statistic, and in order to reject (or not) the null-hypothesis $\mathcal{H}_0$, we aim at computing the p-value of the statistical test:
\begin{equation}
\label{eq:kstest}
    P = \mathbb{P}(D_{n,m} \geq d ~ | ~  F=G ).
\end{equation}

If the probability \eqref{eq:kstest} is below a threshold $\alpha$, we reject $\mathcal{H}_0$ at level $\alpha$. In Python, this statistical test is implemented with the function \texttt{ks\_2samp}, within the package \texttt{scipy.stats}.

\end{document}
