\RequirePackage{etoolbox}
\patchcmd{\bibliographystyle}{#1}{abbrvnat}{}{}

\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{booktabs}
\usepackage{multirow}
\usepackage{array}
\newcolumntype{H}{>{\setbox0=\hbox\bgroup}c<{\egroup}@{}}
\usepackage{mwe} % to get dummy images

%\jmlrvolume{-- Under Review}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}

\editors{Accepted to MIDL 2020}

\title[Direct estimation of fetal head circumference based on regression CNN]{Direct estimation of fetal head circumference from ultrasound images based on regression CNN}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Jing Zhang\nametag{$^{1}$}
%\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}
} \Email{jing.zhang@insa-rouen.fr}\\
\Name{Caroline Petitjean\midlotherjointauthor\nametag{$^{1}$}} \Email{caroline.petitjean@univ-rouen.fr}\\
\Name{Pierre Lopez\nametag{$^{1}$}} \Email{pierre.lopez@etu.univ-rouen.fr}\\
\Name{Samia Ainouz\nametag{$^{1}$}} \Email{samia.ainouz@insa-rouen.fr}\\
\addr $^{1}$Normandie Univ, INSA Rouen, UNIROUEN, UNIHAVRE, LITIS, Rouen, France
%\addr $^{2}$ Litis Lab, University of Rouen, 76000, France \AND
%\addr $^{3}$ Address 3 \AND
%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
%\addr $^{4}$ Address 4
}

\begin{document}
\maketitle

\begin{abstract}
The measurement of fetal head circumference (HC) is performed throughout the pregnancy as a key biometric to monitor fetus growth. This measurement is performed on ultrasound images, via the manual fitting of an ellipse. The operation is operator-dependent and as such prone to intra- and inter-operator variability. There have been attempts to design automated segmentation algorithms to segment the fetal head, especially based on deep encoding-decoding architectures. In this paper, we depart from this idea and propose to leverage the ability of convolutional neural networks (CNN) to directly measure the head circumference, without having to resort to handcrafted features or manually labeled segmented images. The intuition behind this idea is that the CNN will learn by itself to localize and identify the head contour. Our approach is evaluated on the public HC18 dataset, which contains images of all trimesters of the pregnancy. We investigate various architectures and three losses suitable for regression. While room for improvement is left, encouraging results show that it might be possible in the future to directly estimate the HC, without the need for a large dataset of manually segmented ultrasound images. This approach might be extended to other applications where segmentation is just an intermediate step to the computation of biomarkers.
\end{abstract}

\begin{keywords}
CNN, deep regression, ultrasound images, fetus head circumference
\end{keywords}

\section{Introduction}

Measurement of fetal head circumference (HC) is performed throughout the pregnancy as a key biometric to monitor fetus growth and estimate gestational age. In clinical routine, this measurement is performed on ultrasound (US) images, via manual tracing of the skull contour or fitting it to an ellipse. Indeed, identifying the head contour is challenging due to low signal-to-noise ratio in US images, and also because the contours have fuzzy (and sometimes missing) borders (Fig.~\ref{fig:us}). Manual contouring is an operator-dependent operation, prone to intra- and inter-operator variability, which provokes inaccurate measurements \citep{sarris2012intra}. More precisely, the 95\% limits of agreement are $\pm$7\,mm for the intra-operator variability and $\pm$12\,mm for the inter-operator variability \cite[Tab. 1 p. 272]{sarris2012intra}.

Some works have been proposed to automate the measurement of fetal head circumference in US images, such as \cite{li2017randomf,LU2005929,JARDIM2005243}. However, there are now more and more works aiming at directly extracting biomarkers from medical images, such as organ volume, area or features, to help clinical diagnosis. The goal is to avoid intermediate steps, such as segmentation, that may be computationally expensive (both for model training and labeling) and prone to errors \cite{zhen2015direct}.
For example, in \cite{Direct2015}, the authors propose a learning-based approach to perform a direct volume estimation of the cardiac left and right ventricles from magnetic resonance images, without segmentation. The approach consists in computing shape descriptors using a bag-of-words model, and performing Bayesian estimation and regression forests. By taking advantage of the power of convolutional neural networks (CNN), one can now skip the feature design step and learn the features, while at the same time performing the prediction of the value of interest, i.e.\ performing regression. Note that regression CNNs have found several applications in the field of computer vision, such as head-pose estimation \cite{headpose20163d}, facial landmark detection \cite{facial2013deep} and human-body pose estimation \cite{2014bodypose}.

In this work, we investigate if such a direct approach is reasonable to estimate the HC from ultrasound images, without having to resort to segmentation. Our approach is based on a regression CNN, for which we investigate four architectures, which differ by their complexity, and  explore three losses for regression. Our experiments are carried out on the public dataset HC18 \cite{hcdataset}. To our knowledge, this is the first attempt to directly assess the fetal head circumference, without resorting to segmentation.

\begin{figure}
    \centering
    \includegraphics[width=12cm]{fig/us.png}
    \caption{Ultrasound images of fetal head from \cite{hcdataset}. Corresponding head circumference (HC) is displayed in millimeters and pixels.}
    \label{fig:us}
\end{figure}

The rest of the paper is organized as follows. Section 2 introduces related works about HC measurement in ultrasound images. Section 3 describes the proposed architecture and the loss functions. Experiments are conducted in Section 4. The conclusion and future works are drawn in Section 5.

\section{Related works}

Several approaches have been proposed in the literature to measure the head circumference in US images, based on image segmentation. Usually they follow a two-step approach, namely fetal head localization and segmentation refinement. In \cite{HCchallenge}, the first step consists in locating the fetal head via machine learning, with Haar-like features used to train a random forest classifier; and the second step consists in the measurement of the HC, via ellipse fitting and Hough transform. A similar method is used in \cite{li2017randomf}. Other approaches build upon deep segmentation models also in a two-step process, prediction and ellipse fitting \cite{kim2019automatic}. In \cite{HC2019confident}, the standard segmentation model U-Net \cite{2015unet} is trained using manually labeled images, and segmentation results are fitted to ellipses. In \cite{2019fetalseg}, the authors build upon the same idea, combining image segmentation and ellipse tuning together in a multi-task network.

\section{CNN regressor}

Standard CNN have several convolutional layers followed by fully-connected layers, ended with a classification softmax layer. Adapting a classification CNN architecture to regression consists in removing the softmax layer and replacing it by a fully connected regression layer with linear or sigmoid activation. Linear activation means that the transfer function is a straight line, thus the activation is proportional to input, and not confined to a specific range.

\subsection{Model architectures}

We have experimented with four deep models with varying numbers of parameters and depths: two custom models and two common architectures. We have considered two simple models inspired by the base regressor of \cite{Dubost2019}: a first model called CNN\_263K, with around 263K parameters, and a second one called CNN\_1M, which has around 1M parameters (see Fig.~\ref{fig:model}). We also experimented with VGG16 (+14M parameters) \cite{vgg2014} and Resnet50 (+23M parameters) \cite{resnet2016} pre-trained on ImageNet, and subsequently trained on our dataset. In each model, the fully connected regression layer has linear activation.

\begin{figure}
\centering
%\setlength{\lineskip}{1ex}% increase spacing
\subfigure[Regression CNN\_1M]{\includegraphics[width=.40\textwidth]{fig/model_1m.pdf}}%
%\hspace{.1\textwidth}%
\subfigure[Regression CNN\_263K]{\includegraphics[width=.40\textwidth]{fig/model_263k.pdf}}%
%\hspace{.1\textwidth}%
\caption{Architectures of the custom CNN based regression models}
\label{fig:model}
\end{figure}

\subsection{Regression loss function}
 Conventional regression loss functions are metrics-inspired losses, namely the Mean Absolute Error (MAE), Mean Squared Error (MSE) and Huber Loss (HL), defined as:
\begin{equation}
    MAE =\frac{1}{n} \sum_{i=1}^n|p_i-g_i|
\end{equation}

\begin{equation}
    MSE = \frac{1}{n} \sum_{i=1}^n(p_i-g_i)^2
\end{equation}

\begin{equation}
HL = \frac{1}{n} \sum_{i=1}^n
\begin{cases}
\frac{1}{2}(p_i-g_i)^2, & \text{if } |p_i-g_i|<\delta \\
\delta\left(|p_i-g_i|-\frac{\delta}{2}\right), & \text{otherwise}
\end{cases}
\label{eq:hl}
\end{equation}
where predicted (resp. ground truth) values are denoted $p_i$ (resp. $g_i$). Huber loss is less sensitive to outliers than the quadratic loss \cite{2019hlnovel}. As there is no heuristic to choose one loss over the others, we experiment with these three loss functions, as advocated in \cite{2019deepreg}.

\section{Experiments}

\subsection{The HC18 dataset}
We use the HC18 \textit{training} dataset \cite{hcdataset}, which contains 999 US images acquired at varying times during the pregnancy, along with the corresponding head circumference\footnote{The HC18 challenge is rather dedicated to head segmentation and evaluation on the HC18 \textit{test} set requires submitting the parameters of an ellipse, which we do not have in our case.}. HC values range from 439.1 pixels (44.3 mm) to 1786.5 pixels (346.4 mm), with average value being 1263.3$\pm$264.4 pixels (174.4$\pm$65.2 mm). We randomly split the dataset into a training (600), a validation (200) and a test set (199), except for the images that were made during one echographic examination, which are assigned to the same set. We augment the data of the training set to 1800 images, by performing horizontal flipping, translation with 5 pixels offset, and rotation with 10 degrees.

Image preprocessing includes a resizing from 800$\times$540 pixels to 224$\times$224, and normalization by subtracting the mean and dividing by the standard deviation. The HC values are normalized by dividing by the maximum value of HC, in order to improve convergence.

\subsection{Experimental setup}

All the experiments are performed with 5-fold cross validation. The metrics to evaluate the results are the Mean Absolute Error (mae), measured in pixels and in mm, and the percentage of mae (pmae). We have empirically set $\delta=0.5$ in the Huber loss. Models are trained with a batch size of 8, a learning rate of $10^{-3}$, and Adam as optimizer. Models are implemented with Keras and TensorFlow.

\subsection{Results}

\begin{table}[h]
\centering
\caption{Performance of regression models in terms of mean absolute error (mae) in pixels and \%mae ($\pm$ standard deviation) for three different loss functions: MSE, MAE, HL}
\label{tab:light}
\scalebox{0.75}{\begin{tabular}{@{}lllllllll@{}}
\toprule
\multicolumn{1}{c}{} & \multicolumn{2}{c}{CNN\_263K}& \multicolumn{2}{c}{CNN\_1M} & \multicolumn{2}{l}{Reg-VGG16} & \multicolumn{2}{l}{Reg-ResNet50} \\ \midrule
loss & mae(pix) & pmae(\%) & mae(pix)& pmae(\%) & mae(pix) & pmae(\%) & mae (pix) & pmae(\%) \\\midrule
MSE& 90.18$\pm$86.42 & 8.74$\pm$12.51 & 50.96$\pm$58.61 & 4.96$\pm$7.85&               38.85$\pm$40.31 & 5.31$\pm$5.63 & 36.21$\pm$35.82 & 4.62$\pm$4.27   \\
MAE& 101.85$\pm$108.51&10.99$\pm$18.48&51.61$\pm$59.96&5.15$\pm$8.66&  40.17$\pm$40.99&5.26$\pm$5.79&37.34$\pm$37.46&4.85$\pm$4.93      \\
HL&  98.18$\pm$89.77&9.69$\pm$13.9&53.87$\pm$66.46&5.45$\pm$9.08  &      40.7$\pm$40.07&5.67$\pm$5.19&38.18$\pm$37.32&5.16$\pm$4.84       \\ \bottomrule
\end{tabular}}
\end{table}


\begin{table}[h]
\centering
\caption{Performance of Reg-Resnet50 vs Reg-VGG16 in terms of mae (pixels) ($\pm$ standard deviation) with and without data augmentation (DA). }
\label{tab-noDA}
\scalebox{0.8}{
\begin{tabular}{@{}ccccc@{}}
\toprule
& \multicolumn{2}{c}{Reg-Resnet50} & \multicolumn{2}{c}{Reg-VGG16} \\
\midrule
loss & without DA & with DA & without DA & with DA  \\
\midrule
MSE & 63.92$\pm$63.61 & 36.21$\pm$35.82& 66.84$\pm$67.48 & 38.85$\pm$40.31\\
MAE & 62.44$\pm$63.63 & 37.34$\pm$37.46  & 67.71$\pm$68.03 &40.17$\pm$40.99 \\
HL & 66.62$\pm$66.18 & 38.18$\pm$37.32  & 67.02$\pm$76.08 &  40.7$\pm$40.07\\
\bottomrule
\end{tabular}}
\end{table}

\begin{table}[h]
\centering
\caption{Performance of Reg-Resnet50 vs Reg-VGG16 in terms of mae (pixels and mm) for three different loss functions: MAE, MSE, HL with data augmentation. $^{\dagger}$: significantly different (p $<$ 0.05) from all other methods, $^\circ$: significantly different (p$<$0.05) from all other methods, except for Reg-VGG16-MAE and Reg-VGG16-HL.}
\label{tab:DA}
\scalebox{0.8}{
\begin{tabular}{@{}ccccc@{}}
\toprule
& \multicolumn{2}{c}{Reg Resnet50} & \multicolumn{2}{c}{Reg VGG16} \\
\midrule
loss & mae (pixels) & mae (mm) & mae (pixels) & mae (mm) \\
\midrule
MSE & 36.21$\pm$35.82$^{\dagger}$  & 4.52$\pm$4.27$^{\dagger}$  & 38.85$\pm$40.31 & 4.87$\pm$5.81 \\
MAE & 37.34$\pm$37.46 & 4.78$\pm$4.41 & 40.17$\pm$40.99$^{\circ}$ & 5.46$\pm$5.99$^{\circ}$ \\
HL & 38.18$\pm$37.32 & 4.68$\pm$4.37 & 40.7$\pm$40.07$^{\circ}$ & 5.19$\pm$5.42$^{\circ}$ \\
\bottomrule
\end{tabular}}
\end{table}

\begin{figure}[t]
\centering
%\setlength{\lineskip}{1ex}% increase spacing
\subfigure[Reg-VGG16-MSE]{\includegraphics[width=.5\textwidth]{fig/mse_VGG16.png}}%
%\hspace{.1\textwidth}%
\subfigure[Reg-ResNet50-MSE]{\includegraphics[width=.5\textwidth]{fig/mse_resnet.png}}%
\caption{Training and validation losses for Reg-VGG16 and ResNet50 with MSE}
\label{fig:losscurve}
\end{figure}

\begin{figure}[t]
    \centering
    \includegraphics[width=12cm]{fig/good.png}
    \caption{Samples of well predicted HC values (in pixels) with corresponding US images, with Reg-Resnet50-MSE}
    \label{fig:good}
\end{figure}

\begin{figure}[t]
    \centering
    \includegraphics[width=12cm]{fig/bad.png}
    \caption{Samples of incorrectly predicted HC values (in pixels) with corresponding US images, with Reg-Resnet50-MSE}
    \label{fig:bad}
\end{figure}

In our experiments, we compare the four regression models and the three loss functions, the Mean Absolute Error (MAE), the  Mean Squared Error (MSE) and the Huber Loss (HL), and assess the added value of data augmentation.

Results in Tab. \ref{tab:light} show that the MSE loss obtains the best results, while the MAE and Huber loss have similar accuracy. Best results are obtained with Reg-VGG16 and Reg-ResNet50, which argues for a deeper architecture, with more power to grasp the image features. Reg-ResNet50 with MSE is found particularly powerful, as confirmed by the loss evolution during training and validation in Fig. \ref{fig:losscurve}.

Thus, in the rest of the experiments, we focus on Reg-VGG16 and Reg-Resnet50 only. First, looking at the contribution of data augmentation, we can gather from Tab. \ref{tab-noDA} that data augmentation is really necessary to get a boost in performance, the error being divided by almost 2 with data augmentation. Then, we use a paired Wilcoxon signed-rank test to evaluate if the differences between methods and regression losses are significant: it appears that Reg-Resnet50 with the MSE loss has a significantly different error  (p $<$ 0.05) than the rest of the methods, and is thus the best setting in this case, as shown in Table \ref{tab:DA}, where errors are reported in mm also. The best configuration thus has an error of 4.52$\pm$4.27 mm. This value is to be compared to the accuracy obtained by segmentation-based approaches: 2.12 $\pm$ 1.8 mm in \cite{2019fetalseg}, 2.8  $\pm$ 3.3 mm in \cite{HCchallenge} and 1.81 $\pm$ 1.6 mm in \cite{HC2019confident}. One should handle this comparison with care, since results have not been obtained on the same dataset and/or using the same protocol; however, we can say that the error obtained by the CNN regressor is doubled w.r.t that of segmentation-based approaches. Furthermore, standard deviation is high and remains to be investigated and compared to segmentation-based approaches.

The analysis of the prediction correctness w.r.t.\ the images shows that correct predictions mainly stem from low-speckle and highly contrasted US images, where the skull is rather correctly outlined, as shown in Figure~\ref{fig:good}, whereas images with a high level of speckle inside and outside the skull, that include other structures, yield high errors (Figure~\ref{fig:bad}).


\section{Conclusion}
In this work, we have proposed an approach to directly estimate the fetal head circumference from US images by regression CNN. Our goal was to estimate how far a direct estimation of the HC via regression is from conventional prediction methods, which are based on segmentation and ellipse fitting. The rationale behind our approach is to remove the need for segmenting the US image. We compared several regression CNN architectures and three loss functions. Experimental results showed that the deeper model Reg-ResNet50 performed better, along with the MSE loss function. Encouraging results are obtained, since the best model results in an error comparable to manual measurement variability; however, room for improvement for CNN-based regressors is left, especially when comparing to the accuracy of segmentation-based approaches.
Future work will focus on designing the network so that the feature extraction is fostered in a way to segment the image - without segmentation ground truth. For this, we will investigate attention mechanisms and multi-task learning. We will also investigate whether errors are related to gestational age, as is the case for manual measurements \cite{sarris2012intra}.


% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{The authors would like to thank the China Scholarship Council (CSC) for supporting Jing Zhang and acknowledge the CRIANN (Centre des Ressources Informatiques et Applications Num\'erique de Normandie, France) for providing computational resources.}

%\bibliographystyle{myplainnat.bst}
\bibliography{zhang2020}

\end{document}