\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% Header for extended abstracts
\jmlrproceedings{MIDL 2020}{Medical Imaging with Deep Learning 2020}
\jmlrvolume{}
\jmlryear{}
\jmlrworkshop{MIDL 2020 -- Short Paper}
\editors{}

% to be uncommented for submissions under review
%\jmlrworkshop{Short Paper -- MIDL 2020 submission}
%\editors{Under Review for MIDL 2020}

\title[Using Generative Models for Pediatric wbMRI]{Using Generative Models for Pediatric wbMRI}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}
% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Alex Chang\midljointauthortext{Contributed equally}\nametag{$^{1,2,3}$}} \Email{changa47@cs.toronto.edu}\\
\Name{Vinith M. Suriyakumar\midlotherjointauthor\nametag{$^{1,2,3}$}} \Email{vinith@cs.toronto.edu}\\
\Name{Abhishek Moturu\midlotherjointauthor\nametag{$^{1,2,3}$}} \Email{moturuab@cs.toronto.edu}\\
\Name{Nipaporn Tewattanarat\nametag{$^{3}$}} \Email{nipaporn.tewattanarat@sickkids.ca}\\
\Name{Andrea Doria\nametag{$^{3}$}} \Email{andrea.doria@sickkids.ca}\\
\Name{Anna Goldenberg\nametag{$^{1,2,3}$}} \Email{anna.goldenberg@utoronto.ca}\vspace{1em}\\
\addr $^{1}$ Department of Computer Science, University of Toronto, Toronto, Canada \\
\addr $^{2}$ Vector Institute, University of Toronto, Toronto, Canada \\
\addr $^{3}$ The Hospital for Sick Children, Toronto, Canada}

\begin{document}

% how it fits in current literature
% rigour
% references

\maketitle

\begin{abstract}
Early detection of cancer is key to a good prognosis and requires frequent testing, especially in pediatrics. Whole-body magnetic resonance imaging (wbMRI) is an essential part of several well-established screening protocols, with screening starting in early childhood. To date, machine learning (ML) has been used on wbMRI images to stage adult cancer patients. It is not possible to use such tools in pediatrics due to the changing bone signal throughout growth, the difficulty of obtaining these images in young children due to movement and limited compliance, and the rarity of positive cases. We evaluate the quality of wbMRI images generated using generative adversarial networks (GANs) trained on wbMRI data from The Hospital for Sick Children in Toronto. We use the Fréchet Inception Distance (FID) metric, Domain Fréchet Distance (DFD), and blind tests with a radiology fellow for evaluation. We demonstrate that StyleGAN2 provides the best performance in generating wbMRI images with respect to DFD and the radiologist blind test, whereas FID is inconsistent with the visual quality of the samples in this domain.
\end{abstract}

\begin{keywords}
machine learning, generative models, cancer detection, MRI, whole body MRI
\end{keywords}

% \iffalse
\section{Introduction}

Whole-body magnetic resonance imaging (wbMRI) is an essential part of well-established cancer screening protocols \cite{Villani:2016}. These protocols were shown to improve early detection of cancer for both adult \cite{attariwala2013whole} and pediatric \cite{greer2017pediatric} patients. Machine learning methods have been successfully applied in staging adult cancer patients from wbMRIs \cite{lavdas2019machine}. The same task is much more challenging for pediatric patients due to  i) varying bone signals during growth, ii) the movement and limited compliance of young children during imaging, and iii) the rarity of positive cases. The lack of training data suggests the need for alternatives to standard CNN-based \cite{10.1371/journal.pmed.1002699} approaches or augmentation-based detectors.

Generative models, such as generative adversarial networks (GANs), have shown promise in anomaly detection in numerous medical imaging applications \cite{yi2019generative}. Given the need for an automated pediatric wbMRI cancer screening tool, we set out to study different generative models for the primary task of generating pediatric wbMRIs. We limited our study to evaluating the generation of images. The quality of the generated images can be seen as a measure of how well the model has captured the underlying data distribution which is essential to the eventual downstream task of cancer screening (anomaly detection). We applied these models to 360 wbMRI slices from The Hospital for Sick Children in Toronto. We trained multiple GAN architectures and used Fr\'echet Inception Distance (FID), Domain Fréchet Distance (DFD), and radiology blind tests to evaluate the image quality of each model. 

We demonstrate that StyleGAN2 generates the best quality images and that DFD is a promising metric to compare image quality. We also demonstrate our preliminary results on the task of anomaly detection. Our analyses characterize the use of generative models for medical image generation and potential downstream tasks such as anomaly (cancer) detection, contributing to the much needed advances in pediatric medical imaging.

\section{Methods}

Our dataset is comprised of 90 de-identified healthy patients from a pediatric hospital, including males and females of ages 4 to 18. Four middle anatomically similar slices were selected from each volume. Each slice was preprocessed using N4ITK bias field correction \cite{tustison2010n4itk}, contrast-limited adaptive equalization \cite{kaur2016mri}, and noise reduction \cite{senthilkumaran2014histogram}. We cropped and padded the images to be a uniform size of 800 $\times$ 256 to register the position of different patients.

We trained (Appendix A) four different generative models: DCGAN \cite{radford2015unsupervised}, StyleGAN \cite{DBLP:journals/corr/abs-1812-04948}, StyleGAN with progressive training (PGStyleGAN) \cite{DBLP:journals/corr/abs-1710-10196}, and StyleGAN2 \cite{karras2019analyzing}. For evaluation, we measured the FID \cite{DBLP:journals/corr/HeuselRUNKH17} and the DFD in the feature space of a Variational Autoencoder (VAE) trained on the same dataset according to \cite{1803.07474}. For our blind tests with our radiology fellow, we randomly chose 10 real images and 10 generated images from each model. We then showed the radiologist each of the images in random order asking them to classify the image as real or fake (generated).

Finally, we performed anomaly detection using a GAN trained with healthy images \cite{1703.05921}. With a query image, we find the closest generated image and subtract the two images to provide areas of high disease probability (\figureref{fig:example2}). Cancer tumours are simulated by generating a set of circles around a point on the image with varying pixel intensities and radii. For future work, we are working on acquiring and using real cancer images instead of simulating tumours. We compared the accuracy of our anomaly detection to watershed segmentation \cite{mustaqeem2012efficient}, which is traditionally used in low data settings as its performance is agnostic to data amount. This method is not the state of the art in classical image segmentation, but it is a commonly used method that is low resource and fast, which is why we selected it.

\section{Results}

\paragraph{Generated Image Quality.} 
\figureref{fig:example} shows samples from StyleGAN2 have the highest visual quality, which is supported by the error rate in classification by our radiologist in \tableref{tab:example}. The radiologist was able to detect that most images were fake across all of the chosen architectures, most commonly due to artifacts generated by the model which would not be present in real images. Furthermore, we observed that StyleGAN2 generates more diverse samples and does not suffer as much from mode collapse compared to other approaches.

\paragraph{Domain Fréchet Distance Metric.}
We observed that the FID metric is inconsistent with the visual quality of samples for this domain since StyleGAN2 should have the lowest FID (see \tableref{tab:example} and \figureref{fig:example}). We hypothesize the reason to be that our wbMRI images are very different from natural images used to train Inception v3. The DFD in the VAE feature space successfully captures the order of model performance for the same dataset. 

\paragraph{Anomaly detection.}
\figureref{fig:example2}A shows a proof-of-concept of the anomaly detection method proposed by \cite{1703.05921} for wbMRI. Since the GAN is only trained using healthy images, by finding the closest image in the generative distribution, we can highlight anomalous areas in a diseased query. We demonstrate that our GAN outperforms the classic watershed segmentation in \figureref{fig:example2}B.

%FID is computed using Inception v3 \cite{DBLP:journals/corr/SzegedyVISW15} feature space statistics and is commonly used in the literature to assess the quality and diversity of GAN samples.

\section{Conclusion}

In this paper, we demonstrate that state-of-the-art GANs are able to generate pediatric wbMRIs needed to enable automated cancer detection. In particular, samples generated using the StyleGAN2 architecture had high enough visual fidelity that our radiologist classified some of them (30\%) as real. We also demonstrate that the FID metric used in the GAN literature is inappropriate for this domain and that DFD is a promising alternative. Finally, we show a downstream task of anomaly detection, using the GAN trained on healthy images to detect cancerous lesions, which may mitigate the need for scarce examples of wbMRIs with cancer.

\begin{table}[h]
\floatconts
  {tab:example}%
  {\caption{\small{FID and DFD scores (with VAE Features) along with the false positive rate for \\the radiologist blind test for each of the GAN architectures.}}}%
    {\begin{tabular}{llll}
  \bfseries Model & \bfseries FID & \bfseries DFD & \bfseries Radiologist False Positive Rate\\ 
  DCGAN & 457.30  & 23.72 & 0\%\\
  StyleGAN & 481.3 & 19.378 & 0\%\\
  PGStyleGAN & 442.61 & 18.56 & 20\%\\
  StyleGAN2 & 497.09 & 17.234 & 30\%\\
  \end{tabular}}
\end{table}

\begin{figure}[h]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example}%
  {\caption{\small{From left to right, two images generated by each of the following GAN architectures: DCGAN, StyleGAN, PGStyleGAN, and StyleGAN2.}}}%
  {\begin{center}
      {\includegraphics[width=0.104\linewidth]{dc1.png}\hspace{-0.15em}
  \includegraphics[width=0.104\linewidth]{dc2.png}\hspace{0.47em}
  \includegraphics[width=0.104\linewidth]{style1.png}\hspace{-0.15em}
  \includegraphics[width=0.104\linewidth]{style2.png}\hspace{0.47em}
  \includegraphics[width=0.104\linewidth]{stylepg1.png}\hspace{-0.15em}
  \includegraphics[width=0.104\linewidth]{stylepg2.png}\hspace{0.47em}
  \includegraphics[width=0.104\linewidth]{style21.png}\hspace{-0.15em}
  \includegraphics[width=0.104\linewidth]{style22.png}\vspace{-2em}
  }
  \end{center}}
  
 
\end{figure}

\begin{figure}[h]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
 \floatconts
 {fig:example2}%
 {\caption{\vspace{-2em}\small{(A): (i) query wbMRI; (ii) with imputed tumour; (iii) the nearest image in the generator's modeled distribution; (iv) the absolute difference between (ii) and (iii). (B): Accuracy of our GAN anomaly detection vs the watershed segmentation as a function of simulated tumours' pixel intensities (top) and tumour radii (bottom).}}}%
  {\begin{center}
      \includegraphics[width=0.84\linewidth]{anom_det.png}
  \end{center}}
  
%  \begin{verbatim}
%                    (A)                        
%     (B)
%  \end{verbatim}
  
\end{figure}
% \begin{figure}[htbp] 
%  % Caption and label go in the first argument and the figure contents
%  % go in the second argument
% \floatconts
%   {fig:example}
%   {\caption{AUC Curve for nodule intensities?}}
%   {\includegraphics[width=0.5\linewidth]{example-image}}
% \end{figure}
\clearpage
\midlacknowledgments{We acknowledge the support of the Natural Sciences and Engineering Research Council of Canada (NSERC) and The Mark Foundation for Cancer Research. Resources used in preparing this research were provided, in part, by the Province of Ontario, the Government of Canada through CIFAR, and companies sponsoring the Vector Institute www.vectorinstitute.ai/\#partners.}
\bibliography{midl-shortpaper}

\newpage
\appendix
\section{GAN Training Settings}
\begin{table}[htbp]
 % The first argument is the label.
 % The caption goes in the second argument, and the table contents
 % go in the third argument.
\floatconts
  {tab:example3}%
  {\caption{Hyperparameters used during GAN training}}%
  {\begin{tabular}{llll}
  \bfseries Model   & \bfseries $\alpha$    & \bfseries Batch Size    & \bfseries Instance Noise steps \\ 
  DCGAN             & 0.001                 & 30                    & 10K   \\
  StyleGAN          & 0.001                 & 12                    & 10K   \\
  PGStyleGAN        & 0.001                 & 360,360,180,60,30,15*  & 10K**  \\
  StyleGAN2         & 0.002                 & 12                    & 0     \\
  VAE               & 0.001                 & 45                    & N/A     \\
  \end{tabular}}
  
\end{table} 
* Batch size at each progressive growth step between 1 and 6 respectively. \newline
** After the last layer has been fully grown.

In all models, a similar architecture skeleton is used to upsample noise (512) to the image resolution ($800 \times 256$). The Generator first upsamples noise to 512 feature maps of size $25 \times 8$ with a fully connected layer. Next, 5 convolutional blocks, each consisting of two convolutional layers (with $3\times3$ filters and stride 1) and an upsampling layer (by bilinear interpolation) in between, are used to double the width and height of the feature maps. The number of feature maps is also halved in the last 3 blocks. The result of dimension $64 \times 800 \times 256$ is passed to one last convolutional layer to obtain the grayscale image. The discriminator is almost a mirror of the generator; it obtains intermediate feature maps of the same dimension with similar convolutional blocks, but downsamples the width and height with a convolutional stride of 2. Two fully connected layers of size 512 and an output layer are added at the end. The remaining hyperparameters and training details are inherited from the original StyleGAN paper.

For the training of DCGAN, StyleGAN and PGStyleGAN, Gaussian noise with $\sigma=0.2$ is independently added to each pixel in both real and fake images and $\sigma$ is linearly reduced to 0 in the number of steps indicated in Table 2. In the training of all models, a latent dimension size of 512 is used to sample Gaussian noise.

\iffalse
\fi

\end{document}
