\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
%\jmlrvolume{-- Under Review}
%\editors{Under Review for MIDL 2020}

\title[4D CMR Image Synthesis on XCAT]{4D Semantic Cardiac Magnetic Resonance Image Synthesis on XCAT Anatomical Model}
% sample title:From Simulated to Synthetic Cardiovascular Magnetic Resonance Imaging to Real Application 

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
% \midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
% \Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
% \Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4
% }

\midlauthor{\Name{Samaneh Abbasi-Sureshjani\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{s.abbasi@tue.nl}\\
\Name{Sina Amirrajab\midlotherjointauthor\nametag{$^{1}$}} \Email{s.amirrajab@tue.nl} \\
\addr $^{1}$ Biomedical Engineering Department, Eindhoven University of Technology, Eindhoven, The Netherlands \\
 \AND
\Name{Cristian Lorenz\nametag{$^{2}$}} \Email{cristian.lorenz@philips.com}\\
\Name{Juergen Weese\nametag{$^{2}$}} \Email{juergen.weese@philips.com}\\
\addr $^{2}$ Philips Research Laboratories, Hamburg, Germany \\
\AND 
\Name{Josien Pluim\nametag{$^{1}$}} \Email{j.pluim@tue.nl}\\
\Name{Marcel Breeuwer\nametag{$^{1,3}$}} \Email{m.breeuwer@tue.nl}\\
\addr $^{3}$ Philips Healthcare, MR R\&D - Clinical Science, Best, The Netherlands
}

\usepackage{xcolor}
% \setlength\intextsep{0pt}
% \setlength\belowcaptionskip{0pt}

\begin{document}

\maketitle

\begin{abstract}
% One big challenge in the medical research community is the limited number of accurately annotated datasets for various conditions. Previous works tried to address this challenge by data-driven image synthesis approaches, which are still limited due to lack of anatomically meaningful labeled images in 3D or 4D. Other approaches such as model-based simulation also result in images that are far from desired realism in terms of
% image appearance, tissue texture, and surrounding organs. In this paper, we address these limitations by proposing a hybrid method for the synthesis of controllable labeled 4D medical images reflecting both motion and volumetric changes. We transfer the real image characteristics to mechanistic
% controllable anatomical ground truth via a data-driven generative model that preserves the semantic information of ground truth. Our anatomical ground truth is based on the 4D eXtended  CArdiac  Torso (XCAT) heart model, which offers great flexibility in generating virtual populations with different anatomical variability and spatio-temporal resolution. We show it is possible to learn and transfer both spatial and non-spatial image characteristics from existing real datasets that might differ depending on image modality, vendor and acquisition site. Our proposed 4D labeled synthetic XCAT dataset has a great potential to be used in various supervised multi-modal applications in medical image analysis such as data augmentation and segmentation. 
We propose a hybrid controllable image generation method to synthesize anatomically meaningful 3D+t labeled Cardiac Magnetic Resonance (CMR) images. Our hybrid method takes the mechanistic 4D eXtended  CArdiac  Torso (XCAT) heart model as the anatomical ground truth and synthesizes CMR images via a data-driven Generative Adversarial Network (GAN). We employ the state-of-the-art SPatially Adaptive De-normalization (SPADE) technique for conditional image synthesis to preserve the semantic spatial information of ground truth anatomy. Using the parameterized motion model of the XCAT heart, we generate labels for 25 time frames of the heart for one cardiac cycle at 18 locations for the short axis view. Subsequently, realistic images are generated from these labels, with modality-specific features that are learned from real CMR image data. We demonstrate that style transfer from another cardiac image can be accomplished by using a style encoder network. 
% New anatomically plausible, controllable, 4D labeled CMR images can be synthesized by SPADE GAN on the XCAT heart. 
Due to the flexibility of XCAT in creating new heart models, this approach can result in a realistic virtual population to address different challenges the medical image analysis research community is facing such as expensive data collection. 
Our proposed method has great potential to synthesize 4D controllable CMR images with annotations and adaptable styles to be used in various supervised multi-site, multi-vendor applications in medical image analysis.

\end{abstract}

\begin{keywords}
4D semantic image synthesis, cardiac magnetic resonance imaging, XCAT phantom, generative adversarial network, SPADE GAN
\end{keywords}

\section{Introduction}
\textbf{Medical image synthesis and simulation} have considerably transformed the way we develop, optimize, assess and validate new image analysis and reconstruction algorithms. They address several issues the medical research community is facing, such as the lack of proper annotated data, clinical privacy and data-sharing policies, and high data acquisition costs.
%Defining concepts of mechanistic (hypothesis-driven) physics-based simulation and phenomenological (data-driven) image synthesis for the medical imaging community, the special editorial by \cite{frangi2018simulation} highlights the synergistic commonality, shared challenges, advantages and disadvantages of either image generation approaches.
\cite{frangi2018simulation} highlights the synergistic commonality, shared challenges, advantages and disadvantages of both (hypothesis-driven) physics-based simulation and phenomenological (data-driven) image synthesis for the medical imaging community.
We can perform fully controllable experiments on the computer by mechanistic simulations grounded on implementing principles of physics-based medical imaging algorithms and benefiting from defined computerized anatomical and physiological human body models. Without doubt, an accurate in-silico human anatomy plays a crucial role in this approach. The well-known four-dimensional (4D) eXtended CArdiac Torso (XCAT) \cite{segars20104d} computerized whole body models are arguably one of the most comprehensive digital models covering a vast series of phantoms of varying ages from newborn to adult, each comprising parameterised models for cardiac and respiratory motion \cite{segars2013population}.% The 4D XCAT phantoms have gained a lot of attention in the biomedical imaging community~\cite{segars2017application} by offering great flexibility in generating virtual populations with different anatomical variability, body size, structural organ geometry and location, and spatio-temporal resolution. %The recent review paper by \cite{segars2017application} summarises various tools and applications of these phantoms developed over the past decade for biomedical imaging community and beyond.

More recently, with the increasing availability of big data combined with advances in both computational power and artificial intelligence, phenomenological data-driven methods for generating synthetic data have grown exponentially.
% Generative adversarial networks (GANs) introduced by \cite{GANs_Goodfellow2014} are widely used for various applications in computer vision including medical image analysis~\cite{ReviewGAN}. 
Significant improvements in Generative Adversarial Networks (GANs)~\cite{GANs_Goodfellow2014} have addressed the challenge of synthesizing images with realistic and coherent spatial and non-spatial properties~\cite{bigbigan,park2019SPADE}. However, the applications of synthetic images are still limited, because the synthetic data (sampled from learned distributions) are often limited by the number and quality of existing datasets. The limited number of anatomically meaningful annotated images makes it difficult to generate high-dimensional data reflecting both motion and volumetric changes.

% In this paper, we propose a hybrid image generation method which combines both mechanistic accurate anatomical model with data-driven generative model to generate controllable 4D annotated cardiac images. We use both the image simulation and synthesis approaches to bridge the gap of unlabeled data generation by mapping the real image appearance to mechanistic controllable anatomical ground truth. 
In this paper, we propose a hybrid approach to  bridge the gap between simulated and real datasets by mapping the real image appearance to mechanistic controllable anatomical ground truth via a data-driven generative model.
We synthesize 3D+t controlled Cardiac Magnetic Resonance (CMR) images using the XCAT heart model. %These images together with what we call \emph{true ground truth} labels can be used for many applications in medical image analysis. 
The accurate underlying anatomical model (what we call \emph{true ground truth}) is preserved while modality-specific texture and style are transferred from real images. This approach makes it possible to transfer the information from any domain (i.e., image modality or vendor) to its corresponding anatomical model and create realistic labeled sets to be used in various supervised applications. To the best of our knowledge, this is the first work to synthesize 4D semantically and anatomically meaningful images with controllable ground truths, which is of great importance for tackling the issue of limited labeled data when developing deep learning methods for the medical imaging community.

% The remainder of the paper is structured as follows. We begin with a brief overview of the state-of-the-art medical image synthesis and simulation techniques, their applications and limitations in  Section~\ref{sec:relatedwork}. Then in Section~\ref{sec:method}, we describe our proposed methodology in detail, followed by the results in Section~\ref{sec:results}. In the end, the results are discussed and paper is concluded in Section~\ref{sec:conclusion}.

\section{Related Work}
\label{sec:relatedwork} 
% In recent years, there have been significant improvements in image synthesis techniques using GANs. 
\textbf{Data-driven image synthesis} by GANs has improved significantly in computer vision lately.
% GANs despite other deep generative networks such as 
% %auto-regressive models (e.g.~\cite{PixelCNN++}) or
% Variational Auto-Encoders (VAE) \cite{VAEsWelling}, do not need any explicit assumptions about the data, posterior or prior distributions.
% %  i.e., no slow and non-accurate sampling technique such as Markov chain~\cite{MarkovChain} is required.
% %  Moreover, GANs unlike the 
% %  Variational autoencoders (VAE)~\cite{VAEs2018} need no assumptions on posterior and prior distributions. 
%  Such assumptions still do not guarantee good results, and GANs are capable of generating much sharper and higher quality images. 
%  Moreover, sampling from the generated distributions is possible by one single parallelizable, accurate and fast feed-forward network.
%  from the latent vector rather than slow and non-accurate sampling techniques. Even though GANs suffer more from mode-collapse and a less stable training. 
% The high quality synthetic images generated by GANs 
% %These realistic synthetic images 
%  can be used in data augmentation, cross-domain image translation or domain adaptation~\cite{StyleCardiacSeg_MICCAI19,2019Chen_Munit_Style}.  
In conditional image synthesis approaches, certain input data are used as the input of the generator to provide more semantic information for the image generation~\cite{huang2018munit,DRIT_plus,wang2018pix2pixHD,park2019SPADE}.
% The image synthesis can be either unconditional (generation directly from a random noise input)~\cite{bigbigan,StyleGAN2018}, or conditional where some certain input data is used as the input of the generator~\cite{huang2018munit,DRIT_plus,wang2018pix2pixHD,park2019SPADE}. In a conditional setting, meaningful inputs provide more information to the generator.
However, one of the challenges is that the semantic information and spatial relations of different classes might get removed in the stacks of convolution, normalization and non-linearity layers. 
% Having realistic data and preserving the semantic information is important especially for synthetic medical images that are going to be used for improving the techniques used in clinics  at the end. 
% Various approaches have been proposed to force this semantic-consistency by deploying segmentation masks in addition to appropriate normalization layers~\cite{wang2018pix2pixHD,park2019SPADE}. 
The state-of-the-art conditional GAN by~\cite{park2019SPADE} deploys the segmentation masks in novel SPatially-Adaptive (DE)normalization (SPADE) layers which, unlike other normalization techniques, prevent the loss of semantic information.%the network to remove the semantic information in the stacks of convolution, normalization and non-linearity layers.

% This is very helpful especially in medical image analysis since data annotations are scarce, the datasets are highly imbalanced and often have a small size.
% Lately, image synthesis is used in several medical image analysis works to deal with the datasets that often have scarce annotations or are highly imbalanced.
Recent image synthesis approaches in the medical imaging community mainly focus on the idea of disentangling the spatial anatomical information (often called content) from the non-spatial modality-specific features (called style). For instance, the works by~\cite{2019Chen_Munit_Style,StyleCardiacSeg_MICCAI19}
proposed to mix the contents of a known domain (with available segmentation masks) with the styles learned from a new domain. 
%Content of the new stylized images matches the available segmentation masks, and their style represents imaging properties of the new domain. Thus 
These new labeled synthetic images
can help in adapting the segmentation networks to the new domain. The style is either learned by a style encoder in a Variational Auto-Encoder (VAE)~\cite{VAEsWelling} setup
%that embeds images into a latent space with the standard Gaussian distribution~\cite{huang2018munit} 
or is manipulated via normalization layers affecting the statistics of the high-level image representations \cite{styleTransfer_2016_CVPR}. %derived from convolutional neural networks.
% transferred by changing the statistics of the high-level image representations derived from convolutional neural networks optimized for object recognition tasks~\cite{styleTransfer_2016_CVPR}. 
Other recent works such as~\cite{Agisilaos2018_factorized,CHARTSIAS2019_disentangled} proposed to factorize images into spatial anatomical and non-spatial modality representations by latent space factorization relying on the cycle-consistency principle. The anatomical factor is then used in a segmentation task.
% By matching only the content factor to the segmentation masks in a supervised or adversarial setting, they enforce the content to represent the anatomical properties. 
All these methods rely on existing labeled sets, which are both limited and not controllable. Recently, \cite{factorized_ETH} proposed to use unlabeled images by learning an anatomical model in a factorized representation learning setting. Even though segmentation masks are no longer needed, their learned multi-tissue anatomical model is not physiologically accurate and does not match actual organs. 

\textbf{Physics-based image simulation} can produce controllable images by combining the modality-specific principle of image formation with a rich anatomical model.
%, physics-based image simulator.
The image contrast is governed by known equations and can be altered by changing a set of parameters. These parameters, known as sequence parameters, are specific to the imaging protocol of the modality and, in combination with tissue-specific properties, determine the image contrast. In this branch of methods, \cite{tobon2011realistic} and \cite{wissmann2014mrxcat} investigate two types of approaches based on the XCAT phantom to simulate cardiac MR images. The image contrast for the former is calculated using a numerical Bloch solver~\cite{kwan1999mri}, while the latter benefits from the analytical solution of the Bloch equations available for the cardiac cine sequence protocol. Despite offering a lot of flexibility and control over the image generation process, simulated images are still far from the desired realism in terms of global image appearance, tissue texture, image artifacts, and surrounding organs. Furthermore, in order to create a visually familiar image appearance, large-scale optimization of sequence-specific and tissue-specific parameters is required. These limitations have hindered the progress of using simulated cardiac images for medical imaging applications.
    
Taking advantage of the biophysical motion model of the heart, a second branch of simulation methods generates more realistic images by warping already existing real images. This model-based image simulation highly depends on matching the time series of cardiac data to an electromechanical heart model~\cite{prakosa2012generation}. This method relies on registration, in which a real cardiac image is first segmented, and then deformed and warped according to the used motion model to generate a set of transformed time series of images. Differences between the motion estimated from real images and the simulated motion of the heart during the warping procedure can produce registration errors. Although many of these problems are solved in the new pipeline introduced by \cite{duchateau2017model}, this warping approach is bounded by the used images and cannot generate new appearances with variable contrast, surroundings and texture.

\textbf{The main contribution of this paper} lies in efficiently combining the controllable physics-driven XCAT anatomical model~\cite{segars20104d} with the data-driven SPADE-GAN model~\cite{park2019SPADE} in order to synthesize realistic-looking cardiac MR images. These images do not require expert annotation since the labels derived from the XCAT model serve as the ground truth segmentation map for the generated images. The spatial information provided by the XCAT model is anatomically and physiologically plausible, which enables the resulting images to be useful for the purpose of data augmentation. The ability to control both anatomical representation and style in cardiac image synthesis is considered one of the main advantages of our proposed technique compared to previous techniques. %We are able to introduce a wide range of accurate anatomical variations through the XCAT phantoms and transfers the modality-specific MR image characteristics to derived label maps through the SPADE-GAN to generate anatomically plausible images without any limitation.

% The main contribution of this paper lies in  using the controllable XCAT heart model together with a conditional image synthesis setting to address the challenges of generating realistic labelled CMR images in a large scale. We combine anatomically accurate spatial information (as content) and realistic imaging properties (as style) to generate images in 4D (volumetric in different times). Additionally, we show it is possible to transfer not only the domain-specific global style, but also the domain-specific spatial characteristics (such as surrounding organs) to our true ground truth set in a VAE setting. %, but also it is possible to synthesize images with domain-specific spatial characteristics such as surrounding organs in a VAE setting. Our focus is only on the heart labels in this work.

\section{Methodology}
\label{sec:method}
An overview of our method is shown in \figureref{fig:overview}\footnote{An animated version of our methodology is available here: \url{https://bit.ly/2Ggr61j}}. 
Our conditional image synthesis network is trained on real image data with their corresponding segmentation labels. We make use of the SPADE technique to preserve the anatomical content of the labels during image generation. At the inference time, we swap the used segmentation labels with our voxelized labels which are derived from the XCAT surface-based heart model. We use the flexibility of the XCAT motion model to make a set of 3D+t labels of the heart including only the classes provided by the real data. These new controlled labels are then used to synthesize new images. The details of conditional image synthesis network, image data for training and controllable 4D heart labels for inference are explained in the following. 
\begin{figure}[t]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:overview}
  {\caption{An overview of our method. In training (blue blocks), we use the ACDC images with their corresponding segmentation masks as inputs of the SPADE GAN. At inference (red blocks), we substitute the ACDC labels with our 4D voxelized XCAT labels created from the XCAT heart surface model to synthesize new images (4D synthetic XCAT).
%   Motion parameters in the XCAT program allows us to control heart motion and create different time frame across a desired cardiac cycle. 
  The rendered version for the XCAT heart surface model is shown for five time frames. 
  The 4D voxelized XCAT labels 
%   voxelized XCAT has 25 frames for one cardiac cycle with 1 sec duration and 18 slices for short axis view of the heart to make a 3D+t labels covering
cover the heart from apex through mid to base location for one cardiac cycle. %These labels are fed as the input of the SPADE GAN generator to synthesize images (4D synthetic XCAT). 
The same labels are used as the ground truth for the new synthetic images (4D labeled synthetic XCAT).}}
 {\includegraphics[width=0.85\linewidth]{images/Methodology3.pdf}}
\end{figure}

\textbf{Conditional image synthesis} in this work is based on the method proposed by~\cite{park2019SPADE}, which we call SPADE GAN. The architecture of the generator consists of a series of residual blocks with SPADE normalization, followed by nearest neighbor upsampling layers. During the normalization step,
% uses the segmentation mask to modulate the layer activations. In other words, 
the layer activations are initially normalized  to  zero  mean  and  unit  standard deviation in a channel-wise  manner  and  then  modulated  with  a learned  scale  and bias, which depend on the input segmentation mask and vary with respect to the location. The learned modulation parameters encode enough information about the label layout and are used in different resolutions across the generator. Therefore, they avoid the wash out of semantic information which often happens with other normalization layers such as instance normalization (IN). We also used the combination of an image encoder and the generator, and replaced the input noise with the encoded latent vector to form a VAE setup. We altered the architecture of the encoder compared to~\cite{park2019SPADE} by removing the IN layers. The encoder with IN is in charge of capturing only the global appearance of its input image, but by removing IN we allow the spatial information to be transferred as well. Then the generator's task is to combine the encoded (global and local) style and the content coming from the semantic segmentation mask to synthesize an image. This setup is useful in controlling the style of synthetic images and the reconstruction of the surrounding organs of the heart. The architecture of the discriminator, the losses and training settings are kept unchanged.

\textbf{The real dataset} used for training the network is the Automated Cardiac Diagnosis Challenge (ACDC) dataset~\cite{ACDC}. This dataset consists of Cine MR images of 100 patients.
% acquired in breath-hold with a retrospective or prospective gating and with an SSFP sequence in short axis orientation. 
The spatial resolution ranges from 1.37 to $1.68~\mathrm{mm}^2/\mathrm{pixel}$ and images cover the cardiac cycle completely or partially. In total, there are 100 end-systolic and 100 end-diastolic phase instances, with an average of 9 slices. The segmentation masks for left ventricle (LV) blood pool, LV myocardium, and right ventricle (RV) blood pool are available. We pre-process the data by resampling them to $1.3\times1.3$~mm in-plane resolution (fixed inter-slice resolution) and take a central crop of the images with $128\times128$ pixels. All the intensity values are scaled between $-1$ and 1. 
The SPADE GAN is trained on the entire 2D set of image-mask pairs of this dataset for 100 iterations, using the Adam optimizer with a learning rate of 0.0002 and a batch size of 32 on two NVIDIA TITAN Xp GPUs. 
We use the VAE setting with larger images ($256\times256$) for a better demonstration. 

\textbf{Controllable 4D heart model} is the key element of our method.
We employ the 3D+t NURBS-based surfaces of the XCAT heart model, which is anatomically based on 4D cardiac-gated multislice CT data and whose motion model is parameterized by tagged MRI data. To create an accurate 4D voxelized heart model, the XCAT program offers various parameters to control morphological (heart shape) and physiological (heart motion) features of the heart. These parameters include heart scaling factors in 3D; the length of the beating heart cycle; left ventricle volume at end-diastole, end-systole, and three intermediate phases; and cardiac cycle timing, which is the duration between different phases. We keep the geometrical scaling of the XCAT heart unchanged, set the length of the beating heart cycle to 1~s (60~heartbeats/min) and output 25 time frames along one heart cycle. Voxelization of surfaces can be done at any desired resolution. We voxelize at 1~mm isotropic in-plane resolution for 18 slices perpendicular to the long axis of the heart to form the short axis view of the heart, which shows the cross-section of the left and right ventricles.

Our main contribution comes at the inference time. We use our 4D voxelized XCAT labels (sets of 2D slices at different locations and times) as the inputs of the generator and synthesize their corresponding realistic images.  The synthetic slices reflect the accurate anatomical model with modality-specific texture and style. These new images together with the true ground truth create a new 4D synthetic XCAT dataset, which can be used in various applications. Results are presented and discussed in the next sections. 

% \begin{itemize}
    % \item Semantic Image Synthesis: Explain SPADE and how it's used in this paper (briefly). {\color{blue}
    %     $\Rightarrow$ Samaneh}
    % \item Simulated CMRI (XCAT): An explanation about the simulated data used during the training. {\color{blue} $\Rightarrow$ Sina}
    % \item ACDC dataset: A short explanation and the pre-processing steps used {\color{blue}
    %     $\Rightarrow$ Sina}
% \end{itemize}

\section{Results}
\label{sec:results}
First, we show the synthetic images when using the labels of the ACDC dataset as inputs of SPADE GAN. \figureref{fig:synth_slice_ED} shows different synthetic slices (from apex to base) for one subject of the ACDC dataset in the end-diastolic phase. Similar results for the end-systolic phase are depicted in Appendix~\ref{sec:appendix}, \figureref{fig:synth_slice_ES}. As seen in these figures, the synthetic images are coherent between slices even though the training is done on 2D slices. Moreover, the three classes of interest in the heart have been reconstructed reasonably well. There are some differences between the background tissues in real and synthetic images. This is because all different tissues in that region are mapped into one class in the label map (background shown by black in the label map). Thus the SPADE GAN is not able to preserve their spatial information. %as it does not have any clue for them in the label maps. 
\begin{figure}[t]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:synth_slice_ED}
  {\caption{The synthetic ACDC slices from apex to base location for one subject of the ACDC dataset at the end-diastolic phase. The rows from top to bottom show the input  label  maps,  the  synthetic  and  real  images respectively.}}
 {\includegraphics[width=\textwidth]{images/patient085_frame01_acdc_0-11.png}}
\end{figure}

% \begin{figure}[htbp]\floatconts{fig:synth_slice}% label for whole figure
% {\caption{Synthetic data for different slices}}% caption for whole figure
% {
% \subfigure[frames for the end-diastolic phase, patient085 of ACDC dataset][c]{\label{fig:ACDC_ED}\includegraphics[width=\columnwidth]{images/patient085_frame01_acdc_0-11.png} }

% \subfigure[frames for the end-systolic phase, patient085 of ACDC dataset][c]{\label{fig:ACDC_ES}\includegraphics[width=\columnwidth]{images/patient085_frame09_acdc_0-11.png}}
% }
% \end{figure}
The main results, which are the synthetic images corresponding to the XCAT labels, are shown in~\figureref{fig:synth_frame_0-12_base}. For visualization purposes, we fix the location and vary the time frame. 
The results for 12 time frames from end-diastolic to end-systolic phase (from left to right) are shown at the base location of the short axis view of the heart. Due to limited space, similar results for other time frames and locations are shown in Appendix~\ref{sec:appendix}, \figureref{fig:synth_frame_0-12} and \figureref{fig:synth_frame_12-24}. Additionally, a 4D visualization of our results is available here: \url{https://bit.ly/2REVAzB}. As seen in these figures, for a fixed location, the classes of interest are generated according to the input label map, while the background is consistent and coherent. 
% \begin{figure}[t]\floatconts{fig:synth_frame_0-12}% label for whole figure
% {\caption{4D synthetic images on XCAT labels. 12 time frames from ED phase to end-systolic phase is shown at 3 different locations of the short axis view of the heart. In each figure, the first and second rows represent the input label map and their corresponding synthetic images respectively. Due to limited space, other time frame and locations are shown in the appendix. }}% caption for whole figure
% {
% \subfigure[12 time frames from end-diastolic to end-systolic for the apex location][c]{\label{fig:apex_0-11}\includegraphics[width=\textwidth]{images/heart_mask_23_apex_0-11.png} }
% % \qquad % space out the images a bit
% \subfigure[12 time frames from end-diastolic to end-systolic for the mid location][c]{\label{fig:mid_0-11}\includegraphics[width=\textwidth]{images/heart_mask_31_mid_0-11.png}}

% \subfigure[12 time frames from end-diastolic to end-systolic  for the base location][c]{\label{fig:base_0-11}\includegraphics[width=\textwidth]{images/heart_mask_35_base_0-11.png}}
% }
% \end{figure}

\begin{figure}[t]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:synth_frame_0-12_base}
  {\caption{4D synthetic images on XCAT labels for 12 time frames from end-diastolic to end-systolic phase (left to right) are shown at the base location of the short axis view of the heart. The rows represent the input label maps and their corresponding synthetic images.}}
 {\includegraphics[width=\textwidth]{images/heart_mask_35_base_0-11.png}}
\end{figure}

In another experiment, we test our modified VAE setup on the 4D voxelized XCAT labels to show the capability of the method in generating synthetic images in which the global and local styles are matched to images from an unseen dataset. %The network is trained and images are synthesized for the 256x256 resolution for better visualization.
% We removed the IN layer from the encoder to make the encoded latent vector include not only the global image appearance information, but also some semantic information about the surrounding regions of the heart. The SPADE generator is strong enough to synthesize the heart classes correctly and the rest of the image is constructed according to the encoded latent vector. 
Some sample results are shown in \figureref{fig:VAE}. The input images of the encoder (representing the style) are depicted in the first column. Two different
synthetic images for each style are shown in the second and third columns, and the label maps (the inputs of the SPADE layers) are shown in the top left corner of the resulting synthetic images. In these images, the local and global appearance of the style images are transferred to the synthetic images, while keeping the classes of interest intact. This VAE setup provides additional control over our image generation. The generator is capable of creating realistic heart models, while the encoder transfers the information related to the other surrounding organs. For the sake of comparison, using the same combination of style and label maps, the resulting synthetic images when the IN layers are kept in the style encoder are also shown in the fourth and fifth columns. In these cases, only the global style is transferred and the control over the surrounding regions of the heart is very limited. %regions of the image which their corresponding labels are not available. 

\begin{figure}[h!]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:VAE}
  {\caption{Transferring desired styles to synthetic XCAT images.
%   An encoder is used to encode the style of a new real image into a latent vector which is later used as the input of the generator in addition to the segmentation mask to generate stylized XCAT image.
  The first column represents the desired style images. The resulting synthetic images for each style without and with IN layers are shown in the second to fifth columns. The corresponding input label maps are shown in the top left corner of the synthetic images.}}
 {\includegraphics[width=0.75\textwidth]{images/Figure4.pdf}}
\end{figure}

\section{Discussion and Conclusion}
\label{sec:conclusion}
In this paper, we have proposed a hybrid method to use the voxelized 3D+t NURBS-based surfaces of the XCAT heart model in a deep generative network and synthesize semantically and anatomically meaningful 4D realistic CMR images with controllable ground truth labels. Even though the SPADE GAN is trained on 2D images, the synthetic images are very coherent across the other two dimensions of the labels (slice and time). Specifically, the heart, which is our main focus in this work, is synthesized consistently. However, small variations and inconsistencies in the background can occur because all tissues that are not of interest (i.e., not belonging to the heart) are assigned to the background class. This may be ignored when the application of the synthetic data is heart cavity segmentation. For multi-organ segmentation applications, the main limitation comes from the limited number of classes in the ACDC dataset, as various organs are mapped to the background class. Since the background label does not contain any spatial information, we only have limited control over the generated background regions through our modified VAE setting. Our style encoder encodes the local semantic information of the input style image, in addition to global style information, to a latent vector. Removing the IN layers prevents the removal of semantic information and helps in generating consistent background for nearby slices.
Definitely, multi-tissue or multi-class segmentation of background can help in generating more realistic results as it provides more information to the generator. Moreover, using other MR modalities such as T1-weighted and late gadolinium enhancement extends the variations in the global style compared to the limited styles learned from the ACDC dataset with cine MR contrast. It is worth mentioning that for the 4D voxelized XCAT labels, we only selected the classes matching the labels of the ACDC dataset. If we use another dataset with more labels, we can use more classes of the XCAT model as well.

%This makes our approach computationally efficient, as training the SPADE GAN for 3D or 4D data requires more resources i.e., hardware and 3D or 4D real labeled datasets are few and have a small size. Moreover, despite the fact that the ground truth maps of only two frames of the 4D ACDC dataset are available, the SPADE GAN has generalized well to the other time frames.

 

The main advantage of using the XCAT model is that not only can it be controlled and modified to generate new heart labels, but it can also provide anatomically meaningful, accurate ground truth for different time frames. So the 4D labeled synthetic CMR images can potentially be employed in cardiac supervised tasks. This is a great advantage over the previous approach by~\cite{factorized_ETH}, in which their estimated multi-tissue segmentation map is not necessarily anatomically plausible. Moreover, their deformable model does not provide physiologically meaningful information since its motion is modelled by an interpolation in the latent space between anatomical shapes of end-systolic and end-diastolic phases. 
% We showed that the style encoder in the VAE setting can provide more control over image synthesis. It makes it possible to generate new appearances (image contrast) and examples (not limited to the ACDC dataset). If only a change in global style is required, then the VAE with IN layers is recommended as they prevent the semantic information to be transferred to the synthetic image. In addition, it is possible to substitute the ACDC dataset with other datasets coming from different sites/vendors to learn the site-based modality information and applying it to XCAT anatomical models for creating both anatomical and contrast variations in the database.
% We are able to create both anatomical and contrast variations in the synthetic dataset via the controllable XCAT anatomical model and our modified style encoder.


Our future work is twofold: i) improving the control over generating the background by dividing it into an approximated multi-organ segmentation map, which eventually results in a more temporally consistent background, and ii) quantitative application-based evaluation of the synthetic images by deploying them in a heart segmentation task for multi-site, multi-vendor scenarios. We will use our proposed approach to generate a large virtual population with various anatomical and style variations and utilize the synthetic images in different data augmentation strategies for the cardiac cavity segmentation task. The goal is to investigate the utility of the synthetic data in training deep learning algorithms for segmentation and to evaluate whether the data generated by this approach is clinically meaningful enough to replace the need for real data.

%Our proposed 4D labeled synthetic XCAT dataset has a huge potential to be used in various applications including data augmentation in multi-organ and multi-modal image analysis. Our true ground truth provides shape priors for segmentation purposes.  On the other hand,  Future work will be on quantitative evaluation of the synthetic images and deploying them in real applications in addition to real datasets. 

% \begin{itemize}
    % \item \emph{Limitation:} the synthetic images do not necessarily contain consistent backgrounds because various organs/classes are mapped to the background class and the label maps do not contain any spatial information.
    % \item \emph{Limitation:} multi-tissue multi-class segmentation of background can help in more accurate results.
    % \item \emph{Limitation:} Our results are limited by the number of available labels for the real dataset i.e., the labels of XCAT are not the limitation.
    % \item \emph{Limitation:} Also the 4D ACDC dataset only has labels for two frames and the labels for other frames are not available.  
    % \item \emph{Application:} These can be used in various applications including data augmentation for multi-organ segmentation and  multi-modal image generation.
    % \item \emph{Application:} the encoder of SPADE in the VAE setting can be used to generate new appearances and examples (not limited to ACDC dataset). The encoder changes the global style.
    % \item \emph{Application:} accurate GT provide shape priors for segmentation purposes. 
    % \item \emph{Application:} Substituting ACDC with other datasets coming from different sites/vendors to learn the site-based modality information and apply it to XCAT anatomical models for creating variations.   
    
% \end{itemize}


% Acknowledgments---Will not appear in anonymized version
% \midlacknowledgments{We thank a bunch of people inclduing ourselves :P}


\bibliography{Abbasi-Sureshjani20}


\appendix

\section{Additional Figures}
\label{sec:appendix}
This section includes additional synthetic images.
%=============================================
\figureref{fig:synth_slice_ES} includes synthetic slices for the fixed end-systolic phase for one patient of the ACDC dataset.
\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:synth_slice_ES}
  {\caption{The synthetic ACDC slices from apex to base location for one subject of the ACDC dataset at the end-systolic phase. The rows show the input label maps, the synthetic images, and the real images, respectively.}}
 {\includegraphics[width=\textwidth]{images/patient085_frame09_acdc_0-11.png}}
\end{figure}
%=============================================

\figureref{fig:synth_frame_0-12} shows the generated samples for XCAT labels for 12 time frames from the end-diastolic to the end-systolic phase while fixing the location. \figureref{fig:apex_0-11} and \figureref{fig:mid_0-11} correspond to the apex and middle locations, respectively.
\begin{figure}[htbp]\floatconts{fig:synth_frame_0-12}% label for whole figure
{\caption{4D synthetic images on XCAT labels for 12 time frames from end-diastolic to end-systolic phase at apex and mid locations of the short axis view of the heart. In each figure, the first and second rows represent the input label map and their corresponding synthetic images respectively. }}% caption for whole figure
{
\subfigure[The apex location][c]{\label{fig:apex_0-11}\includegraphics[width=\textwidth]{images/heart_mask_23_apex_0-11.png} }
% \qquad % space out the images a bit
\subfigure[The mid location][c]{\label{fig:mid_0-11}\includegraphics[width=\textwidth]{images/heart_mask_31_mid_0-11.png}}

% \subfigure[12 time frames from end-diastolic to end-systolic  for the base location][c]{\label{fig:base_0-11}\includegraphics[width=\textwidth]{images/heart_mask_35_base_0-11.png}}
}
\end{figure}
%=============================================
Similarly, the results from the end-systolic to the end-diastolic phase, corresponding to the apex, middle and base locations, are shown in \figureref{fig:synth_frame_12-24}.
% shows the generated samples for XCAT labels for varying frames while fixing the slice. The results for 12 time frames from end-systolic phase to end-diastolic phase are shown for different locations. \figureref{fig:apex_12-24}, \ref{fig:mid_12-24} and \ref{fig:base_12-24} correspond to apex, middle and base locations respectively.
\begin{figure}[htbp]\floatconts{fig:synth_frame_12-24}% label for whole figure
{\caption{4D synthetic images on XCAT labels for 12 time frames from end-systolic to end-diastolic phase at three different locations of the short axis view of the heart. The first and second rows represent the input label map and their corresponding synthetic images respectively.}}% caption for whole figure
{
\subfigure[The apex location][c]{\label{fig:apex_12-24}\includegraphics[width=\textwidth]{images/heart_mask_23_apex_12-24.png} }
% \qquad % space out the images a bit
\subfigure[The mid location][c]{\label{fig:mid_12-24}\includegraphics[width=\textwidth]{images/heart_mask_31_mid_12-24.png}}

\subfigure[The base location][c]{\label{fig:base_12-24}\includegraphics[width=\textwidth]{images/heart_mask_35_base_12-24.png}}
}
\end{figure}

%=============================================
% This is a boring technical proof of
% \begin{equation}\label{eq:example}
% \cos^2\theta + \sin^2\theta \equiv 1.
% \end{equation}

% \section{Proof of Theorem 2}

% This is a complete version of a proof sketched in the main text.

\end{document}
