\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage[font=small,skip=5pt,labelfont=bf]{caption}
\setlength{\abovedisplayskip}{3pt}
\setlength{\belowdisplayskip}{3pt}
\usepackage{diagbox}

\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}


\title[Multi-scale Stochastic Generation of Labelled Microscopy Images]{Multi-scale Stochastic Generation of Labelled Microscopy Images for Neuron Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Meghane Decroocq\nametag{$^{1}$}} \Email{meghane.decroocq@riken.jp}\\
\addr $^{1}$ Brain Image Analysis Unit, Center for Brain Science, RIKEN, Wako, Japan \AND
\Name{Binbin Xu\nametag{$^{2}$}} \Email{binbin.xu@mines-ales.fr}\\
\addr $^{2}$ EuroMov Digital Health in Motion, Univ Montpellier, IMT Mines Ales, France \AND
\Name{Katherine L {Thompson-Peer}\nametag{$^{3}$}} \Email{ktpeer@uci.edu}\\
\addr $^{3}$ Department of Developmental and Cell Biology, University of California, Irvine, CA 92697, USA \AND
\Name{Adrian Moore\nametag{$^{4}$}} \Email{adrian.moore@riken.jp}\\
\addr $^{4}$ Laboratory for Neurodiversity, Center for Brain Science, RIKEN, Wako, Japan \AND
\Name{Henrik Skibbe\midlotherjointauthor\nametag{$^{1}$}} \Email{henrik.skibbe@riken.jp}\\
}

\begin{document}

\maketitle

\begin{abstract}

	We introduce a novel method leveraging conditional generative adversarial networks (cGANs) to generate diverse, high-resolution microscopy images for neuron tracing model training. This approach addresses the challenge of limited annotated data availability, a significant obstacle in automating neuron dendrite tracing. Our technique utilizes a multi-scale cascade process to generate synthetic images from single neuron tractograms, accurately replicating the complex characteristics of real microscopy images, encompassing imaging artifacts and background structures. In experiments, our method generates diverse images that mimic the characteristics of two distinct neuron microscopy datasets, which were successfully used as training data in the segmentation task of real neuron images. 
 
 %Applied to two distinct neuron microscopy datasets, our method demonstrates its potential to enhance the accuracy of automated neuron reconstructions without the need for manual annotation. The code is available at \url{https://github.com/BrainImageAnalysis/Multi-scale-cGAN}.
 
\end{abstract}

\begin{keywords}
multi-scale cGAN, microscopy imaging, neuron tracing
\end{keywords}

\section{Introduction}

The automated segmentation and tracing of neuronal structures from microscopy images is a crucial step toward understanding brain structure and function. Despite its importance, it remains a challenging task that requires high levels of morphological precision, often necessitating time-consuming manual efforts. Recent advancements in deep learning have led to substantial improvements in automatic neuron segmentation and tracing \cite{chen2021deep}. However, the effectiveness of these models is constrained by the scarcity of annotated training data and the diversity of imaging methods and neuronal morphologies.


One approach to circumvent the limitations associated with the scarcity of annotated data is the generation of artificial images. This involves rasterizing a tree structure into an image and adding noise to simulate real-world imperfections. Such synthetic images are often utilized to expand existing training datasets or even to directly train models, thereby rendering the time-consuming acquisition and labeling of real images obsolete \cite{chen2021deep}. This technique is not limited to neuron segmentation but also benefits the analysis of other tree-like structures, such as vessels \cite{tetteh2020deepvesselnet, prabhakar2024vesselformer} and airways \cite{Nardelli2020Generative}. To create images that mimic realistic morphology, these are often generated from publicly available digital reconstructions (e.g., neuron tractograms, vascular trees) \cite{radojevic2019automated,chen2021deep}, or artificially generated trees \cite{hamarneh2010vascusynth}. Synthetic images can be generated in substantial volumes, effectively addressing the scarcity of real-world data and the imprecision of manual annotations. However, they often fail to accurately replicate the surrounding anatomical structures or imaging artifacts unique to specific organs and imaging modalities, thus limiting the transferability of the model to real-world images.


In recent years, generative adversarial networks (GANs), particularly conditional GANs (cGANs) \cite{isola2017image}, have demonstrated remarkable capabilities in generating realistic synthetic images. These models generate images conditioned on provided ground truth, creating paired images that bypass the need for further annotation. cGANs have been successfully employed across several imaging modalities, including CT \cite{jin2018ct}, MR \cite{lau2018scargan, mok2019learning}, ultrasound \cite{tom2018simulating}, and retinal images \cite{costa2017end}.
However, their application to microscopy imaging of neurons, to the best of our knowledge, remains unexplored. The unique challenges posed by neuron imaging include the high resolution of microscopy images, which imposes substantial computational demand and necessitates a large amount of annotated data for training. Moreover, the one-to-one image mapping inherited from cGANs constrains the diversity of the generated images to that of the input ground truth images. Consequently, achieving a broad range of image outputs requires a large and varied dataset of neuron tractograms.   

To address these challenges, we propose a novel method utilizing cGANs. Our approach leverages a multi-scale cGAN cascade to iteratively refine image resolution, employing a patch-based strategy that facilitates the generation of high-resolution microscopy images with minimal memory demands. An important part of our approach is the employment of a mode seeking loss function \cite{mao2019mode} that enhances the diversity of the generated data, setting our method apart from previous methods \cite{uzunova2019multi} that generate unique pairs of real and generated images. We have developed a new training strategy that ensures coherence of the image content across different scales during refinement. This allows for the stochastic generation of a broader variety of images without compromising quality. We have successfully applied our method to two datasets of neuron microscopy images, producing synthetic images that faithfully replicate the characteristics of their real counterparts. The code is available at \url{https://github.com/BrainImageAnalysis/Multi-scale-cGAN}.  


\section{Methods}
	
	\subsection{Data}

    In this study, we use two datasets of \textit{Drosophila} single-neuron microscopy images with diverse anatomical structures, and real-world imaging artifacts. Dataset 1 contains 417 wide-field fluorescent microscopy images of \textit{Drosophila} class I da neurons. Dataset 2 \cite{thompson2016vivo, nguyen2021comparing} contains 221 confocal microscopy images of \textit{Drosophila} class IV da neurons; see Appendix \ref{appendix:data-detail} for further details. The Z-stack images were converted to 2D by maximum intensity projection, and the images of both datasets have been rescaled from $1920 \times 1440$ and $1024 \times 1024$ pixels to $1024 \times 1024$ pixels. Both datasets were manually traced. Neurite radii and the cell bodies were automatically extracted. All images were post-processed using histogram equalization. We downloaded all the available tracings for \textit{Drosophila} class I and IV da neurons from the NeuroMorpho database \cite{ascoli2007neuromorpho}, and rasterized them into a $1024 \times 1024$ segmentation mask, forming the evaluation datasets NeuroMorpho 1 ($373$ segmentations) and NeuroMorpho 2 ($501$ segmentations).
    

 
	\subsection{Image generation}
		
		\subsubsection{Multi-scale cascade structure}

        Models such as Laplacian GAN \cite{denton2015deep} or pix2pixHD \cite{wang2018high} employ a multi-scale iterative refinement to generate high-resolution images. However, these methods impose high memory demands by requiring entire images as inputs to the network. To circumvent this limitation, we utilize a cascade of conditional GANs with a fixed input and output size. As proposed in \cite{uzunova2019multi}, rather than enlarging the size of the image processed through the model, we keep the patch size constant while iteratively decreasing the receptive field, thereby increasing the pixel resolution (Figure \ref{Img:pipeline}).
        
        Conditional GANs are generative models that learn to map a condition image $x$ to a target image $y$, $G : x \rightarrow y$. In this work, $x$ is the neuron segmentation and $y$ is the paired microscopy image. A discriminator $D$ is simultaneously trained to distinguish fake from real images, driving the generator to create realistic images while following the constraints of the input image $x$. 

        \vspace{-0.25cm}
        \begin{figure}[h]
        \centering
        \includegraphics[width=1\textwidth]{Img/pipeline}
        \caption{Multi-scale generation pipeline. \textbf{u} (respectively \textbf{d}) refers to an upsampling (respectively downsampling) operation, and $\tilde{y}_{p_{0}}$ (respectively $x_{p_{0}}$) designates the patches extracted from the whole image $\tilde{y}_{0}$ (respectively $x_{0}$). }
        \label{Img:pipeline}
        \end{figure}

        At the first scale, a conditional GAN $(G_{0}, D_{0})$ learns the mapping between a low-resolution ($64 \times 64$) neuron segmentation image $x_{0}$ and a low-resolution microscopy image $y_{0}$, $G_{0} : x_{0} \rightarrow \tilde{y}_{0}$. The generated image $G_{0}(x_{0})$ is upsampled to the original resolution ($1024 \times 1024$) and divided into patches $\tilde{y}_{p_{0}}$. In the next step, another conditional GAN $(G_{1}, D_{1})$ learns to generate the next-resolution patch $\tilde{y}_{p_{1}}$ conditioned on both the segmentation $x_{p_{1}}$ and the previous patch $\tilde{y}_{p_{0}}$, $G_{1} : (x_{p_{1}}, \tilde{y}_{p_{0}}) \rightarrow \tilde{y}_{p_{1}}$. The generated patches are combined and upsampled to the original resolution to form the refined images $\tilde{y}_{1}$. This refinement step is repeated until we reach the target resolution. At every scale, the receptive field of the patches is divided by two; the resolution of the refined images is consequently doubled. The objectives of the base generative cGAN $(G_{0}, D_{0})$ and the refinement cGANs $(G_{i}, D_{i})$ with $i \in [1, n]$ differ. We detail the architecture of each model in the next sections.
        

		\subsubsection{Base generation cGAN} 

        The objective of the base generative model is to create a coarse image background, laying the foundational layer for subsequent refinement at higher scales. This base model is crucial, as it determines both the realism and the diversity of the generated images.

        cGANs usually suffer from the lack of diversity of the generated samples for a given input, primarily because the model tends to overlook the random noise vectors. This oversight leads to a predominant one-to-one mapping between the input condition and the target image. The widely used cGAN-based pix2pix model \cite{isola2017image} relies on an $L_1$ loss between the generated image $G_{0}(x_0)$ and the target image $y_0$ to improve the realism of the generated images, restricting the variability of the generated samples even further. In this type of model, the only source of variability is the input image $x$. To increase the diversity of the generated images, Mao et al. \cite{mao2019mode} proposed a new regularization term to drive the generator to generate dissimilar images, called the mode seeking loss. We propose to use this loss in our base generation model to increase the diversity of the generated images.

        \begin{figure}[h]
        \centering
        \includegraphics[width=0.8\textwidth]{Img/mode_seekingcGAN}
        \caption{Mode seeking conditional GAN model.}
        \label{Img:mode_seekingcGAN}
        \end{figure}
        
        Our base generation cGAN is composed of a generator $G_0$ and a discriminator $D_0$. The generator is a U-Net \cite{Ronneberger2015U}, and the discriminator a PatchGAN \cite{isola2017image}. The generator $G_0$ takes as input the downsampled neuron segmentation image $x_0$ and a random vector $z$ drawn from a uniform distribution. The discriminator $D_{0}$ is trained to distinguish fake pairs $(x_{0}, G_{0}(z, x_{0}))$ from real pairs $(x_0, y_0)$. As shown in Figure \ref{Img:mode_seekingcGAN}, we use two losses during the training. The mode seeking loss $\mathcal{L}_{ms}$ (Equation \ref{Eq:Lms}) maximizes the dissimilarity between two images generated from different $z$ random vectors, increasing the diversity of the generated images and avoiding mode collapse. The adversarial loss $\mathcal{L}_{GAN}$ (Equation \ref{Eq:LGAN0}) pushes the generator to create realistic images. These losses are:
        %
        \begin{equation}
        \mathcal{L}_{ms} = \max_{G_{0}}(\textstyle{\frac{MAE(G_{0}(z_{a}, x_{0}), G_{0}(z_{b}, x_{0}))}{MAE(z_{a}, z_{b})}}),
        \label{Eq:Lms}
        \end{equation}
        %
        where $z_{a}$ and $z_{b}$ are two random vectors of size $n_{z}$, and $MAE$ is the mean absolute error, and
        %
        \begin{equation}
        \mathcal{L}_{GAN}(G_{0}, D_{0}) = \min_{G_{0}}\max_{D_{0}}\mathbb{E}_{x_{0}, y_{0}}[\log{D_{0}(x_{0}, y_{0})}] + 
        \mathbb{E}_{z, x_{0}}[\log(1 - D_{0}(x_{0}, G_{0}(z, x_{0})))].
        \label{Eq:LGAN0}
        \end{equation}

        \noindent The total loss $\mathcal{L}$ is the sum of both losses $\mathcal{L} = \mathcal{L}_{ms} + \mathcal{L}_{GAN}$. Figure \ref{Img:repeat} shows the effect of the mode seeking loss on the diversity of the generated images compared to the original pix2pix model.

        \vspace{-0.25cm}
        \begin{figure}[h]
        \centering
        \includegraphics[width=0.7\textwidth]{Img/repeat}
        \caption{Repeated generation for the same input tractogram, using pix2pix (a), and mode seeking (b). As pix2pix repeatedly generated the same image, we show only one of them.}
        \label{Img:repeat}
        \end{figure}
 
        
		\subsubsection{Refinement cGAN}

        \begin{figure}[h]
        \centering
        \includegraphics[width=0.8\textwidth]{Img/refinementcGAN}
        \caption{Refinement conditional GAN model.}
        \label{Img:refinementcGAN}
        \end{figure}
        
        The low-resolution images produced by the base generation cGAN are progressively enhanced in quality through a cascade of refinement cGAN models at subsequent scales. The structure of our refinement cGAN is illustrated in Figure \ref{Img:refinementcGAN}. We keep the same architecture as the base generation cGAN for the generator $G_i$ and the discriminator $D_i$.
        
        The stochasticity of our base generation cGAN drove us to develop a new training strategy for refinement compared to existing methods. Unlike the multi-scale GAN of \cite{uzunova2019multi}, the images produced by the base cGAN do not correspond to real images. Therefore, they cannot be used as matched pairs for training the discriminator. To prevent the generator from ignoring the image generated at the previous scale (a role normally undertaken by the discriminator in cGANs), we use an $MAE$ loss between the generated images at scales $i-1$ and $i$;
        %
        \begin{equation}
        \mathcal{L}_{MAE}(\tilde{y}_{p_{i-1}},\tilde{y}_{p_{i}}) = MAE(\tilde{y}_{p_{i-1}}, \textbf{d}(\tilde{y}_{p_{i}})),
        \end{equation}
   
        %\textbf{d}(G_{i}(x_{p_{i}}, \tilde{y}_{p_{i-1})})
        where \textbf{d} is the operation of downsampling to the resolution of the previous scale, and $\tilde{y}_{p_{i}}$ are the patches generated at scale $i$. The adversarial loss can be written as follows:


        \begin{equation}
        \mathcal{L}_{GAN}(G_{i}, D_{i}) = \min_{G_{i}}\max_{D_{i}}2\mathbb{E}_{x_{p_{i}}, y_{p_{i}}}[\log{D_{i}(x_{p_{i}}, y_{p_{i}})}] + \mathbb{E}_{x_{p_{i}}, \tilde{y}_{p_{i-1}}}[\log(1 - D_{i}(x_{p_{i}}, \tilde{y}_{p_{i}}))].
        \label{Eq:LGANi}
        \end{equation}

        The total loss $\mathcal{L}$ is a combination of the adversarial loss and the $\mathcal{L}_{MAE}$ loss, balanced by a parameter $\lambda$: $\mathcal{L} = \mathcal{L}_{GAN}(G_{i}, D_{i}) + \lambda \mathcal{L}_{MAE}(\tilde{y}_{p_{i-1}},\tilde{y}_{p_{i}})$.
        


		\subsubsection{Training details} \label{sec:training-param}

        Data augmentation ($90$ degree clockwise / counterclockwise rotations, horizontal and vertical flip) is applied to the images only during the base generation of scale $0$. The model is trained with a batch size of $32$ and a learning rate of $0.002$, using the Adam optimizer, for all scales. Following heuristic exploration, the length of the random $z$ vectors is set to $n_z = 12$, and the $\lambda$ parameter to $10$. During the application of our method to the microscopy Datasets 1 and 2, we observed that keeping a model input patch size of $64 \times 64$ at the high resolutions of the cascade led to the creation of artifacts in the background of the image. We believe that this is caused by the lack of context in the patches at the high resolutions. Increasing the size of the input patches at the high resolutions solved this problem while keeping the memory demand reasonable. The best trade-off parameters, as well as the number of iterations for each scale, can be found in Table \ref{Tab:parameters}. For training and inference, we divide the images into regularly organized patches with a $50 \%$ overlap. The overlapping parts of the generated patches are merged using cosine weights, to avoid border effects in the generated images.
        


\section{Results}

	\subsection{Image generation}

     We present visual examples of images generated with our model. Figure \ref{Img:multi-scale} illustrates the ability of the refinement cGAN to enhance the details of the rather coarse image generated at scale $0$. As shown in Figure \ref{Img:repeat_images}, our method can reproduce characteristics of the real microscopy images, such as background structures, including other surrounding neurons in Dataset 2, and imaging artifacts. Moreover, as illustrated in Figure \ref{Img:repeat_images}, our generative model can produce a diverse set of images from a single input neuron tractogram, while maintaining the morphology of the input neuron (more examples in Appendix \ref{appendix:figures}).
     
    \begin{figure}[h]
    \centering
    \includegraphics[width=0.8\textwidth]{Img/multi_scale.pdf}
    \caption{Images generated at the different scales, from coarse to fine resolution.}
    \label{Img:multi-scale}
         \vspace{-0.5cm}
    \end{figure}
    

    \begin{figure}[h]
    \centering
    \includegraphics[width=1\textwidth]{Img/repeat_images.pdf}
    \caption{Different images generated by our method from a single input segmentation. Row 1 shows an example from Dataset 1, and row 2 an example from Dataset 2.}
    \label{Img:repeat_images}
     \vspace{-0.3cm}
    \end{figure}
   

    \subsection{Ablation study} \label{sec:ablation}

    In this section, we emphasize the contribution of the different components of our model through an ablation study. To this aim, we evaluated four models of increasing complexity: (1) the pix2pix model \cite{isola2017image}, refined according to the method of \cite{uzunova2019multi}, where the images generated at the previous scale are shown to the discriminator, (2) a hybrid model where the cGAN with mode seeking loss proposed in this work is used at scale $0$, but the generated images are refined using the strategy of \cite{uzunova2019multi}, (3) the proposed model without the $\mathcal{L}_{MAE}$ loss, and (4) the proposed model. We keep the same hyperparameters and network architecture for all models. The realism of the generated images is measured by the Fr\'echet inception distance at the first scale $FID_{0}$, and at the final scale $FID_{n}$. The $scale\_corr$ metric measures the coherence of the generation process across scales, $repeat\_corr$ reflects the diversity of the generated images, and $label\_corr$ shows the compliance with the input neuron morphology. The evaluation metrics are further described in Appendix \ref{appendix:metrics}. Table \ref{Tab:ablation} shows the results of this ablation study. We show examples of generated images in Figure \ref{Img:ablation} of Appendix \ref{appendix:figures}.

        \begin{table}[!h]
        \centering
        \small
        \resizebox{\textwidth}{!}{\begin{tabular}{|c|ccccc|}
        \hline     
        model & $FID_{0}\downarrow$ & $FID_{n} \downarrow$ & $scale\_corr \uparrow$ & $repeat\_corr \downarrow$ & $label\_corr \uparrow$ \\
        \hline  
        %(1) cGAN + \cite{uzunova2019multi} & $1.04$ & $1.95$ & $0.89 \pm 0.03$ & $1.00 \pm 0.00$ & $0.53 \pm 0.04$\\
        (1) pix2pix + \cite{uzunova2019multi} & $0.00$ & $0.66$ & $0.98 \pm 0.01$ & $1.00 \pm 0.00$ & $0.66 \pm 0.05 $ \\
        (2) cGAN with $\mathcal{L}_{ms}$ + \cite{uzunova2019multi} & $0.29$ & $10.04$ & $0.04 \pm 0.03$ & $0.45 \pm 0.06$ & $0.78 \pm 0.02$ \\
        (3) ours without $\mathcal{L}_{MAE}$ & $0.27$ & $0.71$ & $0.66 \pm 0.06$ & $0.31 \pm 0.08$ & $0.66 \pm 0.02$\\
        (4) ours & $0.26$ & $0.38$ & $0.97 \pm 0.01$ & $0.19 \pm 0.08$ & $0.70 \pm 0.03$\\
        \hline
        \end{tabular}}
        \caption{Ablation study: our method enhances the diversity of the generated images while keeping a high fidelity to real images.}
        \label{Tab:ablation}
             \vspace{-0.2cm}
        \end{table}
   
        
    The multi-scale pix2pix model (1) generates realistic images, but the model is deterministic ($repeat\_corr = 1$). As shown by the $label\_corr$ values, the generated images do not strictly follow the input neuron as non-annotated branches (e.g. axons) are generated in the background. The high $FID_{n}$ score of the model (2) supports our assumption that the refinement strategy of \cite{uzunova2019multi} relies on the one-to-one mapping to the real images and is not compatible with the generation of diverse images. The mode seeking loss enhances the diversity of the generated images and improves the compliance to the input neuron morphology. Finally, the $\mathcal{L}_{MAE}$ loss of the proposed model helps to maintain the coherency of the generation across scales, increasing the final image diversity without compromising realism.

         \vspace{-0.1cm}
	\subsection{Neuron segmentation}

       In this section, we evaluate the applicability of our generation method for neuron segmentation, which is often a prerequisite to tracing algorithms \cite{zhou2018deepneuron}. To this aim, we prepared four datasets: real images, images generated by our method from the training neurons of Dataset 1, images generated from the unseen neurons of NeuroMorpho 1, and images generated by adding noise according to the method of \cite{radojevic2019automated}. %Dataset 2 was not retained for the evaluation due to interference from non-annotated surrounding neurons, which affected the evaluation metrics. 
       We train a U-Net model to segment the neurons on a given dataset and infer on another dataset (training details in Appendix \ref{appendix:training-detail}). We compute the Dice score of the predicted segmentation to measure the transferability between generated and real images. The results can be found in Table \ref{Tab:results}.
        

        \begin{table}[!h]
        \centering
        \small
        \resizebox{\textwidth}{!}{\begin{tabular}{|c|c|c|c|c|}
        \hline 
         \diagbox[width=12em]{test}{train} & real & generated Dataset 1 & generated NeuroMorpho 1 & Dataset 1 + noise  \\
        \hline 
        real & $0.71 \pm 0.09$ & $0.59 \pm 0.11$ & $0.59 \pm 0.11$ & $0.30 \pm 0.11$ \\
        \hline
        generated Dataset 1 & $0.79 \pm 0.03$ & $0.90 \pm 0.01$ & $0.88 \pm 0.02$ & $0.40 \pm 0.17$\\
        \hline
        generated NeuroMorpho 1 & $0.76 \pm 0.08$ & $0.87 \pm 0.06$ & $0.89 \pm 0.07$ & $0.42 \pm 0.16$\\
        \hline
        Dataset 1 + noise & $0.16 \pm 0.15$ & $0.02 \pm 0.03$ & $0.03 \pm 0.02$ & $0.76 \pm 0.09$\\
        \hline
        \end{tabular}}
        \caption{Transferability study: neuron segmentation Dice scores obtained by transfer between real and generated datasets. }
        \label{Tab:results}
           \vspace{-0.25cm}
        \end{table}
     
 
        The images generated by our method show a high transferability to real images compared to the noise-based synthetic images. The gap in Dice score compared to the training on real images is caused by some false positives in the background, missing cell bodies, and segmentation of the axons absent in the ground-truth (see Figure \ref{Img:segmentation} of Appendix \ref{appendix:figures} for visualizations). We can expect that tracing models trained on images generated using the proposed method generalize better to real data than the current models trained exclusively on noise-based synthetic data \cite{chen2021deep, prabhakar2024vesselformer}. Furthermore, our model maintains a good generation quality and diversity even when trained on small datasets (Table \ref{Tab:trainingset-size} of Appendix \ref{appendix:figures}), limiting the need for annotated data. Besides, as demonstrated in this study, and supported by Figure \ref{Img:examples} and Table \ref{Tab:neuromorpho-metrics} of Appendix \ref{appendix:figures}, it is able to generalize to input neuron morphologies that differ from the training set. Therefore, it can find a potential application in cases where manual tracing can hardly be performed (e.g. intricate neurons with complex branching patterns). Indeed, our model only requires segmentations to generate images, which are less burdensome to produce than tracings. The paired ground-truth tracing and microscopy images could then be created by using other available tracing data.
        

 \vspace{-0.2cm}
\section{Conclusion and discussion}

We have developed a novel method utilizing a multi-scale cascade of conditional GANs to generate a diverse set of realistic, high-resolution microscopy images from neuron tractograms. Given its ability to provide accurate and rich ground-truth information, including the underlying topology and orientation of the neuronal trees, our method holds promise for facilitating more complex analyses such as tracing. Looking forward, we aim to extend our methodology to other tree-like structures, such as vessels and airways, and to explore its applicability across different imaging modalities. 
A limitation of our current approach is the reliance on histogram equalization of images, a pre-processing step that, while beneficial for the data used in this study, may not be suitable for all applications. In future work, we intend to explore the generation of images with intensity distributions that more closely mirror those found in real datasets. Additionally, we aim to enhance the interpretability of our generative model, striving for finer control over the generated images' characteristics.


\clearpage
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was funded by the Japan Society for the Promotion of Science (JSPS, Fellowship P23757). It is supported by the program for Brain Mapping by Integrated Neurotechnologies for Disease Studies (Brain/MINDS) and the program for Multidisciplinary Frontier Brain and Neuroscience Discoveries (Brain/MINDS 2.0) from the Japan Agency for Medical Research and Development AMED (JP23wm0625001, JP15dm0207001).}


\bibliography{midl24-197}

\appendix

\section{Dataset details} \label{appendix:data-detail}

The images from Dataset 2 were collected from animals at various stages throughout larval development. About half of the images are of uninjured neurons, and half of the images are of neurons recovering from injury to their dendrites.  Injury had been delivered by a 2-photon laser either 24 hours prior to imaging, or 72 hours prior to imaging.  It has been previously demonstrated that injury causes alterations to dendrite morphology \cite{thompson2016vivo}.


\section{Training details for the segmentation model} \label{appendix:training-detail}

 Each dataset is divided into train/validation/test folds containing respectively $70\%$, $15\%$, and $15\%$ of the data. The number of images in each fold for the different datasets is $291/64/62$ for the real images, the noise-based synthetic images, and the generated Dataset 1 images, and $259/57/57$ for the generated NeuroMorpho 1 images. The architecture of our segmentation model is a U-Net. We train on patches of size $256 \times 256$, with a $50\%$ overlap. The training loss is a combination of BCE loss and Dice loss. We use the Adam optimizer with a learning rate of $0.01$ and a batch size of $64$. The training is automatically stopped based on the validation loss, and the best model is retained. At inference, the overlapping patches are merged with cosine weights, and the probability map is thresholded at $0.5$ to compute the Dice score. 

\section{Evaluation metrics} \label{appendix:metrics}

The details of the implementation of the metrics used in Section \ref{sec:ablation} are given hereafter.

\begin{itemize}
\item{$FID_{0}$ : Fr\'echet inception distance (FID) between the low-resolution ($64 \times 64$) images generated by the base generation cGAN at scale $0$, and the real images, downsampled to the same resolution ($64 \times 64$).}

\item{$FID_{n}$ : Fr\'echet inception distance (FID) between the full-resolution images generated at the last scale $n$, and the real images. $FID_{0}$ and $FID_{n}$ were computed using the Python package torchmetrics.}

\item{$scale\_corr$ : Mean cross-correlation between the images generated at different scales. The mean cross-correlation value is averaged over $10$ different input segmentations.}

\item{$repeat\_corr$ : Mean cross-correlation between images generated for $10$ repetitions of the same input. The mean cross-correlation is averaged over $10$ different input segmentations.}

\item{$label\_corr$ : Mean correlation between the input segmentations and the generated images. To be independent of the background intensity, the correlation is computed on local patches ($10 \times 10$) along the neurites, and averaged.}

\end{itemize}



\section{Additional figures} \label{appendix:figures}

    \begin{figure}[!h]
    \centering
    \includegraphics[width=1\textwidth]{Img/examples.pdf}
    \caption{Examples from different datasets; images generated by our method with neuron morphologies from the NeuroMorpho dataset (NeuroMorpho 1 and 2), images generated by our method with neuron morphologies from the train dataset (Datasets 1 and 2), and real images (after histogram equalization).}
    \label{Img:examples}
    \end{figure}

    \begin{figure}[!h]
    \centering
    \includegraphics[width=1\textwidth]{Img/ablation.png}
    \caption{Images generated by the models tested in our ablation study. For each model, two samples generated from the same neuron segmentation input are displayed in order to visualize the diversity of the synthetic images.}
    \label{Img:ablation}
    \end{figure}
   

    \begin{figure}[!h]
    \centering
    \includegraphics[width=1\textwidth]{Img/segmentations.png}
    \caption{Neuron segmentation predicted from real microscopy images by models trained on different synthetic datasets.}
    \label{Img:segmentation}
    \end{figure}


    \clearpage
    
    \begin{table}[!h]
    \centering
    \small
    \begin{tabular}{|c|ccccc|}
    \hline     
    scale & 0 & 1 & 2 & 3 & 4\\ 
    \hline
    receptive field size / patch size & 1024 / 64 & 512 / 64 & 256 / 64 & 256 / 128 & 256 / 256\\  
    resolution factor & 1/16 & 1/8 & 1/4 & 1/2 & 1\\ 
    iterations &  3000 & 3000 & 6000 & 6000 & 6000\\
    \hline
    \end{tabular}
    \caption{Parameters used for training at each scale of the generation process. The receptive field size (i.e. the amount of the original image contained in the patch) and the patch size are expressed in number of pixels. The resolution factor refers to the ratio between the resolution of the image generated at a given scale and the resolution of the original image. A resolution factor of $1$ indicates an image at full resolution. The number of epochs is expressed relatively to the total number of iterations (dataset size $\times$ batch size), to make it independent from the size of the input dataset.}
    \label{Tab:parameters}
    \end{table}
    

    \begin{table}[!h]
    \centering
    \small
    \begin{tabular}{|c|c|ccc|}
    \hline     
    training & inference & $FID_{n} \downarrow$ & $repeat\_corr \downarrow$ & $label\_corr \uparrow$ \\
    \hline
    Dataset 1 & Dataset 1 & $0.38$ & $0.19 \pm 0.08$ & $0.70 \pm 0.03$\\
    Dataset 1 & NeuroMorpho 1 & $1.10$ & $0.16 \pm 0.04$ & $0.75 \pm 0.02$\\
    Dataset 2 & Dataset 2 & $1.20$ & $0.37 \pm 0.13$ & $0.76 \pm 0.05$\\
    Dataset 2 & NeuroMorpho 2 & $1.08$ & $0.43 \pm 0.16$& $0.82 \pm 0.04$\\
    \hline
    \end{tabular}
    \caption{Performance of the generation model for inference on NeuroMorpho neuron morphologies unseen during training. The images generated from NeuroMorpho neurons show similar levels of diversity, compliance with the input neuron morphology, and fidelity to the real images as the images generated from the training neuron morphologies. The bigger gap in $FID$ observed in Dataset 1 can be explained by the important differences between the neuron morphologies of Dataset 1 and NeuroMorpho.}
    \label{Tab:neuromorpho-metrics}
    \end{table}
    
   
    \begin{table}[!h]
    \centering
    \small
    \begin{tabular}{|c|ccccc|}
    \hline 
    training set size & 10 & 20 & 50 & 100 & 291 \\
    \hline
    $FID_{n}$ & $1.17$ & $0.57$ & $0.46$ & $0.30$ & $0.38$\\
    $repeat\_corr$ & $0.50 \pm 0.19$ & $0.40 \pm 0.10$ & $0.26 \pm 0.12$ & $0.29 \pm 0.07$ & $0.19 \pm 0.08$\\
    \hline
    \end{tabular}
    \caption{Impact of the size of the training set on the model performance. The decrease in the realism and diversity of the generated images observed when we reduce the number of training pairs remains within an acceptable range even with few data (20-50).}
    \label{Tab:trainingset-size}
    \end{table}



\end{document}

