\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage[nolist]{acronym}
\usepackage{booktabs}
\usepackage[font=small,labelfont=bf]{caption}
\usepackage{wrapfig}
\newcommand{\argmax}{\arg\!\max}
\newcommand{\ie}{i.\,e.\ }
\newcommand{\eg}{e.\,g.\ }
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 037}
\editors{Accepted for publication at MIDL 2024}

%\everypar{\looseness=-1}
% can increase linepenalty more, but sometimes leads to ugly formatting...
\linepenalty=1000

\title[MSSG]{Heterogeneous Medical Data Integration with Multi-Source StyleGAN}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}
% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Wei-Cheng Lai\nametag{$^{1}$}} \Email{wei-cheng.lai@hpi.de}\\
\Name{Matthias Kirchler\nametag{$^{1, 5}$}} \Email{matthias.kirchler@hpi.de}\\
\Name{Hadya Yassin\nametag{$^{1}$}} \Email{hadya.yassin@hpi.de}\\
\Name{Jana Fehr\nametag{$^{1,2}$}} \Email{jana.fehr@bih-charite.de}\\
\Name{Alexander Rakowski\nametag{$^{1}$}} \Email{alexander.rakowski@hpi.de}\\
\Name{Hampus Olsson\nametag{$^{3}$}} \Email{n.hampus.olsson@gmail.com}\\
\Name{Ludger Starke\nametag{$^{3}$}} \Email{ludger.starke@mdc-berlin.de}\\
\Name{Jason M. Millward\nametag{$^{3}$}} \Email{jason.millward@mdc-berlin.de}\\
\Name{Sonia Waiczies\nametag{$^{3}$}} \Email{Sonia.Waiczies@mdc-berlin.de} \\
\Name{Christoph Lippert\nametag{$^{1, 4}$}} \Email{christoph.lippert@hpi.de} \\
\addr $^{1}$ Digital Health and Machine Learning, Hasso-Plattner-Institute, University of Potsdam, Germany \\
\addr $^{2}$ QUEST Center for Responsible Research, Berlin Institute of Health, Charit\'e Universit\"atsmedizin Berlin, Germany \\
\addr $^{3}$ Max-Delbr\"uck-Center for Molecular Medicine in the Helmholtz Association (MDC), Berlin Ultrahigh Field Facility, Berlin, Germany \\
\addr $^{4}$ Hasso Plattner Institute for Digital Health at Mount Sinai, Icahn School of Medicine at Mount Sinai, New York, NY, United States of America \\
\addr $^{5}$ University of Kaiserslautern-Landau, Germany
% \\
}
% \raggedbottom % Prevents LaTeX from stretching content to fill the page
\begin{document}

\maketitle
\vspace{-6mm}
\begin{abstract}
Conditional deep generative models have emerged as powerful tools for generating realistic images enabling fine-grained control over latent factors. 
In the medical domain, data scarcity and the need to integrate information from diverse sources present challenges for existing generative models, often resulting in low-quality image generation and poor controllability. 
To address these two issues, we propose Multi-Source StyleGAN (MSSG). MSSG learns jointly from multiple heterogeneous data sources with different available covariates and can generate new images controlling all covariates together, thereby overcoming both data scarcity and heterogeneity.
We validate our method on semi-synthetic data of hand-written digit images with varying morphological features and in controlled multi-source simulations on retinal fundus images and brain magnetic resonance images. Finally, we apply MSSG in a real-world setting of brain MRI from different sources. Our proposed algorithm offers a promising direction for unbiased data generation from disparate sources. For the reproducibility of our experimental results, we provide a detailed code implementation.\footnote{\url{https://github.com/weslai/msstylegans}}
\end{abstract}

\begin{keywords}
Generative Models, StyleGAN, Multi-Source, MRI, Retinal Fundus Images.
\end{keywords}

\section{Introduction}

\label{sec:intro}
Many medical \ac{dl} applications suffer from algorithmic biases, stemming from training with limited and biased data sets due to data restriction and low disease prevalence \cite{reviewgans_Kazeminia2020, Bak2022}. 
% \acp{gan} and Generative Diffusion Models
Currently, research has employed deep generative models to create synthetic Computed Tomography scans, brain \acp{mri} \cite{Frid-Adar2018_gan, Han2018_ganMR_imaging, 3Dstylegan, xiang2023ddm} and retinal fundus images \cite{CostaGMANMC17, End2EndAdversarial_retinal, ZhaoLC17}. This offers an opportunity to fill the data availability gap in the medical domain by augmenting available data with synthetic counterparts \cite{reviewgans_Kazeminia2020}. These applications generate high-resolution medical images but are not able to generate images with specific demographic and clinical characteristics. Other generative approaches use conditional \acp{vae} that learn causal relationships to infer brain \acp{mri} with specific clinical and demographic characteristics \cite{Reinhold2021, Jung2021_cond3D} and generate counterfactual brain \acp{mri} \cite{Pawlowski2020}.

\begin{wrapfigure}{r}{0.45\textwidth}
    \floatconts
    {fig:mri_ms}% label
    {\vspace{-5mm} \caption{Conditionally generated MRI from the multi-source model. Age is fixed at $59$. The model controls ventricle volumes and grey matter volumes (white part of the brain MRI) simultaneously.}}
    {\vspace{-7mm} \includegraphics[width=0.45\textwidth, keepaspectratio]{figures/main/mri_age_59.0.pdf}}
\end{wrapfigure}
In the medical domain, several crucial limitations arise. 
Generally, data sets are \emph{small} compared with natural image data sets, different studies present \emph{diverging demographic and distributional stratifications}, and different sources release \emph{different sets of latent factors}.
Consider the example of building a generative model to synthesize brain \acp{mri} for developing models that predict dementia.
One data source may be a cohort study and collect data from a general population, with age at collection sampled uniformly between 40 and 70; due to the cohort design, \ac{ci} has a very low prevalence in this source.
Another data source, on the other hand, may be a specific \ac{ad} cohort with a very high prevalence of CI in the sampled individuals; since age is a major risk factor for AD, this second data source will likely also collect older individuals.
In addition, the two studies' collection protocols will be differently designed to satisfy different aims.
Hence, the available covariates for each study will only partially overlap.
While both studies are highly likely to collect information on age, demographics, and sex, the first study might collect additional information on other neurological issues, while the second study might collect clinical measurements specific to \ac{ad}.
Indeed, this example is the very case for the \acf{ukb}, a UK-based population study \cite{UKB}, and the \acf{adni} \cite{adni_Petersen2010-zh}.
Generally, due to privacy concerns and high acquisition costs, medical images are usually also released only in smaller studies and cannot be scraped at the same scale as natural images.
For example, the \ac{ukb} contains one of the largest brain MRI studies to date and has so far acquired images of fewer than 50,000 individuals.

%\vspace{-2mm}

%Secondly, medical image data is scarcely labeled.
%Annotation of medical images requires high expertise in specialized medical subdisciplines and can take considerable time even for a single instance \cite{Aljabri2022Towardsannotation}.
%Additionally, researchers often exclude factors that are not of direct interest to the study design to protect the privacy of participants, even if it was collected in the original data.

Training reliable generative models on medical imaging data is challenging due to several factors. Limited data availability often leads to lower visual quality and incomplete coverage of the image space. Additionally, the scarcity of labels restricts control over generated images to the covariates present in a single dataset. Most existing conditional generative models cannot leverage multiple data sources simultaneously unless label overlap is perfect \cite{multiattr_pizza_gen}. Moreover, models trained on a single data source suffer from reduced data availability. To address these limitations, we introduce \ac{mssg}, which is based on StyleGAN3 \citep{Karras2021stylegan3} but learns from multiple data sources concurrently. This approach increases dataset size and enables conditional image generation with \emph{all} available latent covariates.

We first validate \ac{mssg} on a semi-synthetic data set of hand-written digits in which we can directly control specific morphological characteristics \cite{castro2019morphomnist}.
We also investigate MSSG's behavior on real-world medical imaging data from a single source with a simulated data source split.
Finally, we apply \ac{mssg} to the realistic setting of multiple sources of brain MRI data. % and on retinal fundus images.  
%\ac{MSSG} to generate brain \acp{MRI} and retinal fundus images with specific clinical and demographic characteristics.
%For this, we trained our model to learn conditional latent relationships from two datasets in parallel. For the brain \acp{MRI}, the first dataset is UKBiobank \cite{UKB}, from which we learn conditional relationships between demographic characteristics and \acp{MRI}. The second dataset is the \acf{ADNI} cohort \cite{adni_Petersen2010-zh}, from which we learn the relationships between clinical characteristics of dementia and \acp{MRI}. From this \ac{MSSG} we infer synthetic brain MRIs with specific age, brain volumes and normal or impaired cognitive functions. For the retinal imaging, the first dataset is also UKBiobank \cite{UKB}, which contains specific quantitative optical information. The second dataset is Eyepacs \cite{eyepacs} published on Kaggle, which contains diabetic retinopathy from different scales.
We show that \ac{mssg} can synthesize high-quality medical images and jointly control latent structures present in those images.


\section{Multi-Source StyleGAN}
\label{sec:methods}

\subsection{Conditional StyleGANs}
Conditional \acp{gan} generate natural images with specified categories \cite{mirza2014conditional}. Given the training data and labels, the labels are passed to the generator and the discriminator. In the generator, labels are concatenated with noise into a latent representation. In the discriminator, labels are fed as input and the discriminator distinguishes synthetic images from real images based on labels.
For conditional StyleGANs, the generator transforms a one-hot-vector label $c$ into an embedding vector. This is passed to the mapping network $M$ with a latent vector $z$. The mapping network produces another latent vector $w$ and this latent vector $w$ is passed to the synthesis network for image generation. The StyleGAN discriminator applies a conditional projection discriminator \cite{miyato2018cgans}; see \sectionref{sec:related_work} for details. % We provide further background on conditional \acp{gan} in the \sectionref{sec:related_work}.

\paragraph{Mixed-type latent-variable conditional GANs}
\label{sec:our-cgan}
%Most works on conditional image generation with \acp{gan} have focused on the simple case of discrete classes instead of continuous latent factors. \citeauthor{continuous_ding2021ccgan} propose continuous conditional \acp{gan} (ccGANs) to deal with regression labels, such as angles and ages \cite{continuous_ding2021ccgan, improved_ccgan}. However, the proposed method was demonstrated on a single conditional label.
% \cite{ijcai2022_dobler} investigate on multi-conditional \acp{stylegan} and generate art images from different type of labels, including categorical and probabilistic labels. The proposed method does not deal with regression labels.

In this work, we focus on an extension of conditional \acp{gan} that can (i) handle mixed-type latent factors and (ii) integrate data sources that have different labels available.
% In our proposed method, 
Here, the generator network takes in both a noise variable $z\in \mathbb{R}^{d_z}$ and a conditioning variable $c\in \mathbb{R}^{d_c}$, which may consist of binary, multi-class (coded as a one-hot subvector), and continuous values.
Analogously to the standard StyleGAN definition, the generator maps $c$ through an embedding network and concatenates the resulting embedding with the noise variable $z$ to generate the image.
The discriminator predicts $1+d_c$ output variables, with the first output denoting the standard fake/real prediction.
The remaining $d_c$ outputs are predictions for each of the input variables, with appropriate loss functions for each variable (e.g., the cross-entropy loss for categorical subfeatures, and the quadratic loss for continuous subfeatures).
% Note that 
This requires a generative model of the latent distribution, which will be learned independently from the covariates, as described in the next section.
 
\subsection{Modeling of Latent Factors}
\label{subsec:latent_models}
For simplicity, we will focus on the case of two sources, $D^1$ and $D^2$.
Each data source $j$ consists of image-covariate pairs, $D^j = \left\{ (x_i^j, c_i^j) \right\}_i$.
The covariates consist of a \emph{shared part} $c^j_\text{shar}$ that is available in both sources, a \emph{unique part} $c^j_\text{uniq}$ that is only available in source $j$, and a \emph{hidden part} $c^j_\text{hidd}$ that is available in the other data source but not in $j$, i.e., $c^j = \left[c^j_\text{shar}, c^j_\text{uniq}, c^j_\text{hidd}\right]$.
The shared part denotes variables that are available in both data sets and coincide for both data sources, $c^1_\text{shar} \triangleq c^2_\text{shar} \in \mathbb{R}^{q_\text{shar}}$.
However, due to distribution shift, the unconditional distribution may differ between data sources, $p(c^1_\text{shar}) \ne p(c^2_\text{shar})$.
A typical example may be the age of each individual: different studies often collect data in different age cohorts.
The remaining variables are unique to each data source and may have different dimensionality, $q^j_\text{uniq}$. These could be specific variables, such as different brain volumes or cataracts (a disease), which are not recorded in every data source but are related to the shared variables.
With only two data sources, $c^1_\text{uniq} \triangleq c^2_\text{hidd}$ and vice versa.

\begin{wrapfigure}{r}{0.40\textwidth}
    \floatconts
    {fig:ms_architecture}
    {\vspace{-4mm}\caption{Latent space model of \ac{mssg}. $f^1$ approximates $c^{2}_\text{hidd}$ from the shared covariate $c^{j}_\text{shar}$ and vice versa. They are integrated into a joint latent space $\hat{c}^j = (c^j_\text{shar}, \hat{c}^j_\text{hidd}, c^j_\text{uniq})$. The generator $G$ gets the concatenated vector from a latent noise $z$ and the joint latent space $\hat{c}^j$.}}
    {\vspace{-4mm}\includegraphics[width=.40\textwidth,keepaspectratio]{figures/main/latentspace_arch.pdf}}
\end{wrapfigure}

We assume that the \emph{conditional} distribution does not shift between data sets, that is, we assume that $p(c^1_\text{hidd} \mid c^1_\text{shar}=\xi) = p(c^2_\text{uniq} \mid c^2_\text{shar}=\xi)$ for any $\xi$.
This assumption allows us to model the distribution between the different latent variables.
Hence, we fit \emph{stochastic} models $f^j$ that can sample from the conditional distributions, respectively: on source 1, we fit $f^1(c^1_\text{shar}) = \hat{c}^1_\text{uniq}$, which can be used in source 2 to impute the missing $\hat{c}^2_\text{hidd}$, and vice versa.
Different parametric and non-parametric statistical methodologies can be employed for such a distribution estimation, such as structural equation modeling \cite{structural_counterfactuals, Pawlowski2020}, Bayesian methods \cite{bayesian_gan_yunus}, or GAN-based methods \cite{NEURIPS2021_decaf}.
In this work, we use \ac{mle} of a hand-designed parametric model for all conditional and unconditional distributions, %parameters, 
see \sectionref{sec:latent_space_modeling}.
%For all conditional and unconditional distributions, 
For all latent factors, we fit models from which we can sample, instead of only predicting the most likely outcome. %With parametric models, we can \emph{sample} from the conditional distribution, instead of predicting the most likely outcome.

\subsection{Training Paradigm}
\label{subsec:train_paradigm}
Before training the \ac{mssg}, we first fit the latent space models $f^1, f^2$ as described in the previous section. In this work, all conditional models are parametrized by (generalized) linear models or conditional \acp{gmm}.
We adapt the standard \ac{gan} training paradigm with alternating discriminator and generator update steps.
Our proposed model has a \acs{stylegan} backbone, but in principle, it can be used with most GANs. \figureref{fig:ms_architecture} shows how the concatenated vector $(z, c^j_\text{shar}, \hat{c}^j_\text{hidd}, c^j_\text{uniq})$ is passed to the generator $G$ to generate images. 

During the \textbf{generator step}, we sample the available ground-truth covariates $c^j_\text{shar}$ and $c^j_\text{uniq}$ directly from the training data sets, for both data sources at equal proportions.
We then estimate the hidden variables from the latent space models fit in the initialization stage, $\hat{c}^1_\text{hidd} = f^2(c^1_\text{shar})$ and vice versa.
We gather the latent data of both sources in a joint batch and sample $z$ from the noise distribution.
We use $c$ and $z$ to generate an image with the generator,
\begin{math}
    \text{Img} = G(z, c_\text{shar}, c_\text{uniq}, \hat{c}_\text{hidd}).
\end{math}
We then compute predictions from the discriminator,
\begin{math}
    \hat{y}, \hat{c}_\text{shar}, \hat{c}_\text{uniq}, \hat{c}_\text{hidd} = D(\text{Img}),
\end{math}
where $\hat{y}$ is the binary prediction of whether the image is generated or real, and propagate the loss as described in \sectionref{sec:our-cgan} to update the generator.

During the \textbf{discriminator step}, we generate covariates from the latent-variable model described in \sectionref{subsec:latent_models}.
The shared and unique covariates are sampled separately for each data source. 
We sample the hidden conditional covariates from the corresponding latent space model from the other source.
%This means, that we sample the unconditional hidden covariates for source 1 with the latent model from source 2, and vice versa.
%
We then generate synthetic images with the generator and draw real samples from both sources at equal proportions.
The discriminator is again trained to both distinguish between fake and real images and to predict the correct latent covariates from the images.
However, we only compute the loss over the covariates for the real images to prevent shortcut learning, where the discriminator and generator cooperate to minimize the loss without solving the training task. 
%Based on the type of latent covariates, different loss functions are used. For example, for regression covariates, mean square error is applied and cross-entropy loss is used for the multi-categorical features, as mentioned in \sectionref{sec:our-cgan}.

% Furthermore, we introduce for training the discriminator a lambda, which serves as a regularization for training latent covariates.
%\vspace{-3mm}
\subsection{Inference Stage}
After training the proposed \ac{mssg}, we can generate images while controlling all covariates from different sources jointly.
To draw a fully random synthetic sample from any of the available sources, we can draw a sample from the shared covariate model, $c^j_\text{shar}$, and push it through the latent space models $f^1, f^2$ to generate $c^j_\text{uniq}$ and $c^j_\text{hidd}$. 
Alternatively, of course, we can directly set the variables $c_\text{shar}$, $c_\text{uniq}$ and $c_\text{hidd}$ to any desired values. 
This is shown in \figureref{fig:ms_architecture}, where we describe how to use the joint latent covariates with \acp{gan}.

% \begin{wrapfigure}{r}{0.5\textwidth}
%     \centering
%     \vspace{-11mm}
%     \includegraphics[width=.49\textwidth,keepaspectratio]{MIDLLatex/figures/main/ms_gan_arch_latent.pdf}
%     \vspace{-8mm}
%     \caption{Latent space models of \ac{mssg}. $f^1$ approximates $c^{2}_\text{hidd}$ from the shared covariate $c^{j}_\text{shar}$ and vice versa. They are integrated into a joint latent space $\hat{c}^j = (c^j_\text{shar}, c^j_\text{uniq}, \hat{c}^j_\text{hidd})$. The generator $G$ gets the concatenated vector from a latent noise $z$ and the joint latent space $\hat{c}^j$.
%     }
%     \vspace{-10mm}
%     \label{fig:ms_architecture}
% \end{wrapfigure}

%\vspace{-4mm}
\section{Experimental Evaluation}
We first validate that \ac{mssg} can properly learn from multiple sources in the toy data set MorphoMNIST that allows full control of latent factors.
% In this section, we empirically evaluate \acp{mssg} in controlled semi-synthetic scenarios and real-world medical data.
% Initially, we validate \ac{mssg}'s reliable image generation from two different sources using a semi-synthetic dataset of controllable handwritten digits.
Subsequently, we apply our method to real-world datasets---brain magnetic resonance images and retinal fundus images---from the \acl{ukb} and simulate a multi-source scenario with a data split.
% To further demonstrate efficacy on real-world medical data, we simulate a multi-source scenario by partitioning these datasets into two sources, each with access to partially overlapping latent factors. 
Lastly, we explore a fully realistic multi-source setting by incorporating images from \ac{adni} into the \ac{ukb} setting.
% Lastly, we explore a fully realistic multi-source setting by incorporating images from \acl{adni} and \ac{rfmid} \cite{rfmid} into the two \ac{ukb} settings.
% the EyePACS diabetic retinopathy data set, respectively.
We compare our \ac{mssg} model with single-source StyleGANs, which are trained without latent space models and only on a single source.

\paragraph{Evaluation Metrics}\label{sec:eval_metrics} Various image quality metrics have been proposed, including the Inception Score \cite{inception_score}, Precision-Recall \cite{precision_recall_gan}, \ac{fid} \cite{fid_score}, and \ac{kid} \cite{kernel_mmd}.
Here, we focus on the \ac{fid} and report \ac{kid} and further metrics, namely \ac{lpips} \cite{zhang2018perceptual}, \ac{ssim}, \ac{psnr}, in \sectionref{sec:ext_eval}.% metrics to evaluate how well a set of generated images matches the distribution of real test images. The \ac{kid} metric results are discussed in \sectionref{sec:ext_eval}.

Additionally, we evaluate the controllability of the latent factors in the generated images. Previous works such as the Intra-FID \cite{miyato2018cgans} do not apply to images with continuous labels and do not address the controllability of specific covariates.
%Calculating \ac{fid} for each category can be computationally expensive.
We propose a new metric, the \emph{strata prediction score}, to evaluate the controllability of continuous covariates.
We stratify test set samples into $m$ marginal bins per covariate, with each marginal bin containing $33\%$ of the total sample size to maintain adequate representation. Within each bin, we generate $15{,}000$ images corresponding to the test set labels. Predictions for all covariates are made using separately trained prediction models on both real and generated images. The strata prediction score is then calculated as the weighted average Pearson correlation coefficient between predicted covariates from generated and test set images across all bins. See \sectionref{sec:eval_regression} for details.

% We stratify test set samples based on their labels, dividing each covariate into $m$ marginal bins. Each marginal bin on a single variable contains $33\%$ of the total sample size in our experiments to ensure sufficient sample size in each stratum. Within each stratum, we generate $15,000$ images with the same test set labels.
% We then predict all covariates with a separately trained prediction model on both the true and the generated images.
% The strata prediction score then is the Pearson correlation coefficient between the predicted covariates from generated and test set images, averaged over all strata (weighted by the stratum size).

%\begin{table}[the]
\begin{wraptable}{r}{9cm}
\vspace{-6mm}
\floatconts
    {tab:semi_ms_fid}
    {\caption{\ac{fid} $\downarrow$ in three semi-multisource scenarios}}%
    {\footnotesize%
        \vspace{-6mm}
        \begin{tabular}{lccc}
            \toprule
            \ac{fid} $\downarrow$ & & \textbf{Data sets} & \\
            \textbf{Methods} & MorphoMNIST & MRI & Retina \\
            \midrule
            % Source 1 half (baseline) & 2.992 & & \\
            Source 1 (baseline) & 3.28 & 22.11 & 30.19\\
            % Source 2 half (baseline) & 3.147 & &\\
            Source 2 (baseline) & 3.08 & 8.91 & 14.21\\
            Multi-source (half) & 3.13 & 13.81 & 14.62\\
            Multi-source (full) & \textbf{2.24} & \textbf{8.62} & \textbf{10.17}\\
            \bottomrule
        \end{tabular}
    }
\end{wraptable}
%\end{table}

\subsection{Validation in MorphoMNIST}
\paragraph{Setting}
MorphoMNIST \cite{castro2019morphomnist} is a semi-synthetic dataset derived from the MNIST benchmark.
In contrast to MNIST, MorphoMNIST allows explicit control of morphological features like \texttt{thickness}, pixel \texttt{intensity}, \texttt{slant} (rotation) in addition to the \texttt{digit}. We modified the synthetic data generation model proposed by \citet{Pawlowski2020}; \sectionref{app:morpho_models} provides detailed information. The first data source includes \texttt{thickness}, \texttt{digit}, and \texttt{intensity}, while the second source includes \texttt{thickness}, \texttt{digit}, and \texttt{slant}, i.e., the shared variables are $c_\text{shar} = \left[\texttt{thickness}, \texttt{digit}\right]$, with $c^1_\text{uniq} = c^2_\text{hidd} = \texttt{intensity}$ and $c^2_\text{uniq} = c^1_\text{hidd} = \texttt{slant}$.

%the same structure for the first data source with two covariates, \ie thickness $T$ and intensity $I$.
%For the second data source, we created another structural scenario, in which the stroke thickness influences the slant of digits.
%Therefore, there are also two covariates, namely thickness $T$ and slant $S$.
%The shared covariate between both sources is  $T$. We formulate below the correlation between covariates.
%\begin{equation}
%    c^2_{uniq} = h(c^1_{shar}, \epsilon), \quad \epsilon \sim \mathcal{N}(0, 1)
%    c^1_{hidd} \triangleq c^2_{uniq}
%    \label{equ:approximate_source1}
%\end{equation}

\paragraph{Evaluation}
\label{sec:morpho_eval}
As a baseline, we train conditional StyleGANs on each source independently, each with a sample size of $N = 24{,}000$. %, ensuring comparable sample sizes ($N = 24,000$).
Our multi-source StyleGAN can integrate data from different sources and thus can utilize more data than the single-source models.
To disentangle the effect of increased sample size and covariate aggregation, we first train our model on a reduced data set with $12{,}000$ samples from each data source (i.e., $N = 24{,}000$ in total), denoted by ``half.''
We also train a model on the full data set of $N = 48{,}000$ images, denoted by ``full.''
%We trained our multi-source GAN on MorphoMNIST in two scenarios. In the first, with a dataset of $24,000$ images, half from source 1 (thickness and intensity) and half from source 2 (thickness and slant).
%In the second scenario, the dataset was doubled to 48,000 images, with equal distribution from source 1 and source 2 ($N = 24,000$ each).

% \begin{figure}[t]%{\textwidth}
%     \begin{minipage}[ht]{.49\textwidth}
%         \centering
%         \includegraphics[width=\textwidth, keepaspectratio]{MIDLLatex/figures/main/mri_age_59.0.pdf}
%         \caption{Conditionally generated MRI from the multi-source model. Age is fixed at $59$. The model controls ventricle volumes and grey matter volumes (white part of the brain MRI) simultaneously.}
%         \label{fig:mri_ms}% label
%     \end{minipage}
%     \hfill
    % \begin{minipage}[ht]{.49\textwidth}
    %     \centering
\begin{wrapfigure}{r}{0.45\textwidth}
    \floatconts
    {fig:result_heatmap}
    {\vspace{-8mm}\caption{Strata-weighted Pearson correlation $\uparrow$ between covariates predicted from generated and real test-set images.}}
    {\vspace{-6mm}\includegraphics[width=0.45\textwidth, keepaspectratio]{figures/main/strata_corr_semi_allcovs_w_cataract.pdf}}
\end{wrapfigure}
%     \end{minipage}
% \end{figure}

\tableref{tab:semi_ms_fid} (first column) shows that, at comparable sample size, our multi-source model achieves image quality comparable to the baseline methods, while using both full data sets (which is not possible for the single-source models) leads to a considerable improvement in image quality. 
%reveals that the multi-source model, trained on an equal number of samples, achieves a joint FID of $3.13$ (3rd row), which is comparable to single-source models (baselines). Doubling the data samples enhances the joint FID to $2.24$, surpassing baseline performance. Further evaluations are presented in \tableref{tab:kid_morpho}.\\
In \figureref{fig:result_heatmap} we explore the controllability of the different models.
As expected, our multi-source models can reliably control all three covariates, with slight improvements for larger sample sizes.
The single-source baseline models can only control the respectively available covariates well but show low-to-moderate correlation with the unavailable covariates due to the correlation between thickness, intensity, and slant in the training data (see \sectionref{app:morpho_models}).
Qualitatively, in \figureref{fig:app_morpho_ms} we fix two covariates and show that our model can control the remaining two variables; note that our model can jointly control the unique variables \texttt{slant} and \texttt{intensity}, even without instances in the training data with both variables concurrently available.
%Moreover, the controllability of image generation is explored through the strata prediction score, as depicted in \figureref{fig:result_heatmap}. Evaluation outcomes from multi-source models (central columns) and baseline models (columns 1 and 2) are showcased. Shared covariance (\texttt{thickness}) yields the highest correlation from multi-source models with the scores of $0.702$ and $0.713$. Independent covariates (\texttt{intensity} and \texttt{slant}) in sources 1 and 2 are effectively learned by single-source models but struggle with the unlearned covariate. Multi-source models (half) exhibit slightly lower performance, with correlations of $0.832$ for intensity and $0.887$ for slant, having only been exposed to half of the true latent labels for \texttt{intensity} and \texttt{slant}. However, with doubled data samples, the multi-source model achieves comparable correlations ($0.867$ for intensity, $0.902$ for slant) and demonstrates the ability to generate all three covariates independently. \\
%We can see 
% Next, we report the strata-FID. In each stratum, we measured $20000$ conditional generated images with the corresponding real images. At the end, we take the weighted sum of FID through every stratum. This is shown in \ref{tab:strata_fid_comparison}. For MorphoMNIST, the multi-source GAN reaches a similar result as single-source models, however, it is slightly worse. Interestingly, single source 1 performs better on the general FID score, but not on the Strata-FID.
%We further investigate visual examination for assessing image generation controllability. With four covariates from two sources, we emphasize the control of unique variables (\texttt{slant} and \texttt{intensity}), while maintaining \texttt{digit} and \texttt{thickness} constant. Results in \figureref{fig:app_thickness_constant} demonstrate the multi-source model's ability to independently manipulate \texttt{intensity} and \texttt{slant}, even without instances in the training data with both variables concurrently available. In contrast, a single-source GAN (baseline) can only manipulate either intensity or slant at a given time.
%\vspace{-3mm}
\subsection{Validation on Real-World Data}
\label{sec:semi-ms}
%\vspace{-1mm}
Next, we validate MSSG in real-world scenarios with more subtle data dependencies, focusing on two medical imaging modalities: brain MRI, used in diagnosing conditions like Alzheimer's Disease; and retinal fundus images, used in diagnosing ophthalmological conditions such as cataracts, glaucoma, or diabetic retinopathy.

In the real-world setting, we use data from the \ac{ukb} resource. To validate the models' controllability, we divide each dataset into two artificial sources, allowing for a comprehensive evaluation. This approach addresses the absence of covariate labels in real-world multi-source settings, which we further explore in the next section.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% images %%% 
% \begin{figure}[t]
% %\vspace{-14mm}
% \floatconts
%     {fig:retinal_ms}%
%     {}
%     {%
%         \subfigure[Generated MR images from the multi-source model. The age is also set to a constant value of $59$. The model controls ventricle volumes and grey matter volumes (white part of the brain MRI) simultaneously.][b]{%
%             \includegraphics[width=.48\textwidth]{MIDLLatex/figures/main/mri_age_59.0.pdf}
%             \label{fig:mri_ms}%
%         }
%         \subfigure[The strata weighted Pearson's correlation $\uparrow$ between the outputs of prediction models.][b]{%
%             \includegraphics[width=.48\textwidth]{MIDLLatex/figures/main/strata_corr_semi_allcovs_w_cataract.pdf}
%             \label{fig:result_heatmap}%
%         }
%     }
% \end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\paragraph{Brain MRI}
\label{sec:ukb_brain}

We use the coronal middle slice of T1-weighted brain MRI scans, aligning them with the MNI atlas.
We split the data set into two sources with $N = 13{,}414$ and $N = 13{,}408$ samples, respectively.
%, is divided into two sources.
The first source has \texttt{age} and \texttt{ventricle volume} as covariates, which are positively correlated due to ventricle enlargement with age \citep{age_ventricular, brain_enlarged_ventricle}.
The second source has \texttt{age} and \texttt{grey matter volume}, which are negatively correlated \citep{age_related_change_greymatter, age_related_gm_decline, gm_cognitive_aging}.
%For model training, covariates are fitted into separate latent space models. The shared covariate, age, is modeled as a rescaled beta distribution, while ventricle and grey matter volumes are modeled using Gaussian Mixture Model regressions conditioned on age. Two scenarios are considered for the multi-source models: one with half data samples from both sources, equivalent to a single-source baseline, and another with doubled data samples.
%Conditioned on the age, for the estimation of hidden covariates, \ie ventricle and grey matter, in both sources. $p(c^1_\text{ventricle} | c^1_\text{age}=\xi)$ and $p(c^2_\text{grey matter} | c^2_\text{age}=\xi)$. 

Image quality between Source 1 and Source 2 differs strongly; our half-sized multi-source model achieves better performance than the average of the two models, and the full model outperforms all of the former models (\tableref{tab:semi_ms_fid}, second column). 
%In \ac{mri} data analysis, our multi-source model, trained with fifty percent of data samples from each source, achieves an FID score of $13.81$ on the test set. Notably, when trained with the complete dataset from both sources, the FID score improves significantly to $8.62$, as shown in \tableref{tab:semi_ms_fid} (second column), reaching a minimum but comparable value.
As shown in \figureref{fig:result_heatmap}, all models only reach low-to-moderate (but comparable) controllability of the individuals' age.
As in the MorphoMNIST example, ventricle and grey matter volumes can be jointly controlled moderately well with our multi-source model.
Note that all three variables -- \texttt{age}, \texttt{ventricle volume}, \texttt{grey matter volume} -- are very hard to discern from single-slice MRI alone \citep{age_mri_prediction}, explaining the lower controllability performance compared to MorphoMNIST.
%For further exploration, \figureref{fig:result_heatmap} illustrates the controllability assessment of individual covariates using three ResNet 50 prediction models.
% Regarding the control of ventricle volumes, our multi-source model with double data size performs slightly worse than Source 1, which specializes in training with ventricle volumes and ages. The multi-source model with half the data size of each source shows a $0.195$ difference compared to Source 1. In controlling grey matter volumes, the multi-source model with double data size exhibits performance similar to Source 2, trained with grey matter volumes and ages, while the multi-source model with half the data size of each source performs marginally worse than Source 2, with only a $0.014$ difference.

\figureref{fig:mri_ms} demonstrates the joint control of grey matter and ventricle volumes for fixed age. % displays generated MRI images for different combinations of ventricle and grey matter volumes, maintaining a constant age of $59$.
Independent control of ventricle volumes leads to their enlargement, while the grey matter consistently expands across every row, where ventricle and age remain constant. A subtle brightening effect at the periphery of the MR images aligns with the anatomical placement of grey matter at the brain's edge. An intriguing observation is the inverse relationship between ventricle volumes and grey matter volumes, suggesting a neurologically negative correlation.
%, as depicted in the generated MRI images.

\paragraph{Retinal fundus images}
\label{sec:ukb_retina}
We again use \texttt{age} as a shared covariate.
%We split the retinal fundus data set into two sources with the shared covariate $\texttt{age}$.
The first source includes \texttt{cataract} as a binary conditional covariate; %, a binary variable indicating normal or disease cases.
the second source contains \texttt{spherical power} as a unique variable, representing the lens power required to correct myopia (nearsightedness) or hyperopia (farsightedness).
Due to the low incidence of cataract cases in the \ac{ukb} dataset ($\approx 3.9 \%$) and to prevent class imbalances, we split the data into two sources, with $60\%$ normal and $40\%$ cataract images. Each source has approximately $1920$ images in the training set.
Due to the low sample size, in both sources, we included mirrored images and ADA-style data augmentation \citep{karras2020training}.

% \begin{figure}[t]
% %\vspace{-14mm}
% \floatconts
%     {fig:retinal_ms}%
%     {\caption{Generated retinal fundus images by the "full" multi-source \ac{gan}. Here, the model controls cataracts and spherical powers, while the age is set to $59$.}}
%     {%
%         \subfigure[cataract and spherical powers][b]{%
%             \includegraphics[width=.48\textwidth]{MIDLLatex/figures/main/retinal_age_59.2_2.pdf}
%             \label{fig:ret_cataract1}%
%         }
%         \subfigure[cataract][b]{%
%             \includegraphics[width=.48\textwidth]{MIDLLatex/figures/main/retinal_age_59.2.pdf}
%             \label{fig:ret_cataract2}%
%         }
%     }
% \end{figure}
%\paragraph{Evaluation}
%We trained again two single-source conditional \acp{stylegan} on Source 1 and Source 2, each with sample sizes of $N = 961$ and $N = 959$, respectively. Source 1 controls \texttt{age} and \texttt{cataract}, while Source 2 controls \texttt{age} and \texttt{spherical power}. Our multi-source conditional \acp{stylegan} is trained on both sources with these three covariates. For multi-source training, we considered two scenarios: half of the data sizes from both sources and the entire datasets from both sources, resulting in double sample sizes. 

As shown in \tableref{tab:semi_ms_fid}, image quality for the single-source models varies; the half-sized multi-source model is comparable to the better of the two, while the full multi-source model outperforms all other models.
%One possible reason for the low quality of the source-1 model may be an overfocus on the characteristic blurriness or haze of cataract-affected fundus images, which may lead to a decreased appearance of image quality. 
%As indicated in \tableref{tab:semi_ms_fid} (third column), the single-source 1, trained on age and cataract, achieves a final FID of $30.19$, while the single-source 2, trained on age and spherical power, attains a superior FID score of $14.21$. Our proposed method, utilizing half of the data samples from both sources, performs comparably to the single-source 2, yielding an FID score of $14.62$. Doubling the data samples improves our model's performance, outperforming the baseline methods (Source 1 and 2) with an FID score of $10.17$.
%This suggests that incorporating additional information from a hidden covariate contributes to generating more realistic images. Additionally, one plausible explanation for Source 1's lower performance could be the inherent difficulty in generating cataract images due to their blurry structure, making vessel observations less clear.
Controllability is less stable than for the previous data sets (\figureref{fig:result_heatmap}).
Spherical power can be controlled moderately well for those models that include it, but both age and cataract vary more strongly.
Potentially, our ``half'' model overspecializes on modeling the cataract phenotype but underperforms on age, while the full model strikes a more balanced performance. 
%Additionally, the strata prediction score metric assesses image controllability, detailed in \sectionref{sec:eval_metrics}. Results are presented in \figureref{fig:result_heatmap}, leveraging three ResNet 50 prediction models.
%In predicting ages based on retinal images (row 7), a general performance drop in generated images is observed. However, the proposed model with double data samples benefits from the increased sample size, enhancing age controllability to the score of $0.140$ compared to the half-data-samples model, which has a score of $0.072$. While the model with double data samples outperforms the Source 2 model ($0.102$), it falls short of the Source 1 model ($0.167$). By classifying binary cataract classes, the \ac{mssg} with half-data samples performs the best with a score of $0.303$ and is followed by the multi-source model with full-data samples with a score of $0.144$. The Source 1 model can not achieve a higher score ($0.061$), this could be a cause of its high \ac{fid} score. Regarding the prediction of spherical powers based on retinal images, both proposed models with half-data samples ($0.242$) and double-data samples ($0.266$) achieve comparable or even superior scores to the Source 2 model ($0.222$), trained to control spherical powers and ages.
\figureref{fig:retinal_ms} again shows that \ac{mssg} can control the variables (cataract and spherical power) jointly when the age is set to $59$. % with full data samples effectively controls \texttt{age} and \texttt{spherical power} across a range of values for both non-cataract and cataract conditions. Examining non-cataract images (\figureref{fig:ret_no_cataract}), changes in controlled spherical power result in variations in brightness, with subtle alterations occurring with age. Vessel structures are observable in these images. Conversely, for cataract images (\figureref{fig:ret_cataract}), the generated retina exhibits shadowiness and blurriness, hindering the clear observation of vessel structures.

\subsection{Application in true multi-source setting}
\label{sec:true-ms}
Finally, we investigate a real-world multi-source setting.
% in which the proposed method is trained with two different data sources. This can generate a mixture of images by controlling the characteristics of each data source.
%However, one concern is that it is difficult to evaluate multi-source models since there is no ground truth with the corresponding pair covariates between data sets.
%\paragraph{Brain MRI}#
We incorporate \ac{adni} \citep{adni_Petersen2010-zh}, a clinical dementia dataset, with \ac{ukb} (\sectionref{sec:ukb_brain}). From \ac{adni}, we select covariates \texttt{age}, \texttt{left hippocampus}, and \texttt{right hippocampus} from SynthSeg \cite{billot_synthseg_2023}. Studies underscore the significant correlation between hippocampal volumes and cognitive function \cite{cognitive_aging_hippocampus, subregion_hippocampus_EVANS2018129}, highlighting the relevance of these covariates. For the \ac{ukb} cohort, we consider \texttt{age}, \texttt{ventricle}, and \texttt{grey matter} volumes. Notably, the \ac{adni} cohort is older compared to \ac{ukb} ($\approx 74$ versus $\approx 55$, respectively).
% a higher average age ($\approx 74$) compared to \ac{ukb} ($\approx 55$).
%In this configuration, the \ac{adni} cohort has access to \texttt{age}, \texttt{left/right hippocampus}, while the \ac{ukb} cohort has access to \texttt{age}, \texttt{ventricle}, and \texttt{grey matter}.
\begin{wraptable}{r}{9cm}
    \floatconts
        {tab:brain_ms_fid}
        {\caption{\ac{fid} $\downarrow$ of \ac{ukb} and \ac{adni} in true multi-source setting. ``Joint'' denotes both cohorts are merged.}}%
        {\footnotesize%
            \vspace{-4mm}
            \begin{tabular}{lccc}
            \toprule
            \ac{fid} $\downarrow$ & & \textbf{Methods} & \\
            \textbf{Data sets} & Source \ac{ukb} & Source \ac{adni} & Multi-Source\\
            \midrule
            UKB & 8.0 & 74.5 & 15.5 \\
            ADNI & 66.1 & 16.8 & 34.3 \\
            Joint & 26.6 & 21.8 & 19.3 \\
            \bottomrule
            \end{tabular}
        }
\end{wraptable}
% \ac{CDR} is a cognitive rating that quantifies the severity of dementia symptoms.
% In ADNI, there are 3273 images (36\%) from cognitively normal individuals (CDR=0), 4943 images (53\%) from individuals with mild cognitive impairment (CDR=0.5), and 967 images (11\%) with dementia (CDR= greater or equal 1). 

% For the retinal application we use the EyePACS data set \cite{eyepacs} of retinal fundus images with \ac{DR} ratings as a second data source. \textcolor{red}{need to change this}
% \citet{predict_cardiovascular_retinal_poplin} showed that \texttt{DBP} and other cardiovascular risk factors can be predicted from retinal fundus imaging.
% \texttt{DBP} is a very common diagnostic variable for many cardiovascular conditions and is also associated with diseases such as malignant hypertensive retinopathy \cite{hypertensive_retinopathy}. 

% \ac{DR} is a common condition in long-term diabetic patients and can cause blindness.
% Images were rated on a scale from 0 to 4.
% The higher the score is, the more serious the diabetic eye disease is.
% Most images (73\%) in the data set have no \ac{DR} (level = 0).
% 7\% of images are mild \ac{DR} (level = 1) and around 15\% of images were rated as moderate \ac{DR} (level = 2).
% Levels $3$ and $4$ both contain $2\%$ of images and they are considered as severe and proliferative \ac{DR}.
% The training set has $12294$ images in total. Finally, we select the levels of the disease as the covariate for EyePACS and age, diastolic blood pressure, and spherical power as covariates for \ac{UKB}.

%\paragraph{Evaluation: brain MRI}
\tableref{tab:brain_ms_fid} compares \ac{fid} scores; MSSG outperforms the other models on the joint data set, while the more specialized single-source models fit their specific data sources more closely, as expected. % among the proposed multi-source model and single-source models trained separately on \ac{ukb} and \ac{adni}.
%The \ac{mssg} outperforms both single-source models with a \ac{fid} score of $21.0$ on the joint datasets (\ac{ukb} and \ac{adni}) comprising $13222$ images.
%Equal contributions are made by \ac{ukb} ($6707$ images) and \ac{adni} ($6611$ images).
%All models benefit from ADA data augmentation and x-flipping during training.
\tableref{tab:brain_ms_regr} demonstrates that the multi-source model can model all respective covariates with similar performance as the specialized baseline models.
%presents the joint regression scores for each single covariate.
%\texttt{Age} is a shared covariate in both sources. Notably, both single-source models struggle with the out-of-distribution problem, failing to generate images with corresponding out-of-distribution ages. In contrast, the proposed \ac{mssg} produces images with a broader age distribution and achieves a superior correlation score of $0.561$ compared to the \ac{adni} single source. For other unique covariates, the \ac{mssg} attains comparable correlation scores, except the right hippocampus score.
In \figureref{fig:real_ms_mri}, we show brain MRI generated by the multi-source model jointly trained on UKB and ADNI, %Two age distributions from both cohorts are generated.
and \sectionref{sec:visual_true_mri} shows more qualitative examples of generated images.

\section{Discussion \& Conclusion}

We introduced Multi-Source StyleGAN (MSSG), a conditional generative image model capable of learning from multiple disparate data sources concurrently. Our experiments demonstrate that integrating multi-source data does not compromise image quality compared to single-source generation, and it can enhance data quality by leveraging a larger dataset. Through various case studies, we validated MSSG's ability to control variables from different sources collectively, even without access to paired variables. We believe MSSG can address data scarcity and label scarcity issues in medical image data, especially for rare diseases. However, a limitation of our current method is its reliance on a hand-designed latent space model, which we chose to ensure that the latent model fits the true latent distribution well. Future work could explore using non-parametric general-purpose models like DECAF~\cite{NEURIPS2021_decaf} as a drop-in replacement. Additionally, extending our multi-source integration approach to other generative models such as VAEs and diffusion models holds promise for future research.

% We introduced Multi-source StyleGAN, a conditional generative image model that can learn from multiple disparate data sources at the same time.
% We showed that multi-source data integration does not hurt image quality compared to single-source image generation from the same number of images and can improve data quality through access to more data.
% We further validated in a range of case studies that MSSG can control all variables from different sources together, even without access to paired variables.
% We believe that MSSG can help alleviate the data scarcity and label scarcity gap on medical image data.
% We especially envision the integration of multiple disparate sources with enriched disease prevalence for rare diseases to be a promising direction for future research.
% % With the assumption that the conditional distribution does not shift between data sources \cite{cause_effect_learning}, the proposed model could work better when $c_\text{shar}$ is the cause of $c_\text{uniq}$ and $c_\text{hidd}$, where $p(c^1_\text{hidd} | c^1_\text{shar} = \xi) = p(c^2_\text{uniq} | c^2_\text{shar} = \xi) $.
% One limitation of our current method is that it requires a hand-designed latent space model.
% In the current work, we wanted to ascertain that the latent model has a very good fit onto the true latent distribution so as not to introduce additional artifacts due to unnatural latent variables; an extension could use non-parametric general-purpose models such as DECAF~\cite{NEURIPS2021_decaf} as a drop-in replacement.    
% Additionally, we envision an extension of our multi-source integration approach to other generative models such as VAEs and diffusion models as a promising future research direction.

\input{acros}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{The authors thank the participants of the UK Biobank study and the ADNI study. This research has been conducted using the UK Biobank Resource under Application Number 77717. Furthermore,
data used in preparation of this article were obtained from the Alzheimer's Disease Neuroimaging Initiative (ADNI) database (adni.loni.usc.edu). As such, the investigators within the ADNI contributed to the design and implementation of ADNI and/or provided data but did not participate in the analysis or writing of this report. A complete listing of ADNI investigators can be found at: \url{http://adni.loni.usc.edu/wp-content/uploads/how_to_apply/ADNI_Acknowledgement_List.pdf}
}

\paragraph{Data availability \\}
\hspace{-5mm}
In this work, three types of datasets are used. The Morpho-MNIST dataset is available in its GitHub repository (\url{https://github.com/dccastro/Morpho-MNIST}). The MR images are available upon application from the UK Biobank (\url{https://www.ukbiobank.ac.uk/register-apply}) and the ADNI database (\url{https://adni.loni.usc.edu/data-samples/access-data/}). Finally, the retinal images are available upon application from the UK Biobank (\url{https://www.ukbiobank.ac.uk/register-apply}).
\paragraph{Funding \\ } 
\hspace{-5mm}
This work was kindly supported by the German Ministry of Research and Education (Bundesministerium für Bildung und Forschung - BMBF) within the project ‘Syreal’ (Grant No. 01/S21069A). The funders had no role in the study design, data analysis, interpretation, or report writing.

\bibliography{midl24_037}

\newpage
\appendix

\section{Background}
\label{sec:related_work}
\acp{gan} are among state-of-the-art architectures in \ac{dl} to generate high-dimensional data \cite{stateoftheart_gan}. The backbone of a \ac{gan} method contains two networks, a generator network $G$ and a discriminator network $D$ \cite{goodfellow2014gan}. The generator produces synthetic data from a noise vector $z$ and the discriminator distinguishes it from real data.
In recent years, frameworks such as StyleGAN \cite{Karrasstylegan1, Karras2019stylegan2, Karras2021stylegan3} and BigGAN \cite{brock2018large} have developed GANs to generate high-quality images at high resolutions.
Recent diffusion models \cite{diffusion_model_dickstein} have been shown to achieve promising image synthesis \cite{stablediffusion, song2020denoising, song2021scorebased, ddpm_neurips}, but have slower sampling times than \acp{gan} \cite{dm_beats_gans}, require larger training data sets \cite{moon2022finetuning}, and can have a slow, computationally expensive training process (for example, the training time on the CelebA-64x64 dataset required around $24$ hours with 16 V100 GPUs \cite{wang2023patch}).

\paragraph{Conditional GANs}
Conditional \acp{gan} \cite{mirza2014conditional} were introduced to generate image data conditional on categorical factors.
Several studies modified and improved the performance of conditional \acp{gan} \cite{miyato2018cgans, brock2018large, Karras2019stylegan2, Karras2021stylegan3}.
%Many extensions and adaptions to conditional GANs have been developed so far.
%c\ac{gan} use a Projection Discriminator \cite{miyato2018cgans} that focuses on conditional discriminators, where the inner product between feature maps of input data and the embedding conditional classes is taken.
%BigGAN \cite{brock2018large} further improved the class conditional \acp{gan}. The generator is given class information by categorical conditional BatchNorm \cite{conditionalBatchNorm}, together with a projection discriminator \cite{miyato2018cgans}. Current state-of-the-art \acp{gan} also demonstrate the usage of conditional labels \cite{Karras2019stylegan2, Karras2021stylegan3}.
%For the generator, conditional labels map into the embedding space and concatenate with the latent noise $z$, which is the input of the mapping network for \acp{stylegan}. In the discriminator, the embedding space of conditional labels are applied only to the last neural network block for training. A recent study from \citeauthor{ijcai2022_dobler} introduced multi-conditional \acp{stylegan} in art related datasets in which generators can produce art images with different characters, \eg emotion, genre, and style.
Most works only handle \emph{categorical} features instead of continuous or mixed-type labels and usually ignore dependencies between input labels.
%are \emph{categorical} and do not necessarily capture dependency between labels.
%Furthermore, most conditional \acp{gan} do not handle continuous or mixed-type labels, as discussed by \citeauthor{continuous_ding2021ccgan} and provided ccGAN (Continuous conditional \ac{gan}).
One alternative is ccGANs \citep{continuous_ding2021ccgan, improved_ccgan}; however, ccGANs only tackle univariate continuous labels, such as angles and ages.
%However, the proposed solution tackled single continuous labels.
Our proposed method targets multiple continuous conditional labels and the problem of missing covariates in a database.

%To consider the dependency of labels, causal inference is introduced in the \ac{GAN} architectures by \cite{kocaoglu2018causalgan}. Besides receiving the latent noise $z$, the generator receives also labels from a causal graph, which contains the dependent information between labels. The discriminator, however, does not receive any information from labels but distinguishes fake data from real data. Additionally, a labeler and an anti-labeler are proposed within CausalGAN to estimate the labels of images from the dataset and the generator. \cite{structural_counterfactuals} does not apply on \acp{GAN}, but utilized causal models to capture the relationships between biological information and brain structures, and produced counterfactual brain \acp{MRI}. 
\paragraph{Multi-Source/Multi-Domain Image Generation}
There have been several prior works on integrating multiple sources into a joint generative model but with a different focus from our work.
Most prominent are works on image-to-image translation; e.g., CycleGAN \cite{cyclegan} is trained with cycle-consistency loss using unpaired images to translate images between two domains.
DVG-Face \cite{fu2021dvg} focuses on generating dual heterogeneous paired face images to preserve identity.
%However, these unconditional generative model generates new samples from noises.
\citet{KANG_structurepreserve} proposed a method that translates medical images across domains while preserving structural information during translation.
This method requires an image from the target domain and the structure and texture features from the source domain as input.
Furthermore, StarGAN v2 \cite{choi2020starganv2} translates images between domains with a single generator and takes an image and a style code as input to increase the diversity of translated images in the target domain.
MPG \cite{multiattr_pizza_gen} proposed a multi-attribute version of StyleGAN2 to generate pizza images with specified ingredients and views.
In this approach, a pre-trained view attribute regressor is used to impute the missing values of labels.
Our work differs from these approaches, as our proposed method trains on two datasets simultaneously and uses latent space models to impute missing labels.

In another related research problem, image harmonization techniques \cite{bashyam2022_harmonization, styletransfer_harmonization, harmonizing_flow} address domain shift, which is caused, e.g., by different MRI scanners or data collection setups \cite{styletransfer_harmonization}. Here, techniques take images as input and transfer them to different domains, but they are not able to control the image characteristics, such as ventricle volumes.

% Most GAN architectures focus on generating high quality images \cite{brock2018large, Karras2019stylegan2, Karras2021stylegan3, pmlr-v97-zhang19d_sagan}. However, GANs can be also used to sample the joint distributions. DeLiGAN utilizes the gaussian mixture model for the latent space and sample the corresponding data from the selected latent space by approximating posteriors \cite{deligan}. \cite{jointgan, Dumoulin_Belghazi_ali_2017} acquire the ability to draw sample from conditional distributions across different domains. Nevertheless, the input is an image, but not a latent noise. Furthermore, these previous work focus mostly on how to approximate the distribution of different domains from latent spaces, but not the controllability of conditional covariates from different source. Therefore, current work is not able to control a single characteristic of mixture images with a single conditional covariate, which is a low dimentional value. 
% Our proposed method attempts to tackle this problem, where GANs can still control the missing covariates in the multi-source domain.

\vspace{-2mm}
\section{Multi-Source StyleGAN}
% \paragraph{Proposed Model Architecture}
\paragraph{Pseudo Code for the training paradigm}
Here, we provide the pseudo-code (\algorithmref{alg:mssg}) for training the proposed multi-source GAN method, which is described in \sectionref{subsec:train_paradigm}. For simplicity, we will focus on the case of two sources, $D^1$ and $D^2$. The required latent space models $f^1$ and $f^2$ for the GAN training paradigm are hand-designed parametric models as described in \sectionref{subsec:latent_models}; they are integrated into the training script.

\begin{algorithm2e}
\caption{Training a Multi-Source GAN}
\label{alg:mssg}
\SetKwInOut{KwRequire}{Require}
\DontPrintSemicolon
\LinesNumbered
\KwRequire{$f^1, f^2, G, D$}
\KwIn{$D^1 = \left\{ (x_i^1, c_i^1) \right\}_i , D^2 = \left\{ (x_i^2, c_i^2) \right\}_i$}
% \KwOut{$y$, the net activation}
Initialize Generator $G$ and Discriminator $D$ with random weights \\
Set number of steps $S$ and batch size $B$ \\
Set learning rates $\eta_G$ and $\eta_D$ \\

\For{$ \text{step} \gets 1$ to $S$}{
    Sample batches of real images $\{ x_1^1, \ldots, x_B^1 \}, \{ x_1^2, \ldots, x_B^2 \}$ from datasets $D^1$ and $D^2$ \\
    Sample batches of real labels $\{ c_1^1, \ldots, c_B^1 \}, \{ c_1^2, \ldots, c_B^2 \}$ from datasets $D^1$ and $D^2$ \\
    Estimate the hidden variables $\hat{c}^1_\text{hidd} = f^2(c^1_\text{shar}), \hat{c}^2_\text{hidd} = f^1(c^2_\text{shar})$ by using latent space models $f^1$ and $f^2$ \\
    Sample batches of noise vectors $\{z_1^1, \ldots, z_B^1\}, \{z_1^2, \ldots, z_B^2\}$ from the noise distribution \\
    
    Generate fake images $\{\text{Img}_1^1 = G(z_1^1, c_{1, \text{shar}}^1, c_{1, \text{uniq}}^1, \hat{c}_{1, \text{hidd}}^1), \ldots, \text{Img}_B^1 = G(z_B^1, c_{B, \text{shar}}^1, c_{B, \text{uniq}}^1, \hat{c}_{B, \text{hidd}}^1)\}$ using Generator $G$ for $D^1$\\
    
    Generate fake images $\{\text{Img}_1^2 = G(z_1^2, c_{1, \text{shar}}^2, \hat{c}_{1, \text{hidd}}^2, c_{1, \text{uniq}}^2), \ldots, \text{Img}_B^2 = G(z_B^2, c_{B, \text{shar}}^2, \hat{c}_{B, \text{hidd}}^2, c_{B, \text{uniq}}^2)\}$ using Generator $G$ for $D^2$\\

    Concatenate images $\{\text{Img}_1^1, \ldots, \text{Img}_B^1, \text{Img}_1^2, \ldots, \text{Img}_B^2\}$ and labels $\{\hat{c}_1^1, \ldots, \hat{c}_B^1, \hat{c}_1^2, \ldots, \hat{c}_B^2\}$ from two sources \\

    
    Use data from the specific loss of each covariate to update the Discriminator \\
        % $\theta_D \gets \theta_D - \eta_D \cdot \nabla_{\theta_D} \left[ \frac{1}{2B} \sum_{i=1}^{2B} \left( \log(D(x_i, c_i)) + \log(1 - D(\text{Img}_i, \hat{c}_i) \right) \right]$ \\
    Use data from the specific loss of each covariate (only with generated images) to update the Generator \\
        % \State $\theta_G \gets \theta_G - \eta_G \cdot \nabla_{\theta_G} \left[ \frac{1}{2B} \sum_{i=1}^{2B} \log(1 - D(\text{Img}_i, \hat{c}_i)) \right]$ \\
}

\end{algorithm2e}

\paragraph{Generalization to more than 2 data sources}
Our proposed multi-source GAN can be generalized to $\ge 3$ data sources.
Depending on which latent variables are available in which data source, this leads to modeling choices within the latent space models. For example, if a variable is available in sources 1 and 2 but not in source 3, we have to decide if we want to sample it from a model derived from source 1 or source 2, or perhaps alternate between these two. Therefore, this will be a more application-specific question.
In this paper, we mainly focus on dealing with two sources.


\section{Implementation Details}
\paragraph{Training details} We implemented \ac{mssg}, using the StyleGAN3 source code \cite{Karras2021stylegan3}, keeping many of the default parameters. The generator in \ac{mssg} is based on the StyleGAN3 generator, employing a latent noise vector $z \in \mathbb{R}^{512}$ and a conditional latent vector integrated with our joint conditional vector. The mapping network of the generator consists of two fully connected layers, producing another latent vector $w \in \mathbb{R}^{512}$. The discriminator, a StyleGAN2 Discriminator, is set to default configurations. Both the generator and discriminator are trained using Adam optimizers. Models were trained with ADA data augmentation to prevent overfitting. X-Flip was applied to the retinal data sets, as the data sets are small. We adapted the conditional loss function from StyleGAN3, modifying the discriminator's output to predict correct labels and disabling the mapping network in the discriminator. In preliminary experiments, we found that weighing the fake/real loss and the covariate prediction loss equally led to considerably decreased image quality. Instead, we multiply the covariate losses by a scaling parameter $\lambda$, set to 0.1 throughout our real-world experiments. For the first few iterations of training, we grow $\lambda$ exponentially from 0 to its target value. This helps emphasize image quality at the start, reaching a maximum of $0.9$ in semi-multisource cases and $0.1$ in the true multi-source case. %This regularization parameter serves as a tunable hyperparameter for users.

With the extension to the real multi-source data sets, we can express also the property of sources as an extra covariate $c^j_\text{source}$ in our conditional vector $c^j$. This allows us to explicitly sample from either of the data sources $c^j_\text{source}$ instead of it being expressed by the latent noise vector $z$. 

We trained our models on 2 A100 GPUs until the convergence of the FID score on a validation set. The duration of training is data-set dependent. For MorphoMNIST, the convergence of the FID score took $8192$ steps for the proposed ``half'' model and $22937$ steps for the proposed ``full'' model, as the resolution of images is $32 \times 32$, and the baseline models took $19661$ and $18022$ steps respectively. For MRI and retinal fundus image experiments, we used a resolution of $128 \times 128$. The convergence of the training took $13107$ steps for the proposed ``half'' model and $18022$ for the proposed ``full'' model on the MRI experiments. The baseline models on single-source data sets took $29286$ and $19661$ steps to reach convergence.
In the retinal fundus image experiments, the proposed ``half'' model took $57384$ steps to converge and the proposed ``full'' model took $47937$ steps to converge. The baseline single-source models took $39601$ and $30719$ steps to converge.

\paragraph{Evaluation details}
\label{sec:eval_regression}
Quantitative evaluation consists of image quality assessment and controllability analysis. Image quality is evaluated using \ac{fid} and \ac{kid}, while controllability is measured through the proposed strata prediction score. In this evaluation, test sets are stratified for each covariate, divided into $m = 3$ marginal bins. Each marginal bin contains $33\%$ of the total samples. With three covariates in our experiments, this results in $3^3 = 27$ subsets. Regression models, specifically ResNet50s, are employed in each stratum to predict covariates from both test sets and generated images. A total of $15{,}000$ images are generated for each stratum and the score is computed as the weighted (by stratum size) average of the Pearson correlation coefficients of predicted outputs from generated and test set images.

For MorphoMNIST, the ResNet50 regression models yield high performance, with Pearson correlation coefficients of $0.978$ for \texttt{thickness}, $0.996$ for \texttt{intensity}, and $0.999$ for \texttt{slant}.

On the real-world \ac{mri} data from the \ac{ukb} cohort, ResNet50 regression models achieve correlations of $0.78$ for age, $0.97$ for ventricle volumes, and $0.87$ for grey matter volumes. In the \ac{adni} cohort, ResNet50 models for age, left hippocampus, and right hippocampus achieve correlations of $0.964$, $0.980$, and $0.984$ respectively.

For retinal fundus images in the \ac{ukb} cohort, ResNet50 models exhibit correlations of $0.863$ for age and $0.910$ for spherical power. The binary covariate cataract achieves an accuracy of $0.70$ and a Pearson correlation coefficient of $0.338$ (we use Pearson correlation to stay consistent with the other measures).

Prediction models were trained using ResNet18, ResNet50, and ResNet100, and the model with the best performance was selected for our metric.

\section{Parametric Models}
\label{sec:latent_space_modeling}
In the example of training two sources, our proposed method, \ac{mssg}, requires latent models $f^2$ to approximate the hidden covariates $c^j_\text{hidd}$ for the first source $D^1$, and vice versa. In this work, we designed latent space models by using parametric models with \ac{mle}. That means that, e.g., in the case of a linear regression model, we also need to estimate the standard deviation of the noise term in addition to the weights and bias.
For example, assume that the shared variable is \texttt{age}, the unique variable in source 1 is \texttt{blood\_pressure}, and the unique variable in source 2 is \texttt{sex}.
We may then fit a linear regression model $\texttt{blood\_pressure} = f^1(\texttt{age}) = \alpha \texttt{age} + \beta + \epsilon$ with $\epsilon \sim \mathcal{N}(0, \sigma^2)$ on data source 1 to estimate $\alpha$, $\beta$, and $\sigma$.
At the same time, we fit a logistic regression to model $\mathbb{E}[\mathtt{sex}] = \mathtt{sigmoid}(\gamma \mathtt{age} + \delta)$  on data source 2 for $\gamma$ and $\delta$ (i.e., $f^2(\texttt{age})$ is 1 with probability $\mathtt{sigmoid}(\gamma \mathtt{age} + \delta)$ and 0 otherwise).
Given these models, for an instance from source 1 we can then e.g. randomly sample from $p(\mathtt{sex} | \mathtt{age})$ given the logistic regression model $f^2(\texttt{age})$.

\subsection{Synthetic Data}
\paragraph{Morphological digits}\label{app:morpho_models} We synthesize the MNIST dataset with the tool from \citet{castro2019morphomnist}, randomly split it into two data sources, and train on them with our proposed method. The first data source has access to \texttt{thickness}, \texttt{intensity}, and \texttt{digit}. The second data source has access to \texttt{thickness}, \texttt{slant}, and \texttt{digit}. We modified the data generation process from \citet{Pawlowski2020} to generate synthetic morphological digits. Thickness is sampled from a gamma distribution $\Gamma$. Given the thickness, intensity and slant are sampled as follows:
\begin{align*}
    \text{thickness} & = 0.5 + \epsilon_t, \quad &  \epsilon_t \sim \Gamma(10, 5) \\
    \text{intensity} & = 191 \cdot \sigma(0.5 \cdot \epsilon_I + 2 \cdot \text{thickness} - 5) + 64, \quad & \epsilon_I \sim \mathcal{N}(0, 1) \\
    \text{slant} & = 56 \cdot \tanh(0.3 \cdot \epsilon_S + \text{thickness} - 2.5), \quad & \epsilon_S \sim \mathcal{N}(0, 1)
    %\\
    %\text{Img} & = \text{SetSlant}(\text{SetIntensity}(\text{SetThickness}(\epsilon_X ; \text{text}); \text{intensity}); \text{slant}), \quad & \epsilon_X \sim \text{MNIST}
    \label{equ:morpho_generation}
\end{align*}
$\sigma(\cdot)$ is the sigmoid function. The intensity therefore lies in the range $[64, 255]$ and the digits rotate in the range $[-56, 56]$.

To estimate the covariates in our latent model, for thickness, we use a beta distribution $\texttt{thickness} \sim \beta(4.13, 9.84)$. Intensity and slant are estimated by conditioning on thickness, $p(\texttt{intensity} | \texttt{thickness})$ and $p(\texttt{slant} | \texttt{thickness})$. We fitted the known covariates into a non-linear least square function with the sigmoid function to optimize the parameters. \figureref{fig:latent_morpho} shows that our latent models can approximate the actual label distributions well in both sources. 

\begin{figure}[ht]
\floatconts
    {fig:latent_morpho}%
    {\caption{Samples from latent models fit the ground-truth MorphoMNIST distributions. The blue legend shows samples from the ground truth and the orange dots show samples from latent models. 
    %Overlapped parts become grey. %This shows how well the distributions of our latent models fit the ground truth.
    }}
    {%
        \subfigure[Latent model $f^1$ samples intensity based on the sampled thickness.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/latent_models/morpho_latent_s1.pdf}
            \label{fig:latentmorpho_s1}%
        }
        \subfigure[Latent model $f^2$ samples slant based on the sampled thickness.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/latent_models/morpho_latent_s2.pdf}
            \label{fig:latentmorpho_s2}%
        }
    }
\end{figure}

\subsection{Real-World Data}
\paragraph{Brain MRI}
%ADNI for the true multi-source setting. However, the Uk Biobank was also used in our experiment for the synthetic multi-source setting. 
For model training, covariates are fitted into separate latent space models. We show here also how latent models $f^j$ approximate the covariates $c^j$. For the UK Biobank, \figureref{fig:latent_ukb_mri} shows that ventricle volumes and grey matter volumes are estimated well by the latent model with the given ages. \texttt{age} is rescaled to the interval $[0, 1]$ and modeled by a beta distribution, while \texttt{ventricle} and \texttt{grey matter volumes}, conditioned on \texttt{age}, can be sampled together by a Gaussian mixture regression model with $10$ Gaussian components. %Conditioned on the age, for the estimation of hidden covariates, i.e., ventricle and grey matter, in both sources. $p(c^1_\text{ventricle} | c^1_\text{age}=\xi)$ and $p(c^2_\text{grey matter} | c^2_\text{age}=\xi)$. 

\begin{figure}[htbp]
    \floatconts
    {fig:latent_mri}% label
    {\caption{Latent models sample the distributions of the \ac{ukb} and \ac{adni} cohorts in the MRI experiments.
    }}
    {
        \subfigure[The latent model samples ventricle volumes and grey matter volumes by conditioning on the given ages. Ventricle volumes increase with higher age, and grey matter volumes shrink with higher age.][b]{
        \includegraphics[width=.48\textwidth]{figures/appendix/latent_models/ukb_compare_pair.pdf}
        \label{fig:latent_ukb_mri}
        }
        \subfigure[Left and right hippocampus volumes conditioned on the given ages. The hippocampus volumes shrink slightly with higher age.][b]{
            \includegraphics[width=.48\textwidth]{figures/appendix/latent_models/compare_samples_hippo.pdf}
            \label{fig:latent_adni_mri}
        }
    }
\end{figure}

In the ADNI data set, we have three covariates, \texttt{age}, \texttt{left hippocampus}, and \texttt{right hippocampus}. Another 8-component GMM regression is applied to learn to sample the left hippocampus and right hippocampus. Age is again modeled by a beta distribution (with independent parameters) and the left and right hippocampus are predicted by age.
%$p(c^2_\text{left hippocampus}, c^2_\text{right hippocampus} | c^2_\text{age}=\xi)$.
\figureref{fig:latent_adni_mri} depicts the fit of the trained latent model for the left and right hippocampus, conditioned on the age.

\begin{figure}[htbp]
    \floatconts
    {fig:latent_mri_retinal}% label
    {\caption{Latent models in the MRI and retinal fundus images experiments.}}
    {
        \subfigure[Latent model trained on the \ac{adni} cohort samples hippocampal volumes by conditioning on two different age distributions from the \ac{ukb} and \ac{adni} cohorts.][b]{
            \includegraphics[width=.48\textwidth]{figures/appendix/latent_models/ukb_adni_adnisampler.pdf}
            \label{fig:latent_adnisampler}
        }
        \subfigure[Cataract and spherical power are sampled from the trained latent model conditioned on age.][b]{
            \includegraphics[width=.48\textwidth]{figures/appendix/latent_models/compare_samples_ukb_retinal.pdf}
            \label{fig:latent_ukb_retina}
        }
    }
\end{figure}

In \figureref{fig:latent_adnisampler} we illustrate how sampling from either of the two latent models leads to varying latent space distributions. %provides an illustration of utilizing latent models in the presence of distribution shifts. 
Given that the \ac{ukb} cohort exhibits a younger age distribution, the latent space model trained on the \ac{adni} dataset shows a distribution shift when sampling hippocampal volumes based on the age distribution of the \ac{ukb}.

\paragraph{Retinal fundus images}
%the EyePACS diabetic retinopathy data set for the retinal experiments. 
The shared covariate \texttt{age} was modeled as a rescaled beta distribution, with minimum and maximum values set based on observed data. The binary variable \texttt{cataract} is incorporated into a logistic regression model conditioned on \texttt{age}. \texttt{Spherical power} is modeled using a Gaussian Mixture Model regression with 13 Gaussian components, also conditioned on \texttt{age}.
\figureref{fig:latent_ukb_retina} shows the distributions of the ground truth labels and samples from the latent model. %There is no strong correlation between the covariates.
% The EyePACS data set contains retinal diabetic images with severity ratings from level $0$ to $5$. The higher, the more serious the disease is. As we only have one covariate in this data set, we simply used the prior to estimate its disease distribution $c_\text{level} \sim p(\text{level})$.

\section{Additional Experimental Results}
\label{sec:ext_eval}
In this section, we show additional training information and \ac{kid} scores in \tableref{tab:app_semi_ms_kid} for semi-multisource scenarios and \tableref{tab:brain_ms_kid} for the real-world multisource MRI. Furthermore, the evaluations of pairwise metrics for semi-multisource scenarios, \ie \ac{lpips}, \ac{ssim}, and \ac{psnr}, are depicted in \tableref{tab:app_semi_ms_metrics}. Nevertheless, it is not possible to evaluate the real-world multisource MRI with pairwise metrics, since the ground truth covariates do not necessarily exist.

\subsection{Validation in synthetic data}
\paragraph{MorphoMNIST} The experimental results of \ac{kid} in \tableref{tab:kid_morpho} differ slightly from \ac{fid} scores reported in \sectionref{sec:morpho_eval}. However, by boosting the data size, our proposed model reduces the \ac{kid} score and reaches comparable results. The proposed method can control three covariates, whereas single-source models can only control the existing covariates in data sets. Furthermore, \tableref{tab:metrics_morpho} shows the results of pairwise metrics, \ie LPIPS, SSIM, and PSNR. In these evaluations, we compared generated with real images from the test set, given the same covariates. The proposed method outperforms the single-source StyleGAN3 models. By doubling the data size, our proposed method performs slightly better in the evaluations of SSIM and PSNR.

\begin{table}[h]
    \floatconts
    {tab:app_semi_ms_kid}
    {\caption{\ac{kid} $\downarrow$ and additional information in three different semi-multisource scenarios. ``GM'' denotes grey matter volumes, ``SP'' denotes spherical powers.}}%
    {%
    %\vspace{-8mm}
    \subtable[MorphoMNIST][h]{
        \label{tab:kid_morpho}
        \begin{tabular*}{\textwidth}{lccc}
            \toprule
            \textbf{Methods} & \ac{kid} $\downarrow$ mean (std) & Training samples & Covariates\\
            \midrule
            Source 1 (baseline) & 0.00121 (0.00238) & 24000 & Thickness, Intensity \\
            Source 2 (baseline) & 0.00087 (0.00208) & 24000 & Thickness, Slant \\
            Proposed (half) & 0.00128 (0.00225) & 24000 & Thickness, Intensity, Slant\\
            Proposed (full) & 0.00088 (0.00278) & 48000 & Thickness, Intensity, Slant \\
            \bottomrule
        \end{tabular*}
    }
    \subtable[MRI: \ac{ukb}][h]{
        \label{tab:kid_mri}
        \begin{tabular*}{\textwidth}{lccc}
            \toprule
            \textbf{Methods} & \ac{kid} $\downarrow$ mean (std) & Training samples & Covariates\\
            \midrule
            Source 1 (baseline) & 0.02681 (0.00418) & 13414 & Age, Ventricle \\
            Source 2 (baseline) & 0.00818 (0.00231) & 13408 & Age, GM \\
            Proposed (half) & 0.01632 (0.00260)  & 13411 & Age, Ventricle, GM \\
            Proposed (full) & 0.00932 (0.00230)& 26822 & Age, Ventricle, GM \\
            \bottomrule
        \end{tabular*}
    }
    
    \subtable[Retina: \ac{ukb}][h]{
        \label{tab:kid_retinal}
        \centering
        \begin{tabular*}{\textwidth}{lccc}
            \toprule
            \textbf{Methods} & \ac{kid} $\downarrow$ mean (std) & Training samples & Covariates\\
            \midrule
            Source 1 (baseline) & 0.02029 (0.00400)  & 1922 & Age, Cataract\\
            Source 2 (baseline) & 0.00811 (0.00182) & 1918 & Age, SP\\
            Proposed (half) & 0.00926 (0.00201)  & 1920 & Age, Cataract, SP\\
            Proposed (full) & 0.00333 (0.00124) & 3840 & Age, Cataract, SP \\
            \bottomrule
        \end{tabular*}
    }
    }
\end{table}

\begin{table}[h]
    \floatconts
    {tab:app_semi_ms_metrics}
    {\caption{Pairwise metrics: LPIPS $\downarrow$, SSIM $\uparrow$, and PSNR $\uparrow$ in three different semi-multisource scenarios.}}%
    {%
    %\vspace{-8mm}
    \subtable[MorphoMNIST][h]{
        \label{tab:metrics_morpho}
        \begin{tabular*}{\textwidth}{lccc}
            \toprule
            \textbf{Methods} & LPIPS $\downarrow$ mean (std) & SSIM $\uparrow$ mean (std) & PSNR $\uparrow$ mean (std) \\
            \midrule
            Source 1 (baseline) & 0.093 (0.004) & 0.345 (0.013) & 34.7 (0.143) \\
            Source 2 (baseline) & 0.122 (0.004) & 0.326 (0.013) & 34.8 (0.107) \\
            Proposed (half) & 0.064 (0.004) & 0.468 (0.020) & 35.5 (0.121) \\
            Proposed (full) & 0.065 (0.003) & 0.471 (0.017) & 35.6 (0.146) \\
            \bottomrule
        \end{tabular*}
    }
    \subtable[MRI: \ac{ukb}][h]{
        \label{tab:metrics_mri}
        \begin{tabular*}{\textwidth}{lccc}
            \toprule
            \textbf{Methods} & LPIPS $\downarrow$ mean (std) & SSIM $\uparrow$ mean (std) & PSNR $\uparrow$ mean (std) \\
            \midrule
            Source 1 (baseline) & 0.102 (0.001) & 0.611 (0.003) & 31.5 (0.026) \\
            Source 2 (baseline) & 0.094 (0.001) & 0.588 (0.003) & 31.3 (0.021) \\
            Proposed (half) & 0.072 (0.001) & 0.617 (0.003) & 31.6 (0.035) \\
            Proposed (full) & 0.071 (0.001) & 0.654 (0.003) & 31.6 (0.030) \\
            \bottomrule
        \end{tabular*}
    }
    
    \subtable[Retina: \ac{ukb}][h]{
        \label{tab:metrics_retinal}
        \centering
        \begin{tabular*}{\textwidth}{lccc}
            \toprule
            \textbf{Methods} & LPIPS $\downarrow$ mean (std) & SSIM $\uparrow$ mean (std) & PSNR $\uparrow$ mean (std) \\
            \midrule
            Source 1 (baseline) & 0.347 (0.006) & 0.429 (0.008) & 28.8 (0.020) \\
            Source 2 (baseline) & 0.221 (0.007) & 0.541 (0.011) & 28.9 (0.030) \\
            Proposed (half) & 0.204 (0.012) & 0.572 (0.009) & 29.0 (0.031) \\
            Proposed (full) & 0.193 (0.008) & 0.581 (0.010) & 29.0 (0.043) \\
            \bottomrule
        \end{tabular*}
    }
    }
\end{table}


\subsection{Validation in real-world data sets}
\subsubsection{MRI}
\paragraph{Synthetic multi-source MRI} 
Similar to the experimental results for MorphoMNIST, we provide results of \ac{kid} in \tableref{tab:kid_mri}. The tendency of the results is similar to those of \ac{fid} reported in \sectionref{sec:ukb_brain}. However, the source 2 model performs better than our proposed ``full'' model. The proposed ``half'' model performs comparably to the average of the source 1 and source 2 models. \tableref{tab:metrics_mri} depicts the results of pairwise metrics. Besides the evaluation of PSNR, in which the results are similar between the proposed method and the baselines, the multi-source StyleGAN3 outperforms the single-source baselines on the evaluations of LPIPS and SSIM.

\paragraph{True multi-source MRI}
\tableref{tab:brain_ms_kid} again shows the \ac{kid}.
%Here, we report again the \ac{kid} score in \tableref{tab:brain_ms_kid}, as a second metric in additionally to \ac{fid} in \tableref{tab:brain_ms_fid}.
Interestingly, our proposed \ac{mssg} outperforms the specialized single-source \ac{ukb} model on the \ac{ukb} test set. Furthermore, on the joint test sets (\ac{ukb} and \ac{adni}) our model still reaches the lowest \ac{kid} score compared to the single-source models. \tableref{tab:brain_ms_regr} depicts the results of the controllability of models. 

\begin{table}[th]
    \floatconts
        {tab:brain_ms_kid}
        {\caption{KID mean (std) $\downarrow$ for each method in corresponding test sets in true multi-source MRI setting}}%
        {%
            %\vspace{-3mm}
            \begin{tabular}{lccc}

            \toprule
            \ac{kid} $\downarrow$ mean (std) & & \textbf{Methods} & \\
            \textbf{Data sets} & Source \ac{ukb} & Source \ac{adni} & Multi-Source\\
            \midrule
            \textbf{UKB} & 0.00771 (0.00247) & 0.08759 (0.00804) & 0.00660 (0.00167)\\
            \textbf{ADNI} & 0.08897 (0.00956) & 0.01353 (0.00355) & 0.02674 (0.00438)\\
            \textbf{Joint} & 0.02848 (0.00698) & 0.02182 (0.00525) & 0.01023 (0.00344) \\
            \bottomrule
            \end{tabular}
        }
\end{table}

\begin{table}[th]
    \floatconts
        {tab:brain_ms_regr}
        {\caption{CS: Correlation score $\uparrow$ on corresponding covariates in true multi-source setting. Vntr: Ventricle, GM: Grey Matter, LH: Left Hippocampus, RH: Right Hippocampus }}%
        {
            \begin{tabular}{lccc}
            \toprule
            CS $\uparrow$ & & \textbf{Methods} & \\
            \textbf{Covariates} & Source \ac{ukb} & Source \ac{adni}  & Multi-Source\\
            \midrule
            Age (UKB) & 0.704 & -0.163 & 0.509 \\
            Age (ADNI) & 0.264 & 0.402 & 0.532 \\ \hline
            Vntr & 0.958 & -0.601 & 0.879 \\
            GM & 0.827 & -0.189 & 0.755\\
            LH & -0.316 & 0.865 & 0.696 \\
            RH & -0.486 & 0.886 & 0.845 \\
            \bottomrule
            \end{tabular}
        }
\end{table}
%\begin{table}[th]
%\vspace{-7mm}
% \begin{wraptable}{r}{9cm}%[th]
%     \floatconts
%         {tab:brain_ms_regr}
%         {\vspace{-14mm}\caption{CS: Correlation score $\uparrow$ on corresponding covariates in true multi-source setting. Vntr: Ventricle, GM: Grey Matter, LH: Left Hippocampus, RH: Right Hippocampus }}%
%         {\footnotesize%
%             \vspace{-4mm}
%             \begin{tabular}{lccc}
%             \toprule
%             CS $\uparrow$ & & \textbf{Methods} & \\
%             \textbf{Covariates} & Source \ac{ukb} & Source \ac{adni}  & Multi-Source\\
%             \midrule
%             Age (UKB) & 0.704 & -0.163 & 0.509 \\
%             Age (ADNI) & 0.264 & 0.402 & 0.532 \\ \hline
%             Vntr & 0.958 & -0.601 & 0.879 \\
%             GM & 0.827 & -0.189 & 0.755\\
%             LH & -0.316 & 0.865 & 0.696 \\
%             RH & -0.486 & 0.886 & 0.845 \\
%             \bottomrule
%             \end{tabular}
%         }
% \end{wraptable}

\subsubsection{Retinal fundus images}
\paragraph{Synthetic multi-source Retina}
\tableref{tab:kid_retinal} illustrates the \ac{kid} scores of the models alongside the number of training samples. Similar to the trends observed in \tableref{tab:semi_ms_fid} for \ac{fid} scores, the \ac{kid} scores demonstrate a consistent pattern. The proposed ``half'' model achieves a comparable score to the source 2 model and outperforms the average of the source 1 and source 2 models. Upon increasing data samples, the ``full'' model significantly reduces the \ac{kid} score. Additionally, despite being trained in a low-data regime, all models, besides the source 1 model, demonstrate the capability to generate high-quality images. \tableref{tab:metrics_retinal} shows results similar to those of the MRI use case. Nevertheless, the results are generally worse than those on the MRI data. This could be caused by the availability of fewer data samples ($<2000$ for each source). In general, our proposed multi-source models reach lower LPIPS scores and higher SSIM scores, compared with the single-source models. Nevertheless, the PSNR scores are comparable between multi-source and single-source models.
% \textcolor{red}{maybe need to add more details about the datasets} In the retinal application, we use \acf{rfmid} \cite{rfmid} with three further covariates as second data source. \ac{rfmid} is a multi-disease data set with 46 conditional annotations and is a multi-classification problem. We selected the first three most common conditional diseases and they are not directly overlapped with the first data source, namely \texttt{disease risk}, \texttt{\ac{mh}} and \texttt{\ac{tsln}}.

% %Similarly, in \figureref{fig:ms_retinal_dual}, despite the widely diverging data acquisition and patient characteristics, MSSG is able to control both the disease progression label from the EyePACS data set, as well as the \texttt{\ac{DBP}}. 
% \begin{table}[th]
%     \floatconts
%         {tab:retinal_ms_kid}
%         {\caption{KID $\downarrow$ for each method in corresponding test sets}}%
%         {%
%             %\vspace{-3mm}
%             \begin{tabular}{lccc}
%             \toprule
%             \textbf{Method} & Source \ac{ukb} (baseline) & Source \ac{rfmid} (baseline) & Multi-Source\\
%             \midrule
%             \textbf{UKB} &  &  & 0.06229 (0.00332) \\
%             \textbf{RFMID} & &  & 0.06096 (0.00483) \\
%             \bottomrule
%             \end{tabular}
%         }
% \end{table}


\section{Visualizing generated images from \acp{mssg}}
Here, we further explore the multi-factor manipulation capabilities visually.
%The novelty of \ac{MSSG} is that it allows us to use characteristics from different sources when we want to generate a mixture of images. In order to show that the covariates $c^1_{\text{uniq}}, c^2_{\text{uniq}}$ can be manipulated simultaneously, we set here the common $c^j_{\text{shar}}$ to a fixed number. 
\subsection{Synthetic Data: MorphoMNIST}
\paragraph{Synthetic multi-source morphological digits}
\begin{figure}[t]
%\vspace{-14mm}
\floatconts
    {fig:app_morpho_ms}%
    {\caption{Morphological digits generated using the proposed ``full'' MSSG.}}
    {%
        \subfigure[Varying \texttt{intensity} and \texttt{slant} with fixed \texttt{thickness} (2.99). Intensity and slant vary.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/thickness_2.992252.pdf}
            \label{fig:app_thickness_constant}%
        }
        \subfigure[Varying \texttt{thickness} and \texttt{slant} with fixed \texttt{intensity} (183.289).][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/intensity_183.289672.pdf}
            \label{fig:app_intensity_constant}%
        }
    }
\end{figure}

In this section, we present additional examples generated by the multi-source GAN. These figures were produced through the interpolation of covariate values, ranging from the minimal to the maximal values, specifically within a $30\%$ range. This approach ensures the exclusion of outliers with extreme covariate values. As depicted in \figureref{fig:app_morpho_ms} and \figureref{fig:app_slant_constant}, the multi-source GAN adeptly controls three continuous covariates---thickness, intensity, and slant.
In \figureref{fig:app_thickness_constant}, the shared covariate \texttt{thickness} is set to the mean value of the test set, which is $2.99$. Intensity and slant are modified, with images exhibiting an increase in intensity column-wise, resulting in a brighter appearance. Additionally, there is a rotation effect from left to right row-wise, ranging from $-21.3$ degrees to $50.5$ degrees. Conversely, \figureref{fig:app_intensity_constant} demonstrates various combinations of covariates (thickness and slant) with a fixed intensity. Here, the images become thicker column-wise, and there is a rotation effect from left to right within a single row. Finally, in \figureref{fig:app_slant_constant}, the slant (covariate from Source 2) is set to a constant value, while the generated images are controlled by thickness and intensity. In this scenario, the images become thicker in a column-wise manner while intensifying from left to right within each row.

\begin{figure}[t]
%\vspace{-14mm}
\floatconts
    {fig:app_morpho_mri_ms}%
    {\caption{Morphological digits and Coronal MRIs (two use cases) generated using the proposed ``full'' MSSG.}}
    {%
        \subfigure[Morphological digits were generated using the proposed ``full'' multi-source GAN. Here, the covariate \texttt{slant} is set to $14.612$ degrees, and the model controls morphological attributes, the shared covariate (thickness), and intensity from Source 1 of digits.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/slant_degrees_14.612888.pdf}
            \label{fig:app_slant_constant}%
        }
        \subfigure[Coronal MRIs were generated using the proposed ``full'' multi-source GAN. Another example on controlling ventricle and grey matter respectively when the age is fixed to $59$.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/mri_age_59.0_2.pdf}
            \label{fig:app_age_constant}%
        }
    }
\end{figure}

\subsection{Real-World Data: MRI}
\paragraph{Synthetic multi-source MRI} We split \ac{ukb} randomly into two data sources and trained them with our proposed method. Here, we show more generated examples from the multi-source GAN. We created the following figures by interpolating the values of covariates, starting from the minimal $30\%$ until the maximal $30\%$. 
\figureref{fig:app_age_constant} and \figureref{fig:app_mri_ms} show the multi-source model can control three covariates in parallel.
\figureref{fig:app_age_constant} shows the tendency that if ventricle volumes increase row-wise, the grey matter volumes decrease (the slits on the edge of the brain become wider). On the other hand, if the grey matter volumes increase column-wise, this causes the shrinkage of ventricle volumes. This is related to the anatomical nature of these two covariates in \figureref{fig:latent_ukb_mri}.
Ventricle volumes can be controlled monotonically well. In \figureref{fig:app_ventricle_constant}, the age and grey matter volumes are modified. The grey matter increases horizontally from left to right. However, the age-related changes are quite small. \figureref{fig:app_greymatter_constant} shows the change in brains when the age and ventricle volumes are manipulated. Again, the size of ventricle volumes increases monotonically.

% To ensure that our MSSG model follows the real anatomical pattern of MRI, \cref{fig:mri_ms_neg_pos} shows the images of extreme values in the ventricle and grey matter volumes and compares the real and generated images. MSSG can generate the MRI with growing grey matter (the first two rows go from left to right). A similar pattern can be observed when the large ventricle volumes are presented in the images.

\begin{figure}[h]
%\vspace{-14mm}
\floatconts
    {fig:app_mri_ms}%
    {\caption{Coronal \ac{mri} were generated using our ``full'' multi-source GAN.}}
    {%
        \subfigure[\texttt{Ventricle} is set to a constant value ($65876.6$), \ac{mssg} generates images by controlling covariates (age and grey matter volumes).][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/ventricle_65876.6.pdf}
            \label{fig:app_ventricle_constant}%
        }
        \subfigure[\texttt{Grey matter} is set to a constant ($822924.9$), \ac{mssg} generates images by controlling covariates (age and ventricle volumes).][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/grey matter_822924.9.pdf}
            \label{fig:app_greymatter_constant}%
        }
    }
\end{figure}
\clearpage
\paragraph{True multi-source MRI}
\label{sec:visual_true_mri}
We also implemented our experiments on the true multi-source data sets, \ie \ac{ukb} and \ac{adni}. 
\figureref{fig:mri_age1_rh} demonstrates control over a younger age distribution from \ac{ukb} while increasing volumes of the right hippocampus column-wise. Additionally, in \figureref{fig:mri_age2_gm}, the model regulates an older age distribution from \ac{adni}, increasing grey matter volumes column-wise, with noticeable changes in brain volumes. \figureref{fig:app_real_ms_mri} shows additionally that the proposed \ac{mssg} can not only control covariates, \ie ventricle volumes and left hippocampus, but also modify data sources. \figureref{fig:app_real_ms_mri_2} gives two further examples that \ac{mssg} controls various covariates across two data sources and generates reasonable images.

\begin{figure}[ht]
\floatconts
    {fig:real_ms_mri}%
    {\caption{Generated \ac{mri} from the proposed \ac{mssg} trained on the \ac{ukb} and \ac{adni}.}}
    {%
        \subfigure[\ac{mssg} controls the younger age distribution from \ac{ukb} row-wise. Meanwhile, the right hippocampus is modified column-wise.][b]{%
            \includegraphics[width=.48\textwidth]{figures/main/mri_real/mri_real_age1_rh_ukb.pdf}
            \label{fig:mri_age1_rh}%
        }
        \subfigure[\ac{mssg} regulates the older age distribution from \ac{adni} row-wise and increases grey matter volumes column-wise.][b]{%
            \includegraphics[width=.48\textwidth]{figures/main/mri_real/mri_real_age2_gm_adni.pdf}
            \label{fig:mri_age2_gm}%
        }
    }
\end{figure}

\begin{figure}[ht]
\floatconts
    {fig:app_real_ms_mri}%
    {\caption{Generated \ac{mri} from the proposed \ac{mssg} in the true multi-source scenario by controlling ventricle volumes and left hippocampus volumes.}}
    {%
        \subfigure[\ac{mssg} generates images with specific ventricle sizes and left hippocampus sizes in the style of  the \ac{ukb} cohort.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/mri_real/mri_real_ven_lh_ukb.pdf}
            \label{fig:mri_ven_lh_ukb}%
        }
        \subfigure[\ac{mssg} regulates ventricle volumes and left hippocampus volumes and generates images in the style of the \ac{adni} cohort.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/mri_real/mri_real_ven_lh_adni.pdf}
            \label{fig:mri_ven_lh_adni}%
        }
    }
\end{figure}

\begin{figure}[ht]
\floatconts
    {fig:app_real_ms_mri_2}%
    {\caption{Generated \ac{mri} from the proposed \ac{mssg} in the true multi-source scenario by controlling various covariates.}}
    {%
    \vspace{-1mm}
        \subfigure[\ac{mssg} generates images with specific ages and ventricle sizes in the style of the \ac{adni} cohort.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/mri_real/mri_real_age2_ven_adni.pdf}
            \label{fig:mri_age_ven_adni}%
        }
        \subfigure[\ac{mssg} generates images with different grey matter volumes and right hippocampal volumes in the style of the \ac{ukb} cohort.][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/mri_real/mri_real_gm_rh_ukb.pdf}
            \label{fig:mri_gm_rh_ukb}%
        }
    }
\end{figure}

\clearpage
\subsection{Real-World Data: Retinal Fundus Images}
\paragraph{Synthetic multi-source Retina}
%Similar to the MRI experiments, we split retinal fundus images from \ac{ukb} randomly into two data sources. Both sources contain the shared covariate $c^j_\text{shar}$ (age), source 1 has the unique covariate $c^1_\text{uniq}$ (cataract) and source 2 has the unique covariate $c^2_\text{uniq}$ (spherical power). 
The following images are also generated by interpolating two covariates from the minimal $30\%$ to the maximal $30\%$.
\figureref{fig:retinal_ms_cataract} shows retinal fundus images generated by controlling these three covariates independently. In \figureref{fig:ret_no_cataract} the variable cataract is set to $0$ (no cataract) while the age and spherical power are varied within the interval between the minimal $30\%$ and the maximal $30\%$. The brightness of images changes with the increasing spherical power. In \figureref{fig:ret_cataract} generated images with the cases of cataracts are generally more blurry and the vessels cannot easily be observed, while the colors of images turn more yellow and grey compared to non-cataract retinal images.
Furthermore, \figureref{fig:retinal_ms_age} presents two more examples of generated retinal images when the age is fixed to $59$ years. We can observe in \figureref{fig:ret_age} and \figureref{fig:ret_age_2} that generated images are diverse, even though they belong to the same class.
% \figureref{fig:retina_ms_cataract} describes the continuous changing of retinal images, when two covariates interactively change. The retinal images become sharper but also brighter around the optic disc, where the vessels are, when diastolic blood pressure increases. Diastolic blood pressure of higher than 90 is usually considered hypertensive. \cref{fig:retina_age_dbp_ms} shows that the images slightly change when the age gets older. When changing the diastolic blood pressure, it shows a similar tendency as \cref{fig:retina_dbp_spherical_ms}. \cref{fig:retina_age_spherical_ms} depicts also that retinal images have changed when the age got older and the spherical power increased.

\begin{figure}[ht]
\floatconts
    {fig:retinal_ms_cataract}%
    {\caption{Generated retinal fundus images by the ``full'' multi-source \ac{gan}. Here, the model controls ages and spherical powers.}}
    {%
        \subfigure[Generated images without cataracts][b]{%
            \includegraphics[width=.48\textwidth]{figures/main/cataract_0.0_1.pdf}
            \label{fig:ret_no_cataract}%
        }
        \subfigure[Generated cataract retinal images][b]{%
            \includegraphics[width=.48\textwidth]{figures/main/cataract_1.0_1.pdf}
            \label{fig:ret_cataract}%
        }
    }
\end{figure}
% \begin{wrapfigure}{r}{0.45\textwidth}
\begin{figure}[ht]
    \floatconts
    {fig:retinal_ms}% label
    {\vspace{-5mm}\caption{Retinal fundus images generated by the ``full'' multi-source \ac{gan}. The model controls cataract and spherical power, age is set to $59$. ``CAT.'' represents Cataract.}}
    {\vspace{-5mm}\includegraphics[width=.5\textwidth]{figures/main/retinal_age_59.2_fig3_new.pdf}}
\end{figure}
% \end{wrapfigure}
\begin{figure}[ht]
\floatconts
    {fig:retinal_ms_age}%
    {\caption{Two different seeds from the ``full'' multi-source \ac{gan} when generating retinal images. Here, the model controls cataracts and spherical powers. Generated images are diverse, even though they belong to the same class. ``CAT.'' represents Cataract.}}
    {%
        \subfigure[Generated retinal images with an age of $59$ years][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/retinal_age_59.2_app1_new.pdf}
            \label{fig:ret_age}%
        }
        \subfigure[Another example for generated retinal images with an age of $59$ years][b]{%
            \includegraphics[width=.48\textwidth]{figures/appendix/retinal_age_59.2_app2_new.pdf}
            \label{fig:ret_age_2}%
        }
    }
\end{figure}
\end{document}
