%\documentclass[anon]{midl} % Include author names
\documentclass[final]{midl}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{makecell}
\usepackage{multirow}
\usepackage{tikz}
\usepackage{scrextend}
\usepackage{algorithm2e}
%\usepackage{algpseudocode}

% \joao{<text>}: reviewer annotation; typesets <text> in red inside its own group.
% Relies on \textcolor, which is provided by the `color` package loaded further
% down in this preamble.
\newcommand{\joao}[1]{%
  {\textcolor{red}{ #1}}%
}
\usepackage{tabularray}
%\usepackage{colortbl}
\usepackage{color}
\usepackage{tabularray}
%\definecolor{Black}{rgb}{NaN,NaN,NaN}



% Ualgorithm: wrapper around algorithm2e's `algorithm` float that locally
% overrides two algorithm2e internals before opening the float.
%   #1 (optional, default htpb): float placement, forwarded to `algorithm`.
% The overridden macros contain `@` in their names, so the definition must be
% read with `@` as a letter; \makeatletter/\makeatother bracket it for that
% reason. Without \makeatletter each \def below would instead redefine `\@`.
% NOTE(review): \@algocf@capt@ruled{under} presumably moves the caption below
% the algorithm body for the ruled style -- confirm against algorithm2e.
\makeatletter
\newenvironment{Ualgorithm}[1][htpb]{%
  % Rule drawn after the algorithm body.
  \def\@algocf@post@ruled{\kern\interspacealgoruled\hrule height\algoheightrule\kern3pt\relax}%
  \def\@algocf@capt@ruled{under}%
  \begin{algorithm}[#1]%
}{%
  \end{algorithm}%
}
\makeatother

\usepackage{mwe} % to get dummy images
\jmlrvolume{-- nnn}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[Diffusion Models For MRI Counterfactual Data Augmentation]{Data Augmentation for Medical Imaging: Counterfactual Simulation of Acquisition Parameters via Conditional Diffusion Model}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Pedro Morão\nametag{$^{1}$}} \Email{pedro.morao@tecnico.ulisboa.pt}\\
\addr $^{1}$ Instituto Superior Técnico - Universidade de Lisboa, Lisboa, Portugal \AND
\Name{Yasna Forghani\nametag{$^{2}$}} \orcid{0009-0003-0515-7146}\Email{yasna.forghani@research.fchampalimaud.org}\\
\addr $^{2}$ Digital Surgery LAB, Breast Unit, Champalimaud Foundation, Lisboa, Portugal \AND
\Name{Nuno Loução\nametag{$^{2}$}} \orcid{0000-0003-1810-2817} \Email{nuno.loucao@research.fchampalimaud.org}\\
\Name{Pedro Gouveia\nametag{$^{2,3}$}} \orcid{0000-0001-5600-2783} \Email{pedro.gouveia@fundacaochampalimaud.pt}\\
\addr $^{3}$ Faculdade de Medicina, Universidade de Lisboa, Lisboa, Portugal \AND
\Name{Mário A. T. Figueiredo\nametag{$^{1,4}$}} \orcid{0000-0002-0970-7745} \Email{mario.figueiredo@tecnico.ulisboa.pt}\\
\addr $^{4}$ Instituto de Telecomunicações, Lisboa, Portugal\AND
\Name{João Santinha\nametag{$^{2,3}$}} \orcid{0000-0001-8174-2943} \Email{joao.santinha@research.fchampalimaud.org}\\
}

\begin{document}

\maketitle

\begin{abstract}
Deep learning (DL) models in medical imaging face challenges in generalizability and robustness due to variations in image acquisition parameters (IAP). In this work, we introduce a novel method using conditional denoising diffusion generative models (cDDGMs) to generate counterfactual medical images that simulate different IAP without altering patient anatomy. We demonstrate that using these counterfactual images for magnetic resonance (MR) data augmentation can improve segmentation accuracy in out-of-distribution settings, enhancing the overall generalizability and robustness of DL models across diverse imaging conditions. Our approach shows promise in addressing domain and covariate shifts in medical imaging. The code is publicly available at \url{https://github.com/pedromorao/Counterfactual-MRI-Data-Augmentation}.
\end{abstract}

\begin{keywords}
Denoising Diffusion Generative Models, Data Augmentation, MRI, Medical Imaging, Generalizability
\end{keywords}

\section{Introduction}

Deep learning (DL) models in medical imaging continue to face generalizability and robustness challenges. This is especially relevant given the variability of imaging devices and their ability to change image acquisition parameters.
%While we often can achieve a fairly good estimation of the conditional probability distribution $P_{train}(Y|X)$ using our training dataset $D:\{(X_{train}, Y_{train}\}$, when applied to other data 
While data augmentation has been widely used to improve the performance of DL models in various fields, current augmentation techniques do not easily replicate domain, population, and covariate shifts that arise from variations in medical image scanners, acquisition settings, and patient populations. As variations in scanners and acquisition settings should only produce changes in the image \textit{style}, style transfer has been proposed as a possible solution to harmonize images across different acquisition settings and scanners \cite{karras2019style, zhu2017unpaired}. However, those methods usually work by mapping a source to a target domain on a pairwise basis. That approach thus leads to a number of source--target pairs that grows combinatorially as new scanners and acquisition protocols emerge.

Invariant-based methods, like the one proposed by \citet{arjovsky2019invariant}, offer a promising solution to mitigate performance drops under domain and covariate shifts. However, those methods often require detailed information about the environments in which the data were acquired, as well as known clinical outcomes. Advances in image generation and modification techniques could be leveraged to synthesize new images \cite{fernandez2022can,usman2024brain}, further enforcing invariance during training. Several studies have investigated generative methods for counterfactual image generation  \cite{ribeiro2023high,sanchez2022diffusion,mertes2022ganterfactual,konz2024anatomically}. In particular, \citet{ribeiro2023high}, \citet{mertes2022ganterfactual}, and \citet{konz2024anatomically} explored these methods in medical imaging, evaluating their ability to simulate variations in patient demographics, MRI sequences, and anatomical structures.
However, no prior work has investigated the use of generative models to produce counterfactual images by simulating different acquisition settings while preserving anatomical structures. Thus,
exploring how such techniques may improve DL models' robustness and generalizability is a
promising avenue. Towards this goal, this study investigates the following open questions:
\begin{enumerate}
    \item[\textbf{Q1:}] Can we develop a generative model capable of counterfactually modifying medical images, in particular MRI, simulating different image acquisition settings?
    
    \item[\textbf{Q2:}] Can the counterfactual images \textit{fool} a classifier trained to accurately predict the image acquisition parameters from the pixel data?
    
    \item[\textbf{Q3:}] Does training a segmentation model with counterfactually modified images increase performance on out-of-distribution samples?

\end{enumerate}

Our work introduces a novel method for creating counterfactuals from existing data using conditional denoising diffusion generative models (cDDGMs). Our approach simulates the acquisition of magnetic resonance (MR) images across different scanners and image acquisition parameters (IAP). By incorporating IAP as conditioning context for the denoising diffusion generative model (DDGM), we are able to alter images without affecting the underlying patient anatomy.

We evaluate the effectiveness of the generated counterfactual IAP images using metrics such as the Fréchet inception distance (FID), structural similarity index metric (SSIM), and maximum mean discrepancy (MMD). Additionally, we assess the ability of these images to mislead a multi-task model trained to predict the IAP from MR images. Finally, we examine the impact of using these counterfactual images for data augmentation on the generalizability of DL segmentation models, focusing on both in-distribution (ID) and out-of-distribution (OOD) scenarios.

The main contributions of this study can be summarized as follows:
\begin{itemize}
    \item We explore and demonstrate the feasibility of using cDDGM to generate counterfactually IAP-modified medical images.
    \item We assess the impact on generalizability when using the proposed cDDGM as an IAP data augmentation method for training segmentation models.
    \item We approach the stated open questions (Q1-3) by conducting experiments in a public dataset and we open-source the code used in our experiments, allowing further testing and extension of the proposed method by the research community.
\end{itemize}

\section{Materials and Methods}

\subsection{Dataset}
We used the Duke-Breast-Cancer-MRI dataset \cite{saha2018machine} to train and evaluate our deep generative model and to perform the different experiments. The dataset comprised pre-contrast dynamic contrast-enhanced breast MRIs from 922 patients, with 100 patients (29 acquired with Siemens scanners and 71 acquired with GE scanners) also containing breast tissue segmentation masks. Details pertaining to data normalization and pre-processing can be found in Section \ref{sec:data_norm}. More information regarding data partitioning for the different experiments is provided in Section \ref{sec:training}.

\subsection{Conditional Denoising Diffusion Generative Model}

Our proposed cDDGM is developed to modify MR images through the simulation of their acquisition with counterfactual IAP. The proposed cDDGM architecture is based on the DDPM architecture \cite{ho2020denoisingdiffusionprobabilisticmodels}, using a conditional U-Net as the noise estimation model which learns to reverse a Markovian diffusion process by gradually denoising an image, starting from pure-noise. Additionally, inspired by the U-Net design from latent diffusion models \cite{rombach2022highresolutionimagesynthesislatent}, the proposed conditional U-Net architecture also incorporates cross-attention mechanisms, which enhance the model's ability to effectively handle conditioning contexts that are more complex than simple image classes.

%The noise estimation model used during the reverse diffusion process is a conditional UNet. Its architecture is inspired by the UNet design from Latent Diffusion Models \cite{rombach2022highresolutionimagesynthesislatent}. This UNet architecture incorporates cross-attention mechanisms, which enhance the model's ability to condition on complex contexts. Without these attention mechanisms, the model would struggle to effectively handle conditioning contexts that are more complex than simple image classes.

Our U-Net architecture consists of six downsampling layers (number of channels per layer: 64, 64, 128, 128, 256, 256), one middle layer, and six upsampling layers, with each layer containing two residual convolutional blocks. Cross-attention blocks are included on the third and fifth of the downsampling layers, on the middle layer, and the corresponding positions in the upsampling layers. While adding more cross-attention blocks could improve the model's performance, it also significantly increases the computational cost, particularly if added to earlier layers of the U-Net. The conditioning is performed by adding the IAP embedding to the time embeddings and incorporating it through the cross-attention blocks. This ``hybrid'' conditioning approach, which combines adding the condition embedding to the time embeddings and cross-attention blocks, is similar to what is used by \citet{pinaya2022brainimaginggenerationlatent}.

The model was trained using the simplified loss

\begin{equation}
    \mathcal{L}(\theta) := \mathbb{E}_{t,\mathbf{x}_0,\mathbf{\epsilon} \sim \mathcal{N}(0, \mathbf{I})}  \left[ \left\| \mathbf{\epsilon} - \mathbf{\epsilon}_{\theta} (\sqrt{\bar{\alpha}_t}\mathbf{x}_0 + \sqrt{1-\bar{\alpha}_t} \mathbf{\epsilon}, c, t)  \right\|^2    \right],
\label{eqq:training_loss_cDDPM}
\end{equation}
\noindent where $t$ is a timestep, $\mathbf{x}_0$ represents the original image, $c$ is the set of IAP used for conditioning, $\bar{\alpha}_t = \prod_{s=1}^t \alpha_s$, with $\alpha_t = 1-\beta_t$, and $\beta_t$ corresponding to the forward process variances at step $t$. This loss function was adapted for the conditional training scenario, allowing the model $\mathbf{\epsilon}_{\theta}$ to receive the IAP conditioning as input but still work in an unconditional setting without $c$.

To condition across the multiple classes, corresponding to the different IAP, we selected the classifier-free guidance (CFG) method \cite{ho2022classifierfreediffusionguidance}, as it enables controlling the strength of the alignment with the conditional context through a guidance scale parameter, eliminating the need for an additional classifier, as opposed to classifier guidance.

The training algorithm for the cDDGM is equal to the original DDPM training algorithm \cite{ho2020denoisingdiffusionprobabilisticmodels}, except the model is conditioned on the IAP with a conditional dropout of 15\%. The algorithm to counterfactually modify images and simulate their acquisition with other IAP is shown in Algorithm \ref{alg:image_mod_alg}. For the diffusion process,  we use $1000$ steps of the original DDPM sampler with a cosine noise scheduler. Initially, noise is added to the original image $\mathbf{x}_0$ until we reach $t=steps$, then we use the CFG method \cite{ho2022classifierfreediffusionguidance} to denoise the image from $t=steps$ back to $t=0$, now conditioning the image on a new set of IAP, $c_{new}$, and controlling the guidance scale with a parameter $w$. After denoising $\mathbf{x}_{steps}$, we return the modified $\mathbf{x}_0$ with its IAP changed.

\begin{algorithm2e}
\caption{IAP modification algorithm using CFG. $t$: timestep; $\mathbf{x}_0$: original image; $c_{new}$: new set of IAP; $w$: guidance scale; $\mathbf{x}_{t}$: resulting images after step $t$; $\alpha_t = 1-\beta_t$; $\beta_t$: forward process variances at step $t$; $\bar{\alpha}_t = \prod_{s=1}^t \alpha_s$.}\label{alg:image_mod_alg}
$\mathbf{z} \sim \mathcal{N}(0,\mathbf{I})$ \\
$\mathbf{x}_{steps} = \sqrt{\bar{\alpha}_{steps}}\mathbf{x}_0 + \sqrt{1-\bar{\alpha}_{steps}} \mathbf{z}$

\For{$t=steps,\cdots,1$}{
$\mathbf{z} \sim \mathcal{N}(0,\mathbf{I})$ if $t > 1$, else $\mathbf{z} = 0$ \\ 
$\tilde{\mathbf{\epsilon}}_t = (1-w)\mathbf{\epsilon}_{\theta}(\mathbf{x}_t,t) + w\mathbf{\epsilon}_{\theta}(\mathbf{x}_t,c_{new},t)$

$\mathbf{x}_{t-1} = \frac{1}{\sqrt{\alpha_t}}(\mathbf{x}_t - \frac{1-\alpha_t}{\sqrt{1- \bar{\alpha}_t}}\tilde{\mathbf{\epsilon}}_t) + \sigma_t \mathbf{z}$}
\Return{$\mathbf{x}_0$}
\end{algorithm2e}

This model was trained with a batch size of 32 over 15 epochs. The Adam optimizer was used with a learning rate of $10^{-4}$ and weight decay of $10^{-3}$. Additional training details are provided in Section \ref{sec:training}.

\section{Experiments}
\subsection{Experiments and Metrics to Evaluate Counterfactual Data Augmentation}

After training the cDDGM, we counterfactually modified the original images by altering their IAP. To achieve this, we stopped the forward diffusion process at an early stage, when the perturbed image's IAP distributions would overlap, and then reversed the diffusion process while conditioning the image on a different set of IAP. This approach is similar to that of \citet{meng2022sdeditguidedimagesynthesis}, but we employ a conditional model. We explore the impact of varying the number of reverse diffusion steps and adjusting the CFG's scale parameter on the resulting counterfactual images. To generate the counterfactual version of the input image, a set of IAP from a different manufacturer was randomly selected and used to modify it.

%Since the cDDGM was designed for IAP counterfactual data augmentation, the expected behavior wTo evaluate the use of the cDDGM tGiven that the cDDGM was trained to perform IAP counterfactual data augmentation, for each image, we randomly sampled a set of IAP and generated new counterfactual images. In the case where the sampled set of IAP matches the original image’s IAP, the model’s output should preserve the image’s IAP and recover the original image.

To evaluate the proposed method, we used several image quality and generative metrics, including the structural similarity metric (SSIM), Fréchet inception distance (FID), and maximum mean discrepancy (MMD). Additionally, inspired by previous work from \citet{konz2024reverse}, we developed an IAP prediction model to assess whether cDDGM's counterfactual images could ``fool'' the predictor into classifying them with the counterfactual IAP rather than the original one. We evaluated the IAP predictor model performance using top-1 accuracy for categorical IAP and mean squared error for continuous IAP; further details about this model are provided in Section \ref{sec:IAP}. Moreover, counterfactual image generation was also assessed through the use of counterfactual prediction gain \cite{nemirovsky2020countergan}.

Finally, since the developed cDDGM was trained to perform changes in tissue contrast based on the IAP, without changing the anatomy, we then used the IAP counterfactual images as data augmentation samples and assessed the effect on the performance of the segmentation models in the two scenarios presented in Section \ref{sec:training}. The segmentation models are described in Section~\ref{sec:seg}. We assessed the impact of the counterfactual data augmentation on the segmentation models using the Dice--S{\o}rensen coefficient and accuracy for each different breast tissue present in the segmentation masks.

\subsubsection{Image Acquisition Parameters Prediction Model}
\label{sec:IAP}
Following the model proposed by \citet{konz2024reverse}, a ResNet-18 \cite{he2016deep} was modified to predict 7 image acquisition parameters through the final fully-connected layer. These 7 
MRI acquisition parameters change contrast in a non-linear manner, impacting DL segmentation models, as existing harmonization and normalization methods fail to fully compensate for these variations.

The four continuous ($M = 4$) IAP---Flip Angle (FA), Slice Thickness (ST), Echo Time (TE), and Repetition Time (TR)---are predicted directly using a single unit for each of them in our network's output layer. The three categorical ($K = 3$) IAP considered---Scanner Manufacturer (SM), Field Strength (FS), and Scan Options (SO)---are converted into one-hot
encodings, each with a different number of possible values/categories. For the categorical variables, with $C_k$ ($k = 1,\cdots , K$) denoting the number of categories in each categorical variable, the final layer has a total width of $\sum_{k=1}^K C_k + M$.

The training of the IAP model involved a multi-task learning approach with the combination of loss functions for the categorical (weighted-cross-entropy losses, $\mathcal{L}_{WCE_k}$) and continuous IAP (mean squared error losses, $\mathcal{L}_{MSE}$):
\begin{equation}
    \mathcal{L} _{IAP} = \sum_{k=1}^K \mathcal{L}_{WCE_k} (\hat{y}_k, y_k) + \sum_{m=1}^M \mathcal{L}_{MSE} (\hat{y}_m, y_m).
    \label{eq:eq1}
\end{equation}

The IAP prediction model was trained using a batch size of 512 over 200 epochs. The Adam optimizer was used with a learning rate of $10^{-5}$ and a weight decay parameter of $10^{-4}$.


\subsubsection{Breast Tissue Segmentation Model}
\label{sec:seg}

For the breast tissue segmentation, we used a U-Net \cite{DBLP:journals/corr/RonnebergerFB15} with residual blocks, which enable better gradient back-propagation and facilitate the optimization process, to segment MR images into 3 different labels: fat, fibroglandular tissue (FGT), and background. The segmentation models were trained using the Adam optimizer with a learning rate of 0.002, a weight decay of 0.001, and a batch size of 256. Early stopping was applied to determine the optimal stopping point during training. The number of channels per layer was 32, 64, 128, 256, 512, and 512.

\subsubsection{Additional Details on IAP, cDDGM, and Segmentation Model Training}
\label{sec:training}

We used images from the 822 patients without breast segmentations to train the cDDGM and IAP models. The training of the segmentation models used the images and corresponding breast tissue segmentation masks of the remaining 100 patients, while considering different scenarios: (1) mix of images from different manufacturers available for training; (2) images from only one manufacturer available for training. 

An iterative method was used to split the images into training, validation, and test sets, ensuring that different combinations of IAP were equally represented across all sets. Additionally, the training/validation/test splitting procedure ensured that images from the same patient were not included in different sets.

For the subset without segmentations, the dataset was split into 75\% for training, 10\% for validation, and 15\% for testing. In the subset with segmentations, 75\% of the data was used for training and 25\% used for validation. Due to the limited number of patients with segmentations, the validation set was also used as the test set to evaluate the segmentation model in an ID setting. All OOD images were used as the test set in the OOD evaluation, as they were not included in the training process.


%More details about training are provided in Appendix A.3.

%\subsection{Evaluation metrics}

%Given that the cDDGM was trained to perform IAP counterfactual data augmentation, for each image, we randomly sampled a set of IAP and generated new counterfactual images. In the case where the sampled set of IAP matches the original image's IAP, the model's output should preserve the image's IAP and recover the original image.

%The performance of the cDDGM on this task was evaluated using the Fréchet inception distance (FID), structural similarity metric (SSIM), maximum mean discrepancy (MMD), and the ability to "fool" the IAP prediction model and have the model predict the counterfactual IAP instead of the originally IAP.



\section{Results and Discussion}
\label{sec:results_discussion}
%The guidance scale and number of steps of the proposed cDDGM were optimized for counterfactual IAP modification. Based on the results presented in Tables \ref{table:table3_Gen_Evaluation_perf} and \ref{table:table4_GenIAP_perf} of Appendix \ref{sec:optimization}, it is possible to observe that an increase in these hyperparameters results in worse FID, SSIM$_{orig.\ and\ mod.}$, and MMD metrics, while improving the ability to "fool" the IAP predictive model. Therefore, to generate the counterfactual IAP data augmentation and assess its impact in the segmentation models performance, we selected a guidance scale $= 3$ and number of steps $= 50$, offering a compromise between image quality and generative metrics, and the ability to predict the IAP used to generate the counterfactual images.

The guidance scale and number of steps of the proposed cDDGM were optimized for counterfactual IAP modification. As shown in Tables \ref{table:table3_Gen_Evaluation_perf} and \ref{table:table4_GenIAP_perf} of Appendix \ref{sec:optimization}, increasing these hyperparameters leads to deteriorated image quality and generative metrics, such as FID, SSIM$_{orig.\ and\ mod.}$, and MMD, while simultaneously improving predictions of the IAP used to counterfactually modify the images. Consequently, to generate counterfactual IAP samples for data augmentation and to evaluate its impact on segmentation model performance, we selected a guidance scale of 3 and 50 steps. This configuration strikes a balance between preserving image quality and achieving effective IAP prediction.


The performance of the IAP prediction model is summarized in Table \ref{table:table1_IAP_perf}. We see that the IAP prediction model captures with very good accuracy the IAP of the test dataset. Considering the ranges of each continuous variable (FA: [7\textdegree--12\textdegree]; ST: [1.1\,mm--2.5\,mm]; TE: [1.250\,ms--2.756\,ms]; TR: [3.540\,ms--7.395\,ms]), the IAP prediction model was able to estimate all variables with low MSE, except ST, for which the MSE was relatively higher ($\sim$ 5--12\%).

\begin{table}[hbt!]
\centering
\caption{Model prediction performance for all IAP on the Test Set. An upward arrow indicates that a higher value is better, and vice versa.}
\label{table:table1_IAP_perf}
\begin{tabular}{lcc}
\Xhline{3\arrayrulewidth}
\begin{tabular}[c]{@{}l@{}}Image acquisition\\ parameter (IAP)\end{tabular} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Top-1 pred. \\ acc. (\%) $\uparrow$ \end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Pred. \\MSE $\downarrow$  \end{tabular}} \\ \Xhline{2\arrayrulewidth}
Manufacturer Model                                                          & 98.9                                                                                 & NA                            \\ \hline
Field Strength                                                              & 99.2                                                                                 & NA                            \\ \hline
Scan Options                                                                & 99.9                                                                                 & NA                            \\ \hline
Flip Angle (\textdegree)                                                    & NA                                                                                   & 0.080                         \\ \hline
Slice Thickness (mm)                                           & NA                                                                                   & 0.133                         \\ \hline
TE (ms)                                                                     & NA                                                                                   & 0.005                         \\ \hline
TR (ms)                                                                     & NA                                                                                   & 0.046                         \\ \Xhline{3\arrayrulewidth}
\end{tabular}
\end{table}

The proposed method also shows the ability to generate counterfactual MRIs, which demonstrated the ability to improve counterfactual prediction gains for manufacturer, scanner manufacturer models, and field strength, as shown in Table \ref{table:prediction_gain}.

\begin{table}[hbt!]
\centering
\caption{Counterfactual prediction gains for the categorical IAPs. The
range for counterfactual prediction gain is [0, 1], with a higher prediction gain indicating more improvement.}
\label{table:prediction_gain}
\begin{tabular}{lccc}
\Xhline{3\arrayrulewidth}
                               & Manufacturer & Scanner & Field Strength \\ \hline
Counterfactual Prediction Gain & 0.372         & 0.568    & 0.254           \\ \Xhline{3\arrayrulewidth}
\end{tabular}
\end{table}

Table \ref{table:table2_Segmentations} presents the segmentation accuracies for background, fat, and FGT, along with the mean Dice scores for models trained using images from GE and Siemens MRI scanners. This table includes results for in-distribution (ID) settings (e.g., trained on GE, applied to GE; trained on Siemens, applied to Siemens) and out-of-distribution (OOD) settings (e.g., trained on GE, applied to Siemens; trained on Siemens, applied to GE). Additionally, we tested the use of the cDDGM as a data augmentation method for each setting, by generating modified images simulating the acquisition of the training images using a set of IAP from images of a different manufacturer.%These results are based on a model selected from various configurations of number of steps and guidance scales, which are detailed in Tables \ref{table:table3_Gen_Evaluation_perf} and \ref{table:table4_GenIAP_perf} in Appendix \ref{sec:optimization}.


%\begin{table}[hbt!]
%\centering
%\caption{Segmentation performance in ID and OOD cases with and without counterfactual IAP data augmentation.}
%\label{table:table2_Segmentations}
%\begin{tabular}{lcccccc} %\Xhline{3\arrayrulewidth}
%Setting                                                                             & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Acc.\\ Background $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Acc.\\Fat $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Acc.\\FGT $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Dice\\Fat $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Dice\\FGT $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}Mean\\Dice $\uparrow$\end{tabular}} \\ \Xhline{3\arrayrulewidth}
%Both                                                                                & 99.4                                                                          & 93.6                                                                   & 90.3                                                                   & 0.929                                                                  & 0.716                                                                  & 0.840                                                                   \\ \Xhline{2\arrayrulewidth}
%GE trained in GE                                                                    & 99.4                                                                          & \textbf{96.6}                                                                   & 88.6                                                                   & 0.949                                                                  & \textbf{0.758}                                                         & \textbf{0.863}                                                          \\ %\hline
%\begin{tabular}[c]{r}  cDDGM - \# steps:50; gs:3\end{tabular}           & \textbf{99.5}                                                                 & 96.5                                                                   & \textbf{89.4}                                                          & \textbf{0.950}                                                         & 0.757                                                                  & \textbf{0.863}                                                          \\ \Xhline{2\arrayrulewidth}
%SIEMENS trained in GE                                                               & 98.3                                                                          & 94.0                                                                   & \textbf{62.3}                                                          & 0.860                                                                  & \textbf{0.555}                                                         & 0.730                                                                   \\ %\hline
%\begin{tabular}[c]{r}  cDDGM - \# steps:50; gs:3\end{tabular}      & \textbf{99.1}                                                                 & \textbf{94.2}                                                                   & 61.7                                                                   & \textbf{0.889}                                                         & 0.536                                                                  & \textbf{0.739}                                                          \\ \Xhline{2\arrayrulewidth}
%SIEMENS trained in SIEMENS                                                          & \textbf{99.4}                                                                 & 91.5                                                                   & 58.7                                                                   & \textbf{0.866}                                                         & \textbf{0.588}                                                         & \textbf{0.746}                                                          \\ %\hline
%\begin{tabular}[c]{r}  cDDGM - \# steps:50; gs:3\end{tabular} & 99.1                                                                          & \textbf{92.3}                                                          & \textbf{60.2}                                                                   & 0.863                                                                  & 0.571                                                                  & 0.737                                                                   \\ \Xhline{2\arrayrulewidth}
%GE trained in SIEMENS                                                               & \textbf{98.9}                                                                 & 89.3                                                                   & 59.7                                                                   & 0.886                                                                  & 0.549                                                                  & 0.742                                                                   \\ %\hline
%\begin{tabular}[c]{r}  cDDGM - \# steps:50; gs:3\end{tabular}      & 98.8                                                                          & \textbf{91.8}                                                          & \textbf{67.7}                                                                   & \textbf{0.896}                                                         & \textbf{0.553}                                                                  & \textbf{0.750}                                                                 \\ \Xhline{3\arrayrulewidth}
%\end{tabular}
%\end{table}

\begin{table}
\centering
\caption{Segmentation performance in ID (equal manufacturers in training and testing) and OOD (different manufacturers in training and testing) cases with and without counterfactual IAP data augmentation. CF: counterfactual; Acc.: accuracy; Backgr.: background; FGT: fibroglandular tissue; Manuf.: manufacturer; $^{*}$ - $p<0.05$.}
\label{table:table2_Segmentations}
\begin{tblr}{
  cell{2}{3} = {c},
  cell{2}{4} = {c},
  cell{2}{5} = {c},
  cell{2}{6} = {c},
  cell{2}{7} = {c},
  cell{2}{8} = {c},
  cell{3}{3} = {c},
  cell{3}{4} = {c},
  cell{3}{5} = {c},
  cell{3}{6} = {c},
  cell{3}{7} = {c},
  cell{3}{8} = {c},
  cell{4}{3} = {c},
  cell{4}{4} = {c},
  cell{4}{5} = {c},
  cell{4}{6} = {c},
  cell{4}{7} = {c},
  cell{4}{8} = {c},
  cell{5}{3} = {c},
  cell{5}{4} = {c},
  cell{5}{5} = {c},
  cell{5}{6} = {c},
  cell{5}{7} = {c},
  cell{5}{8} = {c},
  cell{6}{3} = {c},
  cell{6}{4} = {c},
  cell{6}{5} = {c},
  cell{6}{6} = {c},
  cell{6}{7} = {c},
  cell{6}{8} = {c},
  cell{7}{3} = {c},
  cell{7}{4} = {c},
  cell{7}{5} = {c},
  cell{7}{6} = {c},
  cell{7}{7} = {c},
  cell{7}{8} = {c},
  cell{8}{3} = {c},
  cell{8}{4} = {c},
  cell{8}{5} = {c},
  cell{8}{6} = {c},
  cell{8}{7} = {c},
  cell{8}{8} = {c},
  cell{9}{3} = {c},
  cell{9}{4} = {c},
  cell{9}{5} = {c},
  cell{9}{6} = {c},
  cell{9}{7} = {c},
  cell{9}{8} = {c},
  cell{10}{3} = {c},
  cell{10}{4} = {c},
  cell{10}{5} = {c},
  cell{10}{6} = {c},
  cell{10}{7} = {c},
  cell{10}{8} = {c},
  hline{1-3,5,7,9,11} = {-}{},
}
{Training\\Manuf.} & {Testing\\Manuf.} & {Acc.\\Backgr. $\uparrow$} & {Acc.\\Fat $\uparrow$} & {Acc.\\FGT $\uparrow$} & {Dice\\Fat $\uparrow$} & {Dice\\FGT $\uparrow$} & {Mean\\Dice $\uparrow$} \\
Both               & Both              & 99.4                          & 93.6                   & 90.3                   & 0.929                  & 0.716                  & 0.840                   \\
GE                 & GE                & 99.4                          & \textbf{96.6}          & 88.6                   & 0.949                  & \textbf{0.758}         & \textbf{0.863}          \\
GE + CF Siemens    & GE                & \textbf{99.5}                 & 96.5                   & \textbf{89.4}$^{*}$          & \textbf{0.950}         & 0.757                  & \textbf{0.863}          \\
GE                 & Siemens           & 98.3                          & 94.0                   & \textbf{62.3}          & 0.860                  & \textbf{0.555}$^{*}$         & 0.730                   \\
GE + CF Siemens    & Siemens           & \textbf{99.1}$^{*}$                 & \textbf{94.2}          & 61.7                   & \textbf{0.889}$^{*}$         & 0.536                  & \textbf{0.739}$^{*}$          \\
Siemens            & Siemens           & \textbf{99.4}                 & 91.5                   & 58.7                   & \textbf{0.866}         & \textbf{0.588}$^{*}$         & \textbf{0.746}$^{*}$          \\
Siemens + CF GE    & Siemens           & 99.1                          & \textbf{92.3}$^{*}$          & \textbf{60.2}$^{*}$          & 0.863                  & 0.571                  & 0.737                   \\
Siemens            & GE                & \textbf{98.9}                 & 89.3                   & 59.7                   & 0.886                  & 0.549                  & 0.742                   \\
Siemens + CF GE    & GE                & 98.8                          & \textbf{91.8}$^{*}$          & \textbf{67.7}$^{*}$          & \textbf{0.896}$^{*}$         & \textbf{0.553}$^{*}$         & \textbf{0.750}$^{*}$          
\end{tblr}
\end{table}

The results indicate that using IAP counterfactual images yields slight improvements in segmentation accuracy for the background and FGT, as well as an enhanced Dice score for fat in an ID setting with GE scanners. Similar improvements were observed for fat and FGT when using Siemens scanners. In these ID scenarios, we did not expect larger segmentation performance improvements, as the counterfactual data augmentation generated by the proposed cDDGM produces OOD samples. 

%The results indicate that using IAP counterfactual images yields slight improvements in segmentation accuracy for the background and FGT, as well as an enhanced Dice score for fat in an in-domain (ID) setting with GE scanners. Similar improvements were observed for fat and FGT when using Siemens scanners. In these ID scenarios, we did not expect larger performance gains, as the counterfactual data augmentation generated by the proposed cDDGM produces out-of-distribution (OOD) samples.



In OOD settings, when the model was trained with GE images, the inclusion of IAP counterfactual images positively impacted the accuracies for background and fat, as well as the mean and fat Dice scores. Despite statistically significant improvements of mean and fat Dice, the same improvement was not observed for FGT. FGT is characterized by having smaller dimensions, a more variable intensity distribution, patients with very little or no FGT, and in some cases less distinguishable boundaries from surrounding structures compared to fat, making segmentation harder and more uncertain. In the OOD setting, when the model was trained on Siemens images and applied to GE images, the IAP counterfactual model provided statistically significant improvements on segmentation accuracy for fat and FGT, along with enhancing the fat, FGT, and mean Dice scores.

\begin{figure}[hbt!]
  \centering
  \caption{Comparison between the ground truth (True) and DL breast segmentation models trained without data augmentation (Pred.) and with data augmentation using cDDGM (Pred.Aug.), in out-of-distribution settings. (A) results of models trained in GE when applied to Siemens MRIs. (B) results of models trained in Siemens when applied to GE MRIs. Blue - Fat mask; Orange - FGT mask.}
  \includegraphics[width=0.65\textwidth]{Segmentation_OOD.png}
  \label{fig:figure_1_seg_ood}
\end{figure}

Figure~\ref{fig:figure_1_seg_ood} showcases several examples of breast MRIs from the test set, along with corresponding ground truth tissue masks and DL segmentation predictions without and with IAP counterfactual images used as data augmentation in the two OOD settings previously mentioned. In Figure~\ref{fig:figure_1_seg_ood}-A, we observe that the DL segmentation model without cDDGM data augmentation has a propensity to incorrectly classify background areas (black) in the chest wall (top image) and liver (third image), where the model with cDDGM data augmentation was able to reduce these errors. As for Figure~\ref{fig:figure_1_seg_ood}-B, we see several holes in the breast tissue masks of the first and third predictions using the DL segmentation model trained without cDDGM data augmentation that are reduced when the proposed data augmentation method is used. Figure~\ref{fig:figure_2_seg_id} in Appendix~\ref{sec:appendix} shows the corresponding comparison for both ID scenarios.



Our work is limited by the lack of diversity in MRI scanner manufacturers and the dataset size (e.g., of the 100 patients containing breast tissue segmentations, only 29 patients were acquired in Siemens scanners). Nevertheless, the use of the proposed cDDGM for counterfactual MRI data augmentation yielded promising results, demonstrating its potential to improve generalizability and robustness of DL models in medical imaging.

\section{Conclusions}

In this work, we demonstrated that integrating image acquisition parameter counterfactual images, generated using conditional denoising diffusion generative models, can enhance the generalizability and robustness of deep learning models in medical imaging. The generated counterfactual images successfully misled the image acquisition parameters prediction model into predicting the intended counterfactual parameters. Moreover, using these images for data augmentation led to slight improvements in segmentation accuracy, particularly in out-of-distribution settings, thereby improving the generalizability of deep learning models across diverse medical imaging conditions.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This study received support from the “Health from Portugal - Agenda Mobilizadora para a Inovação Empresarial” project, funded by Plano de Recuperação e Resiliência português under grant agreement No C644937233-00000047.}


%\bibliography{midl-samplebibliography}
\bibliography{midl25_261}

\clearpage
\appendix

\section{Appendix}
\label{sec:appendix}

\subsection{Data Normalization and Preprocessing}
\label{sec:data_norm}

The Duke-Breast-Cancer-MRI dataset comprises multiple 3D and 4D MRI sequences. Since each sequence is associated with only one set of IAP, pairwise supervised image modification techniques are not applicable. Following the approach of \citet{konz2024reverse}, we focused on the 3D pre-contrast phase of 4D dynamic contrast-enhanced sequences. In 100 patients, this phase included corresponding 3D fat and fibroglandular tissue segmentations, enabling us to evaluate the impact of the proposed cDDGM model as a data augmentation technique for DL segmentation models.

Although the selected phase represents a 3D volume, due to the size of the cDDGM model and hardware constraints, we developed and evaluated our model using 2D slices extracted from the 3D volumes. The first and last 20 slices were discarded, as they typically contained more noise and lacked relevant information.

We performed image normalization by resizing the images to $224\times224$ to ensure a fixed size and to accelerate model training and optimization. Although more complex models, such as Latent Diffusion Models, could handle larger images, they would require additional training of encoder and decoder networks to obtain a smaller latent space in which the diffusion process would be executed. Moreover, the encoder and decoder would also need to preserve IAP-related information to ensure that the latent representation would still contain such information.

Image intensity values were normalized using percentile normalization, setting the 10th percentile to 0 and the 99th percentile to 1, without clipping values. The lower percentile was adjusted to a higher value due to the large number of low-intensity voxels in the background and thoracic cavity, which are not particularly relevant for the cDDGM or breast tissue segmentation model.

To normalize the IAP, categorical features were one-hot encoded, and numeric features were normalized by dividing by the maximum value in the dataset. This approach was chosen over min-max normalization to create a gap from 0 to the ratio $\mathit{value}_{\min} / \mathit{value}_{\max}$, allowing the model to use 0 as the unconditional value. Since $\mathit{value}_{\min} \neq 0$, this is always achievable.

%\subsection{Train, Validation and Test sets}

%We initially divided the dataset into subsets of images of patients with segmentations and without. We used the subset without segmentations for training and evaluating the IAP prediction model and the cDDGM model, while the subset with segmentations was exclusively used for training and evaluating the segmentation models. An iterative method was used to split the images into training, validation, and test sets, ensuring that different combinations of IAP were equally represented across all sets. Additionally, the training/validation/test splitting procedure ensured that images from the same patient were not included in different sets.


%For the subset without segmentations, the dataset was split into 75\% for training, 10\% for validation, and 15\% for testing. In the subset with segmentations, 75\% of the data was used for training and 25\% used for validation. Due to the limited number of patients with segmentations, the validation set was also used as the test set to evaluate the segmentation model in an ID setting. All OOD images were used as the test set in the OOD evaluation, as they were not included in the training process.

%\subsection{cDDGM Arquitecture}
%\label{sec:cddgm_arch}

%The noise estimation model used during the reverse diffusion process is a conditional UNet. Its architecture is inspired by the UNet design from Latent Diffusion Models \cite{rombach2022highresolutionimagesynthesislatent}. This UNet architecture incorporates cross-attention mechanisms, which enhance the model's ability to condition on complex contexts. Without these attention mechanisms, the model would struggle to effectively handle conditioning contexts that are more complex than simple image classes.

%Our UNet architecture consists of six downsampling levels, one middle level, and six upsampling levels, with each level containing two residual convolution blocks. Cross-attention blocks are included on the third and fifth of the downsampling levels, on the middle level, and the corresponding positions in the upsampling levels. The conditioning is performed by adding the IAP embedding to the time embeddings and incorporating it through the cross-attention blocks. While adding more cross-attention blocks improved the model's performance, it also significantly increased the computational resources, particularly when added to earlier levels of the UNet. This 'hybrid' conditioning approach, which combines adding the condition embedding to the time embeddings and cross-attention blocks, is similar to the method used in \cite{pinaya2022brainimaginggenerationlatent}.

%The model was trained using the simplified loss function from \cite{ho2020denoisingdiffusionprobabilisticmodels}, as shown in equation \ref{eqq:training_loss_cDDPM}. This loss function was adapted for the conditional training scenario, allowing the model $\mathbf{\epsilon}_{\theta}$ to receive the IAP conditioning as input but still work in an unconditional setting without $c$.
%\begin{equation}
%    \mathcal{L}(\theta) := \mathbb{E}_{t,\mathbf{x}_0,\mathbf{\epsilon} \sim N(0, \mathbf{I})}  \left[ \left\| \epsilon - \mathbf{\epsilon}_{\theta} (\sqrt{\bar{\alpha_t}}\mathbf{x}_0 + \sqrt{1-\bar{\alpha_t}} \mathbf{\epsilon}, c, t)  \right\|^2    \right]
%\label{eqq:training_loss_cDDPM}
%\end{equation}

%The training algorithm for the cDDGM is equal to the original DDPM training algorithm \cite{ho2020denoisingdiffusionprobabilisticmodels} except the model is conditioned on the IAP with a conditional dropout of 15\%. The algorithm to counterfactually modify images and simulate their acquisition with other IAP is shown in \ref{alg:image_mod_alg}. Initially, noise is added to the original image $\mathbf{x}_0$ until we reach $t=steps$, then we use the CFG method \cite{ho2022classifierfreediffusionguidance} to denoise the image from $t=steps$ back to $t=0$, now conditioning the image on a new set of IAP, $c_{new}$, and controlling the guidance scale with a parameter $w$. After denoising $\mathbf{x}_{steps}$, we return the modified $\mathbf{x}_0$ with its IAP changed.

%\begin{algorithm}
%\caption{IAP modification algorithm using CFG}\label{alg:image_mod_alg}
%$\mathbf{z} \sim \mathcal{
%N}(0,\mathbf{I})$ \\
%$\mathbf{x}_{steps} = \sqrt{\bar{\alpha}_{steps}}\mathbf{x}_0 + \sqrt{1-\bar{\alpha}_{steps}} \mathbf{z}$ \\

%\For{$t=steps,\cdots,0$}{
%$\mathbf{z} \sim \mathcal{N}(0,\mathbf{I})$ if $t > 0$, %else $\mathbf{z} = 0$ \\ 
%$\tilde{\mathbf{\epsilon}_t} = (1-w)\mathbf{\epsilon}_{\theta}(\mathbf{x}_t,t) + w\mathbf{\epsilon}_{\theta}(\mathbf{x}_t,c_{new},t)$\\

%$\mathbf{x}_{t-1} = \frac{1}{\sqrt{\alpha_t}}(\mathbf{x}_t - \frac{1-\alpha_t}{\sqrt{1- \bar{\alpha_t}}}\tilde{\mathbf{\epsilon}_t}) + \sigma_t \mathbf{z}$}
%\Return{$\mathbf{x}_0$}
%\end{algorithm}

\subsection{Computational Resources and Training Setup}

The training of all deep learning models was carried out using PyTorch \cite{paszke2019pytorch}, MONAI Core \cite{cardoso2022monai},
and MONAI Generative \cite{pinaya2023generative} libraries.

The training processes were conducted on a single NVIDIA A6000 GPU with 48 GB of memory.

The IAP prediction model training and testing phases combined took approximately 5 hours.

For the cDDGM model, the training took 9 hours, and testing the IAP modifications applied to the test set took from 2 to 7 hours for each configuration of steps and guidance scale, varying with the number of steps.

Data augmentation was performed, with processing times ranging from 3 to 10 hours, depending on the manufacturer and the number of steps specified. 

The segmentation model training, which followed the data augmentation, typically took from 30 minutes up to 1 hour, with the models trained on the larger GE subset requiring more time.

\subsection{cDDGM Optimization}
\label{sec:optimization}
The guidance scale and number of steps of the proposed cDDGM were optimized for counterfactual IAP modification. The performance of the model with the different hyperparameters was assessed using generative and similarity metrics, shown in Table \ref{table:table3_Gen_Evaluation_perf}, along with the IAP prediction model performance, shown in Table \ref{table:table4_GenIAP_perf}. Within Table \ref{table:table3_Gen_Evaluation_perf}, SSIM$_{orig.\ and\ mod.}$ was computed between pairs of original images and corresponding modified images, while SSIM$_{shuff.\ and\ mod.}$ was obtained between images with the set of IAP used to modify the images and the images modified by our cDDGM using those IAP as conditioning. A low SSIM$_{shuff.\ and\ mod.}$ value is expected as the images being compared were not from the same patients.

\begin{table}[ht]
\centering
\caption{cDDGM performance metrics on the IAP modification. FID: Fréchet inception distance, SSIM: Structural similarity index metric, MMD: Maximum mean discrepancy. SSIM$_{orig.\ and\ mod.}$ represents structural similarity index between original images and corresponding modified images. SSIM$_{shuff.\ and\ mod.}$ represents the structural similarity index between images from which the IAP were originally obtained and the images modified by our cDDGM using those IAP as conditioning - importantly, the images being compared were not from the same patients. An upward arrow indicates that a higher value is better, and vice versa.}
\label{table:table3_Gen_Evaluation_perf}
\begin{tabular}{lcccc}
\hline
Hyperparameters     & FID $\downarrow$  & \begin{tabular}[c]{@{}c@{}}SSIM$_{orig.\ and\ mod.}$$\uparrow$\end{tabular} & \begin{tabular}[c]{@{}c@{}}SSIM$_{shuff.\ and\ mod.}$$\downarrow$\end{tabular} & MMD $\downarrow$            \\ \hline
Without cDDGM       & 0     & 1.000                                                               & 0.258                                                               & 0               \\ \hline
\# steps: 25; gs: 3 & 0.416 & 0.742                                                               & 0.284                                                               & 0.010x10$^{-3}$ \\ \hline
\# steps: 25; gs: 5 & 0.501 & 0.709                                                               & 0.277                                                               & 0.016x10$^{-3}$ \\ \hline
\# steps: 25; gs: 7 & 0.590 & 0.689                                                               & 0.272                                                               & 0.022x10$^{-3}$ \\ \hline
\# steps: 50; gs: 3 & 0.513 & 0.657                                                               & 0.287                                                               & 0.016x10$^{-3}$ \\ \hline
\# steps: 50; gs: 5 & 0.606 & 0.630                                                               & 0.281                                                               & 0.025x10$^{-3}$ \\ \hline
\# steps: 50; gs: 7 & 0.702 & 0.613                                                               & 0.276                                                               & 0.037x10$^{-3}$ \\ \hline
\# steps: 75; gs: 3 & 0.573 & 0.606                                                               & 0.288                                                               & 0.029x10$^{-3}$ \\ \hline
\# steps: 75; gs: 5 & 0.669 & 0.583                                                               & 0.283                                                               & 0.036x10$^{-3}$ \\ \hline
\# steps: 75; gs: 7 & 0.774 & 0.566                                                               & 0.279                                                               & 0.049x10$^{-3}$ \\ \hline
\end{tabular}
\end{table}

In Tables \ref{table:table3_Gen_Evaluation_perf} and \ref{table:table4_GenIAP_perf}, the row `Without cDDGM' represents the baseline case where the model is not applied to the images, so the IAP are simply shuffled randomly for the computation of SSIM$_{shuff.\ and\ mod.}$ and the prediction of the IAP.

\begin{table}[ht]
\centering
\caption{Model Prediction Performance for all IAP on the Test Set. An upward arrow indicates that a higher value is better, and vice versa.}
\label{table:table4_GenIAP_perf}
\begin{tabular}{lccccccc}
\hline
Hyperparam.                                                  & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}MM Top-1\\ pred. acc.\\ (\%) $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}FS Top-1 \\ pred. acc.\\ (\%) $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}SO Top-1 \\ pred. acc.\\ (\%) $\uparrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}FA \\ Pred.\\ MSE $\downarrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}ST \\ Pred.\\ MSE $\downarrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}TR\\ Pred.\\ MSE $\downarrow$\end{tabular}} & \multicolumn{1}{l}{\begin{tabular}[c]{@{}l@{}}TE \\ Pred.\\ MSE $\downarrow$\end{tabular}} \\ \hline
\begin{tabular}[c]{@{}l@{}}Without\\ cDDGM\end{tabular}      & 22.8                                                                                     & 49.8                                                                                      & 28.0                                                                                      & 0.440                                                                         & 0.260                                                                         & 1.046                                                                        & 0.481                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 25\\ gs: 3\end{tabular} & 77.7                                                                                     & 66.3                                                                                      & 87.8                                                                                      & 0.289                                                                         & 0.198                                                                         & 0.293                                                                        & 0.070                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 25\\ gs: 5\end{tabular} & 82.4                                                                                     & 71.7                                                                                      & 92.8                                                                                      & 0.292                                                                         & 0.192                                                                         & 0.249                                                                        & 0.055                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 25\\ gs: 7\end{tabular} & 83.4                                                                                     & 76.0                                                                                      & 93.0                                                                                      & 0.289                                                                         & 0.189                                                                         & 0.237                                                                        & 0.058                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 50\\ gs: 3\end{tabular} & 85.8                                                                                     & 76.9                                                                                      & 96.0                                                                                      & 0.282                                                                         & 0.190                                                                         & 0.207                                                                        & 0.038                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 50\\ gs: 5\end{tabular} & 87.2                                                                                     & 83.1                                                                                      & 96.3                                                                                      & 0.284                                                                         & 0.185                                                                         & 0.186                                                                        & 0.040                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 50\\ gs: 7\end{tabular} & 88.3                                                                                     & 87.0                                                                                      & 96.6                                                                                      & 0.275                                                                         & 0.180                                                                         & 0.179                                                                        & 0.042                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 75\\ gs: 3\end{tabular} & 87.1                                                                                     & 82.8                                                                                      & 97.0                                                                                      & 0.285                                                                         & 0.189                                                                         & 0.183                                                                        & 0.034                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 75\\ gs: 5\end{tabular} & 88.5                                                                                     & 88.5                                                                                      & 96.5                                                                                      & 0.273                                                                         & 0.181                                                                         & 0.174                                                                        & 0.039                                                                         \\ \hline
\begin{tabular}[c]{@{}l@{}}\# steps: 75\\ gs: 7\end{tabular} & 89.3                                                                                     & 90.4                                                                                      & 96.6                                                                                      & 0.257                                                                         & 0.172                                                                         & 0.172                                                                        & 0.042                                                                         \\ \hline
\end{tabular}
\end{table}

Table \ref{table:table4_GenIAP_perf} shows that increasing the guidance scale and the amount of noise - through additional forward diffusion steps applied to the original image - enhances the proposed model's ability to predict the IAP used to generate counterfactual images. Regarding the MSE obtained for the continuous variables, we can observe that these results represent a small error when compared with the range of values of each continuous variable presented in Section \ref{sec:results_discussion}. However, this comes at the cost of reduced image quality, as indicated by higher FID scores, lower SSIM$_{orig.\ and\ mod.}$ values, and increased MMD, as observed in Table \ref{table:table3_Gen_Evaluation_perf}. Specifically, the decline in SSIM between the original and modified images suggests that higher guidance scales and more diffusion steps lead to greater loss of the original image anatomical structure. Despite these changes, the MMD remains very small, indicating that the modified images stay close to the desired distribution after IAP modification. Additionally, the SSIM between the images of different patients from which the IAP were extracted to condition the image modification (\emph{shuffled}) and the modified images remains low, suggesting that the proposed cDDGM is not altering the structure to resemble that of the \emph{shuffled} reference image.

Figure \ref{fig:figure_2_seg_id} shows several examples of breast MRIs from the test sets, along with corresponding ground truth tissue masks and segmentation predictions without and with IAP counterfactual images used as data augmentation in the two ID settings. In both scenarios, A and B, corresponding to training and inference on images from GE and Siemens, respectively, we can observe that the DL segmentation models with and without cDDGM data augmentation perform similarly and are able to approximate the ground truth.

%mathbf
\begin{figure}[ht]
  \centering
  \caption{Comparison between the ground truth (True) and DL breast segmentation models trained without data augmentation (Pred.) and with data augmentation using cDDGM (Pred.Aug.), in in-distribution settings. (A) Results of models trained in GE when applied to GE MRIs. (B) Results of models trained in Siemens when applied to Siemens MRIs. Blue - Fat mask; Orange - FGT mask.}
  \includegraphics[width=0.5\textwidth]{Segmentation_ID.png}
  \label{fig:figure_2_seg_id}
\end{figure}

\subsection{Radiological Assessment of Anatomical Changes in Counterfactual Images}

The central slices of the 100 volumes used for segmentation were assessed by breast radiologists. The radiologist compared both original and corresponding counterfactual images and classified the counterfactual images into: no, minimal, moderate, significant, and severe anatomical changes. When anatomical changes were observed, the radiologist indicated whether such changes were present in the breast tissues or in non-breast tissues. Results of this classification are shown in Table \ref{table:radiologist}.


\begin{table}[hbt!]
\centering
\caption{Radiological assessment of anatomical changes in counterfactual images for the 100 patients used in the segmentation task, and location of such changes. NA: not applicable.}
\label{table:radiologist}
\begin{tblr}{
  width = \linewidth,
  colspec = {Q[208]Q[119]Q[281]Q[304]},
  cell{1}{2} = {c},
  cell{1}{3} = {c},
  cell{1}{4} = {c},
  cell{2}{2} = {c},
  cell{2}{3} = {c},
  cell{2}{4} = {c},
  cell{3}{2} = {c},
  cell{3}{3} = {c},
  cell{3}{4} = {c},
  cell{4}{2} = {c},
  cell{4}{3} = {c},
  cell{4}{4} = {c},
  cell{5}{2} = {c},
  cell{5}{3} = {c},
  cell{5}{4} = {c},
  cell{6}{2} = {c},
  cell{6}{3} = {c},
  cell{6}{4} = {c},
  hline{1-2,7} = {-}{},
}
{\textbf{Anatomical }\\\textbf{Change?}} & \textbf{Count} & {\textbf{Changes within }\\\textbf{breast tissues}} & {\textbf{Changes outside }\\\textbf{breast tissues}} \\
\textbf{No}                              & 98             & NA                                                  & NA                                                   \\
\textbf{Minimal}                         & 2              & 0                                                   & 2                                                    \\
\textbf{Moderate}                        & 0              & NA                                                  & NA                                                   \\
\textbf{Significant}                     & 0              & NA                                                  & NA                                                   \\
\textbf{Severe}                          & 0              & NA                                                  & NA                                                   \\
                                         &                &                                                     &                                                      
\end{tblr}
\end{table}

It is possible to observe that the majority of counterfactual images did not introduce anatomical changes, with just two cases presenting minimal anatomical changes, both of which were outside breast tissues.


\subsection{Evaluation Metrics Formulas}
\begin{equation} \text{Top-1 Accuracy} = \text{Accuracy} = \frac{\text{TP}+\text{TN}}{\text{TP}+\text{TN}+\text{FP}+\text{FN}}\end{equation}
\noindent where TP, TN, FP, and FN are the numbers of true positives, true negatives, false positives, and false negatives, respectively.

\begin{equation}\text{MSE}(y, \hat{y}) = \frac{\sum_{i=0}^{N - 1} (y_i - \hat{y}_i)^2}{N}\end{equation}
\noindent where $y_i$ and $\hat{y}_i$ represent the true and predicted values, respectively, and $N$ is the number of data points.

\begin{equation}\text{FID} = \lVert\mu_{1} - \mu_{2}\rVert_2^2 + \text{Tr}\left(\sigma_{1} + \sigma_{2} - 2\sqrt{\sigma_1\sigma_2}\right)\end{equation}

\noindent  where $\mu_1$ and $\mu_2$, and $\sigma_1$ and $\sigma_2$ represent the mean and covariance of the two distributions of feature vectors. We used the RadImageNet model pretrained on medical datasets from MONAI, instead of activations of the pool\_3 layer of an Inception v3 pretrained with ImageNet.

\begin{equation}\text{SSIM}(x,y) = \frac{(2\mu_x\mu_y + c_1)(2\sigma_{xy} + c_2)}{(\mu_x^2 + \mu_y^2 + c_1)(\sigma_x^2 + \sigma_y^2 + c_2)}
\end{equation}

\noindent  where $\mu_x$ is the pixel sample mean of $x$, $\mu_y$ is the pixel sample mean of $y$, $\sigma_x^2$ is the sample variance of $x$, $\sigma_y^2$ is the sample variance of $y$, $\sigma_{xy}$ is the sample covariance of $x$ and $y$, $c_1 = (k_1L)^2$ and $c_2 = (k_2L)^2$ are two variables to stabilize the division with weak denominator, $L$ is the dynamic range of the pixel-values (typically this is $2^{\text{\#bits per pixel}}-1$), and $k_1 = 0.01$ and $k_2 = 0.03$ by default.

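\begin{equation}\textup{MMD}(\mathbb{F},X,Y):=\sup_{f \in\mathbb{F}}\left(\frac{1}{m}\sum_{i=1}^{m}f(x_{i}) -\frac{1}{n}\sum_{j=1}^{n}f(y_{j})\right)\end{equation}
\noindent where $\mathbb{F}$ is a class of functions, and $x_i$ ($i = 1, \dots, m$) and $y_j$ ($j = 1, \dots, n$) are the samples in $X$ and $Y$, respectively.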

\begin{equation} \text{Dice Similarity Score} = \frac{2\text{TP}}{2\text{TP}+\text{FP}+\text{FN}}\end{equation}
\noindent where TP, FP, and FN are the numbers of true positives, false positives, and false negatives, respectively.

\begin{equation} \textup{Counterfactual Prediction Gain} = \mathbb{E}[C(x_i^{\text{cf}})\ -\ C(x_i)]\end{equation}
\noindent where $C$ is the target classifier and $x_i$ denotes the data point for which a counterfactual
$(x^{\text{cf}}_i)$ is sought through the proposed cDDGM, which is used to reconstruct $x^{\text{cf}}_i$ \cite{nemirovsky2020countergan}. The expectations are computed using the test sets.
\\

%\newline

\subsection{Data, Models' Weights and Code}

Derived data, obtained from the Duke-Breast-Cancer-MRI dataset \cite{saha2018machine}, and models' weights are made available at \url{https://zenodo.org/records/13495922}. Code is available at \url{https://github.com/pedromorao/Counterfactual-MRI-Data-Augmentation}.

\end{document}