\documentclass{midl} % Include author names
%DIF LATEXDIFF DIFFERENCE FILE
%DIF DEL midl-fullpaper.tex   Thu Mar  6 17:27:29 2025
%DIF ADD texdiff.tex          Thu Mar  6 17:27:19 2025

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images

\jmlryear{2025}\jmlrworkshop{Full Paper -- MIDL 2025}\jmlrvolume{-- 025}\editors{Accepted for publication at MIDL 2025}

\title[Can Diffusion Models Generalize?]{Can Diffusion Models Generalize? Privacy and Fairness Trade-offs for Medical Data Sharing.}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

\newcommand{\theHalgorithm}{\arabic{algorithm}}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{siunitx}

\usepackage{rotating}
\usepackage{tabularray}

%\usepackage[capitalize,noabbrev]{cleveref}
\usepackage{tikz}
\usepackage{hyperref}
\usepackage{url}
\usepackage{graphicx}
\usepackage{appendix}
\usepackage{booktabs}
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}
\usepackage{xcolor}         % colors
\usepackage{algorithm2e}
\usepackage{multirow}

% Draft annotation helper: typesets its single argument in red via \textcolor.
\newcommand\myworries[1]{\textcolor{red}{#1}}

%%%
%%%
%%%

% --- Dataset names (plain text) ---
\newcommand{\odir}{ODIR-2019}
\newcommand{\cxr}{ChestX-ray14}
\newcommand{\bci}{BCI}
% --- Classifier symbols ---
% These expand to complete inline-math snippets ($...$), so they are meant for
% text mode. TeX swallows the space after a control word: write \clf{} or \clf~
% when a word follows.
\newcommand{\clf}{$C_{f}$}
\newcommand{\clfp}{$C_{f+}$}


%\newcommand{\cp}{c_{p}(\bfx)}
\newcommand{\cid}{$C_{id}$}
% --- Bold vectors and matrices (math mode only) ---
\newcommand{\bfx}{\mathbf{x}}
\newcommand{\bfxsaf}{\mathbf{x}_{SAF}}
\newcommand{\bfxp}{\mathbf{x}_p}
\newcommand{\bfA}{\mathbf{A}}
\newcommand{\bfB}{\mathbf{B}}
\newcommand{\bfW}{\mathbf{W}}
\newcommand{\bfw}{\mathbf{w}}
\newcommand{\bfV}{\mathbf{V}}
\newcommand{\bfM}{\mathbf{M}}
\newcommand{\bfa}{\mathbf{a}}
\newcommand{\bfb}{\mathbf{b}}
\newcommand{\bfv}{\mathbf{v}}
\newcommand{\bfz}{\mathbf{z}}
\newcommand{\bfI}{\mathbf{I}}
\newcommand{\bft}{\mathbf{t}}
\newcommand{\bfu}{\mathbf{u}}
\newcommand{\bfr}{\mathbf{r}}
\newcommand{\bff}{\mathbf{f}}
\newcommand{\bfm}{\mathbf{m}}
% Was \mbf{\Sigma}: \mbf is not defined anywhere in this file, so the macro
% failed on first use. \boldsymbol (amsmath) gives a bold Greek capital.
\newcommand{\bfsigma}{\boldsymbol{\Sigma}}
\newcommand{\bfF}{\mathbf{F}}
\newcommand{\bfL}{\mathbf{L}}
\newcommand{\bfzero}{\mathbf{0}}
% Was \bs{\epsilon}: \bs is likewise undefined here; use \boldsymbol as the
% neighbouring bold-Greek macros do.
\newcommand{\bfe}{{\boldsymbol{\epsilon}}}
\newcommand{\bftheta}{{\boldsymbol{\theta}}}
\newcommand{\bfalpha}{{\boldsymbol{\alpha}}}
\newcommand{\bfphi}{{\boldsymbol{\phi}}}
\newcommand{\bfy}{\mathbf{y}}
\newcommand{\bfs}{\mathbf{s}}
\newcommand{\bfh}{\mathbf{h}}
\newcommand{\bfg}{\mathbf{g}}
\newcommand{\bfG}{\mathbf{G}}
\newcommand{\bfxd}{\mathbf{x}_{\text{data}}}
% --- Distributions, noise-schedule symbols and misc. math helpers ---
\newcommand{\pd}{p_{\mathrm{data}}}
%\newcommand{\sm}{\sigma_{\text{min}}}
%\newcommand{\sM}{\sigma_{\text{max}}}
\newcommand{\bbeta}{\bar{\beta}}
\newcommand{\bomega}{{\bar{\Omega}}}
\newcommand{\bbetam}{\bbeta_{\text{min}}}
\newcommand{\bbetaM}{\bbeta_{\text{max}}}
\newcommand{\betam}{\beta_{\text{min}}}
\newcommand{\betaM}{\beta_{\text{max}}}
\newcommand{\bfmu}{{\boldsymbol \mu}}
% \mbb{X}: blackboard-bold X.
\newcommand{\mbb}[1]{\mathbb{#1}}
% Upright differential "d" for integrals.
\newcommand{\ud}{\mathrm{d}}
% Alias for \mathrm; takes its argument the same way \mathrm does.
\newcommand{\up}{\mathrm}
% \norm{x}: auto-sized double-bar norm.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}

% Starred form: sub/superscripts are set as limits below/above in display math.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}



 \midlauthor{\Name{Mischa Dombrowski\nametag{$^{1}$}} \Email{mischa.dombrowski@fau.de} \\
             \Name{Bernhard Kainz\nametag{$^{1,2}$}} \Email{bernhard.kainz@fau.de} \\
             \addr $^{1}$ Friedrich-Alexander-Universität Erlangen-Nürnberg, DE \\
             \addr $^{2}$ Imperial College London, UK}

% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
%\midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1,2}$}} \Email{abc@sample.edu}\\
%\addr $^{1}$ Address 1 \\
%\addr $^{2}$ Address 2 \AND
%\Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
%\Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
%\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
%\addr $^{3}$ Address 3 \AND
%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
%\addr $^{4}$ Address 4
%}
%DIF PREAMBLE EXTENSION ADDED BY LATEXDIFF
%DIF UNDERLINE PREAMBLE %DIF PREAMBLE
% latexdiff text markup (auto-generated preamble).
% ulem supplies \uwave and \sout, which the markup macros below use.
% The colours RED/BLUE are declared here; the macros visible in this preamble
% use the lowercase names red/blue instead.
\RequirePackage[normalem]{ulem} %DIF PREAMBLE
\RequirePackage{color}\definecolor{RED}{rgb}{1,0,0}\definecolor{BLUE}{rgb}{0,0,1} %DIF PREAMBLE
% Added text: blue with a wavy underline. Deleted text: red and struck out.
\providecommand{\DIFaddtex}[1]{{\protect\color{blue}\uwave{#1}}} %DIF PREAMBLE
\providecommand{\DIFdeltex}[1]{{\protect\color{red}\sout{#1}}}                      %DIF PREAMBLE
%DIF SAFE PREAMBLE %DIF PREAMBLE
% Begin/end hooks around changed regions; they default to no-ops and are
% redefined further down to switch \includegraphics highlighting on and off.
\providecommand{\DIFaddbegin}{} %DIF PREAMBLE
\providecommand{\DIFaddend}{} %DIF PREAMBLE
\providecommand{\DIFdelbegin}{} %DIF PREAMBLE
\providecommand{\DIFdelend}{} %DIF PREAMBLE
\providecommand{\DIFmodbegin}{} %DIF PREAMBLE
\providecommand{\DIFmodend}{} %DIF PREAMBLE
%DIF FLOATSAFE PREAMBLE %DIF PREAMBLE
% "FL" variants: same markup, forwarded to \DIFadd/\DIFdel.
\providecommand{\DIFaddFL}[1]{\DIFadd{#1}} %DIF PREAMBLE
\providecommand{\DIFdelFL}[1]{\DIFdel{#1}} %DIF PREAMBLE
\providecommand{\DIFaddbeginFL}{} %DIF PREAMBLE
\providecommand{\DIFaddendFL}{} %DIF PREAMBLE
\providecommand{\DIFdelbeginFL}{} %DIF PREAMBLE
\providecommand{\DIFdelendFL}{} %DIF PREAMBLE
%DIF HYPERREF PREAMBLE %DIF PREAMBLE
% \texorpdfstring gives hyperref a plain-text fallback for PDF strings:
% added text is kept verbatim, deleted text is dropped.
\providecommand{\DIFadd}[1]{\texorpdfstring{\DIFaddtex{#1}}{#1}} %DIF PREAMBLE
\providecommand{\DIFdel}[1]{\texorpdfstring{\DIFdeltex{#1}}{}} %DIF PREAMBLE
% Scale factor handed to \scalebox when drawing deleted figures.
\newcommand{\DIFscaledelfig}{0.5}
%DIF HIGHLIGHTGRAPHICS PREAMBLE %DIF PREAMBLE
% latexdiff figure highlighting (auto-generated preamble).
% Added figures are framed in blue; deleted figures are drawn scaled down and
% overlaid with a red frame and two red diagonals.
\RequirePackage{settobox} %DIF PREAMBLE
\RequirePackage{letltxmacro} %DIF PREAMBLE
% Scratch box and lengths used to measure a deleted figure.
\newsavebox{\DIFdelgraphicsbox} %DIF PREAMBLE
\newlength{\DIFdelgraphicswidth} %DIF PREAMBLE
\newlength{\DIFdelgraphicsheight} %DIF PREAMBLE
% store original definition of \includegraphics %DIF PREAMBLE
\LetLtxMacro{\DIFOincludegraphics}{\includegraphics} %DIF PREAMBLE
% Added figure: the original \includegraphics wrapped in a blue \fbox.
\newcommand{\DIFaddincludegraphics}[2][]{{\color{blue}\fbox{\DIFOincludegraphics[#1]{#2}}}} %DIF PREAMBLE
% Deleted figure: typeset the image into a box, measure its width and total
% height, then draw it scaled by \DIFscaledelfig with a red frame and cross
% (picture environment with \unitlength set to the image width) on top.
\newcommand{\DIFdelincludegraphics}[2][]{% %DIF PREAMBLE
\sbox{\DIFdelgraphicsbox}{\DIFOincludegraphics[#1]{#2}}% %DIF PREAMBLE
\settoboxwidth{\DIFdelgraphicswidth}{\DIFdelgraphicsbox} %DIF PREAMBLE
\settoboxtotalheight{\DIFdelgraphicsheight}{\DIFdelgraphicsbox} %DIF PREAMBLE
\scalebox{\DIFscaledelfig}{% %DIF PREAMBLE
\parbox[b]{\DIFdelgraphicswidth}{\usebox{\DIFdelgraphicsbox}\\[-\baselineskip] \rule{\DIFdelgraphicswidth}{0em}}\llap{\resizebox{\DIFdelgraphicswidth}{\DIFdelgraphicsheight}{% %DIF PREAMBLE
\setlength{\unitlength}{\DIFdelgraphicswidth}% %DIF PREAMBLE
\begin{picture}(1,1)% %DIF PREAMBLE
\thicklines\linethickness{2pt} %DIF PREAMBLE
{\color[rgb]{1,0,0}\put(0,0){\framebox(1,1){}}}% %DIF PREAMBLE
{\color[rgb]{1,0,0}\put(0,0){\line( 1,1){1}}}% %DIF PREAMBLE
{\color[rgb]{1,0,0}\put(0,1){\line(1,-1){1}}}% %DIF PREAMBLE
\end{picture}% %DIF PREAMBLE
}\hspace*{3pt}}} %DIF PREAMBLE
} %DIF PREAMBLE
% Save the current begin/end hooks under \DIFO... names, then redefine each hook
% to call the saved version and swap \includegraphics: the highlighting variant
% inside an added/deleted region, the original definition again at its end.
% NOTE(review): \DIFdelend and \DIFdelendFL call \DIFOaddend / \DIFOaddendFL
% rather than \DIFOdelend / \DIFOdelendFL. If all saved hooks are the empty
% \providecommand defaults from earlier in this preamble, this makes no
% difference -- confirm before customising the hooks.
\LetLtxMacro{\DIFOaddbegin}{\DIFaddbegin} %DIF PREAMBLE
\LetLtxMacro{\DIFOaddend}{\DIFaddend} %DIF PREAMBLE
\LetLtxMacro{\DIFOdelbegin}{\DIFdelbegin} %DIF PREAMBLE
\LetLtxMacro{\DIFOdelend}{\DIFdelend} %DIF PREAMBLE
\DeclareRobustCommand{\DIFaddbegin}{\DIFOaddbegin \let\includegraphics\DIFaddincludegraphics} %DIF PREAMBLE
\DeclareRobustCommand{\DIFaddend}{\DIFOaddend \let\includegraphics\DIFOincludegraphics} %DIF PREAMBLE
\DeclareRobustCommand{\DIFdelbegin}{\DIFOdelbegin \let\includegraphics\DIFdelincludegraphics} %DIF PREAMBLE
\DeclareRobustCommand{\DIFdelend}{\DIFOaddend \let\includegraphics\DIFOincludegraphics} %DIF PREAMBLE
% Same pattern for the "FL" hook variants.
\LetLtxMacro{\DIFOaddbeginFL}{\DIFaddbeginFL} %DIF PREAMBLE
\LetLtxMacro{\DIFOaddendFL}{\DIFaddendFL} %DIF PREAMBLE
\LetLtxMacro{\DIFOdelbeginFL}{\DIFdelbeginFL} %DIF PREAMBLE
\LetLtxMacro{\DIFOdelendFL}{\DIFdelendFL} %DIF PREAMBLE
\DeclareRobustCommand{\DIFaddbeginFL}{\DIFOaddbeginFL \let\includegraphics\DIFaddincludegraphics} %DIF PREAMBLE
\DeclareRobustCommand{\DIFaddendFL}{\DIFOaddendFL \let\includegraphics\DIFOincludegraphics} %DIF PREAMBLE
\DeclareRobustCommand{\DIFdelbeginFL}{\DIFOdelbeginFL \let\includegraphics\DIFdelincludegraphics} %DIF PREAMBLE
\DeclareRobustCommand{\DIFdelendFL}{\DIFOaddendFL \let\includegraphics\DIFOincludegraphics} %DIF PREAMBLE
%DIF COLORLISTINGS PREAMBLE %DIF PREAMBLE
% latexdiff markup for verbatim/listing content (auto-generated preamble).
% The DIFcode language registers two line delimiters: one beginning "%DIF < "
% styled red + \sout, and one beginning "%DIF > " styled blue + \uwave.
\RequirePackage{listings} %DIF PREAMBLE
\RequirePackage{color} %DIF PREAMBLE
\lstdefinelanguage{DIFcode}{ %DIF PREAMBLE
%DIF DIFCODE_UNDERLINE %DIF PREAMBLE
  moredelim=[il][\color{red}\sout]{\%DIF\ <\ }, %DIF PREAMBLE
  moredelim=[il][\color{blue}\uwave]{\%DIF\ >\ } %DIF PREAMBLE
} %DIF PREAMBLE
% Shared style: DIFcode language, typewriter font, flexible columns, spaces kept.
\lstdefinestyle{DIFverbatimstyle}{ %DIF PREAMBLE
	language=DIFcode, %DIF PREAMBLE
	basicstyle=\ttfamily, %DIF PREAMBLE
	columns=fullflexible, %DIF PREAMBLE
	keepspaces=true %DIF PREAMBLE
} %DIF PREAMBLE
% Environments using that style; the starred one additionally sets showspaces=true.
\lstnewenvironment{DIFverbatim}{\lstset{style=DIFverbatimstyle}}{} %DIF PREAMBLE
\lstnewenvironment{DIFverbatim*}{\lstset{style=DIFverbatimstyle,showspaces=true}}{} %DIF PREAMBLE
%DIF END PREAMBLE EXTENSION ADDED BY LATEXDIFF

\begin{document}

\maketitle

\begin{abstract}
The recent surge in options for diffusion model-based synthetic data sharing offers significant benefits for medical research, provided privacy and fairness concerns are addressed.
Generative models risk memorizing sensitive training samples, potentially exposing identifiable information.
Simultaneously, underrepresented features -- such as rare diseases, uncommon medical devices, or infrequent patient ethnicities -- are often not learned well, creating unfair biases in downstream applications.
Our work unifies these challenges by leveraging artificially generated fingerprints (SAFs) in the training data as a controllable test for memorization and fairness.
Specifically, we measure whether a diffusion model reproduces these fingerprints verbatim (a privacy breach) or ignores them entirely (a fairness violation), and we introduce an indicator $t'$ that quantifies how likely a trained model is to reproduce training samples.
Extensive experiments on real and synthetic medical imaging datasets reveal that na\"ive diffusion model training can lead to privacy leaks or unfair coverage.
By systematically incorporating SAFs and monitoring $t'$, we demonstrate how to balance privacy and fairness objectives.
Our evaluation framework provides actionable guidance for designing generative models that preserve patient anonymity without excluding underrepresented patient subgroups. Code is available at \url{https://github.com/MischaD/Privacy}.
\end{abstract}


\begin{keywords}
Image Generation, Privacy, Fairness, Chest X-ray
\end{keywords}

\section{Introduction}
\label{sec:introduction}
% (1.1) CONTEXTUALIZE BOTH MOTIVATIONS (Privacy & Fairness)
%   - State the importance of synthetic data sharing and potential pitfalls (privacy leaks, bias, etc.)
%   - Emphasize how these concerns are especially critical in domains like medical imaging.
% especially in the absence of individual consent for data sharing or when data cannot be transferred across institutions~\citep{jin2019review}.
%Incorporating privacy-preserving methods into the training process to enable sharing of statistically equivalent synthetic datasets to private data holds significant potential.
Since the development of statistical models capable of representing and synthesizing new samples from existing dataset distributions~\citep{kingma2013auto,goodfellow2020generative,rombach2022high,ho2020denoising,hamamci2025generatect,guo2024maisi}, the idea of training generative models on private data and sharing \emph{only} the model or synthetic datasets has gained traction.
Such methods could address issues like data scarcity for rare diseases, racial bias~\citep{larrazabal2020gender}, and challenges such as robust domain adaptation and generalisation~\citep{wang2022metateacher}.
However, maintaining privacy and anonymity is crucial when working with personally identifiable information~\citep{jin2019review}.%,

%Incorporating privacy-preserving techniques~\cite{} into the model training process to generate synthetic datasets mirroring only the statistical properties of private data but no individual training samples holds substantial potential for secure data sharing in healthcare.
%Incorporating privacy-preserving techniques~\cite{DP} into the training process -- so that only the statistical properties of private data are replicated, without revealing any individual samples -- holds therefore substantial promise for secure data sharing in healthcare. However, these methods usually lead to degerated downstream performance and an inaccurate data distribution model~\cite{DP}.  


Recent advances in generative modeling, including diffusion models~\citep{song2020denoising,dhariwal2021diffusion,rombach2022high,ruiz2022dreambooth}, have expanded the feasibility of sharing models directly~\citep{pinaya2022brain}.
Despite these efforts~\citep{bai2021training,dar2024unconditional,stein2024exposing}, it remains unclear to what extent shared models reproduce training samples, which would raise potential data privacy concerns.
Guarantees against such privacy breaches would allow models to be trained on proprietary data and shared instead of the underlying datasets, enabling fully anonymous data sharing.
%Thus, healthcare providers could share complex but fully anonymous patient information, such as medical images, through distribution modelling at a population level without needing individual consent. Obtaining Individual consent is often infeasible -- particularly when pursued retrospectively.
%Healthcare providers could share complex patient information, such as medical images, on a population basis without obtaining individual consent. obtaining consent is often infeasible, particularly retrospectively.

However, recent works have shown that publicly released models can inadvertently regenerate training data during sampling, which prevents us from freely sharing such models. For instance, \citet{somepalli2023diffusion} and \citet{dar2024unconditional} demonstrate that diffusion models can reproduce training samples, while \citet{carlini2023extracting} illustrate how re-identifiable faces can be extracted from these datasets. This raises serious privacy concerns requiring robust mitigation strategies~\citep{ren2024unveiling}. Moreover, some generative models are explicitly designed to memorize training samples~\citep{cong2020gan}. Differential privacy-based strategies~\citep{dockhorn2022differentially} offer a promising avenue for mitigating these concerns. However, their adoption in high-resolution data synthesis remains limited due to a notable drop in distribution fidelity, restricted applicability to diffusion models, and low efficacy for multi-modal, high-resolution datasets~\citep{xie2018differentially}. Consequently, in this work, we propose a direct approach to evaluate models -- circumventing the need to obfuscate the model distribution -- and thereby preserving compatibility with high-resolution, high-fidelity generative tasks.

%Efforts to leverage methods such as differential privacy~\citep{dockhorn2022differentially} have not been widely adopted yet since they reduce the learned distribution fidelty and  often only work on low-resolution datasets.
%=======================
%DP why not perhaps here
%=======================

%However, it has been shown that trained and published models can reproduce training data during sampling.
%\citet{somepalli2023diffusion} and \citet{dar2024unconditional} have shown that diffusion models can reproduce training samples, and \citet{carlini2023extracting} demonstrated how faces can be retrieved from training data.
%This raises serious privacy concerns and that demand mitigation strategies~\citep{ren2024unveiling}.
%Other generative models are explicitly trained to memorize training samples~\citep{cong2020gan}.
%In a medical setting, it remains uncertain whether merging segments from various images truly poses privacy risks.


Additionally, not reproducing unique features has important implications for the fairness of generated data, which have not yet been discussed in the literature.
Training diffusion models on private datasets while ensuring unique features are not reproduced results in models that ignore these unique characteristics.
This contradicts the objective of generating a fair dataset, where such unique features should be reproduced.
To resolve the trade-off between privacy and fairness, models must learn to generalize. 
To approach this problem, we can conceptually divide the set of all images into two distinct categories: training set members and non-members.
After generating a dataset using a diffusion model, every image falls into one of four categories:
(1) \textbf{Lost images}: not in the training set, unavailable, potentially affecting downstream tasks but posing no privacy or fairness concerns.
(2) \textbf{Memorized images}: reproduced from the private training set, raising privacy issues that require safeguards.
(3) \textbf{Forgotten images}: training images not generated, potentially causing fairness issues if certain subgroups are omitted.
(4) \textbf{Generalized samples}: the ideal case, where the model generates data reflecting the underlying distribution.
%We illustrate this in Fig. \ref{fig:abstract}.

To investigate what the model learns, we propose to use synthetic anatomical fingerprints (SAFs).
These fingerprints can be directly controlled through synthetic manipulations of the training dataset and reliably detected in synthetic datasets.
We measure the probability of generating these fingerprints using a novel indicator metric $t'$. % (see Sec. \ref{sec:estimation_method}). 
Thus, our main contributions are:
\begin{itemize}
\setlength{\itemsep}{0pt}
\parskip0pt
    \item We formulate a realistic scenario where unconditional generative models face privacy and fairness problems due to the potential reproduction of training samples.
    \item We provide a formal approach to determine the maximum probability of producing sensitive data, from which we derive a computable indicator metric.
    \item We define a framework that quantifies and investigates privacy and fairness issues, enabling architectural decisions to create truly generalizing and fair generative models.
\end{itemize}

% --------------------------------------------------------------------------
% 2. RELATED WORK AND BACKGROUND
% --------------------------------------------------------------------------
\section{Background}
\label{Sec:background}
\noindent\textbf{Diffusion Models}, such as those of \citet{rombach2022high}, model different levels of perturbation $p_{\sigma}(\tilde{\bfx}) \coloneqq \int \pd(\bfx)p_{\sigma}(\tilde{\bfx} \mid \bfx)\ud\bfx$ of the real data distribution using a noising function defined by $p_{\sigma}(\tilde{\bfx} \mid \bfx) \coloneqq \mathcal{N}(\tilde{\bfx}; \bfx, \sigma^2 \bfI)$.
Here, $\sigma$ defines the strength of the perturbation, split into $N$ steps $\sigma_{1}, \dots, \sigma_{N}$.
The assumption is that $\sigma_1$ is small enough that $p_{\sigma_1}(\tilde{\bfx}) \approx \pd(\tilde{\bfx})$ and $\sigma_N$ is large enough that $p_{\sigma_N}(\tilde{\bfx}) \approx \mathcal{N}(\tilde{\bfx}; \bfzero, \sigma_N^2\bfI)$.
We define the optimization as a score matching objective by training a model $\bfs_{\bftheta}(\bfx, \sigma)$ to predict the score function $\nabla_\bfx \log p_{\sigma}(\bfx)$ for the noise level $\sigma \in \{\sigma_i\}_{i=1}^{N}$.
For sampling, this process can be reversed, for example, using Markov chain Monte Carlo methods following~\citet{song2019generative}.
\citet{song2020score} extended this to a continuous formulation by redefining the diffusion process as a process governed by an SDE and training a dense model to predict the score function.
In the continuous formulation of the noising process, $p_t(\bfx)$ denotes the marginal distribution of $\bfx(t)$, and $p_{st}(\bfx(t) \mid \bfx(s))$ denotes the transition kernel from $\bfx(s)$ to $\bfx(t)$, where $0 \leq s < t \leq T$.
\citet{anderson1982reverse} showed that the reverse of this diffusion process is also a diffusion process.
Finally, \citet{song2020score} show that the reverse diffusion process of the SDE can be modeled as a deterministic process, as the marginal probabilities can be expressed deterministically in terms of the score function.
As a result, the problem simplifies to an ODE, which can be solved using any black-box numerical solver, such as the explicit Runge-Kutta method.
This enables exact likelihood computation, commonly used to estimate the likelihood of generating a sample, \emph{e.g.}, images~\citep{song2020score}.
However, we propose $t'$, a more general indicator that extends this idea to approximate the likelihood of generating all samples that lead to privacy problems.

\noindent\textbf{Privacy:}
To formalize and contextualize our approach, we borrow the definitions of \emph{extractable memorization} and \emph{discoverable memorization} from the natural language processing domain~\citep{nasr2023scalable,carlini2021extracting} and apply them to generative image models.
Given a model $\bfs$ with a generation routine $\mathrm{Gen}$, an example $\bfxp$ from the training set $D$ is \emph{extractably memorized} if an adversary (without access to $D$) can construct a conditioning $\mathbf{c}$ that makes the model produce $\bfxp$ (\emph{i.e.}, $\mathrm{Gen}(\mathbf{c}) = \bfxp$).

We also adopt and extend the definition of \emph{discoverable memorization} from~\citet{nasr2023scalable} and~\citet{carlini2021extracting} to image models:
For a model $\bfs$ with generation routine $\mathrm{Gen}$, an example $\bfxp \in D$, and a perturbation function from the generative model's training $p_{\sigma}(\tilde{\bfx} \mid \bfx)$, $\bfxp$ is \emph{discoverably memorized} if $\mathrm{Gen}(\tilde{\bfx}_p, \sigma) = \bfxp$, where $\tilde{\bfx}_p \sim p_{\sigma}(\tilde{\bfx} \mid \bfxp)$ is the perturbed version of $\bfxp$.
The strength of the perturbation function directly influences how discoverable the training images are.
Our proposed indicator $t'$ measures the susceptibility of models to discoverable memorization.
It can be compared to the privacy budget in differential privacy~\citep{dockhorn2022differentially}; however, unlike differential privacy methods -- which only work on low-resolution images -- the post-hoc nature of our approach preserves image quality.

\noindent\textbf{Fairness:}
Fairness in AI is a well-explored yet unsolved problem.
Current directions in the literature for discriminative tasks suggest frameworks for benchmarking~\citep{jin2024fairmedfmfairnessbenchmarkingmedical} or reveal important design choices for training fair models.
Generative models are used to improve fairness~\citep{ktena2024generative}, but their own biases and unfairness remain underexplored.
It is often assumed that generative models learn the entire data distribution without further evaluation.

\section{Method}
\label{sec:definitions}
\begin{figure}
    \centering
    \includegraphics[width=0.6\linewidth]{figures/abstract.pdf}
    \caption{Types of learning for generative models. Memorization raises privacy issues, and forgetting raises fairness issues.}
    \label{fig:abstract}
\end{figure}

To ensure clarity, we first define the key terms used throughout our paper.
In Fig.~\ref{fig:abstract}, we visualize the key terms introduced in Sec.~\ref{sec:introduction}.


\noindent\textbf{Privacy}: Sharing synthetic data poses a problem if an adversary, without access to any images from the training dataset but with prior knowledge, can extract an image from the synthetic dataset and recognize that it is memorized. An example of this is shown in Sec. \ref{sec:realworldcxr}.

\noindent\textbf{Memorization}: Memorization refers to the pixel-wise reproduction of training images. We distinguish between the memorization of full images and parts of images. To check for partial memorization, we define fingerprints a priori. Instead of computing a pixel-wise error, we use a classifier~\clf{} that is trained to be robust against perturbations. The second type is full-image memorization, where the model reproduces the entire image.

\noindent\textbf{Identity}: This refers to case-dependent sensitive information that may be leaked if the model is shared. Examples include full images in the case of ChestX-ray14~\citep{wang2017chestxray} or CelebA-HQ~\citep{karras2018progressive}. In other cases, such as the Stable Diffusion experiment in Sec.~\ref{sec:stable_diffusion}, identity refers to the actual identity of a person, independent of image context or background.

\noindent\textbf{Fingerprint}:
An image feature unique to a specific person or image in the dataset. 
Examples include distinctive diseases, objects, entire faces, bone structures, or any feature linked to an individual’s identity.
Similar to real fingerprints, their mere presence is not inherently problematic unless there is additional information that enables identification.

\noindent\textbf{Violation}: This occurs when revealing the identity is implied by the reproduction of a fingerprint. Formally, this means that the presence of a fingerprint implies the presence of the identity.

Using artificially generated fingerprints (SAFs), we investigate privacy and fairness issues simultaneously.
Privacy issues arise if we detect cases of memorization.
Fairness issues arise if SAFs are absent in synthetic datasets.
Our goal is to achieve generalization, where SAFs are reproduced in images with different identities.
To investigate when models start to generalize, we artificially inject detectable objects into the training data, \emph{i.e.}, SAFs.
We then train one classifier to detect these objects and another to identify the image's identity used as the target for injection.
A non-privacy-violating and fair model would reproduce the SAF on a synthetic image with a different identity than the training image containing the fingerprint.
%\noindent\textbf{Synthetic Anatomic Fingerprints (SAF)}
%\label{sec:saf_augmentation}
To establish SAFs, 
we synthetically augment a single sample $\bfxp$ from the dataset $D$.
We do this to simplify the training of classifiers used to detect SAFs.
In practice, this can be any feature that appears only once in the entire training dataset, such as a ring, a deformation, or a specific medical device.
%The SAF sample $\bfxp$ is defined as a randomly drawn real sample augmented with the fingerprint in a fixed location.
To investigate generalizability, we experiment with using either a constant gray circle, realistic fingerprints generated through image interpolation, or real features extracted based on labels as shown in Appx.~\ref{sec:app_pipandfeat}.

To assess whether the trained model raises privacy concerns, we define an adversarial attacker aware of the SAF who can train a model to detect it.
We refer to this classifier as~\clf.
A second classifier, \cid, is trained independently of the fingerprint on the unaugmented dataset using a one-versus-all approach to classify the image's identity.
The attacker does not have access to this classifier.
Its purpose is to determine if the presence of the fingerprint implies the identity of the image.

This setup allows us to disentangle the memorization of the SAF from the memorization of $\bfxp$, distinguishing generalization from memorization.
To track the number of memorized samples, we define $|q|$ as the number of synthetic samples where both classifiers have a positive outcome.

\noindent\textbf{Memorization Indicator $t'$:}
\label{sec:memorization_indicator}
It is possible to compute the likelihood of the exact sample (\emph{e.g.}, using numerical NLL estimation), but this does not ensure that images in the immediate neighborhood are free from privacy issues.
To address this, we propose estimating the upper bound of the likelihood of reproducing samples from the entire subspace belonging to the class of private samples.

Let $p_s(\bfxp)$ define the likelihood of the unconditional model $\bfs$ reproducing the private sample $\bfxp$ at test time.
This is insufficient because it does not account for slightly noisy versions of $\bfxp$, which can also pose privacy concerns.
We aim to compute $q(p)$, defined as the likelihood of reproducing any sample within $\Omega_p$.
Here, $\Omega_p$ represents the region in image space that is similar enough to $\bfxp$ to raise privacy concerns according to $q$.

In the supplementary material, we show that $q(p)$ can be approximated and bounded from above as
\begin{equation}
\begin{split}
   q(p) & = \int_{\Omega_{p}} p_s(\bfx)\ud\bfx \approx \int_0^{t'} p_s(\bfx_{t,p}) \ud t  \leq \sum_{i=0}^{t'} (\sigma_{t_{i+1}} - \sigma_{t_i}) \sup_{t \in\left[t_i, t_{i+1}\right]} \mbb{E}_{p(\bfx_{t,p})}\big[p(\bfx'_{t,p})\big].
\end{split}
\end{equation}

To estimate $q(p)$, we observe that it depends only on the likelihood $p(\bfxp')$ and $t'$, which captures the entire region of $\Omega_p$.
$\bfxp'$ is the sample predicted by the diffusion model after applying $t$ forward diffusion steps to the private sample $\bfxp$.
This synthetic $\bfxp'$ then serves as input to the classifiers.
$\Omega_p$ is defined as the region where \cid~and \clf~both give positive predictions.
Since $q(p)$ depends on this region only through its size, $t'$ serves as an indicator of how unlikely it is to generate critical samples from the model, without the need to compute the exact value of $p(\bfxp')$.

We provide a pseudo-algorithm for the computation in Appx. \ref{sec:estimation_alg}.
Given $\bfxp$, we define $q_M(p \mid \bfx_{t,p})$ as the estimate of a sample belonging to $\Omega_p$ for a given diffusion step $t$.
We then define $t' \coloneqq \max(\mathbb{T})$, where $\mathbb{T} \coloneqq \{ t \colon q_M(p \mid \bfx_{t,p}) > 0 \}$.
The parameter $M$ allows us to trade off accuracy for computation time by choosing the number of generated samples.

\noindent\textbf{Intuition:} We model the image space using the learned distribution of the score function $\nabla_{\tilde{\bfx}} \log p_{\sigma_i}(\tilde{\bfx} \mid \bfx)$ by reversing the diffusion process and checking when the model starts to ``break out'' by generating images classified as different samples. 
For large $t$, the learned marginals $p(\bfx, t)$ span the entire image space. Importantly, by definition of the diffusion process, the distribution approaches the sampling distribution of the diffusion process if $\sigma_t$ gets large enough, \emph{i.e.}, $p_{\sigma_N}(\tilde{\bfx} \mid \bfxp) \approx \mathcal{N}(\tilde{\bfx}; \bfzero, \sigma_N^2\bfI)$. However, for lower $t$, the model has learned that the distribution collapses towards a single training image $\bfxp$. Essentially, it has modeled part of the subspace as a delta distribution around $\bfxp$.
We want to estimate how far back in the diffusion process we have to go for the model to start to produce different images. 
The boundary $\Omega_p$ is defined as all images that would collapse towards this training image, estimated using the classifiers. 
Fig. \ref{fig:illustrationofmethodin1D} illustrates this process in one dimension. 
The indicator $t'$ is then the strength of the perturbation function according to the definition of discoverable memorization introduced in Sec.~\ref{Sec:background}.
Note that this is different from simply defining a variance that is large enough for the classifiers to fail, as $s_{\theta}(\bfxp, \sigma_t)$ was trained to revert this noise.
Fig. \ref{fig:reversediffusion} illustrates how this looks in image space.


\noindent\textbf{Computational Overhead:}
Our proposed method computes $t'$ through forward passes of the diffusion model, making its computational cost equivalent to that of image sampling. 
The hyperparameter $M$ determines the trade-off between the accuracy of $t'$ and computational overhead, scaling linearly with $M$. For instance, with $M=16$, ensuring privacy for an image requires 16 times the computational cost of generating a single sample. 


\section{Experiments}
We consider the size of the training dataset, the time for training, and model size as the three most impactful factors determining a model's fairness and memorization capabilities.

\noindent\textbf{Dataset:}
For our initial experiments, we use an a~priori chosen selection of modalities from MedMNISTv2 \citep{DBLP:journals/corr/abs-2110-14795}.
For our main experiments, we use ChestX-ray14~\citep{wang2017chestxray}, a dataset of 112,120 frontal chest X-rays widely studied in privacy research~\citep{packhauser2022deep}.
For additional experiments on training length and number of fingerprints per dataset, we use three datasets with diverse modalities and sizes.
Specifically, we report results on \bci~\citep{Liu_2022_CVPR} and \odir~(\url{https://odir2019.grand-challenge.org/dataset/}).
Data is split (60/20/20), with diffusion models sharing training data with classifiers.

\noindent\textbf{Metrics:}
To evaluate generative quality, we report the Fréchet inception distance (FID).
To gauge memorization, we compute the peak signal-to-noise ratio (PSNR) between 1000 synthetic samples and all images of the training sets, reporting the maximum value.
To quantify privacy, we compute $t'$.
To quantify fairness, we compute the number $|$\clfp$|$ of positive predictions.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{figures/epochs.pdf}
    \caption{Impact of training length. To assess model memorization, we investigate PSNR and \clfp. The dashed line in the bottom row indicates the results of a fair model (SAFs appear equally often in training and synthetic datasets).}
    \label{fig:epochs}
\end{figure}

\noindent\textbf{Toy Dataset:}
We start by training \cid,~\clf~and diffusion models on MedMNIST. 
Training details and comprehensive quantitative results can be found in Appx. \ref{sec:toy_datasets}.
Next, we compute $t'$ by evaluating $q_M(p|x_{t,p})$.
Examples of model input and output as well as a visualization of the evaluation process are shown in Fig.~\ref{fig:reversediffusion} and Fig.~\ref{fig:tdashmedmnistall}.
In general, we observe that $t'$ is higher if the number of reproduced memorized samples is high.
The cut-off value at which the model stops reproducing the SAF seems to be around $t' = 0.7$.

\begin{figure}
      \centering
      \includegraphics[trim={1.5cm 0 1.5cm 0}, clip, width=1.\linewidth]{figures/supplements/mnistall.pdf}
    \caption{Likelihood of producing $\bfxp$ at sampling time as a function of $t$ for $M = 16$.\DIFdelbeginFL %DIFDELCMD < \vspace{-0.5cm}%%%
\DIFdelendFL }
    \label{fig:tdashmedmnistall}
\end{figure}

\noindent\textbf{Training Length:}
To establish the generalizability of our approach, we extend our experiments to real datasets (\cxr, \bci, \odir).
First, we experiment with the training length.
We compute FID at different training lengths and select the best model based on the lowest value.
For this we use PII~\citep{tan2021detecting} as our SAFs.
The results are shown in Fig.~\ref{fig:epochs}.
%
For \cxr, overtraining results in higher PSNR and FID values.
The SAF itself is never reproduced.
\bci~has a few positive predictions during earlier epochs, which turn out to be false positives, likely caused by lower sample quality during early training.
The best epoch, based on FID, does not reproduce the fingerprint at all.
The high PSNR is caused by samples with large empty regions.
\odir~shows a high number of reproduced SAF and a high PSNR, indicating memorization.
Positive predictions confirm that all are copies of the training sample.
Overall, models either memorize the SAF, leading to privacy issues, or show limited sample diversity, resulting in unfair outcomes.

We repeat this experiment with the ideal training lengths and train diffusion models with different types of fingerprints to assess generalizability to real fingerprints.
Results are provided in Appx.~\ref{sec:app_training_length}.
None of the \cxr~and \bci~models reproduce the SAFs, indicating that none of the models are fair.
The results for the three different SAFs differ slightly when examining \clfp~and $t'$.
Generative models struggle with synthetic features like the circle, while PII shows intermediate retention due to its inpainting method.

\noindent\textbf{Dataset Size:}
To analyze the impact of dataset size, we use a latent diffusion model for its higher efficiency and sample quality, leveraging the VQ-VAE from Stable Diffusion 2.0~\citep{rombach2022high}.
Details on training parameters are in Appx.~\ref{sec:app_training_details}.
The results are shown in Tab.~\ref{tab:datasetsizecxr}.
Interestingly, FID values are negatively correlated with dataset size, likely due to the model's ability to memorize training data, which produces realistic images.
Inspecting these images reveals that all are memorized.
For smaller datasets, the training image can often be retrieved from the sampled images, as indicated by the high $t'$ values.
Notably, while sampling with $|N_D| = 7001$ does not generate SAFs, the high $t' = 0.72$ suggests privacy risks.
For larger datasets, $t'$ is lower, indicating better privacy.
In all cases, the value of $|q|$ deviates from the expected one, highlighting fairness issues.

\begin{table}
    \centering
    \begin{minipage}[t]{0.58\textwidth} % Adjust width as needed
        \caption{Quantitative results on CXR data using two backbones: OD (out-of-domain) Inception and ID (in-domain) models for CXR~\citep{Cohen2022xrv}. Larger datasets reduce memorization risk, quantified by $t'$.}
        \resizebox{\textwidth}{!}{%
        \begin{tabular}{crccccccccc}
        \toprule
        &  & \multicolumn{2}{c}{Classification} & \multicolumn{2}{c}{OD (Inception)} & \multicolumn{2}{c}{ID (CXR)} & \multicolumn{3}{c}{Privacy} \\
        \cmidrule(lr){3-4}\cmidrule(lr){5-6}\cmidrule(lr){7-8}\cmidrule(lr){9-11}
         & $|N_D|$ & SAF (\%) & ID (\%) & $\text{FID}_{\text{train}}$ & $\text{FID}_{\text{test}}$ & $\text{FID}_{\text{train}}$ & $\text{FID}_{\text{test}}$ & $\mbb{E}(|q|)$ & $|q|$ & $t'$ \\
         \midrule
        \multirow{6}{*}{\rotatebox[origin=c]{90}{Chestxray14}} & 875  & \multirow{6}{*}{\rotatebox[origin=c]{90}{100.00}} & \multirow{6}{*}{\rotatebox[origin=c]{90}{100.00}}
                  & 15.1 & 30.3  &  1.0 &  2.3 & 34.3 & 47 & 0.75  \\
        & 1750  &&& 12.3 & 29.3  &  1.0 &  2.4 & 17.1 & 5 & 0.86  \\
        & 3500  &&& 13.6 & 32.0  &  1.2 &  2.6 & 8.6 & 1 & 0.67  \\
        & 7001  &&& 18.8 & 38.4  &  1.6 &  3.0 & 4.3 & 0 & 0.72  \\
        & 14003 &&& 22.1 & 41.4  &  1.9 &  3.3 & 2.1 & 0 & 0.66  \\
        & 28007 &&& 19.9 & 39.4  &  2.1 &  3.4 & 1.1 & 0 & 0.60  \\
            \bottomrule
        \end{tabular}%
        }
        \label{tab:datasetsizecxr}
    \end{minipage}
    \hfill
    \begin{minipage}[t]{0.40\textwidth} % Adjust width as needed
            \caption{Privacy metrics for different model sizes. See Appx.~\ref{sec:model_size} for the different hyperparameters we use for the diffusion model.\DIFdelbeginFL %DIFDELCMD < \vspace{0.07cm}%%%
\DIFdelendFL }
        \resizebox{\textwidth}{!}{%
        \begin{tabular}{lcccc}
        \toprule
        & \# Trainable parameters & $|q|$ & FID & $t'$ \\
        \midrule
        Default & \num{113675524} & 5 & 32.7 & 0.77 \\
        Model 1 & \num{77364740}  & 9 & 33.0 & 0.74 \\
        Model 2 & \num{71439108}  & 0 & 33.9 & 0.69 \\
        Model 3 & \num{49558020}  & 1 & 34.8 & 0.69 \\
        Model 4 & \num{28484612}  & 0 & 78.7 & 0.66 \\
        Model 5 & \num{28448388}  & 0 & 43.6 & 0.64 \\
        \bottomrule
        \end{tabular}%
        }
        \label{tab:modelsize_metrics}
    \end{minipage}
\end{table}
\iffalse
\begin{table}[t]
    \centering
    \caption{Quantitative results on CXR data using two backbones: OD (out-of-domain) Inception and ID (in-domain) models for CXR~\citep{Cohen2022xrv}. Larger datasets reduce memorization risk, quantified by $t'$.}
    \label{tab:datasetsizecxr}
    \begin{tabular}{crccccccccc}
    \toprule
    &  & \multicolumn{2}{c}{Classification} & \multicolumn{2}{c}{OD (Inception)} & \multicolumn{2}{c}{ID (CXR)} & \multicolumn{3}{c}{Privacy} \\
    \cmidrule(lr){3-4}\cmidrule(lr){5-6}\cmidrule(lr){7-8}\cmidrule(lr){9-11}
 & $|N_D|$ & SAF (\%) & ID (\%) & $\text{FID}_{\text{train}}$ & $\text{FID}_{\text{test}}$ & $\text{FID}_{\text{train}}$ & $\text{FID}_{\text{test}}$ & $\mbb{E}(|q|)$ & $|q|$ & $t'$ \\
 \midrule
\multirow{6}{*}{\rotatebox[origin=c]{90}{Chestxray14}} & 875  & \multirow{6}{*}{\rotatebox[origin=c]{90}{100.00}} & \multirow{6}{*}{\rotatebox[origin=c]{90}{100.00}}
          & 15.1 & 30.3  &  1.0 &  2.3 & 34.3 & 47 & 0.75  \\
& 1750  &&& 12.3 & 29.3  &  1.0 &  2.4 & 17.1 & 5 & 0.86  \\
& 3500  &&& 13.6 & 32.0  &  1.2 &  2.6 & 8.6 & 1 & 0.67  \\
& 7001  &&& 18.8 & 38.4  &  1.6 &  3.0 & 4.3 & 0 & 0.72  \\
& 14003 &&& 22.1 & 41.4  &  1.9 &  3.3 & 2.1 & 0 & 0.66  \\
& 28007 &&& 19.9 & 39.4  &  2.1 &  3.4 & 1.1 & 0 & 0.60  \\
    \bottomrule
    \end{tabular}
\end{table}

\begin{table}[t]
    \centering
    \caption{Privacy metrics for different model sizes. See Appx.~\ref{sec:model_size} for details.}
    \begin{tabular}{lcccc}
    \toprule
    & \# Trainable parameters & $|q|$ & FID & $t'$ \\
    \midrule
    Default & \num{113675524} & 5 & 32.7 & 0.77 \\
    Model 1 & \num{77364740}  & 9 & 33.0 & 0.74 \\
    Model 2 & \num{71439108}  & 0 & 33.9 & 0.69 \\
    Model 3 & \num{49558020}  & 1 & 34.8 & 0.69 \\
    Model 4 & \num{28484612}  & 0 & 78.7 & 0.66 \\
    Model 5 & \num{28448388}  & 0 & 43.6 & 0.64 \\
    \bottomrule
    \end{tabular}
    \label{tab:modelsize_metrics}
\end{table}
\fi
\noindent\textbf{Model size:}
Ideally, we want a model that avoids memorizing the fingerprint without requiring more training images.
To address this, we investigate how model architecture size influences memorization by training models with fewer parameters on $|N_D| = 1770$ images.
Details about the architecture are in Appx.~\ref{sec:model_size}.
The results are shown in Tab.~\ref{tab:modelsize_metrics}.
Smaller models stop reproducing the image, as indicated by the decreasing $t'$, but this reduces quality, as shown by the worse FID values.
Manual inspection reveals that fingerprints are either memorized or forgotten across all models.

\noindent\textbf{Real-world example:}
\label{sec:realworldcxr}
\begin{figure}[t]
    \centering
    \fbox{
    \includegraphics[width=0.99\linewidth]{figures/privacyattack.pdf}
    }
    \caption{Extracting memorized samples from a trained diffusion model. The attacker learns that a ring is in the patient's image, uses this information, and filters generated samples until reproducing the training image.\DIFdelbeginFL %DIFDELCMD < \vspace{-0.5cm}%%%
\DIFdelendFL }
    \label{fig:findring}
\end{figure}
Next we want to highlight the severity of this problem by presenting a realistic scenario where an attacker leverages prior knowledge to extract private data from diffusion models.
We assume the attacker knows that a patient has swallowed, \emph{e.g.}, a ring, a scenario observed in our dataset.
The attacker, without access to $\bfxp$, could train a classifier to detect objects other than soft tissue and use it to filter sampled images, as shown in Fig.~\ref{fig:findring}.
To test this, we manually label the presence of support devices (\emph{e.g.}, pacemakers, tubes) in 5000 images from~\citep{wang2017chestxray} and train \clf~to detect them.
We generate a synthetic dataset using a diffusion model trained on a set without support devices except for one image containing a ring to simulate accidental inclusion.
The resulting model has a high $t' = 0.82$.
Using~\clf, we filter 342 sampled images, most of which are false positives due to the difficulty of detecting unspecific support devices in medical images.
Manual inspection confirms that only memorized training samples contain rings.
This demonstrates how an attacker could use knowledge of a ring's presence in the training dataset to reconstruct the CXR image.

\noindent\textbf{Achieving Fairness:}
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/figtdash.pdf}
    \caption{Number of fingerprints in the synthetic dataset and $t'$ relative to $N_f$. The grey line marks when models begin reproducing fingerprints.
    %We manually verify generalization for \cxr ~and \bci.
    \DIFdelbeginFL %DIFDELCMD < \vspace{-0.5cm}
%DIFDELCMD <     %%%
\DIFdelendFL }
    \label{fig:multiple_fingerprints}
\end{figure}
To address the observation that all models raise fairness or privacy issues, we investigate the number of images $N_f$ containing the fingerprint as a key factor for generalization.
The results are shown in Fig.~\ref{fig:multiple_fingerprints}.
For \cxr, even with three fingerprinted samples, the model has a slight chance of reproducing the fingerprint, with one successful run out of three.
At nine samples, the model begins reproducing the fingerprint more often, but the proportion remains low.
Only one in 50,000 synthetic samples contains the fingerprint, even though 0.1\% of the training data does.
Generalization begins after roughly three samples.
\bci~generalizes after about four fingerprinted images, despite having a smaller training size.
For \odir, the small dataset size causes memorization, showing that training diffusion models on small datasets does not enable generalization.


\noindent\textbf{Limitations:}
%The indicator $t'$ has high variance, especially for large values, due to the stochasticity involved in sampling $q_M(p|x_{t,p})$.
%Results with $t'$ close to 1 are hard to compare, and the absolute value is difficult to interpret.
%Additionally, while our framework provides privacy and fairness guarantees at a sample level, it does not guarantee these properties for entire datasets.
Because the indicator $t'$ depends on the inherent stochasticity of sampling $q_M(p \mid x_{t,p})$, it may exhibit notable variance for larger values. As a result, outcomes with $t'$ close to 1 can be more challenging to compare and interpret. Moreover, although our framework provides robust privacy and fairness assurances at the sample level, extending these protections to entire datasets remains an open avenue for future work.

\section{Conclusion}
We described scenarios where training generative models on personally identifiable image data can lead to training data leaks.
Using our framework, we successfully investigated common diffusion model design parameters. 
Our work reveals that, regardless of design choices, models are either not privacy-preserving or raise fairness issues by forgetting important long-tail information. Rare diseases will either be forgotten or memorized by the diffusion model.
This must be considered when designing models for data sharing and can be quantified using $t'$.
We also observe that increasing the range of unique features in a dataset fosters improved generalization.
The only alternative for avoiding memorization is to increase the size of the training dataset. However, in such cases, fairness should be carefully assessed, particularly concerning long-tail data.

\midlacknowledgments{
This work was supported by the High-Tech-Agenda Bavaria. HPC resources were provided by the Erlangen National High Performance Computing Center (NHR@FAU) of the Friedrich-Alexander-Universit\"at Erlangen-N\"urnberg (FAU) under the NHR project b143dc and b180dc. NHR funding is provided by federal and Bavarian state authorities. NHR@FAU hardware is partially funded by the German Research Foundation (DFG) – 440719683. Support was also received by the ERC - projects MIA-NORMAL 101083647 as well as DFG 513220538, 512819079 and DFG large scale infrastructure funding Art 91b GG.
}

\bibliography{midl25_025}

\appendix

\clearpage

\section{Derivation of Estimation Indicator \texorpdfstring{$t'$}{t'}}
\label{sec:estimation_method}
\citet{song2020score} show that the reverse diffusion process of the SDE can be modeled as a deterministic process as the marginal probabilities can be modeled deterministically in terms of the score function. As a result, the problem of learning transition kernels simplifies to an ODE: 
%
\begin{align}
    \ud \bfx = \Big[\bff(\bfx, t) - \frac{1}{2} g(t)^2\nabla_\bfx \log p_t(\bfx)\Big] \ud t, \label{eqn:deterministic_flow}
\end{align}
%
Solving Eqn. \ref{eqn:deterministic_flow} enables exact likelihood computation. 
However, this does not account for the fact that images in the immediate neighborhood, like slightly noisy versions of $\bfxp$, are not anonymous. 
Consequently, we are interested in computing $q(p)$, which is defined as the likelihood of reproducing any sample within $\Omega_p$, which is the region of the image space that is similar enough to $\bfxp$ that it raises privacy concerns: 
%
\begin{align}
   q(p) = \int_{\Omega_p} p_s(\bfx)\ud\bfx \label{eq:pofp}.%\approx \mbb{E}_{\bfx(0)} 
\end{align}
%
We determine this region by training a classifier tasked with detecting whether the image belongs to the image class \clf.
To search through the image manifold, we make use of the reverse diffusion process centered around the SAF image $\bfxp$ defined as $p_{t,b}  \coloneqq p(\bfx_t \mid \bfxp) =  \mathcal{N}(\tilde{\bfx}; \bfxp, \sigma_t^2 \bfI)$ for $\bfx(s)$ to $\bfx(t)$, where $0 \leq t \leq T$. 
We can employ the diffusion process centered around this image to sample from the neighborhood and then use the learned reverse diffusion process to generate noisy samples $\bfx_{t,p}$. Then we can use this as starting image for the reverse diffusion process to sample $\bfx_{t,p}'$:
%
\begin{equation}
\begin{split}
   q(p) & = \int_{\Omega_{p}} p_s(\bfx)\ud\bfx \approx \int_0^{t'} p_s(\bfx_{t,p}) \ud\bft = \int_0^{t'}\mbb{E}_{p(\bfx_{t,p})}\big[    p(\bfx'_{t,p})\big]\ud\bft .
\end{split}
\end{equation}
%
Technically, we could employ exact likelihood computation to estimate $q(p)$ but this would require integrating over the continuous image-conditioned diffusion process, which would be intractable in practice. Therefore, we propose to approach and estimate this integral by computing the Riemann sum of this integral and give an upper bound estimate for it using the upper Darboux sum: 
%
\begin{align}
\begin{split}
\int_0^{t'}\mbb{E}_{p(\bfx_{t,p})}\big[    p(\bfx'_{t,p})\big]\ud\bft &\approx \\
\sum_{t} (\sigma_t - \sigma_{t-1}) \mbb{E}_{p(\bfx_{t,p})}\big[p(\bfx'_{t,p})\big] \leq \sum_{i=0}^{t'} \sup_{t \in\left[t_i, t_{i+1}\right]} (\sigma_{t_{i+1}} - \sigma_{{t_i}})\mbb{E}_{p(\bfx_{t,p})}\big[p(\bfx'_{t,p})\big], \label{eq:full_equation_estimate} 
\end{split}
\end{align}
%
which approaches the real value for steps that are small enough. We can compute this value by using $\bfxp$ as a query image and estimating the expectation by performing Monte-Carlo sampling but this would be computationally infeasible due to the complexity of exact likelihood estimation.

\begin{figure*}
      \centering
      \fbox{
      \includegraphics[width=0.95\linewidth]{figures/method_sketch.png}
      }
    \caption{Illustration of our estimation method in 1D. The grey line denotes the query image $\bfxp$. The estimation method iteratively increases the search space in the latent space of the generative model. The green area corresponds to image regions resulting in non-privacy concerning generated samples, while the red area is considered critical. }
    \label{fig:illustrationofmethodin1D}
\end{figure*}

\begin{figure}
	\centering
	\fbox{
		\includegraphics[width=0.95\linewidth]{figures/reverse_diffusion.png}
	}
	\caption{Illustration of the reverse diffusion process. Left shows query images  $\bfx_{t,p}$ for $t \in \left[0, 0.7\right]$. Right shows the resulting sample.}
	\label{fig:reversediffusion}
 %\vspace{-5mm}
\end{figure}


\section{Training Details}
\label{sec:app_training_details}

The classifiers are randomly initialized ResNet50 \citep{he2016deep} architectures. 
To maximize robustness, we employ AugMix \citep{hendrycks2020augmix}, and in the case of \cid, we inject random Gaussian noise into the training images to increase the robustness towards possible artifacts from the diffusion process. 
Furthermore, we randomly mask out patches of the same shape as the SAF to reduce the effect of SAF on the prediction.  % TODO supps images
Robustness is crucial for these classifiers. Even if models have a 99.9\% accuracy on the test set, they produce a non-negligible number of false predictions on a dataset with 50000 synthetic images. Therefore, we carefully run several training sessions over different hyperparameter settings. 
Due to the simplicity of this detection and the self-supervised learning scheme, all of the classifiers trained to detect synthetic fingerprints reach an accuracy of 100\% on the test set.
%
\begin{figure}
      \centering
      \fbox{
      \includegraphics[width=0.95\linewidth]{figures/supplements/training_images.png}
      }
    \caption{Training image samples for~\clf~and~\cid}
    \label{fig:supp_training_images}
\end{figure}
%
To further elaborate on the training details of~\clf{} and~\cid, we show training samples for both classifiers in Fig.~\ref{fig:supp_training_images}.
Since both tasks are fairly easy binary classification tasks, we employed strong augmentation techniques to ensure that positively predicted samples from the classifiers are SAFs.
We balanced the classification task for~\cid{} by adding SAFs to 50\% of the training images. For validation, we reduce this to 10\% to remain closer to the expected distribution.
For~\cid{} we chose circular masking as training augmentation because we expected it might be necessary to mask out the SAF from the positive predictions of~\clf.
However, closer inspection of the predictions showed this was unnecessary (compare Fig.~\ref{fig:false_positives}). Another reason is that we do not want to confuse the model at inference time by showing it SAFs which are not part of the training data of~\cid.
The probability of $\bfxp$ appearing in the training dataset of~\cid{} is set to 10\% during training and 50\% during validation.

\noindent\textbf{Diffusion Model Training:}
The custom diffusion model architecture for experiments on MedMNIST is based on the open-source implementation of a 2D U-Net\footnote{\url{https://github.com/huggingface/diffusers}}. Due to the $28 \times 28$ input images, we are forced to use only the three outermost downsampling and upsampling layers. 
Training the diffusion model on the toy datasets is done on a single A100 GPU and takes roughly eleven hours. 
For the real datasets, we either employ diffusion models in image or in latent space, both on $64 \times 64$ pixel images. 
A key difference between score-based models and diffusion models is that diffusion models use a discrete noise schedule instead. 
Switching to this discrete schedule is not a problem due to the reasonably small discretization error \citep{su2023dual}.
The classifiers are trained until convergence with a validation error patience of 20 epochs, which takes less than one hour. 
Exhaustive search for $t'$, which is done by computing $q_{M=16}(p|x_{t,p})$ for all $t \in \{0, \dots, 1\}$, takes four hours.


\section{Details on Dataset Size}
\label{sec:app_dataset_size}
For this experiment, the diffusion models are trained for 150000 steps, at which point the FID seems to saturate. For sampling, we use 100 steps. Training takes roughly 9 hours on a node with 8 Nvidia A100 GPUs. For every model, we sample 30000 images, which takes roughly 5 hours on a single GPU.

\section{Model Size}
\label{sec:model_size}
\begin{figure}
    \centering
    \includegraphics[width=0.3\textwidth]{figures/modelsize.pdf}
    \caption{Relationship between model size, FID, and $t'$. The size of the marker shows the size of the model.}
    \label{fig:modelsize_blobs}
\end{figure}
\begin{table}
        \caption{Model architecture for the unconditional U-Net used as backbone for the diffusion model. The standard value for the number of channels is $c=128$.}
        \centering
        \begin{tabular}{lccccc}
        & \# Trainable params  & Down blocks &Channels / layer &  Layers / block \\

          \toprule
        \textbf{Default}  & \num{113675524}& 6 & c,c,2c,2c,4c,4c & 2\\
        \midrule
        Model 1           & \num{77364740} & 6 & c,c,2c,2c,4c,4c  &   1\\
        Model 2           & \num{71439108} & 5 & c,c,2c,2c,4c,4c  &   2\\
        Model 3           & \num{49558020} & 5 & c,c,2c,2c,4c,4c  &   1\\
        Model 4           & \num{28484612} & 4 & c,c,2c,2c,4c,4c  &   2\\
        Model 5           & \num{28448388} & 6 & c/2,c/2,c,c,2c,2c&   2\\
        \bottomrule
        \end{tabular}
    \label{tab:model_size}
\end{table}

In Tab. \ref{tab:model_size} we summarize the different hyperparameters used to define the backbone architecture of the diffusion model. 





\section{Method Overview}
\label{sec:app_training_length}
\label{sec:app_pipandfeat}
\begin{figure}
\includegraphics[width=\textwidth]{figures/mainfig.png}
\caption{Overview of our experimental setup. The figure shows the three datasets we are evaluating on (\cxr, \odir, and \bci). To model different deviations from the rest of the training dataset we distinguish between three different kinds of fingerprints: grey circles, PII inpainting~\cite{tan2021detecting}, and unique image-level features. From top to bottom we show three samples of: The input dataset, the circle experiment, the PII experiment, and the feature experiment. The feature experiment works by leveraging image-level labels such as \emph{female}, \emph{left eye} or \emph{ihc staining}} \label{fig:mainfig}
\end{figure}

The main idea of our method is to artificially inject fingerprints into the diffusion model's training set. 
This means that we augment $N_f = 1$ training images with one of the fingerprint types described below. 
Then, given a robust classifier~\clf{} trained to detect this fingerprint, we can check if this fingerprint is reproduced in the synthesized dataset. 
Synthetic anatomic fingerprints track the memorization capabilities of unconditional diffusion models. 
They use visually dominant grey circles, which are easily detectable by the human eye or classifiers. 
The problem with this is that it remains unclear whether this high visual discrepancy influences  generation abilities.  
Firstly, \emph{Circular fingerprints} are used. They have a fixed radius of 9 (in a 64 by 64 pixel image) and are situated at a specific spot within the area typically occupied by content. 
They are trivial to spot, which makes their automated detection easy and reliable.
To analyze visually more realistic fingerprints, we create \emph{PII fingerprints} using Poisson image interpolation (PII) 
which is known to create features realistic enough to be used for anomaly detection \cite{tan2021detecting}. 
In the case of \cxr ~we picked a source image for the interpolation such that the area contains a medical support device and inpaint at a location that does not contain this kind of device to make sure that no real samples contain similar features. 
Finally, we experiment with using image-level labels as \emph{feature fingerprints}. For \cxr\ we use sex, for \bci\ the staining type, and for \odir\ the physical side of the eye. 
Since we rely on a robust boundary for the detection of the features, we additionally apply PII to all images that have a different image label. Therefore, we can use the same classifier we use for PII detection to detect these features. Essentially, this means that we add a synthetic fingerprint for a real feature. 

\noindent\textbf{Generalizability to Other Fingerprints:}
\begin{table}
\caption{Memorization results for all three datasets. Results are averaged over three different runs. All \cxr\ and \bci\ runs result in fairness issues due to the complete lack of reproducing the SAFs.}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccccccccccc}
\toprule                                                                                                   
                        &&     & PSNR &     &&     &\clfp &     &&     & $t'$ &     \\
\multicolumn{1}{l}{}    && Circle & PII  & Feature && Circle & PII & Feature && Circle & PII & Feature \\
\cmidrule(lr){3-5}\cmidrule(lr){7-9}\cmidrule(lr){11-13}
\cxr &&$24.76$ & $25.06$ & $24.54$ &&$ 0.00$  & $0.00$  & $0.00$  && $0.14$ & $0.25$ & $0.43$ \\ 
\bci &&$46.41$ & $46.48$ & $46.01$ &&$ 0.00$  & $0.00$  & $0.00$  && $0.10$ & $0.26$ & $0.25$ \\  
\odir &&$46.93$ & $46.18$ & $47.14$ &&$ 51.33$ & $61.67$ & $39.33$ && $0.76$ & $0.77$ & $0.78$ \\ 
\bottomrule
\end{tabular}%
}
\label{tab:mem}
\end{table}
%
%
To check whether the results presented in Fig.~\ref{fig:epochs} are reproducible, we use the ideal training lengths and train the diffusion models with the different types of SAFs. 
Results are given in Tab. \ref{tab:mem}. 
None of the \cxr\ and \bci\ models reproduce the SAFs which means that none of the models are fair.
Moreover, we can see that we can measure this by computing $t'$.
The results for the three different inpainters seem to differ slightly when looking at \clfp{} and $t'$.
The circle, which is purely synthetic, seems to be easily forgotten by the models that do not memorize as indicated by the low value for $t'$.
For PII the value is close to that of the \emph{feature fingerprint}.
The value for $t'$ of PII is much lower than that of the feature fingerprint.
To recap, the difference is that PII inpaints the PII feature on an inlier (male), whereas the feature fingerprint works by inpainting the PII fingerprint on an image with a different image label (female).
The variance in stature, leading to low-frequency features being more readily identifiable by diffusion models at a higher noise level, may explain the significantly higher value of $t'$ observed in this context. From now on, we restrict our experiments to \emph{feature fingerprints}.

\label{sec:filter}
\begin{figure}
    \centering
    \fbox{
    \includegraphics[width=0.95\linewidth]{figures/model_abstract.pdf}
    }
    \caption{Illustration of the filtering process to come up with the number of memorized images $|q|$. The lock symbol stands for keeping the model frozen and performing inference or, in the case of the diffusion models, sampling. The lightning indicates privacy issues. Green borders indicate that it is safe to share the images due to the lack of any identifiable information about it. Red borders indicate that sharing these images would pose a privacy risk.}
    \label{fig:model_abstract_filtering}
\end{figure}
\noindent\textbf{Filtering:} In order to determine whether or not an unconditional diffusion model reproduces samples at sampling time, we perform the filtering process as illustrated in Fig.~\ref{fig:model_abstract_filtering}. The first step is training an unconditional diffusion model on a dataset, where one image contains a known fingerprint (prior information). 
\clf~is trained to find the fingerprint while~\cid\ is trained to determine the identity of the image without the fingerprint and, hence, if the image contains personal information. 
Finally, we sample the unconditional model and filter all generated images to only consist of images that contain the fingerprint and have the same identity as the training images. If any images fulfill these criteria, then we have a privacy issue, and the model should not be shared.

\section{Estimation of Tightness of Bound}
\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth]{figures/supplements/rebuttalupperbound.png}
    \caption{The accuracy of our estimate on a synthetic two-dimensional example. The red dot is the query image $\textbf{x}_p$ and the circle is the boundary $\Omega_p$. Left: Monte Carlo estimation method. Middle: Idealised visualization of our proposed estimation method. Right: Accuracy of both estimation methods.}
    \label{fig:accuracyofbound}
\end{figure}
We show how Monte-Carlo sampling compares to our approach in a two-dimensional synthetic scenario with a bimodal Gaussian distribution shown in Fig. \ref{fig:accuracyofbound}. The sampling procedure of our method is shown in the middle.
We use an idealized scenario for sampling by choosing values on the exact circle around the query image. Importantly, we see that our proposed method successfully works as an upper bound for the real probability, whereas Monte-Carlo sampling underestimates the real probability at first. Additionally, we see that the estimate is close to the real value and would give a reasonably good estimate from only 32 samples.


\section{Estimation Algorithm}
\label{sec:estimation_alg}


%\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
\begin{algorithm2e}
\caption{Upper bound likelihood estimation algorithm}
\label{alg:CKB}
\KwIn{$M$, $s_\theta(\bfx, t)$, $c_{f}(\bfx)$, $c_{ID}(\bfx)$, $\bfxp$}
\KwOut{$t'$}

\For{$t \leftarrow 1$ \KwTo $0$}{
    \For{$m \leftarrow 1$ \KwTo $M$}{
        $\bfx_{t,p} \gets p(\bfx_t \mid \bfx_p)$\;
        \For{$\tilde{t} \leftarrow t$ \KwTo $0$}{
            $\bfx_{t,p}' \gets s_\theta(\bfx_{t,p}', \tilde{t})$\;
        }
        $\bfxp' \gets \bfx_{t,p}'$\;
        \If{$c_{f}(\bfxp')$ is \textbf{True} \textbf{and} $c_{ID}(\bfxp')$ is \textbf{True}}{
            \Return $t$\;
        }
    }
}
\end{algorithm2e}





In Alg. \ref{alg:CKB} we describe our proposed algorithm to compute the indicator $t'$. 
To do an exhaustive search we set the step size to be the same as the sampling step size, start from the maximum value, and go to the minimum value. 
Since this computation takes too long to be feasible, we experiment with increased step sizes. 
To improve the computation time even further it is straightforward to change the algorithm to a binary search version or to increase the sampling step size. 

\section{Experiments on Toy Datasets}
\label{sec:toy_datasets}
We report detailed results for several MedMNIST datasets in Tab. \ref{tab:SAFTrainingResults}. We 
compute the Fr\'echet inception distance (FID) on the train and the test data to see if we can identify memorization issues. 
Most datasets show a large gap between these two values, which could partially be explained by memorization. 
But the size of this gap gives no information about the memorization capabilities and can be explained by other factors. PneumoniaMNIST has a larger drop in performance than RetinaMNIST but exhibits barely any memorized samples. 
For the smaller datasets we observe that the model only learned to reproduce samples.
In the context of data-sharing this would mean that the model is essentially a way of compressing training data and sharing it would raise major privacy issues. 
This results in a high value for $t'$ and several reproduced private images $\bfxp$.% resulting in severe privacy issues if these models were shared.   

Next, we compute $t'$. Detailed results for the computation of $t'$ are shown in Fig.~\ref{fig:tdashmedmnistall}.
The trained generative models exhibit a behavior of starting a slow decline in the probability of reproducing training samples for an increasing $t$. 
The end of the decline can be estimated by computing $t'$. 
For the three larger datasets we observe that the images are not reproduced. 
The values for $t'$ are low, indicating that the probability of reproducing samples is negligible. 
We demonstrate this by sampling the models without retrieving any training samples ($|q| = 0$). 
Overall, we conclude that $t'$ nicely captures the memorization capabilities of the models. 
%We show this by naively sampling the models and not being able to retrieve the training sample ($|q| = 0$). 

We confirm our observations by artificially reducing $|N_D|$ on PathMNIST in Sec. \ref{sec:pathmnist}, similar to the experiment performed in Tab. \ref{tab:datasetsizecxr}. 
Our evaluations show that the turning point seems to be $|N_D| = 5000$ images, where smaller dataset sizes mean that models only learn to memorize, but larger datasets learn to generalize.  
The combined prediction $q \coloneqq $\cid$^+ \cap $\clf$^+$ is only positive for the smallest dataset which means that we did not observe a single image, where the SAF was reproduced but the identity not preserved. 
In other words, this means that every image containing the fingerprint is a direct copy of the training image, which implies exposure of the identity.
%
\begin{table}
  \caption{Training results for different MedMNIST datasets. We report test accuracy for the SAF classifier, but only training accuracy for the ID classifier, as identification only makes sense if the sample was part of the training set.}
  \centering
      \label{tab:SAFTrainingResults}
  \begin{tabular}{lrccccccc}
      %\label{tab:SAFTrainingResults}
      \toprule
      \multicolumn{2}{c}{Description} & \multicolumn{2}{c}{SAF classification} & \multicolumn{5}{c}{Data synthesis} \\
       \cmidrule(r){1-2}\cmidrule(r){3-4}\cmidrule(r){5-9}
        Dataset  &$|N_D|$ & SAF (\%)& ID (\%) & $\text{FID}_{train}$ & $\text{FID}_{test}$ & $\mbb{E}(|q|)$ & $|q|$ & $t'$ \\
       \midrule
        BreastMNIST    & 546    &100   & 98.7        & 9.2   & 62.6  & 91.6 & 57 & 0.886\\
        RetinaMNIST    & 1080   & 100  & 99.6      & 5.9   & 19.7  & 46.3 & 52 & 0.998\\
        PneumoniaMNIST & 4708   &100   & 99.8 & 9.5   & 28.4  & 10.6 & 2 & 0.718\\
        BloodMNIST     & 11959  &100   & 99.5     & 9.3   & 11.0   & 4.2 & 0 & 0.241\\
        OrganSMNIST    & 13940  &99.47 & 99.8 & 19.6  & 19.7  & 3.6 & 0 & 0.582\\
        ChestMNIST     & 78468  &99.93 & 99.8  & 3.3   & 3.9   & 0.6 & 0 & 0.206\\
     \bottomrule
  \end{tabular}
\end{table}
%
%
%
\label{sec:pathmnist}
%To investigate the influence of the size of the dataset.
We keep the number of training steps fixed at 30000 steps because we observed that this is the length it takes the model to learn to reproduce samples for the smallest subset. After training we sample 150000 images for every model and measure the probability of reproducing our sample at test time. 
We do this by defining the null-hypothesis $H_{0}$ that the probability of sampling $\bfxp$ is equal to $1/N_{D}$. Hypothesis $H_{1}$ claims that the probability is lower. Therefore, we sample 150000 images for every trained model with dataset size $|N_{D}| \in \left\{ 1000, 5000, 10000, 20000, 50000 \right\}$. The results are shown in Tab.~\ref{tab:datasetsizetable}.
It can be seen that the model only learned to reproduce samples with the SAF when the dataset size was comparably low.
For $|N_{D}| = 1000$ the model was surprisingly close to the expected value, indicating that the size of the data is too small relative to the available parameter space and the model memorizes them as a discrete distribution of $1000$ unrelated images. 
Every other model produces very few positive predictions from the classifier all of which turn out to be false positives. 

The combined prediction $q$ is only positive for the smallest dataset. 
All the larger models do not have any positive samples in their dataset. The p-value for this is smaller than 5\% in all cases, meaning that we can reject the null-hypothesis and assume that the probability of $\bfxp$ is smaller.
Next, we look at the samples of different sizes. Initial observations suggest that image quality drops for medium-sized datasets. 
However, upon closer inspection we see that the smallest model simply learns to reproduce training data, which can be seen by the fact that some images appear multiple times. 
This confirms our observation that the model learned the training distribution in the form of a discrete set of 1000 images but never learned to generalize. 
In the context of data-sharing this would mean that the model is essentially a way of saving and retrieving training data but sharing it would raise major privacy issues.
The model trained on 5000 images seems to lie in between generalizing and memorizing the learned distribution but the size of the dataset was not large enough to learn a meaningful representation. The results indicate that the model learned low-frequency information such as color or larger structure, but the images are lacking detail (compare Fig.~\ref{fig:datasetsizeexperiment}). 

\subsection{MAE of Memorized Training Samples}
\begin{figure}
      \centering
      \includegraphics[width=0.95\linewidth]{figures/supplements/mae.png}
    \caption{The figure shows a grid-wise comparison of absolute pixel error between the training image $\bfxp$ and two sampled images $\bfxp'$ that raise privacy concerns (left) and the mean squared error (MSE) for an increasing number of training steps (right). $|N_D|$ is set to 1000. The samples on the left are from the model trained for 17000 steps.}
    \label{fig:suppmae}
\end{figure}

Our pipeline unveiled that training the score-based generative model for a long time on a small dataset leads to reproducing images at sampling time. We show this by applying our classification pipeline and filtering out all negative samples to get $q$. Fig. \ref{fig:suppmae} shows how much these samples are memorized. As can be seen, the sampled images $\bfxp'$ are barely distinguishable from the training image $\bfxp$. Interestingly, the mean squared error (MSE) between these images goes down rapidly but seems to stagnate after 19000 steps, at which point the reconstruction does not improve much, despite the observed higher memorization probability $q$. 
This suggests that overfitting occurs not only in the last reverse diffusion steps but also for higher $t$.  



\subsection{Dataset Size}
\label{sec:datasetsize}
\begin{figure}[ht!]
    \centering
        \centering
        \includegraphics[width=\linewidth]{figures/likelihood_t_dash.png}
        \caption{\label{fig:likelihood_t_dash}Likelihood of producing $\bfxp$ at sampling time as a function of $t$ for $t \in \{0, \dots, 0.5\}$ and $M = 16$. We stop plotting probabilities after $t'$. Due to the high observed probabilities of the $|N_D| = 1000$ model, we also compute and plot the probabilities for higher $t$.}

\end{figure}

\begin{table}[ht!]
    \centering
        \centering
         \caption{\label{tab:datasetsizetable}Number of positive predictions of the classifiers for models trained on different dataset sizes on 150000 images. All models use the same classifiers.}

        %\resizebox{\linewidth}{!}{
            \begin{tabular}{llllll}
                \toprule
                $|N_{D}|$ & 1000 & 5000 & 10000 & 20000 & 50000 \\
                \midrule
                $\mbb{E}\left[|q|\right]$ & 150 & 30 & 15 & 7.5 & 3 \\
                \midrule
                $|$\clf$^+|$ & 151 & 0 & 0 & 1 & 1 \\
                $|$\cid$^+|$ & 151 & 0 & 3 & 3 & 4 \\
                $|q|$ & 151 & 0 & 0 & 0 & 0 \\
                \bottomrule
            \end{tabular}
        %}

\end{table}

The trained classifiers only produce up to five false positives for 150000 generated images. The false positives for all $|N_{D}|$ are shown in Fig.~\ref{fig:false_positives}. Both misclassified samples from~\clf\ show great resemblance to the SAF by consisting of a circular monochrome patch. The misclassified identification samples are really similar in terms of texture, color, and structure, although the differences to $\bfxp$ are distinct. None of the \cid$^+$ would lead to clear privacy issues in practice, which we successfully capture by computing $|q| = 0$ for these three models. 

\begin{figure}
      \centering
      \fbox{
      \includegraphics[width=0.95\linewidth]{figures/supplements/falsepositives.png}
      }
    \caption{All false positive predictions on MedMNISTv2 from the 750000 generated images. All misclassified images by one classifier were filtered and correctly classified by the other classifier.}
    \label{fig:false_positives}
\end{figure}

\begin{figure}
      \centering
      \fbox{
      \includegraphics[width=0.95\linewidth]{figures/dataset_size_experiment.png}
      }
      \caption{Representative samples from trained models on different dataset sizes $|N_{D}|$.}
      \label{fig:datasetsizeexperiment}
\end{figure}
Fig. \ref{fig:datasetsizeexperiment} shows visual results for the same diffusion model on different dataset sizes. As shown in Fig. \ref{fig:likelihood_t_dash}, the first model memorizes the samples, while the last model learns the underlying distribution and generalizes. This is nicely captured by $t'$. 


\subsection{Training Length}
\label{sec:apdx_trainlength}
\begin{figure}
      \centering
      \includegraphics[width=0.95\linewidth]{figures/epochs_vs_probability.png}
    \caption{Influence of training length on generative and memorization properties. A positively classified sample can be seen in the top-left corner of the rightmost image.}
    \label{fig:epoch_vs_probablity}
\end{figure}


We experiment with the influence of the training length on $|q|$ by sampling 10000 images from a model trained on $|N_{D}| = 1000$ and show the results in Fig.~\ref{fig:epoch_vs_probablity}.
For the first 14000 steps, the model only learns low-frequency attributes of the data. The visual quality is low and, therefore, so is the probability of reproducing $\bfxp$. Around 20000 steps, the quality of the generated samples improves visually, but so does the number of memorized training samples. At this point, the model already starts to accurately reproduce $\bfxp$ at sampling time. Every detected sample is visually indistinguishable from the training image. The MAE even goes down to $1 \times 10^{-4}$. Based on these observations, we continue our investigations for MedMNISTv2 with a fixed training length of 30000 steps. 



\section{Results on Stable Diffusion}
\label{sec:stable_diffusion}
\begin{figure}[ht!]
    \centering
    \includegraphics[width=\linewidth]{figures/supplements/rebuttalsd.png}
    \caption{The problem of memorization: Conditional diffusion models memorize training data. Left: Training and generation examples for both models. SDv1~\citep{rombach2022high} reproduces training samples (reproduced from \citet{carlini2023extracting}). Version 2 of the same model no longer exhibits this problem. Right: We show that using our proposed method we can measure this.}
    \label{fig:stablediffusion}
\end{figure}

This section reproduces the privacy problems of Stable Diffusion v1.4~\citep{rombach2022high} which were first discovered by~\cite{carlini2023extracting}. We prompt a text conditional model on a name  and see that it reproduces the training image at sampling time in one out of sixteen cases. Interestingly, we did not observe this for Stable Diffusion v2.0, which is a fine-tuned version of the same model. 
Using our proposed method, we can measure this. To this end, we train a single classifier on re-identification of the image by using 500 randomly selected images of the same person generated by Stable Diffusion v2.0. 
The results are shown in Fig. \ref{fig:stablediffusion} and show that we can quantify this difference in memorization which underlines that our method is useful in practice and even can be applied to pre-trained models.
In general, we still observe that $t'$ correlates nicely with memorization; however, the values for $t'$ are much higher. 
Apparently, the strong textual conditioning in combination with the way we generate the input noise $\bfx_{t,p}$ has important implications that need further experimentation. 

\section{Extractable Memorization} 
%\subsubsection{Extractable Memorization}
\label{sec:extractable_memorization}
Next, we employ improved denoising diffusion probabilistic models \citep{nichol2021improved} on CelebA-HQ images \citep{karras2018progressive}.  
Fig. \ref{fig:celebtdash} shows the computation of $t'$ for three models trained for 40000, 60000, and 280000 steps.
The results show that $t'$ is generally lower for the 40000 steps model than after 60000 steps. 
To do a naive search, we generate 5000 images and use an SAF classifier and a sunglasses classifier to search for fingerprints in generated samples. 
The 40000 model did not reproduce the sample; however, the 60000 model reproduced the SAF once. 
Interestingly, the results match with the non-synthetic fingerprint, where a single image was reproduced at sampling time. 
However, all three models have high $t'$ values, indicating that sharing them could be privacy-concerning.
Notably, the model trained for 280000 steps did not reveal that it had memorized the training sample. 
Out of the 5000 samples, none were the sunglasses image. 
Interestingly, we observe something similar to mode-collapse as all of the images share similar visual properties (\emph{e.g.}, dark hair).


\begin{figure}[ht!]
    \centering
    \begin{minipage}{0.48\textwidth}  % Adjust the width as needed (e.g., 0.5 for side-by-side)
        \resizebox{\linewidth}{!}{
            \includegraphics[width=\linewidth]{figures/SAFvsSunglasses4060and280.png}
        }
        \caption{Computation of $t'$ for three different epochs on CelebA-HQ \citep{karras2018progressive}. We use the SAF as synthetic and sunglasses as non-synthetic fingerprint.}
        \label{fig:celebtdash}
    \end{minipage}\hfill
    \begin{minipage}{0.48\textwidth}  % Adjust the width as needed
        \centering
        \resizebox{\linewidth}{!}{
        \includegraphics[width=\linewidth]{figures/attacks.png}
        }
        \caption{Example showing a case of extractable memorization. 
        \label{fig:backdoor}
        }
    \end{minipage}
    %\caption{Scalability experiments to real world datasets.}
    \label{fig:both_figures}
     %\vspace{-5mm}
\end{figure}

The results confirm the observation that diffusion models reproduce training images at sampling time and that we can measure this by measuring $t'$ for all three models. 
We make similar observations on experiments conducted on Stable Diffusion v1.4~\citep{rombach2022high} following the descriptions by~\citet{carlini2023extracting}. 
However, since this model is a text-conditional model, it requires more experimentation.

Our work is related to backdoor learning attacks where adversaries use the trained model to inject images into the diffusion process to generate inappropriate images for \emph{extractable memorization} according to the definition introduced in Sec. \ref{Sec:background}. 
In our case, an inappropriate image would be a privacy breach. The attack works by exchanging the query image $\bfxp$ with a trigger image $\bfx_q$ that was not part of the training set and that shares visual similarities. 
In Figure~\ref{fig:backdoor}, we show that this attack can be used to increase the probability $q_M(p|x_{t,q})$ of $\bfxp$ being generated to 18.75\%.
Since $t'$ is related to the variance of the change of the image performed through the diffusion model, we can measure the model's susceptibility to this attack. Sharing the model would be safe if $t'$ is small enough so that the change of the variance is too small to change the images from the trigger to the target image. 
Additionally, we test if we can use $t'$ to infer information about the membership of an image in the training dataset. 
Fig. \ref{fig:backdoor} demonstrates notably lower $t'$ values for training set images than test set images.
Designing a backdoor attack also confirms our observations from Fig. \ref{fig:celebtdash} as all three models were susceptible to it and reproduced the training image, even the one where we observed mode collapse. This underlines the efficacy of our method, as this case might have been overlooked when using a naive search.


\end{document}
