\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{booktabs}
\usepackage{array}
\usepackage{multirow}

\usepackage{rotating}
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
% \usepackage{color}
% \renewcommand\UrlFont{\color{blue}\rmfamily}
% \urlstyle{rm}

\newcolumntype{L}[1]{>{\raggedright\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}
\newcolumntype{C}[1]{>{\centering\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}
\newcolumntype{R}[1]{>{\raggedleft\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}


\jmlrvolume{-- nnn}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[A Pragmatic Note on Evaluating Generative Models]{A Pragmatic Note on Evaluating Generative Models with Fréchet Inception Distance for Retinal Image Synthesis}


\midlauthor{\Name{Yuli Wu\nametag{$^{1,3}$}} \orcid{0000-0002-6216-4911} \Email{mail@wuyuli.com}\\
\Name{Fucheng Liu\nametag{$^{1}$}} \orcid{0009-0002-0299-2448} \Email{fucheng.liu@rwth-aachen.de}\\
\Name{R\"uveyda Yilmaz\nametag{$^{1}$}} \orcid{0009-0007-7351-698X} \Email{rueveyda.yilmaz@lfb.rwth-aachen.de}\\
\Name{Henning Konermann\nametag{$^{1}$}} \orcid{0009-0006-5736-5803} \Email{henning.konermann@lfb.rwth-aachen.de}\\
\Name{Peter Walter\nametag{$^{1,2}$}} \orcid{0000-0001-8745-6593} \Email{pwalter@ukaachen.de}\\
\Name{Johannes Stegmaier\nametag{$^{1,3}$}} \orcid{0000-0003-4072-3759} \Email{johannes.stegmaier@hhu.de}\\\newline
\addr $^{1}$ RWTH Aachen University, Aachen, Germany  \\
\addr $^{2}$ Uniklinik RWTH Aachen, Aachen, Germany \\
\addr $^{3}$ Heinrich Heine University D\"usseldorf, D\"usseldorf, Germany
}

\sloppy


\usepackage{etoolbox}
\makeatletter
\patchcmd{\@jmlrenddoc}{\label{jmlrend}\null}{\label{jmlrend}}{}{}
\makeatother

\begin{document}

\maketitle



\begin{abstract}
Fréchet Inception Distance (FID), computed with an ImageNet pretrained Inception-v3 network, is widely used as a state-of-the-art evaluation metric for generative models. It assumes that feature vectors from Inception-v3 follow a multivariate Gaussian distribution and calculates the 2-Wasserstein distance based on their means and covariances. While FID effectively measures how closely synthetic data match real data in many image synthesis tasks, the primary goal in biomedical generative models is often to enrich training datasets ideally with corresponding annotations. For this purpose, the gold standard for evaluating generative models is to incorporate synthetic data into downstream task training, such as classification and segmentation, to pragmatically assess its performance. In this paper, we examine cases from retinal imaging modalities, including color fundus photography and optical coherence tomography, where FID and its related metrics misalign with task-specific evaluation goals in classification and segmentation. We highlight the limitations of using various metrics, represented by FID and its variants, as evaluation criteria for these applications and address their potential caveats in broader biomedical imaging modalities and downstream tasks.
\end{abstract}

\begin{keywords}
FID, Generative models, Retinal imaging.
\end{keywords}


\section{Introduction}\label{sec:intro}

Deep generative models, particularly generative adversarial networks (GANs) \cite{goodfellow2014generative}, by adversarial training of the generator and discriminator, and diffusion models \cite{ho2020denoising}, by iteratively refining noise into structured images, have demonstrated significant potential in 2D and 3D biomedical image synthesis, as shown in studies such as \citet{eschweiler2024denoising,han2020breaking,muller2023multimodal,wu2024retinal,yilmaz2024annotated,yilmaz2024cascaded}. 
By learning from real biomedical data, these models can generate realistic synthetic images, helping to address challenges like limited data availability and data privacy concerns. 
Correspondingly, various studies have focused on generating fully annotated images to support downstream task training, such as classification and segmentation \cite{park2019semantic,zhang2023adding}. This can be achieved by conditioning generative models on different types of input, such as text, images, or sketches. Furthermore, several approaches control the diffusion process using structured guidance, where segmentation maps are used to steer image synthesis \cite{eschweiler2024denoising,park2019semantic,yilmaz2024annotated,zhang2023adding}. The resulting image-annotation pairs are then pragmatically used for training downstream tasks.
However, ensuring that synthetic images are both realistic and useful for downstream tasks remains a critical challenge \cite{theis2016note,konz2024rethinking}. 


{In ophthalmic imaging particularly, generative models have enabled explainable classification with GAN-based styles \cite{lang2021explaining}, lesion-map conditioned diabetic retinopathy fundus generation using GANs \cite{hou2023high}, diffusion-based fundus image generation \cite{muller2023multimodal}, OCT image--label generation for layer segmentation \cite{wu2024retinal,huang2024diverse}, counterfactual retinal synthesis \cite{ilanchezian2025development}, and controllable fundus synthesis via disentangled representations \cite{muller2025disentangling}.}


{A key open question is whether \textit{feature-distance} metrics (see \autoref{sec:metric}), which quantify distributional similarity in the embedding space of a fixed pretrained model, are reliable proxies for the practical utility of synthetic data in downstream training. In this work, we study data enrichment and ask: (i) do different feature-distance metrics yield consistent rankings of generative model variants, as quantified by Kendall’s $\tau$ rank correlation; and (ii) do these rankings align with downstream performance in classification and segmentation when synthetic samples are added to the training set? To answer these questions, we evaluate multiple generators on retinal fundus and OCT synthesis, compute seven representative metrics spanning different distances and feature extractors, and compare their rank orderings with downstream task performance on held-out test data.}

\begin{figure}[t]
    \centering
    \includegraphics[width=.99\linewidth]{pipeline.pdf}
    \caption{Overview of the generative model pipeline for data enrichment exemplified with retinal images, including generation, evaluation, and downstream task prediction. The discrepancy between generative evaluation metrics and downstream performance is investigated.}
    \label{fig:pipeline}
\end{figure}

As shown in \figureref{fig:pipeline}, we illustrate a common scenario using generative models, demonstrated with two retinal image modalities for data enrichment. 
On one hand, a synthetic dataset is generated with optional annotations and evaluated using various metrics, primarily FID and its variants (\sectionref{sec:metric}). 
On the other hand, the composite dataset containing a mixture of real and synthetic data is applied to a downstream task, where the trained model predicts the desired outputs on the test set (\sectionref{sec:task}). 
In \sectionref{sec:result}, we demonstrate that widely used generative evaluation metrics fail to align with downstream performance and draw a pragmatic note on the unreliability of such \textit{feature-distance} metrics when assessing generative models for data enrichment in the context of a utilitarian downstream task.





\section{Generative Evaluation Metrics}\label{sec:metric}

\subsection{Fréchet Inception Distance}\label{sec:fid}

Fréchet Inception Distance (FID) \cite{frechet1957distance,heusel2017gans} is the de facto standard metric for assessing the perceptual quality of generated images. It compares the feature distributions of generated and real images using an ImageNet \cite{deng2009imagenet} pretrained Inception-v3 model \cite{szegedy2016rethinking} and the Fréchet distance (equivalent to the 2-Wasserstein distance \cite{vaserstein1969markov,peyre2019computational}), which measures the difference between two probability distributions: \( p_r \), the distribution of real data, and \( p_g \), the distribution of generated data. Assuming Gaussian distributions with means \( \mu_r, \mu_g \) and covariances \( \Sigma_r, \Sigma_g \), the squared Fréchet distance (FD) is derived as:
\begin{equation}\label{eq:fid}
    D_{\text{Fréchet}}^2(p_r, p_g) = \|\mu_r - \mu_g\|^2 + \text{tr}\left( \Sigma_r + \Sigma_g - 2 (\Sigma_r \Sigma_g)^{1/2} \right),
\end{equation}
where \( \|\mu_r - \mu_g\|^2 \) is the squared Euclidean distance between means, and \( \text{tr}(\cdot) \) is the trace of a matrix.
A lower FID score indicates that the generated images more closely resemble the real images in terms of perceptual similarity.
It is important to note that the closed-form expression in \equationref{eq:fid} is not limited to the multivariate Gaussian distribution, but also applies to any two distributions in $\mathbb{R}^n$ within the family of \textit{elliptically contoured distributions} \cite{dowson1982frechet,peyre2019computational}, expressed as: 
\begin{equation}\label{eq:ecd}
f({x}; \mu, A) = \mathrm{const.}\times \frac{1}{|A|^{1/2}} \, g \left( ({x} - {\mu})^\top A^{-1} ({x} - {\mu}) \right), 
\end{equation}
where \(A\) is a positive definite matrix and \(g(z)\) is a non-negative function defined on the positive real axis ($r$), satisfying $0 < \int_0^\infty r^{n/2-1} g(r)dr < \infty$, which characterizes the specific distribution (e.g., for the Gaussian distribution, \(g(z) = \exp(-z/2)\)).
% Therefore, merely verifying the non-Gaussianity of the feature distributions does not undermine the reliability of FID (cf. \citet{jayasumana2024rethinking}). 
Therefore, merely verifying the non-Gaussianity of the feature distributions does not undermine the reliability of FID; however, this does not extend to Gaussian mixtures, which are generally not elliptically bounded (cf. \citet{jayasumana2024rethinking}).

By substituting the feature extractor in the FID calculation with alternative models or applying different distance measures for feature comparison, we can explore alternative approaches to evaluate generative models.
We refer to this class of generative evaluation metrics as \textit{feature-distance} metrics and present variants of FID, categorized by different distance measures in \sectionref{sec:distance} and feature extractors in \sectionref{sec:feature}.


\subsection{Distance and {Divergence} Variants}\label{sec:distance}


Besides the Fréchet distance, Maximum Mean Discrepancy (MMD) \cite{gretton2006kernel} is another key distance metric for comparing probability distributions, commonly used to evaluate generative models. 
MMD is a non-parametric measure that compares the means of two distributions in a higher-dimensional feature space. The squared MMD is defined as:
\begin{equation}
D_\text{MMD}^2(p_r, p_g) = \mathbb{E}_{x, x' \sim p_r} [k(x, x')] + \mathbb{E}_{y, y' \sim p_g} [k(y, y')] - 2 \mathbb{E}_{x \sim p_r, y \sim p_g} [k(x, y)],
\end{equation}
where \( \phi(\cdot) \) is a feature map and \( k(x, y) = \langle \phi(x), \phi(y) \rangle \) is a kernel function, which can be, e.g., a rational quadratic kernel as in Kernel Inception Distance (KID) \cite{binkowski2018demystifying} or a Gaussian RBF kernel as in CLIP-MMD (CMMD) \cite{jayasumana2024rethinking}. Unlike FID, MMD does not assume a specific distribution form, making it more flexible and capable of handling a broader range of distributions.

Various studies have attempted to estimate feature distributions using Gaussian Mixture Models (GMMs), such as in the case of class-aware Fréchet distance (CAFD) \cite{cafd2018}. Similarly, Wasserstein-GMM (WaM) \cite{luzi2023evaluating} introduces an approximate 2-Wasserstein metric based on the fitted mixture of Gaussian (MoG) distributions for both real and generated data. Furthermore, Feature Likelihood Divergence (FLD) \cite{jiralerspong2023feature} estimates the Kullback-Leibler divergence (KLD) between learned MoG distributions, claiming to capture the novelty, fidelity, and diversity of generated samples. Although KLD is not a true distance due to its lack of symmetry and violation of the triangle inequality, it is still included in this section for the sake of completeness. 




\subsection{Feature Extractor Variants}\label{sec:feature}

In addition to Inception-v3 \cite{szegedy2016rethinking}, recent advancements in foundation models, such as the vision-language model CLIP \cite{radford2021learning} and the self-supervised vision foundation model DINOv2 \cite{oquab2024dinov}, provide more powerful and general alternatives for generative feature extraction. Moreover, Fréchet AutoEncoder Distance (FAED) \cite{buzuti2023frechet} leverages the latent features from a VQ-VAE \cite{van2017neural}.

As a special case of the feature extractor variants, it is natural to consider using a modality-specific model for feature extraction in biomedical generative evaluation. The expectation is that a modality-specific feature extractor would produce better features than a general model. However, as demonstrated in \cite{woodland2024feature}, pretraining on a radiology image dataset RadImageNet \cite{mei2022radimagenet} leads to a poorer correlation with human judgment of realisticity (without evaluating downstream performance) compared to a model pretrained on ImageNet \cite{deng2009imagenet}.
In this study, we adopt RETFound \cite{zhou2023foundation}, a foundation model pretrained on retinal images using masked autoencoders \cite{he2022masked}, as the feature extractor to assess the pragmatic reliability of modality-specific feature-distance metrics with respect to the downstream performance.


\subsection{Other Related Metrics}

Several other generative evaluation metrics complement FID and assess various aspects of model performance. 
The Peak Signal-to-Noise Ratio (PSNR) is a traditional metric that evaluates image quality based on pixel-level differences, though it does not account for perceptual aspects of image quality.
To address this, the Structural Similarity Index Measure (SSIM) \cite{wang2004image} considers luminance, contrast, and structure in its comparison, providing a more perceptually meaningful measure of similarity between generated and real images. Another widely used metric is the Inception Score (IS) \cite{salimans2016improved}, which, similarly to FID, leverages a pretrained Inception network \cite{szegedy2016rethinking} to assess the diversity and quality of generated images based on classification confidence, though it has been critiqued for not fully addressing issues like mode collapse. 
Unbiased FID \cite{chong2020effectively} introduces a bias-free metric
by extrapolating FID scores to an infinite sample set, and shows that Quasi-Monte Carlo integration improves the estimation of FID for finite samples.
Finally, the Clean FID \cite{parmar2022aliased} metric has been introduced to improve upon FID by addressing aliasing artifacts that can arise from low-level image quantization and resizing, enhancing its robustness and comparability.

\subsection{Demonstrated Metrics}
In this paper, we report seven diverse generative evaluation metrics, covering Fréchet distance (FID, Clean-FID, CLIP-FD, RETFound-FD), MMD (KID, CMMD), KLD (FLD) and feature extractors, including ImageNet pretrained Inception-v3 (FID, Clean-FID, KID), CLIP (CLIP-FD, CMMD), DINOv2 (FLD), RETFound (RETFound-FD). In our experiments, all metrics are calculated between the generated data and the unseen test data for the downstream task, except for FLD, which uses both the training and test sets. 
{An overview of the generative evaluation metrics discussed in this section is presented in \autoref{tab:appendix}.}


\section{Experiments}\label{sec:task}

In this study we selected StyleGAN3 \cite{goodfellow2014generative,karras2021alias} and two diffusion-based architectures, namely Medfusion \cite{rombach2022high,muller2023multimodal} and DDPM \cite{ho2020denoising,eschweiler2024denoising,wu2024retinal}, because GANs and diffusion models remain among the best performing and most widely used generative frameworks in both natural and biomedical imaging. These models therefore provide a representative test bed for evaluating generative metrics in realistic settings. For StyleGAN3, we obtain model variants by selecting checkpoints along training according to their validation FID, which reflects standard practice in GAN model development and mirrors how researchers typically identify high- and low-performing generations. For diffusion models, we follow the conventional procedure of varying the number of sampling steps $t$, which directly controls synthesis quality and produces systematically different generations without retraining the model. Together, these approaches yield diverse model checkpoints that differ in perceptual quality in a principled and reproducible manner, allowing us to examine how generative metrics behave across a spectrum of model performance.




\subsection{Color Fundus Photography}

\noindent{\textbf{Fundus Dataset.}} 
We utilize the AIROGS dataset \cite{de2023airogs}, which consists of approximately 101,000 color fundus images, labeled as \textit{no referable glaucoma} (NRG) and \textit{referable glaucoma} (RG). The dataset is split into training and test sets at an 80:20 ratio. Specifically, the training set contains 78,537 NRG and 2,616 RG images, while the test set contains 19,635 NRG and 654 RG images.

\noindent{\textbf{Fundus Image Synthesis.}} 
Two deep generative models are employed for realistic fundus image synthesis: the advanced generative adversarial network StyleGAN3 \cite{karras2021alias} and the medical image-specific latent diffusion model \cite{rombach2022high}, Medfusion \cite{muller2023multimodal}. StyleGAN3 is trained only on RG fundus images to generate new RG samples. To verify that the generated images are indeed RG, we trained a binary classifier (distinguishing RG from NRG) that achieves 93.2\% accuracy on the generated images. We then select ten checkpoints based on the decreasing FID against the full dataset (i.e., \texttt{fid50k\_full} in \citet{karras2021alias}), with FID values of \{194, 149, 118, 87, 57, 46, 37, 25, 16, 6\}, denoted as SG-1 to SG-10. Diffusion models offer a more convenient way to obtain generative models of varying qualities corresponding to the sampling steps \(t\) compared to GANs with different checkpoints. For Medfusion, we select seven models with varying \(t\) from \{5, 10, 15, 25, 75, 150, 250\}, denoted as MF-1 to MF-7. In both synthesis approaches, 75,921 RG fundus images are synthesized to supplement the imbalanced dataset. For metric evaluation, we use 6,000 synthetic images---enough to capture diversity without distorting FID due to low-rank covariance caused by outweighing the 654-image test set \cite{jayasumana2024rethinking,konz2024rethinking}. 
{Similarly, we performed an auxiliary sanity check of label consistency by training a lightweight classifier on real data and evaluating it on the synthetic fundus images generated by Medfusion, obtaining an RG classification accuracy of 94.1\%.}

\noindent{\textbf{Fundus Downstream Task.}} The downstream task involves binary classification with class imbalance, where the minor class (RG) is augmented with synthesized data. Two widely used architectures are adopted: ResNet-50 \cite{he2016deep} and Swin Transformer Tiny (Swin-T) \cite{liu2021swin}. The F1 score according to the referable glaucoma class is calculated to highlight the low recall for RG, with imbalanced baseline F1 scores of 64.57\% for ResNet-50 and 63.73\% for Swin-T (cf. \figureref{fig:plot2}).




\subsection{Optical Coherence Tomography}

\noindent{\textbf{OCT Dataset.}} 
We utilize the dataset from the MICCAI GOALS Challenge~\cite{fang2022dataset}, consisting of 100 pixel-wise labeled circumpapillary Optical Coherence Tomography (OCT) images, split 50:50 for training and test. Three layers are annotated on the OCT scans, namely the retinal nerve fiber layer (RNFL), the ganglion cell-inner plexiform layer (GCIPL), and the choroid layer (CL). 
{
We note that this OCT setting is a small-sample regime (50 test images), which can increase the instability of FID \cite{jayasumana2024rethinking}. This regime is nevertheless representative for many annotated biomedical segmentation datasets (e.g., \citet{fang2022dataset,wu2023gamma}), where data collection and pixel-wise annotation are expensive.  
}


\noindent{\textbf{OCT Image Synthesis.}} 
Following \citet{eschweiler2024denoising} and \citet{wu2024retinal}, we employ a denoising diffusion probabilistic model (DDPM) to generate realistic retinal OCT images with a sketch from a processed segmentation mask, which enables the generation of fully annotated synthetic data. Similarly, we select seven diffusion models according to increasing sampling steps $t$ of \{100, 150, 200, 250, 300, 350, 400\}, denoted as DM-1 to DM-7. Layer statistics from 50 real OCT images are applied as priors to generate sketches, producing 200 synthetic ones as the new training set. 
{We further performed an analogous sanity check for OCT by training a lightweight segmentation model on real data and evaluating it on synthetic OCT images with their generated layer labels. The Dice score on the generated samples is 77.9\%, indicating that the generated segmentation labels are broadly plausible but not fully consistent with the predictions of a real-data-trained model (cf. \citet{wu2024retinal}).}

\noindent{\textbf{OCT Downstream Task.}} 
We focus on the layer segmentation task using two well-performing architectures: U$^2$-Net \cite{qin2020u2} and TransUNet \cite{chen2024transunet}.
Following \citet{fang2022dataset}, Dice scores for the test set segmentations of three retinal layers are computed using weights of 0.4, 0.3, and 0.3 for RNFL, GCIPL, and CL, respectively.



\section{Results}\label{sec:result}

\subsection{Feature Sparsity and Entropy}\label{sec:sparsity}
We begin with analyzing the sparsity with approximated L0 norm and the entropy of feature vectors from generated OCT images in \figureref{fig:plot1}. 
At the interface between feature extraction and distance measurement, sparsity and entropy of the features help to understand low-level behaviors of the evaluation pipeline.
On the raw feature vectors, we count absolute values above a threshold 0.01 and compute the relative L0 norm with respect to dimensionality. 
{Across four feature extractors on two retinal imaging modalities,} the features from DINOv2 \cite{oquab2024dinov} show the lowest sparsity, while the ImageNet pretrained Inception-v3 \cite{deng2009imagenet,szegedy2016rethinking} yields the most sparse feature vectors. 


Moreover, entropy (in nats) is calculated on the probability vectors, derived by applying a sigmoid function and normalizing the feature vectors ($\mathrm{sum} = 1$).
Lower entropy suggests that the feature vectors carry less information, with more values concentrated in specific dimensions, as seen with Inception \cite{deng2009imagenet,szegedy2016rethinking}. 
In contrast, higher entropy indicates a more even distribution of information across dimensions, with CLIP \cite{radford2021learning} exhibiting the highest entropy among four models. 
These feature-level properties help illuminate why metrics based on these features may behave inconsistently across downstream tasks. 
{Since entropy is computed after normalization, in a high-dimensional space (up to $d=2048$) it concentrates near a model-specific mean for dense representations; consequently, different image sets can yield similar entropy values for the same extractor. 
However, entropy remains informative \emph{across} extractors, as different architectures and pretraining objectives induce distinct feature concentration patterns, which influence the behavior of feature-distance metrics. 
We therefore use entropy as diagnostic statistics of representation geometry, rather than as a measure of image complexity.}


\begin{figure}[t]
\floatconts
  {fig:plot}% label for the whole figure
  {\caption{{(a) Feature sparsity (approximated using the relative L0 norm) and entropy (in nats) for feature vectors extracted by different pretrained models. These statistics illustrate how differently each backbone represents synthetic images, which can influence the behavior of feature-distance metrics.} 
(b)--(d) Comparison of downstream performance with the reciprocal of FID. Note that the reciprocal of FID is used in this figure solely for visualization purposes. The standard FID values and all other evaluation metrics are reported elsewhere in this paper. The left vertical axes (blue) show downstream evaluation scores, and the right vertical axes (orange) show $1/\text{FID}$. In an \textit{ideal} scenario, both curves would exhibit similar trends, indicating agreement between the perceptual metric and the downstream task performance.}}% overall caption
  {%
    \subfigure[]{%
      \label{fig:plot1}%
      \includegraphics[height=4.6cm]{entropy_full.pdf}%
    }\hspace{1.2cm}
    \subfigure[]{%
      \label{fig:plot2}%
      \includegraphics[height=4.4cm]{plot2}
    }\\
    \subfigure[]{%
      \label{fig:plot3}%
      \includegraphics[height=4.4cm]{plot3}%
    }\hspace{1.2cm}
    \subfigure[]{%
      \label{fig:plot4}%
      \includegraphics[height=4.4cm]{plot4}%
    }%
  }%
\end{figure}

Although not a full explanation, sparsity and entropy patterns demonstrate fundamental differences among feature extractors that may contribute to misalignment. 
An interesting observation is that the domain knowledge encoded in the retinal foundation model RETFound \cite{zhou2023foundation} does not lead to feature representations with improved sparsity or entropy compared to the other pretrained extractors.

\begin{table}[t!]
    \centering
    \small
        \caption{Generative evaluation metrics for fundus image synthesis with StyleGAN3 (SG) \cite{karras2021alias} and Medfusion (MF) \cite{muller2023multimodal}. Kendall's $\tau$ coefficients and $p$-values are reported for each metric in relation to the mean F1 score of ResNet-50 \cite{he2016deep} and Swin-T \cite{liu2021swin}. Note that $\tau = 1$ in the worst case and $\tau = -1$ in the best case.\vspace{0.5em}}
    \label{tab:fundus}
    \begin{tabular}{L{3.7em} R{4.5em}R{4em}R{4em}C{6em} C{4em}C{4.2em}R{3em}}
    \toprule
      \multirow{2}{*}[-0.2em]{Models}   &  \multicolumn{4}{C{18.5em}}{Fréchet Distance} & \multirow{2}{*}[-0.2em]{KID}  & \multirow{2}{*}[-0.2em]{CMMD} & \multirow{2}{*}[-0.2em]{FLD} \\\cmidrule{2-5}
      & Inception  & Clean  & CLIP & \;RETFound & & & \\\midrule
       SG-1  &  175.99 & 173.94 & 53.19 &   41.77 & 0.2073 & 5.165 & 106.45 \\
       SG-2  & 121.59 & 121.15 & 34.47 &   61.02 & 0.1391& 2.980 & 92.67 \\
        SG-3    &   95.29 & 93.20 & 23.45  & 33.64  & 0.1009 & 1.975 & 80.68 \\
        SG-4 & 69.70 & 68.66 & 15.88 &   33.38 & 0.0662 & 1.563 & 73.42 \\
        SG-5 & 49.45 & 47.22 & 7.76 &   23.51 & 0.0402 & 0.986 &  60.07 \\
        SG-6 & 39.17 & 36.17 & 5.91 &    19.14 & 0.0277 & 0.736 & 47.66 \\
        SG-7 & 30.92 & 28.70 & 5.33 &    15.12 & 0.0151 & 0.619 & 35.01 \\
        SG-8 & 24.83 & 23.66 & 4.81 &    14.82 & 0.0111 & 0.489 & 31.29 \\
       SG-9  & 21.11 & 20.09 & 4.60 &    11.25 & 0.0089 &0.440 & 26.56 \\
       SG-10  & 17.30 & 16.69 & 4.02 &    \; 9.26 & 0.0063 & 0.421 &  21.76 \\\midrule
      $\tau_{\,\mathrm{Kendall}} $ & 0.69 & 0.69 & 0.69 & \; 0.64 & 0.69 & 0.69 & 0.69 \\
       $p_{\,\mathrm{Kendall}}$ & $\ast \ast$ \, & $\ast \ast$ \, & $\ast \ast$ \, & \; $\ast \ast$ & $\ast \ast$ & $\ast \ast$ & $\ast \ast$\,\, \\\midrule\midrule
       MF-1  & 148.82 & 145.51 & 37.41 &  50.05  & 0.1632 & 1.537  &  67.72 \\
       MF-2  & 105.81 & 102.56 & 17.66 &  41.65  & 0.1109 & 0.735 &  53.80 \\
       MF-3  & 91.28 & 88.43 & 12.53 & 38.87   & 0.0940 & 0.631 &  52.70 \\
       MF-4  & 80.00 & 77.60 & 9.51 &  36.42  & 0.0823 & 0.573 & 45.86  \\
       MF-5  & 68.91 & 67.91 & 6.52  & 34.40   & 0.0740 & 0.558 &  41.36 \\
       MF-6  & 67.76 & 67.07 & 6.01 & 34.60   & 0.0741  & 0.562 &   42.54\\
       MF-7  & 68.00 &67.55  & 5.94 &  35.20  & 0.0749 & 0.567 & 40.95  \\\midrule
     $\tau_{\,\mathrm{Kendall}}$ & -0.24 & -0.24 & -0.33 & -0.24 & -0.24 & -0.25 & -0.43 \\
       $p_{\,\mathrm{Kendall}}$ & n.s. & n.s. & n.s. & \; n.s. & \; n.s. & \;   n.s. & n.s. \\
       \bottomrule
    \end{tabular}
\end{table}





\subsection{Consistent Trends of Metrics}\label{sec:consist}
\tableref{tab:fundus} and \tableref{tab:ddpm} report seven generative evaluation metrics for two retinal modalities and downstream tasks across 24 models, including StyleGAN3 and two diffusion models. To assess the consistency between these metrics, we compute the Kendall’s $\tau$ coefficient for all metric pairs, resulting in 63 pairs (21 metric pairs across 3 generative approaches; see \figureref{fig:heatmap}). Almost all of these pairs, except one, exhibit a Kendall’s $\tau$ coefficient greater than 0.5, with the majority (78\%) showing a $\tau$ coefficient above 0.7, indicating a strong correlation among these \textit{feature-distance} generative evaluation metrics, even though the features are distinct in sparsity and entropy from different extractors (\sectionref{sec:sparsity}). 

As demonstrated in the upper left triangles in \figureref{fig:heatmap}, the strong internal agreement among feature-distance metrics highlights their redundancy: despite differences in feature extractors and distances, these metrics largely rank models identically, yet this ranking fails to align with downstream utility. This redundancy generalizes from FID to other generative metrics (\figureref{fig:plot}) and contextualizes why many proposed variants of FID offer limited practical improvement.
{
In addition, FID is known to be unreliable in small-sample regimes, and several variants have been proposed to mitigate these issues (e.g., CMMD \cite{jayasumana2024rethinking}). However, in our experiments these variants do not exhibit meaningful improvements over vanilla FID.
}


\begin{table}[t]
\small
    \caption{Generative evaluation metrics for optical coherence tomography synthesis with DDPM \cite{ho2020denoising}. Kendall's $\tau$ coefficients and $p$-values are reported for each metric in relation to the mean Dice score of U$^2$-Net \cite{qin2020u2} and TransUNet \cite{chen2024transunet}.\vspace{0.5em}}
    \label{tab:ddpm}
        \centering
    \begin{tabular}{L{3.7em} R{4.5em}R{4em}R{4em}C{6em} C{4em}C{4.2em}R{3em}}
    \toprule
      \multirow{2}{*}[-0.2em]{Models}   &  \multicolumn{4}{C{18.5em}}{Fréchet Distance} & \multirow{2}{*}[-0.2em]{KID}  & \multirow{2}{*}[-0.2em]{CMMD} & \multirow{2}{*}[-0.2em]{FLD} \\\cmidrule{2-5}
      & Inception  & Clean  & CLIP & \; RETFound & & & \\\midrule
       DM-1  &  212.67 & 232.04  & 13.15 & 54.87  & 0.2578 & 1.005 & 38.74 \\
       DM-2  & 174.95 & 192.75 & 10.73  & 45.79 & 0.1925 & 0.795  & 29.65\\
       DM-3  & 177.17  & 197.43 & 10.33 & 44.08 & 0.1958 & 0.884  & 32.84  \\
       DM-4  & 146.87 & 167.19  &9.37  & 44.71 & 0.1503 & 0.972  &  22.01 \\
       DM-5  & 171.27 & 184.88 & 8.81 &  33.15  & 0.1860  & 0.775  & 25.58 \\
       DM-6  & 138.07 & 156.05 & 6.69  &  28.54  & 0.1390 &  0.748 &  29.56\\
       DM-7  & 126.42 & 142.63 & 6.33 & 34.32 & 0.1052 & 0.698  &   17.91\\\midrule
       $\tau_{\,\mathrm{Kendall}}$ & -0.14 & -0.14 & -0.14 & -0.24 & -0.14 & -0.05 & -0.33 \\
       $p_{\,\mathrm{Kendall}}$ & n.s. & n.s. & n.s. & n.s. & n.s. & n.s. & n.s. \\\bottomrule
    \end{tabular}
\end{table}

\subsection{Misaligned FID and Downstream Performance}
Kendall’s $\tau$ coefficients and $p$-values are provided in \tableref{tab:fundus} and \tableref{tab:ddpm} for each generative evaluation metric (the lower the better) and their correlation with downstream task performance (F1 or Dice, the higher the better). A $\tau = 1$ indicates negative correlation between the generative model's evaluation and downstream performance, while $\tau = -1$ suggests an ideal generative evaluation metric. The results show that for diffusion models, these metrics fail to capture downstream performance, as indicated by the non-significant $p$-values (n.s. when $p \ge 0.05$). More critically, for StyleGAN3, the metrics predict performance in the opposite direction with $0.001 \le p < 0.01$, denoted as $\ast \ast$.
We also illustrate the downstream performance with $1/\text{FID}$ to better depict the relationship in \figureref{fig:plot2,fig:plot3,fig:plot4}. No clear correlation is observed across the three plots, highlighting the unreliability of FID (and, by extension, the other six metrics due to their consistency, as discussed in \sectionref{sec:consist}) for evaluating generative models in the context of a downstream task.









\section{Limitations and Outlook}

Although this study evaluates three generative models across two retinal imaging modalities, the scope remains limited to color fundus photography and retinal OCT. These modalities are representative for retinal research, but do not capture the diversity of biomedical imaging. 
Prior findings in radiology, for example by \citet{konz2024rethinking}, suggest that similar misalignment effects may also arise in other imaging domains, although a systematic cross-modality investigation was beyond the scope of this work. That study examines radiology image-to-image translation, whereas our work focuses on retinal image synthesis using multiple generative models and downstream tasks, providing complementary evidence within a different modality setting.
Furthermore, we did not investigate domain-specific metrics such as the Fréchet Radiology Distance (FRD) \cite{konz2024fr}, which incorporates handcrafted radiological features. Developing analogous retinal-specific metrics is possible, but we believe a more generic solution is needed. 
{Finally, while we observe inverted or weak correlations between feature-distance metrics and downstream performance in some settings, we do not claim a definitive mechanism; plausible contributors include reduced diversity, underrepresentation of hard cases, and imperfect label--image consistency.}

Future work should extend the analysis to additional biomedical modalities 
{and to more recent conditional or controlled generative models, e.g., control-guided diffusion \cite{zhang2023adding}}, and explore evaluation strategies that do not rely on manually engineered domain features, aiming instead for more general and theoretically grounded metrics. Since our findings indicate that downstream evaluation provides the most reliable measure of generative model utility, an important challenge is to integrate such task-based assessment into model selection workflows without incurring prohibitive computational cost. Approaches such as Bayesian optimization, surrogate modeling, or partial downstream evaluation could help make downstream-aware generative model selection practical at scale.


\begin{figure}[t]
    \centering
    \includegraphics[width=1\linewidth]{heatmap.pdf}
    \caption{
    Kendall's $\tau$ rank correlations between feature-distance metrics and downstream task performance for all generative settings. Each cell reports the Kendall's $\tau$ value (in $[-1, 1]$) and its corresponding $p$-value ($\ast$$\ast\ast$ for $p < 0.001$, $\ast\ast$ for $0.001 \le p < 0.01$, $\ast$ for $0.01 \le p < 0.05$, and n.s.\ for $p \ge 0.05$). Warm colors denote stronger positive correlations and cold colors denote negative correlations. Average denotes the mean downstream performance across the two classification or segmentation models. The results indicate that (i) newer generations of feature-distance metrics do not consistently improve evaluation quality compared to classical variants, and (ii) feature-distance metrics rarely align with downstream performance and often exhibit inverted or insignificant correlations.}
    \label{fig:heatmap}
\end{figure}


\section{Conclusion}

In this study, we evaluated three generative models across two retinal imaging modalities and showed that Fréchet Inception Distance (FID) and a broad set of related feature-distance metrics do not reliably reflect downstream task performance when synthetic data are used for data augmentation. Despite their widespread adoption, these metrics often fail to capture the practical usefulness of generated images for classification or segmentation. Our results therefore suggest that downstream evaluation should serve as the primary criterion when assessing generative models intended for biomedical data enrichment. Identifying proxy metrics that correlate more closely with downstream utility, while remaining computationally efficient, remains an important direction for future research.


\midlacknowledgments{This work was supported by Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) with the grant GRK2610: InnoRetVision (YW, HK, project number 424556709).}


\bibliography{midl26_195.bib}


\appendix 

\renewcommand{\tablename}{{\textbf{Appendix} \qquad Table}}
\renewcommand{\thetable}{{A\arabic{table}}} 
\renewcommand{\theHtable}{A\arabic{table}}
\setcounter{table}{0}  

\begin{sidewaystable}[t]

\caption{{A summary of generative evaluation metrics discussed in this paper.}}\vspace{4mm}
\label{tab:appendix}
\small

\setlength{\tabcolsep}{7pt}
\renewcommand{\arraystretch}{1.15}
\begin{tabular}{@{}p{0.2\linewidth} p{0.38\linewidth} p{0.34\linewidth}@{}}

\toprule
\textbf{Metrics} & \textbf{Distances / Divergences} & \textbf{Feature Extractors} \\
\midrule
FID \newline\citet{heusel2017gans} & Fr\'echet distance (FD) \newline\citet{frechet1957distance,dowson1982frechet} & Inception-v3 (ImageNet) \newline\citet{szegedy2016rethinking,deng2009imagenet} \\\midrule
Clean-FID \newline\citet{parmar2022aliased} & Fr\'echet distance (FD)  & Inception-v3 (ImageNet)  \\\midrule
Unbiased FID \newline\citet{chong2020effectively} & Bias-corrected / extrapolated FD & Inception-v3 (ImageNet)  \\\midrule
CLIP-FD & Fr\'echet distance (FD)  & CLIP \citet{radford2021learning} \\\midrule
RETFound-FD & Fr\'echet distance (FD) & RETFound \citet{zhou2023foundation} \\\midrule
KID \newline\citet{binkowski2018demystifying} & Maximum mean discrepancy (MMD) \newline\citet{gretton2006kernel} & Inception-v3 (ImageNet) \\\midrule
CMMD \newline\citet{jayasumana2024rethinking} & Maximum mean discrepancy (MMD)  & CLIP \\\midrule
CAFD \newline\citet{cafd2018} & Class-aware Fr\'echet distance \newline Mixtures of Gaussians (MoG)-based & Typically Inception features \\\midrule
WaM \citet{luzi2023evaluating} & Approx.\ 2-Wasserstein on fitted MoG & Implementation-dependent feature space \\\midrule
FLD \newline\citet{jiralerspong2023feature} & Kullback–Leibler divergence (KLD) \newline between learned MoG & DINOv2 \newline\citet{oquab2024dinov} \\\midrule
FAED \newline\citet{buzuti2023frechet} & Fr\'echet distance (FD)  & VQ-VAE latent \newline\citet{van2017neural} \\\midrule
\midrule
PSNR & Pixelwise error (MSE-derived) & Pixels (no feature extractor) \\\midrule
SSIM \citet{wang2004image} & Structural similarity & Pixels / local statistics (no learned extractor) \\\midrule
IS \citet{salimans2016improved} & KLD-based score from predicted label distribution & Inception-v3 (ImageNet) \\
\bottomrule

\end{tabular}
\end{sidewaystable}


\end{document}
