\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{float}
\usepackage{multirow}
\usepackage{threeparttable}
\usepackage{mathrsfs}
\usepackage{url}
\usepackage{paralist}
\usepackage{soul}
\usepackage[T1]{fontenc}
\usepackage{hyperref}

\jmlrvolume{-- nnn}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[Fair Glaucoma Diagnosis with Cycle Diffusion]{Uncertainty-aware Cycle Diffusion Model for Fair Glaucoma Diagnosis}

\midlauthor{\Name{Ziheng Wang\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{ziheng.wang@kaust.edu.sa}\\
\addr $^{1}$ Bioengineering, Biomedical Sciences Division, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia \AND
\Name{Shuran Yang\midlotherjointauthor\nametag{$^{2}$}} \Email{sryang@xzmu.edu.cn}\\
\addr $^{2}$ School of Journalism and Communication, Xizang Minzu University, Xianyang, China \AND
\Name{Yan Lin\nametag{$^{3}$}} \Email{y.lin64@ncl.ac.uk}\\
\addr $^{3}$ School of Computing, Newcastle University, Newcastle upon Tyne, UK  \AND
\Name{Wenrui Zang\nametag{$^{4}$}} \Email{yc57670@um.edu.mo}\\
\addr $^{4}$ Faculty of Health Sciences, University of Macau, Macao SAR, China \AND
\Name{Yanda Meng\midljointauthortext{Corresponding author}\nametag{$^{1}$}} \Email{yanda.meng@kaust.edu.sa}
}


\begin{document}

\maketitle

\begin{abstract}
Fairness has become a critical ethical concern, particularly in AI-based healthcare applications. Data imbalance and limited sample size can lead to lower diagnostic performance. Consequently, this harms the fairness of AI when applied to real-world scenarios. Generative models, like diffusion models, offer a promising solution by generating diverse synthetic data to support underrepresented groups. This improves fairness and performance while mitigating privacy risks. We propose a shape-controlled framework that incorporates demographic information into an end-to-end diffusion model, along with an automatic selection strategy to identify overconfidently misclassified samples.
These challenging samples are then augmented via the generative model to enhance its classification performance. The strategy also removes potentially misleading ``lower-quality'' synthetic samples. 
Two ophthalmic experts validated the clinical relevance and plausibility of our synthetic images through random external examination. 
Our method outperforms state-of-the-art methods on the Harvard-FairVLMed dataset in both fairness and diagnosis accuracy.
Our code is available at \url{https://github.com/WANG-ZIHENG/CCG}.

\end{abstract}

\begin{keywords}
Fairness Learning, Image Synthesis, Diffusion Models, ControlNet
\end{keywords}

\section{Introduction}
\label{sec:intro}

Glaucoma is an irreversible optic nerve disorder that can lead to blindness if left untreated~\citep{blindness_1}. While AI-enhanced computer vision has been successfully applied to glaucoma diagnosis on retinal images~\citep{Glaucoma_CV,meng2022dual,yu2025robust,liu2025incomplete}, these models often inherit demographic imbalances from their training data, resulting in systematically poorer performance for minority groups~\citep{FairSeg}. A recent study~\citep{FairCLIP} reported that Black communities are more than four times as likely to have undiagnosed glaucoma compared with white communities, highlighting the need to address fairness in AI-assisted glaucoma diagnosis.

Generative models have been explored in the medical domain~\citep{Diffusion_survey,ktena2024generative}, enriching training datasets to mitigate representation imbalance of certain populations or disease conditions. However, existing approaches~\citep{SADM,male_CT} rarely consider fairness-related attributes during synthesis. Many prior methods~\citep{Fairdiff,balance_dataset} generate synthetic data based only on overall dataset distribution, which may yield redundant or less informative samples. Importantly, underrepresented subgroup samples often constitute many of the model's hard cases due to limited subgroup-specific training signals, as also observed in recent work~\citep{ktena2024generative}. 
In contrast, our method incorporates demographic identity information into the diffusion generation pipeline to support subgroup-aware generation, while introducing a behavior-driven selection mechanism that targets hard examples---i.e., samples misclassified with high confidence. 
By generating synthetic variants of these challenging cases, our approach improves both diagnostic performance and fairness.

We adopted the ControlNet-guided Stable Diffusion model (ControlNet-guided SD)~\citep{ControlNet} to generate scanning laser ophthalmoscopy (SLO) images for glaucoma diagnosis. By integrating demographic identity information and clinical records as text prompts and using the optic disc segmentation mask as a shape control input, our model generates fairness-aware, shape-controlled synthetic images, enhancing diagnostic performance. 
Additionally, we design a Sorter module that leverages an
overconfident error metric, calculated using prediction error and uncertainty, to automatically select challenging samples for data augmentation and subsequent training, while filtering out ``lower-quality'' synthetic images. 
Because the synthetic images are generated from overconfidently misclassified cases and are conditioned on consistent label and demographic attributes, the augmentation directly targets the classifier's failure modes. Guided by the Sorter rather than global data balancing, this targeted augmentation provides additional supervision exactly where the model underperforms, improving both diagnostic accuracy and subgroup fairness.
Notably, 100 of the saved synthetic samples were randomly selected and validated by ophthalmology experts for clinical relevance and plausibility, supporting their applicability in real-world glaucoma diagnosis. In summary, our method improves glaucoma diagnosis and enhances model fairness across both majority and minority groups.

Our main contributions are: 
(1) We propose Cycle Control Generation (CCG), an end-to-end framework combining ControlNet-guided SD and a CNN classifier to generate clinically meaningful, demographically conditioned SLO images. 
(2) We introduce a behavior-driven sample selection strategy based on overconfident errors to identify high-confidence misclassifications. 
(3) We design a dynamic augmentation pipeline that iteratively updates challenging samples and removes misleading ones, guiding the model to focus on failure regions. 
(4) Our generated images show high visual quality and diagnostic relevance, 
% validated by expert evaluation and quantitative metrics.
validated through ophthalmologist evaluations and quantitative metrics.

\section{Related Work}

Diffusion models~\citep{Diffusion_modes,Diffusion_survey} have recently gained popularity for their ability to generate high-quality and diverse samples. 
For instance, diffusion models with classifier guidance have been used to generate realistic and meaningful counterfactuals for retinal imaging~\citep{ilanchezian2025development}.
SynDiff~\citep{ozbey2023unsupervised} outperforms existing methods in multi-contrast MRI and MRI-CT translation.
ControlPolypNet~\citep{ControlPolypNet} synthesizes realistic colon polyp images from non-polyp frames, improving segmentation performance. 
UnIACorN~\citep{UnIACorN} leverages target-domain uncertainty and source-domain labels to synthesize labeled target-style OCT data, improving cross-domain segmentation.
In this work, we similarly use diffusion-based synthesis, but incorporate fairness by using demographic identity information as text prompt inputs for the diffusion model.

Fairness in AI-based medical image analysis is an essential ethical issue. 
Recent studies in ophthalmic imaging have introduced demographic-aware datasets to address subgroup disparities. 
FairSeg~\citep{FairSeg} proposes an error bound scaling method that reweights the loss by group-specific error bounds, improving fairness.
Similarly, FairDiff~\citep{Fairdiff} uses a two-stage diffusion framework that first generates cup-to-disc contours and then uses them as shape conditions in a ControlNet-guided SD to synthesize SLO images, aiming to balance subgroup representation and improve segmentation fairness. 
Unlike these methods, we adopt a behavior-driven strategy targeting overconfident misclassifications and use generative augmentation to address model weaknesses. This focus on challenging cases improves accuracy and fairness across subgroups.

\section{Method}

\subsection{Metrics of Sample Selection Strategy}
\label{sec: Metrics of Sample Selection Strategy}
\subsubsection{Prediction Error}
For a dataset $X=\{x_1,x_2,\dots,x_N\}$ containing $N$ samples, the prediction error $e_1$ for a sample $x_1$ with label $g_1$ and model output logits $l_1$ is calculated as follows: 
\begin{equation}
    e_1=\left | \sigma (l_1)-g_1 \right | ,
    \label{eq:e}
\end{equation}
where $\sigma$ is the sigmoid function. The prediction error $e_1$ quantifies the absolute difference between the predicted probability $\sigma(l_1)$ and the ground truth label $g_1$. A large $e_1$ indicates that the model struggles with the sample, implying it is under-learned or difficult. Thus, $e_1$ serves as a useful signal for identifying samples needing more attention during training.

\subsubsection{Uncertainty}

As emphasized in prior work~\citep{quantifying_uncertainty}, quantifying uncertainty is valuable for classification tasks. To leverage the value of uncertainty, we follow the method in~\citep{Uncertainty} and
add $T$ instances of random noise $\epsilon$ to the sample $x_1$, where $\epsilon \sim \mathcal{N}(0,\sigma ^2_{\text{noise}})$ (with $\sigma^2_{\text{noise}}$ denoting the variance of the noise). This results in $\tilde{x}_{1,j} =x_1+\epsilon_j$ for $j \in \left \{ 1,2,\dots,T \right \}$. The average output logits of the model $f$ for the sample $x_1$ are computed as $\bar{l}_1 = \frac{1}{T}  {\textstyle \sum_{j=1}^{T}} l_{1,j}$, where $l_{1,j}$ denotes the output of the model $f$ for $\tilde{x}_{1,j}$. Consequently, the uncertainty \( u_1 \) in the model's prediction for the sample \( x_1 \) is defined as follows:
\begin{equation}
    u_1=-\sum_{k}\sigma (\bar{l} _{1,k}) \log(\sigma (\bar{l} _{1,k})),
    \label{eq:T}
\end{equation}
where \( k \) indexes the classes of the dataset and $\sigma$ is the sigmoid function. 
$u_1$ is used as an uncertainty measure derived from the model outputs under stochastic perturbations.
The uncertainty measure \(u_1\) reflects the model's predicted variability and confidence in its output for a given instance. A lower \(u_1\) indicates greater stability and higher confidence, 
implying minimal influence from stochastic perturbations and noise on the prediction. 


\subsubsection{Overconfident Error}

Given that a smaller uncertainty $u$ indicates higher confidence, a large prediction error $e$ with small $u$ implies the model is confidently wrong---an \emph{overconfident error}, which is especially harmful in medical diagnosis~\citep{O_E_harmful_1,O_E_harmful_2}. To quantify this, we define the \emph{overconfident error value} $v_1$ for sample $x_1$ as the product of prediction error and confidence $(1-u_1)$:
\begin{equation}
    v_1 = e_1 \cdot (1 - u_1).
    \label{eq:v}
\end{equation}
Here, $(1-u_1)$ is a confidence factor, so samples with large prediction error and low uncertainty receive higher scores.
We then use $v_1$ to identify overconfidently misclassified samples and perform targeted augmentation by generating label-consistent synthetic variants,
where uncertainty helps prioritize failure cases over sample underrepresentation alone.


\begin{figure*}[!t]
\begin{center}
    \begin{tabular}{c|c}
    \centering
    \includegraphics[width=0.6\linewidth]{MIDL_2026_model.pdf} &
    \includegraphics[width=0.35\linewidth]{dynamic_sorter.pdf}\\
    (a)&(b)
    \end{tabular}
    \end{center}
    \caption{(a) Overview of the proposed Cycle Control Generation framework for generating synthetic SLO images. (b) Illustration of the dynamic update process of overconfidently misclassified samples automatically selected by the Sorter during training.}
    \label{fig:CCG}
\end{figure*}

\subsection{Cycle Control Generation}
\label{sec: Cycle Control Generation}

As illustrated in Fig.~\ref{fig:CCG}(a), the proposed Cycle Control Generation (CCG) is an end-to-end framework that integrates a CNN-based classification model, EfficientNet~\citep{Efficientnet}, with a ControlNet-guided SD~\citep{ControlNet}. We use ControlNet to condition synthesis on optic disc segmentation masks and on demographic and clinical text prompts, so the generator produces shape-controlled and subgroup-aware SLO images that align with downstream diagnosis. The pipeline is organized around the overconfident error in Eq.~\ref{eq:v}, which scores each sample and drives targeted augmentation and pruning during training.

Concretely, the CNN outputs evaluation scores $v$ for the source dataset $D_s$ according to Eq.~\ref{eq:v}. The Sorter module ranks the samples based on their $v$ values and selects the top $m\%$ samples, which are identified as overconfidently misclassified by the CNN model. These challenging samples, corresponding text prompts (including demographic identity information and clinical records), and segmentation masks are fed into the ControlNet-guided SD~\citep{ControlNet} to generate synthetic samples. The generated images are then stored in a generated dataset $D_g$, thereby expanding the number of challenging samples and improving classification performance in the subsequent training cycle.

Starting from Cycle 1, the CNN model is fine-tuned using both $D_s$ and $D_g$, initializing from the model weights that achieved the highest AUC on the validation set in the previous cycle. The CNN outputs $v$ values for both $D_s$ and $D_g$. The top $m\%$ samples from $D_s$, based on their $v$ values, are selected for the current cycle's generation process. 
Our method then ranks all samples in $D_g$ according to their $v$ values and removes the top $n\%$ samples. Here, ``lower-quality'' synthetic samples in $D_g$ should not be confused with the challenging source samples selected from $D_s$. In our framework, high-$v$ samples in $D_s$ are informative hard cases used to guide generation, whereas high-$v$ samples in $D_g$ are treated as less reliable synthetic training samples and are therefore removed. This prevents the model from being misled in subsequent training iterations.


Fig.~\ref{fig:CCG}(b) illustrates the dynamic update process of overconfidently misclassified samples automatically selected by the Sorter module during training. 
At Cycle 0, Sample 5 is identified as an overconfident error and is used for sample generation. 
After its sample count is augmented, the CNN may learn to classify it better in Cycle 1, resulting in a lower $v$ value. Consequently, it is no longer selected by the Sorter module. Instead, Sample 4, now identified as an overconfident error, is selected for generation. As training progresses, the set of overconfidently misclassified samples changes dynamically. In other words, the model focuses on refining challenging samples, leading to improved classification performance.


\section{Experiments}

\subsection{Datasets}

We utilized a subset of 7,363 images from the Harvard-FairVLMed (FairVLMed) dataset~\citep{FairCLIP}, as used in~\citep{GlaucoDiff},
including 5,266 images for training, 692 images for validation, and 1,405 images for testing.
Each SLO fundus image is accompanied by demographic identity group information and clinical records written by ophthalmologists.
During training, synthetic SLO images were iteratively generated and incorporated into the training set. By the end of training, 510 synthetic images had been added.

\begin{table}[h]
\begin{center}
\begin{tabular}{llccccc}
\hline
          Attribute                 &  Group   & train & val & test & \%  &  generated\\ \hline
\multirow{2}{*}{Glaucoma}  & With         & 2414  & 304        & 638  & 45.58 & 310 \\ 
                           & Without      & 2852  & 388        & 767  & 54.42 & 200 \\ \hline
\multirow{3}{*}{Race}      & Asian        & 396   & 41         & 98   & 7.27 & 43 \\ 
                           & Black        & 651   & 85         & 177  & 12.40 & 77 \\  
                           & White        & 4219  & 566        & 1130 & 80.33 & 390 \\ \hline
\multirow{2}{*}{Gender}    & Female       & 3055  & 406        & 789  & 57.72 & 224 \\  
                           & Male         & 2211  & 286        & 616  & 42.28 & 286 \\ \hline
\multirow{2}{*}{Ethnicity} & Non-Hispanic & 4789  & 626        & 1271 & 90.81 & 466 \\  
                           & Hispanic     & 477   & 66         & 134  & 9.19  & 44 \\ \hline
\end{tabular}
\end{center}
\caption{Attribute and group distribution in original and generated datasets.}
\label{tab:dataset}
\end{table}

A detailed breakdown of the dataset's attributes, including the race, gender, and ethnicity distributions across the training, validation, and test sets and the generated synthetic SLO images, is provided in Table~\ref{tab:dataset}. The percentage (\%) column in the table represents the proportion of each group within the entire dataset (training, validation, and test sets combined). For example, 45.58\% indicates that samples with glaucoma make up 45.58\% of the total dataset. 
We also provide the data distribution across various attributes and groups for the generated dataset, shown in the last column in Table~\ref{tab:dataset}.

\subsection{Text and Shape Conditioning}

We condition synthesis on both text and shape. For text, we convert demographic and clinical attributes from FairVLMed~\citep{FairCLIP} into short prompts (e.g., ``Male, White, Non-Hispanic, with Glaucoma''). 
This encourages the generated images to better reflect diverse subgroups and facilitates subgroup-aware fairness evaluation.
Diagnosis information is used solely for generation and no parameters are shared with the classifier, preventing information leakage. For shape, we fine-tuned a pretrained TransUNet~\citep{transunet} on Harvard FairSeg~\citep{FairSeg} and used it to segment optic disc regions in FairVLMed~\citep{FairCLIP}. The resulting masks serve as control inputs to the generator.

\subsection{Implementation Details and Evaluation Metrics}

We run our CCG framework for 5 cycles in PyTorch on an NVIDIA RTX 4090. Each cycle consists of: (1) fine-tuning the CNN classifier on the current dataset $D_s \cup D_g$ for 30 epochs; (2) scoring and ranking samples with the Sorter using the overconfident error $v$; 
(3) removing the top $n\%$ synthetic samples in $D_g$ and selecting the top $m\%$ hard cases from $D_s$, both ranked by $v$, to fine-tune ControlNet for 2 epochs and generate synthetic images;
and (4) augmenting $D_g$ with the newly generated samples for the next cycle. 
In all experiments, we set $n=50$ and $m=5$. The pruning ratio $n=50$ was selected from candidate values (e.g., 40\%, 50\%, and 60\%) based on validation AUC and subgroup performance, while $m=5$ controls the proportion of hard cases used to guide generation.
We use AdamW with learning rate $1\mathrm{e}{-5}$, weight decay $6\mathrm{e}{-5}$, batch size $16$, and binary cross-entropy loss for classification. We set $T=8$ in Eq.~\ref{eq:T}, following prior work~\citep{Uncertainty}. 
All remaining hyperparameters are chosen by cross-validation on the training set. 
To ensure a fair comparison, all competing methods are trained with the same total number of CNN fine-tuning epochs as our method.

We evaluate classification with ACC, AUC, Precision (Prec.), Sensitivity (Sens.), and F1. Fairness is assessed with Demographic Parity Difference (DPD)~\citep{DPD_DEOdds,DPD} and Difference in Equalized Odds (DEOdds)~\citep{DPD_DEOdds}. Image quality is measured with Fr\'echet Inception Distance (FID)~\citep{FID}, Learned Perceptual Image Patch Similarity (LPIPS)~\citep{LPIPS}, Structural Similarity (SSIM)~\citep{SSIM}, and Multiscale-SSIM (MS-SSIM)~\citep{MS_SSIM}, complemented by expert review. Classification metrics are reported as percentages, and image quality metrics are reported as raw scores.

\section{Results}

\subsection{Classification Performance and Fairness}
\label{sec:Performance}


The \textit{Baseline}, as shown in Tables~\ref{tab:result_1} and~\ref{tab:attribute}, is the framework with the Sorter module and the generative model ablated; that is, it performs no generative operations and uses only the classification model to classify the source data.
The compared fairness methods include FairCLIP~\citep{FairCLIP}, FIN~\citep{Harvard_glaucoma_fairness}, FairDomain~\citep{FairDomain}, and FairVision~\citep{FairVision}, which improve fairness through Sinkhorn-based distribution alignment, fair identity normalization, fair identity attention under domain shift, and fair identity scaling, respectively. 
Unlike these methods, our CCG framework introduces behavior-driven hard-case selection, cyclic refinement, and shape-controlled generative augmentation.
Table~\ref{tab:result_1} shows that our method
consistently outperforms previous related methods across all performance metrics on the FairVLMed dataset, significantly surpassing the baseline with notable improvements. 
Specifically, our method shows a 6.78\% increase in ACC, 8.47\% in AUC, 12.27\% in Precision, 5.27\% in Sensitivity, and an 8.11\% gain in F1-score compared to the baseline. Notably, the simultaneous improvement in both precision and sensitivity in AI-assisted medical diagnosis indicates a reduction in both false negatives and false positives. This enhancement strengthens disease detection capabilities in medical image analysis, which is crucial for early diagnosis.



\begin{table}[htbp]
\begin{center}
\begin{tabular}{lccccc}
\hline
\textbf{Method}     & \textbf{ACC $\uparrow$} & \textbf{AUC $\uparrow$} & \textbf{Prec. $\uparrow$} & \textbf{Sens. $\uparrow$} & \textbf{F1 $\uparrow$}  \\
\hline
Baseline   & 69.47          & 75.93          & 71.46              & 60.41            & 65.50          \\ 
FairCLIP~\citep{FairCLIP}  & 74.03          & 82.78          & 83.41              & 61.25            & 70.63          \\ 
FIN~\citep{Harvard_glaucoma_fairness}        & 74.86          & 80.43          & 75.86              & 65.49            & 70.29          \\ 
FairDomain~\citep{FairDomain} & 73.91          & 80.16          & 81.47              & 62.83            & 70.95          \\ 
FairVision~\citep{FairVision} & 74.78          & 81.29          & 75.55              & 65.41            & 70.12          \\
Our work   & \textbf{76.25} & \textbf{84.40} & \textbf{83.73}     & \textbf{65.68}   & \textbf{73.61} \\ \hline
\end{tabular}
\end{center}
\caption{Overall performance of our work compared with other methods. The best results are highlighted in bold.}
\label{tab:result_1}
\end{table}

Table~\ref{tab:attribute} presents the classification performance and fairness metrics of our method across various demographic attributes. In terms of fairness, our approach consistently outperforms existing methods by achieving the lowest DPD and DEOdds values across all attributes, indicating better mitigation of demographic disparities. 
For classification performance, our method achieves balanced improvements across groups regardless of their proportion in the dataset (as shown in Table~\ref{tab:dataset}), without compromising any individual group's performance. 
For the Ethnicity attribute, both the non-Hispanic group (90.81\%) and the Hispanic group (9.19\%) achieved at least a 2\% AUC improvement over other methods. A similar trend of balanced improvement was observed for the Race and Gender attributes.

\begin{table*}[h]
\centering
% \setlength{\tabcolsep}{4pt} 
\begin{tabular}{llccccc}
\hline
\textbf{Attribute}         & \textbf{Method} & \textbf{DPD $\downarrow$} & \textbf{DEOdds $\downarrow$} & \multicolumn{3}{c}{\textbf{Group-wise AUC $\uparrow$}}                                                 \\ \hline
\multirow{7}{*}{Race}      &                 &                  &                     & \multicolumn{1}{c}{Asian}              & \multicolumn{1}{c}{Black}          & White          \\  
                           & Baseline        & 14.90            & 14.38               & \multicolumn{1}{c}{75.53}              & \multicolumn{1}{c}{72.26}          & 76.46          \\ 
                           & FairCLIP        & 14.19            & 9.72                & \multicolumn{1}{c}{84.92}              & \multicolumn{1}{c}{78.29}          & 83.14          \\  
                           & FIN             & 7.79             & 13.45               & \multicolumn{1}{c}{85.78}              & \multicolumn{1}{c}{79.04}          & 80.18          \\ 
                           & FairDomain      & 7.94             & 3.51                & \multicolumn{1}{c}{80.75}              & \multicolumn{1}{c}{77.91}          & 80.63          \\ 
                           & FairVision      & 14.73            & 9.72                & \multicolumn{1}{c}{86.41}              & \multicolumn{1}{c}{77.30}          & 81.26          \\  
                           & Our work        & \textbf{6.25}    & \textbf{3.29}       & \multicolumn{1}{c}{\textbf{87.03}}     & \multicolumn{1}{c}{\textbf{79.98}} & \textbf{84.29} \\ \hline
\multirow{7}{*}{Gender}    &                 &                  &                     & \multicolumn{1}{c}{Female}             & \multicolumn{1}{c}{Male}           &                \\ 
                           & Baseline        & 7.69             & 13.92               & \multicolumn{1}{c}{74.51}              & \multicolumn{1}{c}{77.78}          &                \\ 
                           & FairCLIP        & 2.17             & 5.94                & \multicolumn{1}{c}{80.82}              & \multicolumn{1}{c}{84.90}          &                \\ 
                           & FIN             & 1.91             & 5.91                & \multicolumn{1}{c}{78.64}              & \multicolumn{1}{c}{82.41}          &                \\ 
                           & FairDomain      & 5.22             & 9.96                & \multicolumn{1}{c}{77.08}              & \multicolumn{1}{c}{83.79}          &                \\ 
                           & FairVision      & 2.16             & 4.63                & \multicolumn{1}{c}{79.76}              & \multicolumn{1}{c}{82.73}          &                \\  
                           & Our work        & \textbf{1.74}    & \textbf{4.38}       & \multicolumn{1}{c}{\textbf{82.81}}     & \multicolumn{1}{c}{\textbf{86.09}} &                \\ \hline
\multirow{7}{*}{Ethnicity} &                 &                  &                     & \multicolumn{1}{c}{Non-Hispanic}          & \multicolumn{1}{c}{Hispanic}          &                \\ 
                           & Baseline        & 20.08            & 24.69               & \multicolumn{1}{c}{75.66}              & \multicolumn{1}{c}{73.18}          &                \\ 
                           & FairCLIP        & 2.71             & 3.94                & \multicolumn{1}{c}{82.19}              & \multicolumn{1}{c}{80.03}          &                \\ 
                           & FIN             & 18.46            & 23.88               & \multicolumn{1}{c}{79.95}              & \multicolumn{1}{c}{78.41}          &                \\ 
                           & FairDomain      & 12.53            & 13.45               & \multicolumn{1}{c}{79.72}              & \multicolumn{1}{c}{77.65}          &                \\ 
                           & FairVision      & 14.29            & 19.48               & \multicolumn{1}{c}{80.87}              & \multicolumn{1}{c}{78.39}          &                \\ 
                           & Our work        & \textbf{2.64}    & \textbf{3.53}       & \multicolumn{1}{c}{\textbf{84.28}}     & \multicolumn{1}{c}{\textbf{82.30}} &                \\ \hline
\end{tabular}
\caption{
Fairness metrics and group-wise AUC across demographics, comparing our method, baseline, and prior work (FairCLIP~\citep{FairCLIP}, FIN~\citep{Harvard_glaucoma_fairness}, FairDomain~\citep{FairDomain}, FairVision~\citep{FairVision}).
}
\label{tab:attribute}
\end{table*}


It is important to note that demographic sample size does not necessarily determine classification performance. For instance, although the Female group constitutes 57.72\% of the dataset---more than the Male group at 42.28\%---all methods still show lower performance on the Female subgroup (Table~\ref{tab:attribute}). 
This highlights that fairness disparities are not caused solely by demographic imbalance, and that simply increasing samples or improving average accuracy does not ensure fair learning across attributes~\citep{deho2023assessing}.
By incorporating uncertainty into the overconfident error, our method prioritizes confidently wrong samples, enabling targeted augmentation of subgroup-specific failure cases rather than uniform sample expansion.
Moreover, fairness-driven objectives, such as loss terms based on inter-group differences, may fail to capture subgroup-specific failure mechanisms and sometimes reduce disparities by lowering performance on stronger groups. 
In contrast, our method does not explicitly rebalance data according to demographic labels, but instead identifies overconfidently misclassified samples and applies semantic-preserving augmentation with a generative model. 
This behavior-driven strategy enables the model to learn from failure regions and yields consistent performance gains across subgroups, thereby improving fairness.

\subsection{Ablation Experiment}
\subsubsection{Effect of Sample Selection Metrics}

As shown in Section~\ref{sec:Performance}, our framework improves both classification and fairness over the Baseline. To assess the role of Overconfident Error (Eq.~\ref{eq:v}), we compared it with Prediction Error (Eq.~\ref{eq:e}) and Uncertainty (Eq.~\ref{eq:T}) individually. Results in Table~\ref{tab:ablation} indicate that Overconfident Error yields superior performance across all five metrics, with gains of at least 3\% in ACC, Precision, Sensitivity, and F1-score, and 5\% in AUC. This confirms it as a more effective criterion for selecting challenging samples and filtering lower-quality synthetic ones.

\begin{table}[h]
\begin{center}
\begin{tabular}{lccccc}
\hline
\textbf{Method}      & \textbf{ACC $\uparrow$}   & \textbf{AUC $\uparrow$}       & \textbf{Prec. $\uparrow$} & \textbf{Sens. $\uparrow$} & \textbf{F1 $\uparrow$} \\ \hline
Prediction Error       & 71.32          & 78.79                             & 78.24                          & 61.12           & 68.63                 \\ 
Uncertainty & 73.14          & 79.26                             & 80.51                          & 62.57          & 70.42                  \\
Our work   & \textbf{76.25} & \textbf{84.40} & \textbf{83.73}     & \textbf{65.68}   & \textbf{73.61}                  \\ \hline
\end{tabular}
\end{center}
\caption{Performance comparison of different sample selection strategies.}
\label{tab:ablation}
\end{table}

\subsubsection{Contribution of the Cyclic Framework}

To evaluate the effectiveness of our approach, we generated a comparable number of synthetic images using ControlNet, matching the total number of saved images in $D_g$. These images were directly added to the training set, referred to as CNN + Synth in Table~\ref{tab:ablation_each}. The results in Table~\ref{tab:ablation_each} show that, although CNN + Synth also improves model classification performance compared to training on the source dataset with CNN alone, our method outperforms CNN + Synth across all metrics. Specifically, our approach surpasses CNN + Synth by 3.06\% in ACC, 2.88\% in AUC, 4.06\% in Precision, 1.76\% in Sensitivity, and 2.68\% in F1-score.

\begin{table}[htbp]
\centering
\begin{tabular}{lccccc}
\hline
\textbf{Method} & \textbf{ACC $\uparrow$} & \textbf{AUC $\uparrow$} & \textbf{Prec. $\uparrow$} & \textbf{Sens. $\uparrow$} & \textbf{F1 $\uparrow$}  \\ \hline
CNN             & 69.47          & 75.93          & 71.46              & 60.41            & 65.50          \\
CNN + Synth     & 73.19          & 81.52          & 79.67              & 63.92            & 70.93          \\
CNN + 2Synth    & 73.02          & 81.86          & 77.91              & 64.50            & 70.57          \\
CNN + 5Synth    & 73.16          & 81.57          & 76.43              & 64.86            & 70.07          \\
Our work        & \textbf{76.25} & \textbf{84.40} & \textbf{83.73}     & \textbf{65.68}   & \textbf{73.61} \\ 
\hline
\end{tabular}
\caption{Ablation study comparing direct synthetic augmentation with our work.}
\label{tab:ablation_each}
\end{table}

We further report CNN + 2Synth and CNN + 5Synth in Table~\ref{tab:ablation_each}, where ``2Synth'' and ``5Synth'' denote synthetic datasets twice and five times the size of $D_g$, respectively. The results show that simply enlarging the synthetic dataset does not improve performance, as many generated samples are easily classified and add limited information, potentially leading to overfitting. In contrast, our method augments difficult samples, enabling the classifier to learn from challenging cases and achieve better generalization.

We also evaluated a variant without synthetic image filtering and observed an approximately 1\% drop in ACC, AUC, and F1, indicating that retaining all generated samples, including lower-quality ones, can negatively affect downstream training, whereas pruning improves cyclic augmentation. This aligns with prior studies that apply selective augmentation or quality control before downstream use~\citep{HistoGAN,ControlPolypNet}. 
A complementary direction would be to further disentangle the individual contribution of demographic identity prompts within the generative conditioning. As demographic prompts in our framework also serve as the attribute assignment mechanism for synthetic samples, exploring alternative protocols for isolating their effect is an interesting avenue that we leave for future work.


\subsection{Evaluation of Generated Data}
\subsubsection{Qualitative Quality}
To validate clinical quality, two experienced ophthalmologists independently evaluated 100 randomly sampled synthetic SLO images from $D_g$, without diagnostic prompts or labels to avoid bias. Each image was judged as ``suggestive of glaucoma'' or ``not suggestive,'' primarily based on the cup-to-disc ratio~\citep{CDR_glaucoma}. Experts noted that the images showed clinically relevant features, including clear optic disc boundaries, accurate cup-to-disc visualization, minimal artifacts, and physiologically consistent vascular structures, contributing to high confidence in their evaluations. The intraclass correlation coefficient (ICC)~\citep{ICC} exceeded 0.95, indicating excellent interrater agreement. The evaluation achieved nearly 90\% diagnostic accuracy compared to the intended generation labels, and as shown in Fig.~\ref{fig:generated_images}, expert assessments of ``with Glaucoma'' and ``without Glaucoma'' closely matched the generation conditions, confirming the clinical validity of the synthetic images.

\begin{figure*}[htbp]
    \centering
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples/yes/00017_yes.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples/yes/00070_yes.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples/yes/00071_yes.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples_2/yes/00027_yes.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples_2/yes/00037_yes.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples_2/yes/00092_yes.png}
    \end{minipage}

    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples/no/00003_no.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples/no/00073_no.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples/no/03005_no.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples_2/no/00004_no.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples_2/no/00023_no.png}
    \end{minipage}\hfill%
    \begin{minipage}{0.16\textwidth}
        \centering
        \includegraphics[width=\linewidth]{Synthetic_Samples_2/no/00089_no.png}
    \end{minipage}
    \caption{Synthetic SLO images generated with text prompts: first row ``with Glaucoma,'' second row ``without Glaucoma.''}
    \label{fig:generated_images}
\end{figure*}


\subsubsection{Quantitative Quality}

We compute the FID, SSIM, MS-SSIM, and LPIPS metrics for the generated images, comparing them with the source dataset under the ``With Glaucoma'' and ``Without Glaucoma'' categories. As shown in Table~\ref{tab:Quantitative}, our method consistently outperforms ControlNet across all metrics. These results demonstrate the effectiveness of our approach in producing clinically meaningful, high-quality images and partially support the conclusions drawn from the expert qualitative assessment. We attribute this improvement to the fact that our model is trained on challenging samples that may include complex or uncommon pathological features and variations in structure, angle, and clarity, enabling it to learn richer details.

\begin{table}[htbp]
\centering
\begin{tabular}{llcc}
\hline
\textbf{Metric}          & \textbf{Method} & \textbf{With Glaucoma} & \textbf{Without Glaucoma} \\ \hline
\multirow{2}{*}{FID $\downarrow$}     & ControlNet       & 56.50                  & 98.20                     \\
                         & Our work        & \textbf{43.40}         & \textbf{63.50}             \\ \hline
\multirow{2}{*}{LPIPS $\downarrow$}   & ControlNet       & 0.5187                 & 0.5365                    \\
                         & Our work        & \textbf{0.4990}        & \textbf{0.4879} \\ \hline
\multirow{2}{*}{SSIM $\uparrow$}    & ControlNet       & 0.3136                 & 0.3301                    \\
                         & Our work        & \textbf{0.3411}        & \textbf{0.3848}           \\ \hline
\multirow{2}{*}{MS-SSIM $\uparrow$} & ControlNet       & 0.2402                 & 0.2278                    \\
                         & Our work        & \textbf{0.2459}        & \textbf{0.2417}           \\ \hline
\end{tabular}
\caption{Quantitative evaluation of the quality of synthetic images generated by our proposed method compared with those generated by ControlNet~\citep{ControlNet}.}
\label{tab:Quantitative}
\end{table}



\section{Conclusion}

In this work, we propose Cycle Control Generation, an end-to-end framework that integrates a CNN classifier with a ControlNet-guided diffusion model to generate clinically relevant synthetic SLO images. By leveraging a behavior-driven Sorter module to identify and augment overconfidently misclassified samples while filtering out low-quality generated ones, our method improves diagnostic accuracy and fairness. Extensive experiments on the FairVLMed dataset demonstrate superior performance over advanced methods. 
While our framework uses demographic information to enhance fairness, it also applies to datasets without such attributes by improving the classification of challenging samples. 
Within our framework, demographic information is incorporated as part of the generative conditioning to support subgroup-aware synthesis, and further disentangling its individual contribution within the generative pipeline is a natural extension that we leave for future work.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work is supported by YM's KAUST baseline research funding BAS/1/1121-01-01.}


\bibliography{midl26_81}


\end{document}
