\documentclass{midl} % Include author names

% The following packages will be automatically loaded.
% Additional packages you added:
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{caption}
\usepackage{rotating}
\usepackage{float}                     % required for \floatstyle
\floatstyle{ruled}\restylefloat{algorithm}
\usepackage{algorithmicx}
\usepackage{algpseudocode}
\usepackage{algorithm}



\jmlrvolume{-- 382}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}


\title[Diffusion-Driven Fine-Grained Nodule Synthesis]
{A Diffusion-Driven Fine-Grained Nodule Synthesis Framework for Enhanced Lung Nodule Detection from Chest Radiographs}



% ------------------------------------------------------
% Authors — FIXED, CLEAN, NO extra blank lines
% ------------------------------------------------------
\midlauthor{%
  % \Name{Shreshtha Singh\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{shreshtha.singh@qure.ai}\\%
  % \Name{Aryan Goyal\midlotherjointauthor\nametag{$^{1,2}$}} \Email{21d180006@iitb.ac.in}\\%
  \Name{Shreshtha Singh\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{shreshtha.singh@qure.ai}\\%
  % \Name{Aryan Goyal\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{21d180006@iitb.ac.in}\\%
  \Name{Aryan Goyal\midlotherjointauthor\nametag{$^{1,2}$}} \Email{21d180006@iitb.ac.in}\\%
  \Name{Ashish Mittal\midlotherjointauthor\nametag{$^{1}$}} \Email{ashish.mittal@qure.ai}\\%
  \Name{Manoj Tadepalli\nametag{$^{1}$}} \Email{manoj.tadepalli@qure.ai}\\%
  \Name{Piyush Kumar\nametag{$^{1}$}} \Email{piyush.kumar@qure.ai}\\%
  \Name{Preetham Putha\nametag{$^{1}$}} \Email{preetham.putha@qure.ai} \\
\addr $^{1}$ Qure.ai, India \\%
  \addr $^{2}$ Indian Institute of Technology, Bombay%
}


% ------------------------------------------------------

\begin{document}

\maketitle


% ------------------------------------------------------
% YOUR INPUTS — RESTORED EXACTLY
% ------------------------------------------------------

\begin{abstract}
Early detection of lung cancer in chest radiographs (CXRs) is crucial for improving patient outcomes, yet nodule detection remains challenging due to the subtle appearance of nodules and the variability of their radiological characteristics such as size, texture, and boundary. For robust analysis, this diversity must be well represented in training datasets for deep-learning-based Computer-Assisted Diagnosis (CAD) systems. However, assembling such datasets is costly and often impractical, motivating the need for realistic synthetic data generation. Existing methods lack fine-grained control over synthetic nodule generation, limiting their utility in addressing data scarcity. This paper proposes a novel diffusion-based framework with low-rank adaptation (LoRA) adapters for characteristic-controlled nodule synthesis on CXRs. We begin by addressing size and shape control through nodule-mask-conditioned training of the base diffusion model. To achieve individual characteristic control, we train separate LoRA modules, each dedicated to a specific radiological feature. However, since nodules rarely exhibit isolated characteristics, effective multi-characteristic control requires a balanced integration of features. We address this by leveraging the dynamic composability of LoRAs and revisiting existing merging strategies. Building on this, we identify two key issues: overlapping attention regions and non-orthogonal parameter spaces. To overcome these limitations, we introduce a novel orthogonality loss term during LoRA composition training.
Extensive experiments on both in-house and public datasets demonstrate improved downstream nodule detection. %, achieving higher sensitivity for subtle nodules and AUC gains. 
Radiologist evaluations confirm the fine-grained controllability of our generated nodules, and across multiple quantitative metrics, our method surpasses existing nodule generation approaches for CXRs. \\ 
\textbf{Keywords}: Lung Nodule Synthesis, Chest Radiograph, Diffusion Models, LoRA, LoRA Merging
\end{abstract}

\input{sec/1_intro}
\input{sec/2_formatting}
\input{sec/3_finalcopy}
% ------------------------------------------------------

\clearpage

% \midlacknowledgments{We thank a bunch of people.}

\bibliography{midl26_382}

\appendix
\section{Pseudocode}
\label{app:pseudocode}

\subsection{Training Pipeline for DiT backbone}

\begin{algorithm}[H]
\small
\caption{Training Pipeline for Mask-Conditioned DiT Backbone}
\label{alg:dit-training}
\begin{algorithmic}[1]
  \State \textbf{Input:} Dataset $\mathcal{D}=\{(x,m)\}$ where $x$ is a CXR patch and $m$ is a binary nodule mask.
  \State \textbf{Initialize:} VAE encoder--decoder, DiT backbone $f_\theta$, diffusion schedule $\{\alpha_t\}_{t=1}^T$, optimizer.
  \State \textbf{Repeat until convergence:}
      \State \hspace{1em} Sample $(x,m)\sim\mathcal{D}$.
      \State \hspace{1em} $z_0 \leftarrow \mathrm{VAE.encode}(x)$.
      \State \hspace{1em} $c \leftarrow \mathrm{process}(m)$.
      \State \hspace{1em} Sample $t\sim\{1,\dots,T\}$ and $\epsilon\sim\mathcal{N}(0,I)$.
      \State \hspace{1em} Form noisy latent $z_t \leftarrow \sqrt{\bar{\alpha}_t}\,z_0 + \sqrt{1-\bar{\alpha}_t}\,\epsilon$, where $\bar{\alpha}_t=\prod_{s=1}^{t}\alpha_s$.
      \State \hspace{1em} Predict noise $\hat{\epsilon}\leftarrow f_\theta(z_t,t,c)$.
      \State \hspace{1em} Compute loss $\mathcal{L}_{\mathrm{diff}} \leftarrow \| \epsilon - \hat{\epsilon} \|^2$.
      \State \hspace{1em} Update parameters $\theta \leftarrow \theta - \eta \nabla_\theta \mathcal{L}_{\mathrm{diff}}$.
\end{algorithmic}
\end{algorithm}

\subsection{Training Pipeline for LoRA Adapter}


\begin{algorithm}[h]
\small
\caption{LoRA Adapter Training}
\label{alg:lora-basic}
\begin{algorithmic}[1]
    \State \textbf{Freeze:} backbone parameters $\theta$; train LoRA parameters $\phi$ only
    \State \textbf{Repeat until convergence:}
    \State \hspace{1em} Sample $(x, m) \sim \mathcal{D}$
    \State \hspace{1em} $z_0 \gets \mathrm{VAE.encode}(x)$
    \State \hspace{1em} $c \gets \mathrm{process}(m)$
    \State \hspace{1em} Sample $t \sim \{1, \dots, T\}$ and $\epsilon \sim \mathcal{N}(0, I)$
    \State \hspace{1em} $z_t \gets \sqrt{\bar{\alpha}_t}\, z_0 + \sqrt{1 - \bar{\alpha}_t}\, \epsilon$, where $\bar{\alpha}_t = \prod_{s=1}^{t} \alpha_s$
    \State \hspace{1em} $\hat{\epsilon} \gets f_{\theta,\phi}(z_t, t, c)$
    \State \hspace{1em} Compute $\mathcal{L}_{\mathrm{diff}}$ (plus optional $\mathcal{L}_{\mathrm{ortho}}$, $\mathcal{L}_{\mathrm{con}}$)
    \State \hspace{1em} Update LoRA parameters $\phi$ using the combined loss
\end{algorithmic}
\end{algorithm}

\subsection{Inference Algorithm (Conditional Sampling)}

\begin{algorithm}[H]
\small
\caption{Inference via Mask-Conditioned Reverse Diffusion}
\label{alg:dit-inference}
\begin{algorithmic}[1]
    \State \textbf{Input:} Binary mask $m$, trained backbone $f_\theta$, VAE decoder, diffusion schedule $\{\alpha_t, \sigma_t\}_{t=1}^T$
    \State Sample $z_T \sim \mathcal{N}(0, I)$
    \State $c \gets \mathrm{process}(m)$
    \State \textbf{For timesteps $t = T$ down to $1$:}
    \State \hspace{1em} $\hat{\epsilon} \gets f_\theta(z_t, t, c)$
    \State \hspace{1em} $\mu_\theta \gets \frac{1}{\sqrt{\alpha_t}}\Bigl(z_t - \frac{1 - \alpha_t}{\sqrt{1 - \bar{\alpha}_t}}\,\hat{\epsilon}\Bigr)$, where $\bar{\alpha}_t = \prod_{s=1}^{t}\alpha_s$
    \State \hspace{1em} Sample $z_{t-1} \sim \mathcal{N}(\mu_\theta, \sigma_t^2 I)$
    \State $\hat{x} \gets \mathrm{VAE.decode}(z_0)$
    \State \textbf{Output:} Synthesized CXR patch $\hat{x}$
\end{algorithmic}
\end{algorithm}

% \section{Training Details}
% % (content unchanged)
% ---------------------------------------------------------

\section{Experimental Setup}

\noindent \textbf{Base Diffusion Model \& Training Setup:} The DiT-XL/2 backbone contains 28 transformer blocks, each composed of attention and MLP components. We initialize the DiT-XL/2 backbone from a publicly available pre-trained checkpoint. Full-resolution chest X-rays are standardized to $960 \times 960$ pixels, from which $256 \times 256$ nodule-centered patches are extracted. These patches are encoded into $32 \times 32$ latent representations using the StabilityAI VAE-FT-EMA. The diffusion process follows a 1000-step DDPM with a linear noise schedule, selected based on pilot experiments to ensure stable convergence and high-fidelity reconstructions. The full backbone hyperparameters are listed in Table~\ref{tab:backbone-hparams}.

\noindent \textbf{LoRA Implementation: }
We build on the standard LoRA formulation described in Section 4.3. 
For all experiments, the adapter rank is fixed at $r=32$. 
The scaling factor is set to $\alpha=1.0$ for all characteristics, except for subtlety, where we adopt a variable scaling $\alpha(s) = 2^{2+s}$, with $s$ denoting the annotated subtlety levels (1--5). 
The down-projection matrix is initialized with Kaiming uniform initialization, while the up-projection matrix is initialized with zeros. 
%Training is performed in FP32 precision, with adaptive precision applied during inference.



% To allow flexible runtime control, we also include a multiplier \(\gamma\) that adjusts the effective update strength without retraining. 
% Here, \(\alpha\) refers to the LoRA scaling factor, distinct from the subtlety-dependent scaling \(\alpha(s)\) introduced in Section\ref{sec:subtlety}. 


\noindent \textbf{LoRA Integration with DiT Architecture:} LoRA adapters are inserted into the DiT-XL/2 attention mechanism, with Query--Key--Value (QKV) projections as the primary adaptation targets 
and output projections as secondary targets. %We adopt the \texttt{noxattn} training configuration, which excludes cross-attention and time embeddings from adaptation. 
This design allows efficient characteristic-specific adaptation while keeping the 675M-parameter backbone frozen. The total LoRA parameters per characteristic amount to approximately 6.2M parameters, representing only 0.9\% of the base model's parameters.

% ---------------------------------------------------------

\subsection{Hyperparameters}



% ===============================
% BACKBONE TRAINING HYPERPARAMETERS
% ===============================
\begin{table}[h]
\centering
\caption{Backbone Training Hyperparameters (DiT-XL/2)}
\label{tab:backbone-hparams}
\begin{tabular}{l l}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Model type              & DiT-XL/2 \\
Input size (latent)     & 32 \\
Patch size              & 2 \\
Hidden size             & 1152 \\
Depth                   & 28 \\
Attention heads         & 16 \\
MLP ratio               & 4.0 \\
Epochs                  & 50{,}000 \\
Batch size              & 80 \\
Learning rate           & $1\times 10^{-4}$ \\
Optimizer               & AdamW ($\beta_1=0.9$, $\beta_2=0.999$, $\epsilon=10^{-8}$) \\
Noise schedule          & Linear, $T=1000$ steps \\
Context conditioning    & Concat-transformer with SpatialConv+Drop \\
CFG scale               & 4.0 \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[h]
\centering
\caption{LoRA Adapter Training Hyperparameters}
\label{tab:lora-hparams}
\begin{tabular}{l l}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Rank ($r$)            & 32 \\
Scaling $\alpha$      & 1.0 \\
Training method       & noxattn \\
Epochs                & 150 \\
Batch size            & 200 \\
Learning rate         & $5\times 10^{-5}$ \\
Optimizer             & AdamW \\
Scheduler             & Constant \\
Weight decay          & 0.01 \\
Precision             & FP32 \\
\bottomrule
\end{tabular}
\end{table}


% ===============================
% CONTRASTIVE TRAINING HYPERPARAMETERS
% ===============================
\begin{table}[h]
\centering
\caption{Contrastive Fine-Tuning Hyperparameters}
\label{tab:contrastive-hparams}
\begin{tabular}{l l}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Temperature $\tau$       & 0.07 \\
Margin                   & 1.0 \\
Feature dimension        & 1152 \\
Pooling                  & Mean pooling \\
Feature normalization    & L2 norm \\
Positive pairs           & Subtlety-based grouping \\
Negative pairs           & Random shuffle \\
Min pos/neg samples      & 2 each per batch \\
\bottomrule
\end{tabular}
\end{table}

% ===============================
% ORTHOGONALITY HYPERPARAMETERS
% ===============================
\begin{table}[h]
\centering
\caption{Orthogonality Regularization Hyperparameters}
\label{tab:ortho-hparams}

\scriptsize       % reduce font size for text-width fit
\setlength{\tabcolsep}{4pt}   % tighten column spacing
\renewcommand{\arraystretch}{0.9}  % tighten row spacing

\begin{tabular}{@{}l l@{}}
\toprule
\textbf{Parameter} & \textbf{Value} \\
\midrule
Ortho weight            & 0.5 \\
Loss type               & Frobenius norm \\
Target                  & Identity matrix \\
Adapter pairs           & (calcified, homogeneous), (irregular, homogeneous), (calcified, irregular) \\
Training strategy       & Alternating batches + joint optimization \\
Gradient accumulation   & 2 \\
\bottomrule
\end{tabular}
\end{table}


% ===============================
% INFERENCE HYPERPARAMETERS
% ===============================


% ===============================
% EQUATIONS
% ===============================

% Inpainting objective
\begin{equation}
\mathcal{L}_{\text{inpaint}} = 
\mathbb{E}_{x,m,\epsilon,t} \left[
\left\| \epsilon -
\epsilon_\theta\left(
\sqrt{\bar{\alpha}_t}(x \odot (1-m)) +
\sqrt{1-\bar{\alpha}_t}\,\epsilon, m, t
\right) \right\|^2
\right]
\end{equation}

% Contrastive loss
\begin{equation}
\mathcal{L}_{\text{contrastive}} = 
-\sum_i \log
\frac{\exp\left(\text{sim}(z_i, z_i^+)/\tau\right)}
{\sum_j \exp\left(\text{sim}(z_i, z_j^-)/\tau\right)}
\end{equation}

% Orthogonality loss
\begin{equation}
\mathcal{L}_{\text{ortho}} =
\left\| W_a^\top W_b - I \right\|_F^2
\end{equation}

\begin{table}[H]
\centering
\small % slightly smaller font
\setlength{\tabcolsep}{3pt} % reduce spacing between columns
\caption{LoRA Parameter Distribution in DiT-XL/2}
\begin{tabular}{lccc}
\toprule
\textbf{Component} & \textbf{Dimensions} & \textbf{Per Block} & \textbf{Total} \\
\midrule
QKV Adapters      & $32\times1152 + 3456\times32$ & 147,456 & 4,128,768 \\
Proj Adapters     & $32\times1152 + 1152\times32$ & 73,728  & 2,064,384 \\
\midrule
\textbf{Total per Adapter} & -- & 221,184 & \textbf{6,193,152} \\
\bottomrule
\end{tabular}
\label{tab:lora_params}
\end{table}

\section{Characteristic Definitions}

\textbf{Homogeneity:} Homogeneity refers to the uniformity of radiographic density (intensity levels) within a pulmonary nodule throughout its entire cross-sectional area. In contrast, non-homogeneous (heterogeneous) nodules show uneven density patterns, with some areas appearing brighter and others darker, often indicative of malignancy. % \{add references\}

\noindent \textbf{Boundary morphology (regular vs.\ irregular):} Regular nodules have smooth, well-defined borders with clear demarcation from lung tissue. Irregular nodules show variable characteristics, including spiculated edges, lobulated contours, or poorly defined margins that blend with surrounding tissue, commonly associated with malignancy. % \{add references\}

\noindent \textbf{Calcification:} Calcified nodules are characterized by high radiographic intensity and are generally smaller in size. Calcification, resulting from calcium deposits, is often associated with benign nodules and appears brighter than the surrounding tissue. % \{add references\}

\noindent \textbf{Subtlety:} Subtle nodules refer to pulmonary lesions that demonstrate minimal radiographic contrast with surrounding lung parenchyma, making them challenging to detect on standard chest X-ray imaging. These nodules typically exhibit low-density characteristics with opacity levels that closely approximate normal lung tissue, resulting in poor visual conspicuity against the background. From the subtlety distribution analysis of our annotated scores, nodules cover a wide spectrum, with most having low subtlety scores (more subtle) and fewer having high scores (more visible).

\noindent \textbf{Nodule size:} Nodule size represents a critical malignancy risk factor, with larger nodules generally indicating higher malignancy probability. However, characteristics typically manifest in combination rather than isolation. Benign nodules commonly present as homogeneous lesions with regular margins and calcification, while malignant nodules frequently exhibit heterogeneous texture with ill-defined borders. The inherent difficulty of detecting subtle nodules underscores the importance of synthetic data generation that incorporates multiple co-occurring characteristics for improved detection and diagnosis models.
\section{Radiologist Evaluation Protocol}

\subsection*{Task 1: Real vs.\ Synthetic Nodule Assessment}
\textbf{Background:}  
To assess the visual realism of synthetic nodules, we inserted AI-generated nodules into authentic chest X-rays and asked radiologists to distinguish them from real clinical nodules.  

\textbf{Data:}  
The evaluation set comprised \textbf{50 chest X-ray images} containing nodules:
\begin{itemize}
    \item \textbf{Real nodules:} Pathological findings from patient scans.  
    \item \textbf{Synthetic nodules:} AI-generated nodules blended into authentic radiographs.  
\end{itemize}

\textbf{Procedure:}  
Radiologists reviewed each image and gave a binary response:
\begin{itemize}
    \item \textbf{Yes (Real):} Nodule appears clinically genuine.  
    \item \textbf{No (Synthetic):} Nodule appears AI-generated.  
\end{itemize}

\textbf{Goal:}  
This task measured how convincing AI-generated nodules appear relative to real clinical nodules.  

\subsection*{Task 2: Characteristic Verification}
\textbf{Background:}  
We next evaluated whether synthetic nodules accurately reflected specific radiological characteristics.  

\textbf{Data:}  
Five morphological categories were tested, with \textbf{10 images per characteristic}:  
Calcified, Homogeneous, Inhomogeneous, Irregular Border, and Regular Border.  
Each set of images was organized into a separate folder with an annotation sheet.  

\textbf{Procedure:}  
For each image, radiologists judged whether the nodule matched the stated feature:
\begin{itemize}
    \item \textbf{Yes:} Exhibits the described characteristic.  
    \item \textbf{No:} Does not match the characteristic.  
\end{itemize}

\textbf{Goal:}  
This task evaluated the morphological fidelity of AI-generated nodules across clinically relevant categories.  

\subsection*{Task 3: Subtlety Ranking}
\textbf{Background:}  
Subtlety, or how easily a nodule can be perceived, is clinically important. We generated nodules at different subtlety levels using our diffusion-based framework.  

\textbf{Data:}  
Radiologists received \textbf{20 sets of images}, each containing \textbf{3 versions of the same nodule} rendered at increasing levels of subtlety.  

\textbf{Procedure:}  
Within each set, radiologists ranked the three images:
\begin{itemize}
    \item Lowest subtlety score (hardest to detect) $\rightarrow$ highest subtlety score (easiest to detect).  
\end{itemize}

\textbf{Goal:}  
This task tested whether the generative model produced nodules with perceptible and clinically meaningful differences in subtlety.  

\subsection*{Summary}
Together, these tasks: (1) Real vs.\ Synthetic classification, (2) Characteristic verification, and (3) Subtlety ranking, provided a comprehensive evaluation of realism, morphological fidelity, and perceptual detectability. This structured protocol ensured rigorous clinical validation of AI-generated nodules.  


\section{Results and Analysis}
\subsection{Subtlety LoRA Evaluation }
\label{sec:subtlety-eval}
We generated subtle nodules with the Subtlety LoRA ($\alpha<24$) and assessed their impact on classification performance using the JSRT dataset, which provides a 5-level subtlety grading. As shown in Table~\ref{tab:subtlety-reversed}, at the highest subtlety level (S1), sensitivity increased by 12 percentage points (from 40\% to 52\%) at the Youden-index threshold.
\begin{table}[h]
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{0.75}
\caption{JSRT classification accuracy across subtlety levels (S1 = most subtle, S5 = least subtle).}
\label{tab:subtlety-reversed}
\begin{tabular}{l c c c c c}
\toprule
\textbf{Train} & \textbf{S5} & \textbf{S4} & \textbf{S3} & \textbf{S2} & \textbf{S1} \\
\midrule
10k Real            & 100\% & 96\% & 70\% & 69\% & 40\% \\
10k Real + 12k Fake & 100\% & 100\% & 76\% & 76\% & 52\% \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Comparison with Existing Methods}
Table~\ref{tab:combined-metrics} reports quantitative scores for both full-patch synthesis and masked-patch inpainting. Across all metrics, our method consistently outperforms the GAN and fill-based methods, demonstrating its superiority in synthesizing realistic lung nodules.
\begin{table}[h]
\scriptsize
\setlength{\tabcolsep}{3pt}
\renewcommand{\arraystretch}{0.8}
\centering
\caption{Comparison of generation methods on \textit{ChestX-ray14}}
\begin{tabular}{l ccc ccc}
\toprule
\multirow{2}{*}{\textbf{Method}} &
\multicolumn{3}{c}{\textbf{Full}} &
\multicolumn{3}{c}{\textbf{Masked}} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7}
 & \textbf{PSNR} & \textbf{SSIM} & \textbf{FID} & \textbf{PSNR} & \textbf{SSIM} & \textbf{FID} \\
\midrule
ACGAN      & 37.18 & 0.916 & 0.534 & 26.51 & 0.768 & 0.831 \\
ReACGAN    & 37.44 & 0.916 & 0.604 & 27.54 & 0.786 & 1.227 \\
CR-Fill    & 37.93 & 0.918 & 0.522 & 29.50 & 0.830 & 0.781 \\
DiT-XL/2 (Ours) & \textbf{38.74} & \textbf{0.920} & \textbf{0.390} & \textbf{34.26} & \textbf{0.892} & \textbf{0.475} \\
\bottomrule
\end{tabular}

\label{tab:combined-metrics}
\end{table}


\subsection{{Comparison of CFG Control versus Separate LoRA for Label Guidance}}
\label{sec:cfgvslora}
Using classifier-free guidance (CFG) to control both nodule characteristics and the mask leads to suboptimal adherence to characteristic-specific attributes. To evaluate this, we perform an ablation comparing models trained with CFG-based label control against our approach using separate LoRA adapters. We compute FID scores for each characteristic, and as shown in Table~\ref{tab:cfg-vs-lora}, LoRA-based control yields consistently lower FID values, indicating stronger characteristic fidelity.

\begin{table}[h]
\centering
\caption{FID comparison between CFG-based label control and LoRA-based control across nodule characteristics.}
\label{tab:cfg-vs-lora}
\scriptsize
\setlength{\tabcolsep}{6pt}
\renewcommand{\arraystretch}{0.9}

\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Characteristic} & \textbf{CFG Control (FID)} & \textbf{LoRA Control (FID)} \\
\midrule
Calcification         & 15.87 $\pm$ 1.51 & 2.29 $\pm$ 0.09 \\
Regular Border        & 4.04  $\pm$ 0.84 & 1.96  $\pm$ 0.12 \\
Irregular Border      & 4.19  $\pm$ 0.59 & 2.90 $\pm$ 0.03 \\
Homogeneous Texture   & 8.48  $\pm$ 1.50 & 5.71 $\pm$ 0.36 \\
Inhomogeneous Texture & 12.58 $\pm$ 1.94 & 8.13 $\pm$ 0.28 \\
\bottomrule
\end{tabular}
\end{table}

\subsection{{Radiologist evaluation with confidence intervals}}
\paragraph{Pooled vs.\ majority-vote metrics.}
We report results using two aggregation schemes across radiologists.
\emph{Pooled (vote-level)} metrics treat each radiologist response as one independent vote. If there are $N$ cases (images or ranking sets) and $R$ radiologists, the pooled denominator is $n = N \times R$. The pooled rate answers: ``Across all individual ratings, how often was the target response selected?''
\emph{Majority-vote (case-level)} metrics first aggregate the $R$ votes per case into a single panel decision (e.g., $\geq 2/3$ agreement), and then compute performance across cases with denominator $n=N$. The majority-vote rate answers: ``For how many cases did the panel agree with the target outcome?'' This provides an image-level measure of consensus. We also report 95\% Wilson score confidence intervals for all rates.

\paragraph{Task 1: Real vs.\ synthetic}
Radiologists were presented with real and synthetic samples and asked to judge whether each sample appears \emph{real} or \emph{synthetic}. We report (i) the fraction of real images judged as real and (ii) the fraction of synthetic images judged as real, using both pooled and majority-vote aggregation. The results are shown in Table~\ref{task1}.


\paragraph{Task 2: Characteristic verification}
Radiologists were shown generated nodules targeting a specific radiological characteristic (e.g., border type, homogeneity, calcification) and asked whether the target characteristic is present. We report pooled and majority-vote ``Yes'' rates per characteristic to quantify how reliably the intended attribute is recognized by experts. The results are shown in Table~\ref{task2}.

\paragraph{Task 3: Subtlety ordering}
Radiologists were presented with small sets generated at different subtlety levels and asked to rank/order them by subtlety. Each set is scored as \emph{correct} vs.\ \emph{incorrect} ordering, and we report pooled and majority-vote ranking accuracy, measuring how consistently the intended subtlety control aligns with expert perception. The results are shown in Table~\ref{task3}.


\begin{table}[H]
\centering
\caption{Task 1: Real vs.\ synthetic summary with 95\% CI}
\renewcommand{\arraystretch}{1.3} % Adds a little padding for readability
\begin{tabular}{@{}p{5.5cm}cccc@{}}
\toprule
\multirow{2}{*}{\textbf{Metric}} & \multicolumn{2}{c}{\textbf{Real images}} & \multicolumn{2}{c}{\textbf{Synthetic images}} \\ 
\cmidrule(lr){2-3} \cmidrule(l){4-5} 
 & \textbf{Rate} & \textbf{95\% CI} & \textbf{Rate} & \textbf{95\% CI} \\ 
\midrule
Pooled ``Looks real'' rate & 
$0.867$ & $[0.703, 0.947]$ & 
$0.700$ & $[0.521, 0.833]$ \\ 
\addlinespace
Majority-vote ``Looks real'' rate & 
$0.900$ & $[0.596, 0.982]$ & 
$0.800$ & $[0.490, 0.943]$ \\  
\bottomrule
\end{tabular}
\label{task1}
\end{table}


\begin{table}[ht]
\centering
\caption{Task 2: Characteristic verification (pooled and majority-vote) with 95\% CI}
\renewcommand{\arraystretch}{1.3}
\begin{tabular}{@{}lcccc@{}}
\toprule
\multirow{2}{*}{\textbf{Characteristic}} & \multicolumn{2}{c}{\textbf{Pooled ``Yes'' rate}} & \multicolumn{2}{c}{\textbf{Majority-vote ``Yes'' rate}} \\ 
\cmidrule(lr){2-3} \cmidrule(l){4-5} 
 & \textbf{Rate (k/n)} & \textbf{95\% CI} & \textbf{Rate (k/n)} & \textbf{95\% CI} \\ 
\midrule
Homogeneous & $24/30 = 0.800$ & $[0.627, 0.905]$ & $9/10 = 0.900$ & $[0.596, 0.982]$ \\
Inhomogeneous & $22/27 = 0.815$ & $[0.633, 0.918]$ & $9/9 = 1.000$ & $[0.701, 1.000]$ \\
Irregular border & $27/30 = 0.900$ & $[0.744, 0.965]$ & $10/10 = 1.000$ & $[0.722, 1.000]$ \\
Regular border & $28/30 = 0.933$ & $[0.787, 0.982]$ & $9/10 = 0.900$ & $[0.596, 0.982]$ \\
Calcified & $20/30 = 0.667$ & $[0.488, 0.808]$ & $8/10 = 0.800$ & $[0.490, 0.943]$ \\ 
\bottomrule
\end{tabular}
\label{task2}
\end{table}

\begin{table}[ht]
\centering
\caption{Task 3 (Subtlety ranking): Accuracy with 95\% CI}
\renewcommand{\arraystretch}{1.3}
\begin{tabular}{@{}p{6cm}ccc@{}}
\toprule
\textbf{Metric} & \textbf{Result (k/n)} & \textbf{Accuracy} & \textbf{95\% CI} \\ 
\midrule
Pooled & 
$20/30$ & $0.667$ & $[0.488, 0.808]$ \\ 
\addlinespace
Majority-vote & 
$8/10$ & $0.800$ & $[0.490, 0.943]$ \\ 
\bottomrule
\end{tabular}
\label{task3}
\end{table}
\subsection{{Orthogonality Verification via Adapter Weight Analysis}}
\label{app:orthogonality_verification}

To verify that the proposed orthogonality loss increases subspace orthogonality rather than trivially shrinking adapter weights, we analyze the behavior of adapter weight matrices under different training configurations. In particular, we compare adapters trained separately against configurations where two adapters are trained jointly with Frobenius-norm regularization.

\noindent Table~\ref{tab:orthogonality_frobenius} reports the aggregate magnitude of the adapter weight matrices across multiple characteristic pairings. Despite the presence of Frobenius-norm regularization, the jointly trained adapters consistently exhibit higher weight magnitudes compared to separately trained counterparts.

\begin{table}[h]
\centering
\caption{Comparison of adapter weight magnitudes for separately trained adapters versus two adapters trained jointly with Frobenius-norm regularization.}
\label{tab:orthogonality_frobenius}
\begin{tabular}{lccc}
\hline
\textbf{Comparison} & \textbf{Separate} & \textbf{Two Adapters (Frobenius)} & \textbf{\% Change} \\
\hline
Calcified vs.\ Homogeneous & 329.01 & 430.89 & +30.97\% \\
Homogeneous vs.\ Calcified & 388.23 & 502.42 & +29.41\% \\
Irregular vs.\ Homogeneous & 539.28 & 711.84 & +32.00\% \\
Homogeneous vs.\ Irregular & 388.23 & 803.59 & +106.99\% \\
\hline
\end{tabular}
\end{table}

\noindent Notably, this increase in weight magnitude occurs even though earlier results show a reduction in the Frobenius norm of individual adapters. This indicates that the orthogonality constraint does not simply suppress adapter activations. Instead, it encourages adapters to occupy more distinct directions in parameter space, leading to improved subspace separation.

\noindent These findings support the claim that the proposed orthogonality loss promotes genuine representational disentanglement between adapters, rather than acting as a magnitude-reducing regularizer.


\subsection{{Computational Cost Analysis}}
\label{app:computational_cost}

This section summarizes the computational requirements of the proposed method, including training time, inference time, and hardware usage.

\noindent All experiments were conducted on NVIDIA L40 GPUs (48\,GB memory). Backbone training was performed in two stages. In Stage~1, the DiT backbone was trained for approximately one week using four L40 GPUs. In Stage~2, training was continued for approximately two additional days on four L40 GPUs. After Stage~2, the backbone parameters were frozen for all subsequent experiments.

\noindent The DiT-XL/2 backbone contains approximately 682M parameters. During adapter training, we optimized only the LoRA parameters, with each adapter comprising 6.2M parameters (approximately 0.91\% of the backbone). Each LoRA adapter was trained for 150 epochs with a batch size of 25 on $256 \times 256$ nodule-centered patches. Training a single adapter required approximately 10 hours on four L40 GPUs. Jointly training two adapters with the proposed orthogonality constraint required approximately 20--30 hours on four L40 GPUs, depending on the characteristic pairing and dataset size.

\noindent Inference was performed with 250 DDIM sampling steps, with an average runtime of approximately 10 seconds per sample. Since the backbone remains frozen at inference time, the computational cost is dominated by diffusion sampling rather than adapter-specific operations.

\noindent  Overall, the proposed approach remains computationally practical, providing parameter-efficient adaptation and manageable inference cost relative to full backbone fine-tuning.

\subsection{{Extending Orthogonality Loss beyond two characteristics}} 
The orthogonality-based merging objective is not restricted to two attributes; it generalizes directly to $K$ adapters by enforcing orthogonality across all selected LoRA updates (e.g., pairwise across the set), so in principle the same framework can synthesize nodules with three or more characteristics simultaneously.
\begin{equation}
\mathcal{L}_{\mathrm{ortho}}=\sum_{i\ne j}\lVert W_i^{\top}W_j\rVert_F^2
\end{equation}

We did not include compositions of more than two characteristics, or the corresponding downstream augmentation experiments, in this submission for two practical reasons. First, our available datasets do not contain sufficiently dense co-occurrence annotations (i.e., reliable labels for multiple attributes on the same nodule) to form training/evaluation splits that would support a fair quantitative study of higher-order compositions. Second, as the number of characteristics increases, many combinations become rare or clinically incompatible in practice, leading to very small sample sizes and unstable estimates for both synthesis evaluation and downstream detection benchmarking.

\subsection{{Detection Gains Plateauing with More Synthetic Data}} 
 We hypothesize two reasons for this plateau:
 \begin{itemize}
 \item Even if images look realistic, large batches of synthetic samples can be less diverse than real (mode concentration around common textures/borders), so additional samples add less new information than expected, leading to saturation.

 \item After some point, adding more synthetic nodules can over-emphasize the synthetic distribution relative to the real one, so the detector starts fitting synthetic-specific cues instead of generalizable ones.
 \end{itemize}


\end{document}
