\begin{figure}[t]
\centering
\setlength{\tabcolsep}{1pt} % very small horizontal spacing
\renewcommand{\arraystretch}{2} % row-height multiplier: 2 doubles the default row spacing
\begin{tabular}{ccccc}
\scriptsize \textbf{Ground Truth} & \scriptsize \textbf{ACGAN} & 
\scriptsize \textbf{ReACGAN} & 
\scriptsize \textbf{CR-Fill} & 
\scriptsize \textbf{Ours} \\[-3pt]
\includegraphics[width=0.19\linewidth]{cxr14_gan_comparison_images/og_patch.png} &
\includegraphics[width=0.19\linewidth]{cxr14_gan_comparison_images/acgan.png} &
\includegraphics[width=0.19\linewidth]{cxr14_gan_comparison_images/reacgan.png} &
\includegraphics[width=0.19\linewidth]{cxr14_gan_comparison_images/crfill.png} &
\includegraphics[width=0.19\linewidth]{cxr14_gan_comparison_images/diffusion.png} \\
\end{tabular}
\vspace{-10pt}
\caption{Comparison of nodule generations across different methods: ACGAN~\citep{odena2017acgan}, ReACGAN~\citep{lee2021reacgan}, CR-Fill~\citep{zhao2021crfill}, and Ours.}
\label{fig:existing methods comparison}
\end{figure}
\section{Introduction}
\label{sec:intro}
Lung cancer remains a leading cause of cancer mortality \citep{lc_1}. 
Early detection is critical, as localized tumors can reach five-year survival 
rates above 70\%~\citep{seer_csr_1975_2014}. Despite advances in imaging, many cases are 
first suspected on chest X-rays (CXR), one of the most common tests 
\citep{Rogers2010}; accurate nodule detection on CXR is therefore essential. 
Pulmonary nodules exhibit key characteristics such as size, calcification, 
border definition, and homogeneity that are essential for malignancy assessment 
\citep{albert2009solitary}. Robust CAD requires datasets spanning these traits, 
yet high-quality annotated CXRs are scarce, and manual labeling is labor-intensive, 
costly, and subject to intra-observer variability \citep{lc_5}. Synthetic augmentation can expand training 
data and improve detectors \citep{schultheiss2021, hanaoka2024, wang2022synthetic}, 
but current nodule synthesis or inpainting approaches lack fine-grained control over nodule characteristics. Given the high CXR miss 
rate, precise and clinically faithful control of nodule attributes is needed.

\noindent Diffusion models offer strong potential for precise generative control, outperforming GANs in image quality, stability, and diversity \citep{dhariwal2021diffusion, yang2022survey}. Among them, Diffusion Transformers (DiT) \citep{peebles-2023} achieve superior performance compared to large UNet-based architectures such as SDXL \citep{podell2023sdxl}. Controllability in diffusion models has advanced through lightweight conditioning mechanisms, including ControlNet \citep{zhang2023controlnet} and LoRA-based methods \citep{hu2021lora, lora-diffusion-2023, lyu2023lowrank}, which steer generation toward specific attributes without full-model retraining. Concept Sliders \citep{gandikota-2023} further enable continuous and compositional control by learning semantic directions in latent activation space. 

\noindent In our work, we adopt DiT as our backbone and condition it on binary masks to control nodule shape, size, and location. After training the backbone, we attach separate LoRA modules \citep{hu2021lora} for four radiological attributes---calcification, border definition (regular/irregular), homogeneity, and subtlety---each trained on a characteristic-specific subset to capture fine-grained distinctions without affecting general nodule synthesis. Since nodules often exhibit multiple attributes, we explore LoRA combination strategies such as LoRA-Switch \citep{kong2024loraswitchboostingefficiencydynamic}, linear merging \citep{prabhakar2024lorasoupsmergingloras}, and the training-based fusion method ZipLoRA \citep{shah2023ziplorasubjectstyleeffectively}. We find these methods limited by spatial competition and interference from non-orthogonal adapter weights. To address this, we propose a training-based merging strategy with a Frobenius-norm penalty that encourages orthogonality across LoRA matrices.

\noindent The proposed framework (i) introduces a novel diffusion-based approach for generating synthetic lung nodules on CXRs, (ii) enables controllable synthesis of key radiological characteristics through characteristic-specific and merged LoRA adapters, and (iii) is validated through extensive experiments on both in-house and public datasets, demonstrating consistent improvements over existing methods, including downstream detection gains with AUCs of 0.90 on JSRT and 0.93 on ChestX-ray14, further supported by radiologist evaluations. The code will be released at:
\url{https://github.com/shreshthasingh00/Nodule-Crafter-Diffusion-driven-Nodule-synthesis-on-CXR/}

\section{Related Work}

\textbf{Lung Nodule Synthesis:}
Early work synthesized nodules by forward-projecting CT-derived annotations onto radiographs \citep{schultheiss2021, litjens-2010, behrendt2023systematic}. Other methods generate nodules directly in masked CXR regions using inpainting \citep{sogancioglu2018chestxrayinpaintingdeep} or feature-level blending \citep{gundel-2021}. GAN-based approaches \citep{shen-2022} enable factorized control over shape, size, and texture but lack fine-grained characteristic manipulation. To the best of our knowledge, no prior work has investigated diffusion models for fine-grained, controllable nodule synthesis in CXRs.


\noindent \textbf{Controllable Image Generation:}
Early controllability in diffusion models was achieved through classifier and classifier-free guidance \citep{dhariwal2021diffusion, ho2022classifierfreediffusionguidance}, followed by lightweight conditioning modules such as T2I-Adapters and ControlNet \citep{mou2023t2iadapter, zhang2023controlnet}. Editing and personalization methods, including prompt-to-prompt and DreamBooth \citep{hertz2022prompt, ruiz2023dreambooth}, enabled localized and concept-specific control.
Among these, LoRA \citep{hu2021lora} has emerged as a dominant mechanism for controllable generation, as it enables efficient low-rank fine-tuning and allows for composability of adapter modules. % that can inject and combine specific attributes at inference, making it well suited for fine-grained characteristic control as pursued in this work.

\noindent \textbf{LoRA Merging:} Merging multiple LoRA adapters remains challenging, with works such as ZipLoRA, Mix-of-Show, and K-LoRA \citep{shah2023ziplorasubjectstyleeffectively, gu2023mixofshowdecentralizedlowrankadaptation, ouyang2025kloraunlockingtrainingfreefusion} showing that naive fusion leads to concept conflicts, loss of identity, and attenuation of fine details. Several approaches aim to mitigate these issues: DO-Merging \citep{zheng2025decoupleorthogonalizedatafreeframework} enforces layer-wise orthogonalization of LoRA directions, LoRI \citep{zhang2025lorireducingcrosstaskinterference} reduces cross-task interference via sparse masking and frozen projections, and ZipLoRA further introduces trainable merger coefficients to balance layer-wise adapter contributions. These methods attempt to resolve conflicts after the adapters have been trained independently. In contrast, our method integrates a Frobenius-norm-based orthogonality loss directly into the training of each characteristic-specific adapter, ensuring that the learned LoRAs are inherently compatible for merging. 




\section{Datasets}





\noindent \textbf{Nodule Characteristics Definitions:} 
Pulmonary nodules exhibit several radiological attributes important for distinguishing them from mimickers and assessing malignancy. Their size ranges from a few millimeters up to 3~cm. Calcification, arising from calcium deposits, is frequently associated with benignity \citep{lc_7}. Border definition reflects edge morphology: regular, well-defined margins are typically stable, whereas irregular, spiculated, or lobulated borders may indicate malignancy \citep{lc_8}. Homogeneity describes texture uniformity; homogeneous nodules show consistent intensity, while inhomogeneous ones exhibit variation due to necrosis or vascularity, features often linked to malignant processes \citep{Balagurunathan2019}. Perceptual subtlety is also critical, as nodules may be faint or obscured by ribs and vessels, making detection challenging.




\noindent \textbf{In-house and Public Datasets: }
Our in-house dataset comprises 1.2M frontal-view CXRs from partner hospitals, including 40k chest X-rays with pulmonary nodules. Each nodule is delineated with shape annotations and labeled for calcification (7,875), regular border (10,424), irregular border (5,153), homogeneous texture (4,640), inhomogeneous texture (5,883), and subtlety (5,000 cases graded 1--5), with all annotations independently provided by three experienced radiologists. We further split the in-house training set into separate subsets for training the generative model and for training the downstream detector.
For evaluation, we additionally use the public \textit{ChestX-ray14} \citep{kufel-2023} and \textit{JSRT} \citep{shiraishi1996computer} datasets, where JSRT also includes subtlety scores (1 = most subtle, 5 = most obvious) for nodules. Both public datasets provide nodule bounding boxes; we refine these using our segmentation model trained on the in-house data and align predictions with the provided boxes. All selected segmentations were reviewed by radiologists for consistency. Statistics for all datasets are provided in Table~\ref{tab:test-dataset}. % Details on  the key radiological characteristics of pulmonary nodules is  given in Appendix 4
\begin{table}[htbp]
  \centering
  \caption{Summary of datasets used for training and evaluation.}
  \label{tab:test-dataset}

  \scriptsize
  \setlength{\tabcolsep}{6pt}
  \renewcommand{\arraystretch}{0.9}

  \begin{tabular}{@{} l ccc c c @{}}
    \toprule
      & \multicolumn{3}{c}{\textbf{In-house}} 
      & \multicolumn{1}{c}{\textbf{JSRT}}
      & \multicolumn{1}{c}{\textbf{ChestX-ray14}} \\
    \cmidrule(lr){2-4}

      & \textit{Diffusion Trainset} 
      & \textit{Downstream Trainset} 
      & \textit{Testset} 
      & \textit{Testset}
      & \textit{Testset} \\

    \midrule
    \textbf{Total samples}  
      & 1{,}100{,}000 
      & 100{,}000 
      & 12{,}000 
      & 247 
      & 500 \\

    \textbf{Nodule samples} 
      & 28{,}000 
      & 10{,}000  
      & 2{,}000  
      & 154 
      & 66 \\
    \bottomrule
  \end{tabular}
\end{table}

\begin{figure}[t]
\centering
\begin{tabular}{ccc}

{\scriptsize (a) Original patch} & {\scriptsize (b) Nodule mask} & {\scriptsize (c) Generated patch} \\[2pt]
\includegraphics[width=0.15\linewidth]{diffusion_baseline_gen/og_patch.png} &
\includegraphics[width=0.15\linewidth]{diffusion_baseline_gen/mask.png} &
\includegraphics[width=0.15\linewidth]{diffusion_baseline_gen/generated.png} \\[-1pt]
\end{tabular}
\caption{Diffusion backbone generation on a chest X-ray patch: given an original CXR patch and a binary nodule mask, the model generates a nodule within the masked region.}
\label{fig:Diffusion baseline generation}
\end{figure}

\begin{figure*}[t]
    \centering
    \includegraphics[width=\textwidth]{Diffusion_LoRA_final.png}
    % \caption{Overview of the proposed diffusion-based nodule synthesis framework. The pipeline comprises three main training phases. \textbf{Stage 1} (Base Model Training): The DiT-XL/2 backbone is pre-trained on 2 million chest X-ray (CXR) patches. \textbf{Stage 2} (Nodule Patch Fine-tuning): The base model is fine-tuned on nodule-centered patches using binary nodule masks as spatial conditioning to enable localized nodule synthesis. \textbf{Stage 3.1} (Characteristic-Specific LoRA Training): For each radiological attribute (e.g., calcification, homogeneity, regularity and subtlety), individual LoRA adapters are trained while keeping the backbone frozen, using characteristic-curated datasets. \textbf{Stage 3.2} (LoRA Merge Training): Multiple characteristic-specific LoRAs are jointly trained with an orthogonality regularization loss to mitigate interference and enable compositional multi-attribute control. \textbf{LoRA DiT Architecture}: LoRA modules are inserted into the attention layers of DiT blocks, allowing efficient adapter-based fine-tuning while keeping the backbone frozen}
    \caption{Overview of the pipeline with characteristic control. Training proceeds in three stages: \textbf{Stage 1 (Base Model Training)}---pre-train DiT-XL/2 on 2M CXR patches; \textbf{Stage 2 (Nodule Patch Fine-tuning)}---fine-tune on nodule-centered patches with binary nodule masks for localized synthesis; \textbf{Stage 3.1 (Characteristic-Specific LoRA Training)}---train individual LoRA adapters (calcification, homogeneity, border regularity, subtlety) with the backbone frozen, using characteristic-curated datasets; \textbf{Stage 3.2 (LoRA Merge Training)}---jointly train selected adapters with an orthogonality regularizer.}
    \label{fig:your_label}
\end{figure*}




