\documentclass[runningheads]{llncs}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{xcolor}
\usepackage{colortbl}
\usepackage{array}
\usepackage{float}
\usepackage{svg}
\usepackage{marvosym}
\usepackage{color}
\usepackage{hyperref}
\renewcommand\UrlFont{\color{blue}\rmfamily}
\urlstyle{rm}
\definecolor{darkred}{RGB}{139,0,0}
\begin{document}
\title{Generation of Breast Tumors' Shear Wave Elastography Images from Corresponding Ultrasound Images with US2SWEdiff}
\titlerunning{US2SWEdiff}
\author{Jiaming Huang\inst{1}\textsuperscript{(\Letter)} \and
Kunpeng Qiu\inst{2} \and
Shuihua Wang\inst{3} \and
Feifei Liu\inst{4} \and
\mbox{Kun-Hsing Yu}\inst{1} \and
Christopher P. Bridge\inst{1}\textsuperscript{(\Letter)}}
\authorrunning{J. Huang et al.}
\institute{Harvard University, Boston, MA, USA \and
National University of Singapore, Singapore, Singapore\and
Xi'an Jiaotong-Liverpool University, Suzhou, China \and
Binzhou Medical University Affiliated Hospital, Binzhou, China\\
\email{jiaming\_huang@hsph.harvard.edu, kun-hsing\_yu@hms.harvard.edu, cbridge@mgh.harvard.edu}}
\maketitle

\begin{abstract}
Shear Wave Elastography (SWE) imaging provides tumor stiffness information beyond conventional ultrasound (US) imaging for breast malignancy assessment. Unfortunately, the acquisition of SWE images is costly and labor-intensive, leading to severe data scarcity that hinders the training of discriminative models. Despite our best efforts, we were only able to collect around 550 SWE images. Fortunately, recent advances in diffusion-based generative models offer a promising solution to this limitation. However, synthesizing high-quality and high-fidelity SWE images from such limited data remains a challenge, as naively applying existing methods often leads to unsatisfactory results. To address this challenge, we propose \textbf{US2SWEdiff}, a ControlNet-based framework for generating realistic SWE images from corresponding US images. Using the generated synthetic datasets, we conduct comprehensive downstream experiments with evaluation on an external multi-center dataset. Results consistently demonstrate the superiority of US2SWEdiff over existing baselines, yielding a 10.220\% improvement in AUROC and establishing a new state-of-the-art method for SWE image synthesis. Additionally, to facilitate future research, we release synthetic SWE image datasets generated from three public US datasets, BUSI, BUS-BRA, and Breast-Lesions-USG. Code is available at \url{https://github.com/Jiaming21/US2SWEdiff}.

\keywords{Shear Wave Elastography \and Diffusion Model \and ControlNet \and Breast Tumor \and Image Synthesis.}
\end{abstract}
\section{Introduction}
\label{sec:introduction}

Breast cancer is the most common cancer among women in the United States \cite{cdc_breast_stats_2025,acs_breast_stats_2026} and the second leading cause of cancer-related death in this population \cite{cdc_breast_stats_2025,acs_breast_stats_2026}. Early screening plays a decisive role in breast cancer treatment \cite{who_early_diagnosis_2025}. Benefiting from rapid advances in deep learning, computer-aided diagnosis systems have achieved remarkable success in breast tumor analysis, including lesion detection~\cite{yap2018automated}, segmentation~\cite{ronneberger2015unet}, and malignancy classification~\cite{shen2021ultrasound}, substantially improving the efficiency and accuracy of breast cancer screening. However, the majority of existing deep learning approaches are developed on conventional ultrasound (US) images, owing to the wide availability, low cost, and non-invasive nature of US imaging.

Although conventional US images aid breast tumor analysis with anatomical and morphological information, they lack quantitative measurements of tissue stiffness, an important biomarker for characterizing breast lesions, as malignant tumors tend to be stiffer than benign lesions \cite{youk_2017,tay_2022}. As a result, US-based diagnosis often suffers from relatively high misclassification rates when differentiating benign from malignant tumors \cite{sood_2019,youk_2017}. In contrast, Shear Wave Elastography (SWE) tracks the propagation speed of mechanically induced shear waves, estimates the corresponding Young's modulus, which is indicative of tissue elasticity \cite{youk_2017}, and finally visualizes it as an elasticity map, providing valuable stiffness information for both clinical assessment and machine learning research.

Despite the demonstrated value of SWE, deep learning techniques based on SWE images remain underexplored compared with those based on conventional US images due to data scarcity. One major reason is the relatively high cost and limited availability of SWE devices capable of high-quality elasticity map acquisition. Moreover, even if sufficient high-quality SWE data are collected, they are inherently difficult to share due to patient privacy regulations and institutional restrictions. Furthermore, potential solutions such as aggregating SWE data from multiple hospitals are hindered by substantial domain discrepancies among images, arising from the lack of standardization in color-bar configurations across imaging devices. Consequently, obtaining large-scale, high-quality, and standardized SWE datasets remains challenging, which has become a major bottleneck for the development of deep learning methods based on SWE images.

Fortunately, the remarkable success of diffusion-based generative models offers a promising solution to the SWE data scarcity challenge. However, as breast tumors present unique lesion texture and elasticity distributions, naively adapting existing image generation models to SWE synthesis often produces suboptimal results. Therefore, we propose US2SWEdiff, a ControlNet-based framework for generating high-quality and high-fidelity SWE images from corresponding US images.

\textbf{Our contributions} are as follows:

(1)~We propose US2SWEdiff, the first model for synthesizing SWE images from corresponding US images.

(2)~We improve ControlNet with a novel mHC-enhanced Dense Hint Input (mHC-DHI) module for enhanced structural conditioning and faster convergence.

(3)~Comprehensive downstream experiments demonstrate the superiority of US2SWEdiff over existing baselines, yielding a 10.220\% improvement in AUROC and establishing a new state-of-the-art method for SWE image synthesis.

(4)~To the best of our knowledge, there is currently no publicly available SWE dataset. Therefore, we release synthetic SWE image datasets generated by US2SWEdiff using three public US datasets: BUSI~\cite{aldhabyani_2020_busi}, BUS-BRA~\cite{gomezflores_2024_busbra}, and Breast-Lesions-USG~\cite{pawlowska_2024_breastlesionsusg}, which are available in our GitHub repository.

(5)~We develop a Gradio interface for US2SWEdiff with two modes supporting both malignancy-aware and malignancy-free SWE image generation, along with the \textit{SWEBreCA-Pred} web server for SWE image-based breast tumor malignancy prediction, both available through the code repository. \textbf{(\textcolor{darkred}{Data Augmentation Only:} synthetic SWE images do not reflect true tumor stiffness; do not substitute them for genuine elastography in cancer prediction.)}

\section{Methods}
\label{sec:methods}

\begin{figure}[t]
    \centering
    \includegraphics[width=0.95\textwidth]{figs_final/fig1_model.pdf}
    \caption{Model architecture of our proposed method, US2SWEdiff.}
    \label{fig:model}
\end{figure}

Starting from an initial latent noise $z_T \sim \mathcal{N}(0,I)$, US2SWEdiff learns a conditional denoising model $\epsilon_{\theta}(z_t,t,c_{\mathrm{prompt}},c_{\mathrm{structure}})$ to estimate the noise at each diffusion timestep $t$. By iteratively removing the predicted noise from $z_t$, the model gradually reconstructs the latent representation of the target SWE image, which is then decoded into the final synthesized SWE image. As shown in Fig.~\ref{fig:model}, US2SWEdiff comprises three key components: (1) prompt guidance indicative of tumor malignancy; (2) the mHC-DHI module for enhanced structural guidance; and (3) the dispersive loss to mitigate feature collapse and improve training stability.

\subsection{Prompt Guidance}

The prompt (``a photo of a benign/malignant breast tumor.'') is first encoded
into token embeddings by a frozen CLIP text encoder. These embeddings are
then fed into the spatial transformer blocks to update the corresponding
in-flow feature maps through multi-head cross-attention, where the query $Q$
is derived from the corresponding in-flow feature maps while the key $K$ and
value $V$ are derived from the token embeddings (see bottom-left panel of
Fig.~\ref{fig:model}). This mechanism explicitly injects tumor-malignancy
information into the SWE generation process, enabling US2SWEdiff to jointly
learn the characteristics of benign and malignant tumors within a single
model and to perform malignancy-consistent synthesis during inference.
However, since prompt adherence is imperfect and the competing baselines do
not incorporate malignancy-aware prompts, we additionally include
US2SWEdiff$^{sep}$, trained separately on benign and malignant cases, to
ensure a fair comparison.

\subsection{mHC-DHI Module}

ControlNet~\cite{zhang_2023_controlnet} injects structural guidance through a
Hint Input (HI) module while preserving the generative capability of the
pre-trained Stable Diffusion backbone. Building on the Dense Hint Input (DHI)
module of Qiu et al.~\cite{qiu_2025_siamese_diffusion}, which improves image
quality and accelerates convergence, we propose an mHC-enhanced DHI (mHC-DHI)
module (Fig.~\ref{fig:model}, right) that incorporates Manifold-Constrained
Hyper-Connections (mHC)~\cite{xie_2025_mhc}. The US image and its Laplacian
edge map are first fused as
$X_{\mathrm{fuse}} = G \odot X_{\mathrm{US}} + (1-G) \odot X_{\mathrm{Lap}}$,
where $G=\sigma(\mathrm{Conv}_{1\times1}([X_{\mathrm{US}},X_{\mathrm{Lap}}]))$.
The fused image is split into $N_p=\tfrac{H}{8}\times\tfrac{W}{8}$
non-overlapping $8\times8$ patches, linearly projected into $C$-dimensional
tokens ($C{=}80$) with learnable positional embeddings, processed by a Conv1D
along the token axis (kernel size 3), and replicated once into $n=4$ parallel
streams $x_0 \in \mathbb{R}^{N_p\times n\times C}$. Three consecutive mHC
blocks follow, each applying two successive hyper-connections that wrap the
multi-head attention (MHA) and FFN sublayers, respectively:
$x_{l+1} = (\mathcal{H}^{\mathrm{post}}_{l})^{\top}
f_l(\mathcal{H}^{\mathrm{pre}}_{l}\, x_{l}) +
\mathcal{H}^{\mathrm{res}}_{l}\, x_{l}$, $f_l \in \{\mathrm{MHA},
\mathrm{FFN}\}$. All coefficients are predicted per token from the
RMS-normalized streams $\tilde{x}_{l}$ and kept on bounded sets:
$\mathcal{H}^{\mathrm{pre}}_{l} =
\sigma(\alpha^{\mathrm{pre}}_{l}(\tilde{x}_{l}\varphi^{\mathrm{pre}}_{l})
+ b^{\mathrm{pre}}_{l}) \in (0,1)^{n}$,
$\mathcal{H}^{\mathrm{post}}_{l} \in (0,2)^{n}$ analogously via
$2\sigma(\cdot)$, and $\mathcal{H}^{\mathrm{res}}_{l}$ is projected onto the
manifold of doubly stochastic matrices by the Sinkhorn--Knopp algorithm,
keeping inter-stream residual mixing norm-preserving across depth. Finally,
the streams are averaged, reshaped to the
$\tfrac{H}{8}\times\tfrac{W}{8}$ grid, and passed through a zero-initialized
$3\times3$ convolution, yielding the guided hint
($320\times\tfrac{H}{8}\times\tfrac{W}{8}$) that is added to the output of
ControlNet's first convolution.

\subsection{Dispersive Loss}

To mitigate feature collapse and improve training stability, we introduce a
dispersive loss~\cite{wang2025diffuse}, instantiated as the mean absolute
pairwise cosine similarity of pooled ControlNet features, applied to the
output feature maps after the zero-initialized convolutional layers (see
upper-left panel of Fig.~\ref{fig:model}). For the $k$-th ControlNet block,
let $f_i^{(k)}$ denote the pooled feature vector of the $i$-th sample in a
mini-batch of size $B$ (set to $15$ in our experiments), obtained by global
average pooling over the corresponding feature map, and let
$\hat{f}_i^{(k)} = f_i^{(k)} / \lVert f_i^{(k)} \rVert_2$ denote its
$\ell_2$-normalized version. The layer-wise dispersive loss is defined as
$\mathcal{L}_{\mathrm{disp}}^{(k)} = \frac{1}{B(B-1)} \sum_{i \ne j}
\big| \cos\!\big( \hat{f}_i^{(k)}, \hat{f}_j^{(k)} \big) \big|$, where
minimizing this objective penalizes high absolute cosine similarity between
samples and therefore encourages feature orthogonality within the mini-batch.
The dispersive loss is applied to the first $K$ (set to $8$ in our
experiments) ControlNet blocks, and the overall dispersive loss is computed
as $\mathcal{L}_{\mathrm{disp}} = \frac{1}{K} \sum_{k=1}^{K}
\mathcal{L}_{\mathrm{disp}}^{(k)}$. The final training objective is then
defined as $\mathcal{L}_{\mathrm{total}} = \mathcal{L}_{\mathrm{diffusion}} +
w \cdot \mathcal{L}_{\mathrm{disp}}$, where $w$ (set to $0.05$ in our
experiments) denotes the dispersive-loss weight and controls the contribution
of the dispersive regularization term.

\section{Experiments}
\label{sec:experiments}

\noindent\textbf{Datasets and preprocessing.}
To train the SWE image-generation model, we collected 554 diagnostic SWE reports from patients with confirmed breast tumor diagnoses. All collected cases underwent pathological examination (the clinical gold standard for malignancy determination), which confirmed 329 benign and 225 malignant cases. Paired US--SWE images were extracted and resized to $256\times256$ pixels. For structural conditioning, each US image was smoothed with a Gaussian filter prior to Laplacian edge map extraction to suppress speckle artifacts. The dataset was split into 424 training cases (264 benign and 160 malignant) and 130 testing cases (65 benign and 65 malignant), maintaining a 4:1 train--test split; malignant training cases were randomly oversampled to achieve a 1:1 benign-to-malignant ratio. Both US2SWEdiff$^{sep}$ and US2SWEdiff are trained for 4,000 steps.

\noindent\textbf{Baselines.} For image quality comparison baselines, we use (1) pix2pix \cite{isola_2017}, (2) pix2pixHD \cite{wang_2018_pix2pixhd}, (3) BBDM \cite{li2023bbdm}, (4) BBDM-MICCAI \cite{choo_2024_ct2mri}, a BBDM for CT-to-MRI translation, and (5) DBIM \cite{zheng2024dbim}. All baselines used default settings.

\noindent\textbf{Downstream classification setting.}
We build our classifier based on an \mbox{Inception-v3} model~\cite{szegedy_2016_inceptionv3}. To evaluate the downstream benefits of each method, we first train the classifier under one \emph{Real SWE-only} setting (using all available real SWE images from the original training and test sets) and multiple \emph{Real SWE + Synthetic SWE} settings, where synthetic SWE images generated from the original test set by each method are combined with all available real SWE images. All trained classifiers are then directly evaluated on our independently collected external multi-center test set with 59 benign and 143 malignant cases (see each case's data source in our GitHub repository), which is processed with the same preprocessing pipeline as described above. The classification performance under different settings is subsequently compared to assess the downstream benefit of synthetic SWE images generated by different methods. All classification models are trained for up to 10,000 steps with early stopping, using a batch size of 32, AdamW with a weight decay of $1\times10^{-4}$, a learning rate of $1\times10^{-4}$, and cross-entropy loss.

\noindent\textbf{Implementation details.} PyTorch is used as the deep learning framework. Pretrained weights for Stable Diffusion v1.5 \cite{rombach_2022_ldm} and the CLIP ViT-L/14 \cite{radford_2021_clip} model can be accessed via Hugging Face. Model training was conducted on NVIDIA H200 GPUs.

\noindent\textbf{Evaluation.} On the test set, the synthetic SWE images are evaluated against the corresponding real SWE images using three categories of metrics: (1) distribution-level metrics, including FID$\downarrow$ \cite{heusel_2017_fid}, KID$\downarrow$ \cite{binkowski_2018_kid}, and CMMD$\downarrow$ \cite{jayasumana_2024_cmmd}; (2) per-image metrics, including SSIM$\uparrow$ \cite{wang_2004_ssim} and PSNR$\uparrow$ \cite{huynhthu_2008_psnr} for pixel-level fidelity, together with LPIPS$\downarrow$ \cite{zhang_2018_lpips} for perceptual similarity; and (3) semantic consistency metrics, including CLIP-I$\uparrow$ and CLIP-T$\uparrow$ \cite{radford_2021_clip}. Classification performance is evaluated using Sn (\%), Sp (\%), ACC (\%), F1, MCC, AUROC, and $\Delta$AUROC (relative to the \emph{Real SWE-only} setting). Results are reported as the mean $\pm$ SD over five independent runs. For each method, predictions from the five runs are further ensembled by averaging the predicted probabilities for each test sample. Based on these ensembled predictions, the 95\% confidence intervals (CIs) for AUROC and $\Delta$AUROC are estimated using bootstrap resampling with 10,000 iterations to assess the general performance on unseen external test sets. Statistical significance is evaluated using a two-sided paired bootstrap test, where the $p$-value is computed as the proportion of bootstrap $\Delta$AUROC values crossing zero.


\section{Results}
\label{sec:results}
\subsection{Image Quality Comparison}

\textbf{Quantitative Evaluation:} As shown in Table~\ref{tab:image-quality-comparison} and Fig.~\ref{fig:tsne-all-methods}, in the malignancy-separately-trained setting, US2SWEdiff$^{sep}$ outperforms most baselines in terms of FID, KID, CMMD, and CLIP-I, indicative of closer distributional distance and better image-level semantic consistency, with only DBIM achieving comparable performance. However, US2SWEdiff$^{sep}$ substantially outperforms DBIM in terms of SSIM, PSNR, and LPIPS, exhibiting better structural fidelity and perceptual similarity to real SWE images. Furthermore, when evaluated separately by malignancy category, our method achieves notably better FID and KID scores than DBIM on malignant cases (see clearer evidence in the qualitative evaluation), which are of greater clinical importance, further supporting the use of our ControlNet-based scheme for this task. Benefiting from ControlNet's prompt guidance mechanism, US2SWEdiff jointly learns the characteristics of both benign and malignant SWE images within a single model, whereas the competing baselines require separate models for different malignancy categories. As shown in Table~\ref{tab:image-quality-comparison}, US2SWEdiff maintains competitive distribution-level performance, achieving favorable FID, KID, CMMD, and CLIP-I/T scores. Although our method obtains worse SSIM, PSNR, and LPIPS scores (i.e., lower SSIM and PSNR and higher LPIPS), this is expected because prompt guidance is inherently imperfect and may occasionally generate inaccurate local patterns (see the typical example in the qualitative analysis). Such isolated failure cases are likely to affect per-image metrics disproportionately, whereas distribution-level metrics are less sensitive to these local errors and better reflect the overall quality of the generated images.


\begin{table}[H]
  \centering
  \small
  \caption{Comparison of image quality of synthetic SWE images generated by different methods, evaluated using FID~\cite{heusel_2017_fid}, KID~\cite{binkowski_2018_kid}, CMMD~\cite{jayasumana_2024_cmmd}, SSIM~\cite{wang_2004_ssim}, PSNR~\cite{huynhthu_2008_psnr}, LPIPS~\cite{zhang_2018_lpips}, CLIP-I~\cite{radford_2021_clip}, and CLIP-T~\cite{radford_2021_clip}.}
  \label{tab:image-quality-comparison}
  \resizebox{0.90\linewidth}{!}{
  \begin{tabular}{l|c|ccc|ccc|cc}
    \toprule
    Model & $n_{\text{training pairs}}$ & FID$\downarrow$ & KID$\downarrow$ & CMMD$\downarrow$ & SSIM$\uparrow$ & PSNR$\uparrow$ & LPIPS$\downarrow$ & CLIP-I$\uparrow$ & CLIP-T$\uparrow$ \\
    \midrule
    pix2pix     & $264\times2=528$ & 97.697 & 0.055 & 0.137 & 0.424 & 17.663 & 0.364 & 0.857 & 0.179 \\
    pix2pixHD   & $264\times2=528$ & 110.490 & 0.076 & 0.130 & 0.452 & 18.411 & 0.367 & 0.848 & 0.185 \\
    BBDM        & $264\times2=528$ & 123.642 & 0.072 & 0.096 & 0.488 & 16.155 & 0.442 & 0.879 & 0.185 \\
    BBDM-MICCAI & $264\times2=528$ & 189.839 & 0.114 & 0.112 & 0.484 & 16.378 & 0.461 & 0.870 & 0.191 \\
    \textbf{DBIM}        & $\mathbf{264\times2=528}$ & \textbf{61.829} & \textcolor{darkred}{\textbf{0.010}} & \textcolor{darkred}{\textbf{0.083}} & \textbf{0.584} & \textbf{19.212} & \textbf{0.298} & \textcolor{darkred}{\textbf{0.913}} & \textcolor{darkred}{\textbf{0.198}} \\
    \rowcolor{gray!20}
    --- Benign & $264\times2=528$ & 63.567 & 0.007 & 0.085 & 0.654 & 22.008 & 0.172 & 0.912 & 0.207 \\
    \rowcolor{gray!20}
    --- Malignant & $264\times2=528$ & \underline{86.956} & \underline{0.026} & \textcolor{darkred}{\underline{0.112}} & \underline{0.513} & \underline{16.416} & \underline{0.425} & \textcolor{darkred}{\underline{0.914}} & \textcolor{darkred}{\underline{0.189}} \\
    \textbf{US2SWEdiff$^{sep}$} & $\mathbf{264\times2=528}$ & \textcolor{darkred}{\textbf{61.489}} & \textbf{0.018} & \textbf{0.098} & \textcolor{darkred}{\textbf{0.681}} & \textcolor{darkred}{\textbf{24.477}} & \textcolor{darkred}{\textbf{0.121}} & \textbf{0.898} & \textbf{0.186} \\
    \rowcolor{gray!20}
    --- Benign & $264\times2=528$ & 64.790 & 0.018 & 0.102 & 0.746 & 26.381 & 0.103 & 0.886 & 0.190 \\
    \rowcolor{gray!20}
    --- Malignant & $264\times2=528$ & \textcolor{darkred}{\underline{74.456}} & \textcolor{darkred}{\underline{0.023}} & \underline{0.119} & \textcolor{darkred}{\underline{0.615}} & \textcolor{darkred}{\underline{22.573}} & \textcolor{darkred}{\underline{0.139}} & \underline{0.911} & \underline{0.185} \\
    \textbf{US2SWEdiff (Ours)} & $\mathbf{264\times2=528}$ & \textbf{82.237} & \textbf{0.027} & \textbf{0.108} & \textbf{0.388} & \textbf{16.186} & \textbf{0.390} & \textbf{0.894} & \textbf{0.189} \\
    \bottomrule
  \end{tabular}
  }
\end{table}

\begin{figure}[t]
  \centering
  \includegraphics[width=\textwidth]{figs_final/fig2_tsne_all_methods_grid.pdf}
  \caption{t-SNE visualizations of the feature distributions extracted by ResNet-50 \cite{he_2016_resnet} from synthetic SWE images generated by different methods, compared with those of real SWE images under a unified axis scale.}
  \label{fig:tsne-all-methods}
\end{figure}

\textbf{Qualitative Evaluation:} Fig.~\ref{fig:comparision_page_selected} presents representative SWE images (three benign and three malignant cases) generated by each method, together with the corresponding real SWE images, US images, and Laplacian edge maps. The pix2pix and pix2pixHD baselines recover coarse structures but generate fragmented, powder-like elasticity patterns with unstable color distributions. BBDM produces indistinct tumor textures (e.g., Benign Case 3), while its malignant elasticity patterns are sparse and fragmented. BBDM-MICCAI improves tumor texture realism but often generates unrealistic homogeneous or ring-like elasticity patterns for malignant lesions. DBIM also struggles to generate realistic malignant elasticity patterns, which are frequently granular and fragmented. In contrast, US2SWEdiff$^{sep}$ produces more continuous elasticity maps that closely resemble real SWE images while preserving clearer and sharper tumor texture characteristics, thereby potentially facilitating more effective feature extraction for downstream machine learning tasks. Encouragingly, despite jointly modeling benign and malignant cases within a single model, US2SWEdiff still produces realistic malignant elasticity patterns with fine-grained tumor textures. Nevertheless, prompt guidance is inherently imperfect. As shown in Fig.~\ref{fig:comparision_page_selected} (Benign Case 2), the generated benign SWE image contains a localized malignant-like elasticity pattern. Such occasional failure cases explain the relatively lower per-image metrics in Table~\ref{tab:image-quality-comparison}.


\begin{figure}[t]
  \centering
  \includegraphics[width=0.95\textwidth]{figs_final/fig3_comparison_page_selected.png}
  \caption{(a)--(c) Three benign and three malignant examples of real SWE images, real US images, and Laplacian edge maps extracted from the corresponding real US images. (d)--(i) Corresponding examples of synthetic SWE images generated by each method.}
  \label{fig:comparision_page_selected}
\end{figure}

\subsection{Classification Performance Comparison}

Table~\ref{tab:downstream-realswe} shows that US2SWEdiff$^{sep}$ and US2SWEdiff achieve the largest AUROC improvements over the Real SWE-only baseline, with gains of 0.081 (+10.479\%) and 0.079 (+10.220\%), respectively. Both improvements are statistically significant ($p=0.001$ and $p=0.002$, respectively), demonstrating the effectiveness of our ControlNet-based framework.

\begin{table}[t]
  \centering
  \scriptsize
  \setlength{\tabcolsep}{2pt}
  \caption{Comparison of downstream breast malignancy classification performance on the collected multi-center SWE test set between Inception-v3~\cite{szegedy_2016_inceptionv3} classifiers trained on real SWE images alone and those trained on real SWE images augmented with synthetic SWE images generated by each method. Significance codes: ***$p\leq0.001$, **$p\leq0.01$, *$p\leq0.05$, and ns: not significant ($p>0.05$).}
  \label{tab:downstream-realswe}
  \resizebox{\linewidth}{!}{
  \begin{tabular}{l|c|c|c|c|c|c|c|c|c|c|c}
    \toprule
    Method & $n_{\text{training pairs}}$ & Sn (\%) & Sp (\%) & ACC (\%) & F1 & MCC & AUROC & AUROC 95\% CI & $\Delta$AUROC & $\Delta$AUROC 95\% CI & $p$-value \\
    \midrule
    Real SWE & $329\times2=658$ & $44.756 \pm 21.335$ & $90.850 \pm 8.181$ & $58.218 \pm 13.795$ & $0.577 \pm 0.209$ & $0.349 \pm 0.135$ & $0.773 \pm 0.043$ & [0.753, 0.874] & $0.000 \pm 0.000$ (REF) & [0.000, 0.000] (REF) & 1.000 (ns) (REF) \\
    pix2pix & $329\times2+65\times2=788$ & $73.006 \pm 25.008$ & $63.050 \pm 32.392$ & $70.100 \pm 9.051$ & $0.756 \pm 0.128$ & $0.391 \pm 0.053$ & $0.820 \pm 0.031$ & [0.794, 0.903] & $0.047 \pm 0.058$ & [-0.010, 0.081] & 0.122 (ns) \\
    pix2pixHD & $329\times2+65\times2=788$ & $83.216 \pm 20.585$ & $46.780 \pm 36.050$ & $72.576 \pm 6.490$ & $0.801 \pm 0.089$ & $0.341 \pm 0.143$ & $0.775 \pm 0.098$ & [0.790, 0.900] & $0.001 \pm 0.088$ & [-0.010, 0.074] & 0.136 (ns) \\
    BBDM & $329\times2+65\times2=788$ & $77.624 \pm 16.488$ & $58.306 \pm 24.404$ & $71.982 \pm 6.118$ & $0.790 \pm 0.077$ & $0.368 \pm 0.078$ & $0.791 \pm 0.027$ & [0.769, 0.891] & $0.018 \pm 0.046$ & [-0.030, 0.065] & 0.497 (ns) \\
    BBDM-MICCAI & $329\times2+65\times2=788$ & $79.300 \pm 20.544$ & $43.388 \pm 28.097$ & $68.812 \pm 6.847$ & $0.772 \pm 0.088$ & $0.266 \pm 0.050$ & $0.744 \pm 0.042$ & [0.749, 0.876] & $-0.029 \pm 0.054$ & [-0.050, 0.051] & 0.989 (ns) \\
    DBIM & $329\times2+65\times2=788$ & $78.184 \pm 13.571$ & $62.034 \pm 32.835$ & $73.466 \pm 2.536$ & $0.804 \pm 0.033$ & $0.400 \pm 0.135$ & $0.842 \pm 0.015$ & [0.812, 0.916] & $0.069 \pm 0.052$ & [0.007, 0.096] & 0.022 (*) \\
    US2SWEdiff$^{sep}$ & $329\times2+65\times2=788$ & $79.300 \pm 9.532$ & $73.562 \pm 14.660$ & $77.624 \pm 2.510$ & $0.832 \pm 0.033$ & $0.511 \pm 0.018$ & \textcolor{darkred}{$\mathbf{0.854 \pm 0.016}$} & [0.827, 0.926] & \textcolor{darkred}{$\mathbf{0.081 \pm 0.054}$} & [0.027, 0.104] & \textcolor{darkred}{\textbf{0.001 (***)}} \\
    \textbf{US2SWEdiff (Ours)} & $329\times2+65\times2=788$ & $59.022 \pm 18.351$ & $87.460 \pm 14.901$ & $67.326 \pm 9.938$ & $0.706 \pm 0.122$ & $0.443 \pm 0.094$ & \textcolor{darkred}{$\mathbf{0.852 \pm 0.019}$} & [0.821, 0.923] & \textcolor{darkred}{$\mathbf{0.079 \pm 0.052}$} & [0.020, 0.101] & \textcolor{darkred}{\textbf{0.002 (**)}} \\
    \bottomrule
  \end{tabular}
  }
\end{table}

\section{Conclusion}
\label{sec:conclusion}

In summary, we proposed US2SWEdiff, a ControlNet-based latent diffusion model for synthesizing breast tumor SWE images from corresponding US images. By incorporating prompt guidance and mHC-DHI-enhanced, dispersive loss-regularized structural guidance, US2SWEdiff achieves superior image synthesis quality and consistently improves downstream SWE-based malignancy classification, demonstrating its potential to alleviate the limited availability of SWE data.

\subsubsection*{Acknowledgments.} C.P.B. is supported by NIH grant R01EB033773. K.-H.Y. is supported in part by the National Institute of General Medical Sciences grant R35GM142879, the National Heart, Lung, and Blood Institute grant R01HL174679, the Department of Defense Peer Reviewed Cancer Research Program Career Development Award HT9425-23-1-0523, the Research Scholar Grant RSG-24-1253761-01-ESED (\href{https://doi.org/10.53354/ACS.RSG-24-1253761-01-ESED.pc.gr.193749}{DOI: 10.53354/ACS.RSG-24-1253761-01-ESED.pc.gr.193749}) from the American Cancer Society, and the Harvard Medical School Dean's Innovation Award. S.W. is supported by the Basic Research Program of Jiangsu under Grant BK20241815 and the XJTLU Development Fund RDF-23-02-004. F.L. is supported by the Shandong Provincial Natural Science Foundation (ZR2023QH231) and the Shandong Provincial Key Research and Development Program of Medicine and Health (202409021312).

\subsubsection*{Disclosure of Interests.} The authors have no competing interests to declare.
\bibliographystyle{splncs04}
\bibliography{references_final}
\end{document}