% This is a modified version of Springer's LNCS template suitable for anonymized MICCAI 2025 main conference submissions. 
% Original file: samplepaper.tex, a sample chapter demonstrating the LLNCS macro package for Springer Computer Science proceedings; Version 2.21 of 2022/01/12

\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
% T1 fonts will be used to generate the final print and online PDFs,
% so please use T1 fonts in your manuscript whenever possible.
% Other font encodings may result in incorrect characters.
%
\usepackage{graphicx,verbatim}
% Used for displaying a sample figure. If possible, figure files should
% be included in EPS format.
%
% If you use the hyperref package, please uncomment the following two lines
% to display URLs in blue roman font according to Springer's eBook style:
% \usepackage{color}
% \renewcommand\UrlFont{\color{blue}\rmfamily}
%\urlstyle{rm}
%
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{color}
\usepackage[colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue,]{hyperref}
\usepackage{bbding}
\usepackage[misc]{ifsym}

\begin{document}
%
% \title{Structure-aware Latent Diffusion for High-fidelity Any-to-Any Brain Modality Synthesis}
% \titlerunning{Structure-aware Latent Diffusion for Any-to-Any Brain Modality Synthesis}
\title{A Unified Latent Diffusion for High-Fidelity Any-to-Any Brain Modality Synthesis}
\titlerunning{Unified Latent Diffusion for Any-to-Any Brain Modality Synthesis}
% If the paper title is too long for the running head, you can set
% an abbreviated paper title here
%
% \begin{comment}  %% Removed for anonymized MICCAI submission
\author{Yulong Dou\inst{1}\textsuperscript{*} \and
Guo Chen\inst{1}\textsuperscript{*} \and
Chenfan Xu\inst{1} \and
Yulin Wang\inst{1} \and
Zhe Xu\inst{2} \and \\
Zhiming Cui\inst{1} \and
Dinggang Shen\inst{1,3,4}\textsuperscript{(\Letter)}
}

%
\authorrunning{Y. Dou et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{School of Biomedical Engineering \& State Key Laboratory of Advanced Medical Materials and Devices, ShanghaiTech University, Shanghai, China \and
Data Science Institute, Columbia University, New York, USA \and
Shanghai United Imaging Intelligence Co., Ltd., Shanghai, China \and
Shanghai Clinical Research and Trial Center, Shanghai, China \\
\email{Dinggang.Shen@gmail.com}}

% \end{comment}

% \author{Anonymized Authors}  %% Added for anonymized MICCAI submission
% \authorrunning{Anonymized Author et al.}
% \institute{Anonymized Affiliations \\
%     \email{email@anonymized.com}}
  
\maketitle              % typeset the header of the contribution

\begingroup
\renewcommand{\thefootnote}{*}
\footnotetext[1]{indicates equal contribution.}
\endgroup

%
\begin{abstract}
Comprehensive neurological diagnosis relies on the synergy of multimodal data from MRI, CT, and PET to capture distinct anatomical and pathological markers. However, clinical reality often results in incomplete imaging suites due to time constraints, high costs, or patient contraindications. Although cross-modality synthesis offers a potential solution, existing methods are limited by their reliance on strictly paired training data. This constraint prevents them from utilizing abundant single-modality data and from learning the distinct characteristics of each imaging domain, resulting in limited generative fidelity. To address these challenges, we present BrainDiff, a unified latent diffusion framework for flexible any-to-any brain modality synthesis. Our method employs a universal VQ-VAE to project all modalities into a shared latent space, where a prompt-guided diffusion model synthesizes corresponding images conditioned on descriptive prompts (e.g., patient attributes, imaging parameters). During cross-modality synthesis, a ControlNet module is incorporated to provide guidance that preserves the anatomical structure of the source-modality image. Extensive multi-institutional experiments demonstrate that BrainDiff achieves state-of-the-art performance, delivering high-fidelity and anatomically consistent results for any-to-any generation. Code is available at \url{https://github.com/douyl/BrainDiff}.



\keywords{Brain generation \and Cross-modal synthesis \and Diffusion model.}
% Authors must provide keywords and are not allowed to remove this Keyword section.

\end{abstract}
%
%
%
\section{Introduction}
Neuroimaging modalities such as Magnetic Resonance Imaging (MRI), Computed Tomography (CT), and Positron Emission Tomography (PET) provide complementary anatomical and functional views of the same subject, essential for neurological diagnosis. However, acquiring a complete multimodal imaging suite for each patient is often infeasible due to cost, scan time, radiation exposure, and modality-specific contraindications (e.g., contrast agent intolerance). Thus, missing modalities are common in clinical practice, motivating the development of cross-modality synthesis techniques to recover unavailable scans. 

Despite recent progress, existing brain image synthesis remains limited by fragmented modality coverage, underuse of large-scale single-modality data, and inadequate volumetric modeling. Many approaches focus on domain-specific tasks, including PET tracer conversion or denoising \cite{shen2025positron,zhou2025generalizable} and inter-contrast MRI synthesis \cite{wang2025toward,xiong2025learning}, restricting scalability across heterogeneous MRI, CT, and PET domains. More flexible frameworks support many-to-one or many-to-many synthesis within predefined MRI modalities, but often operate slice-wise without modeling volumetric coherence \cite{zhang2025structure,xiao2024fgc2f}. Recent 3D methods either rely primarily on affine feature modulation to bridge substantial cross-domain appearance differences \cite{wang2025unisyn} or remain restricted to a small set of modalities \cite{zhou2026adaptive}. These limitations motivate a unified 3D framework that can learn modality-specific distributions from large-scale single-modality data while promoting subject-specific anatomy during cross-modality synthesis.

To address this gap, we propose BrainDiff, a unified latent diffusion framework for high-fidelity any-to-any brain modality synthesis. BrainDiff learns a shared generative space across heterogeneous brain imaging modalities while controlling target appearance through descriptive prompts and anatomical guidance. A universal 3D VQ-VAE \cite{Oord2017NeuralDR} first projects diverse brain modalities into a shared latent space for efficient, high-fidelity volumetric compression. A prompt-guided latent diffusion model then incorporates patient attributes and imaging parameters via cross-attention to learn modality-specific distributions within a unified architecture. We further introduce a source-guided ControlNet \cite{zhang2023adding} that injects spatial information from the source modality to promote subject-specific anatomical consistency during synthesis. This design supports arbitrary source-target modality pairs while maintaining high structural fidelity. Experiments on multi-institutional MRI, CT, and PET datasets show that BrainDiff consistently outperforms state-of-the-art methods quantitatively and qualitatively.



\section{Method}
% As illustrated in Fig.~\ref{structure}, our BrainDiff framework integrates three synergistic components: (a) a universal VQ-VAE that compresses heterogeneous 3D volumetric data into a shared latent space; (b) a prompt-guided latent diffusion model that synthesizes modality-specific feature distributions guided by descriptive prompts, enabling it to faithfully capture both fine-grained appearance and structural patterns inherent of each modality; and (c) a structure-aware ControlNet module which promotes anatomical consistency by incorporating spatial guidance from the source modality during cross-modality synthesis.

As shown in Fig.~\ref{structure}, BrainDiff combines three components: (a) a universal VQ-VAE for shared 3D latent representation; (b) a prompt-guided latent diffusion model for target-modality generation; and (c) a source-guided ControlNet for anatomical consistency.


\begin{figure}[!t]
\includegraphics[width=\textwidth]{pipeline.png}
\caption{Overview of our proposed BrainDiff framework.} \label{structure}
\end{figure}

\subsection{Universal VQ-VAE for Latent Compression}
% To facilitate efficient synthesis of high-dimensional medical volumes, we employ a 3D VQ-VAE that learns a discrete latent representation shared across different modalities. Given the computational constraint associated with processing 3D volumetric data, we adopt a patch-based training strategy while supporting full-volume inference. Let $\mathbf{X} \in \mathbb{R}^{H \times W \times D}$ denote the original volumetric data. During training, we randomly sample sub-volumes $\mathbf{x} \in \mathbb{R}^{h \times w \times d}$ as input. The framework comprises an encoder $\mathcal{E}$, a decoder $\mathcal{G}$, and a learnable codebook $\mathcal{C} = \{ \mathbf{e}_k \in \mathbb{R}^C \}_{k=1}^{K}$, where $K$ represents the codebook size and $C$ denotes the embedding dimension.
% newly added
To facilitate efficient synthesis of high-dimensional medical volumes, we employ a 3D VQ-VAE to compress images into a compact latent space and learn a discrete representation shared across different modalities.

Given the computational constraints associated with processing 3D volumetric data, we adopt a patch-based training strategy while supporting full-volume inference. Let $\mathbf{X} \in \mathbb{R}^{H \times W \times D}$ denote the original volumetric data. During training, we randomly sample sub-volumes $\mathbf{x} \in \mathbb{R}^{h \times w \times d}$ as input. The framework comprises an encoder $\mathcal{E}$, a decoder $\mathcal{G}$, and a learnable codebook $\mathcal{C} = \{ \mathbf{e}_k \in \mathbb{R}^C \}_{k=1}^{K}$, where $K$ represents the codebook size and $C$ denotes the embedding dimension. 
% newly added
Each codebook vector $\mathbf{e}_k$ represents a learnable latent prototype, and vector quantization maps continuous latent features to their closest prototypes to obtain discrete semantic representations.

During quantization, each continuous latent vector is replaced by its closest prototype, which converts the continuous latent representation into discrete semantic tokens.
The encoder $\mathcal{E}$ first maps the input patch $\mathbf{x}$ to a continuous latent feature map, denoted as $\mathbf{z} = \mathcal{E}(\mathbf{x}) \in \mathbb{R}^{h' \times w' \times d' \times C}$, where the spatial dimensions are downsampled by a factor of 8 (i.e., $h' = h/8$). Subsequently, an element-wise quantization is performed by replacing each feature vector $\mathbf{z}_{i}$ with its nearest neighbor from the codebook $\mathcal{C}$, yielding the quantized latent $\hat{\mathbf{z}}_{i}$:
\begin{equation}
\hat{\mathbf{z}}_{i} = \mathbf{e}_k, \quad \text{where } k = \arg\min_{j} \left\| \mathbf{z}_i - \mathbf{e}_j \right\|_2.
\end{equation}
The decoder $\mathcal{G}$ then reconstructs the volumetric patch from the quantized representation, producing $\hat{\mathbf{x}} = \mathcal{G}(\hat{\mathbf{z}})$. 
% newly added
This discrete latent space provides a structured and compact representation for subsequent diffusion modeling, reducing the complexity of learning high-dimensional volumetric distributions.

To ensure high-fidelity reconstruction and perceptual realism, the model is optimized with a composite objective that integrates reconstruction, perceptual, quantization, and adversarial losses. The total training objective is expressed as:
\begin{equation}
\mathcal{L}_{\text{AE}} = \lambda_{\text{rec}}\mathcal{L}_{\text{rec}} + \lambda_{\text{perc}} \mathcal{L}_{\text{perc}} + \lambda_{\text{vq}} \mathcal{L}_{\text{vq}} + \lambda_{\text{adv}} \mathcal{L}_{\text{adv}}.
\end{equation}
Specifically, $\mathcal{L}_{\text{rec}} = \| \mathbf{x} - \hat{\mathbf{x}} \|_2$ promotes voxel-wise consistency during reconstruction. $\mathcal{L}_{\text{perc}}$ is employed to maintain high perceptual quality by measuring the semantic similarity between the input and reconstructed volumes \cite{johnson2016perceptual}. The quantization loss $\mathcal{L}_{\text{vq}} = \| \text{sg}[\mathbf{z}] - \hat{\mathbf{z}} \|_2 + \beta \| \mathbf{z} - \text{sg}[\hat{\mathbf{z}}] \|_2$ combines codebook and commitment terms, where $\text{sg}[\cdot]$ denotes the stop-gradient operator and $\beta$ balances the commitment \cite{Oord2017NeuralDR}. Finally, a patch-based discriminator $\mathcal{D}$ computes the adversarial loss 
$\mathcal{L}_{\text{adv}} = - [\log \mathcal{D}(\mathbf{x}) + \log(1 - \mathcal{D}(\hat{\mathbf{x}}))]$,
encouraging the generated textures to be indistinguishable from real anatomical structures \cite{goodfellow2020generative}.

During inference, our model processes the full-resolution volume $\mathbf{X}$ to obtain a latent map $\mathbf{Z} \in \mathbb{R}^{H' \times W' \times D' \times C}$, which is then quantized to $\hat{\mathbf{Z}}$ via vector quantization. The decoder subsequently reconstructs the full-resolution output $\hat{\mathbf{X}}$ from the quantized representation, providing an efficient and consistent foundation for subsequent generation in the latent space.




\subsection{Prompt-guided Latent Diffusion for Unified Modality Synthesis}
Based on the shared discrete latent space established by VQ-VAE, we then train a conditional diffusion model \cite{wang20253d} to synthesize modality-specific latent representations guided by descriptive prompts (e.g., patient attributes, imaging parameters). This formulation enables a single model to capture the distributions of multiple modalities while leveraging textual metadata to steer generation toward desired imaging characteristics.

The input volume $\mathbf{X}$ is first mapped to a latent representation $\mathbf{Z} = \mathcal{E}(\mathbf{X})$. To model the forward diffusion process, we progressively inject Gaussian noise into $\mathbf{Z}$ over $T$ timesteps as follows:
\begin{equation}
q(\mathbf{Z}_{1:T} | \mathbf{Z}) = \prod_{t=1}^T q(\mathbf{Z}_t | \mathbf{Z}_{t-1}), 
~\text{where}~ q(\mathbf{Z}_t | \mathbf{Z}_{t-1}) = \mathcal{N}(\mathbf{Z}_t; \sqrt{1-\beta_t} \mathbf{Z}_{t-1}, \beta_t \mathbf{I}),
\end{equation}
where $\sqrt{1-\beta_t} \mathbf{Z}_{t-1}$ is the mean and $\beta_t \mathbf{I}$ is the covariance of the distribution at timestep $t \in \{1, 2, \dots, T\}$, $\mathbf{I}$ denotes the identity matrix, and $\beta_t \in (0, 1)$ is a noise-level hyper-parameter. Using the reparameterization trick, $\mathbf{Z}_t$ can be directly sampled from the clean latent $\mathbf{Z}$ as:
\begin{equation}
\mathbf{Z}_t = \sqrt{\bar{\alpha}_t}\mathbf{Z} + \sqrt{1-\bar{\alpha}_t}\boldsymbol{\epsilon},~ \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I}), ~\text{where}~ \alpha_t = 1-\beta_t ~\text{and}~ \bar{\alpha}_t = \prod_{i=1}^{t}\alpha_i.
\end{equation}

The reverse denoising process is parameterized by a network which predicts the injected noise at each timestep $t$, conditioned on descriptive prompts. The text embedding $\mathbf{c}$ is extracted using a pre-trained BERT encoder \cite{devlin2019bert}, encoding textual metadata such as subject information, modality type, and acquisition parameters. These textual features are injected into the denoising network via cross-attention layers \cite{vaswani2017attention}, enabling semantic alignment between latent representations and modality-specific guidance. The reverse process is defined as:
\begin{equation}
p_{\theta}(\mathbf{Z}_{t-1} | \mathbf{Z}_t, \mathbf{c}) = \mathcal{N}(\mathbf{Z}_{t-1}; \boldsymbol{\mu}_{\theta}(\mathbf{Z}_t, t, \mathbf{c}),\sigma_t^2 \mathbf{I}),
\end{equation}
where $\sigma_t^2$ is the variance of the reverse step and $\boldsymbol{\mu}_{\theta}(\mathbf{Z}_t, t, \mathbf{c})$ is the predicted mean at time $t$, which can be expressed as:
\begin{equation}
\boldsymbol{\mu}_{\theta}(\mathbf{Z}_t, t, \mathbf{c}) = \frac{1}{\sqrt{\alpha_t}}\left(\mathbf{Z}_t - \frac{\beta_t}{\sqrt{1-\bar{\alpha}_t}}\boldsymbol{\epsilon}_{\theta}(\mathbf{Z}_t, t, \mathbf{c})\right).
\end{equation}
Following standard diffusion formulations, the training objective is to optimize a noise estimator $\boldsymbol{\epsilon}_{\theta}$ to predict the added Gaussian noise $\boldsymbol{\epsilon}$ via the loss function:
\begin{equation}
\mathcal{L}_{\text{DIFF}} = \mathbb{E}_{\mathbf{Z},\boldsymbol{\epsilon},t,\mathbf{c}}\left[\left\|\boldsymbol{\epsilon}-\boldsymbol{\epsilon}_\theta(\mathbf{Z}_t,t,\mathbf{c})\right\|_2\right].
\end{equation}

To maintain fine-grained details and global structural consistency, we adopt the BiFlowNet \cite{wang20253d} architecture as our denoising backbone, which integrates both intra-patch and inter-patch flows by combining DiT blocks \cite{peebles2023scalable} and 3D UNet blocks \cite{ronneberger2015u} to generate the final latent representations.



\subsection{Source-guided ControlNet for Anatomical Consistency}
Since the anatomical structure of a given subject remains consistent across modalities, we incorporate a ControlNet \cite{zhang2023adding} branch into the prompt-guided latent diffusion model \cite{Rombach_2022_CVPR} to further promote structural coherence. In this design, the parameters of the pre-trained diffusion model are frozen, and a trainable copy of its encoder is introduced as a separate branch. This cloned encoder accepts the latent representation of the source-modality volume, denoted as $\mathbf{Z}_{\text{src}}$, and injects the extracted features into the corresponding layers of the frozen denoising network. These conditional latent features of the source modality are incorporated through a series of convolution layers initialized with zero weights, which ensures that the additional structural guidance does not disturb the pre-trained diffusion behavior during early training \cite{zhang2023adding}.

Formally, the denoising process now conditions on both prompt embedding $\mathbf{c}$ and source latent $\mathbf{Z}_{\text{src}}$. The noise estimator becomes $\boldsymbol{\epsilon}_\theta(\mathbf{Z}_t, t, \mathbf{c}, \mathbf{Z}_{\text{src}})$, where $\mathbf{Z}_t$ is the noisy latent of target modality at timestep $t$. The fine-tuning objective for the ControlNet branch is defined as:
\begin{equation}
\mathcal{L}_{\text{CTRL}} = \mathbb{E}_{\mathbf{Z}, \boldsymbol{\epsilon}, t, \mathbf{c}, \mathbf{Z}_{\text{src}}} \left[ \left\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_\theta(\mathbf{Z}_t, t, \mathbf{c}, \mathbf{Z}_{\text{src}}) \right\|_2 \right],
\end{equation}
where $\mathbf{Z}$ is the latent representation of the target-modality volume, $\boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ is the noise added during the forward diffusion process, and $\mathbf{Z}_t = \sqrt{\bar{\alpha}_t} \mathbf{Z} + \sqrt{1 - \bar{\alpha}_t} \boldsymbol{\epsilon}$. The source latent $\mathbf{Z}_{\text{src}}$ is obtained by encoding the source-modality volume with the same frozen VQ-VAE encoder $\mathcal{E}$ used for the target modality. After iterative denoising, the final target latent $\mathbf{Z}_0$ is quantized and then passed through the VQ-VAE decoder $\mathcal{G}$ to generate the target-modality volume $\hat{\mathbf{X}}_{\text{tgt}}$.


By conditioning on source-modality anatomy, ControlNet guides denoising toward the prompt-specified target modality while promoting local details and global structure. Fine-tuning is confined to the ControlNet branch, while the pre-trained diffusion backbone remains fixed to retain its general synthesis capability.

\section{Experiments and Results}
\subsection{Dataset Description}
\textbf{Huashan} dataset contains 1,002 participants for the Alzheimer's Disease study, recruited from Universal Medical Clinic and Shanghai Sixth People's Hospital, China. After preprocessing (e.g., skull stripping \cite{hoopes2022synthstrip} and modality registration \cite{tustison2021antsx}), the available scans include: T1-weighted (T1w) MRI (n=843), CT (n=872), Florbetapir (AV45)-PET (n=603), Fluorodeoxyglucose (FDG)-PET (n=582), T2-weighted (T2w) MRI (n=835), and Fluid-attenuated Inversion Recovery (FLAIR) MRI (n=803). This yields 4,538 single-modality scans for diffusion model and 22,494 intra-subject modality pairs for ControlNet. \\
% \textbf{***} dataset contains 1,002 participants for the Alzheimer's Disease study, recruited from *** Hospital. After preprocessing (e.g., skull stripping \cite{hoopes2022synthstrip} and modality registration \cite{tustison2021antsx}), the available scans include: T1-weighted (T1w) MRI (n=843), CT (n=872), Florbetapir (AV45)-PET (n=603), Fluorodeoxyglucose (FDG)-PET (n=582), T2-weighted (T2w) MRI (n=835), and Fluid-attenuated Inversion Recovery (FLAIR) MRI (n=803). This yields 4,538 single-modality scans for diffusion training and 22,494 intra-subject modality pairs for ControlNet training. \\
\textbf{ADNI2} dataset comprises longitudinal data from over 1,000 participants across USA and Canada as part of the Alzheimer's Disease Neuroimaging Initiative. After preprocessing, the available scans are: T1w (n=2,616), T2w (n=1,228), T2 Star (T2*) MRI (n=2,220), FLAIR (n=1,616), AV45-PET (n=716), FDG-PET (n=719), and Proton Density-weighted (PD) MRI (n=222) \cite{mueller2005alzheimer}. Similarly, there are 9,337 single-modality scans and 48,856 modality pairs. \\
\textbf{OASIS3} dataset comprises data from 1,098 participants at Washington University Knight Alzheimer Disease Research Center, USA. After preprocessing, scans include: T1w (n=2,558), T2w (n=1,467), T2* (n=1,669), FLAIR (n=1,171), CT (n=1,003), AV45-PET (n=738), FDG-PET (n=123), Pittsburgh Compound B (PIB)-PET (n=109), and Susceptibility-weighted Imaging (SWI) MRI (n=533) \cite{lamontagne2019oasis}. This yields 9,371 single-modality scans and 25,200 modality pairs.\\
All datasets are split at the participant level into training, validation, and testing sets at an 8:1:1 ratio, with scans from the same participant confined to one split.


\subsection{Implementation Details}
Models were trained on four NVIDIA A100 GPUs (80 GB). Following \cite{wang20253d}, the VQ-VAE uses a codebook of 8192 8-d codes, with loss weights \(\lambda_{\text{rec}}=4\), \(\lambda_{\text{perc}}=2\), \(\lambda_{\text{adv}}=2\), \(\lambda_{\text{vq}}=0.25\). It operates on \(96^3\) patches at \(8^3\) compression for 100 epochs (batch size 96, lr \(3\times10^{-4}\)).
The latent diffusion model follows the architecture in \cite{wang20253d} with a cosine noise schedule ($T=1000$). Full-volume resolution is \(256\times192\times256\). The base diffusion model is trained for 1000 epochs (batch size 48, lr \(1\times10^{-4}\)), while ControlNet is trained for 3000 epochs (batch size 64, lr \(1\times10^{-5}\)).
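% NOTE(review): this sentence previously stated a subject-level 8:2 train/test split, which conflicts with the 8:1:1 train/validation/test split given in the Dataset Description; confirm the actual ratio.
The same subject-level data split is used consistently across all three modules (VQ-VAE, latent diffusion model, and ControlNet).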

\begin{table*}[!t]
\centering
% \fontsize{8}{\baselineskip}\selectfont
\fontsize{8}{9.6}\selectfont
\setlength{\tabcolsep}{4pt}
\caption{Quantitative comparison across different cross-modality synthesis tasks. Best metrics are marked in bold, second best underlined; * denotes $p < 0.05$ (paired t-test).}
\begin{tabular}{l c c c c c}
\toprule
\multirow{2}{*}{} & \multirow{2}{*}{} 
&  PTNet~\cite{zhang2022ptnet3d} &  TUMSyn~\cite{wang2025toward} &  UniSyn~\cite{wang2025unisyn} &  Ours \\
\midrule
\multirow{2}{*}{CT$\rightarrow$AV45-PET} 
& PSNR & 26.28$\pm$1.28* & 26.57$\pm$2.22* & \underline{27.26$\pm$3.15}* & \textbf{28.59$\pm$1.49} \\
& SSIM & \underline{0.938$\pm$0.020}* & 0.931$\pm$0.025* & 0.937$\pm$0.026* & \textbf{0.947$\pm$0.016}\\
\midrule
\multirow{2}{*}{FDG-PET$\rightarrow$T1w} 
& PSNR & 21.98$\pm$0.99* &  22.75$\pm$0.83* &  \underline{23.21$\pm$0.97}* & \textbf{24.56$\pm$0.88}\\
& SSIM & 0.882$\pm$0.034*  & 0.879$\pm$0.032* & \underline{0.893$\pm$0.034}* & \textbf{0.904$\pm$0.031}\\
\midrule
\multirow{2}{*}{T1w$\rightarrow$T2w} 
& PSNR & 23.92$\pm$1.69* & 24.11$\pm$1.54* & \underline{26.54$\pm$1.27}* & \textbf{26.99$\pm$1.59} \\
& SSIM & 0.908$\pm$0.017* & 0.915$\pm$0.029* & \underline{0.947$\pm$0.016}* & \textbf{0.949$\pm$0.017} \\
\midrule
\multirow{2}{*}{T1w$\rightarrow$FLAIR} 
& PSNR & 24.81$\pm$0.56* & 25.00$\pm$1.43* & \underline{25.58$\pm$0.32}* & \textbf{27.85$\pm$0.23} \\
& SSIM & 0.912$\pm$0.017* & 0.915$\pm$0.028* & \underline{0.944$\pm$0.007}\phantom{*} & \textbf{0.945$\pm$0.006} \\
\midrule
\multirow{2}{*}{T2w$\rightarrow$CT} 
& PSNR & 24.22$\pm$1.58* & \textbf{24.27$\pm$2.97}\phantom{*} & 23.60$\pm$2.28* & \underline{24.26$\pm$1.56}\\
& SSIM & 0.878$\pm$0.023* & 0.882$\pm$0.034* & \underline{0.886$\pm$0.038}* & \textbf{0.888$\pm$0.039} \\
\bottomrule
\end{tabular}
\label{tab:quantitative_results}
\end{table*}

\begin{figure}[!t]
\includegraphics[width=\textwidth]{result.png}
% NOTE(review): the Qualitative Analysis text lists five modality pairs for panel (I), while this caption says four tasks; confirm against the figure.
\caption{(I) Qualitative comparison of four cross-modality tasks across different methods. (II) Results of ablation study. The baseline uses a one-hot target-modality label with a paired training strategy. Our full model replaces the one-hot label with BERT-based prompt encoding and incorporates a source-guided ControlNet for anatomical consistency. The table on the left indicates whether each component is enabled.} \label{qualitative}
\end{figure}

\begin{figure}[!t]
\includegraphics[width=\textwidth]{seg_and_result.png}
\caption{(a) Visualization of other modality pairs; each image alternately serves as source and GT, and $\hat{\cdot}$ denotes the synthesized output. (b) Quantitative comparison in downstream segmentation tasks, with IoU \& DSC (unit 1), ASD \& 95HD (voxel).}
\label{others}
\end{figure}


\subsection{Result Analysis}

\textbf{Quantitative Analysis.}
% Quantitative comparisons across five selected cross-modality synthesis tasks (see Table~\ref{tab:quantitative_results}) show that our method consistently outperforms existing methods. It achieves the highest SSIM \cite{wang2004image} in all tasks and the highest PSNR in four out of five tasks, with the second-best PSNR in the remaining one. Compared to the best-performing baseline, our method improves PSNR by 4.2\% with a slight gain in SSIM. Furthermore, the low variance across different experiments  demonstrates the robustness of our proposed method. \\
Quantitative results on five cross-modality synthesis tasks (Table~\ref{tab:quantitative_results}) show that our method consistently outperforms existing baselines, achieving the best SSIM \cite{wang2004image} on all five tasks and the best PSNR on four of them (second best on T2w$\rightarrow$CT). Compared with the strongest baseline, BrainDiff improves PSNR by 4.2\% with a slight SSIM gain, while showing lower variance across experiments, indicating robust synthesis performance. \\
\textbf{Qualitative Analysis.}
% We qualitatively compare different methods in Fig.~\ref{qualitative}(I) across modality pairs: T1w-to-CT, PET-to-T1w, CT-to-PET, T2w-to-FLAIR, and CT-to-T2w. 
% While PTNet produces the least texture detail, TUMSyn introduces sporadic grid-like artifacts (see Case 3) and unrealistic hallucinations. UniSyn improves overall consistency but tends to over-smooth textures and edges, limiting its ability to capture complex anatomical structures.
% In contrast, our method produces more realistic modality-specific appearances while better preserving modality-invariant anatomical structures from source modality via ControlNet. To further demonstrate our method's capability in generating less common modalities, such as PIB-PET, SWI, and PD, we present additional cross-modality synthesis results in Fig.~\ref{others}(a), highlighting its ability to achieve any-to-any modality synthesis. \\
We qualitatively compare different methods across five modality pairs in Fig.~\ref{qualitative}(I), including T1w-to-CT, PET-to-T1w, CT-to-PET, T2w-to-FLAIR, and CT-to-T2w. PTNet lacks fine texture details, TUMSyn introduces grid-like artifacts and hallucinations, and UniSyn tends to over-smooth anatomical boundaries. In contrast, our method generates more realistic modality-specific appearances while better maintaining source-image anatomical structures with ControlNet guidance. Additional results on less common modalities, including PIB-PET, SWI, and PD, are shown in Fig.~\ref{others}(a), further demonstrating BrainDiff's flexible any-to-any synthesis capability. \\
\textbf{Ablation Study.}
We perform ablation studies on the conditioning strategy. Firstly, replacing the BERT-based prompt with a one-hot modality label and removing fine-grained acquisition and patient information leads to weaker modeling of heterogeneous data. Secondly, replacing the unified ControlNet with paired source-target training reduces the breadth of any-to-any modality generation and compromises structural consistency across modality pairs. The results are shown in Fig.~\ref{qualitative}(II). Quantitatively, the BERT-based prompt encoder and ControlNet improve SSIM by 1.8\% and 2.4\% and PSNR by 1.9\% and 1.2\%, respectively, while combining both components yields larger gains of 5.8\% in SSIM and 4.8\% in PSNR over the baseline.
The improvement confirms the benefits of prompt-guided conditioning and ControlNet-based structural guidance. \\
\textbf{Downstream Performance.}
%To quantitatively assess anatomical consistency, we conduct a downstream brain tissue segmentation evaluation. Specifically, we employ a pre-trained Brain MRI segmentation network \cite{liu2025brainparc} to obtain gray matter, white matter, and cerebrospinal fluid masks for all synthesized images whose target modality is T1w, T2w, or FLAIR in the test set.
To assess anatomical consistency, we perform downstream brain tissue segmentation on synthesized MRI images using a pre-trained Brain MRI segmentation network \cite{liu2025brainparc}.
Gray matter, white matter, and cerebrospinal fluid masks from synthesized and GT images are compared using Intersection over Union (IoU), Dice Similarity Coefficient (DSC) \cite{crum2006generalized}, Average Surface Distance (ASD), and 95\% Hausdorff Distance (95HD) \cite{Taha2015MetricsFE}, with results averaged across tissues and cases.
% For each tissue, we compute the Intersection over Union (IoU), Dice Similarity Coefficient (DSC) \cite{1717643}, Average Surface Distance (ASD), and 95\% Hausdorff Distance (95HD) \cite{Taha2015MetricsFE} between the masks derived from the synthesized image and those from the corresponding GT image. These metrics are first averaged across the three tissues per case and then across all cases. 
As shown in Fig.~\ref{others}(b), our method consistently outperforms the two competing approaches on almost all metrics. Higher IoU and DSC indicate better region-wise overlap, while lower ASD and 95HD indicate improved boundary fidelity. The consistent improvement demonstrates that our method generates synthesized images with superior anatomical accuracy.





\section{Conclusion}
We present BrainDiff, a unified latent diffusion framework for any-to-any brain modality synthesis, whose strong performance and robustness arise from a universal VQ-VAE for compact volumetric encoding, a prompt-guided diffusion model that captures shared anatomical content while controlling modality-specific appearance, and a source-guided ControlNet that injects spatial guidance from the source modality to enhance anatomical consistency during cross-modality synthesis. Extensive experiments on multi-institutional datasets demonstrate that BrainDiff consistently outperforms existing approaches, offering a promising solution for addressing missing neuroimaging data in clinical practice.


\begin{credits}
\subsubsection{\ackname} 
This work was supported in part by National Natural Science Foundation of China (grant numbers 62131015, U23A20295, 82441023, 82394432), the China Ministry of Science and Technology (STI2030-Major Projects-2022ZD0213100), Key R\&D Program of Guangdong Province, China (grant number 2023B0303040001), and HPC Platform of ShanghaiTech University.
% A bold run-in heading in small font size at the end of the paper is used for general acknowledgments, for example: This study was funded by X (grant number Y).

\subsubsection{\discintname}
The authors have no competing interests to declare that are relevant to the content of this article.
\end{credits}


%
% ---- Bibliography ----
%
% BibTeX users should specify bibliography style 'splncs04'.
% References will then be sorted and formatted in the correct style.
%
\bibliographystyle{splncs04}
\bibliography{ref}
%
% \begin{thebibliography}{8}
% \bibitem{ref_article1}
% Author, F.: Article title. Journal \textbf{2}(5), 99--110 (2016)

% \bibitem{ref_lncs1}
% Author, F., Author, S.: Title of a proceedings paper. In: Editor,
% F., Editor, S. (eds.) CONFERENCE 2016, LNCS, vol. 9999, pp. 1--13.
% Springer, Heidelberg (2016). \doi{10.10007/1234567890}

% \bibitem{ref_book1}
% Author, F., Author, S., Author, T.: Book title. 2nd edn. Publisher,
% Location (1999)

% \bibitem{ref_proc1}
% Author, A.-B.: Contribution title. In: 9th International Proceedings
% on Proceedings, pp. 1--2. Publisher, Location (2010)

% \bibitem{ref_url1}
% LNCS Homepage, \url{http://www.springer.com/lncs}, last accessed 2023/10/25
% \end{thebibliography}

\end{document}

