\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}

\usepackage{graphicx,verbatim}
\usepackage{subfigure}
\usepackage{amsmath}
\usepackage{color}
\usepackage{amssymb} 
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
\usepackage{marvosym}



\begin{document}
\title{Making sparse labels reliable: validity-gated ROI guidance for medical image generation with conditional latent diffusion model}

\author{
Chang Li\inst{1,2} \and
Xiang Zhao\inst{3} \and
John Moraros\inst{1} \and
Jia Meng\inst{1} \and
Jingxin Liu\inst{4} \and
Kexin Wang\inst{3} \and
Shuihua Wang\inst{1}\textsuperscript{(\Letter)}
}
%
%index{Li Chang, Zhao Xiang, Moraros John, Liu Jingxin, Wang Kexin, Wang Shuihua}
\authorrunning{C. Li et al.}
% First names are abbreviated in the running head.
% If there are more than two authors, 'et al.' is used.
%
\institute{Department of Biosciences and Bioinformatics \& Suzhou Municipal Key Lab of AI4Health, School of Science, Xi’an Jiaotong-Liverpool University, Suzhou, China\and
Department of Mathematical Sciences, School of Physical Sciences, University of Liverpool, Liverpool, United Kingdom \and
Department of General Surgery, Qilu Hospital of Shandong University, Jinan, China \and
School of AI and Advanced Computing, Xi’an Jiaotong-Liverpool University, Suzhou, China\\
\email{chang.li18@student.xjtlu.edu.cn; 202335815@mail.sdu.edu.cn; \{John.Moraros, Jia.Meng, Jingxin.Liu, Shuihua.Wang\}@xjtlu.edu.cn; wangkexin@qiluhospital.com}
}

\maketitle  
\begin{abstract}
Diffusion models have shown strong potential for medical image generation, yet their clinical applicability critically depends on the anatomical consistency of generated region-of-interest (ROI) areas, which are essential for supporting more accurate diagnosis. In real-world settings, ROI annotations are often sparse because they are time-consuming to produce, resulting in a partial-label problem. Although Classifier-Free Guidance (CFG) enhances conditional control, a global CFG scale applied uniformly across the image space can misguide unannotated regions, leading to spatial inconsistencies and degraded clinical reliability. In this work, we present VGR-Diff, a validity-gated ROI guidance latent diffusion framework that robustly leverages sparse ROI annotations for medical image generation. Our method integrates three key components: 2.5D biased slice sampling to enhance cross-slice contextual consistency, validity-gated ROI supervision during training to prevent erroneous guidance from missing labels, and ROI-aware CFG during inference to decouple global anatomical structure from local ROI constraints. Extensive experiments on a public brain MRI dataset (BraTS2023) and a private pelvic MRI–CT dataset from Qilu Hospital demonstrate that, under various annotation sparsity conditions, VGR-Diff consistently achieves superior ROI fidelity compared with state-of-the-art 2D and 3D generative models, while maintaining competitive global image quality. By minimizing dependence on dense annotations and ensuring anatomically consistent generation, VGR-Diff offers a practical and clinically viable solution for real-world diffusion-based medical image generation.
\keywords{diffusion model \and medical image generation \and partial-label problem.}
% Authors must provide keywords and are not allowed to remove this Keyword section.

\end{abstract}

\section{Introduction}
Modern clinical practice increasingly depends on multi-modal medical imaging to provide complementary anatomical and functional information for diagnosis \cite{zhang2025paired,qiu2025accurate}. However, in real-world scenarios, the target modality is often missing or of suboptimal quality due to cost, radiation exposure, or equipment heterogeneity \cite{bosch2023risk,hauptmann2023brain,kim2024adaptive}. This has motivated growing interest in medical image-to-image (I2I) generation, which aims to synthesize a missing modality from an available one using deep learning methods.

However, for medical image generation to be clinically useful and trusted by physicians, visual realism alone is insufficient \cite{han2023medgen3d}. Instead, the accuracy and consistency of lesions and critical anatomical structures, including their boundaries, textures, and cross-slice continuity, are of primary importance for diagnostic tasks such as tumor assessment \cite{guo2025unsupervised}. Precise region-of-interest (ROI) annotations can guide models toward anatomical accuracy, but in practice, such annotations are typically sparse and incomplete, as dense labeling is labor-intensive and costly \cite{konz2024anatomically}.

Therefore, a fundamental challenge arises: in medical imaging, the absence of annotation does not imply the absence of pathology—a phenomenon known as the partial-label problem \cite{tian2023partial}. Treating unlabeled regions as negative can introduce systematic bias, especially when using ROI masks as direct supervision or hard conditioning \cite{wang2025dealing,li2025expert}. Consequently, many existing generation methods—including GANs \cite{phan2023structure,phan2024structural}, VAEs \cite{rais2024medical}, and diffusion models \cite{choo2024slice,xiao2025deterministic}—often suffer from ROI degradation, including detail drift, illusory structures, and blurred boundaries. Although conditional latent diffusion models show strong potential in medical image generation, current conditioning strategies, such as classifier-free guidance (CFG), often focus on global guidance while overlooking the impact of sparse local annotations, thereby misleading the generation process \cite{jiang2023cola,li2025brain}. This suggests that the key challenge lies not in the diffusion backbone itself, but in how ROI information is incorporated under sparse annotation settings. As shown in Fig.~\ref{fig1}, state-of-the-art conditioning mechanisms in diffusion models fail to handle annotation sparsity, with performance dropping sharply as labels become sparser.
\begin{figure}[t]
\centering
\subfigure[]{
\includegraphics[width=0.45\textwidth]{figure/fig1a.png}
}
\hfill
\subfigure[]{
\includegraphics[width=0.45\textwidth]{figure/fig1b.png}
}
\caption{ROI (a) PSNR and (b) SSIM of different conditioning mechanisms under varying annotation sparsity. Different label ratios (\%) are simulated by randomly subsampling annotated slices from the BraTS2023 dataset.} \label{fig1}
\end{figure}

In this work, we propose VGR-Diff, a validity-gated ROI guidance latent diffusion model designed to robustly leverage sparse ROI annotations for medical image generation. A shared slice-wise validity indicator controls both ROI-weighted denoising supervision during training and ROI-specific spatial guidance during inference, preventing missing annotations from being interpreted as negative evidence. Moreover, our three-branch CFG formulation separates source-driven global guidance from the incremental ROI condition, allowing local ROI control to be applied without overwriting source-derived anatomical structure. The 2.5D biased sampler complements this formulation by increasing exposure to valid and adjacent slices while preserving global image coverage. We evaluate VGR-Diff on both a public brain dataset and a private pelvic dataset. Under partial-label conditions, our model achieves higher ROI-SSIM and ROI-PSNR than existing conditioning methods. This demonstrates our model’s cross-organ generalization and its ability to reduce reliance on dense manual annotations, substantially lowering clinician workload while preserving high image quality.

\section{Methodology}
\subsection{Revisiting Conditional Latent Diffusion Model (cLDM)}
Given a source image $\mathbf{y}$ and target image $\mathbf{x}$, cLDM \cite{rombach2022high} aims to model $p(\mathbf{x}|\mathbf{y})$ by learning a denoising process in a compressed latent space. The target image is first encoded by a VAE as $\mathbf{z}_0=\text{E}(\mathbf{x})$, and reconstructed via $\text{D}(\mathbf{z}_0)$.

The forward process adds Gaussian noise to $\mathbf{z}_0$ over $T$ steps:
\begin{equation}
 \mathbf{z}_t
    =
    \sqrt{\bar{\alpha}_t}\mathbf{z}_0
    +
    \sqrt{1-\bar{\alpha}_t}\boldsymbol{\epsilon},
    \qquad
    \boldsymbol{\epsilon}
    \sim
    \mathcal{N}(\mathbf{0},\mathbf{I}),
\end{equation}
where $\bar{\alpha}_t=\prod_{s=1}^{t}\alpha_s$ and $\alpha_s$ is set by the noise schedule. A conditional denoising network $\boldsymbol{\epsilon}_{\theta}$ is trained to predict $\boldsymbol{\epsilon}$ from the noisy latent $\mathbf{z}_t$, diffusion timestep $t$, and source condition $\mathbf{c}_{\mathrm{src}}$ by minimizing
\begin{equation}
\mathcal{L}_{\mathrm{diff}}
    =
    \mathbb{E}_{\mathbf{z}_0,\boldsymbol{\epsilon},t}
    \left[
    \left\|
    \boldsymbol{\epsilon}
    -
    \boldsymbol{\epsilon}_{\theta}
    \left(
    \mathbf{z}_t,
    t
    \mid
    \mathbf{c}_{\mathrm{src}}
    \right)
    \right\|_2^2
    \right].
\end{equation}

During inference, generation starts from noise $\mathbf{z}_T$ and applies the learned reverse process to obtain $\hat{\mathbf{z}}_0$, then decodes $\hat{\mathbf{x}}=\text{D}(\hat{\mathbf{z}}_0)$. To strengthen conditional control, CFG \cite{ho2022classifier} is commonly adopted by interpolating between unconditional and conditional predictions:
\begin{equation}
    \hat{\boldsymbol{\epsilon}}_\theta = \boldsymbol{\epsilon}_{\text{uncond}}+w(\boldsymbol{\epsilon}_{\text{cond}}-\boldsymbol{\epsilon}_{\text{uncond}}),
\end{equation}
where $w$ controls the guidance strength.

However, conventional CFG operates as a global guidance mechanism. In medical imaging, sparse ROI labels can bias this process and cause the partial-label problem, motivating the improved conditioning strategies in our VGR-Diff.

\begin{figure}[t]
\includegraphics[width=\textwidth]{figure/fig2.png}
\caption{Overview of VGR-Diff. \textbf{(A)} 2.5D biased slice sampling leverages slice-centered inputs with neighboring context and prioritizes annotated and adjacent slices. \textbf{(B)} Validity-gated ROI supervision activates ROI guidance only on valid slices to mitigate partial-label bias while enhancing structure fidelity. \textbf{(C)} During sampling, ROI-aware CFG combines full, source-only, and unconditional branches to balance global fidelity and ROI precision according to the ROI mask and its validity.} \label{fig2}
\end{figure}


\subsection{The VGR-Diff Framework}
\subsubsection{2.5D Dataset Construction with Biased Slice Sampling}

As shown in Fig.~\ref{fig2}(A), we construct a 2.5D dataset to incorporate through-plane anatomical context. Let
$\mathbf{y},\mathbf{x}\in\mathbb{R}^{D\times H\times W}$ denote paired source and target volumes. For center slice $i$, the source input is $
\mathbf{Y}_i
=
\operatorname{Concat}
\left(
\mathbf{y}_{i-1},
\mathbf{y}_i,
\mathbf{y}_{i+1}
\right)
\in
\mathbb{R}^{3\times H\times W}$, with target slice $\mathbf{x}_i\in\mathbb{R}^{1\times H\times W}$. For boundary slices, the nearest available slice is replicated.

For each slice, let $\mathbf{L}_i\in\{0,\ldots,C\}^{H\times W}$ denote the categorical ROI annotation, where $0$ indicates background. It is converted into a $C$-channel one-hot map 
$\mathbf{Q}_i\in\{0,1\}^{C\times H\times W}$ for ROI conditioning. We further define a class-agnostic binary union mask $
\mathbf{B}_i
=
\mathbb{I}\!\left(\mathbf{L}_i>0\right)
\in
\{0,1\}^{H\times W}$, which is used for spatial weighting. Thus, the categorical indices are not directly used as numerical weights.

The slice-wise validity indicator is defined as
\begin{equation}
v_i
=
\begin{cases}
1, & \text{if }
\displaystyle\sum_{p=1}^{H}\sum_{q=1}^{W}
\mathbf{B}_i(p,q)
\geq A_{\min},\\[6pt]
0, & \text{otherwise},
\end{cases}
\label{eq:validity_indicator}
\end{equation}
where $A_{\min}=1{,}000$ pixels after preprocessing. This fixed threshold excludes very small peripheral annotations that may provide unstable ROI supervision.

Let $\mathcal{I}=\{0,\ldots,D-1\}$ denote all slice indices, 
$\mathcal{V}=\{i\in\mathcal{I}:v_i=1\}$ the valid-slice set, and $
\mathcal{N}
=
\left\{
j\in\mathcal{I}\setminus\mathcal{V}:
|j-i|=1
\text{ for some } i\in\mathcal{V}
\right\}$ the set of immediate neighboring slices. During training, the center slice is sampled as
\begin{equation}
i\sim
\begin{cases}
\operatorname{Uniform}(\mathcal{V}),
& \text{with probability }0.5,\\
\operatorname{Uniform}(\mathcal{N}),
& \text{with probability }0.3,\\
\operatorname{Uniform}(\mathcal{I}),
& \text{with probability }0.2.
\end{cases}
\label{eq:slice_sampling}
\end{equation}

This strategy emphasizes reliable ROI-supervised slices, includes adjacent anatomical transitions, and preserves coverage of the full slice distribution. The sampling proportions were fixed across all experiments. During inference, all slices are processed sequentially and stacked to reconstruct the target volume.








\subsubsection{Validity-Gated ROI Loss (Training Process)} Under partial labeling, the absence of an ROI annotation does not necessarily indicate the absence of pathology. Directly treating every unlabeled region as background can therefore introduce incorrect spatial supervision.

Because diffusion operates in a lower-resolution latent space, the binary mask is first resized to the spatial resolution of the latent representation, denoted as $\widetilde{\mathbf{B}}_i$. As shown in Fig. \ref{fig2} (B), we define the spatial weight map as
\begin{equation}
\mathbf{W}_i
=
\mathbf{1}
+
\lambda v_i \widetilde{\mathbf{B}}_i,
\label{eq:spatial_weight_map}
\end{equation}
where $\lambda$ controls the additional emphasis assigned to valid ROI pixels. When $v_i=0$, $\mathbf{W}_i=\mathbf{1}$, and training reduces to the standard diffusion objective without ROI-specific weighting.

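The validity-gated diffusion loss is
\begin{equation}
\mathcal{L}_{\mathrm{VGR}}
=
\mathbb{E}
\left[
\frac{1}{dhw}
\sum_{k=1}^{d}
\sum_{r=1}^{h}
\sum_{s=1}^{w}
\mathbf{W}_i(r,s)
\left(
\epsilon_{k,r,s}
-
\epsilon_{\theta,k,r,s}
\right)^2
\right],
\end{equation}
where $d$, $h$, and $w$ denote the number of channels, the height, and the width of the latent representation, and $\epsilon_{k,r,s}$ and $\epsilon_{\theta,k,r,s}$ are the entries of the true and predicted noise at channel $k$ and spatial location $(r,s)$.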

\subsubsection{ROI-Aware CFG (Sampling Process)} Recent work has identified spatial inconsistency in conventional CFG, where global guidance strength fails to adapt to spatially varying relevance. For example, Shen et al. \cite{shen2024rethinking} proposed S-CFG to address this issue in natural images. Inspired by this, we develop an ROI-aware CFG mechanism tailored for medical image generation under sparse annotations.

As shown in Fig.~\ref{fig2}(C), our approach integrates both spatially adaptive guidance and dual-condition disentanglement, allowing the model to (i) selectively enhance guidance within annotated ROIs while suppressing it in uncertain regions, and (ii) decouple global anatomical priors from local fine-grained constraints. This enables robust and controllable sampling under sparse annotations.

To be specific, we define a spatial guidance mask $\mathbf{M}_i
=
v_i\widetilde{\mathbf{B}}_i
\in
\{0,1\}^{h \times w}$. This mask modulates the ROI guidance strength spatially, enforcing stronger control inside annotated ROIs and weaker or no guidance elsewhere.

In addition to spatial modulation, we decouple the conditional inputs into two distinct signals: $\mathbf{c}_{\mathrm{src}}
=
\mathcal{C}_{\mathrm{src}}\!\left(\mathbf{Y}_i\right)$ are features extracted from the source image, providing global structural context; $\mathbf{c}_{\mathrm{roi}}
=
\mathcal{C}_{\mathrm{roi}}\!\left(\mathbf{Q}_i\right)$ are features extracted from the ROI mask, offering localized anatomical constraints. $\mathcal{C}_{\mathrm{src}}$ and $\mathcal{C}_{\mathrm{roi}}$ denote the source-condition and ROI-condition encoders, respectively. At each sampling step, we perform three noise predictions:
\begin{equation}
\boldsymbol{\epsilon}_{\mathrm{uncond}}=\boldsymbol{\epsilon}_{\theta}\left(\mathbf{z}_t,t\,\middle|\,\varnothing,\varnothing\right),\quad
\boldsymbol{\epsilon}_{\mathrm{src}}=\boldsymbol{\epsilon}_{\theta}\left(\mathbf{z}_t,t\,\middle|\,\mathbf{c}_{\mathrm{src}},\varnothing\right),\quad
\boldsymbol{\epsilon}_{\mathrm{full}}=\boldsymbol{\epsilon}_{\theta}\left(\mathbf{z}_t,t\,\middle|\,\mathbf{c}_{\mathrm{src}},\mathbf{c}_{\mathrm{roi}}\right).
\end{equation}
The final guided noise prediction is
\begin{equation}
\widehat{\boldsymbol{\epsilon}}_{\theta}
=
\boldsymbol{\epsilon}_{\mathrm{uncond}}
+
w_{\mathrm{src}}
\left(
\boldsymbol{\epsilon}_{\mathrm{src}}
-
\boldsymbol{\epsilon}_{\mathrm{uncond}}
\right)
+
w_{\mathrm{roi}}
\mathbf{M}_i
\odot
\left(
\boldsymbol{\epsilon}_{\mathrm{full}}
-
\boldsymbol{\epsilon}_{\mathrm{src}}
\right),
\end{equation}
where $w_{\mathrm{src}}$ controls global source-image guidance, $w_{\mathrm{roi}}$ controls additional ROI-specific guidance, and $\odot$ denotes element-wise multiplication. This differs from generic spatial CFG methods that spatially rescale a single conditional guidance residual. Thus, VGR-Diff enables independent and spatially selective control over the generative process, enforcing anatomical consistency in annotated regions while reducing artifacts from unreliable conditions.

\section{Experiments}
\subsection{Datasets and Implementation Details}
\subsubsection{Datasets} We evaluate our model on the public BraTS2023 brain MRI dataset (1,251 paired T2w/T1 scans) \cite{adewole2023brain} and a private pelvic MRI–CT dataset (220 paired scans) collected from Qilu Hospital. BraTS2023 provides full voxel-wise annotations for three tumor subregions (ET, ED, NCR), allowing controlled simulation of different annotation-retention ratios. For each subject, we randomly retained 75\% of annotated slices and hid the remaining masks without relabeling them as background. The same retained subsets were used for all compared methods. In contrast, the pelvic dataset contains naturally sparse annotations: two experienced radiologists annotated rectal tumors and the mesorectal fascia only on slices where the structures were clearly visible, resulting in 553 labeled slices out of 2,191 pelvic MRI slices. Thus, BraTS2023 provides controlled sparsity experiments, whereas the pelvic dataset represents a clinically motivated sparse-annotation pattern. Both datasets were split at the subject level into training, validation, and test sets with ratios of 80\%, 10\%, and 10\%, respectively. All slices from the same subject were assigned to the same partition. All images were resampled to 1~mm isotropic resolution and min–max normalized to $[0,1]$.
\subsubsection{Implementation Details and Evaluation Metrics} We implement all models in PyTorch 2.4.1 and MONAI \cite{cardoso2022monai}, and train them on an NVIDIA RTX 4090 GPU. We adopt 1,000 diffusion steps for training and a DDIM sampler with 200 denoising steps for inference, with $w_{\mathrm{src}}=2.0$ and $w_{\mathrm{roi}}=4.0$. The model was trained for 500 epochs with a batch size of 8, using the Adam optimizer with a learning rate of $10^{-4}$. Generation quality is evaluated with whole-image SSIM/PSNR for global similarity and rSSIM/rPSNR/rMAE computed within annotated regions for target-area structural and intensity accuracy \cite{wang2004image}. In the ablation study, $\Delta \text{MAE}$ measures through-plane continuity by computing the mean absolute error between adjacent-slice intensity changes in the predicted and ground-truth images. For the downstream brain tumor segmentation task, performance is evaluated using the Dice similarity coefficient (Dice) and the 95th percentile Hausdorff distance (HD95), averaged across the ET, ED, and NCR tumor subregions.

\subsection{Evaluations}
\subsubsection{Qualitative Results} We first qualitatively evaluate the effectiveness of VGR-Diff under sparse annotation settings by comparing it with several state-of-the-art methods, including 3D GAN-based and 3D diffusion-based methods (MaskGAN \cite{phan2023structure}, Slice-consistent BBDM \cite{choo2024slice}, CoLa-Diff \cite{jiang2023cola}), and representative conditioning mechanisms (conventional CFG \cite{ho2022classifier}, cross attention \cite{rombach2022high}, ControlNet \cite{zhang2023adding}) built upon a 2D vanilla conditional latent diffusion model. The first row of Fig. \ref{fig3} and Fig. \ref{fig4} shows the generated images, with yellow boxes highlighting clinically critical ROI regions, while the second row presents the corresponding ROI difference maps, where brighter colors indicate larger structural discrepancies.

From Fig. \ref{fig3}, T2w-MRI is used to generate T1-MRI with a 75\% annotation-retention ratio to simulate partial-label scenarios. MaskGAN preserves coarse appearance but fails to recover accurate ROI details, while Slice-consistent BBDM and CoLa-Diff still show localized ROI errors and blurred boundaries. Without ROI guidance, vanilla cLDM exhibits clear ROI deviations with scattered hotspots in the difference maps. Although cross attention and ControlNet improve ROI awareness, residual distortions remain. In contrast, VGR-Diff achieves better structural fidelity and improved boundary consistency, demonstrating robustness to partial-label bias.
\begin{figure}[t]
\includegraphics[width=\textwidth]{figure/fig3.png}
\caption{Qualitative results on BraTS2023 dataset with 75\% annotation-retention ratio.} \label{fig3}
\end{figure}

From Fig. \ref{fig4}, we further evaluate a more challenging CT-to-MRI task, where the larger modality gap significantly increases the complexity of ROI reconstruction. MaskGAN only preserves coarse anatomy, Slice-consistent BBDM introduces severe intensity and structural artifacts, and CoLa-Diff still shows visible ROI discrepancies. Vanilla cLDM presents evident ROI deviations, while conventional CFG performs poorly under sparse supervision due to over-constrained guidance. Cross attention and ControlNet improve anatomical alignment but leave localized inconsistencies. In contrast, VGR-Diff best preserves fine-grained ROI structures and boundaries, indicating strong cross-modality generalization and robustness to severe annotation sparsity.
\begin{figure}[t]
\includegraphics[width=\textwidth]{figure/fig4.png}
\caption{Qualitative results on private pelvic dataset.} \label{fig4}
\end{figure}
\subsubsection{Quantitative Results} Table~\ref{tab1} summarizes the quantitative results on both datasets. The 3D baselines, including MaskGAN and Slice-consistent BBDM, show limited ROI fidelity, while CoLa-Diff improves performance but still lags behind the stronger conditioning mechanisms (cross attention and ControlNet). Compared with vanilla cLDM, stronger conditioning generally improves ROI reconstruction; however, conventional CFG remains sensitive to sparse labels. VGR-Diff achieves the best ROI metrics on both BraTS2023 and the private pelvic dataset, with rMAE/rSSIM of 0.019/0.870 and 0.027/0.893, respectively. Compared with ControlNet, it further reduces rMAE by 9.5\% and 49.1\%, and improves rSSIM by 5.1\% and 19.9\% on the two datasets, demonstrating superior robustness to sparse annotations.
\begin{table}[t]
\caption{Quantitative results on two datasets. rMAE, rSSIM, rPSNR represent evaluation metrics on ROI. SSIM, PSNR represent evaluation metrics on whole image.}\label{tab1}
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccc| ccccc}
\hline
& \multicolumn{5}{c}{T2w$\xrightarrow[]{}$T1 (BraTS2023 dataset)} & \multicolumn{5}{c}{CT$\xrightarrow[]{}$MRI (private pelvic dataset)} \\
\cline{2-6}\cline{7-11}
Methods & rMAE & rSSIM & rPSNR & SSIM & PSNR & rMAE & rSSIM &  rPSNR & SSIM & PSNR \\
\hline
MaskGAN    & 0.081  &  0.748  & 23.972 &  0.876  & 26.697 & 0.107 & 0.538 & 18.033 & 0.594 & 18.909 \\
Slice-Consistent BBDM    & 0.123  &  0.634  & 24.010 &  0.872  & 25.983 & 0.256 & 0.502 & 16.987 & 0.519 & 17.448 \\
CoLa-Diff    &  0.039 &  0.791  & 27.313 &  0.922  & 28.549 & 0.087 & 0.603 & 22.154 & 0.689 & 22.777 \\
Vanilla cLDM    & 0.029  & 0.706   & 27.154 & 0.913   & 27.827 & 0.063 & 0.551 & 21.753 & 0.653 & 20.544 \\
+ CFG    & 0.052   & 0.655   & 24.103 & 0.886   & 26.776 & 0.121 & 0.498  & 17.077 & 0.526 & 17.235 \\
+ Cross Attention & 0.027  & 0.793   & 28.156 & 0.944 & 29.590 & 0.080 & 0.684   & 23.196 & 0.692 & 21.264 \\
+ ControlNet      & 0.021 & 0.828 & 30.107 & 0.967 & 33.342 & 0.053  & 0.745    & 24.966 & \textbf{0.841} & 23.984 \\
\textbf{Our model}       & \textbf{0.019}  & \textbf{0.870} & \textbf{31.791} & \textbf{0.976} & \textbf{35.562} & \textbf{0.027} & \textbf{0.893}   & \textbf{28.221}  & 0.837 & \textbf{25.466} \\
\hline
\end{tabular}%
}
\end{table}


\subsubsection{Ablation Study} Table \ref{tab2} shows the ablation results on the pelvic dataset. Compared with the vanilla cLDM baseline, 2.5D biased slice sampling improves inter-slice continuity, while validity gating further enhances ROI fidelity by mitigating partial-label bias. Adding ROI-aware CFG achieves the best ROI performance, with rSSIM/rPSNR increasing to 0.893/28.221, demonstrating the effectiveness of the full framework for robust ROI preservation under sparse annotations.

\subsubsection{Downstream Task} To assess clinical utility, we trained an nnU-Net \cite{isensee2021nnu} brain tumor segmentation model on real T1 images from BraTS2023 and froze it to evaluate synthetic inputs. Table~\ref{tab3} shows that VGR-Diff achieved the best Dice and HD95 among the synthetic inputs, approaching the real-T1 upper bound. This suggests that improved ROI fidelity can translate into better task-relevant structural preservation.

\begin{table}[t]
\centering
\scriptsize

\begin{minipage}[t]{0.52\textwidth}
\centering
\caption{Ablation study on the pelvic dataset. $\Delta$MAE evaluates inter-slice continuity.}
\label{tab2}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lccc}
\hline
Setting & rSSIM$\uparrow$ & rPSNR$\uparrow$ & $\Delta$MAE$\downarrow$ \\ \hline
Baseline cLDM     & 0.551 & 21.753 & 0.262 \\
+ 2.5D sampling   & 0.605 & 21.775 & 0.108 \\
+ Validity gating & 0.752 & 24.755 & \textbf{0.104} \\
+ ROI-aware CFG   & \textbf{0.893} & \textbf{28.221} & \textbf{0.104} \\ \hline
\end{tabular}
\end{minipage}
\hfill
\begin{minipage}[t]{0.43\textwidth}
\centering
\caption{Downstream segmentation utility. Dice and HD95 are averaged over ET, ED, NCR regions.}
\label{tab3}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{lcc}
\hline
Input & Dice$\uparrow$ & HD95$\downarrow$ \\ \hline
Source T2w       & 0.48 & 21.6 \\
Vanilla cLDM-sT1 & 0.59 & 16.8 \\
+ ControlNet-sT1   & 0.68 & 11.3 \\
VGR-Diff-sT1     & \textbf{0.73} & \textbf{8.9} \\
Real T1          & 0.78 & 7.1 \\ \hline
\end{tabular}
\end{minipage}

\end{table}

\section{Conclusion}
We presented VGR-Diff, a validity-gated ROI guidance latent diffusion framework for mitigating the partial-label problem in medical image generation. By integrating 2.5D biased slice sampling, validity-gated ROI supervision, and ROI-aware CFG, our method enables reliable ROI conditioning while avoiding erroneous supervision from missing annotations. Experiments on both public and private datasets show that VGR-Diff consistently outperforms existing conditioning mechanisms in ROI fidelity under sparse annotations while maintaining competitive global image quality, with consistent performance across two anatomical settings. By reducing the need for dense manual labeling while preserving anatomically reliable ROI structures, VGR-Diff provides a practical and clinically meaningful solution for real-world diffusion-based medical imaging workflows.

\begin{credits}
\subsubsection{\ackname} This work was funded by the Basic Research Program of Jiangsu (BK20241815) and the Xi’an Jiaotong-Liverpool University Research Development Fund (RDF-23-02-004).

\subsubsection{\discintname}
The authors have no competing interests to declare.
\end{credits}







\bibliographystyle{splncs04}
\bibliography{reference}










\end{document}
