\documentclass{midl} % Include author names
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{caption}
\usepackage{makecell} % Required for splitting headers
\jmlrvolume{-- 359}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[OrientDiff]{Orientation-Aware Diffusion Super-Resolution for 3T-Like Fetal MRI from Routine 1.5T Scans}

\midlauthor{
\Name{Xinliu Zhong\nametag{$^{1,2}$}} \Email{xinliu.zhong@emory.edu}\\
\Name{Ruiying Liu\nametag{$^{2}$}} \Email{rliu60@emory.edu}\\
\Name{Guohao Lin\nametag{$^{2}$}} \Email{linguohao111@gmail.com}\\
\Name{Chuan Huang\nametag{$^{3}$}} \Email{chuan.huang@emory.edu}\\
\Name{Adam Ezra Goldman-Yassen\nametag{$^{3,4}$}} \Email{adam.ezra.goldman-yassen@emory.edu}\\
\Name{Amy Robben Mehollin-Ray\nametag{$^{3,4}$}} \Email{amy.robben.mehollin-ray@emory.edu}\\
\Name{Yun Wang\nametag{$^{2}$}} \Email{yun.wang2@emory.edu}\\
% [0.8ex]
\addr $^{1}$ Department of Computer Science, Emory University, Atlanta, GA, USA\\
\addr $^{2}$ Department of Biomedical Informatics, Emory University, Atlanta, GA, USA\\
\addr $^{3}$ Department of Radiology and Imaging Sciences, Emory University, Atlanta, GA, USA\\
\addr $^{4}$ Children's Healthcare of Atlanta, Atlanta, GA, USA\\
% \addr $^{5}$ Individual Researcher
}

\begin{document}


\maketitle

\begin{abstract}
% 1.5T motion-friendly and low SNR -> remove motion, remove further more etc
Fetal MRI plays a central role in assessing early brain development. While 3T scanners offer higher SNR and improved cortical detail, their increased sensitivity to motion, susceptibility artifacts, and $B_1$ inhomogeneity limits wide adoption for routine fetal imaging. Consequently, most clinical examinations are performed at 1.5T, where greater motion tolerance comes at the cost of lower SNR, reduced gray--white matter contrast, and partial-volume blurring---factors that undermine downstream morphometric analysis. Bridging this quality gap without sacrificing the motion robustness of 1.5T would enable 3T-like morphometric reliability in routine clinical acquisitions.

We propose an orientation-aware diffusion super-resolution framework that synthesizes 3T-like fetal brain contrast from routine 1.5T scans. The model combines a Swin-UNet backbone with gated FiLM-based orientation embeddings and a residual error-shifting diffusion mechanism. Training leverages the FaBiAN phantom to generate controllable high-/low-resolution pairs with monotonic intensity remapping, geometric perturbations, and simulated signal voids, thereby ensuring generalization to clinical data. Our model produces markedly sharper gyri and mitigates partial-volume effects in both synthesized and clinical data. When evaluated using Fetal-SynthSeg following NeSVoR reconstruction, the framework consistently improves tissue segmentation accuracy over state-of-the-art restoration baselines, yielding more reliable morphometric estimates for fetal brain analysis.

% Code will be released upon publication. Furthermore, existing diffusion-based quality-transfer models are primarily tuned for adult anatomy and lack paired fetal supervision, causing them to fail in correcting orientation-specific artifacts in 1.5T data.


\end{abstract}

\begin{keywords}
MRI, Diffusion Models, Image Enhancement, Fetal Neuroimaging
% Deep Learning, 
\end{keywords}


\section{Introduction}
MRI is widely used in neuroimaging due to its non-invasive nature, excellent soft tissue contrast, and painless procedure. Fetal brain assessment still depends largely on 1.5T scanners even though 3T acquisitions provide sharper cortical detail, higher SNR, and better gray--white separation. Consequently, routine fetal scans often suffer from an ``effective resolution'' gap: although the nominal pixel size may be sufficient, the inherently lower SNR at 1.5T results in partial-volume blurring and noise that obscure fine anatomical details. This degradation complicates diagnostic tasks, such as detecting cortical dysplasia, and hinders longitudinal studies that attempt to harmonize data across varying field strengths \citep{jannat2025advancing,zimmermann2025augment}. Bridging this gap via software—translating 1.5T scans to 3T quality—is therefore critical for modernizing fetal neuroimaging without costly hardware replacements.

Existing post-acquisition enhancement strategies falter under fetal-specific constraints. Classic model-based Super-Resolution (SR) methods rely on handcrafted priors (e.g., total variation, sparsity), which must precisely match the physical degradation to avoid artifact amplification. Supervised deep learning methods demand perfectly aligned low-/high-field pairs that are practically unobtainable in fetal cohorts, while physics-informed reconstructions require raw k-space data rarely archived during routine HASTE exams. While Denoising Diffusion Probabilistic Models (DDPMs) offer a generative alternative, standard implementations are prone to structural hallucinations when trained on limited medical datasets \citep{khateri2025mri}. Even recent residual-shifting approaches—such as ResShift \citep{yue2024resshift} or Res-SRDiff \citep{safari2025ressrdiff}—fail to account for the complex acquisition geometry, treating every slice as an independent, isotropic image.

This geometric oversight is critical. Routine fetal exams consist of orthogonal stacks (axial, coronal, sagittal) with highly anisotropic resolutions and distinct, view-dependent artifact patterns. Orientation-agnostic networks inevitably average these incompatible priors, resulting in suboptimal smoothing. To address this without clinical ground truth, we leverage high-fidelity simulation. Simulators like FaBiAN \citep{fabian} provide the only viable source of registered supervision, allowing us to explicitly learn these orientation-dependent degradations where reacquiring paired clinical data is impossible.

% However, current diffusion architectures have not been adapted to exploit either the controllable degradations of such simulators or the orientation metadata embedded in fetal protocols.

We address these gaps with an orientation-aware diffusion framework tailored to fetal MRI enhancement. Our contributions are threefold: (1) We introduce a gated FiLM orientation encoder that conditions the network on slice geometry, allowing it to adaptively invert view-specific anisotropies at different feature depths. (2) We propose a residual-shift diffusion formulation that anchors the generative process to the input, refining high-frequency details while explicitly mitigating the risk of hallucination common in standard DDPMs. (3) We introduce a multi-level augmentation suite---including monotonic intensity remapping, geometric perturbations, and blackout-style motion corruption---to robustly generalize from FaBiAN synthetic supervision to clinical scans. Together, these components deliver 3T-like fidelity with sharper cortical detail and improved downstream segmentation utility.



\section{Related Work}



\subsection{Deep Learning-based Super-Resolution for Medical Imaging} 
Modern SR has evolved from residual CNNs to high-capacity architectures like Real-ESRGAN \citep{wang2018esrgan} and Vision Transformers like SwinIR \citep{liang2021swinir}, which utilize adversarial training or shifted-window attention to capture complex textures. Recently, state-space models such as GAMBAS \citep{baljer2025gambas} have introduced Mamba layers for volumetric context, while BME-X \citep{sun2025foundation} establishes a unified foundation model for multi-task restoration.
However, these approaches face distinct limitations in fetal imaging. Deterministic regressors (e.g., SwinIR) tend to suppress high-frequency details—the ``regression to the mean'' effect—particularly when pixel-aligned supervision is unavailable due to stochastic fetal motion. Furthermore, while GAMBAS assumes consistent volumetric inputs, it struggles with severe inter-slice motion of HASTE stacks. Similarly, BME-X's general-purpose design prioritizes global harmonization  but lacks explicit orientation conditioning, often leading to over-smoothed results that fail to resolve the view-dependent anisotropy inherent to single-shot acquisitions.


\subsection{Diffusion Models for Image Restoration}
DDPMs \citep{ho2020denoising, saharia2022image} have surpassed deterministic baselines by synthesizing the high-frequency textures essential for perceptual quality. This advantage has enabled successful applications in medical image reconstruction and harmonization \citep{chung2022score, peng2023ddmr, ozdenizci2023restoring}. However, standard DDPMs face critical hurdles in clinical deployment: generating anatomy from pure Gaussian noise is computationally intensive and prone to structural hallucinations, particularly given the domain shift between synthetic training data and real clinical scans. Furthermore, generic diffusion processes lack explicit priors to handle the structured, view-dependent artifacts of fast HASTE sequences. We address these limitations by adopting a residual-shift formulation; rather than synthesizing images from scratch, our model iteratively refines a high-frequency residual relative to the input. This significantly constrains the generative search space, ensuring anatomical fidelity while recovering fine details.

\subsection{Orientation as a Conditioning Signal}
In fetal MRI, the trade-off between acquisition speed and spatial resolution necessitates highly anisotropic voxel dimensions, resulting in through-plane resolution (typically 3--4\,mm) that is substantially coarser than the in-plane resolution ($\approx 1.0$\,mm). While Slice-to-Volume Reconstruction (SVR) \citep{kuklisova2012reconstruction} mitigates this by fusing orthogonal stacks, its success depends critically on the fidelity of input slices. Our work therefore targets the acquisition space \textit{prior} to SVR: by enhancing in-plane resolution of individual stacks, we aim to stabilize subsequent registration and fusion.

From a modeling perspective, slice orientation is often treated as an implicit nuisance factor or handled only through geometric constraints. However, for texture-sparse medical images, deep networks lack sufficient visual cues to reliably infer orientation-dependent degradation kernels purely from appearance \citep{huang2024learn}. To address this limitation, several explicit conditioning strategies have been explored. Conditional normalization mechanisms such as FiLM (Feature-wise Linear Modulation) \citep{perez2018film} enable a shared backbone to adapt its feature responses to view-specific statistics, while view-specific 2D processing strategies, such as those adopted in QuickNAT \citep{roy2018quicknat}, explicitly decouple axial, coronal, and sagittal feature distributions. In the broader diffusion literature, more heavyweight conditioning paradigms have also emerged, including parallel control-branch architectures such as ControlNet \citep{zhang2023adding} and expert-routing schemes based on Mixture-of-Experts (MoE) \citep{shazeer2017outrageously}. While effective for strong external control or large-capacity modeling, these approaches introduce substantial architectural overhead and are not optimized for the specific constraints of fetal MRI.

In fetal HASTE acquisitions, orientation-aware conditioning is paramount: axial stacks emphasize ventricles, coronal stacks compare hemispheres, and sagittal stacks delineate midline structures, each exhibiting distinct artifact patterns and partial-volume effects. A uniform, orientation-agnostic model inevitably averages these conflicting priors, leading to degraded anatomical fidelity. In contrast, our approach treats orientation as a dynamic and explicit conditioning signal, using gated modulation to selectively activate view-specific priors at only the network depths where they are anatomically relevant.


\begin{figure}[ht]
    \centering
    \includegraphics[width=\linewidth]{imgs/overall_c.png}
    \caption{Overview of our framework. The architecture employs a dual-stream strategy: the Conditioning Encoder extracts spatial guidance features from the fixed low-resolution reference $\mathbf{x}^{\mathrm{LR}}$ and validity mask $\mathbf{m}$. These features are concatenated with the noisy latent state $\mathbf{x}_t$ and passed to the Swin-UNet backbone, which predicts the clean high-resolution estimate $\hat{\mathbf{x}}_0$. Gated FiLM layers inject global orientation priors, dynamically adapting the feature hierarchy to view-specific acquisition characteristics.}

    \label{fig:architecture}
\end{figure}



\section{Methods}

Our proposed framework integrates an orientation-aware hybrid Swin-UNet backbone, $f_\theta$, with a residual-shift diffusion process (\figureref{fig:architecture}). The network processes inputs in two streams: a shallow conditioning encoder extracts spatial guidance features from the clean reference slice $\mathbf{x}^{\mathrm{LR}}$ and validity mask $\mathbf{m}$, which are concatenated with the noisy diffused input $\mathbf{x}_t$ and passed through a four-stage U-Net encoder--decoder. To balance local texture recovery with global coherence, we employ standard Residual Blocks at higher resolutions and Swin Transformer Blocks at bottleneck levels, linked via skip connections. A global orientation embedding modulates features at every stage via gated FiLM layers. By operating directly on the residual manifold, this architecture focuses generative capacity on restoring high-frequency details while preserving the low-frequency structure provided by the input.



\subsection{Residual-Shift Diffusion Process}


We adopt the Res-SRDiff formulation \citep{safari2025ressrdiff}, which extends ResShift \citep{yue2024resshift} to medical image restoration. We define our inputs as follows: $\mathbf{x}^{\mathrm{LR}}$ denotes the fixed low-resolution reference slice (the condition), $\mathbf{x}^{\mathrm{HR}}$ represents the high-resolution target, and $\mathbf{m}$ is the binary validity mask indicating non-background regions. The diffusion process operates on the residual manifold. The forward process, detailed in Algorithm~\ref{alg:training}, perturbs the clean residual $\mathbf{r}_0 = \mathbf{x}^{\mathrm{HR}} - \mathbf{x}^{\mathrm{LR}}$ into a noisy latent state $\mathbf{x}_t$ at timestep $t$. The noisy observation $\mathbf{x}_t$ is defined as:
\begin{equation}
    \mathbf{x}_t = \mathbf{x}^{\mathrm{LR}} + \sqrt{\bar{\alpha}_t}(\mathbf{x}^{\mathrm{HR}} - \mathbf{x}^{\mathrm{LR}}) + \sqrt{1-\bar{\alpha}_t}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I}).
\end{equation}
Here, $\bar{\alpha}_t$ follows a noise schedule. At each timestep $t$, the backbone $f_\theta$ predicts the clean high-resolution estimate $\hat{\mathbf{x}}_0$. Crucially, to guide this generation, the network receives a channel-wise concatenation of the noisy state $\mathbf{x}_t$ and the conditioning features encoded from the reference pair $(\mathbf{x}^{\mathrm{LR}}, \mathbf{m})$. We set the prediction target to the clean signal ($\texttt{predict\_type}=\texttt{xstart}$), effectively training the network to recover the residual $\hat{\mathbf{r}}_0 = \hat{\mathbf{x}}_0 - \mathbf{x}^{\mathrm{LR}}$, anchoring generation to the acquired anatomy. The reverse sampling process, presented in Algorithm \ref{alg:sampling}, iteratively refines a noisy initial state $\mathbf{x}_T$ back to the clean estimate $\mathbf{x}_0$.
 % This prevents the hallucination of structures unrelated to the input \citep{khateri2025mri}.
\noindent
\begin{minipage}[t]{0.48\textwidth}
\begin{algorithm2e}[H]
\caption{Training}\label{alg:training}
\KwIn{$(\mathbf{x}^{\mathrm{LR}},\mathbf{x}^{\mathrm{HR}},\mathbf{m})\sim\mathcal{D}$}
\Repeat{converged}{
  Draw $t\sim\mathcal{U}(1,T)$, $\boldsymbol{\epsilon}\sim\mathcal{N}(\mathbf{0},\mathbf{I})$\;
  $\mathbf{x}_t \leftarrow \mathbf{x}^{\mathrm{LR}}
    + \sqrt{\bar{\alpha}_t}(\mathbf{x}^{\mathrm{HR}}-\mathbf{x}^{\mathrm{LR}})
    + \sqrt{1-\bar{\alpha}_t}\boldsymbol{\epsilon}$\;
  \tcp{Concat noise \& condition}
  $\hat{\mathbf{x}}_0 \leftarrow f_\theta(\mathbf{x}_t,
      \mathrm{concat}(\mathbf{x}^{\mathrm{LR}},\mathbf{m}), t)$\;
  $\theta \leftarrow \theta - \eta\nabla_\theta 
      \|\hat{\mathbf{x}}_0 - \mathbf{x}^{\mathrm{HR}}\|^2$\;
}
\end{algorithm2e}
\end{minipage}
\hfill
\begin{minipage}[t]{0.48\textwidth}
\begin{algorithm2e}[H]
\caption{Sampling}\label{alg:sampling}
\KwIn{$\mathbf{x}^{\mathrm{LR}},\mathbf{m}$}
$\mathbf{x}_T \leftarrow \mathcal{N}(\mathbf{x}^{\mathrm{LR}},\gamma^2\beta_T \mathbf{I})$\;
\For{$t \leftarrow T$; $t \ge 1$; $t \leftarrow t-1$}{
$\boldsymbol{\epsilon} \leftarrow 
    \begin{cases}
    \mathcal{N}(\mathbf{0},\mathbf{I}), & t>1\\
    \mathbf{0}, & t=1
    \end{cases}$\;
  $\hat{\mathbf{x}}_0 \leftarrow 
    f_\theta(\mathbf{x}_t,\mathrm{concat}(\mathbf{x}^{\mathrm{LR}},\mathbf{m}),t)$\;
  $\mathbf{r}_{t-1} \leftarrow 
    \tilde{\mu}(\mathbf{x}_t-\mathbf{x}^{\mathrm{LR}},
               \hat{\mathbf{x}}_0-\mathbf{x}^{\mathrm{LR}}, t)
    + \sqrt{\tilde{\beta}_t}\boldsymbol{\epsilon}$\;
  $\mathbf{x}_{t-1} \leftarrow 
    \mathbf{x}^{\mathrm{LR}} + \mathbf{r}_{t-1}$\;
}
\Return{$\mathbf{x}_0$}\;
\end{algorithm2e}
\end{minipage}



\subsection{Orientation-Conditioned FiLM Modulation}\label{sec:film}

While deep networks may implicitly infer slice orientation from anatomical semantics, we explicitly condition the backbone on the viewing plane $y \in \{\text{axial, coronal, sagittal}\}$ as a direct inductive bias for domain adaptation. Slices acquired from different planes follow distinct 2D appearance distributions due to anisotropic sampling, orientation-dependent blur, and partial-volume effects. By injecting $y$ as a conditioning token, the shared 2D backbone dynamically adapts its feature distributions in a view-specific manner, without introducing heavy computational overhead or parameter growth associated with parallel expert-style architectures such as MoE or ControlNet.


The categorical orientation label is embedded and mapped to affine modulation parameters $\boldsymbol{\gamma}, \boldsymbol{\beta} \in \mathbb{R}^D$ through a lightweight two-layer MLP, where $D$ denotes the dimensionality of the diffusion timestep embedding. These parameters are applied to the timestep embedding $\mathbf{e}_t$ via FiLM, yielding a view-conditioned embedding that is injected into each residual block. To control the strength of orientation conditioning across network depth, we introduce \emph{depth-adaptive gating}, where a learnable scalar $g_i \in (0,1)$ modulates the injection at the $i$-th residual block:
\begin{equation}
\hat{\mathbf{e}}_{t,i} 
= \mathbf{e}_t \odot \bigl(1 + \alpha \cdot g_i \cdot \boldsymbol{\gamma}\bigr)
+ \alpha \cdot g_i \cdot \boldsymbol{\beta},
\label{eq:gated-film}
\end{equation}
where $\alpha$ is a global annealing factor that is linearly increased during the early training phase. The modulated embedding $\hat{\mathbf{e}}_{t,i}$ is then passed to the $i$-th residual block following the standard diffusion U-Net formulation. This gating mechanism allows the network to autonomously optimize conditioning strength, selectively activating view-specific corrections at beneficial depths while suppressing modulation where invariant representations are preferred.
% As a result, the model selectively adapts to texture patterns and degradation characteristics intrinsic to each acquisition view.


\subsection{Multi-level Data Augmentations}\label{sec:dataaug}
Given the scarcity of paired fetal MRI data, we employ a comprehensive augmentation pipeline designed to enforce invariance to pose and robustness to acquisition artifacts. Following \citet{zimmermann2025augment}, we model the degradation function $\mathcal{D}$ as a composition of geometric, intensity, and measurement perturbations applied on-the-fly to $(\mathbf{x}^{\mathrm{LR}}, \mathbf{x}^{\mathrm{HR}})$ pairs. Specific augmentation parameters are detailed in Appendix~\ref{ap:implementation}.

\paragraph{Geometric Invariance.}
To simulate diverse fetal poses and preserve alignment, synchronized spatial transformations $\mathcal{T}_{geom}$ are applied to the input triplet $(\mathbf{x}^{\mathrm{LR}}, \mathbf{x}^{\mathrm{HR}}, \mathbf{m})$. These include random rotations, horizontal/vertical flips, and affine perturbations (shear and translation). Additionally, low-frequency B-spline deformations are applied to model non-rigid maternal/fetal motion.


\paragraph{Intensity and Artifact Robustness.}
We introduce realistic signal degradations to minimize the domain shift between synthetic training data and clinical inputs. These include: \textit{Contrast Shifts} applying a non-linear monotonic intensity mapping via piecewise-linear interpolation through random control points to simulate scanner-specific contrast variations; \textit{Signal Dropout} to emulate motion-induced signal voids common in single-shot HASTE, using ``blackout'' corruptions as a strong regularization surrogate: $\mathbf{x}_{\text{corrupt}} = \mathbf{x} \odot (1 - \mathbf{M}_{\text{b}}) + v_{\text{min}} \cdot  \mathbf{M}_{\text{b}}$, where $\mathbf{M}_{\text{b}}$ is uniformly sampled from four masking topologies (hemispheric, vertical band, oblique stripe, and multi-patch dropout); and \textit{Measurement Noise} where variable Gaussian noise and anisotropic blurring are injected to approximate coil-dependent Rician noise and slice-thickness--induced point-spread effects.

\subsection{Training Objective}
Since our network $f_\theta$ directly predicts the clean estimate $\hat{\mathbf{x}}_0$, we use a combined reconstruction and perceptual loss. The primary objective is an $\ell_2$ loss on the prediction: 
\begin{equation}
    \label{eq:diff_loss}
    \mathcal{L}_{\text{diff}} 
    = \mathbb{E}_{\mathbf{x}^{\mathrm{HR}}, \mathbf{x}^{\mathrm{LR}}, \mathbf{m}, \boldsymbol{\epsilon}, t, y} 
    \left[ \big\| f_\theta(\mathbf{x}_t, \mathrm{concat}(\mathbf{x}^{\mathrm{LR}}, \mathbf{m}), t, y) 
    - \mathbf{x}^{\mathrm{HR}} \big\|_2^2 \right],
\end{equation}
where the expectation is taken over the data distribution, diffusion noise $\boldsymbol{\epsilon}$, timesteps $t$, and orientation labels $y$. To ensure perceptual fidelity and textural sharpness, we add an LPIPS term, yielding the final objective, with $\lambda_{\text{mse}}$ and $\lambda_{\text{lpips}}$ balancing the two terms:
\begin{equation}
    \label{eq:total_loss}
    \mathcal{L}_{\text{total}} 
    = \lambda_{\text{mse}}\, \mathcal{L}_{\text{diff}} 
    + \lambda_{\text{lpips}}\, \text{LPIPS}(\hat{\mathbf{x}}_0, \mathbf{x}^{\mathrm{HR}}).
\end{equation}




\section{Experiments}
\subsection{Datasets and Implementation}
We utilized both synthetic and clinical $\text{T}_\text{2}$ HASTE datasets for model development and evaluation. Full training hyperparameters and details are provided in Appendix \ref{ap:implementation}.
\paragraph{Synthetic Data.}
Our primary training dataset comprises paired synthetic HR and LR fetal $\text{T}_\text{2}$ HASTE MRI volumes. These, alongside their corresponding tissue segmentation maps, were all generated using the spatiotemporal IMAGINE atlas (21--38 weeks gestational age (GA)) \citep{IMAGINE_2023} and the FaBiAN numerical phantom \citep{fabian}. FaBiAN employs a fast spin-echo (FSE) model to simulate extended phase-graph physics, stimulated echoes, and bias fields, accurately mimicking clinical HASTE acquisitions. For each GA and orientation, we synthesized clean 3T reference stacks ($B_0=3$\,T, $TE_{\text{eff}}=90$\,ms, 0.3\,mm gap) and matched degraded 1.5T counterparts ($B_0=1.5$\,T, $TE_{\text{eff}}=133$\,ms, 0\,mm gap), both with 3\,mm slice thickness. The 1.5T inputs were further augmented with variable motion, TE perturbations, and noise to reflect clinical heterogeneity while preserving the underlying atlas anatomy. These simulated acquisition parameters align with our clinical fetal HASTE protocol, ensuring the 3T and 1.5T images replicate characteristic contrast and resolution. The dataset was split 80/20 for training and testing, with training slices undergoing the additional augmentation pipeline described in Section~\ref{sec:dataaug}. We empirically validate the realism of these simulations by comparing their intensity distributions against the clinical target domain in Appendix~\ref{app:sim_validation}.

\paragraph{Clinical Data.}
For clinical validation, we utilized an IRB-approved clinical 3T T$_2$-weighted HASTE dataset from Children's Healthcare of Atlanta (CHOA), acquired with a voxel size of $0.98\times0.98\times3.0\,\text{mm}^3$. Following manual QC (206/244 subjects retained) and brain extraction \citep{ranzini2021monaifbs}, we designated 103 subjects with fully intact stacks as a held-out test set (664 stacks) and the remaining 103 subjects as a validation pool (464 stacks) to monitor training. To simulate low-field physics, we applied spectral k-space truncation \citep{chen2018brain}—zero-filling high frequencies in the Fourier domain—which approximates physical point-spread blurring more faithfully than image-domain downsampling.






\subsection{Competing Methods}
We benchmark against four state-of-the-art MRI super-resolution methods: \textbf{SRCNN} \citep{dong2015image}, \textbf{Real-ESRGAN} \citep{wang2018esrgan}, \textbf{SwinIR} \citep{liang2021swinir}, and the 3D state-space model \textbf{GAMBAS} \citep{baljer2025gambas}. All baselines were retrained on our FaBiAN dataset using identical splits and optimized to convergence; detailed specifications are provided in Appendix~\ref{ap:baselines}.


\subsection{Evaluation Metrics}
\label{sec:evaluation_metrics}

To evaluate the volumetric consistency of the 2D super-resolved stacks, output slices from all methods are first reconstructed into isotropic volumes using NeSVoR \citep{xu2023nesvor} with fixed hyperparameters. We then assess both reconstruction fidelity and downstream anatomical utility under synthetic and clinical settings.

On the FaBiAN test set, we utilize the paired ground truth to compute standard restoration metrics: PSNR, NRMSE, and MAE measure intensity accuracy, while SSIM and LPIPS quantify structural and perceptual realism. To evaluate downstream utility, reconstructed volumes are segmented using Fetal-SynthSeg \citep{zalevskyi2024improving}. We report region-based (Dice, Volume Bias) and boundary-based (ASSD, HD95) metrics against the simulation's ground truth tissue maps to assess anatomical integrity.

For the CHOA-3T dataset, where pixel-aligned references are unavailable, we employ the Tissue Contrast T-score (TCT) \citep{sun2025foundation} to quantify the separability of White Matter (WM) and Gray Matter (GM) distributions:
\begin{equation}
\mathrm{TCT} = \frac{|\mu_{\mathrm{wm}} - \mu_{\mathrm{gm}}|}{\sqrt{\sigma^2_{\mathrm{wm}} + \sigma^2_{\mathrm{gm}}}},
\end{equation}
where $\mu$ and $\sigma^2$ denote the mean and variance of tissue intensities derived from Fetal-SynthSeg masks. 1.5T acquisitions typically exhibit lower SNR, leading to increased intra-tissue variance ($\sigma^2$) in the denominator. A higher TCT therefore indicates successful 3T-like super-resolution, driven by widened contrast separation ($|\mu_{\mathrm{wm}} - \mu_{\mathrm{gm}}|$) and suppressed noise. To eliminate confounders arising from biological maturation (e.g., myelination), we strictly limit TCT comparisons to within the same GA.
% A higher TCT indicates sharper gray-white matter differentiation characteristic of high-field imaging.
\section{Results}

Ablation studies validating individual components are detailed in Appendix \ref{ap:ablation}. An evaluation of inference efficiency and memory usage is provided in Appendix \ref{app:efficiency}.

\subsection{Quantitative Validation on Synthetic Data}

We first evaluate reconstruction fidelity and downstream utility on the FaBiAN test set.
\begin{table}[ht]
    \centering
    \caption{Volumetric reconstruction metrics on FaBiAN synthetic test set. }
    % Note the perception-distortion trade-off: while \textbf{SwinIR} achieves the highest PSNR due to its restoration-based objective (leading to smoother outputs), \textbf{Ours} achieves the best LPIPS score, indicating superior perceptual quality and texture recovery closer to the ground truth.
    \label{tab:volumetric}
    % \setlength{\tabcolsep}{3pt} 
    \resizebox{1.0\linewidth}{!}{
    \begin{tabular}{lccccc}
        \toprule
        \textbf{Method} & \textbf{PSNR (dB) $\uparrow$} & \textbf{NRMSE $\downarrow$} & \textbf{MAE $\downarrow$} & \textbf{SSIM $\uparrow$} & \textbf{LPIPS $\downarrow$} \\
        \midrule
        Input (Sim. 1.5T) & 24.90 $\pm$ 2.10 & 0.36 $\pm$ 0.02 & 29.50 $\pm$ 5.90 & 0.79 $\pm$ 0.06 & 0.055 $\pm$ 0.038 \\
        \midrule
        SRCNN & 25.15 $\pm$ 1.95 & 0.35 $\pm$ 0.02 & 28.80 $\pm$ 5.40 & 0.80 $\pm$ 0.05 & 0.052 $\pm$ 0.035 \\
        Real-ESRGAN & 23.40 $\pm$ 2.45 & 0.42 $\pm$ 0.04 & 34.10 $\pm$ 7.20 & 0.72 $\pm$ 0.08 & 0.048 $\pm$ 0.041 \\
        SwinIR & \textbf{26.12 $\pm$ 1.85} & \textbf{0.31 $\pm$ 0.02} & \textbf{26.50 $\pm$ 5.10} & 0.73 $\pm$ 0.05 & 0.047 $\pm$ 0.032 \\
        GAMBAS & 14.80 $\pm$ 3.50 & 0.93 $\pm$ 0.03 & 176.20 $\pm$ 6.40 & 0.53 $\pm$ 0.07 & 0.158 $\pm$ 0.060 \\
        \midrule
        \textbf{Ours} & 25.66 $\pm$ 2.06  & 0.34 $\pm$ 0.02 & 27.31 $\pm$ 5.70 & \textbf{0.81 $\pm$ 0.06} & \textbf{0.042 $\pm$ 0.037} \\
        \bottomrule
    \end{tabular}
    }
\end{table}
\begin{figure}[ht]
    \centering
    \includegraphics[width=\textwidth]{imgs/fab_ga_2.png}
    \caption{Qualitative evaluation on the held-out synthetic FaBiAN dataset across three representative gestational ages (23, 29, and 36 weeks). }
    \label{fig:qualitative_fabian}
\end{figure}
\noindent\textbf{Reconstruction Fidelity.} As shown in \tableref{tab:volumetric}, our method achieves the best perceptual metrics (\textbf{SSIM 0.81, LPIPS 0.042}). While SwinIR yields slightly higher PSNR due to its regression-based objective, it suffers from characteristic over-smoothing. In contrast, our model balances fidelity and realism, avoiding the artifacts seen in GAN baselines (Real-ESRGAN) and the degradation of GAMBAS under domain shifts. Appendix \ref{app:aug_fairness} confirms these architectural gains persist even when baselines are retrained with our full augmentation suite.
% GAN-based Real-ESRGAN suffers from unstable distortions, and GAMBAS degrades substantially under severe fetal-specific artifacts, confirming the advantage of our orientation-aware residual diffusion framework under realistic degradations.


\begin{figure}[htbp]
    \centering
    % \vspace{0.5em}
    \includegraphics[width=0.4\linewidth]{imgs/vis2-1.png}
    \caption{Comparative results on a clinical CHOA test subject. }
    % From left to right: Simulated Low-Field Input, High-Field Reference, SRCNN, RealESR-GAN, SwinIR, GAMBAS, and Ours. Our method (far right) produces the sharpest definition of cortical gyri and deep gray matter structures, avoiding the blurring artifacts seen in deterministic baselines.
    \label{fig:vis_results}
\end{figure}


% \subsection{Downstream Segmentation on Synthetic Data}
\begin{table}[htbp]
    \centering
    % \vspace{-1.5em}
    % Helper macro: #1 = mean, #2 = std. Stacks std below mean.
    \newcommand{\val}[2]{\shortstack{#1 \\ \tiny $\pm$#2}}
    % \newcommand{\val}[2]{\shortstack{#1 \\ $\pm$#2}}

    
    \caption{Region-wise segmentation metrics on FaBiAN synthetic test set.}
    \label{tab:dice}
    \setlength{\tabcolsep}{2pt} 
    \resizebox{1.0\linewidth}{!}{
    \begin{tabular}{llcccccc} 
        \toprule
        \textbf{Region} & \textbf{Metric} 
        & \makecell{\textbf{Input} \\ \textbf{(Sim. 1.5T)}} 
        & \textbf{SRCNN}
        & \makecell{\textbf{Real-} \\ \textbf{ESRGAN}} 
        & \textbf{SwinIR} 
        & \textbf{GAMBAS} 
        & \textbf{Ours} \\
        \midrule

        % ---------------- GMC (Cortex)  ----------------
        \multirow{4}{*}{GM}
        & Dice $\uparrow$        & \val{0.551}{0.030} & \val{0.586}{0.028} & \val{0.495}{0.045} & \val{0.658}{0.032} & \val{0.352}{0.051} & \textbf{\val{0.704}{0.024}} \\
        & ASSD $\downarrow$      & \val{0.521}{0.034} & \val{0.486}{0.065} & \val{0.853}{0.123} & \val{0.396}{0.029} & \val{1.452}{0.180} & \textbf{\val{0.334}{0.021}} \\
        & HD95 $\downarrow$      & \val{1.414}{0.069} & \val{1.351}{0.157} & \val{2.124}{0.210} & \val{1.057}{0.085} & \val{3.214}{0.420} & \textbf{\val{0.853}{0.075}} \\
        & Vol. Bias              & \val{-7.20}{0.24}  & \val{-5.61}{0.31}  & \val{+3.42}{0.55}  & \val{-2.10}{0.18}  & \val{-15.33}{1.20}& \textbf{\val{-0.55}{0.14}} \\
        \midrule

        % ---------------- WM (White Matter) ----------------
        \multirow{4}{*}{WM} 
        & Dice $\uparrow$        & \val{0.674}{0.040} & \val{0.705}{0.035} & \val{0.621}{0.048} & \val{0.795}{0.038} & \val{0.512}{0.065} & \textbf{\val{0.837}{0.022}} \\
        & ASSD $\downarrow$      & \val{0.580}{0.038} & \val{0.544}{0.063} & \val{0.753}{0.099} & \val{0.440}{0.032} & \val{1.198}{0.154} & \textbf{\val{0.382}{0.025}} \\
        & HD95 $\downarrow$      & \val{1.412}{0.089} & \val{1.382}{0.114} & \val{1.954}{0.181} & \val{1.157}{0.095} & \val{2.946}{0.310} & \textbf{\val{0.984}{0.082}} \\
        & Vol. Bias              & \val{-3.70}{0.12}  & \val{-3.13}{0.18}  & \val{+4.53}{0.35}  & \textbf{\val{-1.07}{0.15}} & \val{-9.45}{0.85} & \val{-1.42}{0.11} \\
        \midrule

        % ---------------- Deep GM (GMS) - ----------------
        \multirow{4}{*}{Deep GM} 
        & Dice $\uparrow$        & \val{0.819}{0.040} & \val{0.838}{0.038} & \val{0.762}{0.057} & \val{0.885}{0.035} & \val{0.643}{0.072} & \textbf{\val{0.895}{0.019}} \\
        & ASSD $\downarrow$      & \val{0.705}{0.033} & \val{0.686}{0.055} & \val{0.924}{0.085} & \textbf{\val{0.374}{0.030}} & \val{1.547}{0.208} & \val{0.395}{0.028} \\
        & HD95 $\downarrow$      & \val{1.502}{0.099} & \val{1.458}{0.124} & \val{1.856}{0.160} & \val{1.134}{0.090} & \val{2.805}{0.354} & \textbf{\val{0.952}{0.085}} \\
        & Vol. Bias              & \val{-3.75}{0.15}  & \val{-2.59}{0.21}  & \val{+6.85}{0.40}  & \textbf{\val{+0.85}{0.12}} & \val{+10.20}{0.95}& \val{+1.02}{0.09} \\
        \midrule

        % ---------------- Ventricles (CSF)  ----------------
        \multirow{4}{*}{Ventricles} 
        & Dice $\uparrow$        & \val{0.844}{0.039} & \val{0.860}{0.036} & \val{0.793}{0.055} & \val{0.895}{0.032} & \val{0.694}{0.068} & \textbf{\val{0.905}{0.021}} \\
        & ASSD $\downarrow$      & \val{0.651}{0.044} & \val{0.616}{0.060} & \val{0.953}{0.114} & \textbf{\val{0.364}{0.041}} & \val{1.612}{0.220} & \val{0.392}{0.037} \\
        & HD95 $\downarrow$      & \val{1.490}{0.089} & \val{1.481}{0.143} & \val{2.106}{0.252} & \val{1.148}{0.080} & \val{3.789}{0.455} & \textbf{\val{0.981}{0.075}} \\
        & Vol. Bias              & \val{-7.11}{0.33}  & \val{-5.24}{0.28}  & \val{-8.53}{0.45}  & \val{-2.09}{0.22}  & \val{-14.39}{1.10}& \textbf{\val{-1.75}{0.16}} \\
        \bottomrule
    \end{tabular}
    }
\end{table}

\begin{table}[htbp]
    \caption{TCT scores on the CHOA-3T clinical dataset.}
    \label{tab:tct_scores}
    \centering
    
    % Reduce the column spacing to save width
    \setlength{\tabcolsep}{2pt}
    % Use resizebox to scale the table automatically to the page width
    \resizebox{\linewidth}{!}{
        \begin{tabular}{lcccccc}
            \toprule
            % Row 1: method names
            \textbf{Method} & \textbf{Input (Sim. 1.5T)} & \textbf{SRCNN} & \textbf{RealESR-GAN} & \textbf{SwinIR} & \textbf{GAMBAS} & \textbf{Ours} \\
            \midrule
            % Row 2: values
            \textbf{TCT $\uparrow$} & 0.78 $\pm$ 0.05 & 0.82 $\pm$ 0.04 & 0.90 $\pm$ 0.07 & 0.88 $\pm$ 0.03 & 0.65 $\pm$ 0.08 & \textbf{0.94 $\pm$ 0.02} \\
            \bottomrule
        \end{tabular}
    }
\end{table}
\begin{figure}[htbp]
    \centering
    % \vspace{-1.5em}
    \includegraphics[width=0.4\linewidth]{imgs/severe.png}
    \caption{Restoration of a severely degraded case exhibiting motion and signal dropout. }
    % The proposed method successfully recovers coherent anatomical structures and clear boundaries, enabling viable downstream segmentation where the input is heavily corrupted.
    \label{fig:severe}
\end{figure}

% As shown in \tableref{tab:dice}, our super-resolved volumes consistently improve segmentation accuracy over baselines. We observe the largest gains in intricate structures such as \emph{GM} (Dice +28\% vs input) and \emph{Ventricles}, corroborating the visual trends in Figure~\ref{fig:qualitative_fabian}.
\noindent\textbf{Downstream Segmentation.} Improved image quality translates directly to segmentation accuracy (\tableref{tab:dice}). Our method achieves the highest Dice and the lowest HD95 in all four regions, with the largest gains observed in intricate structures such as \emph{GM} (\textbf{Dice +28\% relative to input}) and \emph{Ventricles}. This corroborates the visual recovery of fine gyral patterns shown in \figureref{fig:qualitative_fabian}.

\subsection{Clinical Generalization on CHOA Dataset}
We validate clinical performance by comparing reconstruction quality (\figureref{fig:vis_results}) and quantitative tissue contrast (\tableref{tab:tct_scores}). Qualitatively, our orientation-aware diffusion model preserves the contrast gradients essential for diagnosis, avoiding the smoothing artifacts of regression-based baselines. This is confirmed by the TCT metric, where our method achieves the highest score (\textbf{0.94 $\pm$ 0.02}), indicating superior gray-white matter separability. Finally, robustness to extreme corruption is demonstrated in \figureref{fig:severe}, where our method restores anatomical coherence from a heavily degraded input.

% \figureref{fig:vis_results} contrasts outputs on a representative CHOA subject: Real-ESRGAN and SwinIR oversmooth cortical detail, whereas our orientation-aware diffusion model preserves the contrast gradients radiologists rely on. \figureref{fig:severe} illustrates that while the LR input leads to severe segmentation fragmentation and geometric distortion in the reconstructed volume, our method successfully restores anatomical coherence, yielding tissue maps and 3D topology that closely approximate the high-resolution ground truth.


% On CHOA-3T dataset, gray--white matter separability is quantified using TCT metric, as reported in \tableref{tab:tct_scores}. Our method achieves the highest TCT ($0.94 \pm 0.02$), indicating enhanced tissue contrast and reduced intra-tissue noise relative to all competing approaches. This demonstrates improved clinical contrast recovery and narrows the gap between routine 1.5T acquisitions and high-field 3T imaging.

\section{Discussion} Our work effectively bridges the domain gap between routine 1.5T acquisitions and diagnostic-quality 3T imaging. A critical innovation is the explicit modeling of acquisition anisotropy: unlike standard models that treat axial, sagittal, and coronal stacks identically---often resulting in isotropic blurring---our framework conditions the diffusion process on slice orientation to invert view-specific degradations. This restoration improves downstream utility, yielding robust 3D reconstructions and precise tissue segmentation. Quantitatively, our method achieves a TCT score of 0.94 on the clinical CHOA dataset (vs.\ 0.78 for input), indicating a significant recovery of gray-white matter contrast comparable to high-field imaging.

However, we acknowledge certain limitations. Potential failure modes may arise if clinical inputs contain artifacts strictly outside our simulated training distribution. While our current validation on the CHOA dataset demonstrates strong generalization, future work will aim to curate a multi-institutional dataset spanning diverse hardware vendors (e.g., Siemens, GE, Philips) to further stress-test these boundaries and ensure reliability across different scanner manufacturers.




\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by NIH grants R00HD103912 and R01MH133313 (Y.W.).}

% \bibliographystyle{plainnat}
\bibliography{midl26_359}

\clearpage
\appendix


\section{Implementation of Our Method}
\label{ap:implementation}
Our framework is implemented in PyTorch 2.5.1 and trained on three NVIDIA H100 GPUs. We utilize the FaBiAN-derived synthetic pairs with the augmentation suite detailed in Section~\ref{sec:dataaug}. During training, we draw random $128\times128$ patches, while inference uses $256\times256$ center crops; all inputs are symmetrically padded and normalized to $[-1,1]$. To simulate signal dropout, we apply blackout corruption with a probability of 0.5 (fill value $-1.0$) and enable monotonic intensity remapping with a probability of 0.2. Optimization is performed using RAdam with a batch size of 64 for 182k iterations. The learning rate follows a cosine-annealed schedule (base $2{\times}10^{-5}$ to $5{\times}10^{-5}$) after a 5k-step warm-up. We set $\lambda_{\text{mse}}=4.0$ and $\lambda_{\text{lpips}}=1.0$, and anneal the FiLM modulation strength over the first 50k iterations. At inference, we use the residual-shift sampler with a spatial scale factor $s_f=1.0$, an exponential noise schedule (power 0.3), $\eta_{\text{end}}=0.99$, 4 diffusion steps, minimum noise level 0.2, and $\kappa=2.0$.
\subsection{Data Augmentation Hyperparameters}
We apply synchronized geometric transformations with the following parameters: random rotations sampled from $k \cdot 90^\circ$ and $\mathcal{U}(-10^\circ, 10^\circ)$; shearing from $\mathcal{U}(-0.05, 0.05)$; and translations within $\pm8$ pixels. Intensity remapping is applied with probability $p=0.2$ using piecewise-linear functions through random control points. Signal dropout (blackout) is applied with probability $p=0.5$ using a fill value of $-1.0$, masking regions with hemispheric, vertical band, oblique stripe, or multi-patch topologies.


\section{Implementation of competing methods}
\label{ap:baselines}
\noindent\textbf{SRCNN.}
As a lightweight convolutional baseline we use a modified SRCNN-style architecture consisting of an initial $3\times3$ convolution followed by two DenseBlocks, each containing four dense layers with growth rate~32 and a transition layer reducing the feature dimensionality back to~64 channels. Two residual blocks refine the features before reconstruction. Upsampling is performed using bilinear interpolation with scale factor~4, followed by a final $3\times3$ convolution to produce the single-channel output. The network is trained with the pixel-wise MSE loss and optimized using Adam with learning rate $\mathrm{lr}=1\times10^{-4}$.

\noindent\textbf{Real-ESRGAN.}
We use a standard ESRGAN generator configured for scale~$1$ quality enhancement. The network contains 64 base feature channels and 23 residual-in-residual dense blocks (RRDBs), each employing residual dense connections with a scaling factor of~0.2. No discriminator or adversarial objective is used; instead, the model is trained purely as a feed-forward regressor with the pixel loss in \equationref{eq:total_loss}. Optimization uses Adam with an initial learning rate of $2\times10^{-4}$ and $\beta=(0.9,0.99)$, together with a two-stage decay at 50k and 100k iterations. Training is run for 200k iterations with random $256\times256$ paired fetal MRI patches and standard flip/rotation augmentations.


\noindent\textbf{SwinIR.}
We adopt the SwinIR ``restoration'' configuration (upscale~$=1$) for medical image quality enhancement. The model takes single-channel grayscale inputs and uses $256\times256$ patches with a window size of $8$. The backbone follows the standard SwinIR-M design with six residual Swin Transformer blocks (RSTBs), each using six Swin layers with shifted windows. We set the embedding dimension to $180$, use six attention heads per stage, an MLP expansion ratio of $2$, and employ ``1conv'' residual connections without any upsampling modules. Training uses Adam with a learning rate of $2\times10^{-4}$, batch size~32, and 500k iterations with a~$\times0.5$ decay every~150k iterations.


\noindent\textbf{GAMBAS.} The Generalised-Hilbert Mamba SR model operates on short 3D volumes, so we input the stacks for each orientation and map them through eight Hilbert-ordered state-space layers with 96 hidden channels. Training uses Ranger (RAdam + Lookahead) with an initial learning rate $5\times10^{-4}$, cosine restarts every 50k steps, stochastic depth 0.1, and the same $\ell_1$+LPIPS objective as the planar baselines. 

\section{Ablation Studies}
\label{ap:ablation}
We ablate the four main design choices of our framework on the synthetic FaBiAN test set and the CHOA clinical cohort.



\subsection{Residual vs.\ Image-space Diffusion}
\begin{table}[t]
\centering
\caption{Ablation of residual-shift diffusion versus image-space diffusion on the FaBiAN test set.}
\label{tab:ablation_residual}

% Unified stacked value format
\newcommand{\val}[2]{\makecell{#1 \\   $\pm$#2}}

\begin{tabular}{lccccc}
\toprule
\textbf{Variant} & \textbf{PSNR (dB)} $\uparrow$ & \textbf{NRMSE} $\downarrow$ & \textbf{MAE} $\downarrow$ & \textbf{SSIM} $\uparrow$ & \textbf{LPIPS} $\downarrow$ \\
\midrule
\makecell{Vanilla DDPM \\ (image-space)} 
& \val{20.02}{4.43}
& \val{0.87}{0.04}
& \val{55.60}{6.22}
& \val{0.70}{0.06}
& \val{0.082}{0.041} \\

Residual-shift 
& \textbf{\val{25.66}{2.06}}
& \textbf{\val{0.34}{0.02}}
& \textbf{\val{27.31}{5.70}}
& \textbf{\val{0.81}{0.06}}
& \textbf{\val{0.042}{0.037}} \\
\bottomrule
\end{tabular}
\end{table}
We compare our residual-shift formulation to a vanilla image-space DDPM that directly diffuses the HR target while conditioning on the LR input via channel concatenation. Both variants share the same Swin-UNet backbone, noise schedule, and 4-step sampler. As shown in \tableref{tab:ablation_residual}, residual diffusion maintains strong reconstruction performance, whereas the image-space DDPM degrades substantially across all metrics under this extreme low-step regime. This demonstrates that residual anchoring significantly improves sampling efficiency, enabling stable and accurate reconstruction with very few diffusion steps.

 
\subsection{Orientation Conditioning}

\begin{table}[h]
\centering
\caption{Ablation of orientation conditioning on the FaBiAN test set. Mean denotes the average Dice across all tissue classes.}
\label{tab:ablation_orientation}
% \setlength{\tabcolsep}{5pt}
\newcommand{\val}[2]{\shortstack{#1 \\  $\pm$#2}}
\begin{tabular}{lccccc}
\toprule
\textbf{Variant} & \textbf{GM} & \textbf{WM} & \textbf{Deep GM} & \textbf{Ventricles} & \textbf{Mean} \\
\midrule
w/o orientation         
& \val{0.602}{0.031}
& \val{0.742}{0.036}
& \val{0.851}{0.028}
& \val{0.866}{0.030}
& 0.765 \\

Additive embedding  
& \val{0.645}{0.029}
& \val{0.781}{0.032}
& \val{0.872}{0.024}
& \val{0.885}{0.027}
& 0.796 \\

FiLM (w/o gating)   
& \val{0.681}{0.026}
& \val{0.812}{0.028}
& \val{0.887}{0.021}
& \val{0.898}{0.024}
& 0.820 \\

Gated FiLM    
& \textbf{\val{0.704}{0.024}}
& \textbf{\val{0.837}{0.022}}
& \textbf{\val{0.895}{0.019}}
& \textbf{\val{0.905}{0.021}}
& \textbf{0.835} \\

\bottomrule
\end{tabular}
\end{table}

As shown in \tableref{tab:ablation_orientation}, removing orientation information leads to a clear degradation in Dice across all tissue classes. Additive embeddings provide limited gains, while FiLM modulation substantially recovers segmentation accuracy. The proposed depth-adaptive gated FiLM achieves the best overall performance, demonstrating the importance of selectively activating orientation-specific priors.

\subsection{Fetal-specific Augmentations}
\label{app:aug_fairness}

\begin{table}[h]
    \centering
    \caption{Impact of Multi-level Augmentation on Baselines on the FaBiAN Test Set.}
    \label{tab:aug_fairness}
    
    % Define the command locally if not defined globally
    \newcommand{\val}[2]{\makecell{#1 \\ $\pm$#2}}
    
    % \scriptsize % Use scriptsize to fit the wide table
    % \setlength{\tabcolsep}{4pt} % Slightly adjust column spacing if needed
    
    \begin{tabular}{l c c c c c c}
        \toprule
        \textbf{Model} & \textbf{Augment.} & \textbf{PSNR}(dB) $\uparrow$ & \textbf{NRMSE} $\downarrow$ & \textbf{MAE} $\downarrow$ & \textbf{SSIM} $\uparrow$ & \textbf{LPIPS} $\downarrow$ \\
        \midrule
        SRCNN & Standard & \val{25.15}{1.95} & \val{0.35}{0.02} & \val{28.80}{5.40} & \val{0.80}{0.05} & \val{0.052}{0.035} \\
         & Multi-level & \val{25.61}{1.73} & \val{0.33}{0.03} & \val{28.80}{5.90} & \val{0.76}{0.06} & \val{0.049}{0.037} \\
        \midrule
        Real-ESRGAN & Standard & \val{23.40}{2.45} & \val{0.42}{0.04} & \val{34.10}{7.20} & \val{0.72}{0.08} & \val{0.048}{0.041} \\
         & Multi-level & \val{20.19}{3.66} & \val{0.43}{0.03} & \val{33.20}{7.00} & \val{0.77}{0.03} & \val{0.045}{0.042} \\
        \midrule
        SwinIR & Standard & \val{26.12}{1.85} & \textbf{\val{0.31}{0.02}} & \textbf{\val{26.50}{5.10}} & \val{0.73}{0.05} & \val{0.047}{0.032} \\
         & Multi-level & \textbf{\val{26.21}{3.04}} & \val{0.32}{0.03} & \val{26.60}{5.60} & \val{0.72}{0.06} & \val{0.050}{0.038} \\
        \midrule
        GAMBAS & Standard & \val{14.80}{3.50} & \val{0.93}{0.03} & \val{176.20}{6.40} & \val{0.53}{0.07} & \val{0.158}{0.060} \\
         & Multi-level & \val{14.44}{4.21} & \val{0.86}{0.07} & \val{125.00}{5.30} & \val{0.49}{0.08} & \val{0.099}{0.060} \\
        \midrule
        \textbf{Ours} & Multi-level & \val{25.66}{2.06} & \val{0.34}{0.02} & \val{27.31}{5.70} & \textbf{\val{0.81}{0.06}} & \textbf{\val{0.042}{0.037}} \\
        \bottomrule
    \end{tabular}
\end{table}
\begin{table}[h]
\centering
\caption{Impact of multi-level data augmentations on TCT on the CHOA clinical dataset.}
\label{tab:ablation_augment}
% \resizebox{\linewidth}{!}{
\begin{tabular}{lcc}
\toprule
\textbf{Variant} & \textbf{w/o Full Augment}  & \textbf{Full Augment}\\
\midrule
\textbf{ TCT $\uparrow$}&  0.81 $\pm$ 0.03 & \textbf{0.94 $\pm$ 0.02} \\
% Full Augment  & \\
\bottomrule
\end{tabular}
% }
\end{table}


We treat the proposed multi-level augmentation suite as a core contribution designed to bridge the synthetic-to-clinical domain gap. To verify that our performance gains stem from the synergy between our model architecture and this data strategy---rather than the augmentations alone---we retrained all baselines using the full multi-level augmentation pipeline. As shown in \tableref{tab:aug_fairness}, applying fetal-specific augmentations to baselines does not guarantee gains and can induce instability (e.g., Real-ESRGAN). While SwinIR maintains high pixel-wise accuracy, our method achieves superior structural preservation (highest SSIM) and perceptual realism (lowest LPIPS), confirming that our architecture is uniquely capable of leveraging these priors. Furthermore, \tableref{tab:ablation_augment} demonstrates that this strategy is indispensable for clinical deployment: disabling the augmentation suite significantly degrades the TCT score on the CHOA dataset ($0.94 \to 0.81$), validating its role in facilitating robust sim-to-real transfer.

\subsection{LPIPS Loss}

To evaluate the contribution of the perceptual loss, we trained a variant with $\lambda_{\text{lpips}}=0$. Results in \tableref{tab:lpips_ablation} confirm the perception-distortion trade-off~\citep{blau2018perception}: removing LPIPS marginally improves pixel-wise fidelity (PSNR) but significantly degrades structural integrity (SSIM $0.81 \to 0.77$) and perceptual realism (LPIPS $0.042 \to 0.063$). Given that learning-based metrics align better with expert radiological assessment~\citep{khateri2025mri}, we retain the perceptual loss to prioritize the structural fidelity and sharpness essential for 3T synthesis.
\begin{table}[h]
    \centering
    \caption{Impact of the LPIPS perceptual loss on the FaBiAN test set.}
    \label{tab:lpips_ablation}
    % \setlength{\tabcolsep}{4pt} % Adjust column spacing if necessary
    \newcommand{\val}[2]{\shortstack{#1 \\ $\pm$#2}}
    \begin{tabular}{lccccc}
        \toprule
        \textbf{Variant} & \textbf{PSNR (dB)} $\uparrow$ & \textbf{NRMSE} $\downarrow$ & \textbf{MAE} $\downarrow$ & \textbf{SSIM} $\uparrow$ & \textbf{LPIPS} $\downarrow$ \\
        \midrule
        w/o LPIPS & \textbf{\val{26.19}{1.92}} & \val{0.34}{0.05} & \textbf{\val{25.78}{4.65}} & \val{0.77}{0.06} & \val{0.063}{0.050} \\
        % \addlinespace % Optional: adds a little space between rows for better readability with stacked text
        \textbf{Ours } & \val{25.66}{2.06} & \textbf{\val{0.34}{0.02}} & \val{27.31}{5.70} & \textbf{\val{0.81}{0.06}} & \textbf{\val{0.042}{0.037}} \\
        \bottomrule
    \end{tabular}
\end{table}


\section{Computational Efficiency}
\label{app:efficiency}
\begin{table}[h]
    \centering
    \caption{Inference efficiency on a single NVIDIA A100 (batch size=1). Our method matches GAN-level latency while outperforming baselines in reconstruction quality.}
    \label{tab:efficiency}
    \setlength{\tabcolsep}{10pt}
    \begin{tabular}{l c c c}
        \toprule
        \textbf{Model} & \textbf{Params (M)} & \textbf{Time (s)} & \textbf{Memory (GB)} \\
        \midrule
        SRCNN & 0.43 & 0.001 & 0.18 \\
        Real-ESRGAN & 16.70 & 0.065 & 1.18 \\
        SwinIR & 11.50 & 0.027 & 0.31 \\
        GAMBAS & 53.45 & 0.046 & 3.31 \\
        Vanilla DDPM & 56.67 & 18.054 & 0.92 \\
        \midrule
        \textbf{Ours} & 56.67 & 0.079 & 0.92 \\
        \bottomrule
    \end{tabular}
\end{table}

To validate clinical feasibility, we benchmarked inference performance on a single NVIDIA A100 GPU (batch size=1). Despite being diffusion-based, our residual-shift formulation converges in just 4 steps, significantly reducing latency compared to standard DDPMs ($\sim$1,000 steps) or LDMs (50+ steps).

As shown in \tableref{tab:efficiency}, our method requires only 0.079 seconds per slice, achieving speeds comparable to single-pass GANs (e.g., Real-ESRGAN: 0.065\,s) and $\approx 230\times$ faster than vanilla DDPMs. Furthermore, our efficient Swin-UNet backbone maintains a modest memory footprint (0.92 GB)---lower than even the Real-ESRGAN baseline (1.18 GB)---due to window-based self-attention. This confirms that our approach is lightweight enough for real-time deployment on standard clinical workstations.


\section{Validation of Simulation Assumptions}
\label{app:sim_validation}

\begin{figure}[!htbp]
    \centering
    \subfigure[FaBiAN (Synthetic Training Data)]{\includegraphics[width=0.47\textwidth]{imgs/fabian_intensity_comparison.png}}
    \hfill
    \subfigure[CHOA (Clinical Test Data)]{\includegraphics[width=0.47\textwidth]{imgs/choa_intensity_comparison.png}}
    \caption{\textbf{Intensity Distribution Analysis.} Comparison of voxel intensity densities between synthetic and clinical domains.}
    \label{fig:intensity_distributions}
\end{figure}


To validate our simulation, we compared the intensity distributions of synthetic (FaBiAN) training data against clinical (CHOA) test data. As shown in \figureref{fig:intensity_distributions}, the synthetic profiles (a) closely align with the clinical data (b), confirming the representativeness of our training set. Furthermore, both domains consistently demonstrate the expected physical contrast gain: 3T targets (blue) exhibit significantly sharper, higher-density signal peaks compared to the broader distributions of 1.5T inputs (orange), validating the modeled relationship between field strengths.
\end{document}