\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{algorithm}
\usepackage{algcompatible}
\usepackage{amssymb}

\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 28}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[SRMRI]{ SRMRI: A Diffusion-Based Super-resolution Framework and Open Dataset for Blind MRI Super-Resolution}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Arpan Poudel}$^{1}$ \Email{arpanp@uark.edu}\\
  \Name{Mamata Shrestha}$^{1}$ \Email{mamatas@uark.edu}\\
  \Name{Nian Wang}$^{2}$ \Email{nian.wang@utsouthwestern.edu}\\
  \Name{Ukash Nakarmi}$^{1}$ \Email{unakarmi@uark.edu} \\
\addr $^{1}$  University of Arkansas, Fayetteville, Arkansas, USA \\
\addr $^{2}$ University of Texas Southwestern Medical Center, Dallas, Texas, USA \\
}

\begin{document}

\maketitle

\begin{abstract}
% Existing deep learning approaches for medical image super-resolution typically involve training a model to directly map low-resolution (LR) images (often simulated from high-resolution (HR) counterparts) to HR images using paired datasets. This reliance on simulated data introduces bias and often performs poorly in real-world scenarios.
Existing deep learning methods for medical image super-resolution (SR) often rely on paired datasets generated by simulating low-resolution (LR) images from corresponding high-resolution (HR) scans, which can introduce biases and degrade real-world performance. To overcome these limitations, we present an unsupervised approach based on a score-based diffusion model that does not require paired training data. We train a score-based diffusion model using denoising score matching on HR Magnetic Resonance Imaging (MRI) scans, then perform iterative refinement with a stochastic differential equation (SDE) solver while enforcing data consistency with the LR scans. Our method provides faster sampling compared to existing generative approaches and achieves competitive results on key metrics, though it does not surpass fully supervised baselines in PSNR and SSIM. Notably, while supervised models often report higher numerical metrics, we observe that they can produce suboptimal reconstructions due to their reliance on fixed upscaling kernels. Finally, we introduce the SRMRI dataset, containing LR and HR images acquired directly from the scanner for training and evaluating MR image super-resolution models. Code and dataset are available at:
\href{https://github.com/arpanpoudel/SRMRI}{https://github.com/arpanpoudel/SRMRI}.
\end{abstract}

\begin{keywords} unsupervised MRI super-resolution, MRI reconstruction, super-resolution dataset, score-based diffusion model
\end{keywords}

\section{Introduction}

MRI is a widely used technology in medical imaging that acquires data in k-space—the Fourier domain—by placing the subject in a magnetic field and generating corresponding signals. Images are then obtained by applying the inverse Fourier transform to the k-space data. Although HR images are desired for accurate clinical diagnosis, their acquisition is constrained by hardware limitations, patient movement, and extended scan times, leading to a reliance on LR images. Super-resolution techniques provide a reliable solution to overcome these challenges.

Supervised deep learning methods for medical image super-resolution \cite{DLSSS, Isaac_SR, Sano_SR} directly learn the mapping between LR and HR images by training on a large paired dataset. These paired datasets are prepared by applying specific simulated degradations, such as bicubic downsampling, Gaussian blurring, or median filtering \cite{bicubic}, to HR images to obtain LR images.
% Recent deep learning-based methods for medical image super-resolution \cite{Hayit_SR_MR, Isaac_SR, Sano_SR}  have been proposed, but most are supervised methods that require paired datasets. They learn to directly map LR images to HR images by training on a large paired dataset. These LR images are usually generated by simulating a specific degradation method such as Bicubic
% Downsample, Gaussian Blurring, or Median Filtering \cite{bicubic}.
In MRI super-resolution, acquiring matching LR and HR image pairs is often impossible due to practical challenges. These include subject motion between acquisitions and excessively long scanning times, making it difficult to obtain exact training pairs. Additionally, the degradation process in MRI is non-deterministic, making simulated data unsuitable for practical cases. In real-world scenarios, high-resolution images may degrade due to several factors, such as magnetic field inhomogeneity, improper acquisition parameters, patient motion during the scan, scanner electrical noise, or insufficient acquisition time, leading to low-resolution images that cannot be accurately mimicked by simulated LR images. Consequently, deep learning models trained on specific degradation methods are biased toward the training data pairs \cite{Mamata} and only learn to super-resolve the training data degradation. When the simulated degradation method or degradation factor changes, the paired dataset has to be regenerated and the model retrained.


In this work, we introduce the SRMRI dataset, a novel collection of LR and HR MRI images obtained directly from the scanner. Further, we propose an unsupervised framework for reconstructing HR MRI images that does not require a paired training dataset, thus avoiding the limitation of a fixed degradation process. Our method learns the prior distribution of HR images by training a score-based diffusion model \cite{Score-based} and provides a sampling algorithm that draws images from this distribution that are consistent with the low-resolution images.

Our work makes the following contributions:
\begin{itemize}
\item We introduce the SRMRI dataset, a collection of low-resolution and high-resolution images on the same subjects acquired from the scanner.
\item We train a score-based diffusion model on MRI images to generate HR MRI images as unconditional samples using a numerical solver. We propose a sampling algorithm to reconstruct HR images from the LR images by alternating between Diffusion Posterior Sampling (DPS) \cite{DPS} and image fusion strategy. Finally, we evaluate our model on scanner-obtained LR-HR pairs to provide a better representation of performance.

% \item To the best of our knowledge, we are the first method to evaluate model performance on LR-HR pairs directly obtained from the scanner.
\end{itemize}

An overview of our method is illustrated in \figureref{fig:Introduction}, and the details of our method are given in Section~\ref{method}.
\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth,height=0.3\textheight]{assets/Intro.png} 
    \caption{Overview of the proposed score-based diffusion method for reconstructing HR MRI images. Starting from pure noise $\boldsymbol{x_T}$, we obtain $\boldsymbol{x_0}$
  through alternating reverse SDE numerical solver and image fusion step.}
    \label{fig:Introduction}
\end{figure}
\section{Related Works}

\subsection{Score-based Diffusion Model}

We can construct a diffusion process $\{\boldsymbol{x}(t)\}_{t=0}^1$ over continuous time $t \in [0,1]$ with $\boldsymbol{x}(t) \in \mathbb{R}^n$, where $n$ denotes the dimension of the image. We sample $\boldsymbol{x}(0)$ from the unknown data distribution $p_0(\boldsymbol{x})$ and perturb the data points with a stochastic process over time $[0, 1]$ such that $\boldsymbol{x}(1) \sim p_1(\boldsymbol{x})$, where $p_1(\boldsymbol{x})$ is close to a predefined noise distribution. This process is governed by an Itô stochastic differential equation (SDE) \cite{Score-based} given by
\begin{equation}
    d\boldsymbol{x}_\textit{t} = \hat{f}(\textit{t})\boldsymbol{x}_\textit{t} d\textit{t} + \hat{g}(\textit{t}) d \textbf{w}_\textit{t}
    \label{eq1}
\end{equation}
where $\hat{f} \colon \mathbb{R} \to \mathbb{R}$ is the scalar drift coefficient, $\hat{g} \colon \mathbb{R} \to \mathbb{R}$ is the diffusion coefficient, and $\textbf{w}_t \in \mathbb{R}^n$ denotes a Wiener process. The perturbation SDE in \equationref{eq1} can be associated with the following reverse-time SDE given by Anderson's theorem \cite{ANDERSON1982313}
\begin{equation}
    d \mathbf{x}_t=\left[\hat{f}(t)\mathbf{x}_t-\hat{g}(t)^2 \nabla_{\mathbf{x}_t} \log p_t(\mathbf{x}_t)\right] dt+\hat{g}(t) d \overline{\mathbf{w}}_t
    \label{eq3}
\end{equation}
where $\overline{\mathbf{w}}_t$ is a Wiener process running backward in time from 1 to 0, and $dt$ is an infinitesimal negative timestep. To solve \equationref{eq3}, we require the score function of $p_t(\mathbf{x}_t)$, i.e., $\nabla_{\mathbf{x}_t} \log p_t(\mathbf{x}_t)$, which can be estimated by a time-conditioned neural network $\mathbf{s}_{\boldsymbol{\theta}}(\mathbf{x}_t, t)$.

To solve \equationref{eq3}, we use numerical solvers such as Euler-Maruyama discretization, and Predictor-Corrector (PC) solvers \cite{Score-based}. With a score-based diffusion model, we can generate unconditional samples from the prior distribution ${p}_0(\textbf{x})$ of HR images \textbf{x}. However, to obtain HR images from LR images, we need to sample from the posterior distribution ${p}_0(\textbf{x}|\textbf{y})$ where \textbf{y} denotes LR image.


\subsection{Diffusion Posterior Sampling (DPS)}
For image super-resolution, we aim to recover the unknown high-resolution image $\textbf{x} \in \mathbb{R}^n$ from a degraded measurement $\textbf{y} \in \mathbb{R}^m$ , which is modeled as:
\begin{equation}
    \boldsymbol{y} = \boldsymbol{H}\boldsymbol{x}_0
    \label{eq5}
\end{equation}
where $\boldsymbol{H} \in \mathbb{R}^{m \times n}$ is an unknown degradation operator. We can formulate the reverse SDE to sample from the posterior distribution by modifying the SDE in \equationref{eq3} as follows:

\begin{align}
    d \mathbf{x}_t ={}& \left[\hat{f}(t) \mathbf{x}_t - \hat{g}(t)^2 \left( \nabla_{\mathbf{x}_t} \log p_t(\mathbf{x}_t) + \nabla_{\mathbf{x}_t} \log p_t(\boldsymbol{y} | \boldsymbol{x}_t)\right)\right] dt 
    + \hat{g}(t) d\overline{\mathbf{w}}_t
    \label{eq7}
\end{align}


In \equationref{eq7}, we need to compute two terms: the score function $\nabla_{\mathbf{x}_t} \log p_t(\mathbf{x}_t)$ and the gradient of the log-likelihood $\nabla_{\mathbf{x}_t} \log p_t(\boldsymbol{y} | \boldsymbol{x}_t)$. We can compute the first term using the pre-trained score function $s_\theta$. The second term has no analytical form, but it can be approximated through Diffusion Posterior Sampling (DPS) \cite{DPS}.


The posterior mean of $p\left(\boldsymbol{x}_0 | \boldsymbol{x}_t\right)$ in the case of the VE-SDE \cite{Score-based} can be obtained through Tweedie's formula \cite{Tweedie_Efron, DPS} as
$
    \hat{\boldsymbol{x}}_0 \simeq \boldsymbol{x}_t+\sigma_t^2 \mathbf{s}_{\boldsymbol{\theta}}(\mathbf{x}_t, t)$, where $\sigma_t$ denotes the noise scale at time $t$.

With this posterior mean, we can approximate the gradient of the log-likelihood

\begin{equation}
    \nabla_{\boldsymbol{x}_t} \log p\left(\boldsymbol{y} | \boldsymbol{x}_t\right) \simeq \nabla_{\boldsymbol{x}_t} \log p\left(\boldsymbol{y}| \hat{\boldsymbol{x}}_0\right)
    \label{13}
\end{equation}

\section{Method}
\label{method}
\subsection{DPS for MRI super-resolution}
The forward model in \equationref{eq5} can be alternatively formulated as:
\begin{equation}
    \boldsymbol{y} \sim \mathcal{N}\left(\boldsymbol{y} | \boldsymbol{H} \boldsymbol{x}_0,  \boldsymbol{I}\right)
\end{equation}

where $\boldsymbol{H} \in \mathbb{R}^{m \times n}$ is an unknown downsampling block, and the measurement $\boldsymbol{y}$ is assumed to follow a Gaussian distribution with identity covariance. Then, the likelihood function takes the form 

\begin{equation}
    p\left(\boldsymbol{y} | \boldsymbol{x}_0\right)=\frac{1}{\sqrt{(2 \pi)^m }} \exp \left[-\frac{\left\|\boldsymbol{y}-\boldsymbol{H}\boldsymbol{x}_0\right\|_2^2}{2 }\right]
    \label{eq15}
\end{equation}

Differentiating \equationref{eq15} with respect to $\textbf{x}_\textit{t}$, using \equationref{13}, we get
\begin{equation}
    \nabla_{\boldsymbol{x}_t} \log p\left(\boldsymbol{y} | \boldsymbol{x}_t\right) \simeq- \nabla_{\boldsymbol{x}_t}\left\|\boldsymbol{y}-\boldsymbol{H}\left(\hat{\boldsymbol{x}}_0\left(\boldsymbol{x}_t\right)\right)\right\|_2^2
\end{equation}
where we write $\hat{\boldsymbol{x}}_0:= \hat{\boldsymbol{x}}_0\left(\boldsymbol{x}_t\right)$ to emphasize that $\hat{\boldsymbol{x}}_0$ is a function of $\boldsymbol{x}_t$. Consequently, calculating the gradient with respect to $\boldsymbol{x}_t$ requires backpropagation through the neural network. Finally, the posterior score $\nabla_{\boldsymbol{x}_t} \log p_t\left(\boldsymbol{x}_t | \boldsymbol{y}\right)$ is the sum of the gradient of the log-likelihood $\nabla_{\boldsymbol{x}_t} \log p\left(\boldsymbol{y} | \boldsymbol{x}_t\right)$ and the prior score, which we approximate with $\mathbf{s}_{\boldsymbol{\theta}}(\mathbf{x}_t, t)$:

\begin{equation}
\nabla_{\boldsymbol{x}_t} \log p_t\left(\boldsymbol{x}_t | \boldsymbol{y}\right) \simeq \boldsymbol{s}_{\theta}\left(\boldsymbol{x}_t, t\right)-\rho \nabla_{\boldsymbol{x}_t}\left\|\boldsymbol{y}-\boldsymbol{\hat{H}}\left(\hat{\boldsymbol{x}}_0\right)\right\|_2^2
\label{eq17}
\end{equation}
where $\rho$ is the step size and $\boldsymbol{\hat{H}}$ is a chosen downsampling block (e.g., bicubic) that is used in place of the unknown $\boldsymbol{H}$ to downsample $\hat{\boldsymbol{x}}_0$. Notably, the choice of downsampling block for data consistency does not affect the evaluation metrics, as demonstrated in Appendix~\ref{sec:appendix_downsampling}. This differs from supervised methods that are trained on a simulated dataset, where the models become biased toward the specific downsampling kernel used \cite{Mamata}. Using only DPS for data consistency requires numerous computationally expensive backpropagations, resulting in a prolonged image reconstruction time.

\subsection{Iterative Image Fusion during Sampling}
To overcome the high computational cost of backpropagation, we propose a hybrid sampling approach that alternates between backpropagation and a more efficient image fusion strategy. Specifically, at time $\textit{t}$, we employ DPS to estimate $\textbf{x}_{t-1}$ and $\hat{\textbf{x}}_0$. For the subsequent step, we utilize an image fusion technique that integrates ${\textbf{x}}_{t-1}$, with $\textbf{y}$. We decompose an image into multiple subbands at different scales, including low-low, low-high, high-low, and high-high bands. An effective integration strategy is to select wavelet coefficients from both the LR and HR images: the low-low band from the LR image and the other coefficients from the HR image. Then, the Inverse Discrete Wavelet Transform (IDWT) is applied to the combined coefficients to construct a fused image. This fusion approach effectively harmonizes the high-resolution features of ${\textbf{x}}_{t-1}$ with the measurement data $\textbf{y}$, yielding ${\textbf{x}}_{t-2}$ that ensures data consistency.

We employ a wavelet-based fusion technique at time $t-1$ to integrate the high-resolution features of ${\textbf{x}}_{t-1}$ with the upscaled low-resolution image \textbf{y}, which can be modeled as

\begin{equation}
\begin{aligned}
     & W\left({\textbf{x}}_{t-1}\right)=\left\{A_x, D_x\right\} ,W(\mathbf{y})=\left\{A_y, D_y\right\}
\end{aligned}
\end{equation}
where $W$ denotes the wavelet decomposition operation, $A$ represents the approximation (low-frequency) coefficients, and $D$ represents the detail (high-frequency) coefficients. 
The fusion of these components is defined as $A_{\text{fused}}=A_y, \quad D_{\text{fused}}=D_x$.

Finally, the fused image $\mathbf{x}_{t-2}$ is reconstructed from the fused wavelet coefficients using the inverse wavelet transform: $\mathbf{x}_{t-2}=W^{-1}\left(A_{\text{fused}}, D_{\text{fused}}\right)$.

This method offers a computationally efficient solution, as it bypasses the need for backpropagation. As outlined in \algorithmref{Algorithm3}, our method combines the predictor and corrector steps of the PC algorithm with \textit{k} skip steps for image fusion.
\begin{algorithm}
\caption{SRMRI Predictor-Corrector (PC) Sampling}
\begin{algorithmic}[1] % The number here allows for line numbering
\REQUIRE $s_\theta$, $N$, $M$,$\boldsymbol{y}$, $k$ $\rhd k: \text{skip steps} $
\State $\boldsymbol{x}_N \sim \mathcal{N}(\mathbf{0}, \sigma_T^2 \boldsymbol{I})$
\FOR{$i = N$ \textbf{to} $1$} 
    
        \State $\boldsymbol{x}_{i-1}^{\prime} \leftarrow \operatorname{Predictor}\left(\boldsymbol{x}_{i}, \sigma_i, \sigma_{i-1}\right)$
        \IF{$i \bmod k = 0$}
        \State $\hat{\boldsymbol{x}}_0 \leftarrow \boldsymbol{x}_i + \sigma_i^2 \mathbf{s}_{\boldsymbol{\theta}}(\mathbf{x}_i,\sigma_i )$
        
        \State $\boldsymbol{x}_{i-1} \leftarrow \boldsymbol{x}_{i-1}^{\prime} - \rho \nabla_{\boldsymbol{x}_i} \left\|\boldsymbol{y} - \hat{H}(\hat{\boldsymbol{x}}_0)\right\|_2^2$
        \ELSE
        
        \State $Ax_{i-1},Dx_{i-1} \leftarrow W\left({\boldsymbol{x}}_{i-1}^{\prime}\right)$; $Ay,Dy \leftarrow W\left(\boldsymbol{y}\right)$
        \State $\boldsymbol{x}_{i-1} \leftarrow W^{-1}\left(Ay, Dx_{i-1}\right)$
        
    \ENDIF
        \FOR{$j = 1$ \textbf{to} $M$}
            \State $ \boldsymbol{x}_{i-1} \leftarrow \operatorname{Corrector}\left(\boldsymbol{x}_{i-1}, \sigma_{i-1}\right)$
        \ENDFOR
    
\ENDFOR
\State \text {\textbf{return}} $\boldsymbol{ x}_0$
\end{algorithmic}
\label{Algorithm3}
\end{algorithm}
\section{Experiments}
We assess the effectiveness of our algorithm, outlined in \algorithmref{Algorithm3}, and compare it with other unsupervised and supervised learning-based baselines. Additional implementation details can be found in Appendix~\ref{implementation}.

\subsection{Dataset}
\label{Dataset}
%  For training, we acquired ten Alzheimer’s disease (AD) with 5xFAD background mice brains on a 30-cm bore 9.4 T magnet (Bruker-BioSpec 94/30, Billerica, MA). A 3D gradient echo (GRE) pulse sequence was performed at both 25 $\mu$m and 50 $\mu$m  isotropic resolution. For the 25 um resolution scans, the field of view (FOV) was set to 18.0 mm × 12.8 mm × 7.6 mm, matrix size of 720 × 512 × 304, flip angle of 45°, bandwidth (BW) of 125 kHz, and repetition time (TR) of 100 ms, serving as HR volumes. We acquired image slices along the coronal plane, resulting in HR images with dimensions of 720×512. We removed the first and last fifteen slices from each volume to exclude noise-only data and improve training. This results in approximately 3k slices of training data.


% For evaluation, we acquired both HR and LR samples from four subjects, with isotropic resolutions of 25 $\mu$m and 50 $\mu$m, respectively. To obtain the 50 $\mu$m scans, we used a field of view (FOV) of 18.0 mm × 12.8 mm × 7.6 mm, a matrix size of 360 × 256 × 152, a 45° flip angle, a bandwidth (BW) of 125 kHz, and a repetition time (TR) of 100 ms. The 25 $\mu$m scans had the spacing of (0.025, 0.025, 0.025) and the 50 $\mu$m scans had the spacing of (0.05, 0.05, 0.05). Because of hardware and physical constraints, acquiring perfectly matched pairs of HR and LR images is generally not feasible. Therefore, we formed test pairs by selecting a 50 $\mu$m image and identifying the closest 25 $\mu$m image from the same subject using a voting scheme based on three metrics: LPIPS \cite{LPIPS}, PCA \cite{PCA}, and SSIM \cite{SSIM}. The pair with the most votes was chosen. Further details about datasets are provided in \ref{dataset_detail}.


We acquired brain images from 10 Alzheimer’s disease (AD) mice with a 5xFAD background using a 9.4 T magnet with a 30-cm bore (Bruker-BioSpec 94/30, Billerica, MA).
A 3D gradient echo (GRE) pulse sequence was performed at both 25 $\mu$m and 50 $\mu$m isotropic resolution, with a field of view (FOV) of 18.0 mm × 12.8 mm × 7.6 mm, a flip angle of 45°, a bandwidth (BW) of 125 kHz, and a repetition time (TR) of 100 ms.
We used the 25 $\mu$m resolution volumes, measuring 720 × 512 × 304, as high-resolution (HR) and the 50 $\mu$m resolution volumes, measuring 360 × 256 × 152, as low-resolution (LR). For training, we acquired image slices along the coronal plane, resulting in HR images with dimensions of 720 × 512. We removed the first and last fifteen slices from each volume to exclude noise-only data and improve training. This results in approximately 3k slices of training data.
For evaluation, we used the HR and LR volumes from 4 subjects. The 25 $\mu$m scans had the voxel spacing of (0.025, 0.025, 0.025), while the 50 $\mu$m scans had a spacing of (0.05, 0.05, 0.05). It is important to note that our training and evaluation dataset consists of LR-HR pairs obtained directly from the scanner. Due to hardware and physical constraints, acquiring perfectly matched pairs of HR and LR images is generally not feasible. Therefore, we formed test pairs by selecting a 50 $\mu$m image and identifying the closest 25 $\mu$m image from the same subject using a voting scheme based on three metrics: LPIPS \cite{LPIPS}, PCA
\cite{PCA}, and SSIM \cite{SSIM}. The pair with the most votes
was chosen. Further details about datasets are provided in \ref{dataset_detail}.

\subsection{Comparison study}
\subsubsection{Unsupervised methods}
% For unsupervised techniques, we compare our approach with several existing methods that perform super-resolution on LR images obtained from scanners. Specifically, we compare with four strong baselines: DPS \cite{DPS},  manifold constrained gradients (MCG) \cite{chung2022improving}, KernelGAN \cite{kernelgan} + ZSSR \cite{ZSRGAN} and Score-SDE \cite{Score-based}. DPS, MCG, and Score-SDE are diffusion-based models that can be used for solving inverse problems. We used   PSNR, SSIM, LPIPS, and inference time to compare the methods on image pairs. 

We compared our approach with four unsupervised super-resolution baselines: DPS \cite{DPS}, manifold constrained gradients (MCG) \cite{chung2022improving}, KernelGAN \cite{kernelgan} + ZSSR \cite{ZSRGAN}, and Score-SDE \cite{Score-based}. DPS, MCG, and Score-SDE are diffusion-based models that can be used for solving inverse problems. Comparisons were made using PSNR, SSIM, LPIPS, and inference time. One should note that standard metrics such as PSNR and SSIM may not fully capture perceptual quality in our dataset, given that LR and HR images do not align on a strict pixel-by-pixel basis. Although we report these metrics to provide a baseline comparison, their values should be interpreted with caution. In addition, we report a more perceptually oriented metric (LPIPS) to better reflect reconstruction quality.


\begin{table}[htp] % Use table* for a table that spans both columns
\centering
\caption{Quantitative evaluation (PSNR, SSIM, LPIPS) of MRI super-resolution (Unsupervised) on scanner images. \textbf{Bold}: Best, \underline{under}: second best. k : skip step.}
\small % Reduce the font size
\setlength{\tabcolsep}{12pt} % Reduce space between columns
\begin{tabular}{@{} l c c c c c c @{}} 
\toprule
Method & PSNR $\uparrow$  & SSIM $\uparrow$ & LPIPS $\downarrow$  & time (s) $\downarrow$  \\
\midrule
Score-SDE \cite{Score-based} & 23.58 & 0.49 & 0.17 & \underline{740}\\
DPS\cite{DPS} & 24.23 & 0.58 & \underline{0.11} & 1194 \\
MCG \cite{chung2022improving} & 24.10 & 0.53 & 0.12 & 1198 \\
KernelGAN \cite{kernelgan} \\+ ZSSR \cite{ZSRGAN} & 18.44 & 0.36 & 0.43 & \textbf{175} \\
\midrule
\textbf{SRMRI (Ours)(k=2)} & \textbf{24.75} & \textbf{0.61}  & \textbf{0.1}  & 873 \\
\textbf{SRMRI (Ours)(k=3)} & 24.54 & \underline{0.60} & \underline{0.11} & 790 \\
\bottomrule
\end{tabular}
\label{Table1}
\end{table}
Table \ref{Table1} shows the results for the unsupervised methods, where our method, SRMRI, achieves the best PSNR, SSIM, and LPIPS. Additionally, it is worth noting that our method offers faster inference than DPS and MCG, the diffusion-based posterior sampling approaches, while achieving higher reconstruction quality. In the case of KernelGAN + ZSSR, KernelGAN estimates the unknown downsampling kernel from the input LR images, while ZSSR uses this learned kernel to perform zero-shot super-resolution; this combination yields the fastest inference but lower overall performance.
\subsubsection{Supervised methods}
We also evaluated our method against the supervised techniques listed in Table \ref{Table2}, using the training dataset described in Section \ref{Dataset}. 

\begin{table}[htp] % Use table* for a table that spans both columns
\centering
\caption{Quantitative evaluation (PSNR, SSIM, LPIPS) of MRI super-resolution with scanner images (Supervised).}
\small % Reduce the font size
\setlength{\tabcolsep}{20pt} % Reduce space between columns
\begin{tabular}{@{} l c c c c c @{}} 
\toprule
Method & PSNR $\uparrow$  & SSIM $\uparrow$ & LPIPS $\downarrow$  \\
\midrule
SRCNN \cite{SRCNN} & 22.49 & \underline{0.64}  & 0.55 \\
%ESPCN \cite{DBLP:journals/corr/ShiCHTABRW16} & 26.01 & 0.63  & 0.43 \\
DDBPN \cite{DBLP:journals/corr/abs-1803-02735} & 25.16 & \underline{0.64}  & 0.38 \\
CARN \cite{DBLP:journals/corr/abs-1803-08664} & \textbf{26.31} & \textbf{0.65}  & 0.35 \\
Swinir \cite{liang2021swinirimagerestorationusing} & 25.88 & 0.62  & 0.36 \\
RCAN \cite{DBLP:journals/corr/abs-1807-02758} & 25.18 & \textbf{0.65}  & 0.31 \\
ESRGAN \cite{DBLP:journals/corr/abs-1809-00219} & \underline{26.11} & \textbf{0.65} & 0.34 \\
\midrule
\textbf{SRMRI (Ours) (k=2)} & 24.75 & 0.61  & \textbf{0.1} \\
\textbf{SRMRI (Ours) (k=3)} & 24.54 & 0.60 & \underline{0.11}  \\
\bottomrule
\end{tabular}
\label{Table2}
\end{table}
Table \ref{Table2} compares our method with supervised methods. Unlike unsupervised approaches, supervised methods require training pairs, which are often simulated. For these results, the supervised models were trained using LR-HR pairs from the scanner. Although most supervised methods achieve higher PSNR and SSIM on our dataset, they exhibit more artifacts and blurring in the reconstructed images (see Figure~\ref{fig:example}). We attribute this to their reliance on learning LR-HR correspondence during training to minimize specific loss functions (e.g., pixel-based loss, adversarial loss). However, in our case, the degradation process is non-deterministic, and there is no exact pixel-to-pixel correspondence between the LR and HR pairs. As a result, these approaches produce lower perceptual quality despite better performance on standard numerical metrics. Further, supervised methods have to be retrained if the downsampling factor changes, whereas our method can be applied to any downsampling factor. Figure~\ref{fig:example} shows an example of super-resolution results (factor $\times 2$) from different supervised and unsupervised methods. More examples can be found in Appendix~\ref{appendix-additional}. Finally, in Appendix~\ref{sec:appendix_k_sensitivity}, we present a parameter-sensitivity study of the skip steps ($k$) used in our fusion strategy.

% In addition to the unsupervised approach, we evaluate our method against existing supervised techniques. We include SRCNN \cite{SRCNN}, Real-ESRGAN \cite{realESRGAN} as supervised learning baselines. The training dataset was created following the approach described in \ref{Dataset}. 
\begin{figure}[t]
    \centering
    \includegraphics[width=1\linewidth]{assets/Figures_main.png}  
    \caption{Examples of super-resolution (factor x2) results with scanner LR. You may zoom in to view more details. Blue: Unsupervised, Red: Supervised.  }  
    \label{fig:example}
\end{figure}

\section{Discussion and Conclusion}
In this paper, we introduce a blind MRI super-resolution method that alternates between image fusion and DPS during the diffusion-based sampling process. This approach reduces the computational load by replacing a portion of the computationally intensive DPS steps with a low-complexity image fusion technique. Our experimental results show that our method outperforms existing unsupervised approaches while offering a speed advantage. In comparison to supervised methods, which achieve higher metric values but introduce more artifacts and blurring, our approach produces better-quality images without the need for LR-HR training pairs, making it more practical for real-world scenarios. Furthermore, our method can effectively recover images affected by complex degradations that may occur in real-world scenarios, even when the degradation is unknown or differs from the training data. Although we down-sample intermediate images to maintain data consistency during sampling using the chosen degradation process, our method can be extended to construct multiple diffusion processes to learn priors for each component. This allows for posterior sampling even when the degradation operator is unknown \cite{blindDPS}. Additionally, we introduce a dataset comprising both LR and HR MRI scans obtained directly from the scanner. To establish correspondence between the LR and HR pairs, we propose a voting scheme based on three similarity measures: LPIPS, PCA, and SSIM. This dataset can reduce dependence on simulated degradations for supervised super-resolution training. To the best of our knowledge, we are the first to evaluate model performance on such LR-HR pairs directly obtained from the scanner, providing a more accurate representation of performance in real-world scenarios.


 

\bibliography{midl25_28}

\appendix
\section{Experimental Details}

\subsection{Implementation Details}
\label{implementation}
\subsubsection{Training of the score function}
\label{Train-score}
We used the implementation of the time-dependent score function model \textit{ncsnpp}\footnote{\url{https://github.com/yang-song/score_sde_pytorch}} \cite{Score-based} as a score model. The model architecture consists of a time-conditioned U-Net, and the sub-block within U-Net is adopted from residual blocks of BigGAN \cite{biggan}. The network is conditioned on time ($t$) by incorporating Fourier features. These time-related features are combined with the original input features before being processed by the encoder.

The model was trained using a batch size of 4 and the Adam optimizer with standard hyperparameters ($\beta_1$= 0.9 and $\beta_2$= 0.999). To stabilize training, a linear learning rate warm-up was employed for the first 5000 steps, reaching a final learning rate of $2 \times 10^{-4}$. Gradient clipping was applied to prevent exploding gradients, and exponential moving averages were calculated for the model parameters. All experiments were conducted using PyTorch. The model was trained on the full training dataset for 1000 epochs, utilizing five RTX 3090 GPUs. This training process takes approximately five days of wall clock time. 

\subsubsection{Sampling}
We modify the Predictor-Corrector (PC) sampler, as described in \cite{Score-based}, due to its superior performance in solving VE-SDE. The PC sampler consists of two components: the predictor, which is a numerical solver for the reverse-time SDE, and the corrector, where we use Langevin dynamics for the Markov chain Monte Carlo (MCMC) method. For
the PC sampler, we used 2000 noise scales and 1 step of Langevin dynamics per noise scale. All the sampling steps outlined were executed on a single RTX 3090 GPU. In our experiments, we used a two-level DWT decomposition. Empirically, we found that bior4.4 performs best as the mother wavelet.
\subsubsection{Code Availability}
We will release the code and dataset used in our experiments upon publication to facilitate
reproducibility.

\subsection{Dataset details}
\label{dataset_detail}

To prepare the dataset for training and evaluation of the supervised method, we used HR and LR image slices from two subjects, with isotropic resolutions of 25 $\mu$m and 50 $\mu$m, respectively. Although the HR and LR image volumes were acquired sequentially from the same subjects, no direct correspondence exists between the LR and HR slices, limiting their use as training pairs for supervised learning. Therefore, we employed three methods to pair the LR and HR images, forming a voting scheme where the image pair with the highest number of votes was selected. In cases where no consensus was reached during voting, the images from each method were visually inspected, and the closest pair was chosen. The following describes the implementation of those three methods. 
\subsubsection*{PCA based method}
We implemented a Principal Component Analysis (PCA) based method to match LR images with their HR counterparts. Given the difference in resolution between the LR images ($360 \times 256$) and the HR images ($720 \times 512$), we first downsampled the HR images, reducing them to the same size as the LR images using cubic interpolation. Following this, PCA was employed to reduce the dimensionality of both the downsampled HR images and the original LR images. For each HR image, the Euclidean distance between its PCA-transformed representation and that of the LR image was computed. The HR image with the smallest distance was identified as the closest match to the LR image.
\subsubsection*{LPIPS based method}
In addition to the PCA-based approach, we implemented a method using Learned Perceptual Image Patch Similarity (LPIPS) to match LR images with their corresponding HR counterparts. LPIPS is a deep learning-based metric that evaluates perceptual similarity between images by comparing feature maps extracted from a pre-trained convolutional neural network (CNN). The LR and HR images were first resized to have identical dimensions. The perceptual distance between the LR and HR images was then calculated in the feature space, with lower LPIPS scores indicating higher perceptual similarity. The HR image with the lowest LPIPS distance was selected as the best match for the LR image. 

\subsubsection*{SSIM based method}
For our third method, we utilized the SSIM to assess the structural similarity between the LR and HR images. As with the other methods, the HR images were resized to match the dimensions of the LR images. The SSIM value was then calculated for each HR-LR image pair, with higher SSIM values indicating greater structural similarity. The HR image with the highest SSIM score was considered the closest match to the LR image.

\section{Impact of Downsampling Methods}
\label{sec:appendix_downsampling}

\noindent
In this section, we investigate the effect of using different downsampling kernels in our data consistency block with $k = 2$. Specifically, we tested the following methods: bicubic, bilinear, Lanczos2~\cite{LanczosFilteringinOneandTwoDimensions}, Lanczos3~\cite{LanczosFilteringinOneandTwoDimensions}, and box. Table~\ref{tab:downsampling_comparison} summarizes the results in terms of PSNR, SSIM, and LPIPS. We observe that the choice of downsampling has a negligible impact on these evaluation metrics. Hence, any of these kernels can be employed without adversely affecting the final performance.




\begin{table}[htp] % Use table* for a table that spans both columns
\centering
\caption{Comparison of different downsampling methods.}
\label{tab:downsampling_comparison} % must follow \caption so that \ref yields the table number
\small % Reduce the font size
\setlength{\tabcolsep}{12pt} % Horizontal padding on each side of a column
\begin{tabular}{@{} l c c c @{}}
\toprule
Method & PSNR $\uparrow$ & SSIM $\uparrow$ & LPIPS $\downarrow$ \\
\midrule
Bicubic  & 24.75 & 0.61 & 0.10 \\
Bilinear & 24.61 & 0.60 & 0.09 \\
Lanczos2~\cite{LanczosFilteringinOneandTwoDimensions} & 24.61 & 0.60 & 0.10 \\
Lanczos3~\cite{LanczosFilteringinOneandTwoDimensions} & 24.65 & 0.60 & 0.10 \\
Box      & 24.72 & 0.61 & 0.10 \\
\bottomrule
\end{tabular}
\end{table}

As shown in Table~\ref{tab:downsampling_comparison}, the variation across downsampling methods is minimal, with differences typically within the margin of error. Consequently, for our main experiments, we chose the bicubic kernel as the default downsampling method for data consistency.

\section{Choice of skip steps (k)}
\label{sec:appendix_k_sensitivity}
We varied $k$ from 1 (pure DPS) to 5 and observed that smaller values (2--3) provide the best trade-off between computational cost and reconstruction fidelity, as shown in Table~\ref{tab:k_vari}. Notably, $k=2$ yields higher PSNR and SSIM than $k=1$ (full DPS) while reducing inference time from 1194\,s to 873\,s. As $k$ increases beyond 3, speed gains become marginal, and quality metrics slightly decrease. To further validate our image-fusion step, we removed fusion for $k=2$, thereby omitting data consistency during the alternating steps. As expected, performance dropped notably---both quantitatively (PSNR fell from 24.75 to 17.88) and qualitatively, where reconstructions diverged from the ground-truth HR images.
\begin{table}[htp] % Use table* for a table that spans both columns
\centering
\caption{Quantitative evaluation for different values of $k$.}
\label{tab:k_vari} % must follow \caption so that \ref yields the table number
\label{Table3}
\small % Reduce the font size
\setlength{\tabcolsep}{12pt} % Horizontal padding on each side of a column
\begin{tabular}{@{} l c c c c @{}}
\toprule
$k$ & PSNR $\uparrow$ & SSIM $\uparrow$ & LPIPS $\downarrow$ & time (s) $\downarrow$ \\
\midrule
$k=1$ (DPS) & 24.23 & 0.58 & 0.11 & 1194 \\
$k=2$ & 24.75 & 0.61 & 0.10 & 873 \\
$k=3$ & 24.54 & 0.60 & 0.11 & 790 \\
$k=4$ & 24.44 & 0.59 & 0.11 & 756 \\
$k=5$ & 24.19 & 0.54 & 0.11 & 729 \\
\midrule
$k=2$ (without Image Fusion) & 17.88 & 0.36 & 0.39 & 727 \\
\bottomrule
\end{tabular}
\end{table}


\clearpage
\section{Additional Examples}
\label{appendix-additional}
\subsection{Unsupervised Methods}
\begin{figure}[H]
    \centering
    % keepaspectratio: fit within the width/height box without distorting the image
    \includegraphics[width=1\linewidth,height=0.85\textheight,keepaspectratio]{assets/appendix_unsupervised.drawio.png}
    \caption{Examples of super-resolution ($\times 2$) with unsupervised methods.}
\end{figure}

\subsection{Supervised Methods}
\begin{figure}[H]
    \centering
    % keepaspectratio: fit within the width/height box without distorting the image
    \includegraphics[width=1\linewidth,height=0.85\textheight,keepaspectratio]{assets/appendix_supervised.drawio.png}
    \caption{Examples of super-resolution ($\times 2$) with supervised methods. Blue: ours.}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=1\linewidth]{assets/MIDL_rebuttal_bid.drawio.png}
    \caption{Full-resolution LR and super-resolved HR (scaled to half to fit).}
    \label{fig:enter-label}
\end{figure}

\end{document}
