\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% Import other packages outside of the template.
\usepackage{multirow}
\usepackage[normalem]{ulem}
\useunder{\uline}{\ul}{}

% Set up the \coord and \vol commands to default to \mathbf
% <https://www.reddit.com/r/LaTeX/comments/qsiaa0/comment/hkda786>
\newcommand{\coord}[1][\mathbf]{#1}
\newcommand{\vol}[1][\mathbf]{#1}

\jmlrvolume{-- 337}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\title[EPINR]{EPI Distortion Correction without Opposite Phase Encodings with Unsupervised INR-Based Deformable Registration}

\midlauthor{
    \Name{Tyler Spears\nametag{$^{1}$}} \orcid{0000-0002-9104-9074} \Email{tas6hh@virginia.edu}\\
    \addr $^{1}$ Dept. of Electrical \& Computer Engineering, University of Virginia, Charlottesville, VA, USA \\
    \AND
    \Name{Myla Goldman\nametag{$^{2}$}} \orcid{0000-0002-5675-945X}\Email{Myla.Goldman@vcuhealth.org}\\
    \addr $^{2}$ Dept. of Neurology, Virginia Commonwealth University School of Medicine, Richmond, VA, USA \\
    \AND
    \Name{P. Thomas Fletcher\nametag{$^{1}$}} \orcid{0000-0003-3417-2380} \Email{ptf8v@virginia.edu}
}

\begin{document}

\maketitle

\begin{abstract}
Diffusion MRIs (dMRIs) provide a detailed look at the structure of the brain, but the acquired images come with many distortions. Echo planar imaging (EPI) sequences, nearly universal for dMRIs, are highly sensitive to inhomogeneities of the magnetic field in the scanner. This results in severe geometric distortion (up to tens of millimeters) in the phase encoding direction, particularly in areas with strong changes in tissue density such as the brainstem, temporal, and frontal regions. A common method for correcting EPI distortion is to collect an image with the opposite phase encoding (PE) direction and reconstruct the magnetic susceptibility field. However, many dMRI protocols, some still in use today, do not include this auxiliary acquisition. Other methods have attempted to register the distorted EPI to an anatomical reference, with less accurate results. In this work, we propose EPINR, an unsupervised implicit neural representation (INR) based registration model that builds on these previous works. EPINR learns the susceptibility field by warping a single b0 image to a T1w reference, without opposite PE acquisitions. EPINR also leverages its smooth and continuous representation to apply higher-order regularizations calculated analytically. We evaluate EPINR against several comparison methods, both traditional and learning-based, over two dMRI datasets. We perform further ablation analyses on the effect of different components in EPINR. Finally, we discuss the reasons for EPINR's high performance, and how it can bring structural precision to previously compromised diffusion images. 
\end{abstract}

\begin{keywords}
Intermodal Registration, INR, Susceptibility Distortion Correction
\end{keywords}

\section{Introduction}

Diffusion MRI (dMRI) is the most detailed structural description of the brain we can acquire \textit{in vivo}. Unfortunately, that detail is compromised by severe geometric distortions, on the order of tens of millimeters, caused by magnetic field inhomogeneities in the scanner \citep{Farzaneh.etal.1990,Haskell.etal.2023}. These distortions affect nearly all dMRI scan sequences, and field inhomogeneities are produced whenever any tissue or material is in the scanner. So, nearly all dMRIs need this distortion, often called susceptibility distortion or echo planar image (EPI) distortion, to be corrected.

Several susceptibility distortion correction (SDC) methods have been previously proposed, but the most effective methods require auxiliary acquisitions explicitly for SDC. For example, the popular FSL \texttt{topup} \citep{Andersson.etal.2003} tool performs SDC with impressive accuracy \citep{Gu.Eklund.2019,Graham.etal.2017,Wang.etal.2017}, and is a staple in dMRI processing pipelines \citep{Glasser.etal.2013,Cieslak.etal.2021}. The \texttt{topup} tool requires only a single b0 collected with the phase encoding (PE) direction opposite to that of the primary diffusion weighted images (DWIs). However, that single image is not collected in many dMRI datasets, especially in places that rely on older sequences. Examples include: the Open Access Series of Imaging Studies 3 (OASIS3) dataset \citep{LaMontagne.etal.2019}, the Consortium for Reliability and Reproducibility (CoRR) dataset \citep{Zuo.etal.2014}, the MASiVar multisite and multiscanner dataset \citep{Cai.etal.2021}, the TractoInferno dataset \citep{Poulin.etal.2021}, and many other public (and private) dMRI datasets.

Alternative SDC methods have been proposed that only require an undistorted anatomical reference image, such as a T1-weighted (T1w) or T2-weighted (T2w) image, which are almost always collected in dMRI protocols \citep{Kybic.etal.2000}. However, these deformable registration-based methods are significantly less accurate than methods, such as \texttt{topup}, that require auxiliary images  \citep{Gu.Eklund.2019,Graham.etal.2017,Wang.etal.2017}. New unsupervised implicit neural representation (INR) \citep{Sitzmann.etal.2020} registration methods \citep{Wolterink.etal.2022,Byra.etal.2023} could bring new life into this decades-old problem of SDC without extra acquisitions.

In this work, we propose EPINR, an unsupervised INR-based deformable registration model for EPI distortion correction without extra acquisitions. EPINR learns a smooth displacement field, constrained to the PE direction, that aligns a b0 and T1w image pair for a particular subject. EPINR also leverages the spatially-continuous and smooth nature of INRs to \textit{analytically} compute Jacobians and higher-order derivatives for regularization \citep{Rueckert.etal.1999,Wolterink.etal.2022}. Our contributions in this work are as follows:
\begin{itemize}
    \item EPINR, an unsupervised INR-based registration model that is the first of its kind to be applied to SDC without auxiliary acquisitions.
    \item A comparison of previous methods, traditional and deep learning-based, on two DWI datasets with vastly different image characteristics, including an in-house dataset not used in any previous work on SDC validation.
    \item Validation that EPINR outperforms previous methods, and an analysis of where a previous state of the art method struggles.
    \item An ablation analysis on the effects of regularization schemes and domain-specific enhancements of EPINR.
    \item An open source implementation of EPINR at \href{https://github.com/TylerSpears/epinr}{https://github.com/TylerSpears/epinr}.
\end{itemize}

\section{Background}

EPI distortions cause a strong geometric warping during the data readout step of an EPI sequence, which affects both dMRIs and functional MRIs (fMRIs) \citep{Studholme.etal.2000,Gholipour.etal.2006}. This is caused by inhomogeneities in the magnetic susceptibility field in the scanner, causing a non-linear shift in the scanner's gradient field at tissue boundaries \citep[Chapter 4]{Johansen-Berg.Behrens.2014}. This distortion is almost entirely constrained to the phase encoding (PE) direction, and it is most severe in the sinuses and the brainstem, while also affecting temporal and frontal regions \citep{Treiber.etal.2016}. We also note that for dMRIs, SDC is usually performed only on images with no diffusion weighting (b0s), as they do not contain eddy current-induced distortions \citep{Andersson.etal.2017,Rohde.etal.2004}.

\subsection{SDC with Extra Acquisitions}

As mentioned, SDC methods that leverage auxiliary acquisitions often reconstruct the most accurate susceptibility fields \citep{Gu.Eklund.2019,Graham.etal.2017,Wang.etal.2017}. This includes fieldmap-based SDC and reverse PE SDC. Some methods suggested collecting fieldmaps that measure the magnetic inhomogeneity of the subject \citep{Jezzard.Balaban.1995,Wan.etal.1997,Reber.etal.1998}, similar to those in fMRI, but comparisons \citep{Tao.etal.2009,Fritz.etal.2014,Wang.etal.2017} have found that these approaches are sensitive to acquisition parameters and partial volume effects. Reverse PE SDC is more common, where a b0 image with an opposite PE direction is collected. This only adds a few seconds to the scan time, but is still absent in many datasets and sequences. That said, reverse PE images are sufficiently common, and their performance is very high, such that they are raised to a ``silver standard'' in SDC \citep{Gu.Eklund.2019}. These methods assume that images with opposite PEs will have an equal and opposite displacement field, and a parameterized field is optimized with these constraints.

Reverse PE SDC was first proposed by \citet{Chang.Fitzpatrick.1992} and popularized with the FSL \texttt{topup} tool \citep{Andersson.etal.2003}. Other methods of this class continue to be developed, such as those in \citet{Holland.etal.2010}, \citet{Irfanoglu.etal.2015} (DR-BUDDI), \citet{Hedouin.etal.2017}, and \citet{Liu.etal.2021}. Deep learning has also been applied, with EPIs of both PE directions given to a convolutional neural network (CNN) for predicting the susceptibility field \citep{Hu.etal.2020,Qiu.etal.2026,Zahneisen.etal.2020,Legouhy.etal.2022,Qiao.Shi.2022,ZaidAlkilani.etal.2024}. We also mention that pulse sequence design for reducing EPI distortion is an active area of research \citep{Haskell.etal.2023}, but these sequences are still experimental and cannot help with post hoc analyses.

\subsection{Deep Learning Deformable Registration}

\textbf{VoxelMorph \& CNN Registration.} Deep learning-based registration methods have exploded in popularity since the VoxelMorph framework \citep{Balakrishnan.etal.2019} was proposed. VoxelMorph uses a CNN that takes the fixed and moving images as inputs and predicts a deformation field that warps the moving image to the fixed image. The primary term of the objective function is an image similarity measure, but the objective also includes regularization terms that penalize the squared norm of the displacement spatial gradient. Note that this gradient is approximated with a finite difference method.

\textbf{INR-Based Registration.} INRs, first proposed as sinusoidal representation networks (SIRENs) in \citet{Sitzmann.etal.2020}, are multilayer perceptrons (MLPs) that learn a continuous signal representation by mapping coordinates to intensity values. INR registration represents a warp field by using an image similarity objective function, rather than image reconstruction \citep{Wolterink.etal.2022}. If properly constructed, this representation is $C^{\infty}$ smooth, so higher-order derivatives can be calculated analytically and used in advanced regularization techniques \citep{Rueckert.etal.1999}. \citet{Wolterink.etal.2022} was one of the first of these methods with implicit deformable image registration (IDIR). \citet{Sun.etal.2024} proposed the Neural Image Registration (NIR) model, which extended IDIR with a hybrid coordinate sampling scheme, and \citet{vanHarten.etal.2024} proposed a cycle-consistent deformation with dual INRs. \citet{Byra.etal.2023} analyzed the impact of different INRs on deformable brain image registration.

\subsection{Previous Works in SDC Without Auxiliary Acquisitions}

\textbf{Traditional Registration.} A third class of SDC method is to register the distorted b0 to an undistorted anatomical reference, constrained to the PE direction. While this problem may seem straightforward, the intermodal differences between a b0 (T2-weighted) and a T1w, along with the difference in noise levels and resolution, prove challenging. \citet{Kybic.etal.2000} was the first to propose a deformable registration SDC method. Later, \citet{Gholipour.etal.2006} proposed a Free-Form Deformation method for fMRIs, and \citet{Tao.etal.2009} used a variational approach to register the b0 with a \textit{T2w} reference. However, one of the most common approaches is to apply a general image registration framework, such as ANTs symmetric normalization (SyN) \citep{Avants.etal.2008}, to the task, as shown in \citet{Wang.etal.2017,Gu.Eklund.2019}. Unfortunately, registration-based correction, while requiring no extra acquisitions, also gives the lowest performance of the three SDC methods \citep{Gu.Eklund.2019,Graham.etal.2017,Wang.etal.2017}.

\textbf{Learning-Based Registration.} The availability of large DWI repositories has inspired applications of neural networks to EPI distortion correction. As mentioned previously, many works have used the opposite PE approach to predict the susceptibility fields, but few have attempted SDC without auxiliary images. \citet{Ye.etal.2021} proposed a model to perform simultaneous super-resolution and distortion correction with only one PE direction. \citet{Jimeno.etal.2024} proposed GDCNet, which applies VoxelMorph to EPI distortion in fMRIs, but has not yet been peer-reviewed at the time of this writing. Possibly the most popular deep network SDC method is Synb0 \citep{Schilling.etal.2019}, which is a conditional generative adversarial network (GAN) that synthesizes an undistorted b0 when given a distorted b0 and a T1. This method was trained in 2.5D (2D slices in all three axes) on a dataset of over 500 subjects, with output images at 2.5mm isotropic. The real and generated b0 images are then given to \texttt{topup} for SDC. This model was used for SDC for datasets in \citet{Begnoche.etal.2022}, \citet{Poulin.etal.2021}, \citet{Cai.etal.2021}, and more.

\section{EPINR}

\begin{figure}[htbp]
\begin{center}
    \includegraphics[width=\linewidth]{figs/epinr_net_diagram.pdf}
\end{center}
   \caption{Illustration of our proposed EPINR registration model. EPINR learns to warp a distorted b0 to a non-distorted T1w reference in an unsupervised manner with a displacement field (blue) constrained in the phase encoding direction. The $C^{\infty}$ smoothness of EPINR also allows for \textit{analytically} calculating higher-order regularization methods and Jacobian modulation \citep{Jezzard.Balaban.1995}.
   }
    \label{fig:epinr_net_diagram}
\end{figure}

Our goal with EPINR is to correct for EPI distortion in a diffusion image protocol using only a single b0 (affected by EPI distortion) and a T1-weighted anatomical reference (not affected by this distortion), and our model is illustrated in Figure \ref{fig:epinr_net_diagram}. The underlying susceptibility field can be calculated simply by scaling a displacement field $u(x, y, z)$, so registration and field reconstruction are the same problem here. Like classical registration models, we train EPINR in an unsupervised manner with a newly initialized model for every subject.

Suppose we have a spin echo pulse sequence \citep[Chapter 2]{Johansen-Berg.Behrens.2014} that produces a b0 image $\vol{M}$ distorted only in the PE direction, and a non-distorted T1w image $\vol{F}$. We assume, without loss of generality, that the PE direction is $y$. Additionally, suppose that as a preprocessing step, $\vol{F}$ has been rigidly aligned to match the orientation and translation of $\vol{M}$. Note that we keep $\vol{F}$ and $\vol{M}$ in their native resolutions as we will just sample locations in $\vol{F}$ and $\vol{M}$ during training. We denote this sampling operation as $\vol{A}[\coord{c}]$, which is a trilinear interpolation of volume $\vol{A}$ at the 3D physical coordinate $\coord{c}$. So, our network, a SIREN \citep{Sitzmann.etal.2020} in our implementation, $u_{\theta}: \mathbb{R}^3 \rightarrow \mathbb{R}^3$ with parameters $\theta$, maps $u_{\theta}(x_i, y_i, z_i) = (0, \Delta y_i, 0),$ with zeros in the non-PE directions.

It is known that EPI distortion in spin echo sequences is mass preserving \citep{Chang.Fitzpatrick.1992,Jezzard.Balaban.1995} in that compressed signal is ``piled-up'' to a higher intensity, and vice-versa for ``stretched'' areas. This is expressed as a division of the image intensity by the determinant of the Jacobian of the deformation field, and we need to compensate for that when applying our predicted displacement field. This is also known as Jacobian modulation \citep{Andersson.etal.2003}. So, our similarity loss is defined as:
\begin{equation}
\label{eq:epinr_sim_loss}
    \mathcal{L}_{\text{sim}} = 
    \frac{1}{N} \sum^N_{i=1}
    -\text{MI}\left(
        \det J_{u} \left(\coord{c}_i + u_{\theta}(\coord{c}_i) \right)
        \vol{M}\left[\coord{c}_i + u_{\theta}(\coord{c}_i)\right],
        \vol{F}\left[ \coord{c}_i \right]
    \right),
\end{equation}
where $\coord{c}_i$ is a patch of contiguous coordinates in the $i$'th batch, $J_u$ is the Jacobian of the deformation field produced by the EPINR network $u$, and MI is the mutual information similarity \citep{Viola.WellsIII.1997}. The $\det J_u$ term applies Jacobian modulation, but this modulation is changing along with the network weights during training. This complicates the optimization process (as mentioned in \citet{Andersson.etal.2001,Tao.etal.2009}), but potentially increases final registration accuracy as modulation will always be applied to the final undistorted b0. We also reiterate that $J_u$ is analytically calculated on the network parameters $\theta$, not with a finite difference approximation \citep{Wolterink.etal.2022}.

Taking inspiration from the SDC method in fmriprep \citep{Esteban.etal.2019a}, we also added a similarity term for the Laplacians of $\vol{F}$ and the undistorted $\vol{M}$:
\begin{equation}
\label{eq:epinr_sim_laplace}
    \mathcal{L}_{\text{lap}} = 
        \frac{1}{N} \sum^N_{i=1}
        -\text{NCC}\left(
             \nabla^2 \left(
                \det J_{u} \left(\coord{c}_i + u_{\theta}(\coord{c}_i) \right)
                \vol{M}\left[\coord{c}_i + u_{\theta}(\coord{c}_i)\right] 
            \right),
            \nabla^2 (1 - \vol{F} \left[ \coord{c}_i \right] )
        \right),
\end{equation}
where NCC is normalized cross-correlation, $\nabla^2$ is the image Laplacian operator, and $1 - \vol{F}$ flips the T1w intensity distribution to more closely match the b0.

Finally, we used two regularization functions: a smoothing loss as the Frobenius norm of the Jacobian, and a bending energy loss that requires the Hessian of the deformation \citep{Rueckert.etal.1999}. These are given as:
\begin{align}
\label{eq:epinr_regularize}
    & \mathcal{L}_{\text{smooth}} = 
    \| J_u(\coord{c}_i + u_{\theta}(\coord{c}_i)) \|_F =  
    \left| 1 + \frac{\partial u_\theta(\coord{c}_i)}{\partial y} \right| \\
    & \mathcal{L}_{\text{bend}} = 
    \sum_{p=1}^3 \sum_{q=1}^3 \sum_{r=1}^3 \left(\mathbf{H}^{(u)}_p(\coord{c}_i + u_\theta(\coord{c}_i))\right)_{q, r}^2 = 
    \sum_{q=1}^3 \sum_{r=1}^3 \left(\mathbf{H}^{(u)}_{p=2}(\coord{c}_i + u_\theta(\coord{c}_i))\right)_{q, r}^2, 
\end{align}
with simplified forms on the right side enabled by the constraint to the PE direction. Here, $\left(\mathbf{H}^{(u)}_p(\coord{c}_i + u_{\theta}(\coord{c}_i))\right)_{q, r}$ is the analytically calculated Hessian matrix of the deformation field produced by the network $u_{\theta}$ at the $p$'th output dimension with respect to input dimensions $q$ and $r$ (where $x=1$, $y=2$, and $z=3$). Together with their corresponding weight terms, these four loss terms form our full objective function.

\section{Experiments \& Results}

\subsection{Data}
\label{ss:epinr_data}

We evaluated EPINR and the comparison methods on two different datasets. All methods were either unsupervised or trained on other datasets, so there were no train/test splits. All evaluated b0s were acquired with an anterior to posterior (AP) PE direction, and only one b0 was used. All DWIs were denoised with Marchenko-Pastur principal component analysis (MP-PCA) \citep{Veraart.etal.2016} and had $B_1$ bias field correction applied with N4 \citep{Tustison.etal.2010}. For topup, multiple (two for MICA-MICS, three for VCU MS) AP b0s and PA b0s were used for SDC. T1w images were bias corrected with N4 and denoised with a non-local means filter \citep{Manjon.etal.2010}. Then, T1w images were rigidly registered to the \textit{distorted} b0s with ANTs neighborhood normalized CC \citep{Avants.etal.2008}. Tissue masks were estimated with the FSL brain extraction tool (BET) (b0) \citep{Smith.2002} and SynthStrip (T1) \citep{Hoopes.etal.2022}. These masks weighted voxel contributions in training.

The first dataset was the publicly available Microstructure-Informed Connectomics (MICA-MICS) dataset \citep{Royer.etal.2021} with $N=49$ subjects. DWIs were collected with a spin-echo sequence at 1.6mm isotropic resolution, TR=3500ms, TE=64.40ms, and an FSL-style total readout time of 0.05282 seconds. T1w images were acquired with an MP-RAGE sequence, 0.8mm isotropic resolution, TR=2300ms, and TE=3.14ms. The second dataset was an in-house DWI dataset collected at Virginia Commonwealth University (VCU) as part of a study involving multiple sclerosis (MS) \citep{Pearsall.etal.2024,Goldman.etal.2023}, which we name ``VCU MS.'' This study was approved by the VCU Institutional Review Board. We evaluated methods on $N=48$ subjects from VCU MS, including 10 healthy controls and 38 patients diagnosed with MS. DWIs were acquired at 1.875mm in-plane and 2mm slice thickness, TR=4900ms, TE=118ms, and a total readout time of 0.063054 seconds. The T1w images were acquired at 1mm isotropic resolution with TE=2.99ms.

\subsection{Implementation Details}

We implemented EPINR as a SIREN \citep{Sitzmann.etal.2020} MLP with 256 hidden features and 5 layers. Training was performed on randomly sampled patches, as a full b0 could not fit into the graphics processing unit (GPU) memory, with an effective batch size of 6 and 50 batches per epoch, over 60 epochs. Adam with weight decay \citep{Loshchilov.Hutter.2018} was used for optimization, with an initial learning rate of $5 \times 10^{-4}$ that linearly decreased to $5 \times 10^{-6}$ with a small number of warmup and cooldown epochs. Both the b0 and T1w volumes were normalized to $[0, 1]$. The input coordinates $\coord{c}_i$ were normalized to $[-1, 1]$ based on the b0's field of view, and MLP outputs were re-scaled to physical units. The image Laplacian $\nabla^2$ in Equation \ref{eq:epinr_sim_laplace} was implemented as a difference of Gaussian filters \citep{Marr.Hildreth.1980}.

\subsection{Comparison Methods}
\label{ss:comparison_methods}

We compare EPINR to several SDC methods on both datasets:

\begin{enumerate}
    \item \textit{EPINR} - Our proposed model that uses an INR to learn the displacement field between a b0 and a T1w image, without resizing either out of their native resolutions.
    \item \textit{Uncorrected} - The acquired b0 with preprocessing and no SDC applied.
    \item \textit{ANTs-SyN} - ANTs SyN deformable registration \citep{Avants.etal.2008} with parameters from the \texttt{antsIntermodalityIntrasubject.sh} script with PE direction constraints.
    \item \textit{QSIPrep} - Fieldmapless SDC built into the QSIPrep dMRI preprocessing pipeline \citep{Cieslak.etal.2021}, version 1.0.2. This is a tweaked variant of ANTs-SyN \citep{Avants.etal.2008} with custom preprocessing; this is given as ``experimental'' within QSIPrep.
    \item \textit{Synb0+Topup} - Synb0 synthesized undistorted b0 volume with follow-up SDC with topup \citep{Schilling.etal.2019}. Synb0 uses a trained GAN to synthesize an undistorted b0 at 2.5mm isotropic resolution. The preprocessed b0s and T1s were masked for input, using the scilpy \citep{Bore.etal.2025} wrapper for Synb0. The synthetic b0 is upsampled to the b0's native resolution, and topup performs SDC.
    \item \textit{Topup GT} - FSL topup \citep{Andersson.etal.2003} run with b0s that have AP and PA PE directions, considered as our ground truth.
\end{enumerate}

For our ablation analysis, we compared the full EPINR model with several variants: \textit{No Regular.}, which removed both regularization terms $\mathcal{L}_{\text{smooth}}$ and $\mathcal{L}_{\text{bend}}$; 
\textit{No Bend. Regular.}, which removed bending energy regularization term $\mathcal{L}_{\text{bend}}$;
\textit{No Lapl. Sim.}, where Laplacian similarity $\mathcal{L}_{\text{lap}}$ was removed;
\textit{No Jac. Mod.}, which disabled Jacobian modulation during training, although modulation was still applied after training; and
\textit{EPINR-128x3}, a reduced EPINR model with 3 hidden layers with 128 features.

\subsection{Evaluation Metrics}

We transform each model's predicted displacement field (in mm) to a susceptibility field (Hz) with scaling by the PE direction resolution and total readout time. Then, we use the \texttt{applytopup} FSL tool to apply the susceptibility field to the distorted b0 with Jacobian modulation. We then measure each model's performance over five metrics: 1) a mean-squared error (MSE) of the model's undistorted b0 vs. topup's undistorted b0, 2) MSE of the model's predicted susceptibility field vs. topup's reconstructed field, 3) MI similarity between the model's corrected b0 and the T1w reference, with both normalized to $[0, 1]$, 4) local normalized CC (LNCC) of the corrected b0 and the T1w reference image, and 5) percentage of negative voxels in the determinant of the Jacobian of the model's deformation field, calculated with a central difference method to keep evaluation consistent.

\subsection{Results}
\label{ss:comparison_results}

\begin{figure}[htbp]
\begin{center}
    \includegraphics[width=\linewidth]{figs/epinr_viz_results.pdf}
\end{center}
   \caption{Qualitative results of susceptibility distortion correction over three subjects. Below each undistorted b0, the predicted susceptibility field in Hz is displayed. From top to bottom, subjects are from MICA-MICS, VCU MS, and MICA-MICS.}
    \label{fig:epinr_qual_results}
\end{figure}

\begin{table}[htbp]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{l|l|llll}
Dataset &
  Model &
  \begin{tabular}[c]{@{}l@{}}b0-topup MSE \\ $\times 10^{-6} \downarrow$\end{tabular} &
  \begin{tabular}[c]{@{}l@{}}SF-topup MSE \\ $\times 10^{-3} \downarrow$\end{tabular} &
  MI b0-Anat. $\uparrow$ &
  LNCC b0-Anat. $\uparrow$ \\ \hline
\multirow{6}{*}{\begin{tabular}[c]{@{}l@{}}MICA-MICS\\ N=49\end{tabular}} &
  Uncorrected &
  1.3021 (0.300) &
  0.5397 (0.125) &
  0.3675 (0.034) &
  0.2271 (0.014) \\
 &
  ANTs-SyN &
  1.6665 (0.354) &
  0.5229 (0.092) &
  0.3342 (0.054) &
  0.2395 (0.017) \\
 &
  QSIPrep &
  1.4983 (0.361) &
  0.5710 (0.128) &
  0.3677 (0.033) &
  0.2252 (0.012) \\
 &
  Synb0+Topup &
  1.3741 (0.338) &
  1.4491 (0.267) &
  0.4285 (0.034) &
  0.2138 (0.013) \\
 &
  EPINR &
  \textbf{0.9878 (0.237)} &
  \textbf{0.4221 (0.087)} &
  \textbf{0.4299 (0.051)} &
  \textbf{0.2501 (0.016)} \\
 &
  Topup GT &
  - &
  - &
  0.4629 (0.039) &
  0.2639 (0.019) \\ \hline
\multirow{6}{*}{\begin{tabular}[c]{@{}l@{}}VCU MS\\ N=48\end{tabular}} &
  Uncorrected &
  4.1649 (1.375) &
  0.8881 (0.248) &
  0.2750 (0.040) &
  0.1621 (0.016) \\
 &
  ANTs-SyN &
  3.5424 (1.448) &
  0.7522 (0.189) &
  0.2167 (0.039) &
  0.1840 (0.016) \\
 &
  QSIPrep &
  3.2523 (1.066) &
  0.8075 (0.222) &
  0.2667 (0.038) &
  0.1720 (0.017) \\
 &
  Synb0+Topup &
  3.3660 (1.324) &
  2.7373 (0.900) &
  0.3276 (0.023) &
  0.1540 (0.016) \\
 &
  EPINR &
  \textbf{2.2992 (1.798)} &
  \textbf{0.6569 (0.526)} &
  \textbf{0.3690 (0.031)} &
  \textbf{0.1877 (0.018)} \\
 &
  Topup GT &
  - &
  - &
  0.3146 (0.027) &
  0.1780 (0.020)
\end{tabular}%
}
\caption{Results of different EPI distortion correction methods applied to two DWI datasets. Metric name arrows indicate the direction of better performance, and the best performance between all models is bolded.}
\label{tab:epinr_results}
\end{table}

Quantitative results for comparing all datasets, models, and metrics (except the Jacobian negative percentage) are shown in Table \ref{tab:epinr_results}, and examples from both datasets are shown in Figure \ref{fig:epinr_qual_results}. For the negative Jacobian percentage, the large majority of methods showed 0 negative voxels, except for: Synb0+topup on MICA-MICS (0.0152\%), Synb0+topup on VCU MS (0.0017\%), and regular topup on VCU MS (0.0006\%).

As shown in Table \ref{tab:epinr_results}, EPINR outperforms comparison methods on every metric we evaluated over both datasets except for b0 MSE in the MICA-MICS data, where the ablation model performed the best. We believe that this overall high performance may be due to the high number of parameters that allows EPINR to learn high-frequency warps. According to Figure \ref{fig:epinr_qual_results} rows 1 and 3, we also see how including the Jacobian modulation in training helps EPINR ``break through'' some of the more severe warping in frontal areas, compared to ANTs-SyN and QSIPrep. The 0 negative Jacobian voxels also indicate that the smooth and bending energy regularizers are effective at eliminating ``folds'' in the resulting deformation fields, in agreement with previous work in \citet{Wolterink.etal.2022}.

The performance of Synb0 is somewhat surprising. Despite its use in several publicly released datasets, Synb0 performed the worst on some of our metrics. The predicted susceptibility fields in Figure \ref{fig:epinr_qual_results} indicate that the topup registration to the synthesized b0 is a poor fit. This may be explained by the fixed spatial resolution of 2.5mm isotropic generated by Synb0, which must be upsampled to 1.6mm for MICA-MICS and 1.875mm for VCU MS and may be too large of a resolution gap for topup to handle. We also found that the generated b0s ``filled out'' brain tissue into the outer surface of the skull, which changed the basic shape and volume of the b0. This may be caused by an unexpected brain mask shape resulting from our preprocessing, but this would imply that Synb0 is overly sensitive to how masking is performed, which is a difficult task in distorted images.

The results also reveal the limitations of calculating image similarity between the b0 and T1w reference. In the VCU MS data, EPINR ``outperforms'' topup on image similarity for both MI and LNCC. However, a qualitative comparison of topup and EPINR still confirms that topup produces a better undistorted b0. This discrepancy is likely due to 1) EPINR's much higher number of parameters allowing it to somewhat overfit to the anatomical reference, and 2) the simple fact that image similarity between two modalities is not on its own sufficient to measure SDC performance. So, while b0-T1w similarity is often reported in the literature, interpreting results must be done carefully.

\subsection{Ablation Results}

To understand the different components of EPINR, we also performed an ablation analysis with subsampled datasets, where 10 subjects were randomly selected for both MICA-MICS and VCU MS. The ablated EPINR variants are listed in Section \ref{ss:comparison_methods}, and all training and evaluation configurations are the same as Section \ref{ss:comparison_results}.

The quantitative results for our ablation experiment are found in Table \ref{tab:epinr_ablate_results}, and example images are shown in Figure \ref{fig:ablation_qual_results}. Most models have 0\% negative Jacobian determinant voxels, except for the ``No Regular.'' model which had 0.230\% and 0.535\% negative Jacobian voxels for MICA-MICS and VCU MS, respectively. We find that, overall, the included loss terms and the parameters in EPINR all contribute to its performance in different ways. For $\mathcal{L}_{\text{lap}}$ under ``No Lapl. Sim.,'' we found that removing this term reduced EPINR's performance slightly in most metrics, although the differences are subtle qualitatively (Figure \ref{fig:ablation_qual_results} columns 1 and 4). Specifically, we observe that the Laplacian similarity helps prevent the model from displacing too far out of a T1w tissue boundary. Similarly, we found that the smaller 128x3 MLP (``EPINR-128x3'') reduced the model performance in most metrics due to the inability of the smaller network to capture high frequencies.

\begin{table}[htbp]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{l|l|llll}
Dataset &
  Model &
  \begin{tabular}[c]{@{}l@{}}b0-topup MSE \\ $\times 10^{-6} \downarrow$\end{tabular} &
  \begin{tabular}[c]{@{}l@{}}SF-topup MSE \\ $\times 10^{-3} \downarrow$\end{tabular} &
  MI b0-Anat. $\uparrow$ &
  LNCC b0-Anat. $\uparrow$ \\ \hline
\multirow{8}{*}{\begin{tabular}[c]{@{}l@{}}MICA-MICS\\ N=10\end{tabular}} &
  Uncorrected &
  1.3488 (0.259) &
  0.5784 (0.135) &
  0.3849 (0.038) &
  0.2304 (0.018) \\
 &
  No Regular. &
  1.8039 (0.399) &
  1.0106 (0.251) &
  \textbf{0.4575 (0.077)} &
  0.2539 (0.020) \\
 &
  No Lapl. Sim. &
  1.0700 (0.246) &
  0.4942 (0.133) &
  {\ul 0.4500 (0.061)} &
  0.2514 (0.019) \\
 &
  No Bend. Regular. &
  1.0214 (0.238) &
  0.4778 (0.124) &
  0.4491 (0.058) &
  \textbf{0.2547 (0.020)} \\
 &
  No Jac. Mod. &
  \textbf{0.9232 (0.156)} &
  \textbf{0.4315 (0.082)} &
  0.4069 (0.040) &
  0.2519 (0.021) \\
 &
  EPINR-128x3 &
  {\ul 0.9577 (0.238)} &
  0.5112 (0.140) &
  0.4400 (0.055) &
  0.2494 (0.019) \\
 &
  EPINR &
  1.0054 (0.220) &
  {\ul 0.4710 (0.124)} &
  0.4466 (0.063) &
  {\ul 0.2543 (0.020)} \\
 &
  Topup GT &
  - &
  - &
  0.4835 (0.045) &
  0.2681 (0.021) \\ \hline
\multirow{8}{*}{\begin{tabular}[c]{@{}l@{}}VCU MS\\ N=10\end{tabular}} &
  Uncorrected &
  4.1100 (1.229) &
  0.8621 (0.298) &
  0.2680 (0.040) &
  0.1627 (0.013) \\
 &
  No Regular. &
  2.8701 (1.582) &
  1.5217 (0.933) &
  \textbf{0.3886 (0.040)} &
  0.1860 (0.021) \\
 &
  No Lapl. Sim. &
  2.6543 (1.200) &
  0.7439 (0.475) &
  0.3699 (0.036) &
  0.1853 (0.017) \\
 &
  No Bend. Regular. &
  \textbf{2.6021 (1.238)} &
  \textbf{0.7357 (0.529)} &
  {\ul 0.3713 (0.036)} &
  \textbf{0.1882 (0.017)} \\
 &
  No Jac. Mod. &
  3.2984 (1.424) &
  0.8411 (0.563) &
  0.2655 (0.040) &
  0.1821 (0.017) \\
 &
  EPINR-128x3 &
  2.6682 (1.249) &
  0.7803 (0.560) &
  0.3546 (0.032) &
  0.1847 (0.017) \\
 &
  EPINR &
  {\ul 2.6046 (1.247)} &
  {\ul 0.7400 (0.540)} &
  0.3712 (0.037) &
  {\ul 0.1878 (0.017)} \\
 &
  Topup GT &
  - &
  - &
  0.3057 (0.024) &
  0.1760 (0.013)
\end{tabular}%
}
\caption{Ablation results for EPINR models applied to 20 randomly selected subjects (10 from each dataset). Arrows indicate the direction of better performance, the best score is bolded, and the second best is underlined.}
\label{tab:epinr_ablate_results}
\end{table}

\begin{figure}[htbp]
\begin{center}
    \includegraphics[width=0.8 \linewidth]{figs/epinr_ablation_viz_results.pdf}
\end{center}
   \caption{Qualitative results of the ablation experiment. Examples from the MICA-MICS and VCU MS datasets are shown in the top and bottom rows, respectively. Undistorted b0s are shown above their respective susceptibility field maps.}
    \label{fig:ablation_qual_results}
\end{figure}

For our regularization terms $\mathcal{L}_{\text{smooth}}$ and $\mathcal{L}_{\text{bend}}$, we found that some regularization is necessary to prevent invalid deformations. As discussed in Section \ref{ss:comparison_results}, removing regularization causes a high MI similarity, but qualitative inspection in Figure \ref{fig:ablation_qual_results} indicates that this is the model overfitting to the T1w. The bending energy term $\mathcal{L}_{\text{bend}}$ specifically was helpful in MICA-MICS, but may have caused a slight performance hit in VCU MS.

The impact of using Jacobian modulation during training is mixed, depending on the dataset. Training with the modulation enabled reduces performance in MICA-MICS, but raises performance in VCU MS. We hypothesize that this depends on the
contrast between white and gray matter in the b0 images, where
the higher contrast in MICA-MICS leads to the model ``overusing'' the
Jacobian modulation to match the contrast levels in the T1w image. However, in VCU MS, the contrast between gray and white
matter is very low. As shown in Figure 3, columns 3 and 4, we still see
that Jacobian modulation allows EPINR to properly decompress severe distortions, but at a higher risk of T1w overfitting.

\section{Discussion}

We have proposed EPINR, a novel INR-based unsupervised registration method for correcting EPI distortions in dMRIs without auxiliary acquisitions. We compared EPINR to several models currently in use, and found that EPINR produced higher quality susceptibility field reconstructions. We validated EPINR's flexibility over two dMRI datasets with different resolutions and image characteristics, and we performed an ablation analysis to better understand the effect of each component in EPINR.

We believe that EPINR's flexibility lends itself to further extensions. One more traditional extension is utilizing multiple acquisitions or modalities when performing SDC \citep{Tao.etal.2009,Irfanoglu.etal.2015}. These may be incorporated simply by adding similarity terms to the objective function, and may come from different modalities and resolutions. While EPINR's unsupervised nature is appealing, we believe that performance could be greatly improved by incorporating learned priors into the model via methods such as meta-learning \citep{Finn.etal.2017}. We would also be interested in expanding our analyses to look at the downstream effects of these SDC methods on microstructure modeling and tractography. Finally, we would also like to expand EPINR to EPI corrections in fMRI, although this task is more challenging as distortion in gradient echo images causes signal dropout instead of ``pile-up.'' If this could be tackled, EPINR would continue to let us make older data more useful.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank Dr. Miaomiao Zhang and the UVA Medical Image Analysis Lab for many helpful discussions. This work was partially supported by NSF Smart and Connected Health grant 2205417.}

\bibliography{midl26_337_submission/midl26_337}

\appendix

\section{EPINR Runtime}

We note that EPINR does have a high runtime when compared to most other methods. Experiments were run on a workstation with 20 CPU cores and an Nvidia RTX A5000 with 24~GiB of GPU memory. ANTs-SyN took approximately 1 minute for one subject, Synb0+topup required around 7 minutes, and QSIPrep took between 60 and 90 minutes to run the pipeline up to the SDC step. EPINR required between 45 and 60 minutes to fit one subject as configured in this experiment. One prominent bottleneck in EPINR's training time is the Hessian calculation for $\mathcal{L}_{\text{bend}}$ which, along with other parameter tweaks, may be disabled for faster training time.

% \appendix

% We initialized the output weight matrix with values chosen uniformly in the range $[-0.0001, 0.0001]$ to initialize the output displacements to be small, and disabled the output bias scalar to remove global translations. 
% The differentiable mutual information (MI) similarity function was implemented with a Parzen windowing scheme and 32 bins \citep{Pluim.etal.2003, Guo.2019}. 
% The MI similarity also included a tissue mask weighting of voxels which was used to weigh voxels while the joint histogram was being estimated.
% The image Laplacian was implemented as a difference of Gaussian blurs with higher $\sigma$ value being $1.6$ times the value of the lower $\sigma$ to approximate the image Laplacian \citep{Marr.Hildreth.1980}.

% \section{Proof of Theorem 1}

\end{document}
