\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage{amsmath}

\usepackage{siunitx} % for number formatting and units

\jmlrvolume{-- Under Review}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025 submission}
\editors{Under Review for MIDL 2025}

\title[SM-GICNet]{A Symmetric Multi-level Gradient-Inverse Consistency Network for Large Deformation Image Registration}




% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Author Name1\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \orcid{1111-2222-3333-4444} \Email{abc@sample.edu}\\
\Name{Author Name2\midlotherjointauthor\nametag{$^{1}$}} \Email{xyz@sample.edu}\\
\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{1}$}}\\ 
\addr $^{1}$ Address 1
}

\begin{document}

\maketitle

\begin{abstract}

Three-dimensional (3D) deformable image registration is a critical technique in medical image analysis. While learning-based methods have made significant progress, they often lack robustness when handling large deformations arising from inter-subject heterogeneity, frequently requiring pre-registration and exhibiting a strong reliance on data-driven mechanisms.  To address these challenges, we propose an end-to-end \textbf{Symmetric Multi-level Gradient-Inverse Consistency Network (SM-GICNet)} designed to enhance both the accuracy and robustness of registration for highly heterogeneous images.  Our approach employs: 1) a symmetric multi-level registration framework incorporating attention mechanisms to simultaneously capture large and small deformations and accelerate registration, with attention gating at the high-resolution level to improve learning of complex deformation regions; 2) a forward-inverse deformation field consistency strategy at each level, employing simultaneous constraints on forward and inverse deformation fields to encourage mutual image proximity and improve deformation field stability and consistency; and 3) a gradient inverse consistency strategy to directly constrain bidirectional deformation fields, reducing reliance on data-driven aspects and suppressing deformation field complexity.  Experimental results demonstrate that our method, unlike state-of-the-art approaches, obviates the need for pre-registration (rigid or affine) and outperforms existing methods in directly handling large deformation registration tasks with high heterogeneity.  Our code is publicly available to facilitate further research and practical applications.

\end{abstract}

\begin{keywords}
Symmetric registration, Consistency-Constrained, Inverse-Consistent, Multi-level
\end{keywords}


\section{Introduction}

Medical image registration is a fundamental task in medical image analysis, aiming to establish a nonlinear spatial correspondence between a pair of images (source/moving and target/fixed images)~\citep{Ref1}.  Traditional registration methods typically rely on iterative optimization strategies to maximize similarity metrics in the transformation space~\citep{Ref1}. While these methods offer relatively stable performance, they suffer from low computational efficiency and limitations when handling complex deformation fields. Recently, deep learning-based registration methods have leveraged large datasets and the powerful modeling capabilities of neural networks~\citep{Ref1} to achieve significant performance improvements. These methods directly predict the deformation field via neural networks, demonstrating superior performance in nonlinear alignment tasks.

\begin{figure*}[htbp]
\floatconts
  {fig:image1}
  {\caption{(a) A symmetric multi-level registration framework incorporating an attention gating mechanism; (b) forward-inverse deformation field consistency strategy at each level; (c) a deformation field constraint based on gradient inverse consistency; and (d) a multi-level consistency strategy.}}
  {\includegraphics[width=0.9\linewidth]{image1.png}}
\end{figure*}

However, most existing deep learning methods focus on unidirectional registration, resulting in deformation fields that often lack inverse consistency and bidirectional symmetry when dealing with large deformations or high inter-subject heterogeneity. Some symmetric registration methods have been proposed to simultaneously predict forward and backward transformations between images~\citep{Ref1}, enhancing the invertibility and accuracy of the deformation field. However, these methods mainly rely on similarity-driven optimization, leading to slow convergence and high computational costs~\citep{Ref1}. Furthermore, without rigid or affine pre-registration, the registration performance for large deformation fields remains unsatisfactory, particularly in scenarios with high inter-subject heterogeneity or significant lesion variations~\citep{Ref1}. %(Balakrishnan et al., 2019; Dalca et al., 2019; de Vos et al., 2019; Qin et al., 2019).

To address these issues, we propose a \textbf{Symmetric Multi-level Gradient-Inverse Consistency Network (SM-GICNet)} capable of directly handling large deformation registration tasks with high heterogeneity without pre-registration.  Specifically, we introduce a method that combines a \textbf{symmetric multi-level} registration framework with an \textbf{attention gating mechanism} to capture deformation features at different scales and focus attention on complex deformation regions during the high-resolution stage. At each level, we employ a \textbf{forward-inverse deformation field consistency strategy}, where images A and B are simultaneously registered as each other's moving image, predicting both forward (A $\to$ B) and backward (B $\to$ A) deformation fields to ensure the stability and consistency of the transformations.  Furthermore, we incorporate a \textbf{gradient inverse consistency constraint} to directly regularize the gradient alignment of the forward and backward deformation fields, limiting deformation field complexity and mitigating reliance on purely data-driven optimization.

Our main contributions are:
\begin{itemize}
\item The design of a symmetric multi-level registration framework incorporating an attention gating mechanism.%, effectively capturing multi-scale deformation information and enhancing focus on complex deformation regions.
\item The utilization of a forward-inverse deformation field consistency strategy, which improves field stability and inverse consistency.
\item The use of a gradient inverse consistency constraint to reduce reliance on purely data-driven optimization.
\item The complete elimination of the need for pre-registration (rigid or affine registration), demonstrating superior performance to existing state-of-the-art methods in large deformation registration tasks with high heterogeneity in brain MRI.
\end{itemize}



\section{Related Work}

\subsection{Symmetric Diffeomorphic Registration}

Symmetry plays a crucial role in the registration of medical images, particularly in the estimation of deformations between paired images, significantly improving geometric consistency and precision \cite{Ref1}.  Early research often independently estimated transformations from image A to image B, and vice-versa, failing to guarantee that these transformations are inverse mappings of each other. Symmetric image registration addresses this limitation by explicitly enforcing inverse consistency between transformations, thus mitigating biases inherent in general-purpose directional image registration methods \cite{Ref1}. However, most existing deformable registration methods typically parameterize the deformable model using displacement fields. While this parameterization is simple and intuitive, it often neglects differential properties, including topology preservation and the invertibility of the deformation field \cite{Ref7, Ref24}. This makes it difficult to obtain truly symmetric inverse deformation fields.  Therefore, diffeomorphic registration \citep{DiffeomorphicCitation}, employing stationary velocity fields to realize a diffeomorphic deformation model, has been proposed. Theoretically, diffeomorphisms are differentiable and invertible, ensuring a smooth one-to-one mapping and thus preserving topology. The path of the diffeomorphic deformation field, $\phi_t$ parameterized by $t \in [0, 1]$, can be generated from the velocity field as:

\begin{equation} \label{eq:1}
\frac{d\phi_t}{dt} = v_t(\phi_t) = v_t \circ \phi_t
\end{equation}

Leveraging diffeomorphisms to construct inverse deformation fields for building symmetric registration networks is highly advantageous.


\subsection{Attention Gating Mechanism}

The Attention Gating Mechanism (AGM) is an attention mechanism readily integrated into various Convolutional Neural Networks (CNNs) to enhance performance in tasks such as image segmentation, object detection, and image classification \cite{Ref1}. The core concept mirrors human visual attention, focusing on salient regions while de-emphasizing less important areas. AGM guides the network to concentrate on significant features in the input data and suppress irrelevant information. Initially proposed in the context of medical image analysis to focus on target structures of varying shapes and sizes, AGM has since found broad application in computer vision and other domains.

In deep learning, the introduction of attention mechanisms enhances feature representation by dynamically adjusting the weight distribution of regions of interest. Numerous studies have shown that attention mechanisms significantly improve the accuracy of registration models in local regions, particularly in tasks requiring attention to fine anatomical structures \cite{Ref1}. Attention networks based on U-Net architectures are especially prevalent in image registration. For example, Attention U-Net \cite{AttentionUNetCitation} incorporates attention modules to focus on key regions of the input image, enhancing the model's response to specific areas while reducing interference from background noise and irrelevant features. This approach has demonstrated performance improvements on several public medical image datasets.  



\section{Methods}

We present a symmetric multi-level gradient inverse consistency network for large-deformation image registration. As illustrated in Figure~\ref{fig:image1}, it includes: (1) a symmetric multi-level registration framework incorporating an attention gating mechanism; (2) a forward-inverse deformation field consistency strategy at each level; and (3) a deformation field constraint based on gradient inverse consistency.



\subsection{Symmetric Multi-level Registration Framework}

A novel symmetric multi-level registration framework with attention gating is proposed to effectively capture multi-scale deformations between images.  The multi-level architecture comprises four consecutive levels, efficiently capturing both global and complex local deformations within a single forward pass.  Symmetry is achieved by alternately using each image as the moving image in a single registration, yielding $\Phi_{AB}$ and $\Phi_{BA}$.  Simultaneously, both forward ($\Phi_{AB}$) and inverse ($\Phi_{BA}^{-1}$) deformation fields are directly obtained for each image, enabling multi-constraint network learning.

The multi-level network is constructed using a 3D discrete wavelet transform (3D-DWT) \citep{Ghasemzadeh2018} to leverage both low-frequency global and high-frequency local information, following the input method of AMNet \citep{AMNetCitation}.  An attention gating mechanism is introduced at level 4 to automatically enhance deformation field learning in crucial regions while suppressing background influence, improving the model's ability to capture large deformations and fine structural details. Further details regarding the attention network architecture are provided in Appendix A.



\subsection{Forward-Inverse Deformation Field Consistency Strategy}

\begin{figure*}[htbp]
\floatconts
  {fig:result1}
  {\caption{Qualitative registration results by three different methods. The second row shows corresponding heat maps of the absolute difference with respect to the fixed image.}}
  {\includegraphics[width=1\linewidth]{result1.png}}
\end{figure*}



Our network employs a forward-inverse deformation field consistency strategy at each level, promoting bidirectional symmetry during unidirectional registration to enhance deformation field stability and consistency.  Following diffeomorphic principles, a stationary velocity field is used instead of a displacement field for parameterization.  The deformation field is defined as in Eq.~\eqref{eq:1}. The velocity field $v$ is integrated over a unit time using a scaling and squaring operation with a time step $T=7$ to obtain the final deformation field $\phi(1)$. The diffeomorphic model adapts to large or complex deformations, and since the output is a velocity field, the inverse velocity field, and subsequently the inverse deformation field, can be obtained by negating the velocity field. This forms the basis of the symmetric registration network.

Specifically, assuming the network learns the deformation field $\Phi_{M\to F}$ from moving image $M$ to fixed image $F$, the inverse deformation field $\Phi_{F\to M}$ is derived using diffeomorphic properties. The network then learns $\Phi_{F\to M}$ from $F$ to $M$, ensuring consistency between these two deformation fields. This reduces reliance on training data and improves the inverse representation capabilities during forward registration.



\subsection{Gradient Inverse Consistency and Forward-Backward Deformation Field Consistency Constraints}

In a single registration, images A and B are registered reciprocally, alternately serving as the moving image.  This yields both forward ($\Phi_{AB}$) and backward ($\Phi_{BA}$) deformation fields. To avoid data-driven dependence, a symmetry constraint is introduced: gradient inverse consistency is applied directly to the generated pair of deformation fields. This ensures that the matrix resulting from the forward deformation field and its inverse is consistent with the identity grid I, suppressing unreasonable complexities in the deformation field.




\subsection{Losses and objectives}
Our method employs a multi-component loss function to optimize image registration.  The total loss is a weighted sum of the following terms.

\textbf{Consistency Loss}: This term enforces consistency between forward and inverse transformations.  It comprises three sub-components.

\textbf{Symmetric Consistency}:
\begin{equation}
\mathcal{L}_{Sy} = \mathcal{L}_{MSE}(I^A \circ \Phi[I^A, I^B] , I^B \circ \Phi[I^B, I^A])
\end{equation}
\textbf{Inverse Consistency}:
\begin{equation}
 \mathcal{L}_{In} = \mathcal{L}_{MSE}(\Phi[I^A, I^B] \circ \Phi^{-1}[I^B, I^A] , \mathbf{I})
\end{equation}
\textbf{Multi-level Consistency}:
\begin{equation}
  \mathcal{L}_{Mu} = \mathcal{L}_{MSE}(\mathbf{w}_{i} , (\mathbf{w}_{i}+ \mathbf{w}_{i-1})/2 )
\end{equation}
The consistency loss is the weighted sum of these three terms:
\begin{equation}
\mathcal{L}_{consistency} = \lambda_s\mathcal{L}_{Sy} + \lambda_i\mathcal{L}_{In} + \lambda_m\mathcal{L}_{Mu}
\end{equation}
where $\lambda_s = 0.1$ and $\lambda_i = \lambda_m = 0.001$.

\textbf{Multi-level NCC Similarity Loss}: This term maximizes the Normalized Cross-Correlation (NCC) similarity between warped and fixed images across multiple resolution levels.  Let $\mathcal{L}_{NCC}^i$ denote the NCC loss at level~$i$. Then:
\begin{equation}
\mathcal{L}_{NCC} = \sum_{i} \alpha_i \mathcal{L}_{NCC}^i
\end{equation}
where $\alpha_i$ are weights assigned to each level.


\textbf{Smoothness Regularization Loss}: This term penalizes overly complex deformations by minimizing the squared $L_2$ norm of the deformation field gradients:
\begin{equation}
\mathcal{L}_{smooth} = \sum_{i} \beta_i \lVert \nabla \mathbf{d}_i \rVert^2_2
\end{equation}
where $\mathbf{d}_i$ is the deformation field at level~$i$ and $\beta_i$ are weights.


% \textbf{Anti-folding Loss}: This term prevents folding artifacts in the deformation field:
% $L_{antifold} = \sum_{i} \gamma_i \sum_{p} \delta(\Delta \mathbf{d}_i(p)) ||\Delta \mathbf{d}_i(p)||^2_2$, where  $\Delta \mathbf{d}_i(p)$ is the Laplacian of the deformation field at point p and level i, $\delta(.)$ is an indicator function for folding, and $\gamma_i$ are weights.

The total loss function is given by:
\begin{equation}
\mathcal{L}_{total} = \mathcal{L}_{consistency} + \mathcal{L}_{NCC} + \mathcal{L}_{smooth}
\end{equation}



\section{Experiment}

\subsection{Data}

We evaluated our method using the IXI\footnote{\url{https://brain-development.org/ixi-dataset/}} brain MRI dataset, which comprises T1-weighted 3D MRI scans.  A subset of 314 subjects was selected and partitioned into training (n=269), validation (n=15), and testing (n=30) sets.  All images underwent standard skull stripping and contrast correction preprocessing; however, no prior registration (e.g., affine or rigid transformations) was performed.  Image segmentation was performed using FreeSurfer software, employing its default brain template (e.g., fsaverage or a derivative thereof, using FreeSurfer version XXX), resulting in 36 regions of interest (ROIs).  All performance evaluations were based on the overlap of ROIs in the test images.  For comparison, a separate training and testing dataset was created using affine registration with FSL's \texttt{flirt} command.

\subsection{Training Details}

Our models were trained using the Adam optimizer on a single NVIDIA A100 GPU.  We trained the network for 4, 4, 6, and 6 epochs at levels 1, 2, 3, and 4, respectively. The initial learning rate was set to $1 \times 10^{-4}$ and was multiplied by 0.5 every 50k iterations after the first 60k iterations at each level.

\subsection{Evaluation metrics}

To evaluate registration accuracy, we employed the Dice Similarity Coefficient (DSC) and the Jacobian Determinant (JD). We first evaluated our method using the Dice score of the subcortical segmentation maps. The closer the Dice value is to 1, the better the overlap of the two images, indicating better registration performance.
The Jacobian determinant of the deformation field is used to evaluate the local properties of the deformation. For a deformation field $\phi$, the Jacobian determinant $J_{\phi}$ at each spatial location measures the local volume change induced by the transformation. We also evaluated the computational efficiency of the registration method by measuring the processing time required for a single pair of images.

\section{Results}

\begin{table}[htbp]
  \centering
  \caption{Quantitative evaluation of different registration methods. Higher DSC values indicate better performance, while lower proportions of $|JD| \le 0$ and registration times are preferred.}
  \renewcommand{\arraystretch}{1}
  \begin{tabularx}{\textwidth}{@{}l*{6}{>{\centering\arraybackslash}X}@{}}
    \toprule
    \textbf{} & \textbf{SyN} & \textbf{ICNet} & \textbf{ICNet w.affine} & \textbf{LapIRN} & \textbf{LapIRN w.affine} & \textbf{Ours} \\
    \midrule
    DSC & 0.753 ($\pm$0.103) & 0.289 ($\pm$0.211) & 0.714 ($\pm$0.081) & 0.478 ($\pm$0.272) & \textbf{0.803 ($\pm$0.069)} & \underline{0.797} \underline{($\pm$0.137)} \\
    $\vert JD \vert \le 0$ & 0.000 ($\pm$0.000) & 0.499 ($\pm$0.002) & 0.489 ($\pm$0.014) & 0.503 ($\pm$0.041) & \underline{0.488} \underline{($\pm$0.028)} & \textbf{0.483 ($\pm$0.027)} \\
    Time & about \SI{1}{\hour} & \textbf{\SI{0.252}{\second} ($\pm$\num{0.021})} & \SI{7.893}{\second} ($\pm$\num{0.02}) & \SI{1.100}{\second} ($\pm$\num{0.006}) & \SI{8.743}{\second} ($\pm$\num{0.006}) & \SI{0.272}{\second} ($\pm$\num{0.042}) \\
    \bottomrule
  \end{tabularx}
  \label{table}
\end{table}



\subsection{Comparisons with the state-of-the-art methods}

\begin{figure*}[htbp]
\floatconts
  {fig:result2}
  {\caption{(a) The deformed images resulting from reciprocal registration of the image pair, along with the resulting symmetric deformation fields.
(b) Ablation study results showing DSC and JD values obtained after removing each component of the network individually.}}
  {\includegraphics[width=1\linewidth]{result2.png}}
\end{figure*}


We compared our method with three widely used registration approaches:

\begin{itemize}
  \item SyN \citep{Avants2008} -- A widely used registration method implemented in the Advanced Normalization Tools (ANTs) software package.  We employed cross-correlation (CC) with a sampling radius of 4. A multi-resolution optimization strategy was used, consisting of three scales with iteration numbers \{100, 70, 50\} and Gaussian smoothing standard deviations \{1, 0.5, 0\}.  An initial affine transformation (Affine[0.1]) was applied, optimized using the mutual information (MI) metric with iteration numbers \{1000, 500, 250\}, a convergence threshold of $10^{-6}$, and a maximum of 10 iterations.

  \item ICNet \citep{ICNet} -- An inverse-consistent deep network for unsupervised deformable image registration. The model was trained from scratch using their official implementation, maintaining the same hyperparameters as specified in the original paper (which were determined to be optimal).

  \item LapIRN \citep{Mok2020} -- A multi-level diffeomorphic registration algorithm based on a Laplacian pyramid architecture, which utilizes three identical CNN-based registration networks to mimic multi-resolution scheme registration. We trained the model from scratch using their official implementation, with optimal parameters determined through grid search.
\end{itemize}

Table~\ref{table} summarizes the quantitative results of our method and three comparison methods across all ROIs.  For a fair comparison, LapIRN and ICNet were trained and tested on both unregistered and pre-registered datasets. Our method achieved the highest Dice score and the lowest proportion of $|JD| \le 0$ on the unregistered dataset. While LapIRN achieved slightly better performance on the pre-registered dataset, it incurred a significantly higher computational cost with only marginal improvement in accuracy.

Figure~\ref{fig:result1} illustrates a prediction example for different methods.  Our method demonstrates results closest to the fixed image, achieving high accuracy even without pre-registration. The energy map of high-frequency information provides useful guidance for further refining the registration of structural boundaries.

Figure~\ref{fig:result2}(a) presents the results of reciprocal registration for a pair of images.  Our network effectively registers these images, even in subjects with high heterogeneity, and the resulting deformation fields exhibit symmetry.



\subsection{Ablation Study}

Figure~\ref{fig:result2}(b) demonstrates the importance of each component of our model.  By individually removing the gradient inverse consistency constraint, the multi-level structure, the symmetry structure, and the attention mechanism, we observe that each component contributes to the overall performance.



\section{Conclusion}
We introduce a novel Symmetric Multi-level Gradient-Inverse Consistency Network (SM-GICNet) for robust large deformation image registration, specifically addressing the challenges posed by high inter-subject variability in medical images. Unlike many existing deep learning-based methods, SM-GICNet directly handles large deformations without requiring pre-registration steps. This is achieved through a synergistic combination of three key innovations: 1) a symmetric multi-level architecture incorporating an attention mechanism for efficient multi-scale deformation capture; 2) a forward-inverse deformation field consistency strategy to ensure bidirectional symmetry and stability; and 3) a gradient inverse consistency constraint to reduce reliance on purely data-driven optimization and limit deformation field complexity.





\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version

\midlacknowledgments{We thank a bunch of people.}

\bibliography{ref}



\appendix

\section{Attention network architecture}

\begin{figure*}[htbp]
% \begin{minipage}[b]{1.0\linewidth}
\floatconts
  {fig:attention}
  {\caption{Attention network architecture.}}
  {\includegraphics[width=1\linewidth]{attention.png}}
\end{figure*}

The level 4 architecture retains an input layer, three residual blocks, and three convolutional layers with stride 2 in the encoder, along with three residual blocks, three upsampling layers, and an output layer in the decoder.  Each residual block contains two consecutive convolutional layers. Three skip connections between the encoder and decoder incorporate attention gates.  The attention gate first applies a $1\times1\times1$ convolution to the input feature $F_g$ from the encoder and, similarly, to the downsampled feature from the corresponding encoder branch. These two outputs are then summed, followed by ReLU activation and another $1\times1\times1$ convolution to reduce the channel dimension to 1.  A sigmoid activation is applied to the result, resampled to match the original feature size, creating a 1D weight matrix. Finally, this weight matrix is multiplied with the input feature, producing a new feature map. This process enhances focus on registration-relevant local features, resulting in more accurate deformation fields and improved network convergence speed.









\end{document}
