\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{floatrow}
\usepackage[noend]{algpseudocode}
\usepackage{mathtools}
\usepackage{textcomp}
\newcommand{\pluseq}{\mathrel{+}=}
\newfloatcommand{capbtabbox}{table}[][\FBwidth]
%\jmlrvolume{-- Under Review}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
%\editors{Under Review for MIDL 2020}

\title[DRMIME]{DRMIME: Differentiable Mutual Information and Matrix Exponential for Multi-Resolution Image Registration}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
%   \midlauthor{\Name{Abhishek Nan} \Email{anan1@ualberta.ca}\and
%   \Name{Nilanjan Ray} \Email{nray1@ualberta.ca}\\
%   \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Abhishek Nan}\nametag{$^{1}$} \Email{anan1@ualberta.ca}\\
%\AND
\Name{Matthew Tennant\nametag{$^{2}$}} \Email{mtennant@ualberta.ca}\\
\Name{Uriel Rubin\nametag{$^{3}$}} \Email{urielrubin@gmail.com}\\
\Name{Nilanjan Ray\nametag{$^{1}$}} \Email{nray1@ualberta.ca}\\
\addr $^{1}$ Department of Computing Science, University of Alberta, Edmonton, Alberta, Canada \\
\addr $^{2}$ Department of Ophthalmology, University of Alberta, Edmonton, Alberta, Canada\\
\addr $^{3}$ Department of Ophthalmology, Hospital Aleman, Buenos Aires, Argentina
}

\begin{document}

\maketitle

\begin{abstract}
We present a novel unsupervised image registration algorithm using mutual information (MI). It is differentiable end-to-end and can be used for both multi-modal and mono-modal registration. The novelty here is that rather than using traditional ways of approximating MI, which are often histogram based, we use a neural estimator called MINE and supplement it with matrix exponential for transformation matrix computation. The introduction of MINE tackles some of the drawbacks of histogram based MI computation and matrix exponential makes the optimization process smoother. Our use of a multi-resolution objective function expedites the optimization process and leads to improved results compared to the standard algorithms available out-of-the-box in state-of-the-art image registration toolboxes, as empirically demonstrated on publicly available datasets.
\end{abstract}

\begin{keywords}
Image registration, mutual information, neural networks, differentiable programming, end-to-end optimization.
\end{keywords}

\section{Introduction}
Image registration is a common task required for digital imaging related fields that involves aligning two (or more) images of the same objects or scene. In medical image processing, we may wish to perform an analysis of a particular body part over a period of time. Images captured over time, of the same body part or location will change due to changes in the target organ over time as well as variability in angle and distance of the target organ from the capture device. Furthermore, different imaging modalities can provide different and additive information for the clinician or researcher regarding human tissue. A common way to perform a holistic analysis is to combine the (complementary) information from these different modalities. Alignment of the different modalities requires multi-modal registration.

One of the most successful metrics used for multi-modal medical image registration is mutual information (MI) \cite{MI_survey}, for which the most common computation method is histogram-based. As a result, MI suffers from the curse of dimensionality when multi-channel images are used. A recent MI estimation method, MINE (mutual information neural estimation) \cite{belghazi2018mine}, offers a way to curb this difficulty using a duality principle and a Monte Carlo method to estimate a lower bound for MI. Additionally, MINE is differentiable because it is computed by neural networks.

Our proposed registration method uses this differentiable mutual information, MINE, so that the automatic differentiation of modern optimization toolboxes, such as PyTorch \cite{NEURIPS2019_9015}, can be utilized. Additionally, our method computes the transformation matrix via the matrix exponential of a linear combination of basis matrices. We demonstrate experimentally that the matrix exponential method yields more accurate registration. Our method also makes use of multi-resolution image pyramids. Unlike a conventional method where computation starts at the highest level of the image pyramid and gradually proceeds to the lower levels, we simultaneously use all the levels in gradient descent-based optimization using automatic differentiation. We refer to our proposed method as DRMIME (differentiable registration with mutual information and matrix exponential). DRMIME is able to achieve state-of-the-art accuracy on two benchmark data sets: FIRE \cite{hernandez2017fire} and ANHIR \cite{ANHIR}.

\section{Background}

\subsection{Optimization for Image Registration}
Let us denote by $T$ the fixed image and by $M$ the moving image to be registered. Let $H$ denote a transformation matrix signifying affine or any other suitable transformation. Further, let $Warp(M,H)$ denote a function that transforms the moving image $M$ by the transformation matrix $H.$ Optimization-based image registration minimizes the following objective function to find the optimum transformation matrix $H$ that aligns the transformed moving image with the fixed image:
\begin{equation}
    \min_H D(T,Warp(M,H)),
\label{eqn:opt}
\end{equation}
where $D$ is a loss function that typically measures a distance between the fixed and the warped moving image.

\subsection{Matrix Exponential}

The optimization problem (\ref{eqn:opt}) can be carried out by gradient descent when the loss function $D$ and $Warp$ are differentiable with respect to elements of $H$ that are not constrained. When the elements of matrix $H$ are constrained, as in the rigid-body transformation, matrix exponential provides a remedy for gradient descent. For example, finding the parameters for rigid transformation can be seen as an optimization problem on a finite dimensional Lie group \cite{schroter2010lie}. 

One of the earliest works \cite{taylor1994minimization} shows how to perform optimization procedures over the Lie group \textit{SO(3)} and related manifolds. Later, Wachinger and Navab \cite{Wachinger} showed the use of matrix exponential for image sequence registration. For brevity, here we just state the mapping for the \textit{Aff(2)} group, which is the group of affine transformations on the 2D plane. This group has 6 generators:
\begin{equation*}
B_1=
\begin{psmallmatrix}
0 & 0 & 1\\
0 & 0 & 0\\
0 & 0 & 0\\
\end{psmallmatrix},
B_2=
\begin{psmallmatrix}
0 & 0 & 0\\
0 & 0 & 1\\
0 & 0 & 0\\
\end{psmallmatrix},
B_3=
\begin{psmallmatrix}
0 & 1 & 0\\
0 & 0 & 0\\
0 & 0 & 0\\
\end{psmallmatrix},
B_4=
\begin{psmallmatrix}
0 & 0 & 0\\
1 & 0 & 0\\
0 & 0 & 0\\
\end{psmallmatrix},
B_5=
\begin{psmallmatrix}
1 & 0 & 0\\
0 & -1 & 0\\
0 & 0 & 0\\
\end{psmallmatrix},
B_6=
\begin{psmallmatrix}
0 & 0 & 0\\
0 & -1 & 0\\
0 & 0 & 1\\
\end{psmallmatrix}.
\end{equation*}

If $v=[v_1,v_2,\ldots,v_6]$ is a parameter vector, then the affine transformation matrix is obtained using the expression: $Mexp(\sum_{i=1}^6v_i B_i)$, where $Mexp$ is the matrix exponentiation operation that can be computed as:
\begin{equation}
\small
    Mexp(A) = \sum_{n=0}^{\infty} \frac{A^n}{n!},    
\label{eqn:mat_exp}
\end{equation}
for a matrix $A.$
In DRMIME, we truncate the series after $10$ terms and empirically find that this choice yields good registration accuracy. The image registration optimization defined in (\ref{eqn:opt}) now takes the following form:
\begin{equation}
\small
    \min_{v_1,...,v_6} D(T,Warp(M,Mexp(\sum_{i=1}^6v_i B_i))).
\label{eqn:opt_me}
\end{equation}
We can apply standard mechanisms of partial derivative $\frac{\partial D}{\partial v_i}$ computation by automatic differentiation (i.e., chain rule) and adjust parameter $v_i$ by gradient descent.


\subsection{Multi-resolution Computation}

Large displacements between the fixed and the moving images pose a significant challenge for the optimization that can be mitigated by the use of multi-resolution pyramids \cite{thevenaz1998pyramid, kruger1998image, alhichri2002multi}. In a multi-resolution method, a pyramid of images is constructed where the original image lies at the bottom level and subsequent higher levels have down-scaled, Gaussian blurred versions of the image.

Using the multi-resolution recipe, two image pyramids are built: $T_l$ and $M_l,$ $l=1,...,L,$ where $L$ is the maximum level in the pyramid. $T_1=T$ and $M_1=M$ are the original fixed and moving images, respectively. The registration problem (\ref{eqn:opt_me}) takes the following form:
\begin{equation}
\small
    \min_{v_1,...,v_6} \sum_{l=1}^L D(T_l,Warp(M_l,Mexp(\sum_{i=1}^6v_i B_i))).
\label{eqn:opt_me_mr}
\end{equation}
The usual practice for a multi-resolution approach is to start computation at the highest (i.e., coarsest) level of the pyramid and gradually proceed to the original resolution. In contrast, we found that working simultaneously on all the levels as captured in the optimization problem (\ref{eqn:opt_me_mr}) is more beneficial. 

% Note that using the same transformation matrix $Mexp(\sum_{i=1}^6v_i B_i)$ for all resolution levels makes sense only when the image transformation i.e., $Warp$ uses the same canonical range of pixel coordinates at every resolution. For example, our implementation uses the range $[-1,1]\times[-1,1]$ for pixel coordinates. With this view, a multi-resolution pyramid adds more samples in the space $[-1,1]\times[-1,1]$ as we go from lower to higher resolutions.

However, note also that image structures are slightly shifted through multi-resolution image pyramids. So, a transformation matrix suitable for a coarse resolution may need a slight correction when used for a finer resolution. To mitigate this issue, we exploit matrix exponential parameterization and introduce an additional parameter vector $v^1=[v^1_1,...,v^1_6]$ exclusively for the finest resolution and modify optimization (\ref{eqn:opt_me_mr}) as follows:
\begin{equation}
\small
    \min_{\substack{v_1, \cdots, v_6 \\ v^1_1, \cdots, v^1_6}}
     \{\sum_{l=2}^L D(T_l,Warp(M_l,Mexp(\sum_{i=1}^6v_i B_i))) +
     D(T_1,Warp(M_1,Mexp(\sum_{i=1}^6(v_i+v^1_i) B_i))) \}.
\label{eqn:opt_me_mr2}
\end{equation}


\subsection{Mutual Information}

% While there are various metrics used for image registration, probably the simplest is mean squared error (MSE).
% If successfully registered, the MSE between the fixed and transformed moving image would be close to zero. 
% Often gradient descent based techniques can be used for such intensity-based measures to find the correct registration parameters \cite{klein2009adaptive}. This can also be framed as a supervised learning problem \cite{detone2016deep}, where the goal is to learn the parameters of the homography transformation.
Since different modalities can have different image intensities and varying contrast levels between them, it is unlikely that simply using Mean Square Error as a registration metric will work well. This is why Mutual Information (MI) is commonly used in multi-modality registration. MI, in general, is defined as a measure of dependence between two random variables. 
% Two highly dependent variables will have a high MI score, while two less dependent variables will have a low MI score. 
In the context of image registration, this means that two initially unregistered images will have an MI score which is lower than the MI score between the images once they are completely registered. Gradient-based methods \cite{maes1997multimodality} for MI based image registration work quite well for such cases. In these implementations, MI between two random variables is computed by Kullback-Leibler (KL-) divergence \cite{kullback1997information}, which uses both joint and marginal probability densities.
%\begin{equation}
%\small
%    \label{eqn:MI1}
%    MI = \int p_{XY}(x,y) \log %\dfrac{p_{XY}(x,y)}{p_X(x)p_Y(y)}dxdy,
%\end{equation}
 
For scalar-valued images, joint probabilities are calculated using a two-dimensional histogram of the two images. Most current MI-based techniques for registration use slight variations of the above method to approximate MI. While this works well, there are some issues associated with this method of evaluation as follows.
\begin{itemize}
    \item The number of histogram bins chosen becomes a hyperparameter. While increasing the number of bins would lead to better accuracy in computation, this comes at the cost of time. 
    % Furthermore, there is no theoretical upper bound on the number of bins that should be used for accurate results.
    \item Images with higher dimensions (e.g., color images) would need higher-dimensional histograms and a joint histogram requiring a very large sample size that is often computationally prohibitive. For instance, an RGB image with 3 channels would need a 6-dimensional joint histogram. A common way to bypass this restriction is to work with grayscale intensities of images, but this leads to loss of valuable information.
\end{itemize}

A potential solution to the above problems is presented by MINE \cite{belghazi2018mine} that uses the Donsker-Varadhan (DV) duality to compute MI (we provide a simple proof in the Appendix (Section \ref{sec:proof})):
\begin{equation}
\small
    MI = \sup_{f} J(f),
\label{eqn:MI2}
\end{equation}
where $J(f)$ is the DV lower bound:
\begin{equation}
\small
    J(f) =  \int f(x,z)P_{XZ}(x,z)dxdz - \log\left(  \int \exp(f(x,z)) P_{X}(x)P_{Z}(z)dxdz\right),
\label{eqn:DV}
\end{equation}
where $P_{XZ}$ is the joint density for random variables $X$ and $Z.$ $P_{X}$ and $P_{Z}$ are marginal densities for $X$ and $Z,$ respectively.
MINE uses a neural network to compute $f(x,z)$ and uses a Monte Carlo technique to approximate (\ref{eqn:DV}). MINE claims that the computation of (\ref{eqn:MI2}) scales better than histogram-based computation of MI \cite{belghazi2018mine}. 

The optimization for image registration (\ref{eqn:opt_me_mr2}) using mutual information now becomes:
\begin{equation}
\small
    \max_{\substack{v_1, \cdots, v_6 \\ v^1_1, \cdots, v^1_6 \\ \theta}}
     \{\sum_{l=2}^L DV(T_l,Warp(M_l,Mexp(\sum_{i=1}^6v_i B_i))) +
     DV(T_1,Warp(M_1,Mexp(\sum_{i=1}^6(v_i+v^1_i) B_i))) \},
\label{eqn:opt_drmime}
\end{equation}
where $\theta$ denotes the parameters of the neural network that MINE uses to realize $f.$ Notation $DV(X,Z)$ in (\ref{eqn:opt_drmime}) is used to denote DV lower bound (\ref{eqn:DV}) computed on two images $X$ and $Z$.

\section{DRMIME Algorithm}
\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.4]{Mine_revised.png}
    \caption{Pipeline for the DRMIME Registration algorithm}
    \label{fig:full_algorithm}
\end{figure}

Fig.~\ref{fig:full_algorithm} shows a schematic for the optimization problem (\ref{eqn:opt_drmime}). Our proposed Algorithm \ref{alg:DRMIME} implements DRMIME that uses DV lower bound (\ref{eqn:DV}) computed in turn by Algorithm \ref{alg:MINE}, which employs a fully connected neural network $f_\theta$, called MINEnet. MINEnet has two hidden layers with $100$ neurons in each layer. We use ReLU non-linearity in both the hidden layers. The Appendix contains details about implementation including learning rates, hyperparameters and optimizations used. The code for DRMIME is available \href{https://github.com/abnan/DRMIME}{here}.


\begin{algorithm2e}[h!]
\SetAlgoLined
{\textit{MINE}($X, Z, I$)} \\
\textbf{Input:} Image $X$, Image $Z$, Sampled pixel locations $I$ \\
\textbf{Output:} Estimated mutual information (DV lower bound) \\
\Indp
    Shuffle pixel locations: $I^s = RandomPermute(I)$ \;
    $N = length(I)$ \;
    $DV = \frac{1}{N}\sum_{j}f_\theta(X_{I_j}, Z_{I_j}) - \log(\frac{1}{N}\sum_{j}\exp(f_\theta(X_{I_j}, Z_{I^s_j})))$ \;
    Return $DV$;
\caption{Mutual Information Neural Estimation}
\label{alg:MINE}
\end{algorithm2e}


Algorithm \ref{alg:MINE} takes in two images $X$ and $Z$ along with a subset of pixel locations $I.$ It creates a random permutation $I^s$ of the indices $I.$ $I_j$ denotes the $j^{\text{th}}$ entry in the index list $I$, while $X_{I_j}$ denotes the $I_j^{\text{th}}$ pixel location on image $X.$ Finally, the algorithm returns the DV lower bound \cite{belghazi2018mine} computed by Monte Carlo approximation of (\ref{eqn:DV}). 

\begin{algorithm2e}[h!]
\SetAlgoLined
\textbf{Input:} Fixed image $T$, moving image $M$\\
\textbf{Output:} Transformation matrix $H_1$\\
%\Indp
    Set learning rates $\alpha$, $\beta$, $\gamma$ and pyramid level $L$\;
    Build multiresolution image pyramids $\{T_l\}_{l=1}^{L}$ from $T$ and $\{M_l\}_{l=1}^{L}$ from $M$\;
    Use random initialization for MINEnet parameters $\theta$ \;
    Initialize $v$ and $v^1$ to the 0 vectors \;
    \For {each iteration}{
        $MI = 0$ \;
        $H = Mexp(\sum_{i=1}^6 v_i B_i)$ \;
        $H_1 = Mexp(\sum_{i=1}^6 (v_i+v^1_i) B_i)$ \;
        $I_1 = \text{Sample pixel locations on } T_1$ \;
        $ MI \pluseq MINE(T_1, Warp(M_1, H_1), I_1)$ \; 
        \For {$l = [2,L]$}{
            $I_l = \text{Sample pixel locations on } T_l$ \;
            $ MI \pluseq MINE(T_l, Warp(M_l, H), I_l)$ \;
        }
        Update MINEnet parameter: $\theta \pluseq \alpha \nabla_{\theta} MI$ \;
        Update matrix exponential parameters: $v \pluseq \beta \nabla_v MI$ and $v^1 \pluseq \gamma \nabla_{v^1} MI$\;
    }
    Compute final transformation matrix: $H_1 = Mexp(\sum_{i=1}^6 (v_i+v^1_i) B_i)$ \;
\caption{DRMIME}
\label{alg:DRMIME}
\end{algorithm2e}

Algorithm \ref{alg:DRMIME} builds two image pyramids, one for the fixed image $T$ and another for the moving image $M$. Due to memory constraints, especially on the GPU, only a subset of pixel locations is sampled to enter the actual computations. This step appears as ``Subsample'' in Fig.~\ref{fig:full_algorithm}. We have used two variations of sampling: (a) randomly choosing only 10\% of pixel locations on each resolution and (b) finding Canny edges \cite{canny1986computational} on the fixed image and choosing only the edge pixels. Our ablation study shows a comparison between these two options. Fig.~\ref{fig:full_algorithm} illustrates two other computation modules---``Matrix Exponential'' and ``Geometric Transformation''---that denote the $Mexp$ and $Warp$ operations, respectively. 

\section{Datasets and Evaluation Metric}
The datasets chosen for our experiments correspond to testing two important hypotheses. First, performing image registration with our algorithm on images within the same modality fares comparably (or better) to other standard algorithms. For this, we use the FIRE dataset \cite{hernandez2017fire}. Second, since our algorithm is based on MI, it can handle multi-modal registration successfully as well. For this we use data from the ANHIR (Automatic Non-rigid Histological Image Registration) 2019 challenge  \cite{ANHIR}. Both datasets contain color images.

The FIRE dataset \cite{hernandez2017fire} provides 134 retinal fundus image pairs divided into 3 categories: S, P and A. The primary uses of the categories are Super Resolution, Mosaicing and Longitudinal Study, respectively. The dataset states that while categories S and A have $>75\%$ overlap, category P has very little overlap ($<75\%$); so none of the algorithms we evaluated (including ours) perform well on P category, leading to little or no registration in most cases (even diverging in some instances). So for a fair evaluation, we leave out category P.

The ANHIR dataset \cite{ANHIR} provides pairs of 2D microscopy images of histopathology tissue samples stained with different dyes. The task is difficult due to non-linear deformations affecting the tissue samples, different appearance of each stain, repetitive texture, and the large size of the whole slide images.

The FIRE dataset provides the location of 10 points in each image and the location of the corresponding 10 points in the paired (to-be-registered) image. These points were obtained by annotations created by experts and further refined to mitigate human error. The ANHIR dataset usually contains more than 10 ground truth points. 

Once we obtain the transformation matrix, we transform ground truth points on the moving image and compute the Euclidean distance between these transformed points and the ground truth points on the fixed image. Further we normalize these distances between 0 and 1 and call this metric Normalized Average Euclidean Distance (NAED). In our evaluation we used NAED for both the datasets. For ANHIR, only 230 pairs are available with their ground truth as part of the training data, so we only evaluate on this set of images.

\begin{figure}
\begin{floatrow}

\capbtabbox{%
  \begin{tabular}{||c|c|c||}
        \hline
        Algorithm & NAED ($\mu$ $\pm$ $\sigma$) & p-value \\ 
         \hline\hline
         DRMIME($v$) & \textbf{0.0048} $\pm$ 0.014 & - \\ 
         \hline
         DRMIME & \textbf{0.0048} $\pm$ 0.026 & - \\ 
         \hline
         NCC & 0.0194 $\pm$ 0.033 & 1.3e-04 \\
         \hline
         MMI & 0.0198 $\pm$ 0.034 & 5.4e-05 \\
         \hline
         NMI & 0.0228 $\pm$ 0.032 & 1.7e-08 \\
         \hline
         JHMI & 0.0311 $\pm$ 0.046 & 4.5e-07 \\
         \hline
         AMI & 0.0441 $\pm$ 0.028 & 1.4e-27 \\
         \hline
         MSE & 0.0641 $\pm$ 0.094 & 3.5e-03 \\ [1ex] 
        \hline
    \end{tabular}
}{%
  \caption{NAED for FIRE dataset along with paired t-test significance values
  \label{tab:fire_res}}%
}
\ffigbox{%
  \includegraphics[trim={0 2.5cm 0 0},scale=0.3]{FIRE_Stats.png}%
}{%
  \caption{Box plot for NAED of the best 5 performing algorithms on FIRE
  \label{fig:fire_res}}%
}
\end{floatrow}
\end{figure}

\section{Experiments and Discussion}
Competing algorithms were selected based on whether they use MI or can be used for multi-modal registration. More information about these algorithms can be found in the Appendix \ref{sec:comp_algo}: (a) Mattes Mutual Information \textbf{(MMI)}, (b) Joint Histogram Mutual Information \textbf{(JHMI)}, (c) Normalized Cross Correlation \textbf{(NCC)}, (d) Mean Square Error \textbf{(MSE)}, (e) AirLab Mutual Information \textbf{(AMI)}, (f) Normalized Mutual Information \textbf{(NMI)}.

While it is possible to use perspective transforms with DRMIME by changing the coefficient vector dimension and generators for matrix exponential, in order to have a fair comparison, we limit our algorithm to affine transform, because most libraries do so.
The implementations of the above algorithms were taken from the following packages: \textbf{SITK}: MMI, JHMI, NCC, MSE. \textbf{AirLab}: AMI. \textbf{SimpleElastix}: NMI.
% \begin{itemize}
%     \item SITK: MMI, JHMI, NCC, MSE
%     \item AirLab: AMI
%     \item SimpleElastix: NMI
% \end{itemize}

For all evaluations, we also conduct a paired t-test with DRMIME to investigate if the results are statistically significant (p-value $<$ 0.05). Fig. \ref{fig:fire_samples} and Fig. \ref{fig:anhir_samples} in the Appendix show registration results for two random samples from FIRE and ANHIR datasets, respectively. 

\subsection{Accuracy}
Table \ref{tab:fire_res} shows the NAED for all algorithms on the FIRE dataset. Here, DRMIME performs almost an order of magnitude better than the competing algorithms and the results are statistically significant. Fig.~\ref{fig:fire_res} presents box plots of the same metrics from Table \ref{tab:fire_res}. We note that DRMIME has very few outliers due to the robustness of the algorithm. Table \ref{tab:fire_res} also shows a variation DRMIME($v$) that does not use the finetuning coefficients $v^1$ for the finest resolution. For the FIRE dataset, we do not notice any difference between these two versions.

\begin{figure}
\begin{floatrow}

\capbtabbox{%
  \begin{tabular}{||c|c|c||}
    \hline
    Algorithm & NAED ($\mu$ $\pm$ $\sigma$) & p-value \\ 
     \hline\hline
     DRMIME($v$) & 0.0393 $\pm$ 0.081 & - \\ 
     \hline
     DRMIME & \textbf{0.0384} $\pm$ 0.087 & - \\ 
     \hline
     NCC & 0.0461 $\pm$ 0.084 & 7.0e-04 \\
     \hline
     MMI & 0.0490 $\pm$ 0.082 & 6.2e-05 \\
     \hline
     MSE & 0.0641 $\pm$ 0.094 & 5.5e-14 \\
     \hline
     NMI & 0.0765 $\pm$ 0.090 & 3.0e-31 \\
    \hline
     AMI & 0.0769 $\pm$ 0.090 & 3.7e-30 \\
     \hline
    JHMI & 0.0827 $\pm$ 0.100 & 8.3e-21 \\  [1ex] 
     \hline
\end{tabular}
}{%
  \caption{NAED for ANHIR dataset along with paired t-test significance values
  \label{tab:anhir_res}}%
}
\ffigbox{%
  \includegraphics[trim={0 2.5cm 0 0},scale=0.3]{ANHIR_Stats.png}%
}{%
  \caption{Box plot for NAED of the best 5 performing algorithms on ANHIR
  \label{fig:anhir_res}}%
}
\end{floatrow}
\end{figure}
Table \ref{tab:anhir_res} presents the NAED metrics for the ANHIR dataset. While the margin of improvement is not as large as in case of the FIRE dataset, DRMIME is still statistically the best performing algorithm. The box-plots in Fig. \ref{fig:anhir_res} also emphasise the same conclusion as we saw before, i.e. DRMIME outperforms the other competing algorithms. DRMIME algorithm using $v^1$ shows a slight improvement in accuracy for the ANHIR dataset.

\subsection{Efficiency}
On a set of 10 randomly selected images (the set remains the same across all algorithms) from the FIRE dataset, we run these two sets of experiments for all the algorithms. We report the registration accuracy in terms of the ground truth (NAED) of these 10 images. The hardware for these experiments was NVIDIA GeForce GTX 1080 Ti, Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz, 32GB RAM.

We run each algorithm for 1000 epochs, report the time taken and the accuracy achieved. The time taken tells us the fastest algorithm among those being considered, and also at the same time, its accuracy should at least be on par with other slower algorithms.
% 22671.17363 sec
\begin{center}
    \begin{table}[!h]
    \caption{Time taken for 1000 epochs and resultant NAED (lower is better)}
    \centering
    \begin{tabular}{||c|c c||}
        \hline
        Algorithm & Time (seconds) & NAED \\ 
         \hline\hline
         DRMIME (50 epochs) & \textbf{58} & 0.02037\\
         \hline
         NMI & 60 & 0.02503\\
         \hline
         AMI & 620 & 0.02942\\ 
         \hline
         DRMIME & 1425 & \textbf{0.00368} \\ 
         \hline
         MMI & 2904 & 0.00598\\
         \hline
         JHMI & 1859 & 0.00605\\
         \hline
         NCC & 3804 & 0.00697\\
          \hline
         MSE & 2847 & 0.02918\\ [1ex] 
        \hline
    \end{tabular}
    \label{tab:perf_table1}
\end{table}
\end{center}

From Table \ref{tab:perf_table1}, we can infer that while our algorithm attains the best NAED, it ranks third in terms of time taken to execute 1000 epochs. While AMI and NMI are faster, they are almost an order of magnitude worse in terms of the NAED performance.

Since this is a tradeoff between time and accuracy, DRMIME can perform extremely well at both ends of the spectrum. For instance, while individual epochs on AMI and NMI might be faster, we can achieve comparable accuracy by running DRMIME for far fewer epochs; within 50 epochs of optimization, DRMIME achieves an NAED of 0.02037, taking only 58 seconds. The reason for a single epoch taking longer for DRMIME can be attributed to the fact that it works with batched data.

Also as a note, only DRMIME and AMI are GPU compatible, while the remaining algorithms run on CPU.

\subsection{Ablation study}
\label{sec:ablation}
In this section we perform several ablation studies to have an understanding of the roles of all the components used in DRMIME, such as multi-resolution pyramids, matrix exponential and smart feature selection via Canny edge detection. We compare the performance of DRMIME to versions of it without using the aforementioned components.

\subsubsection{Effect of multi-resolution}
All hyperparameters are kept the same in the experiments with and without multi-resolution; the only difference is that the multi-resolution experiment uses 6 levels of the Gaussian pyramids in the DRMIME algorithm, whereas the experiment without it uses a single level, which is the native resolution of the image. Table \ref{tab:ablation_table1} lists the results for these experiments.


\begin{center}
    \begin{table}[!h]
    \caption{NAED for DRMIME with and without using multi-resolution pyramids}
    \centering
    \begin{tabular}{||c|c|c|c||}
        \hline
        \textbf{Dataset} & \textbf{DRMIME} & \textbf{Without MultiRes} & \textbf{p-value} \\ 
         \hline\hline
         FIRE & 0.0048 $\pm$ 0.014 & \textbf{0.0043} $\pm$ 0.014 & 0.365 \\ 
         \hline
         ANHIR & \textbf{0.0384} $\pm$ 0.087 & 0.1089 $\pm$ 0.150 & 1.78e-15\\
         \hline
    \end{tabular}
    \label{tab:ablation_table1}
\end{table}
\end{center}

While the idea of multi-resolution was introduced in image registration to facilitate optimization, we note that many of the off-the-shelf algorithms have the same learning rate for all levels. As we are working with only an approximation of the distribution of the original data at different levels of the pyramid, there is a small chance that optimization at a particular sublevel could diverge. This leads to poor registration results occasionally. In our implementation of DRMIME, we produce batches which include data from all levels of the pyramid, making the optimization process much more robust, faster and less prone to divergence. Fig.~\ref{fig:fire_res} provides evidence for this since very few results fall outside the interquartile range (as compared to other algorithms).

\subsubsection{Effect of matrix exponentiation}
All hyperparameters are again kept the same in the with and without experiments; the only difference being, that rather than using a manifold basis vector, we now have 6 parameters indicating the degrees of freedom of an affine transform in a transformation matrix, i.e.
\begin{equation*}
\begin{psmallmatrix}
\theta_1 & \theta_2 & \theta_3\\
\theta_4 & \theta_5 & \theta_6\\
0 & 0 & 1\\
\end{psmallmatrix}.
\end{equation*}


\begin{center}
    \begin{table}[!h]
    \caption{NAED for DRMIME with and without using matrix exponentiation}
    \label{tab:ablation_table2}
    \centering
    \begin{tabular}{||c|c|c|c||}
        \hline
        \textbf{Dataset} & \textbf{DRMIME} & \textbf{Without Manifolds} & \textbf{p-value} \\ 
         \hline\hline
         FIRE & 0.0048 $\pm$ 0.014 & \textbf{0.0045} $\pm$ 0.015 & 0.4933 \\ 
         \hline
         ANHIR & \textbf{0.0384} $\pm$ 0.087 & 0.0580 $\pm$ 0.134 & 0.0012\\
         \hline
    \end{tabular}
\end{table}
\end{center}

Table \ref{tab:ablation_table2} presents the results for these experiments. While the ablation study on the FIRE dataset yields similar results, the p-value from the paired t-test tells us that the difference is not statistically significant, so no conclusion can be drawn from it. The ANHIR dataset, on the other hand, sees a statistically significant improvement with the use of matrix exponentiation.

\subsubsection{Effect of Sampling strategy}
It could be argued that our smart feature extraction via Canny edge detection helps DRMIME perform better than other algorithms, since other algorithms do not have such custom feature detectors embedded in their pipeline. In order to reduce this potential confounding variable, we also assessed the performance of DRMIME with random sampling to make a fair comparison.

\begin{center}
    \begin{table}[!h]
    \caption{NAED for MINE with Canny edge detection and Random Sampling (10\%)}
    \centering
    \begin{tabular}{||c|c|c|c||}
        \hline
        \textbf{Dataset} & \textbf{With Canny} & \textbf{Random Sampling(10\%)} & \textbf{p-value} \\ 
         \hline\hline
         FIRE & \textbf{0.0048} $\pm$ 0.014 & 0.0097 $\pm$ 0.026 & 0.0296 \\ 
         \hline
         ANHIR &  \textbf{0.0384} $\pm$ 0.087  & 0.0588 $\pm$ 0.167 & 0.0333 \\
         \hline
    \end{tabular}
    \label{tab:ablation_table3}
\end{table}
\end{center}

Table~\ref{tab:ablation_table3} presents these results. As can be seen, there is a small drop in performance, but DRMIME still performs better than all the other algorithms with FIRE (Table~\ref{tab:fire_res}) and better than most other algorithms with ANHIR (Table~\ref{tab:anhir_res}). This comes at a small cost of the optimizer taking longer to converge. It is important to note that the DRMIME results use only 10\% sampling due to limited memory available on the GPU, whereas the other algorithms use 50\% sampling (see Appendix).

\bibliography{Nan20}

\newpage
\section{Appendix}
\label{sec:appendix}
\subsection{DV Lower Bound Reaches Mutual Information}
\label{sec:proof}
MINE maximizes the DV lower bound (\ref{eqn:DV}) with respect to a function $f(x,z)$. Let us consider a perturbation function $g(x,z)$ and the perturbed objective function $J(f+\epsilon g)$ for a small number $\epsilon$. Taking the following limit (using L'H\^{o}pital's rule), we obtain:
\begin{equation}
\small
  \lim_{\epsilon\to0}\frac{J(f+\epsilon g) - J(f)}{\epsilon} = \int g(x,z)\left[P_{XZ}(x,z) -
   \frac{\exp(f(x,z))P_X(x)P_Z(z)}{\int \exp(f(x,z))P_X(x)P_Z(z)\,dx\,dz}\right]dx\,dz.
\end{equation}
Using principles of calculus of variations~\cite{gelfand2000calculus}, this limit should be 0 for $J$ to achieve an extremum. Since the perturbation function $g(x,z)$ is arbitrary, this condition is possible only when
\begin{equation}
P_{XZ}(x,z) = \frac{\exp(f(x,z)) P_{X}(x) P_{Z}(z)}{\int \exp(f(x,z)) P_{X}(x) P_{Z}(z) \,dx\,dz},
\label{eq:gibbs}
\end{equation}
i.e., the Gibbs density \cite{belghazi2018mine} is achieved.
From (\ref{eq:gibbs}), we obtain:
\begin{equation}
    f(x,z) = \log\left(\frac{P_{XZ}(x,z)}{P_{X}(x)P_{Z}(z)} \int \exp(f(x,z)) P_{X}(x) P_{Z}(z) \,dx\,dz\right).
\end{equation}
Using this expression in equation (\ref{eqn:DV}), we obtain:
\begin{equation}
    J(f) = \int P_{XZ}(x,z) \log \frac{P_{XZ}(x,z)}{P_{X}(x)P_{Z}(z)} \,dx\,dz.
\end{equation}
Thus, maximization of $J(f)$ leads to mutual information.
\subsection{Competing algorithms}
\label{sec:comp_algo}
\begin{enumerate}
    \item\textbf{Mattes Mutual Information (MMI)} \cite{mattes2001nonrigid, mattes2003pet, MMI}: MI is usually defined as:
    \begin{equation}
    \small
        \label{eqn:MI1}
        MI = \int p_{XY}(x,y) \log \dfrac{p_{XY}(x,y)}{p_X(x)p_Y(y)}\,dx\,dy.
    \end{equation}
According to equation (\ref{eqn:MI1}), we need to compute the joint ($p_{XY}$) and marginal ($p_X, p_Y$) probabilities of the fixed and moving images. To reduce the effects of quantization from interpolation and discretization due to binning, this version of MI computation uses Parzen windowing to form continuous estimates of the underlying image histogram.
    
    \item \textbf{Joint Histogram Mutual Information (JHMI)} \cite{thevenaz2000optimization, JHMI}: This method computes Mutual Information using Parzen windows as well, but it uses separable Parzen windows. By selection of a Parzen window that satisfies the partition of unity, it provides a tractable closed-form expression of the gradient of the MI computation with respect to transformation parameters.
    
    \item \textbf{Normalized Cross Correlation (NCC)}~\cite{NCC}: As the name says, the correlation between the moving and the fixed image pixel intensities is computed. The correlation is normalized by the autocorrelations of both the fixed and moving images.
    
    \item \textbf{Mean Square Error (MSE)}\cite{MSE}: This is the mean squared difference of the pixelwise intensity between the fixed and moving image.
    
    \item \textbf{AirLab Mutual Information (AMI)}\cite{DBLP:journals/corr/abs-1806-09907}: AirLab is a PyTorch based image registration framework. It performs histogram based mutual information computation\cite{viola1997alignment,maes1997multimodality}. Since it is a deep learning based solution, it provides support for using batches as well as state-of-the-art optimizers and GPU support.
    
    \item \textbf{Normalized Mutual Information (NMI)}\cite{studholme1999overlap, NMI}: The initial PDF (probability density function) construction is done using Parzen histograms, and then MI is obtained by double summing over the discrete PDF values. In this metric, the final MI is normalized to a range between 0 and 1.
\end{enumerate}

\begin{figure*}[h!]
    \centering
    \includegraphics[scale=0.37]{FIRE_Comp.png}
    \caption{The images on the left show a pair to be registered from the FIRE dataset. The images on the right represent the difference between the transformed moving image and the fixed image after registration by different algorithms.}
    \label{fig:fire_samples}
\end{figure*}

\begin{figure*}[h!]
    \centering
    \includegraphics[scale=0.37]{MINE_samples.png}
    \caption{The images on the left show a pair to be registered from the ANHIR dataset. The images on the right represent the difference between the transformed moving image and the fixed image after registration by different algorithms.}
    \label{fig:anhir_samples}
\end{figure*}

\subsection{Preprocessing}
\subsubsection{FIRE}
Each image in this dataset is $2912 \times 2912$ pixels, but only the central portion of each image contains the retinal fundus, the rest of the image being black. While it's possible to use masks to remedy this, not all frameworks support masks, so in order to have a fair comparison across all algorithms, we crop these images to include only the retinal fundus. The cropping was selected such that it includes no blank (black) space and it remains rectangular (square). The cropped area was $1941 \times 1941$ pixels.
\subsubsection{ANHIR}
The ANHIR dataset has extremely high resolution pictures (some categories go up to $65k \times 60k$ pixels on average) and some registration frameworks fail to process such large images. Furthermore, different stainings of the same tissue have different resolutions as well. To solve these two problems when registering a pair of images, we first scale both images down by a factor of 5 while keeping the original aspect ratio; this solves the first problem. Then the image with the smaller aspect ratio is rescaled to match the width of the image with the larger aspect ratio and the top and bottom of the smaller one are padded to match the height of the larger. This way we keep the aspect ratio of the original images with no distortions and still arrive at a common and smaller, more manageable resolution.

\subsection{Hyperparameters}

All architectures and hyper-parameters for our experiments are listed here:

\textbf{DRMIME}:
\begin{itemize}
\itemsep-0.5em 
    \item learningRate: $\alpha = 10^{-3}$, $\beta = 5 \times 10^{-3}$, $\gamma = 10^{-4}$
    \item number of pyramid levels $L = 6$
    \item numberOfIterations: 500 (FIRE)/1500 (ANHIR)
    \item Optimizer : ADAM with AMSGRAD
\end{itemize}


\textbf{MMI}:
\begin{enumerate}
\itemsep-0.5em 
    \item learningRate: 1e-5
    \item numberOfIterations: 5000
    \item numberOfHistogramBins: 100
    \item convergenceMinimumValue: 1e-9
    \item convergenceWindowSize: 200
    \item SamplingStrategy: Random
    \item SamplingPercentage: 0.5
\end{enumerate}

\textbf{JHMI}:
\begin{enumerate}
\itemsep-0.5em 
    \item learningRate: 1e-1
    \item numberOfIterations: 5000
    \item numberOfHistogramBins: 100
    \item convergenceMinimumValue: 1e-9
    \item convergenceWindowSize: 200
    \item SamplingStrategy: Random
    \item SamplingPercentage: 0.5
\end{enumerate}

\textbf{MSE}:
\begin{enumerate}
\itemsep-0.5em 
    \item learningRate: 1e-6
    \item numberOfIterations: 5000
    \item convergenceMinimumValue: 1e-9
    \item convergenceWindowSize: 200
\end{enumerate}

\textbf{NCC}:
\begin{enumerate}
\itemsep-0.5em 
    \item learningRate: 1e-1
    \item numberOfIterations: 5000
    \item convergenceMinimumValue: 1e-9
    \item convergenceWindowSize: 200
\end{enumerate}

\textbf{NMI}:
\begin{enumerate}
    \item numberOfIterations: 5000
\end{enumerate}

\textbf{AMI}:
\begin{enumerate}
\itemsep-0.5em 
    \item learningRate: 1e-4
    \item numberOfIterations: 5000
    \item bins: 64
    \item sigma: 3
    \item spatialSamples: 0.1
    \item Optimizer : AMSGRAD
\end{enumerate}

\end{document}
