\documentclass{midl}
%\usepackage{cite}

% Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{makecell}
\usepackage{multirow}
\usepackage{soul}
\usepackage{url}
%\usepackage{float} 
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}


\title [SAU-Net: Efficient 3D Spine MRI Segmentation Using Inter-Slice Attention]{SAU-Net: Efficient 3D Spine MRI Segmentation Using Inter-Slice Attention}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 % \Name{Author Name2} \Email{xyz@sample.edu}\\
 % \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 % \Name{Author Name2} \Email{an2@sample.edu}\\
 % \Name{Author Name3} \Email{an3@sample.edu}\\
 % \addr Address}

 \midlauthor{\Name{Yichi Zhang\nametag{$^{1}$}}\\
 \Name{Lin Yuan\nametag{$^{1,2}$}}\\
 \Name{Yujia Wang\nametag{$^{1}$}} \\
 \Name{Jicong Zhang\midljointauthortext{Corresponding author. E-mail address: jicongzhang@buaa.edu.cn}\nametag{$^{1,3,4,5}$}} \\
\\
 \addr $^{1}$School of Biological Science and Medical Engineering, Beihang University, Beijing, China \\
 \addr $^{2}$School of Biomedical Engineering, Capital Medical University, Beijing, China\\
 \addr $^{3}$Hefei Innovation Research Institute, Beihang University, Hefei, China\\
 \addr $^{4}$Beijing Advanced Innovation Centre for Biomedical Engineering, Beihang University, Beijing, China\\
 \addr $^{5}$Beijing Advanced Innovation Centre for Big Data-Based Precision Medicine, Beihang University, Beijing, China
 }




\begin{document}

\maketitle

\begin{abstract}
Accurate segmentation of spine Magnetic Resonance Imaging (MRI) is highly demanded in morphological research, quantitative analysis, and disease identification, such as spinal canal stenosis, disc herniation and degeneration. However, accurate spine segmentation is challenging because of the irregular shape, artifacts and large variability between slices. To alleviate these problems, spatial information is used for more continuous and accurate segmentation, such as by 3D convolutional neural networks (CNNs). However, 3D CNNs suffer from higher computational cost, memory cost and risk of over-fitting, especially for medical images where the amount of labeled data is limited. To address these problems, we apply the attention mechanism for the utilization of inter-slice information in 3D segmentation tasks based on 2D convolutional networks and propose a spatial attention-based densely connected U-Net (SAU-Net), which consists of a Dense U-Net for extraction of intra-slice features and an inter-slice attention module (ISA) to utilize inter-slice information from adjacent slices and refine the segmentation results. Experimental results demonstrate the effectiveness of ISA as well as the higher accuracy and efficiency of our method compared with other deep learning methods.
\end{abstract}

\begin{keywords}
spine segmentation, MRI, deep learning, inter-slice attention
\end{keywords}

\section{Introduction}
\noindent 
Due to changes in lifestyles caused by social development, spine-related diseases like spinal canal stenosis, disc herniation and degeneration have become common clinical diseases, posing a serious threat to patients' health. Clinically, spine MRI could show the anatomical structure and relative position of vertebra steadily and distinctly, which has become one of the most common and effective methods for diagnosis \cite{korez2015a}.

Accurate and robust segmentation of spine MR images is an essential tool for identification and quantitative analysis of diseased regions. After subsequent processing, segmentation results can be used to diagnose various characteristic diseases such as disc degeneration \cite{wu2014degenerative}, adolescent idiopathic scoliosis (AIS) \cite{guerroumi2019automatic} and lytic bone metastases in the thoracolumbar spine \cite{yao2007computer}, and can considerably aid surgical planning for doctors. Besides, the results can also be used to assist in clinical treatment of spine classification, 3D reconstruction of spine and associated research.

However, spine segmentation is still considered a difficult task owing to many challenges such as unclear spine boundaries, abnormal spinal curvature and intricacy of vertebral structures. Traditional segmentation methods which rely on manual intervention and prior knowledge usually require a lot of effort and time. Besides, these methods are prone to errors on account of variability between operators. Automatic image segmentation could increase precision by eliminating the subjectivity and exhaustive processes. Consequently, extensive work is devoted to the research of automatic spine segmentation.

With the success of deep learning in medical imaging, most spine segmentation methods published recently are based on deep learning and have replaced explicit modeling of the vertebral shape. For example, \cite{sekuboyina2017a} segmented the lumbar vertebrae in 2D sagittal slices using CNN for pixel labeling, then a simple multi-layer perceptron estimated a bounding box to identify the region of interest in the image. \cite{lessmann2019iterative} used iterative fully convolutional neural network (FCN) that performs multiple tasks concurrently for spine segmentation. 

Although spine segmentation based on 2D slices has produced superior segmentation results with satisfactory performance, these methods ignore the spatial continuity between slices, limiting further improvements in the segmentation performance. In order to deal with these limitations, segmentation methods based on 3D convolutions, such as 3D U-Net \cite{cicek20163d} and V-Net \cite{milletari2016v-net}, were proposed. These methods achieved better performance in 3D medical image segmentation tasks compared with 2D methods. For spine segmentation, with the addition of increased dimensionality, the disadvantages of 3D networks are gradually emerging. Despite various creative 3D network training schemes assisted by scaling down the inputs \cite{chen2020vertebrae}, the heavy computation caused by full-size 3D inputs, which results in high memory requirements and unnecessary time costs, cannot be fundamentally mitigated, especially for high-resolution images.

To deal with these problems, a novel network architecture called spatial attention-based densely connected U-Net (SAU-Net) is proposed for 3D spine MRI segmentation. First, the initial 3D image is decomposed into a stack of 2D slices\footnote{In this paper, ``slices'' refers specifically to sagittal slices of 3D spine MR images.}. Then a Dense U-Net structure is constructed to get rough probability results based on intra-slice information. In the end, an inter-slice attention module is appended to capture and fuse 3D inter-slice spatial information with 2D contextual information, thereby refining the voxel-wise segmentation results.

In summary, our main contributions can be summarized as follows:
\begin{itemize}
\item To the best of our knowledge, we are the first to apply the attention mechanism for the utilization of inter-slice information in 3D segmentation tasks based on 2D convolutional networks and propose an inter-slice attention module (ISA).
\item We propose a novel structure called spatial attention-based densely connected U-Net (SAU-Net) for effective and accurate spine segmentation from 3D MR images. 
\end{itemize}

In the following sections, we first introduce related work on deep-learning-based image segmentation. Then we present the details of our proposed method. After that, a series of experiments is conducted to demonstrate the effectiveness and superiority of ISA and SAU-Net.

\section{Related work}
\begin{subsection}{Image Segmentation by Deep Convolutional Neural Networks}
Image segmentation aims to understand images at the pixel level and assign each pixel to a certain class. Deep-learning-based object detection and semantic segmentation in computer vision have made great advances recently. The concept of the convolutional neural network (CNN) was first introduced
by \cite{lecun1989backpropagation} and it was widely applied for classification \cite{krizhevsky2012imagenet}. Fully convolutional networks (FCN) proposed by \cite{long2015fully} are a landmark in image segmentation. They applied CNNs to dense prediction and first realized end-to-end segmentation by replacing fully connected neural layers with convolutional neural layers. With the development of deep learning, more and more studies on image segmentation using neural networks have been proposed. The most popular structure for medical image segmentation is U-Net proposed by \cite{ronneberger2015u-net}. The architecture of U-Net fuses the features of different scales by concatenating the feature maps of the downsampling layers and the corresponding upsampling layers. For segmentation of 3D images, \cite{cicek20163d} proposed the 3D U-Net architecture that inputs a 2D slice sequence of 3D images. \cite{milletari2016v-net} evolved U-Net into V-Net by using 3D convolution kernels to extract features from images. These methods have achieved relatively good results, showing the advantages of encoder-decoder structures based on U-Net in medical image segmentation tasks.
\end{subsection}

\begin{subsection}{Attention-based Methods for Image Segmentation}
Attention can be viewed as using information transferred from several subsequent feature maps to select and localize the most discriminative part of the feature maps. \cite{oktay2018attention} proposed attention gates to learn to suppress irrelevant regions while highlighting salient features useful for a specific task. \cite{hu2019squeeze-and-excitation} proposed a selection mechanism where feature maps are first aggregated using global average pooling and then reduced to a single channel descriptor with an activation gate applied to highlight the most discriminant features. For medical image segmentation, \cite{Sinha2019} proposed a multi-level attention based architecture for abdominal organ segmentation from MR images. \cite{qin2018autofocus} proposed a dilated convolution base block to preserve more detailed attention in 3D medical image segmentation. \cite{sekuboyina2017a} proposed an attention net for spine detection before using 3D U-Net for segmentation.
\end{subsection}


\section{Methods}
The general architecture of our proposed SAU-Net is shown in Figure~\ref{SAU}. The input is a 3D MR image $X \in \mathbb{R}^{d \times l \times w}$, where $d$, $l$ and $w$ represent the depth, length and width of the image, respectively. At first, the 3D volume is divided into a sequence of 2D slices $[x_1, x_2, \ldots, x_d]$ with $x_i \in \mathbb{R}^{l \times w}$. Then, an encoder-decoder Dense U-Net structure is utilized to capture intra-slice information and obtain rough probability results of each slice $[\widehat{y}_1, \widehat{y}_2, \ldots, \widehat{y}_d]$. In the end, an inter-slice attention module is appended to capture and fuse 3D spatial information with 2D contextual information, thereby refining the voxel-wise segmentation results $\widehat{Y} \in \mathbb{R}^{d \times l \times w}$.

In the following subsections, we will introduce the detailed structure of Dense U-Net and the design of inter-slice attention module.

\begin{figure}[h]

 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\centering
\setlength{\abovecaptionskip}{-0.9cm}
\includegraphics[width=0.825\linewidth]{SAUNet}

\caption{The general architecture of SAU-Net for spine segmentation. At first, input 3D images are predicted by Dense U-Net structure based on intra-slice information. Then an inter-slice attention module is applied to refine the rough segmentation results using inter-slice information and produce the final segmentation results.}
\setlength{\belowcaptionskip}{-0.9cm}
\label{SAU}
\end{figure}

\begin{subsection} {Dense U-Net Structure}
The typical architecture for medical image segmentation is the encoder-decoder structure based on U-Net \cite{ronneberger2015u-net}. However, there may be information loss during convolution. Recent work \cite{huang2017densely,jegou2017the} has shown that short connections between layers could merge the contextual information before and after the convolution layers, thereby making convolutional networks more accurate and efficient. In this work, we modify the structure of the classic U-Net by appending dense connections to its convolutional layers. It is designed to gain better capacity of extracting features among different slices, which is of great importance for increasing the accuracy. For segmentation of 3D images, Dense U-Net is utilized to obtain the predicted probability results based on intra-slice information of each slice.

 The detailed structure of the convolution blocks of the encoder (DownBlock) and decoder (UpBlock) of Dense U-Net is shown in Figure~\ref{SAU}. The DownBlock includes convolutional layers (Conv), batch normalization (BN), rectified linear units (ReLU), dense connections (DC) and max-pooling layers (Maxpooling), while the UpBlock includes concatenation from the encoder, Conv, BN, ReLU, DC and de-convolution layers (Deconv).
\end{subsection}

\begin{subsection}{Inter-slice Attention Module}
Using 2D Dense U-Net structure can produce segmentation results based on intra-slice contextual information. However, the spatial information of continuity between adjacent slices is neglected, which is a restriction of higher performance in 3D image segmentation tasks. Due to the spatial continuity, the segmentation results of each layer are spatially correlated with the upper and lower layers. Therefore, the information of adjacent slices is useful for the segmentation of each single slice of the image. 
\end{subsection}

\begin{figure}[h]
\centering
\setlength{\abovecaptionskip}{-0.9cm}

\includegraphics[width=0.82\linewidth]{ISA}
\caption{The overall structure of the inter-slice attention module. For the refinement of segmentation results of each slice, the information of adjacent slices is used.}
\setlength{\belowcaptionskip}{-0.9cm}
\label{ATT}
\end{figure}
To address the issue, we propose an inter-slice attention module (ISA) to utilize contextual information between adjacent slices and augment the continuity of segmentation results. For segmentation tasks, attention is usually achieved by creating masks that represent an informative region on feature maps, so as to highlight the most salient regions and suppress irrelevant regions.

The overall structure of the inter-slice attention module is shown in Figure~\ref{ATT}. In order to utilize the information of spatial continuity, the feature maps of adjacent slices are used to generate attention masks, which are fused into the feature maps of the current slice to obtain the refined segmentation results.

\begin{figure}[h]

\setlength{\abovecaptionskip}{-0.9cm}
\centering
\includegraphics[width=0.8\linewidth]{AFM}
\caption{The detailed structure of the proposed attention fusion module.}
\setlength{\belowcaptionskip}{-0.9cm}
\label{AFM}
\end{figure}

The detailed structure of the attention fusion module is shown in Figure~\ref{AFM}. For the segmentation of slice $i$, the feature maps of slice $i+1$ and/or slice $i-1$ (the first and last slices have only one adjacent slice) are utilized by generating attention masks using $1\times 1$ convolutional layers and sigmoid. The generated masks represent the attention area for the segmentation of slice $i$ based on information of adjacent slices. Then the masks are fused with the feature maps of slice $i$. The size of the masks is the same as that of the feature maps of slice $i$. By multiplying the corresponding elements and adding the feature maps to the results after multiplication, we get the attention fusion output. For the first and the last slice, the process of attention fusion is halved. At the end of the module, a sigmoid activation function is applied to get the final segmentation results.

\section{Experimental results}

\begin{subsection}{Datasets and Evaluation Metrics} 
We use the SpineSegT2W dataset to train and validate the models. The dataset contains 195 T2-weighted spine MR images of patients with disc herniation and degeneration. Each image is paired with ground truth labeled by expert radiologists. In the direction of the sagittal viewpoint, the size of each slice varies from $512 \times 512$ to $960 \times 960$ and the number of slices varies from 12 to 15 for different patients. Besides, we applied SAU-Net to an openly available MRI dataset in \cite{chu2015fully} for testing. The dataset contains 23 T2-weighted turbo spin echo MR images from 23 patients and the associated ground truth segmentation. The size of sagittal slices is $305 \times 305$ and the number of slices is 39 for all MR images.

The Dice similarity coefficient, Jaccard coefficient, Hausdorff Distance (HSD), precision and recall were used to evaluate our segmentation results.
\end{subsection}

\begin {subsection}{Implementation Details}
Our experiments are implemented using Keras with TensorFlow backend on NVIDIA Tesla V100 GPUs (32\,GB memory). All networks use binary cross-entropy as the loss function and the Adam optimizer with an initial learning rate of 0.0001 for training. Experimental results are validated by random 5-fold cross-validation by randomly shuffling the sequence of images and splitting the dataset into 5 fixed folds with 39 MR images in each fold, using 4 folds for training and the other one for testing. Therefore, we could achieve a more comprehensive evaluation of these models by analysing these results. The average training time of SAU-Net was approximately 5 hours on a single standard NVIDIA Tesla V100 GPU.
\end {subsection}

\begin {subsection}{Comparison Experiments}
In the comparison experiments, we compare SAU-Net with 2D segmentation methods like U-Net \cite{ronneberger2015u-net} and 3D segmentation methods like 3D U-Net \cite{cicek20163d} in the same experimental environment. The overall cross validation results of the comparison experiments are shown in Table~\ref{TABLE1}.


\begin{table}[!ht]
	\caption{Cross validation results of comparison experiments on SpineSegT2W dataset}
\label{TABLE1}
	{\begin{tabular}{llllll}	  \hline
\textbf{Models}   & \textbf{DSC (\%)}     &\textbf{Precision (\%)}   &\textbf{Recall (\%)}    &\textbf{Jaccard (\%)}  &\textbf{HSD (mm)} \\ \hline
2D U-Net & $88.06\pm0.16$   & $86.43\pm1.24$   & $89.85\pm0.84$   & $78.86\pm0.62$   & $3.12\pm0.035$ \\
3D U-Net   & $89.28\pm0.33$   & $89.35\pm0.35$   & $89.54\pm0.48$   & $80.72\pm0.88$   & $1.94\pm0.026$ \\
\textbf{SAU-Net} & $\textbf{89.86}\pm\textbf{0.10}$ & $\textbf{89.75}\pm\textbf{0.24}$ & $\textbf{90.41}\pm\textbf{0.36}$ & $\textbf{81.56}\pm\textbf{0.20}$ & $\textbf{1.76}\pm\textbf{0.023} $\\  \hline
	\end{tabular}}
	
\end{table}
\bigskip

\begin{table}[h]
\begin{center}
\caption{Comparison of time consumption and model size of different methods}
{\begin{tabular}{lll}
\\\hline
\textbf{Methods} & \textbf{Time Consumption} & \textbf{Model Size} \\ \hline
2D U-Net   & \makecell[c]{153s}    &\makecell[c]{35M} \\
3D U-Net   & \makecell[c]{534s}      &\makecell[c]{83M}  \\
SAU-Net    &\makecell[c]{178s}      &\makecell[c]{38M}     \\ \hline
\end{tabular}}
\label{TABLE2}
\end{center}
\end{table}


\bigskip

According to Table~\ref{TABLE1}, SAU-Net achieves better segmentation results on all metrics compared with the other competitive models. As shown in Table~\ref{TABLE2}, compared with using 3D convolutions, SAU-Net could achieve more accurate and robust segmentation with less time consumption. We can conclude that SAU-Net combines inter-slice information and intra-slice information in a better way and outperforms other architectures on spine segmentation tasks.

To validate the effectiveness and robustness of our method, we also conduct experiments on the lower spine MRI dataset in \cite{chu2015fully}. We evaluated our method on the dataset by running leave-one-out experiments, following the protocol used in \cite{chu2015fully}. We can see that our method achieved better performance on all metrics. Additionally, we tested the well-trained model from the SpineSegT2W dataset on the lower spine MRI dataset. As shown in Table~\ref{TABLE3}, the result indicates the effectiveness and good generalization capability of our method.


\bigskip

\begin{table}[h]
\caption{Comparison of spine segmentation results on lower spine MRI dataset}
{\begin{tabular}{llllll}
\\\hline
\textbf{Models}  & \textbf{DSC (\%) }   & \textbf{Precision (\%) }  &\textbf{ Recall (\%) }  &\textbf{ Jaccard (\%) }  & \textbf{HSD (mm) }   \\\hline
\begin{tabular}[c]{@{}c@{}}SAU-Net \ \\ (pre-trained) \end{tabular} & 89.31    & 88.33    & 90.56    & 80.80    & 4.29    \\\hline
SAU-Net$^{\star}$ & $\textbf{91.3} \pm\textbf{0.5}$ & $\textbf{95.1}\pm\textbf{0.6} $& $\textbf{87.7}\pm\textbf{1.3} $& $\textbf{83.8}\pm\textbf{0.7}$ & $\textbf{3.0}\pm\textbf{0.8}$ \\
Chu et al$^{\star}$ & $88.7\pm2.9  $ & -     & - & -     & $6.4\pm1.2 $\\\hline 
\end{tabular}}
Note: $\star$ denotes leave-one-out experimental results; - denotes the result is not reported;
\label{TABLE3}
\end{table}


\end {subsection}

\begin {subsection}{Ablation experiments}
To demonstrate the effectiveness of inter-slice information extraction of SAU-Net, we perform a comparative performance experiment for spine segmentation with and without the inter-slice attention module (ISA). The performance presented in Table~\ref{TABLE4} demonstrates that the addition of ISA could refine the segmentation results and therefore get better results on all metrics.

\begin{table}[h]
\caption{Cross validation results of ablation experiments on the SpineSegT2W dataset}
{\begin{tabular}{llllll}
\\\hline
\multicolumn{1}{l}{}                                                           
  & \textbf{Dice (\%) } & \textbf{Precision (\%) } & \textbf{Recall (\%) } & \textbf{Jaccard (\%) } & \textbf{HSD (mm) }\\ \hline
\ Ours w/o ISA & $88.89\pm0.20$ & $87.83\pm0.94$ & $90.34\pm0.86$ & $80.11\pm0.50 $& $2.87\pm0.039$ \\ 
\textbf{Ours w/ ISA} & $\textbf{89.86}\pm\textbf{0.10}$ &$ \textbf{89.75}\pm\textbf{0.24}$ & $\textbf{90.41}\pm\textbf{0.36} $& $\textbf{81.56}\pm\textbf{0.20} $& $\textbf{1.76}\pm\textbf{0.023}$ \\ \hline
\end{tabular}}
\label{TABLE4}
\end{table}

\begin{figure}[h]
\setlength{\abovecaptionskip}{-0.9cm}

\centering
\includegraphics[width=0.82\linewidth]{ES}
\caption{Examples of spine segmentation results with and without ISA.}
\setlength{\belowcaptionskip}{-0.9cm}
\label {ES}
\end{figure}


Although the improvement of ISA in Dice coefficient is not significant, according to the visualization of segmentation results in Figure~\ref{ES}, we can see that the segmentation results of some areas that tend to be misclassified by the 2D segmentation method are significantly improved with the application of ISA. As supporting evidence, the performance on Hausdorff distance is significantly improved. Therefore, the addition of ISA could improve the accuracy and practicability of segmentation results without excessive computation cost, so as to assist in clinical treatment.
\end {subsection}


\section{Conclusion}
In this paper, we apply the attention mechanism to the utilization of inter-slice information in 3D image segmentation tasks based on 2D convolutional networks and propose a novel structure called spatial attention-based densely connected U-Net (SAU-Net) for spine segmentation on 3D MR images. The architecture addresses the problems that 2D convolutions ignore the spatial information between adjacent slices and that 3D convolutions suffer from high computation and memory cost and risk of overfitting. Our method could substantially improve the segmentation accuracy and efficiency by fusing intra-slice and inter-slice features, which is crucial in clinical practice. In addition, the strategy of using the attention mechanism for extraction of inter-slice information could be easily adapted to other 3D image segmentation problems. Our future work includes more comprehensive validation and improvement of ISA and SAU-Net as well as possible application prospects for other tasks like multi-class segmentation and detection based on 3D medical images.
%\end{section}


\midlacknowledgments{This work is supported by the National Key Research and Development Program of China under Grant 2016YFF0201002, the University Synergy Innovation Program of Anhui Province (Grant Number: GXXT-2019-044), and the National Natural Science Foundation of China under Grant 61301005.}


\bibliographystyle{plain}

\bibliography{Zhang20}

\end{document}
