\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{caption}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{float}
\usepackage{wrapfig}
% \usepackage[table,xcdraw]{xcolor}
\graphicspath{ {./images/} }


\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
%\jmlrworkshop{Extended Abstract -- MIDL 2020 submission}
% \editors{Under Review for MIDL 2020}

\title[Efficiently-Layered Network]{Knee Injury Detection using MRI with Efficiently-Layered Network (ELNet)}


% \the\fontdimen2\font <-- look up default spacing (3.65pt)

\midlauthor{\Name{Chen-Han Tsai} \Email{maxwelltsai@yahoo.com}\\
\addr School of Electrical Engineering, Tel Aviv University, Israel
\AND
\Name{Nahum Kiryati} \Email{nk@eng.tau.ac.il}\\
\addr The Manuel and Raquel Klachky Chair of Image Processing, School of Electrical Engineering, Tel-Aviv University, Israel
\AND
\Name{Eli Konen} \Email{eli.konen@sheba.health.gov.il}\\
\Name{Iris Eshed} \Email{iris.eshed@sheba.health.gov.il}\\
\Name{Arnaldo Mayer} \Email{arnaldo.mayer@sheba.health.gov.il}\\
\addr Diagnostic Imaging, Sheba Medical Center, affiliated to the Sackler School of Medicine, Tel-Aviv University, Israel 
}



\begin{document}

\maketitle

\begin{abstract}
Magnetic Resonance Imaging (MRI) is a widely-accepted imaging technique for knee injury analysis. Its advantage of capturing knee structure in three dimensions makes it the ideal tool for radiologists to locate potential tears in the knee. In order to better confront the ever growing workload of musculoskeletal (MSK) radiologists, automated tools for patients' triage are becoming a real need, reducing delays in the reading of pathological cases. In this work, we present the Efficiently-Layered Network (ELNet), a convolutional neural network (CNN) architecture optimized for the task of initial knee MRI diagnosis for triage. Unlike past approaches, we train ELNet from scratch instead of using a transfer-learning approach. The proposed method is validated quantitatively and qualitatively, and compares favorably against state-of-the-art MRNet while using a single imaging stack (axial or coronal) as input. Additionally, we demonstrate our model's capability to locate tears in the knee despite the absence of localization information during training. Lastly, the proposed model is extremely lightweight ($<$ 1MB) and therefore easy to train and deploy in real clinical settings.
\end{abstract}

\begin{keywords}
Knee Diagnosis, MRI, Deep Learning, ACL Tear, Meniscus Tear, Knee Injury, Medical Triage
\end{keywords}

\section{Introduction}

Magnetic Resonance Imaging (MRI) has long been considered the most robust knee examination tool available \cite{Saeed2018}. Its widespread use is partly due to its capability to capture detailed structures in the knee joint while remaining a non-invasive procedure \cite{Crues1987, Boeree1991}. Given its profound capabilities to capture the knee in three dimensions, MRI has become the tool-of-choice for radiologists in an extensive range of examinations such as knee osteoarthritis and internal derangement of the knee \cite{Hayashi2014,Arumugam2015}. Considering the ever growing workload of musculoskeletal (MSK) radiologists, automated tools for patients' triage are needed, leading to shorter delays in the reading of pathological cases. Several techniques have been proposed for this purpose. \citet{Stajduhar2017} presented a semi-automated approach that used support vector machines (SVM) to diagnose anterior cruciate ligament (ACL) injuries in the knee. In their work, an ROI is first manually extracted before being fed into the SVM for prediction. \citet{Liu2018} introduced a fully-automated cartilage lesion detection system by employing a CNN for segmentation followed by another CNN for patch classification. Although their network is trained end-to-end, the amount of manual labeling required to create the patch training set makes it an overwhelmingly cumbersome task. \citet{Bien2018} proposed an architecture that consists of three individual MRNets whose outputs are combined using logistic regression. An MRNet extracts a distinctive feature vector for each slice of the scan, stacks the vectors into a 2D array, max-pools the array to obtain a single vector, and performs classification by a fully connected layer with softmax activation. The backbone of the feature extractor is a pre-trained AlexNet \citep{NIPS2012_4824}.

In this work, we present an Efficiently-Layered Network (ELNet) architecture optimized for knee diagnosis using MRI. The main contribution of this work is a novel slice feature extracting network that incorporates multi-slice normalization along with BlurPool down-sampling. The proposed methods will be detailed in \sectionref{methods}, followed by quantitative and qualitative experimental results in \sectionref{experiments}. Conclusion and future work will be given in \sectionref{conclusion}. 



\section{Methods}
\label{methods}
The ELNet architecture is illustrated in \figureref{fig:ELNet Design} and the details are listed in \tableref{tab:ELNet Design}. The backbone of ELNet's design centers around \textit{Block} modules. Inspired by ResNet \citep{7780459}, we define a \textit{Block} as a sequence of:
\begin{center}
    [2D Convolution $\to$ Multi-slice Normalization $\to$ ReLU activation] 
\end{center}
\textit{Blocks} are designed to allow for non-linearities in the network, and they may be repeated while ensuring equal input and output dimensions. A skip connection is added between the input and output, allowing better optimization of the network. The first two \textit{Blocks} are repeated twice with $4K$ and $8K$ channels, and the remaining \textit{Blocks} are fixed with $16K$ channels.

Each \textit{Block} is followed by another 2D Convolution and ReLU activation, and they serve to increase channel dimension. The spatial height and width are reduced using a BlurPool layer. Eventually, in the final layer of the feature extractor, 2D max-pooling is applied to obtain a $16K$-dimensional feature vector for each MRI slice. Max-pooling is subsequently applied across slices to obtain a single $16K$-dimensional feature vector that combines feature information across slices. Dropout is performed before feeding into a fully-connected layer with two output logits, and the final probability $p(y|x)$ is computed by softmax \citep{Goodfellow-et-al-2016}.

In the following two subsections, we detail two innovative features of ELNet: the use of multi-slice normalization, and BlurPool.

\begin{figure}[t]
\begin{minipage}[c]{0.3\linewidth}
\centering
\includegraphics[scale=0.61]{images/ELNet_New.pdf}
\caption{ELNet Design}
\label{fig:ELNet Design}
\end{minipage}
\hspace{0.023\linewidth}
\begin{minipage}[c]{0.66\linewidth}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{c|cc}
\hline
Output Size                                          & \multicolumn{1}{c|}{Layer Operation}                              & Trainable Parameters  \\ \hline
\multirow{2}{*}{$s \times 4K \times 128 \times 128$} & \multicolumn{1}{c|}{$7\times7$ Conv, $4K$}                        & $196K$                \\ \cline{2-3} 
                                                     & \multicolumn{1}{c|}{Normalization}                                & $4K$                  \\ \hline
\multirow{2}{*}{$s \times 4K \times 62 \times 62$}   & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
                                                     & \multicolumn{1}{c|}{Block {[}$5\times5${]}$\times2$} & $800K^2 + 16K$        \\ \hline
$s \times 8K \times 62 \times 62$                    & \multicolumn{1}{c|}{5$\times$5 Conv, $8K$}                        & $800K^2$              \\ \hline
\multirow{2}{*}{$s \times 8K \times 29 \times 29$}   & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
                                                     & \multicolumn{1}{c|}{Block {[}$3\times3${]}$\times2$} & $1152K^2 + 32K$       \\ \hline
$s \times 16K \times 29 \times 29$                   & \multicolumn{1}{c|}{3$\times$3 Conv, $16K$}                       & $1152K^2$             \\ \hline
\multirow{3}{*}{$s \times 16K \times 13 \times 13$}  & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
                                                     & \multicolumn{1}{c|}{Block {[}3$\times$3{]}}                       & $2304K^2 + 32K$       \\ \cline{2-3} 
                                                     & \multicolumn{1}{c|}{3$\times$3 Conv, $16K$}                       & $2304K^2$             \\ \hline
\multirow{3}{*}{$s \times 16K \times 5 \times 5$}    & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
                                                     & \multicolumn{1}{c|}{Block {[}3$\times$3{]}}                       & $2304K^2 + 32K$       \\ \cline{2-3} 
                                                     & \multicolumn{1}{c|}{3$\times$3 Conv, $16K$}                       & $2304K^2$             \\ \hline
$s \times 16K$                                       & \multicolumn{2}{c}{ReLU $\to$ BlurPool $\to$ 2D Max-Pool}                                 \\ \hline
$16K$                                                & \multicolumn{2}{c}{1D Max-Pool $\to$ Dropout}                                             \\ \hline
2                                                    & \multicolumn{1}{c|}{Fully Connected $\to$ Softmax}                & $32K + 2$             \\ \hline
\multicolumn{2}{r|}{Total Trainable Parameters}                                                                          & $13120K^2 + 348K + 2$ \\ \hline

\end{tabular}}
\captionsetup{type=table}
\caption{ELNet architecture in detail}
\label{tab:ELNet Design}


% \label{table:architecture}
\end{minipage}
\end{figure}


\subsubsection*{Multi-Slice Normalization}
We propose two possible variants of multi-slice normalization: a first one based on \textit{layer normalization} \citep{Ba2016LayerN}, and a second one based on \textit{contrast normalization} \citep{Ulyanov2016InstanceNT}. Let's assume a feature representation $x^{(i)} \in \mathbb{R}^{S \times C \times H \times W}$ from some layer $i$ in the network (usually a 2D-convolution), where $S$ is the number of slices in the MRI sequence, $C$ is the number of channels in the representation, and $H,W$ are the spatial height and width of the representation. The network applies a normalization on $x$ (omitting $i$ for simplicity) by computing the appropriate mean and variance.

In the \textit{layer normalization} variant, the mean $\mu_s$ and variance $\sigma_s^2$ are computed from $x$ for each slice $s$ ($1\leq s \leq S$). In \textit{contrast normalization}, the mean $\mu_{sc}$ and variance $\sigma_{sc}^2$ are computed for each slice $s$ and also for each channel $c$ ($1 \leq c \leq C$) (\figureref{Novelties} a-c). Using the computed mean and variance, $x$ is standardized into $\hat{x}$. An affine transform is applied to $\hat{x}$ to obtain the normalized output $y$. The normalization process is expressed by \equationref{Layer_norm} for \textit{layer normalization} and \equationref{Contrast_Norm} for \textit{contrast normalization} respectively:

\begin{equation}
    \hat{x}_{s} = \frac{x_{s}-\mu_{s}}{\sqrt{\sigma_{s}^2 + \epsilon}} \to
 y_{s} =\gamma \hat{x}_{s}+ \beta \;\;\;\;\;\;\;\;\;  \forall s:1 \to S
 \label{Layer_norm}
\end{equation}

\begin{equation}
    \hat{x}_{sc}= \frac{x_{sc}-\mu_{sc}}{\sqrt{\sigma_{sc}^2 + \epsilon}} \to y_{sc} = \gamma \hat{x}_{sc} + \beta \;\;\;\;\;\;\;\;\; \forall s:1 \to S , c:1 \to C
    \label{Contrast_Norm}
\end{equation}
Parameters $\gamma$, $\beta$ ($C$-dimensional vectors) are learned independently for each normalization layer. Typically, $\gamma$ and $\beta$ are initialized to $\mathbf{1}$ and $\mathbf{0}$ respectively, and $\epsilon$ is set to 1e-8.

\begin{figure}[]
    \centering
    \includegraphics[width=.95\textwidth]{images/Novelties.pdf}
    \caption{\textbf{(a)} An MRI sequence fed as input to ELNet, and an illustration of an ELNet Block. \textbf{(b\&c)} Our proposed multi-slice normalization: Layer normalization and Contrast normalization (multi-slice norm standardizes slice-wise unlike batch norm which standardizes channel-wise) \textbf{(d)} BlurPool example: Input $X$ is convolved with binomial filter (kernel $B=3$) to obtain an anti-aliased representation $\text{Blur}(X)$. Pooling is then applied to obtain $\text{BlurPool}(X)$.}
    \label{Novelties}
\end{figure}

\subsubsection*{BlurPool}
In the work of \citet{zhang2019shiftinvar}, a BlurPool operation was proposed to mitigate the shift-variance phenomenon observed in modern CNN architectures where max-pooling is often utilized. BlurPool functions by first applying an anti-aliasing filter (binomial filter with kernel size $B$ and stride 1) to the input representation, then strided pooling is applied to obtain the pooled feature map (see \figureref{Novelties}d). The resulting representation is therefore a pooled version of the blurred input representation, and a more detailed analysis is available in the paper \citep{zhang2019shiftinvar}.

% \begin{figure}[]
%     \centering
%     \includegraphics[width=0.8\textwidth]{images/ELNet_Horizontal.pdf}
%     \caption{The ELNet architecture}
%     \label{ELNet_Architecture}
% \end{figure}

 

% \begin{figure}[h]
% \begin{minipage}[c]{0.3\linewidth}
% \centering
% \includegraphics[scale=0.64]{images/ELNet_New.pdf}
% \caption{ELNet Design}
% \label{fig:ELNet Design}
% \end{minipage}
% \hspace{0.025\linewidth}
% \begin{minipage}[c]{0.65\linewidth}
% \centering
% \resizebox{\textwidth}{!}{
% \begin{tabular}{c|cc}
% \hline
% Output Size                                          & \multicolumn{1}{c|}{Layer Operation}                              & Trainable Parameters  \\ \hline
% \multirow{2}{*}{$s \times 4K \times 128 \times 128$} & \multicolumn{1}{c|}{7$\times$7 Conv, $4K$}                        & $196K$                \\ \cline{2-3} 
%                                                      & \multicolumn{1}{c|}{Normalization}                                & $4K$                  \\ \hline
% \multirow{2}{*}{$s \times 4K \times 62 \times 62$}   & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
%                                                      & \multicolumn{1}{c|}{Block {[}5$\times$5{]}\textbackslash{}times2} & $800K^2 + 16K$        \\ \hline
% $s \times 8K \times 62 \times 62$                    & \multicolumn{1}{c|}{5$\times$5 Conv, $8K$}                        & $800K^2$              \\ \hline
% \multirow{2}{*}{$s \times 8K \times 29 \times 29$}   & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
%                                                      & \multicolumn{1}{c|}{Block {[}3$\times$3{]}\textbackslash{}times2} & $1152K^2 + 32K$       \\ \hline
% $s \times 16K \times 29 \times 29$                   & \multicolumn{1}{c|}{3$\times$3 Conv, $16K$}                       & $1152K^2$             \\ \hline
% \multirow{3}{*}{$s \times 16K \times 13 \times 13$}  & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
%                                                      & \multicolumn{1}{c|}{Block {[}3$\times$3{]}}                       & $2304K^2 + 32K$       \\ \cline{2-3} 
%                                                      & \multicolumn{1}{c|}{3$\times$3 Conv, $16K$}                       & $2304K^2$             \\ \hline
% \multirow{3}{*}{$s \times 16K \times 5 \times 5$}    & \multicolumn{2}{c}{ReLU $\to$ BlurPool}                                                   \\ \cline{2-3} 
%                                                      & \multicolumn{1}{c|}{Block {[}3$\times$3{]}}                       & $2304K^2 + 32K$       \\ \cline{2-3} 
%                                                      & \multicolumn{1}{c|}{3$\times$3 Conv, $16K$}                       & $2304K^2$             \\ \hline
% $s \times 16K$                                       & \multicolumn{2}{c}{ReLU $\to$ BlurPool $\to$ 2D Max-Pool}                                 \\ \hline
% $16K$                                                & \multicolumn{2}{c}{1D Max-Pool $\to$ Dropout}                                             \\ \hline
% 2                                                    & \multicolumn{1}{c|}{Fully Connected $\to$ Softmax}                & $32K + 2$             \\ \hline
% \multicolumn{2}{r|}{Total Trainable Parameters}                                                                          & $13120K^2 + 348K + 2$ \\ \hline

% \end{tabular}}
% \captionsetup{type=table}
% \caption{ELNet architecture in detail}
% \label{tab:ELNet Design}


% % \label{table:architecture}
% \end{minipage}
% \end{figure}

\subsection{Training Pipeline}
As suggested by \citet{doi:10.1002/(SICI)1522-2594(199912)42:6<1072::AID-MRM11>3.0.CO;2-M}, we perform histogram-based intensity standardization according to the training set statistics, thus enabling similar-valued pixels to be associated with the relevant tissue type. In addition, we perform randomized data augmentations to each series which includes translation, horizontal flip, scaling, and minor rotations up to $\pm$10 degrees around the center of the volume. For volumes captured in the axial and coronal orientations, we apply an additional random rotation of a multiple of 90 degrees to the volume. Finally, all the images are resized to $256 \times 256$ before entering the network.

Aside from data augmentation, we implement oversampling to compensate for dataset imbalance. For each pathology, we select the minority class samples (allowing repeats) from our training set and apply augmentations on them until the number of minority class samples (along with their augmented copies) equals the number of majority samples.

We train ELNet using standard cross-entropy loss \citep{Goodfellow-et-al-2016}. Optimization can be done using a simple grid-search over relevant hyperparameters such as learning rate, choice of multi-slice normalization, BlurPool kernel sizes, dropout rate, etc.


\section{Experiments}
\label{experiments}
\subsection{Datasets}
\textbf{MRNet Dataset.} The MRNet Dataset contains 1,370 knee MRI examinations that were carried out at the Stanford University Medical Center. Each case was labeled according to the presence/absence of an anterior cruciate ligament (ACL) tear, a meniscus tear, or other signs of abnormalities in the corresponding knee. Each exam was randomly assigned either to the training, validation, or test set \citep{Bien2018}. It should be noted that each exam may contain multiple labels (e.g. an exam labeled positive for abnormality and ACL tear indicates other forms of abnormality in addition to an ACL tear).

The provided dataset includes, for each case, corresponding axial, coronal and sagittal MRI acquisitions. As reported by Bien et al., a sagittal T2-weighted series, a coronal T1-weighted series, and an axial proton-density-weighted series were selected for this dataset. Each image is of size $256 \times 256$ and the number of slices ranges between 17--61 (mean 31 and standard deviation 7.97).
The MRNet Dataset is currently the largest public labeled knee MRI dataset. \\


\noindent \textbf{KneeMRI.} The KneeMRI dataset collected at the Clinical Hospital Centre Rijeka, Croatia by {\v{S}}tajduhar et al.\ consists of 917 exams labeled with ACL conditions in the corresponding knee. For each exam, the ligament condition was classified as either healthy (690 exams, 75.2\%), partially injured (172 exams, 18.8\%), or completely ruptured (55 exams, 6\%).  Each assessment corresponds to a T1-weighted sagittal MRI series, containing $320 \times 320$ or $290 \times 300$ images. The number of images in each series ranges between 21--45 (mean 31 and standard deviation 2.27). The dataset was divided into 10 strata with similar distributions, and we perform stratified sampling for evaluation.



\subsection{Training}
\textbf{MRNet Dataset.} In the MRNet dataset, we were provided with three imaging orientations per examination. For the three pathologies, we trained three separate ELNets with $K=4$, and the network weights were initialized uniformly by choosing the best random seed between 0 and 4 \citep{10.1109/ICCV.2015.123}. Based on experiments, we selected coronal images for detecting meniscus tears, and axial images for detecting ACL tears and abnormalities. Contrast normalization yielded the best results for detecting meniscus tears, and layer normalization for detecting ACL tears and abnormalities (notice the correspondence between the selected multi-slice normalization and image modality). Each model was trained using Adam with a learning rate between 1e-5 and 3e-5 for 200 epochs, taking roughly 1.5 hours \citep{Adam}.\\

\noindent \textbf{KneeMRI Dataset.} With the KneeMRI dataset, we perform 5-fold cross-validation using eight out of the ten strata, and validation using the remaining two. Similar to the MRNet Dataset, we train an ELNet with $K=2$ using SGD+Momentum for 200 epochs and the training time is roughly an hour for each fold \citep{10.5555/3042817.3043064}. \\

\noindent By choosing $K=2$ and $K=4$ for the ELNet architectures, our trained models involve 53,178 and 211,314 trainable parameters respectively. In relation to AlexNet ($\sim$61M trainable parameters), ELNet (with $K=4$) contains $288\times$ fewer parameters than AlexNet. In comparison with MRNet, ELNet with $K=4$ contains $866\times$ fewer parameters, and ELNet with $K=2$ contains $1147\times$ fewer parameters.  Each trained model was saved using standard PyTorch format. Model sizes are 850kB and 435kB for $K=4$ and $K=2$ respectively. Our experiments were performed on an NVIDIA GTX 1070 8GB GPU.




\subsection{Evaluation}
\textbf{MRNet Dataset.} We evaluate ELNet's performance using the validation set provided by the MRNet dataset (since the test set is not publicly available), and we compare it with the MRNet model proposed and trained by Bien et al. Although they evaluated their models primarily using the ROC-AUC, we perform a more thorough analysis by considering additional metrics that are just as significant, such as Sensitivity and the Matthews Correlation Coefficient (MCC). The evaluation results are presented in \tableref{MRNet_Dataset_Eval_Table} and the ROC is plotted in \figureref{Figures_Page} (a-c), where we can observe noticeably higher MCC of the ELNet model.\\

\noindent \textbf{KneeMRI Dataset.} We evaluate ELNet using a 5-fold cross-validation scheme in detecting injuries in the ACL. The evaluation metrics following the 5-folds are shown in \figureref{Figures_Page} (d-g); we highlight the lowest value for each metric in red. In the original paper, {\v{S}}tajduhar et al.\ trained an SVM and reported an AUC of 0.894 using 10-fold cross-validation. Bien et al.\ reported an AUC of 0.911 on a particular train/valid/test set split using a pre-trained MRNet. In our experiment, we obtain an average AUC of 0.913 from the 5-folds, with three of the five folds exceeding 0.92 and the highest being 0.924. Moreover, we observe just minor variations in multiple performance metrics across folds; this demonstrates our model's robustness despite limited data and a highly unbalanced distribution.

\begin{table}[]
\centering
\resizebox{0.99\textwidth}{!}{%

\begin{tabular}{c|cccccc}
\hline
Architecture      & Pathology     & Accuracy                    & Sensitivity & Specificity                 & ROC-AUC                               & MCC                                   \\ \hline
                        & Meniscus Tear & 0.735                       & 0.827       & 0.662                       & 0.826                                 & 0.489                                 \\
                        & ACL Tear      & 0.9                         & 0.907       & 0.894                       & 0.956                                 & 0.769                                 \\
\multirow{-3}{*}{MRNet} & Abnormality   & 0.883                       & 0.947       & 0.64                        & 0.936                                 & 0.628                                 \\ \hline
                        & Meniscus Tear & {\color[HTML]{333333} 0.88} & 0.86        & {\color[HTML]{333333} 0.89} & {\color[HTML]{333333} \textbf{0.904}} & {\color[HTML]{000000} \textbf{0.745}} \\
                        & ACL Tear      & 0.904                       & 0.923       & 0.891                       & \textbf{0.960}                        & {\color[HTML]{000000} \textbf{0.815}} \\
\multirow{-3}{*}{ELNet} & Abnormality   & 0.917                       & 0.968       & {\color[HTML]{000000} 0.72} & \textbf{0.941}                        & {\color[HTML]{000000} \textbf{0.736}} \\ \hline

\end{tabular}}
\caption{Evaluation Statistics between MRNet and ELNet on the MRNet validation set}
\label{MRNet_Dataset_Eval_Table}
\end{table}


\begin{figure}[]
    \centering
    \includegraphics[width=\textwidth]{images/Figures_New.pdf}
    \caption{\textbf{MRNet Dataset:} (a-c) Comparison of ELNet and MRNet ROC. ~~~~\textbf{KneeMRI Dataset:} (d) ELNet ROCs obtained from 5-fold cross-validation; (e) ELNet metrics following 5-fold cross-validation.}
    \label{Figures_Page}
\end{figure}


\subsection{Ablation Studies}
This section aims to compare ELNet performance when multi-slice normalization and BlurPool are replaced with batch normalization \citep{ioffe2015batch} and max-pooling. The objectives are the three pathologies presented in the MRNet dataset, and the best results following the modified ELNet designs are listed in \tableref{Ablation_Table}. Stemming from the fact that batch normalization induces an undesired standardization for each channel of feature representations across all slices, the feature extractor (designed to extract per-slice features) would no longer process each slice independently, and degraded performance seems reasonable. Following our experiments, it is evident that the use of batch normalization degrades ELNet performance. In practice, we observe network divergence during training after 10--15 epochs. To our surprise, ELNet with batch norm and max-pool delivered slightly improved performance when compared with ELNet with batch norm and BlurPool, but when BlurPool is paired with the intended multi-slice normalization, we observe an overall improvement in performance compared to max-pooling.

\begin{table}[]
\centering
\resizebox{0.99\textwidth}{!}{%

\begin{tabular}{c|cc|cc|cc}
\hline
\multirow{2}{*}{\begin{tabular}[c]{@{}c@{}}ELNet Configuration\\ (K=4)\end{tabular}} & \multicolumn{2}{c|}{Meniscus Tear} & \multicolumn{2}{c|}{ACL Tear}   & \multicolumn{2}{c}{Abnormalities} \\
                                                                                     & ROC-AUC          & MCC             & ROC-AUC        & MCC            & ROC-AUC         & MCC             \\ \hline
Multi-Slice Norm + BlurPool                                                          & \textbf{0.904}   & \textbf{0.745}  & \textbf{0.960} & \textbf{0.815} & 0.941           & \textbf{0.736}  \\
Batch Norm + BlurPool                                                                & 0.751            & 0.391           & 0.871          & 0.530          & 0.841           & 0.440           \\
Multi-Slice Norm + MaxPool                                                           & 0.848            & 0.534           & 0.923          & 0.633          & \textbf{0.943}  & 0.557           \\
Batch Norm + MaxPool                                                                 & 0.7972           & 0.403           & 0.906          & 0.693          & 0.880           & 0.312           \\ \hline
\end{tabular}}
\caption{Comparison of ELNet performance when multi-slice normalization and BlurPool are replaced with batch normalization and max-pool. The ROC-AUC and MCC of the best performing model (one for each pathology) of each ELNet configuration is reported.}
\label{Ablation_Table}
\end{table}


\subsection{Model Interpretation}
\label{model_interpretation}
To understand how ELNet identifies certain attributes for diagnosis, we compute the Full-Gradient representation of ELNet using the FullGrad algorithm \citep{srinivas2019fullgrad}. FullGrad generates a heat-map that corresponds to parts of the input that most influence the output prediction. Conceptually, the generated heat-map should be ``hotter'' in areas indicating an injury and ``cold'' elsewhere.

To verify that ELNet is indeed performing diagnosis based on features in the given acquisition, we randomly selected one of the five cross validation splits and evaluated the trained ELNet from that split. Samples from the validation set were randomly selected from both classes, resulting in 9 cases containing ACL tear and 7 cases without. A board-certified MSK radiologist with 17 years of experience was asked to identify the most informative slice (slice containing the most area for which a tear resides) in a given series and furthermore indicate the region in the (most informative) slice corresponding to an ACL injury. The identical task was performed on the trained ELNet, and of the 9 cases that contain ACL tear, the trained ELNet's prediction of the most informative slice and tear region coincided with the radiologist's evaluation in 8 of the cases. Of the 7 cases where the ACL is intact, our model's prediction matched the radiologist's assessment in all 7 cases. In \figureref{Interpretation}, we present a few examples of the generated heat-maps.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.99\textwidth]{images/Interpretation-New.pdf}
    \caption{\textbf{Top:} Sample MRI slices containing ACL tears. \textbf{Bottom:} Full-Grad visualization computed using the above slices. ``Hotter'' areas indicate regions containing ACL tear.}
    \label{Interpretation}
\end{figure}



\section{Conclusion}
\label{conclusion}
In this work, we present ELNet, a unique CNN architecture optimized for knee injury detection. The novel integration of multi-slice normalization and BlurPool operations allows ELNet models to remain lightweight ($\sim$ 0.2M parameters, requiring single imaging stack, trained from scratch) while performing favorably against MRNet models ($\sim$ 183M parameters, requiring three imaging stacks, pretrained AlexNet) on the MRNet dataset. Cross-validation on the KneeMRI dataset has demonstrated consistently improved performance with ELNet models, proving the architecture to be robust regardless of a highly unbalanced distribution. In a clinical setting, where a large number of cases await evaluation, our algorithm may be used for triage, improving workflow efficiency. In addition, by having our algorithm locate regions containing tears, radiologists can benefit by having the most significant slice presented first for each case.

Future work may include performance enhancement by incorporation of all three MRI volumes, axial, coronal and sagittal, if available. Further research is also needed to facilitate application of trained models on MRI data acquired using different scanners with various intensity scales. With the promising findings thus far, we believe ELNet may serve as a solid basis for future works involving knee injury triage.



%  In conclusion, our work may serve as a prominent backbone for radiological examination triage helping to reduce reading delay for pathological cases. 




% despite the limited and  unbalanced data. Furthermore, we were succ
% essful in locating regions containing tears using trained ELNet models, and this may be beneficial to a radiologist examining a large number of scans to have the most significant slice, as predicted by the algorithm, presented first in each case. 


% consistency of ELNet models across multiple evaluation metrics, proving effectiveness in generalizability despite the limited unbalanced data.



% By integrating multi-slice normalization and BlurPool operations around several \textit{Block} layers, we obtain a network characterized by the choice of multi-layer normalization, channel factor $K$, and BlurPool kernel size $B$. Such a design allow ELNet models to remain lightweight ($\sim$ 0.2M parameters, requires single imaging stack) while performing favorably against MRNet models ($\sim$ 183M parameters, requires three imaging stacks) in detecting meniscus tear, ACL tear, and abnormalities using the MRNet dataset. We also validate ELNet on the KneeMRI dataset, and we observe a consistent robust performance using several metrics across folds in a 5-fold cross validation scheme. Lastly, we demonstrate localization using ELNet   


% In this work, we present ELNet, a unique feature extracting architecture that incorporates multi-slice normalization and BlurPool operations. This novel design enables our model to extract informative features that are more indicative of knee injuries such as meniscus tear or ACL tear when compared with past approaches. Instead of the traditional transfer-learning approach, experiments preformed on the two largest public knee datasets have proven exceptional performance the effectiveness demonstrated the ease in optimizing ELNet models from scratch. By selecting  In addition to injury detection, our model is capable of locating the region containing the tear; it may be beneficial to a radiologist examining a large number of scans to have the most significant slice, as predicted by the algorithm, presented first in each case. 

% Future work may include performance enhancement by incorporation of all three MRI volumes, axial, coronal and sagittal, if available. Further research is also needed to facilitate application of trained models on MRI data acquired using different scanners with various intensity scales.



% Although our model performed well by using scans from a single image plane, it is possible for a tear to reside between slices where the particular image plane failed to capture. One solution would be to fuse information extracted from the other image planes somewhere along the network, and this can potentially improve injury detection. Additional works may also involve designing the the CNN to be robust to MRI intensity variations. This could allow a trained network to operate on scanners with different intensity scales, thereby reducing the need to retraining a network to fit the relevant scanner.


% In spite of 

% we believe that the ELNet may serve as a prominent backbone for 


% In conclusion, our work may serve as a prominent backbone for radiological examination triage helping to reduce reading delay for pathological cases. 




\bibliography{Tsai20}





\end{document}
