\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage[utf8]{inputenc}
\usepackage{cite}
\usepackage{graphicx}
\usepackage{wrapfig}
\usepackage{caption}
%\usepackage[subtle]{savetrees}
\usepackage{comment}
\usepackage{hyperref}
\usepackage{booktabs}       % professional-quality tables
\input{Macros.tex}
\usepackage{lipsum}
\usepackage{todonotes}
\usepackage{mwe} % to get dummy images
%\jmlrvolume{-- Under Review}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}

%\jmlrworkshop{Extended Abstract -- MIDL 2019 submission}
%\editors{Under Review for MIDL 2020}

\title[LoTeNet for Medical Imaging]{Tensor Networks for Medical Image Classification}

\midlauthor{\Name{Raghavendra Selvan} $^1$ \Email{raghav@di.ku.dk}\\
%
\Name{Erik B Dam} $^{1}$ \Email{erikdam@di.ku.dk}\\
\addr $^{1}$ Department of Computer Science, University of Copenhagen, Denmark
}

\begin{document}

\maketitle

\begin{abstract}
With the increasing adoption of machine learning tools like neural networks across several domains, interesting connections and comparisons to concepts from other domains are coming to light. In this work, we focus on the class of Tensor Networks, which has been a workhorse for physicists in the last two decades to analyse quantum many-body systems. Building on the recent interest in tensor networks for machine learning, we extend the Matrix Product State tensor networks (which can be interpreted as linear classifiers operating in exponentially high dimensional spaces) to be useful in medical image analysis tasks. We focus on classification problems as a first step where we motivate the use of tensor networks and propose adaptations for 2D images using classical image domain concepts such as local orderlessness of images. With the proposed locally orderless tensor network model (LoTeNet\footnote{Official repository: \url{https://github.com/raghavian/loTeNet_pytorch/}}), we show that tensor networks are capable of attaining performance that is comparable to state-of-the-art deep learning methods. We evaluate the model on two publicly available medical imaging datasets and show performance improvements with fewer model hyperparameters and fewer computational resources compared to relevant baseline methods.
\end{abstract}

\begin{keywords}
Tensor Networks, Image classification, histopathology, thoracic CT, lesions
\end{keywords}

\section{Introduction}

Kernel methods revolutionised pattern recognition and machine learning with the class of support vector machines (SVMs) in the 90's, based on the fundamental insight that difficult problems in low dimensions may become easier when lifted to high dimensional spaces~\citep{boser1992training, cortes1995support, hofmann2008kernel}. 
%However, working with large datasets  in high dimensional spaces can quickly become prohibitive as these spaces grow at least quadratically with the dataset size and learning even a linear decision function in such spaces can become untenable~\citep{bordes2005fast, novikov2016exponential}. 
%Further, these models are prone to overfit on sparse data in high dimensions. 
%One approach to overcome the high dimensionality problem is the kernel trick with SVMs which computes the distance between the data points in high dimensions implicitly~\citep{hofmann2008kernel}. 

%Although such implicit mapping overcomes the high dimensionality problem, SVMs are known to be sensitive to the specific choice of the kernel parameters 
%and as a consequence the decision boundaries learnt are prone to overfitting
%~\citep{burges1998tutorial,bordes2005fast}.
%These drawbacks of high dimensional spaces are commonly attributed as the \emph{Curse of dimensionality}. 

%On the other hand, neural networks have shown drastic improvmenents in performance on a variety of tasks when compared to kernel based methods. This is largely due to their ability to learn highly non-linear decision functions from low dimensional data spaces, facilitated by automatic differentiation tools like backpropagation~\citep{lippmann1987introduction,lecun2015deep}.

%Tensor Networks offer a different approach to dealing with high dimensional spaces. 
 An efficient approach to dealing with such high dimensional spaces can be with \emph{tensor networks}, also known as tensor trains. Tensor networks are factorisations of high dimensional tensors into networks of low rank tensors and come with a class of efficient algorithms to perform these approximations~\citep{oseledets2011tensor,bridgeman2017hand}. The number of parameters needed to specify an $N$ dimensional tensor using tensor networks can be drastically reduced, from exponentially increasing with $N$ to a polynomial dependence on $N$~\citep{perez2006matrix}. %This massive compression in number of parameters using Tensor Networks has been predominantly applied to better understand quantum wave functions~\citep{shi2006classical}. 
 %Tensor Networks have also seen applications in data compression~\citep{cichocki2016tensor}, and recently to better understand the expressive power of deep learning models models~\citep{cohen2016expressive,glasser2019expressive}. 
 
 Recently, there has been an increasing interest in using tensor networks in the context of supervised machine learning, specifically focused on image classification tasks~\citep{stoudenmire2016supervised,efthymiou2019tensornetwork}. These methods rely on transforming the 2-d input images into 1-d vectors before encoding them into high dimensional spaces. As a consequence of this flattening these methods are constrained to work with images of small spatial resolution ($12\times 12$ px to $28 \times 28$ px), and focus on employing improved flattening strategies to maximize the retained pixel correlation~\citep{stoudenmire2016supervised,efthymiou2019tensornetwork}. 
%This is primarily to retain the correlation between pixels as flattening larger images can throw away essential features. 
For small enough images (like in MNIST or Fashion MNIST datasets) there is some residual correlation in the flattened images which can be exploited using tensor networks. In medical imaging tasks, however, images with such low spatial resolutions are rarely encountered. Further, the information lost by flattening of images in medical imaging tasks can be crucial as many decisions can be dependent on the global structure of the pixels.

In this work, we extend the use of tensor networks to be useful in classification of medical images. We propose the locally orderless tensor network or LoTeNet (pronounced ``low tenet'') inspired by the classical theory of locally orderless images in~\citet{koenderink1999structure}. According to the theory of locally orderless images, statistics from small neighbourhoods in images can be derived by ignoring the local order of pixels while still capturing the global structure by operating at different scales. The proposed LoTeNet model is used to perform linear classification in high dimensional spaces and it is optimized end-to-end by backpropagating the error signal through the tensor network. We present experiments on two medical imaging datasets: the PCam dataset with histopathology images~\citep{veeling2018rotation} and the LIDC-IDRI dataset with thoracic CT images~\citep{armato2004lung}. Our model fares comparably to state-of-the-art deep learning models with fewer model hyperparameters while utilizing a fraction of the GPU memory when compared to their CNN counterparts.

%Our key contributions are:
%\begin{itemize}
%    \item A novel Tensor Network motivated from locally orderless images (\emph{lo}-TeNet)
%    \item Extending Tensor Networks to (spatially) larger images
%    \item Validation on two medical image classification tasks
%\end{itemize}
%An $N$ dimensional tensor can be approximated effectively such that the number of parameters needed to specify reduce the dependence of number of parameters needed to specify high dimensional spaces from an exponential dependence a class of methods used in tackling the quantum many-body problems. While aThey offer efficient means of expressing exponentially large dimensional spaces with polynomially increasing number of parameters with the number of dimensions.

%of dealing with data in high dimensional spaces is to approximate them efficiently with low-rank tensors using Tensor Networks. Tensor Networks (TNs) allow for obtaining simpler decision functions in high dimensional spaces without implicit mapping like in SVMs. 

%They can perform sufficiently expressive approximations of high dimensional tensors with linearly increasing number of parameters with the number of parameters (instead of exponential number of parameters) are specialized tensor factorisations that can be used to obtain linear decision functions in exponentially high dimensions without succumbing to the \emph{curse of dimensionality}~\citep{stoudenmire2016supervised}. 
%In contrast to neural networks which learn non-linear functions in low dimensional spaces, the class of Tensor Networks (TeNets) manipulation of high dimensional spaces.


%\begin{itemize}
    %\item Kernel methods
    %\item Non-linear functions in low dimensions vs linear functions in high dimensions
    %\item Tensor networks to approximate high dimensional tensors
    %\item Retaining correlation between neighbours. Primarily used in small image tasks
    %\item Contributions
%\end{itemize}


\section{Background and Problem Formulation}

%In this section, we introduce several key concepts pertaining the use and optimization of Tensor Networks which will be put together in the next section to describe our proposed method.

\begin{figure}[t]
\centering
%\floatconts
{\includegraphics[width=0.8\linewidth]{figures/tensorBasics.pdf}}
        \captionsetup{format=plain}
\caption{(left) Tensor notation depicting a scalar $s$, vector ${v}^i$, matrix $M^{ij}$ and a general 3-D tensor $T^{ijk}$. (center) Tensor notation for matrix multiplication or \emph{tensor contraction}, which are used extensively in the matrix product state networks used in this work. We adhere to the convention that the contracted indices are written as subscripts. (right) Tensor notation for trace of product of two matrices.}
\label{fig:tensorBasics}
\end{figure}

\subsection{Tensor Network Notations}

Tensor networks and operations on them are described using an intuitive graphical notation, introduced in~\citet{penrose1971applications}. Figure~\ref{fig:tensorBasics} (left) shows the commonly used notations for a scalar $s$, vector $v^i$, matrix $M^{ij}$ and a general 3-D tensor $T^{ijk}$. The number of dimensions of a tensor is captured by the number of edges emanating from the nodes denoted by the edge indices. For instance, the vector $v^i$ has one dimension indicated by the edge with index $i$ and a 3-D tensor has three indices $(i,j,k)$ depicted by the three edges, and so on. 

Operations on high dimensional tensors can be succinctly captured using tensor networks as shown in Figure~\ref{fig:tensorBasics} (center) where matrix product is depicted, which is also known as \emph{tensor contraction}. The edge between the tensor nodes $X^i_j$ and $Y^k_j$ is the dimension subsumed in matrix multiplication resulting in $Z^{ik}$. 
%The size of the common edge ($j$), or the contracting dimension, is known as the bond dimension denoted by $\beta$ in rest of this work. 
A more thorough introduction to tensor notations can be found in~\citet{bridgeman2017hand}.

\begin{comment}
\begin{figure}[t]
    \centering
\begin{minipage}{0.4\textwidth}
\centering
%\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
  {\includegraphics[width=0.79\linewidth]{figures/mps.pdf}}
  \end{minipage}
\begin{minipage}{0.58\textwidth}
\centering
%\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
  {\includegraphics[width=0.8\linewidth]{figures/squeezeOperation.pdf}}
%\end{figure}
\end{minipage}
\captionsetup{format=plain}
\caption{(left)The linear model and its MPS approximation in tensor notation. (right) Squeeze operation with stride $k=3$  which reshapes a $6\times 6 \times 1$ image patch into $ 2 \times 2 \times 9$ stack.}
    \label{fig:my_label}
\end{figure}
\end{comment}

\subsection{Linear model in high dimensions}

A linear model in a sufficiently high dimensional space can be very powerful~\citep{novikov2016exponential}. In SVMs, this is accomplished by the \emph{implicit} mapping of the input data into an infinite dimensional space using radial basis function kernels~\citep{hofmann2008kernel}. In this section, we describe the procedure followed in this work to map the input data into a higher dimensional space.

Consider an input vector $\xbf \in [0,1]^N$, which can be obtained from a flattened 2-d image with $N$ pixels in total and intensity values normalized in the interval $[0,1]$. A commonly used feature map for tensor networks is obtained by taking the \emph{tensor product} of pixel wise feature maps~\citep{stoudenmire2016supervised}:
\begin{equation}
    \Phi^{i_1,i_2,\dots, i_N}(\xbf) = \phi^{i_1}(x_1) \otimes \phi^{i_2}(x_2) \otimes \cdots \otimes \phi^{i_N}(x_N)
    \label{eq:jointRef}
\end{equation}
where the local feature map acting on pixel $x_j$ is indicated by $\phi^{i_j}(\cdot)$. The local feature map is $d$-dimensional and usually is a simple non-linear function which additionally is restricted to have unit norm so that the joint feature map in Eq.~\eqref{eq:jointRef} also has unit norm. A widely used local feature map with $d=2$ inspired from quantum wave function analysis is~\citep{stoudenmire2016supervised}:
%XXXXX this reads badly, a comma in an equation making the stretch across an equation, a page shift, and a figure. XXXXX
\begin{equation}
    \phi^{i_j} (x_j) = \left[\cos \left(\frac{\pi}{2}x_j\right), \sin \left(\frac{\pi}{2}x_j\right)\right].
    \label{eq:localRef}
\end{equation}
%Local feature map of the form in~\eqref{eq:localRef} ensure the joint feature map in Eq.~\eqref{eq:jointRef} also has unit norm. 

The dimensionality of the joint feature map $\Phi(\xbf)$ is $d^N$ due to tensor products in Eq.~\eqref{eq:jointRef}, as the local feature maps  in Eq.~\eqref{eq:localRef} are of dimensionality $d=2$. The joint feature map $\Phi(\xbf)$ basically maps each image as a vector in the $d^N$ dimensional feature space. For an RGB image, or other image modalities with $C$ input channels as commonly encountered in medical imaging, the local feature map can be applied to each channel separately such that the resulting space is of dimension $(d \cdot  C)^N$~\citep{efthymiou2019tensornetwork}. 

%where $d=2$ 
%due to Eq.~\eqref{eq:localRef} and $C=3$ is the number of channels in the image resulting in a $6^N$ dimensional space
%Such feature maps are inspired from quantum physics applications where Tensor Networks are used to approximate wave functions~\citep{stoudenmire2016supervised}. 
%can be encoded into a high dimensional space using 

Given the high dimensional feature map $\Phi(\xbf)$ of Eq.~\eqref{eq:jointRef} for the input data $\xbf$, a decision rule for a multi-class classification task can be formulated of the form:
\begin{equation}
    f(\xbf) = \arg\max_m f^{m} (\xbf),
    \label{eq:deRule}
\end{equation}
where $m \in \{0,1,\dots,M-1\}$ indexes the $M$ classes and the decision function for each class is linear in the joint feature map,
%linear decision function for a multi-class classification task can be written based on the following dot product,
\begin{equation}
f^{m}(\xbf) = W ^{m} \cdot \Phi(\xbf),
\label{eq:linModel}
\end{equation}
with the weight tensor $W^{m}$ being an $N+1$ dimensional tensor with output tensor index $m$. 

%$\Phi(\cdot)$ is a feature map that transforms the input data to a high dimensional space. 
%Images are assigned the label based on the following decision rule:

In tensor notation, the linear model of Eq.~\eqref{eq:linModel} is depicted in Figure~\ref{fig:mps} (Step 1) where the first column of gray nodes are the individual pixel feature maps of feature dimension $d$. The pixel feature maps are connected to the weight tensor $W^{m}$ along $N$ edges and $W^{m}$ has one output dimension marked with index $m$.

%The linear model in Eq.~\eqref{eq:linModel} can yield decision boundaries that can p
The $N+1$ dimensional weight tensor $W^{m}$ results in a total of $M \cdot d^N$ weights. Even for a relatively small gray scale image, say of size $100\times 100$, the total number of components in $W^m$ can be massive: $2\cdot 2^{10000} \approx 10^{3010}$. In the next section we will see how tensor networks can represent such high dimensional tensors with parameters that grow linearly with $N$ instead of growing exponentially with $N$. 
\subsection{Matrix Product State (MPS)}
\begin{figure}
    \centering
    \includegraphics[width=0.7\textwidth]{figures/mpsAll.pdf}
    \caption{(Step 1) Linear model of Eq.~\eqref{eq:linModel} in tensor notation. (Step 2) MPS approximation of the linear model. (Step 3) Series of tensor contractions done with MPS to compute $W ^{m} \cdot \Phi(\xbf)$ in Eq.~\eqref{eq:linModel}}
    \label{fig:mps}
\end{figure}

Consider two 1-d vectors, $v^i$ and $u^j$  with dimension indices $i$ and $j$ respectively. The tensor (outer) product of these two vectors yields a 2-d matrix $X^{ij}$. The matrix product state (MPS)~\citep{perez2006matrix} is a type of tensor network that expands on this notion allowing the factorization of an $N$-dimensional tensor (with $N$ edge indices) into a chain of rank-3 tensors (with three edge indices) except on the borders where they are of rank-2, as shown in Figure~\ref{fig:mps} (Step 2). Concretely, a tensor of $N$ dimensions with indices $i_1, i_2, \dots, i_N$ can be approximated using lower rank tensors $A^{i_j}$ as
\begin{equation}
    W^{m,i_1,i_2,\dots, i_N} = \sum_{\alpha_1, \alpha_2,\dots, \alpha_{N-1}} A^{i_1}_{\alpha_1} A^{i_2}_{\alpha_1 \alpha_2} A^{i_3}_{\alpha_2 \alpha_3} \dots A^{m,i_j}_{\alpha_{j-1} \alpha_{j}} \dots A^{i_N}_{\alpha_{N-1}}.
    \label{eq:mps}
\end{equation}
The subscript indices $\alpha_j$ are the virtual indices that are contracted and are of dimension $\beta$ which is known as the \emph{bond dimension}. The components of the intermediate lower rank tensors $A^{i_j}$ are the parameters of the MPS approximation. The placement of the output dimension $m$ on $A^{i_j}$ in Eq.~\eqref{eq:mps} is an arbitrary choice and can be adapted during the optimisation~\citep{stoudenmire2016supervised}. 
%The MPS approximation of Eq.~\eqref{eq:mps} is shown shown in Figure~\ref{fig:tensorBasics}(center) is carried out. 
Note that any $N$ dimensional tensor can be represented exactly using an MPS if $\beta=d^{N/2}$, where $d$ is the feature dimension. In most applications, however, $\beta$ is fixed to a small value or allowed to adapt dynamically when the MPS is used to approximate a high dimensional tensor ~\citep{perez2006matrix,torchmps}. 
%A drawback of MPS is that it operates along one dimension (as a chain). This is the primary reason for 2-d image data has to be flattened when working with Tensor Networks. Tensor Networks that can work on arbitrary graphs which might be more suitable for image data like the PEPS~\citep{verstraete2004renormalization} are not as well understood and do not yet have efficient algorithms like the MPS.

The decision function in Eq.~\eqref{eq:linModel} can now be computed using the MPS approximation of $W^m$ in Eq.~\eqref{eq:mps} depicted in Figure~\ref{fig:mps} (Step 2). The order in which the tensor contractions are performed can yield a computationally efficient algorithm. The original MPS algorithm~\citep{perez2006matrix} starts from one of the ends, contracts a pair of tensors to obtain a new tensor which is then contracted with the next tensor and this process is repeated until the output tensor is reached. The cost of this algorithm scales as $N \cdot \beta^3 \cdot d$, compared to the cost that scales as $d^N$ without the MPS approximation. 
In this work we use the MPS implementation in~\citet{torchmps} which contracts the horizontal edges in parallel and proceeds to contract these contracted tensors vertically as depicted in Figure~\ref{fig:mps} (Step 3) and yields improved approximations~\citep{efthymiou2019tensornetwork}.
%It is in such scenarios that Tensor Networks are most useful. They allow us to represent such high dimensional tensors as a contracted product of lower-order tensors. This can be seen as a generalisation of being able to write a $2$-d matrix as the outer product of two $1$-d vectors. Instead of using exponential number of components the tensor network components only grow linearly with dimensionality N. 

%\subsection{Locally orderless images}

\section{Methods}

\begin{wrapfigure}{r}{0.5\textwidth}
%\centering
\includegraphics[width=0.45\textwidth]{figures/squeeze2.pdf}
\captionsetup{format=plain}
\caption{Squeeze operation with stride $k=2$  which reshapes a $4\times 4 \times 1$ image patch into a $ 2 \times 2 \times 4$ stack. Raveling the squeezed image yields a vector of size $4$ with feature dimension $d=4$.}
\label{fig:squeeze}
\end{wrapfigure}

Recently proposed Tensor Network models for image classification purposes flatten entire 2-d images into 1-d vectors with different raveling strategies~\citep{han2018unsupervised,efthymiou2019tensornetwork}. In contrast to these methods, we only flatten small regions of the images which can be assumed to be locally orderless and derive useful features using MPS operations~\citep{koenderink1999structure}. We process these locally orderless regions using a hierarchical MPS tensor network which we call the Locally Orderless Tensor Network or LoTeNet, which is shown in Figure~\ref{fig:loTenet}. This enables our model to handle larger images without losing their global structure.

\subsection{Locally orderless Tensor Network (LoTeNet)}
\label{sec:loTeNet}

%The primary contribution in this work is a Tensor Network model that can handle images of larger spatial resolutions, while having access to the local neighbourhood information. 

The locally orderless image regions are created in two steps. First, the \emph{squeeze} operation illustrated in Figure~\ref{fig:squeeze} is applied on $k^2 \times k^2$ image patches where $k$ is the stride of the square kernel. This operation rearranges pixels in spatially local regions and stacks them along the feature dimension. Similar strategies have also been used in normalizing flow models to provide spatial context via feature dimensions such as in~\citet{dinh2016density}. The stride of the kernel $k$ decides the extent of reduction in spatial dimensions and the corresponding increase in feature dimension of the squeezed image. In the second step, the squeezed image with an inflated feature dimension of $C\cdot k^2$ is flattened from 2-d to 1-d. Flattening these local regions with spatial information along the feature axis provides our model with additional structural information. Further, the increase in $d$ makes the tensor network more flexible as it increases the dimensionality of the feature space~\citep{stoudenmire2016supervised}.

%\begin{wrapfigure}{r}{0.3\textwidth}
%\centering
%\includegraphics[width=0.25\textwidth]{figures/mpsBlock.pdf}
%\caption{}{MPS contraction for each image patch.}
%\label{fig:squeeze}
%\end{wrapfigure}

%Given an input image, each of its $k^2\times k^2 \times C$ patch is squeezed into a 1-d vector of size $k^2$ with $C \cdot k^2$ features. Each of these squeezed vectors are input to an MPS for contraction of the form in Eq.~\eqref{eq:linModel} resulting in an output vector of size $\nu$ which we set to be the same as the bond dimension $\beta$. This comprises one layer of the LoTeNet. 
\begin{figure}[t]
\centering
  {\includegraphics[width=0.89\linewidth]{figures/mpsTree.pdf}}
  \caption{The proposed Locally orderless Tensor Network shown as a series of layers. Each layer consists of several MPS blocks. The squeeze operation is as described in Figure~\ref{fig:squeeze}. The final MPS block outputs the predictions for the $M$ classes depicted as the edge with index $m$.}
  \label{fig:loTenet}
\end{figure}

Consider the input image to the first MPS layer in Figure~\ref{fig:loTenet} with grids marking the different $k^2 \times k^2$ patches. Each of these patches is squeezed and input into an MPS block. The MPS block first embeds these $C\cdot k^2$ vectors into the joint feature space of dimensionality ${d}^N$ according to Eq.~\eqref{eq:localRef} and Eq.~\eqref{eq:jointRef}, with $d=(2\cdot C \cdot k^2)$. Then, the image patches in the high dimensional feature space are contracted to output a vector with dimension $\nu$. In our model $\nu$ is set to be the same as the bond dimension $\beta$. The functionality of the MPS blocks can be interpreted as summarising the patch with a vector of size $\nu$ using a linear model in the high dimensional feature space. 

The output vectors from all the MPS blocks at a given layer are reshaped back into the 2-d image space. However, due to the MPS contractions in the first layer, the intermediate image map will be of lower resolution as indicated by the smaller image with fewer patches in Figure~\ref{fig:loTenet}. This is analogous to obtaining an average pooled version of the intermediate feature maps in CNN operations. The smaller 2-d patches formed after the first layer of MPS blocks are further squeezed and contracted in the next layer of the model. This process is continued for $L$ layers and the final MPS block performs the decision contraction of Eq.~\eqref{eq:linModel}.

\subsection{Model Optimization}

The components of the weight tensor $W^{m}$ (parameters of the model) are approximated using the layers of MPS blocks as described in Section~\ref{sec:loTeNet}. 
%Instead of using the sweeping iterative algorithms used in physics applications~\citep{schollwock2005density}, w
We view the sequence of MPS contractions in successive layers of our model as the forward pass and rely on automatic differentiation in PyTorch~\citep{paszke2019pytorch} to compute the backward computation graph~\citep{efthymiou2019tensornetwork}. The Torch MPS module~\citep{torchmps} is used to learn the MPS parameters from the training data in an end-to-end fashion. 
%These weights can be interpreted as the weights of neural network layers and in a similar manner can be updated iteratively by backpropagating a relevant error signal computed from the final predictions. 

%As we are interested in a multi-class classification task we use the decision rule in Eq.~\eqref{eq:deRule}. 
Similar to~\citet{efthymiou2019tensornetwork} we minimize the cross-entropy loss between the true label $y_i \in [0,\dots,M-1]$ for each image $\xbf_i$ and the predicted label $f^{(y_i)}(\xbf_i)$ in the training set $\mathcal{D}$:
\begin{equation}
    \mathcal{L}(f^{(y_i)}(\xbf_i)) = -\sum_{(\xbf_i,y_i)\in \mathcal{D}} \log \frac{\exp{f^{(y_i)}(\xbf_i)}}
    { \sum_{m=0}^{M-1} \exp{f^{(m)}(\xbf_i)}} 
    = - \sum_{(\xbf_i,y_i)\in \mathcal{D}} \log \Big({\sigma}(f^{(y_i)}(\xbf_i))\Big)
\end{equation}
where $\sigma(\cdot)$ is the softmax operation used to obtain normalized scores that can be interpreted as the predicted class probabilities.

\section{Data and Experiments}

\subsection{Data}

\begin{figure}[t]
\centering
  {\includegraphics[width=0.6\linewidth]{figures/pcam.pdf}}
  \caption{Four sample patches from the PCam dataset. The green region of size $32\times 32$ px in the second and third patches denotes the presence of tumour with a positive label whereas the other two have negative labels.}
  \label{fig:pcam}
\end{figure}

We perform experiments on two publicly available datasets with the task formulated as binary classification for: metastasis detection from histopathologic scans; and detection of nodules in thoracic computed tomography (CT) scans. \\
%
%\subsubsection
\textbf{PCam Dataset}: The PatchCamelyon (PCam) dataset is a recently introduced binary image classification dataset in~\citet{veeling2018rotation}. Image patches of size $96\times 96$ px are extracted from the Camelyon16 Challenge dataset~\citep{bejnordi2017diagnostic}. A positive label indicates the presence of at least one pixel of tumour tissue in the central $32\times 32$ px region and a negative label indicates the absence of tumour, as shown in Figure~\ref{fig:pcam}. Patch extraction is performed to ensure the class balance is close to $50-50$. We use the modified PCam dataset from the Kaggle challenge\footnote{\url{https://www.kaggle.com/c/histopathologic-cancer-detection}} which excludes duplicate image patches and consists of about $220,000$ patches for training which is further split into $80:20$ for training and validation purposes. An independent test set of about $57,500$ patches is provided for evaluating the models. 
%for testing purposes. The $220K$ patches in the training set are further split into $80:20$ for training and validation purposes. 
All image planes are normalized to have mean and standard deviation of $0.5$. We use random rotation, random horizontal and vertical flips on the training data for data augmentation.\\
%
%\subsubsection
\textbf{LIDC Dataset}: The LIDC-IDRI dataset comprises 1018 thoracic CT images with lesions annotated by four radiologists~\citep{armato2004lung}. Similar to~\citet{kohl2018probabilistic,baumgartner2019phiseg} we extract $128 \times 128$ px image patches centered on a lesion and use the preprocessed data from~\citet{lidc} (shown in Figure~\ref{fig:lidc} in Appendix). This yields a total of about $15{,}000$ patches. To transform this into a classification task we pose it as a task of predicting the presence or absence of lesions based on the four annotations. We assign a patch a positive label if more than two (i.e.\ $\geq 3$) radiologists have annotated the presence of a lesion, and a negative label otherwise. The binary task thus amounts to capturing the majority vote amongst the radiologists. We split the dataset into $60:20:20$ splits for training, validation and a hold-out test set. %As with PCam dataset, the images are normalized to have both mean and standard deviation of $0.5$.
\subsection{Experiments and Results}

The proposed model (LoTeNet) is evaluated with $L=3$ layers and a kernel size $k=2$ (for the squeeze operation) and it is implemented based on the efficient MPS implementations in~\citet{torchmps}. The only critical hyperparameter inherent to LoTeNet is its bond dimension $\beta$; it was set to $\beta=5$ obtained from the range $[2,4 \dots 20]$ based on the performance on the PCam validation set (Figure~\ref{fig:bondDim} in Appendix).
%, consistent with other reporting on the influence of the bond dimension~\citep{efthymiou2019tensornetwork}. The performance of the model was robust to different values of $\beta$ showing only minor variations in performance. 
We used the Adam optimizer~\citep{kingma2014adam} with a learning rate of $5\times 10^{-4}$ and a batch size of $512$. Models were assumed to have converged if there was no improvement in validation AUC over $5$ consecutive epochs and the model with the best validation performance was used to predict on the test set. All experiments were run on a single Tesla K80 GPU with 12 GB memory. The same settings were used for the experiments on LIDC dataset.

We compare performance of our model with DenseNet baseline with $4$ layers and a growth rate of $12$ as described in~\citet{huang2017densely} and also the single layer MPS model in~\citet{efthymiou2019tensornetwork} reported as Tensor Net-X. 
%The Tensor Net-X model uses a different feature map of the form: $\phi(x_i) = [x_i, 1-x_i]$. 
Additionally, we compare the PCam dataset performance to the rotation equivariant CNNs method which also introduced the dataset~\citep{veeling2018rotation}. We report area under the ROC curve (AUC) as the metric to compare the different models.

The test set performance for the PCam and LIDC datasets, together with the relevant comparing methods, is reported in Table~\ref{tab:pcam}. We notice that LoTeNet attains an AUC of $0.943$ on the PCam dataset which is comparable with the methods of~\citet{veeling2018rotation} and~\citet{huang2017densely}. There is a clearer improvement when compared to DenseNet on the LIDC dataset. Further, LoTeNet outperforms the Tensor Net-X with a single layer MPS~\citep{efthymiou2019tensornetwork} on both datasets. LoTeNet takes about $3$ minutes per training and validation epoch on the PCam dataset and $30$\,s on the LIDC dataset.

\begin{table}[t]
\small
\centering
    \caption{Performance comparison on the PCam dataset (left) and the LIDC dataset (right). For all models we also report the GPU memory utilisation in gigabytes.}
    \label{tab:pcam}
    \begin{minipage}{.39\linewidth}
     \centering
    \begin{tabular}{lcc}
    \toprule
    PCam Models & GPU (GB) & AUC  \\
    \midrule
    Rotation Eq-CNN & $11.0$ &$0.963$     \\
    {DenseNet} &$10.5$& $0.962$    \\
    {LoTeNet (ours)} & $0.8$ &$0.943$ \\
    {Tensor Net-X ($\beta=10$)} & $5.2$ & $0.908$ \\
    \bottomrule
  \end{tabular}
    \end{minipage}%
    \hspace{0.5cm}
    \begin{minipage}{.55\linewidth}
      \centering
  \begin{tabular}{lcc}
    \toprule
LIDC Models & GPU (GB)& AUC\\
%& (in GB)& ($\%$)\\
    \midrule
    {LoTeNet (ours)} & $0.7$ & $0.874$  \\
    {Tensor Net-X ($\beta=10$)} & $4.5$ & $0.847$ \\
    {DenseNet} & $10.5$ & $0.829$ \\
    {Tensor Net-X ($\beta=5$)} & $1.5$ & $0.823$ \\
    \bottomrule
  \end{tabular}
    \end{minipage}
\end{table}

\section{Discussion and Conclusions}% and Future Work}

The most important hyperparameter of any MPS model is its bond dimension $\beta$ as it controls the quality of approximation of the high dimensional tensor. In our proposed model, LoTeNet, which is composed of layers with MPS blocks, we noticed the performance to be robust to the changes in $\beta$ (Figure~\ref{fig:bondDim} in the Appendix). This is consistent with other findings where increasing the bond dimension beyond a certain value (around $10$) has been shown to have no impact on the performance of the models~\citep{efthymiou2019tensornetwork}. Due to the distributed nature of approximation in LoTeNet across several layers, this is all the more pronounced: we find only minor fluctuations in the performance of the model, and we get away with a much smaller $\beta=5$.

The results reported for the LIDC-IDRI dataset in Table~\ref{tab:pcam} are based on the model configuration (including hyperparameters such as learning rate and batch size) obtained for the PCam dataset. This further strengthens the case for Tensor Network based methods as they can be easily transferred to different datasets. 

%Tensor Networks are easier to interpret as their decision rule is a linear model in Eq.~\eqref{eq:deRule}. %Each MPS block in Figure~\ref{fig:loTenet} is performing a linear regression on high dimensional features of the input. 
%Investigating intermediate predictions between MPS layers can also give more insight into how the final decision rules are learnt. 

In Table~\ref{tab:pcam}, we also report the GPU memory requirement for each of the models. Tensor network models require only a fraction of the memory utilised by the corresponding DenseNet or Rotation Eq-CNN models even when the number of parameters in LoTeNet is higher ($1$M when compared to $120{,}000$ for the other two models~\citep{veeling2018rotation}). This drastic reduction in GPU memory utilisation is because tensor networks do not maintain massive intermediate feature maps, unlike CNNs which use a large chunk of GPU memory mainly to store intermediate feature maps~\citep{rhu2016vdnn}. As the entire pipeline of LoTeNet is based on contracting input data into smaller tensors, it does not grow in memory consumption with successive contracted layers. This can be an important feature in medical imaging applications as larger images and larger batch sizes can be processed.
%The recent interest in adapting Tensor Networks to machine learning tasks is promising. Extending these models to volume data would be a natural next step. While the optimisation in this work was performed using automatic differentiation, it might be interesting to apply the class of sweeping algorithms~\citep{schollwock2005density} that are used in physics. Currently, these algorithms are not scalable to the large scale data encountered in medical imaging tasks~\citep{klus2019tensor}.

%\section{Conclusion}

In conclusion, the proposed model, LoTeNet, overcomes the loss of global structure due to flattening in tensor networks using locally orderless regions that are added to the feature dimension of the input image. By using a hierarchical approach, the model also retains the global structure. We have demonstrated the ability of the model to perform classification on two publicly available datasets, yielding performance comparable to state-of-the-art deep learning models -- using fewer model hyperparameters and substantially smaller GPU memory consumption.

\section*{Acknowledgements }
The authors would like to thank Silas Ørting and Mathias Perslev for their useful feedback on the manuscript. The authors also thank the four anonymous reviewers and the Area Chair for their insightful feedback which has strengthened the manuscript enormously.
%\newpage

% \bibliographystyle{plain}
\bibstyle{plain}
\small
\bibliography{selvan20}

%\newpage
\appendix

\section{Supplementary Material}
\subsection{Further details on LIDC Dataset}
\begin{figure}[htbp]
\centering
  {\includegraphics[width=0.9\linewidth]{figures/lidc.eps}}
  \caption{Four instances where the different raters do not agree. From left to right: one, two, three and four raters indicated the presence of nodules. In the binary task formulation, the first two patches will have a negative label and the last two will have positive labels.}
  \label{fig:lidc}
\end{figure}

\begin{figure}[htbp]
\centering
  {\includegraphics[width=0.65\linewidth]{figures/lidcHist.pdf}}
  \caption{Histogram of the number of raters agreeing on the presence of a lesion in each image. We merge the 1R and 2R classes to form the negative class and merge the 3R and 4R classes to obtain the positive class. This naturally leads to a well balanced dataset.}
  \label{fig:lidcHist}
\end{figure}

\subsection{Model selection using PCam dataset}

\begin{figure}[h]
\centering
  {\includegraphics[width=0.65\linewidth]{figures/bondDim.pdf}}
  \caption{Influence of varying the bond dimensions reported with the best validation AUC on the PCam dataset.}
  \label{fig:bondDim}
\end{figure}

\begin{figure}[h]
\centering
  {\includegraphics[width=0.65\linewidth]{figures/pcamLearning.pdf}}
  \caption{Learning curve for our model showing the evolution of the loss and AUC for training and validation data.}
  \label{fig:lrCurvePcam}
\end{figure}
\end{document}
