\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{xcolor}
\usepackage{multirow}

\usepackage{color, colortbl}
\definecolor{Gray1}{gray}{0.97}
\definecolor{Gray2}{gray}{0.9}

\usepackage{mwe} % to get dummy images
%\jmlrvolume{-- Under Review}
\jmlryear{2020}
\jmlrworkshop{Full Paper -- MIDL 2020}
%\editors{Under Review for MIDL 2020}

\title[Cross-modal vessel segmentation]{Deep learning-based retinal vessel segmentation with cross-modal evaluation}

\midlauthor{\Name{Luisa Sanchez Brea} \Email{m.sanchezbrea@erasmusmc.nl}\\
\Name{Danilo Andrade De Jesus} \Email{d.andradedejesus@erasmusmc.nl}\\
\Name{Stefan Klein} \Email{s.klein@erasmusmc.nl}\\
\Name{Theo van Walsum} \Email{t.vanwalsum@erasmusmc.nl}\\
\addr Biomedical Imaging Group Rotterdam, Department of Radiology and Nuclear Medicine, \\Erasmus MC, Rotterdam, The Netherlands}

\begin{document}

\maketitle

\begin{abstract}
%In this work, a deep-learning based approach is presented to segment the retinal vascular tree in images from Fundus Photography (FP) and Scanning Laser Ophthalmoscopy (SLO). FP and SLO images of four and two publicly available datasets, respectively, were used in this study. A state-of-the-art convolutional neural network, U-net, was trained on each dataset individually and either on FP or SLO images without augmentation or pre-processing. For each dataset, 70\% of the images were used for training, 20\% for validation, and a final 10\% for testing. The patch size %(32, 64, 128, and 256) 
%and number of patches were studied in terms of accuracy, sensitivity, specificity, and Dice score. The performance of each network on a different modality was also tested. The models trained on each dataset showed a performance comparable to the state-of-the art and to the inter-rater reliability. Overall, the best performance was observed for the largest patch size (256) and the maximum number of overlapped images in each dataset, with a mean sensitivity, specificity, accuracy, and Dice of 0.89$\pm$ 0.05, 0.95$\pm$0.02, 0.95$\pm$0.02, and 0.73$\pm$0.07, respectively. Models trained and tested on the same modality presented a sensitivity, specificity, and accuracy equal or higher than 0.9. However, the validation on a different modality has shown significantly better sensitivity and Dice on those trained on FP. A state-of-the-art network, such as U-net, can thus be trained without pre-processing and augmentation and still perform as good as a manual grader as far as a large patch size and enough images are assured. 

%This work proposes a general pipeline for retinal vessel segmentation on \emph{en-face} images, based on previous developments on Fundus Photography (FP), and studies its applicability to a different imaging modality, Scanning Laser Ophthalmoscopy (SLO). FP and SLO images of four and two publicly available datasets, respectively, were used in this study. First, the effect of varying the patch size and number of patches was studied in terms of accuracy, sensitivity, specificity, and Dice score. To that end, a U-net model was trained on each dataset individually, using 70\% of the images for training, 20\% for validation, and a final 10\% for testing. Then, the model was trained in either FP or SLO images. The performance of each network was also tested on the other modality, in order to assess if the knowledge is transferable between them. The models trained on each dataset showed a performance comparable to the state-of-the art and to the inter-rater reliability. Overall, the best performance was observed for the largest patch size (256) and the maximum number of overlapped images in each dataset, with a mean sensitivity, specificity, accuracy, and Dice of 0.89$\pm$ 0.05, 0.95$\pm$0.02, 0.95$\pm$0.02, and 0.73$\pm$0.07, respectively. Models trained and tested on the same modality presented a sensitivity, specificity, and accuracy equal or higher than 0.9. The validation on a different modality has shown significantly better sensitivity and Dice on those trained on FP.

This work proposes a general pipeline for retinal vessel segmentation on \emph{en-face} images. The main goal is to analyse whether a model trained on one of two modalities, Fundus Photography (FP) or Scanning Laser Ophthalmoscopy (SLO), can be transferred accurately to the other modality. This is motivated by the lack of development and data available in \emph{en-face} imaging modalities other than FP. FP and SLO images of four and two publicly available datasets, respectively, were used. First, the current approaches were reviewed in order to define a basic pipeline for vessel segmentation. A state-of-the-art deep learning architecture (U-net) was used, and the effect of varying the patch size and number of patches was studied by training, validating, and testing on each dataset individually. Next, the model was trained on either FP or SLO images, using the available datasets for a given modality combined. Finally, the performance of each network was tested on the other modality. The models trained on each dataset showed a performance comparable to the state of the art and to the inter-rater reliability. Overall, the best performance was observed for the largest patch size (256) and the maximum number of overlapping patches in each dataset, with a mean sensitivity, specificity, accuracy, and Dice score of 0.89$\pm$0.05, 0.95$\pm$0.02, 0.95$\pm$0.02, and 0.73$\pm$0.07, respectively. Models trained and tested on the same modality presented a sensitivity, specificity, and accuracy equal to or higher than 0.9. The validation on a different modality showed significantly better sensitivity and Dice score for the models trained on FP.


%In this work, a state-of-the-art convolutional neural network, U-net, is used to segment the retinal vascular tree in images from Fundus Photography (FP) and Scanning Laser Ophthalmoscopy (SLO). FP and SLO images of four and two publicly available datasets, respectively, were used in this study. First, the effect of varying the patch size and number of patches was studied in terms of accuracy, sensitivity, specificity, and Dice score. To that end, a model was trained on each dataset individually, using 70\% of the images for training, 20\% for validation, and a final 10\% for testing. Then, the model was trained in either FP or SLO images. The performance of each network was also tested on the different modality, in order to assess if the knowledge is transferable between them. The models trained on each dataset showed a performance comparable to the state-of-the art and to the inter-rater reliability. Overall, the best performance was observed for the largest patch size (256) and the maximum number of overlapped images in each dataset, with a mean sensitivity, specificity, accuracy, and Dice of 0.89$\pm$ 0.05, 0.95$\pm$0.02, 0.95$\pm$0.02, and 0.73$\pm$0.07, respectively. Models trained and tested on the same modality presented a sensitivity, specificity, and accuracy equal or higher than 0.9. The validation on a different modality has shown significantly better sensitivity and Dice on those trained on FP.
\end{abstract}

\begin{keywords}
deep learning, retina, vessel segmentation, scanning laser ophthalmoscopy, fundus photography
\end{keywords}

\section{Introduction}

% The retina, why is important for diagnosis: observe the inside of the human body without operations
The eye is one of the most complex organs in the human body. Its importance is not just limited to the vision, as it also offers a possibility to non-invasively look at structures such as vessels. A healthy eye is composed of a sequence of refractive transparent structures that allow the light to be focused at the retina~\cite{jesus2015simplifying}, where it is converted to electrical signals via chemical reactions. This process consumes high levels of oxygen and nutrients, making the retina one of the most metabolically active tissues in the human body. A well-organized ocular vascular system adapts to meet these metabolic requirements to ensure visual function. Hence, changes in the retinal vasculature serve as a biomarker for a number of ocular pathologies, such as advanced macular degeneration~\cite{mullins2011choriocapillaris} or glaucoma~\cite{jesus2019quantitative}, but also for other diseases such as hypertension~\cite{klein1997relation} or diabetes~\cite{fong2004retinopathy}.

% Different imaging modalities: fundus, but also scanning
A number of modalities have been developed for imaging the human retina over the last decades. Due to its simplicity and affordability, Fundus Photography (FP) has been extensively used, and it is nowadays present in most of the ophthalmic clinics. The fast acquisition and wide-field coloured images have kept FP relevant even when more complex/newer techniques, such as Optical Coherence Tomography (OCT), appeared. Although the vascular tree is usually clearly visible on FP images, changes in the retinal pigmentation, or lesions, may not be seen within the visible spectrum range. Hence, other \emph{en face} imaging modalities have been developed, such as  Scanning Laser Ophthalmoscopy (SLO). While in FP the image is obtained with one single shot, in SLO the laser beam scans the retina line by line. Although the acquisition time is slightly longer, the image quality is usually better than in FP. The increase in terms of quality or sharpness may not be clearly noticeable in healthy subjects, but has been reported in patients with diseases that affect the anterior chamber, such as cataract. In these cases, the evaluation of structures in scanning modalities has been described as more consistent \cite{kirkpatrick1995fundus}. Also, in diseases that only alter certain areas of the retina, such as geographic atrophy \cite{schmitz2008evaluation}, or reticular pseudodrusen \cite{schmitz2011reticular}, an improvement has been observed. Another advantage of SLO is that this modality is usually integrated in OCT imaging devices. Thus, the findings of the SLO can be complemented with the OCT in-depth information. Lastly, unlike FP, SLO does not require pupil dilation to attain high-quality images, making it more comfortable for the patient \cite{kelly2003imaging}. 

% Advantages and motivation for automatic vessel segmentation
Regardless of the imaging modality adopted in ophthalmic care, an accurate segmentation of the arteriovenous retinal tree is needed to support the clinical diagnosis and follow-up. However, manual segmentation is a tedious and time-consuming task, especially if the capillaries are needed. 
% Most works on fundus, in all types of techniques
Hence, extensive literature on vessel segmentation based on FP imaging \cite{srinidhi2017recent} has been published over the last two decades. Despite the number of procedures presented, conventional image processing has been replaced by convolutional neural network (CNN) based techniques over the last years. This is a trend observed not only in ophthalmic care but also in other medical fields of research. Therefore, priority has been given to CNN-based approaches in this work.

\subsection{State-of-the-art}

In \cite{wang2015hierarchical}, a CNN was used as feature extractor, combined with a random forest for classification. The architecture consisted of 6 layers, with input size 25$\times$25. The approach was validated in two public databases, 
%DRIVE and STARE
using $\sim$200k samples of each one. The input data was pre-processed. Similarly, \cite{fu2016retinal} proposed a CNN in combination with a conditional random field, with the goal of creating an architecture specific for retinal vessel segmentation. An opposing point of view is explored in~\cite{wu2016deep}, where the authors' goal was to find a method widely applicable to diverse vessel tracking problems, not necessarily in the retina. The CNN is refined using principal components analysis (PCA). The approach is based on N4-fields \cite{ganin2014n}, and it had a similar performance as the original N4-fields. In \cite{guo2018novel}, an ensemble of networks was proposed, each of which had 10 layers and 64$\times$64 inputs, without pre-processing. In \cite{liskowski2016segmenting}, RGB patches of 27$\times$27$\times$3 from three public datasets 
%DRIVE, STARE, and CHASE, %
were used. The training set consisted of 3 to 5 million patches, depending on the dataset. The authors analysed the effect of pooling, different types of pre-processing, and the use of data augmentation. They did not observe strong changes in these experiments, but they noticed an improvement when adding more patches on problematic/difficult regions. In \cite{oliveira2018retinal}, a new architecture was proposed. The authors applied both pre-processing and data augmentation. The data augmentation either did not improve the performance of the model or led to a change in the dropout values or oversampling. The patch size was 88$\times$88, and three public databases 
%and DRIVE, STARE and CHASE datasets 
were used, taking between 2750 and 3750 patches per image depending on the image size in each dataset. In \cite{melinvsvcak2015retinal}, a 10-layer architecture was proposed, without any pre-processing besides extracting the green channel.
Some authors have also used pre-trained networks. In~\cite{maninis2016deep}, a pre-trained VGG was used, combining ideas from the Inception architecture to use feature maps of different sizes. In~\cite{jiang2018retinal}, another transfer learning approach was proposed, based on the AlexNet architecture. The authors created patches of 50$\times$50, and then resized them to 500$\times$500 in order to enlarge the details. They applied pre-processing, and used more than 80000 patches for training. Moreover, they also focused on the post-processing to refine the outputs. In~\cite{mo2017multi}, another pre-trained VGG was tested, similar to \cite{maninis2016deep}. The validation was performed on three public datasets,  %also on DRIVE, STARE, and CHASE, %
and 5 to 10 patches were selected per image. %The authors performed tests removing certain branches of the network.\\
Finally, there are some approaches, such as \cite{girard2019joint}, that combined vessel segmentation with specific applications, such as artery/vein classification. For the segmentation part, a U-net architecture was used. A median filter was applied to each channel of each patch, and the result was concatenated to the input, so that each input was composed of six channels. Data augmentation was also used.

% Can the knowledge on fundus be transferred to SLO?
In contrast with the number of approaches that segmented blood vessels on FP, only one approach focused on SLO vessel segmentation was found in the literature. In \cite{meyer2017deep}, the authors used a U-net \cite{ronneberger2015u} trained on public datasets, 
%IOSTAR and RC-SLO, 
and they achieved good results with patches of size 128$\times$128. 

From this succinct literature review, it can be concluded that most of the authors obtained comparable results with a wide range of different approaches. Although some works study the effect of pre-processing or data augmentation, most authors proposed specific architectures. Only \cite{meyer2017deep} for SLO and \cite{girard2019joint} for FP used a state-of-the-art network for image segmentation, U-net. The reported number of patches ranges from fewer than ten patches per image up to a few million patches in total. Such variability has also been observed for the size of the patches. Regarding validation, all the reviewed works used at least one of the publicly available datasets listed in Table~\ref{tab:datasets}, making the results of different approaches comparable. %These datasets have the advantage of making the results of different approaches comparable. 

\begin{table}[h!]
    \centering
    \caption{Publicly available datasets used in this work displayed by name, modality, number of images (\# Im.), image size, and field-of-view (FoV). }
    \label{tab:datasets}
    \begin{scriptsize}
    \begin{tabular}{c|c|c|c|c}
        Name      & Modality & \# Im. & Image size & FoV  \\\hline
        DRIVE \cite{staal2004ridge} & FP  & 40 & 584$\times$565   & 45$^{\circ}$ \\
        %STARE \cite{hoover2000locating,hoover2003locating}      & FP  & 30 & 700$\times$605   & 35$^{\circ}$ \\
        STARE \cite{hoover2000locating}      & FP  & 30 & 700$\times$605   & 35$^{\circ}$ \\
        HRF \cite{budai2013robust}       & FP  & 45 & 3304$\times$2336 & 60$^{\circ}$ \\
        CHASE\_DB1 \cite{fraz2012ensemble} & FP  & 28 & 1280$\times$960  & 30$^{\circ}$ \\
        %,abbasi2015biologically
        IOSTAR \cite{zhang2016robust}    & SLO & 30 & 1024$\times$1024 & 45$^{\circ}$ \\
        RC-SLO \cite{zhang2016robust} & SLO & 40 & 360$\times$320   & 45$^{\circ}$ \\
    \end{tabular}
    \end{scriptsize}
\end{table}

%Given the observed variability in terms of CNN architectures published over the last years, it is difficult to conclude on a robust procedure that could be easily implemented and generalized to different imaging modalities in clinical practice, such as FP and SLO. Therefore, the goal of this paper is two-fold: first, to systematically analyse a state-of-art deep learning segmentation approach, the U-net, studying the effect of the number of patches and respective size, whereas quantifying the accuracy, sensitivity, and specificity in each of the public available databases. It is the aim of this work to verify if a U-net could be easily implemented in the clinical practice, without pre-processing, post-processing, or data augmentation, and still be able to provide accurate results. The second goal is to observe if the knowledge of such architecture trained on FP is transferable to SLO, or vice-versa, and hence infer if there are significant differences depending on the imaging modality. 

%Given the variability in CNN architectures and parameters proposed over the last years, it is difficult to conclude on a general procedure that could be easily applied to different imaging modalities. The goal of this paper is two-fold: first, to propose a basic retinal vessel segmentation pipeline using a state-of-art architecture, U-net, analysing the influence that the parameters patch size and number of patches have in the output. Taking into account the findings in the literature, the pipeline was kept as simple as possible, without pre-processing, post-processing, or data augmentation. The second goal is to investigate the transferability of information between \emph{en face} retinal imaging modalities in vessel segmentation. This is motivated by the wide range of approaches and larger datasets available on FP, while other \emph{en face} imaging modalities, such as SLO, have seen less development.

%The literature review also illustrates that, while a large number of approaches have been developed for retinal vessel segmentation on FP, other \emph{en-face} modalities have not received as much attention. Thus, 


The main motivation for this work is to study whether a model trained on one of the modalities, FP or SLO, can be used to segment the other accurately. This goal is tackled in two steps. First, a review of the existing approaches for vessel segmentation was performed in order to establish a basic pipeline. Consequently, a state-of-the-art segmentation architecture (U-net) was used, and the influence of two parameters, patch size and number of patches, was analysed. Taking into account the reviewed state of the art, the pipeline was kept as simple as possible, without pre-processing, post-processing, or data augmentation. In the second part of this work, the pipeline was used to investigate the transferability of information between \emph{en-face} retinal imaging modalities in vessel segmentation. The results for each modality individually, as well as the cross-modal evaluation, are presented and discussed.


\section{Methods}
Figure~\ref{fig:pipeline} depicts the pipeline adopted in this work. A fixed CNN architecture, U-net, with the Adam optimizer (learning rate 0.001) and 20\% dropout, was trained for 1000 epochs. The U-net is a state-of-the-art architecture for medical image segmentation problems, and it gets its name from its arrangement in two branches: first, a contracting path, which applies a sequence of two 3$\times$3 convolutions, ReLU, and 2$\times$2 max pooling operations. The second branch is an expansive path, which consists of a sequence of upsampling, 2$\times$2 convolution, two 3$\times$3 convolutions, and ReLU operations. Additionally, there are connections between both branches of the network, %as shown in Fig. \ref{fig:unet},
in order to incorporate part of the feature maps from the contracting path in the computation of the expansive path. The last layer of the network performs a 1$\times$1 convolution operation to map the components of the feature vectors into the desired number of classes. The CNN has a total of 23 convolutional layers.

\begin{figure}[h!]
    \centering
        \includegraphics[width=0.6\textwidth]{img/pipeline_vessel_segmentation2.png}
    \caption{Schematic of the pipeline used in this work. N relates to the number of patches per image depending on the ratio between image and patch size.}
    \label{fig:pipeline}
\end{figure}

%\begin{figure}[h!]
%    \centering
%        \includegraphics[width=0.7\textwidth]{img/Unet_retina.eps}
%    \caption{Architecture of the CNN U-net used in this work.}
%    \label{fig:unet}
%\end{figure}

The Dice score \cite{havaei2017brain} was used as a loss function for training the U-net CNN. %The Dice score  is computed as:
%\begin{equation}
    %DICE = \frac{2|y \cap \^{y}|}{|y| + |\^{y}|}
%\end{equation}
%with $y$ the ground truth (binary mask with the vessel as positive class and the remaining pixels as the negative class), and $\^{y}$ the values predicted by the model. 
This loss function was selected due to the nature of the labels, which are highly imbalanced, as the vessels represent only a small portion of the pixels in each image. The Dice score is expected to improve the sensitivity of the model, which is usually the least optimal parameter reported in literature \cite{srinidhi2017recent}. Different patch sizes were evaluated: 32, 64, 128, and 256. The smallest size was selected based on the literature review~\cite{wang2015hierarchical,liskowski2016segmenting}, and the largest size according to the smallest image within the datasets included (RC-SLO dataset). No pre-processing was considered, and only the green channel of the FP images, the most informative for vessel segmentation~\cite{ramlugun2012small,staal2004ridge}, was used% (Figure \ref{fig:channels}) 
. No data augmentation was applied, as some previous works reported minimal improvement with the inclusion of augmentation~\cite{liskowski2016segmenting,oliveira2018retinal} and accurate results without augmentation~\cite{wang2015hierarchical,jiang2018retinal}.

%\begin{figure}[h!]
%    \centering
%        \begin{tabular}{c@{\hskip 0in}c@{\hskip 0in}c@{\hskip 0in}c}
%            \includegraphics[width=0.12\textwidth]{img/channel_all.png} &
%            \includegraphics[width=0.12\textwidth]{img/channel_red.png} & 
%            \includegraphics[width=0.12\textwidth]{img/channel_green.png} & 
%            \includegraphics[width=0.12\textwidth]{img/channel_blue.png} \\
%        \end{tabular}
%    \caption{From left to right: fundus image from DRIVE dataset and each of the three RGB channels: red, green, and blue.}
%    \label{fig:channels}
%\end{figure}

In this work, six public datasets have been used, four in FP and two in SLO (see Table~\ref{tab:datasets}). All of the datasets have at least one manual segmentation available, and different image characteristics. For each dataset, 70\% of the images were used for training, 20\% for validation, and a final 10\% for testing. The division was done at the image level instead of at the patch level in order not to include patches from a given image in both training and validation sets, as some of them would overlap and bias the outcome. The specific number of patches per image depended on the image size and on the patch size, and it was computed as $\bigl((\mathit{img}_x \times \mathit{img}_y) / (\mathit{patch}_x \times \mathit{patch}_y)\bigr) \times N$, where $(\mathit{img}_x \times \mathit{img}_y) / (\mathit{patch}_x \times \mathit{patch}_y)$ is the theoretical non-overlapping maximum number of patches that could be obtained per image. The factor $N$ took the values 1, 10, and 20. The batch size was as large as the available memory allowed for a specific patch size: 128, 64, 32, and 16, for the smallest to largest patch size respectively. For the test set, each pixel in the predicted image was classified into one of four categories: true positive, true negative, false positive, and false negative. Then, the accuracy, sensitivity, specificity, and Dice score were obtained. % as:
%\begin{equation}
%    accuracy = \frac{TPos+TNeg}{TPos+FPos+TNeg+FNeg}
%\end{equation}
%\begin{equation}
%    sensitivity = \frac{TPos}{TPos+FNeg}
%\end{equation}
%\begin{equation}
%    specificity = \frac{TNeg}{TNeg+FPos}
%\end{equation}

%These three quality measures are the most commonly reported in previous works, so they are the most appropriate for comparison purposes.\\

\section{Results}
The inter-observer agreement was computed in the datasets that have several manual annotations available per image, in order to establish the maximum expected values for the evaluation metrics. The values of sensitivity, specificity, accuracy, and Dice score were 0.81, 0.98, 0.96, and 0.79 for DRIVE; 0.64, 0.99, 0.95, and 0.74 for STARE; and 0.80, 0.98, 0.97, and 0.78 for CHASE\_DB1. It can be observed that sensitivity and Dice score are quite low in comparison to accuracy and specificity, which is consistent with the outcomes reported in the literature for automated segmentation methods (Appendix~\ref{sec:appC}, Table~\ref{tab:literature}). These results are also justified by the nature of the data, as the most difficult part (for both manual and automated approaches) is to label the capillaries correctly.

%\begin{table}[h!]
%    \centering
%    \caption{Inter-observer agreement for the datasets that provide two manual segmentations.}
%    \label{tab:agreement}
%    \begin{small}
%    \begin{tabular}{c|c|c|c}
%        Dataset       & Sensitivity  & Specificity & Accuracy \\ \hline
%        DRIVE         & 0.81 & 0.97 & 0.95 \\
%        STARE         & 0.64 & 0.99 & 0.94 \\
%        CHASE\_DB1    & 0.80 & 0.97 & 0.96 \\
%    \end{tabular}
%    \end{small}
%\end{table}

%The performance of the U-net trained on each dataset for different patch sizes and respective number of patches, are depicted in Table \ref{tab:individual1sens}. %, \ref{tab:individual10sens}, and \ref{tab:individual20}. 
The complete results for the U-net trained on each dataset using different patch sizes and number of patches are reported in Table~\ref{tab:individual1sens}. A smaller $N$ resulted in lower values for all the measures in most of the datasets. Only CHASE\_DB1 and HRF datasets obtained a sensitivity comparable to the state of the art. Although it was not the case for all datasets, a slight improvement was observed for $N=20$ compared to $N=10$ in terms of sensitivity, specificity, and accuracy, while the Dice score remained mostly unchanged. The results have also shown that larger patches provide better results regardless of the dataset. Among all metrics, sensitivity and Dice score were the most affected by the patch size.%, presenting lower values for small patches configuration. Figure \ref{fig:individual20} (left) illustrates the accuracy, sensitivity, and accuracy as a function of the patch size, for a fixed N value (N = 20). The results for different N values with a fixed patch size (256$\times$256) are depicted in Figure \ref{fig:individual20} (right).%\ref{fig:individualpatches}.\\

\begin{table}[h!]
    \centering
    \caption{Sensitivity, specificity, accuracy, and Dice score of each individual dataset for each $\#$ patches N and patch size. The highest values for each dataset are highlighted.}
    \label{tab:individual1sens}
    \begin{scriptsize}
    \begin{tabular}{c@{\hskip 0.1cm}c@{\hskip 0.5cm}cccc@{\hskip 0.5cm}cccc@{\hskip 0.5cm}cccc}
        \multirow{2}{*}{Dataset} & \multirow{2}{*}{Size} & \multicolumn{4}{c}{N = 1} & \multicolumn{4}{c}{N = 10} & \multicolumn{4}{c}{N = 20} \\[0.1cm]
                   &  & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice \\
        \rowcolor{Gray1}DRIVE      & 256  & 0.53          & 0.98          & 0.93          & \textbf{0.00} & 0.89          & \textbf{0.93} & \textbf{0.93} & \textbf{0.72} & 0.93          & \textbf{0.92} & \textbf{0.92} & \textbf{0.71} \\ 
        \rowcolor{Gray1}           & 128  & \textbf{0.63} & 0.97          & \textbf{0.94} & \textbf{0.00} & \textbf{0.92} & 0.92          & 0.92          & 0.68          & \textbf{0.96} & 0.91          & 0.91          & 0.67 \\ 
        \rowcolor{Gray1}           & 64   &  0.48          & 0.98          & 0.93          & \textbf{0.00} & 0.86          & 0.90          & 0.91          & 0.58          & 0.90          & 0.88          & 0.90          & 0.57 \\
        \rowcolor{Gray1}           & 32   &  0.00          & \textbf{1.00} & 0.91          & \textbf{0.00} & 0.66          & 0.90          & 0.91          & 0.42          & 0.68          & 0.88          & 0.90          & 0.41 \\
				   
                  %& size & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice \\				   
        \rowcolor{Gray2}STARE      & 256  & 0.04          & \textbf{1.00} & \textbf{0.96} & \textbf{0.00} & \textbf{0.82} & \textbf{0.97} & \textbf{0.96} & \textbf{0.69} & 0.78          & \textbf{0.97} & \textbf{0.95} & \textbf{0.69} \\
        \rowcolor{Gray2}           & 128  & \textbf{0.23} & \textbf{1.00} & \textbf{0.96} & \textbf{0.00} & 0.71          & \textbf{0.97} & \textbf{0.96} & 0.62          & \textbf{0.80} & 0.95          & 0.94          & 0.61 \\  
        \rowcolor{Gray2}           & 64   & 0.13          & \textbf{1.00} & \textbf{0.96} & \textbf{0.00} & 0.56          & \textbf{0.97} & \textbf{0.96} & 0.45          & 0.60          & 0.96          & \textbf{0.95} & 0.45 \\
        \rowcolor{Gray2}           & 32   & 0.01          & \textbf{1.00} & \textbf{0.96} & \textbf{0.00} & 0.36          & \textbf{0.97} & \textbf{0.96} & 0.29          & 0.41          & 0.94          & 0.94          & 0.29 \\
				   				   
                  %& size & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice \\
        \rowcolor{Gray1}HRF        & 256  & \textbf{0.76} & 0.94          & 0.93          & \textbf{0.61} & \textbf{0.87} & 0.92          & \textbf{0.93} & \textbf{0.62} & \textbf{0.89} & \textbf{0.92} & \textbf{0.93} & \textbf{0.60} \\
        \rowcolor{Gray1}           & 128  & 0.63          & \textbf{0.95} & \textbf{0.94} & 0.50          & 0.73          & \textbf{0.93} & \textbf{0.93} & 0.52          & 0.76          & 0.91          & 0.92          & 0.49 \\ 
        \rowcolor{Gray1}           & 64   & 0.38          & \textbf{0.95} & \textbf{0.94} & 0.34          & 0.47          & 0.92          & 0.92          & 0.34          & 0.51          & 0.91          & 0.92          & 0.33 \\ 
        \rowcolor{Gray1}           & 32   & 0.23          & 0.92          & 0.92          & 0.17          & 0.29          & 0.88          & 0.89          & 0.20          & 0.31          & 0.87          & 0.89          & 0.20 \\                   

                  %& size & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice \\
        \rowcolor{Gray2}CHASE      & 256  & \textbf{0.62} & \textbf{0.98} & \textbf{0.95} & 0.00          & \textbf{0.87} & \textbf{0.95} & \textbf{0.95} & \textbf{0.74} & \textbf{0.89} & \textbf{0.95} & \textbf{0.95} & \textbf{0.74} \\   
        \rowcolor{Gray2}DB1        & 128  & 0.57          & 0.97          & 0.94          & 0.02          & 0.76          & \textbf{0.95} & \textbf{0.95} & 0.59          & 0.79          & \textbf{0.95} & \textbf{0.95} & 0.59 \\  
        \rowcolor{Gray2}           & 64   & 0.21          & \textbf{0.98} & \textbf{0.95} & \textbf{0.29} & 0.57          & \textbf{0.95} & \textbf{0.95} & 0.42          & 0.57          & 0.94          & 0.94          & 0.41 \\  
        \rowcolor{Gray2}           & 32   & 0.17          & \textbf{0.98} & \textbf{0.95} & 0.05          & 0.36          & 0.92          & 0.93          & 0.24          & 0.34          & 0.92          & 0.93          & 0.24 \\
				   
                  %& size & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice \\        
	\rowcolor{Gray1}	IOSTAR     & 256  & 0.09          & \textbf{1.00} & 0.89          & 0.00          & \textbf{0.89} & \textbf{0.96} & \textbf{0.96} & \textbf{0.80} & \textbf{0.92} & \textbf{0.96} & \textbf{0.96} & \textbf{0.79} \\
      \rowcolor{Gray1}             & 128  & \textbf{0.65} & 0.98          & \textbf{0.95} & 0.00          & 0.83          & \textbf{0.96} & 0.95          & 0.71          & 0.86          & 0.95          & 0.95          & 0.71 \\
      \rowcolor{Gray1}             & 64   & 0.31          & 0.99          & 0.93          & \textbf{0.32} & 0.67          & 0.95          & 0.95          & 0.52          & 0.68          & 0.94          & 0.95          & 0.52 \\
      \rowcolor{Gray1}             & 32   & 0.17          & 0.99          & 0.93          & 0.25          & 0.40          & 0.93          & 0.94          & 0.30          & 0.42          & 0.92          & 0.93          & 0.30 \\

                  %& size & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice          & Sens.         & Spec.         & Acc.          & Dice \\				   
        \rowcolor{Gray2}RC-SLO     & 256  & \textbf{0.00} & \textbf{1.00} & 0.90          & \textbf{0.00} & \textbf{0.88} & 0.98          & \textbf{0.97} & \textbf{0.82} & \textbf{0.91} & \textbf{0.97} & \textbf{0.97} & \textbf{0.83} \\
        \rowcolor{Gray2}           & 128  & \textbf{0.00} & \textbf{1.00} & 0.88          & \textbf{0.00} & 0.84          & 0.98          & 0.96          & 0.81          & \textbf{0.91} & \textbf{0.97} & \textbf{0.97} & 0.79 \\
	\rowcolor{Gray2}			   & 64   & \textbf{0.00} & \textbf{1.00} & 0.90          & \textbf{0.00} & 0.57          & \textbf{0.99} & 0.95          & 0.63          & 0.83          & 0.96          & 0.96          & 0.67 \\
	\rowcolor{Gray2}			   & 32   & \textbf{0.00} & \textbf{1.00} & \textbf{0.91} & \textbf{0.00} & 0.48          & 0.96          & 0.96          & 0.41          & 0.52          & 0.95          & 0.95          & 0.40 \\  
    \end{tabular}
    \end{scriptsize}
\end{table}

%\begin{figure}[h!]
%    \centering
%    \includegraphics[height=2.1in]{img/individual_datasets_plot.eps} 
%    \caption{Accuracy, sensitivity, and specificity as a function of the patch size for N = 20 in each individual dataset. The dashed line represents the average of the inter-observer agreement.}
%    \label{fig:individual20}
%\end{figure}

%\begin{figure}[h!]
%    \centering
%    \includegraphics[height=2.1in]{img/individual_datasets_plot_npatches.eps} 
%    \caption{Accuracy, sensitivity, and specificity as function of the number of patches for a patch size of 256$\times$256 in each individual dataset. The dashed line represents the average of the inter-observer agreement.}
%    \label{fig:individualpatches}
%\end{figure}

The configuration that obtained the best results overall (patch size = 256$\times$256 and N = 20) was used for training and testing on the same dataset individually. The comparison between the predicted image for each dataset and the respective ground-truth is shown in Figure \ref{fig:datasets} (left). Figure \ref{fig:datasets} (right) depicts how the automatic segmentation is affected by varying the patch size. 

%The results show that the same pipeline can be applied to both FP and SLO datasets, and still achieve an accurate performance if compared with previously published approaches (see Tables \ref{tab:literature} and \ref{tab:individual1sens}). %\ref{tab:individual20}). \\

\begin{figure}[h!]
    \centering
    \begin{tabular}{c@{ }c@{ }c@{ }c@{ }c@{ }c@{\hskip 1.0cm}c}
         \includegraphics[height=0.47in]{img/DRIVE_256_img_orig.png} &
         \includegraphics[height=0.47in]{img/STARE_256_img_orig.png} &
         \includegraphics[height=0.47in]{img/HRF_256_img_orig.png} &
         \includegraphics[height=0.47in]{img/CHASE_256_img_orig.png} &
         \includegraphics[height=0.47in]{img/IOSTAR_256_img_orig.png} &
         \includegraphics[height=0.47in]{img/RCSLO_256_img_orig.png} &%\\
         \includegraphics[height=0.47in]{img/RCSLO_128_img_overlap.png} \\
         
         \includegraphics[height=0.47in]{img/DRIVE_256_img_lbl.png} &
         \includegraphics[height=0.47in]{img/STARE_256_img_lbl.png} &
         \includegraphics[height=0.47in]{img/HRF_256_img_lbl.png} &
         \includegraphics[height=0.47in]{img/CHASE_256_img_lbl.png} &
         \includegraphics[height=0.47in]{img/IOSTAR_256_img_lbl.png} &
         \includegraphics[height=0.47in]{img/RCSLO_256_img_lbl.png} &%\\
         \includegraphics[height=0.47in]{img/RCSLO_64_img_overlap.png} \\
         
         \includegraphics[height=0.47in]{img/DRIVE_256_img_pred.png} &
         \includegraphics[height=0.47in]{img/STARE_256_img_pred.png} &
         \includegraphics[height=0.47in]{img/HRF_256_img_pred.png} &
         \includegraphics[height=0.47in]{img/CHASE_256_img_pred.png} &
         \includegraphics[height=0.47in]{img/IOSTAR_256_img_pred.png} &
         \includegraphics[height=0.47in]{img/RCSLO_256_img_pred.png} &%\\
         \includegraphics[height=0.47in]{img/RCSLO_32_img_overlap.png} \\
    \end{tabular}
    \caption{Left: retinal images (top), ground truth (middle), and output of the network (bottom), for datasets DRIVE, STARE, HRF, CHASE\_DB1, IOSTAR, and RC-SLO. Right: evolution of true positives (blue), false positives (green) and false negatives (red) for patch size 128, 64, and 32 on the same RC-SLO image.}
    \label{fig:datasets}
\end{figure}

In order to observe if the results were transferable from one imaging modality to another, networks were trained and tested combining all the datasets of the same modality, as depicted in Table \ref{tab:combined}. Given that the number of patches available for the FP modality is more than 3 times higher than the number of SLO patches, the number of patches sampled per image was lowered on the FP dataset to the same size as the SLO dataset. This way, the number of images for training was the same for both imaging modalities, avoiding a potential bias due to the different number of patches. As a reference, the values for training and testing on the same type of image are also depicted. It can be observed that the outcome is the same as in Table \ref{tab:individual1sens}, and the configuration with the largest patch size led to the best performance. Regarding the training and testing in different modalities, it was observed that training in SLO and testing in FP yields significantly lower results, showing poor sensitivity and Dice. However, training in FP and testing in SLO yields accurate results, comparable to those obtained training and testing in the same modality.

\begin{table}[h!]
    \centering
    \caption{Sensitivity, specificity, accuracy, and Dice score for training and testing in the same and different modalities. All datasets were merged by modality. The highest values per dataset are highlighted in bold.}
    \label{tab:combined}
    \begin{scriptsize}
    \begin{tabular}{c@{\hskip 0.3cm}c@{\hskip 0.5cm}cccc@{\hskip 0.5cm}cccc}
		Train & \multirow{2}{*}{Size} & \multicolumn{4}{c}{Test set FP}  & \multicolumn{4}{c}{Test set SLO} \\
        set &   &  Sens.   & Spec.   & Acc.      & Dice &  Sens.   & Spec.   & Acc.      & Dice \\[0.1cm]
		  
        \rowcolor{Gray2}FP    & 256  & \textbf{0.93} & \textbf{0.90} & 0.91          & \textbf{0.61} & \textbf{0.94} & \textbf{0.93} & \textbf{0.93} & \textbf{0.71} \\
        \rowcolor{Gray2}      & 128  & 0.83          & \textbf{0.90} & \textbf{0.91} & 0.50          & 0.91          & 0.90          & 0.91          & 0.62 \\ 
        \rowcolor{Gray2}      & 64   & 0.62          & 0.89          & 0.90          & 0.37          & 0.74          & 0.90          & 0.91          & 0.48 \\ 
        \rowcolor{Gray2}      & 32   & 0.41          & 0.84          & 0.86          & 0.23          & 0.47          & 0.79          & 0.82          & 0.27 \\%[0.1cm]
			 
        \rowcolor{Gray1}SLO   & 256  & \textbf{0.41} & \textbf{0.98} & \textbf{0.93} & \textbf{0.42} & \textbf{0.92} & \textbf{0.96} & \textbf{0.96} & \textbf{0.79} \\   
        \rowcolor{Gray1}      & 128  & 0.33          & \textbf{0.98} & \textbf{0.93} & 0.32          & 0.88          & 0.95          & 0.95          & 0.71          \\  
        \rowcolor{Gray1}      & 64   & 0.22          & \textbf{0.98} & \textbf{0.93} & 0.16          & 0.73          & 0.94          & 0.94          & 0.54          \\
        \rowcolor{Gray1}      & 32   & 0.15          & 0.97          & \textbf{0.93} & 0.11          & 0.45          & 0.92          & 0.93          & 0.32          \\ 
    \end{tabular}
    \end{scriptsize}
\end{table}


\section{Discussion}
Automated segmentation has been a subject of study by the image processing community for quite some time, and the number of works based on deep learning has grown exponentially over the last decade, pushing the boundaries of what was possible in the domain of digital image processing. Challenging problems are now being solved with substantially better performance compared to traditional methods. This trend has also reached medical image processing, including ophthalmic research. Automated retinal vessel segmentation on FP has substantially improved since the introduction of CNNs. Besides the boost in performance, promising results have been shown in cross-training (training the network in one dataset and testing in a second dataset, with high variability in the image characteristics between both \cite{wang2015hierarchical,jiang2018retinal}). In addition, the fact that these approaches are very fast (e.g.\ less than 1 second to process an image \cite{girard2019joint}) makes them suitable for a real-time processing environment. Besides FP, good results have also been reported in SLO \cite{meyer2017deep}. However, despite all research performed so far, it is difficult to identify an approach across the current options of architectures, components, pre- and post-processing, among other aspects that may influence the results, that could be easily applied and used in the clinical practice. Thus, in this work, a few guidelines that serve as a baseline for retinal vessel segmentation have been established. 

In this work, it is shown that a simple CNN such as the U-net is good enough to replicate the current results in the literature. These findings are in line with previous works \cite{isensee2018no}, which argue that a correctly tuned U-Net, also with a large patch size and Dice score, can outperform more tailor-made approaches in brain segmentation. While this work does not show if data augmentation, pre-processing, or post-processing could improve the results, the obtained accuracy, sensitivity, and specificity are on par with the values reported in the state-of-the-art. Moreover, the obtained values are also in the same range as the inter-observer agreement, and can thus be taken as a theoretical maximum. The results show that the larger the input patch size, the better. However, the depth of the CNN is fixed in this work, and the smaller patches may suffer from issues at the deeper levels, such as insufficient resolution or border issues when down- and up-sampling. The size 256$\times$256 provided the best results across all datasets. This also implies that a small number of images is enough to feed the network, as the largest patch sizes have been associated with a lower number of images. However, the number of patches must be sufficient, otherwise a dramatic drop in the sensitivity will occur, in agreement with the findings in \cite{oliveira2018retinal}. While the overall accuracy is barely affected, this metric must be handled carefully due to the imbalance of the labels. Overall, the main problem in all the reported methods is the sensitivity, as the capillaries tend to be ignored by the segmentation. Hence, this value should always be reported and carefully compared between methods. An interesting finding is that the results achieved for IOSTAR and RC-SLO in Table \ref{tab:individual1sens} have a higher sensitivity than that previously reported by \cite{meyer2017deep} in Table \ref{tab:literature}, despite both approaches using a U-net. 
One of the key differences that may be causing the variation in the results is the choice of loss function, which in \cite{meyer2017deep} was the cross-entropy. In this work, Dice score was used, which emphasizes the weight of the true positives.% and, hence, the higher sensitivity is expected. 

While many approaches have been proposed for FP, other imaging modalities, such as SLO, have not received that much attention. One of the causes for this lack of interest is the absence of public labeled datasets. Nevertheless, it is shown that the network trained in FP still provides an accurate segmentation on SLO data, but the opposite seems to not be true. Such results may be justified by the characteristics of both datasets. The FP datasets are more varied and have more pathological data. The difference in sharpness may also justify the performance between modalities. The fact that imaged vessels in FP may be less sharp than in SLO may make the algorithm more robust to different data. As future work, augmentations in SLO imaging should be considered to infer whether a model trained on SLO could eventually be applicable to FP. Lastly, one should also consider the hypothesis that the green channel of the FP may be more informative than the infrared image acquired with SLO for segmenting the retinal vasculature.



\section{Conclusion}
In this work, it is shown that a state-of-the-art network, such as U-net, can be trained without pre-processing and augmentation and still perform as well as a manual grader, as long as a large patch size and enough images are used to train the CNN. The knowledge obtained from training on different modalities, such as FP or SLO, is transferable, but its sensitivity depends on the modality used to train the network. In this study, it is shown that, despite all its simplicity, a colour fundus photograph appears to be much more informative for training a network than an image obtained from SLO. Hence, despite the lack of manual annotations on SLO images, coloured fundus photographs can be used to develop new and better networks with potential applicability in SLO imaging.

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by the European research and innovation programme Horizon 2020 (grant agreement No 780989: Multi-modal, multi-scale retinal imaging project).}

\bibliography{sanchezbrea20}


\appendix

%\input{appendix.tex}

\clearpage
\section{Summary of state-of-the-art results on the public datasets}\label{sec:appC}

Table \ref{tab:literature} summarizes the results reported by previous approaches on retinal vessel segmentation that propose a CNN, using (at least one of) sensitivity, specificity, and accuracy on the public datasets listed in Table \ref{tab:datasets}.

\begin{table}[h!]
    \centering
    \caption{Values reported in previous works for deep learning-based vessel segmentation approaches. Best values for each dataset highlighted in bold.}
    \label{tab:literature}
    \begin{small}
    \begin{tabular}{c|c|c|c|c}
        Work & Dataset  & Sensitivity  & Specificity & Accuracy \\ \hline
        \multirow{2}{*}{\cite{wang2015hierarchical}} & DRIVE & 0.8173 & 0.9733 & \textbf{0.9767} \\
        & STARE & 0.8104 & 0.9791 & \textbf{0.9813} \\[3pt]
        
        \multirow{2}{*}{\cite{fu2016retinal}} & DRIVE & 0.7294 & - & 0.9470\\
                             & STARE & 0.7140 & - & 0.9545\\[3pt]

        %\cite{wu2016deep} & DRIVE & - - - \\ % Reports only AUC
        %\cite{maninis2016deep} & DRIVE &  \\ % Reports only DICE
        %& STARE \\
        
        % Values for the best model (chosen by the authors in the discussion)
        \multirow{2}{*}{\cite{liskowski2016segmenting}} & DRIVE & - & - & 0.9535 \\ 
                                       & STARE & -      & -      & 0.9729 \\[3pt]
                                       
        \multirow{2}{*}{\cite{guo2018novel}}            & DRIVE & \textbf{0.9859} & 0.7046 & 0.9613 \\
                                       & STARE & \textbf{0.9861} & 0.5628 & 0.9539 \\[3pt]
                                       
        \multirow{4}{*}{\cite{jiang2018retinal}}        & DRIVE & 0.7540 & \textbf{0.9825} & 0.9624 \\
                                       & STARE & 0.8352 & 0.9846 & 0.9734 \\
                                       & CHASE\_DB1 & \textbf{0.8640} & 0.9745 & \textbf{0.9668} \\
                                       & HRF   & \textbf{0.8010} & \textbf{0.8010} & \textbf{0.9650} \\[3pt]
                                       
        \cite{girard2019joint}         & DRIVE & -      & -      & 0.9493 \\[3pt]
        
        \multirow{3}{*}{\cite{oliveira2018retinal}}     & DRIVE & 0.8405 & 0.9814 & 0.9639 \\
                                       & STARE & 0.6329 & \textbf{0.9924} & 0.9365 \\
                                       & CHASE\_DB1 & 0.7731 & 0.9813 & 0.9600 \\[3pt]
                                       
        \cite{melinvsvcak2015retinal}  & DRIVE & 0.7276 & -      & 0.9466 \\[3pt]
        
        \multirow{3}{*}{\cite{mo2017multi}}             & DRIVE & 0.7779 & 0.9780 & 0.9521 \\
                                       & STARE & 0.8147 & 0.9844 & 0.9674 \\
                                       & CHASE\_DB1 & 0.7661 & \textbf{0.9816} & 0.9599 \\[3pt]
        
        \multirow{2}{*}{\cite{meyer2017deep}} & IOSTAR & \textbf{0.8038} & \textbf{0.9801} & \textbf{0.9695} \\
        & RC-SLO & \textbf{0.8090} & \textbf{0.9794} & \textbf{0.9623} \\
    \end{tabular}
    \end{small}
\end{table}


\end{document}
