\documentclass{midl} % Include author names
%\documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlryear{2021}
\jmlrworkshop{Full Paper -- MIDL 2021}
\editors{Under Review for MIDL 2021}
\usepackage{wrapfig}
\usepackage{multirow}

\title[ttUNETCrops-ttAttNet]{Image Sequence Analysis via GRU and Attention for Trachomatous Trichiasis Classification}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Juan C. Prieto\nametag{$^{1}$}} \Email{jprieto@med.unc.edu}\\
\addr $^{1}$ Department of Psychiatry, UNC Chapel Hill, NC \\
\Name{Hina Shah\nametag{$^{1}$}} \Email{hinashah@email.unc.edu}\\
\Name{Kasey Jones\nametag{$^{2}$}} \Email{krjones@rti.org}\\
\addr $^{2}$ Center for Data Science, RTI International, Research Triangle Park, NC \\
\Name{Robert F. Chew\nametag{$^{2}$}} \Email{rchew@rti.org}\\
\Name{Hashiya M. Kana\nametag{$^{3}$}} \Email{hashiya@email.unc.edu}\\
\addr $^{3}$ Department of Epidemiology, UNC Chapel Hill, NC \\
\Name{Jerusha Weaver\nametag{$^{3}$}} \Email{jweave25@email.unc.edu}\\
\Name{Rebecca M. Flueckiger\nametag{$^{4}$}} \Email{rflueckiger@rti.org}\\
\addr $^{4}$ Global Health Division, RTI International, Research Triangle Park, NC \\
\Name{Scott McPherson\nametag{$^{4}$}} \Email{smcpherson@rti.org}\\
\Name{Emily W. Gower\nametag{$^{3,4}$}} \Email{egower@unc.edu}\\
}

\begin{document}

\maketitle

\begin{abstract}
Trachoma, caused by the bacterium \emph{Chlamydia trachomatis}, is an infectious ocular condition that can cause the eyelid to turn inward so that one or more eyelashes touch the eyeball, a condition called trachomatous trichiasis (TT), which can lead to blindness. Community-based screeners are used in rural areas to identify patients with TT, who can then be referred for proper medical care. Having automatic methods to detect TT will reduce the amount of time required to train screeners and improve accuracy of detection. This paper proposes a method to automatically identify regions of an eye and identify TT, using photographs taken with smartphones in the field. The attention-based gated deep learning networks in combination with a region-identification network can identify TT with an accuracy of 91\%, sensitivity of 92\% and specificity of 87\%, showing that these methods have the potential to be deployed in the field.
\end{abstract}

\begin{keywords}
Trachoma, Trachomatous Trichiasis, image classification, attention-based networks
\end{keywords}

\section{Introduction}
Trachoma is the leading infectious cause of blindness world-wide\cite{resnikoff2002global, flueckiger2019global}. It is transmitted from person to person 
through ocular and nasal secretions or eye-seeking flies carrying the infection from one person to another. 
Repeated infections result in eyelid scarring, causing it to turn inwards and the eyelashes to touch the eye, a condition called trachomatous trichiasis (TT). 
If not corrected, inturned eyelashes can abrade the cornea, which can lead to blindness \cite{west2006association}. Surgery is the primary method for treating TT. 

The majority of individuals with TT live in low- and middle-income countries where access to health services is limited.  Surgical campaigns are conducted to provide surgery through outreach services. To facilitate planning and align resources, individuals with TT are often identified in advance and invited to the upcoming surgical camp. Current methods for TT case identification are suboptimal. Often, local community members are asked to serve as case screeners in their communities. These case screeners receive a brief training on how to identify TT and then are asked to go door-to-door in their village to screen for TT. These case screeners often have limited success at identifying TT appropriately, with positive predictive values for case identification ranging from 15-30\%\cite{greene2015assessment}. The time and cost associated with door-to-door case finding is substantial\cite{west2013close}, and  engaging case screeners with ophthalmology expertise to conduct this work is cost and resource-prohibitive in many scenarios.

% The vast majority of eyelids with trichiasis have some conjunctival scarring. However, 2.4\% of the cases do not have easily visible scars that should be considered 
% for trichiasis surgery\cite{gower2020trachomatous}. The 
% number of lashes touching the globe (five or more) is also used as diagnostic criteria for 
% surgery, while patients with minor TT are treated via epilation\cite{bowman2002longitudinal}. In both cases, patients should be remitted to the closest medical center for treatment. 

% One of the biggest challenges to eradicate TT is convincing patients to accept surgery. 
% While investing in developing surgical teams is the highest priority, doing so without ensuring patient uptake of surgery would be wasteful. A study found that training and deploying community-based screeners appears to be a very effective method to 
% increase patient uptake\cite{habtamu2015clearing} \cite{bowman2002longitudinal_a}. 

In this paper, we propose a method to close the gap between experts and non-experts in assessing TT by increasing screening accuracy, and reducing the time required for training community-based screeners. Previous work has demonstrated that early signs of trachoma that present on the underside of the eyelid can be detected better than chance\cite{kim2019sensitivity}. However, to our knowledge, we are the first to classify trichiatic eyelashes, a later stage of the disease, using machine learning. We identify TT cases through attention-based recurrent neural networks using automatically segmented regions of the eye. We chose this method based on experiments performed on our data set as well as additional constraints of computational cost for the algorithm, (\textit{i.e.}, it must be possible to run on a mobile device in remote communities without reliable internet connectivity).

\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{images/examples_tt.png}
\vspace{-5mm}
\caption{Examples of eyes with moderate TT (left two) and severe TT.}
\label{fig:examples_tt}
\end{figure} 

\section{Data}

We use images that were collected as part of the Maximizing Trichiasis Surgery Success Trial (MTSS)\cite{bayissasse2020maximizing} in Ethiopia. The MTSS trial was designed to evaluate three surgical approaches for correcting TT. A total of 5001 individuals with previously unoperated TT were enrolled and followed for one year.  Individuals were required to have TT in at least one eye. At each study visit, images were taken of each upper eyelid, with the participant looking up, using either a Motorola Moto X Pure edition or a Samsung Galaxy 8 smartphone. After imaging, a trained study examiner evaluated each eyelid for the presence of TT, defined as one or more eyelashes touching the eye or evidence of eyelash removal through epilation. The examiner recorded the number and location of any trichiatic eyelashes in the study database. Images were transmitted to the coordinating center at University of North Carolina at Chapel Hill, where they were assessed by a certified photograph grader. For the work in this paper the grader marked images with moderate or severe TT (based on number of eyelashes touching the eye, but no epilation) as having TT, and the eyes without TT as normal/healthy. Figure \ref{fig:examples_tt} shows examples of moderate and severe TT. 
In addition to images of eyes with TT, we collected upper eyelid images of 1,121 adults without TT, from the same region in Ethiopia. The combined dataset has 6,048 images. From these, our grader marked the sclera, cornea and upper eyelid regions of the eye for 1,113 images using a custom extension to 3D Slicer\footnote{\url{www.slicer.org}} \cite{Kikinis2014}\footnote{\url{https://slicer.readthedocs.io/en/latest/user_guide/modules/segmenteditor.html}}. Figure \ref{fig:eye_seg} shows examples of the expertly segmented regions of the eye. A set of 1,706 images with good quality were also identified for the task of classification.

% We use images that were collected as part of the Maximizing Trichiasis Surgery Success Trial (MTSS)\cite{bayissasse2020maximizing} in Ethiopia. The MTSS trial was designed to evaluate three surgical approaches for correcting TT. A total of 5001 individuals with previously unoperated TT were enrolled and followed for one year.  Individuals were required to have TT in at least one eye, and approximately 50\% had TT in both eyelids. At each study visit, images were taken of each upper eyelid, with the participant looking up, using either a Motorola Moto X Pure edition or a Samsung Galaxy 8 smartphone. After imaging, a trained study examiner evaluated each eyelid for the presence of TT, defined as one or more eyelashes touching the eye or evidence of eyelash removal through epilation. The examiner recorded the number and location of any trichiatic eyelashes and this information was entered into the study database. Images were transmitted to the coordinating center at University of North Carolina at Chapel Hill, where they were assessed by a certified photograph grader. The number of eyelashes and extent of epilation define the TT severity, which is categorized as mild, moderate or severe using a standardized system. Figure \ref{fig:examples_tt} shows examples of moderate and severe TT. 
% All eyes with TT received surgery and then were assessed at three time points within a year after surgery. Most eyelids did not have TT return, and thus could be categorized as normal/healthy eyelids for the current analytic purposes. In addition to images of eyes with TT, we collected upper eyelid images of 1,121 adults without TT, from the same region in Ethiopia. The combined dataset has 11,617 images.

% In this work, we focus on images of upper eyelid with good quality, and that are graded as either normal or having moderate or severe TT without epilation. From a total of 8,089
% our expert photograph grader selected 
% a subset of 1,661 images (657 normal, 493 moderate TT and 511 severe TT) to train our classification algorithms.
% Additionally, our grader marked the sclera, cornea and upper eyelid regions of the eye for 1,113 using a custom extension to 3D Slicer\footnote{\url{www.slicer.org}} \cite{Kikinis2014}\footnote{\url{https://slicer.readthedocs.io/en/latest/user_guide/modules/segmenteditor.html}}. Figure \ref{fig:eye_seg} shows examples of the expertly segmented regions of the eye.
% Although these are slightly different sets of images, we do have an overlap between them, and our consistent test data of 308 images has both manual segmentations and TT grading. These images
% are left out for testing our approach (73\% for training and \%27 for testing the UNET segmentation; 82\% for training 18\% for testing classification).
% The validation set (10\%) is randomly chosen from the training set during training of each network. 
% We use the validation set to stop the training using the early-stopping criteria and avoid over/under-fitting. 

\begin{figure}[h]
\centering
\subfigure[Normal eye]{
    \includegraphics[width=0.225\textwidth]{images/eye_normal.png}
    \includegraphics[width=0.225\textwidth]{images/eye_normal_seg.png}}
\subfigure[Severe TT]{
    \includegraphics[width=0.225\textwidth]{images/eye_sev3.png}
    \includegraphics[width=0.225\textwidth]{images/eye_sev3_seg.png}}
\vspace{-2.5mm}
\caption{Training samples and corresponding label maps generated by an expert rater.
The sclera, cornea, and upper eyelid are shown in red, green and blue respectively.}
\label{fig:eye_seg}
\end{figure} 


\section{Related work}
Machine learning is becoming an increasingly popular means to screen for eye diseases and disorders. Although certain eye conditions, such as diabetic retinopathy \cite{asiri2019deep} and glaucoma \cite{barros2020machine}, have received considerable attention in the literature, far less progress has been made on Neglected Tropical Diseases (NTDs), such as trachoma, despite TT impacting an estimated 2.8 million people worldwide \cite{flueckiger2019global}. \cite{kim2019sensitivity} performed a multi-step process to distinguish images of trachomatous inflammation from normal eye images. The models were trained on images taken of participants' everted upper eyelids, labeled as follicular inflammation (TF), intense inflammation (TI), or normal using the WHO simplified trachoma assessment system\cite{thylefors1987simple}. They used a multi-layer perceptron classifier to identify eyelid pixels followed by a shallow convolutional network to detect trachoma. They found the less severe TF designation to be more difficult to identify consistently than TI.

Examples of detecting regions of the eye are seen in \cite{rot2018deep} and \cite{bingnan2019shape}. While \cite{bingnan2019shape} identify the pupil and sclera regions of the eye in widely varying images using a shape constrained network, \cite{rot2018deep}'s method labels six different regions of the eye including the eyelashes using an encoder-decoder scheme. These methods a) are trained on images of healthy eyes, and b) do not identify the eyelid region, which our method needs for TT identification.

In general, a neural network (NN) can be trained either from scratch, or by adding a few additional layers (Convolution, RNN, dense, etc.) to pretrained models to customize for the classification or regression task. However, it is known
that using pre-trained models instead of training models from scratch leads to equal or superior performance \cite{zhou2017fine}.
In our experiments, we use pre-trained weights (on the ImageNet\cite{deng2009imagenet} data set) from
VGG19\cite{simonyan2014very}, ResNet50 \cite{he2016deep}, and MobileNetV2 \cite{sandler2018mobilenetv2} NN architectures. 
The pre-trained weights from VGG19 have been used to develop state-of-the-art classification
\cite{su2015multi,kanezaki2018rotationnet,ma2018learning,cho2014properties}. 

\section{Methods}

In this section we describe our analysis pipeline to detect if an eye has TT or not. Our initial experiments to do binary classification using labeled TT and non-TT images directly were unable to perform well, despite fine tuning several models. Hence, we focused on a procedure by using areas around the upper eyelid (where human experts also diagnose TT according to WHO protocols). As such, our final pipeline includes a segmentation, extraction, and a concluding classification step. 

We first train a residual UNET\cite{ronneberger2015u,zhang2018road} to identify cornea, sclera and upper eyelid regions of the eye in both healthy and TT-affected eyes. The segmented upper eyelid region is then used to extract a sequence of upper eyelid images at full resolution to train a TT classifier (ttAttNet). We first describe our pre-processing and then explain ttAttNet. 

Our pre-processing pipeline (ttUNETCrops) is shown in Figure \ref{fig:stacknet}.
The main components are the segmentation of the eye's regions of interest (ROI) via UNET, 
and forming an image sequence by cropping image regions around the upper eyelid.
We use the segmentation network to locate our region of interest (i.e. upper eyelid) and focus the analysis
on this region. By doing so, we can perform the analysis at maximum resolution.
The sequence of image crops is created by fitting a curve in the upper eyelid ROI, 
sampling it uniformly over the interval $[x_{\min}, x_{\max}]$, and extracting 32 contiguous cropped images around the eyelid ROI.
\begin{figure}[h]
\centering
\subfigure[ttUNETCrops]{
    \includegraphics[width=0.55\textwidth]{images/STACKNET-TT.png} 
    \label{fig:stacknet}
}
\subfigure[ttAttNet]{
    \includegraphics[width=0.4\textwidth]{images/GANET-TT.png}
    \label{fig:ganet}
}
\caption{ a) Image sequence creation at full image resolution. The segmentation produces three labels for the sclera (red), cornea (green), and upper eyelid (blue). The upper eyelid ROI is used to fit a smooth curve or polynomial (red line)
which is used to guide the extraction of crops and form an image sequence. The sequence is analyzed by (ttAttNet). b) Feature extraction (FE) (VGG19, ResNet50 and MobileNetV2) for each frame in the input sequence. Analysis using Bi-directional Recurrent Neural Network (RNN) with Gated Recurrent Units (GRU) and attention layers. The attention layers allow focusing on relevant frames of the sequence.}
\end{figure}

Figure \ref{fig:ganet} shows the sequence analysis pipeline (ttAttNet). ttAttNet
uses a
bi-directional recurrent neural network (RNN) with Gated Recurrent Units (GRU)\cite{cho2014properties},
and Bahdanau additive attention layers\cite{bahdanau2014neural,bahdanau2016end}.
The attention layers allow focusing on specific frames of the input sequence by computing weights that highlight the importance of certain frames
and reduce the contribution of others for the final network decision. 
Moreover, these weights may be used for visually explaining predictions to stakeholders or for additional analysis at a later stage.

We start by processing each frame of the sequence with a feature extraction and 
a global average pooling layer in order to extract salient
features. Thus, we reduce each frame in the sequence (384x384 pixels) to a vector of (512 for VGG19, 2048 for ResNet50, 1280 for MobileNetV2) features.
Recurrent neural networks (RNN) with long short-term memory (LSTM)\cite{hochreiter1997long} or GRU units can then analyze this sequence of 1-dimensional features. GRUs have been reported to have a slightly better performance than
LSTM. LSTM networks also have more parameters and take longer to train
\cite{fu2016using,khandelwal2016comparing}. 

We hypothesize that using images at lower resolution is not ideal 
for TT classification since TT classification often requires detecting a few eyelashes pointing 
in the wrong direction. In other words, these fine image details may be 
lost if the images are downsampled, which is a common pre-processing step to use state-of-the-art 
neural network architectures for image classification. Customizing existing NN architectures
to handle images at larger resolution is possible, but it may not be computationally efficient. Instead, 
in this paper, we propose to use existing architectures for image analysis and focus only on the upper eyelid region using image patches 
at the highest resolution level. 

We compare the performance of our approach (ttUNETCrops+ttAttNet) with various other approaches. For comparison we use:
\begin{enumerate}
\item Re-sampled images at 512x512, and VGG19, ResNet50, and MobileNetV2 as feature extraction (output size of feature extraction is 16x16x512, 16x16x2048, 16x16x1280 respectively).
\begin{enumerate}
\item[1.1] ttVGG19$_{512}$: Features from VGG19 and training an additional 2DConv layer (filter size 3x3, stride 2x2, 512 units), and 3 fully connected (FC) layers (the 3 FC layers are part of the original VGG19 architecture).
\item[1.2] ttResNet50$_{512}$ - ttMobileNetV2$_{512}$: Features from ResNet50/MobileNetV2 and an additional 2DConv (filter size 3x3, stride 2x2, 2048/1280 units) and a FC layer.
\end{enumerate}
\item Our approach, which re-samples an input image to 512x512, segments the image to locate the upper eyelid and extracts a sequence of 32 images of size 384x384
at full resolution (ttUNETCrops), followed by ttAttNet which analyzes the sequence of features extracted with VGG19, ResNet50, and MobileNetV2 (output size of feature extraction is 1x512, 1x2048, 1x1280 as we use a global average pooling layer on each). The ttAttNet architecture is the same for all experiments as is shown in Figure \ref{fig:ganet}.
\end{enumerate}

\section{Results}

We select a set of 1,706 images (996 TT, 709 non-TT) with good quality around the eyelid region for classification. Our grader marked the sclera, cornea and upper eyelid regions of the eye for 1,113 images. These are somewhat different sets of images, with an overlap of 455 images that have both manual segmentations and TT grading. From this overlap set we randomly chose 308 images to test our approach. In summary, we use 73\% of the images for training and 27\% for testing the UNET segmentation, and 82\% for training and 18\% for testing the classification. A validation set is randomly chosen (10\%) from the training set during training of each network. We use the validation set to stop the training using the early-stopping criteria and avoid over/under-fitting. 

\subsection{ttUNETCrops}

The residual UNET was trained using the images segmented by our expert grader as ground truth. 
The network was trained for 124 epochs with a learning rate of $10^{-4}$, a dropout rate 
of 0.15, the Adam optimizer, and a categorical cross-entropy loss function to discriminate between 4 classes (background, sclera, cornea, and upper eyelid).
\begin{figure}[h]
\centering
    \includegraphics[width=0.16\textwidth]{images/seg0.png}
    \includegraphics[width=0.16\textwidth]{images/seg1.png}
    \includegraphics[width=0.16\textwidth]{images/seg2.png}
    \includegraphics[width=0.16\textwidth]{images/seg3.png}
    \includegraphics[width=0.16\textwidth]{images/seg4.png}
    \includegraphics[width=0.16\textwidth]{images/seg5.png}
    \vspace{-5mm}
    \caption{Segmentation results for randomly chosen samples in our data set. The fitted curve guides the extraction of contiguous image crops.}
    \label{fig:eye_seg_eval}
\end{figure}
%               precision    recall  f1-score   support

%          0.0       0.99      0.98      0.99  68515941
%          1.0       0.85      0.80      0.83   4233897
%          2.0       0.85      0.91      0.88   3997300
%          3.0       0.70      0.73      0.71   3993214

%     accuracy                           0.96  80740352
%   macro avg       0.85      0.86      0.85  80740352
% weighted avg       0.96      0.96      0.96  80740352
\begin{table}
\begin{center}
\begin{tabular}{ |l|c|c|c|c| } 
\hline
Label & Precision & Recall & F1-score & Dice\\ 
\hline
\hline
Background & 0.99 & 0.98 & 0.99 & 0.98 \\ 
\hline
Sclera & 0.85 & 0.80 & 0.83 & 0.82\\ 
\hline
Cornea & 0.85 & 0.91 & 0.88 & 0.88\\ 
\hline
Upper Eyelid & 0.70 & 0.73 & 0.71 & 0.71\\
\hline
\end{tabular}
\end{center}
\caption{Accuracy of segmentation task}
\label{tab:seg}
\end{table}
Table \ref{tab:seg} shows the results for the image segmentation task. 
% Precision = What proportion of positive identifications was actually correct?
% Recall = What proportion of actual positives was identified correctly?
The dice score for the eyelid, which is the most important ROI for 
our experiment, is 0.71.
This is an acceptable outcome for the next phase of the analysis and it will be demonstrated by our classification results.
Figure \ref{fig:eye_seg_eval} shows the segmentation plus polynomial fitting for randomly chosen samples in our data set.

\subsection{ttAttNet}

In our experiments, we trained different neural networks to evaluate the performance
of the classification task. Table \ref{tab:class_results} shows the result of the classification task for all the networks evaluated. ttAttNet$_{VGG19}$, ttAttNet$_{ResNet50}$ and ttAttNet$_{MobileNetV2}$ were trained for 4, 7, and 5 epochs, respectively. The learning rate was set to $10^{-4}$, and we used the Adam optimizer with a categorical cross-entropy loss function to discriminate between normal and TT. 
% True positive rate or sensitivity: [0.99258071 0.87027597 0.93278774 0.80517576]
% True negative rate or specificity: [0.94421953 0.97162501 0.96942165 0.98167387]
% Positive predictive value or precision: [0.85573047 0.91090131 0.91046071 0.9360831 ]
% Negative predictive value: [0.99738765 0.95739208 0.97741127 0.93795102]
% False positive rate or fall out [0.05578047 0.02837499 0.03057835 0.01832613]
% False negative rate: [0.00741929 0.12972403 0.06721226 0.19482424]
% False discovery rate: [0.14426953 0.08909869 0.08953929 0.0639169 ]
% Overall accuracy: [0.95630982 0.94628775 0.96026318 0.93754934]

%                   precision    recall  f1-score   support
%          0.0       0.99      0.99      0.99 140703417
%          1.0       0.88      0.87      0.88   7991580
%          2.0       0.90      0.93      0.92   7843227
%          3.0       0.85      0.81      0.83   7826064

%     accuracy                           0.97 164364288
%   macro avg       0.91      0.90      0.90 164364288
% weighted avg       0.97      0.97      0.97 164364288

%               precision    recall  f1-score   support

%           1       0.88      0.87      0.88   7991580
%           2       0.90      0.93      0.92   7843227
%           3       0.85      0.81      0.83   7826064

%   micro avg       0.88      0.87      0.87  23660871
%   macro avg       0.88      0.87      0.87  23660871
% weighted avg       0.88      0.87      0.87  23660871


% We use similar architectures 
% Since we are using VGG19 as feature extraction tool, we use similar architectures in our experiments, \textit{i.e.}, 
% for our first experiment (ttVGG19$_{256}$), we train the remaining decision layers of VGG19 which consist 
% of two dense(4096) and a dense(2) layer. 

% In our second experiment (ttVGG19$_{512}$), we train a 2DConv(512, kernel=(3,3), stride=(2,2)
% layer plus the Dense layers described above. 

% In our approach ttAttNet, we train the bi-directional RNN with GRUs(512), two Attention(1024) layers, 
% and a Dense(2) layer. 

% For all of our experiments we train using a 10-fold cross-validation approach (90\% of the data for training and leaving
% 10\% for testing each fold).
% The learning rate is set to $1e^{-4}$, we use the Adam optimizer, and 
% a binary cross entropy loss function to discriminate between two classes (normal v.s tt). See Appendix \ref{sec:appendix_epoch} for the number 
% of epochs set for the cross-validation training. 
% \begin{table}
% \begin{center}
% \begin{tabular}{ |c|c|c| } 
% \hline
% NN & Train & Number of parameters\\ 
% \hline
% VGG19 & - & 20,024,384  \\ 
% \hline
% \multirow{2}{4em}{256x256} & 2 x Dense(4096) + Dense(2) & \multirow{2}{4em}{151,003,136} \\ 
% & & \\
% \hline
% \multirow{2}{4em}{512x512} & Con2D(512, kernel=(3, 3), stride=(2,2)), 2 x Dense(4096) + Dense(2) & \multirow{2}{4em}{153,362,432} \\
% & & \\
% \hline
% Full res & STACKNET + GANET & 3,545,840 + 6,303,746 = \\
% & & 9,849,586 \\ 
% \hline
% \end{tabular}
% \end{center}
% \label{tab:params}
% \caption{Number of parameters for each NN architecture and training parameters for each network.}
% \end{table}
% \begin{table}
% \begin{center}
% \begin{tabular}{ |c|c|c|c|c|c| } 
% \hline
% NN & class & Precision & Recall & F1-score & Accuracy\\
% \hline
% \hline
% \multirow{2}{6em}{ttVGG19$_{256}$} & normal & 0.74 & 0.72 & 0.73 & \multirow{2}{4em}{0.78}\\ 
%                           & tt     & 0.81 & 0.83 & 0.82 & \\
% \hline
% \multirow{2}{6em}{ttVGG19$_{512}$} & normal & 0.78 & 0.76 & 0.77 & \multirow{2}{4em}{0.82}\\ 
%                           & tt     & 0.84 & 0.85 & 0.85 & \\
% \hline
% \multirow{2}{10em}{ttAttNet$_{VGG19}$} & normal & 0.84 & 0.82 & 0.83 & \multirow{2}{4em}{0.87}\\ 
%                  & tt & 0.88 &  0.89 & 0.89 & \\
% \hline
% \multirow{2}{10em}{ttAttNet$_{RESNET50}$} & normal & 0.87 & 0.83 & 0.85 & \multirow{2}{4em}{0.88}\\ 
%                  & tt & 0.88 &  0.91 & 0.90 & \\
% \hline
% \multirow{2}{10em}{ttAttNet$_{MobileNetV2}$} & normal & 0.75 & 0.69 & 0.72 & \multirow{2}{4em}{0.77}\\ 
%                  & tt & 0.78 &  0.83 & 0.80 & \\
% \hline
% \end{tabular}
% \end{center}
% \label{tab:class_results}
% \caption{Classification results using images at different resolution levels}
% \end{table}
\begin{table}
\begin{center}
\begin{tabular}{ |c|c|c|c|c|c|c| } 
\hline
NN & input & class & precision & recall & f1-score & accuracy\\
\hline
\hline
%VGG19 - early stopping 4 epochs
%               precision    recall  f1-score   support

%           0       0.84      0.72      0.78       128
%           1       0.82      0.91      0.86       180

%     accuracy                           0.83       308
%   macro avg       0.83      0.81      0.82       308
% weighted avg       0.83      0.83      0.83       308
\multirow{2}{8.5em}{ttVGG19$_{512}$} & \multirow{2}{5em}{16x16x512}    & normal & 0.84 & 0.72 & 0.78 & \multirow{2}{1.5em}{0.83}\\ 
                                     &                                                & tt     & 0.81 & 0.91 & 0.86 & \\
\hline
%RESNET - THIS MODEL TRAINED FOR 21 epochs using early stopping
%               precision    recall  f1-score   support

%           0       0.90      0.73      0.81       134
%           1       0.82      0.94      0.87       174

%     accuracy                           0.85       308
%   macro avg       0.86      0.83      0.84       308
% weighted avg       0.85      0.85      0.84       308
\multirow{2}{8.5em}{ttResNet50$_{512}$} & \multirow{2}{5em}{16x16x2048}  & normal & 0.90 & 0.73 & 0.81 & \multirow{2}{1.5em}{0.85}\\ 
                                         &                                              & tt & 0.82 & 0.94 & 0.87 & \\
\hline
%MOBILENETV2
%               precision    recall  f1-score   support

%           0       0.87      0.70      0.78       136
%           1       0.79      0.92      0.85       172

%     accuracy                           0.82       308
%   macro avg       0.83      0.81      0.81       308
% weighted avg       0.83      0.82      0.82       308
\multirow{2}{8.5em}{ttMobileNetV2$_{512}$} & \multirow{2}{5em}{16x16x1280}   & normal & 0.87 & 0.70 & 0.78 & \multirow{2}{1.5em}{0.82}\\ 
                                            &                                               & tt & 0.79 & 0.92 & 0.85 & \\
\hline
%%%% ttStack + VGG19 + ttGNET
%%% 4 epochs
%               precision    recall  f1-score   support

%           0       0.78      0.84      0.81       101
%           1       0.92      0.88      0.90       207

%     accuracy                           0.87       308
%   macro avg       0.85      0.86      0.86       308
% weighted avg       0.87      0.87      0.87       308
\multirow{2}{8.5em}{ttAttNet$_{VGG19}$}   & \multirow{2}{5em}{32x512} & normal & 0.78 & 0.84 & 0.81 & \multirow{2}{1.5em}{0.87}\\ 
                                           &                                             & tt & 0.92 &  0.88 & 0.90 & \\
\hline
%%%% ttStack + ResNet + ttGANEt using stopping criteria 
% it went for 8 epochs
%               precision    recall  f1-score   support

%           0       0.87      0.86      0.87       110
%           1       0.92      0.93      0.93       198

%     accuracy                           0.91       308
%   macro avg       0.90      0.90      0.90       308
% weighted avg       0.91      0.91      0.91       308
\multirow{2}{8.5em}{ttAttNet$_{ResNet50}$} & \multirow{2}{5em}{32x2048}  & normal & 0.87 & 0.86 & 0.87 & \multirow{2}{1.5em}{0.91}\\ 
                                            &                                              & tt & 0.92 &  0.93 & 0.93 & \\
\hline
%%%% ttStack + MobileNetV2 + ttGANET for 5 epochs
%               precision    recall  f1-score   support

%           0       0.83      0.81      0.82       113
%           1       0.89      0.91      0.90       195

%     accuracy                           0.87       308
%   macro avg       0.86      0.86      0.86       308
% weighted avg       0.87      0.87      0.87       308
\multirow{2}{8em}{ttAttNet$_{MobileNetV2}$} & \multirow{2}{5em}{32x1280}   & normal & 0.83 & 0.81 & 0.82 & \multirow{2}{1.5em}{0.87}\\ 
                                               &                                               & tt & 0.89 &  0.91 & 0.90 & \\
\hline
\end{tabular}
\end{center}
\caption{Classification results for different architectures and feature extraction methods. ttVGG19$_{512}$, ttResNet50$_{512}$ and ttMobileNetV2$_{512}$ use the features extracted by the respective networks from images re-sampled to 512x512. No pooling operation is applied. The ttAttNet networks use features extracted
for each of the 32 frames (384x384) in the sequence and a global average pooling.}
\label{tab:class_results}
\end{table}
Our approach of classifying a sequence of cropped images around the ROI of the upper eyelid outperforms the classification of full images, even if different feature extraction schemes are used, with ResNet giving an overall accuracy of 91\%. The weights computed by the attention layers may be used for further analysis or to visualize the most relevant frames.
Figure~\ref{fig:attention} shows the top 3 frames selected by the attention layer.
\begin{figure}[h]
\centering
    \includegraphics[width=\textwidth]{images/AttentionLayerOutputsTT.png}
    \vspace{-5mm}
    \caption{The weights produced by the attention layers may be used for visualization and validation. The figure shows the top 3 frames selected by the forward (magenta) and backward (cyan) attention layers. Both images show eyelashes pointing inwards towards the eye, indicating TT. Both examples were misclassified by the ttVGG19$_{512}$ NN.}
    \label{fig:attention}
\end{figure}

% THIS THE CLASSIFICATION FOR THE 256 re-sampled trained on VGG19
%               precision    recall  f1-score   support

%       normal       0.74      0.72      0.73       673
%           tt       0.81      0.83      0.82       987

%     accuracy                           0.78      1660
%   macro avg       0.77      0.77      0.77      1660
% weighted avg       0.78      0.78      0.78      1660

% THIS THE CLASSIFICATION FOR THE 512 re-sampled trained on VGG19
%               precision    recall  f1-score   support

%       normal       0.78      0.76      0.77       676
%           tt       0.84      0.85      0.85       984

%     accuracy                           0.82      1660
%   macro avg       0.81      0.81      0.81      1660
% weighted avg       0.81      0.82      0.81      1660

% THIS IS THE CLASSIFICATION FOR THE ttStack + VGG19 + gruatt
%               precision    recall  f1-score   support

%       normal       0.84      0.82      0.83       668
%           tt       0.88      0.89      0.89       992

%     accuracy                           0.87      1660
%   macro avg       0.86      0.86      0.86      1660
% weighted avg       0.87      0.87      0.87      1660

%THIS IS CLASSIFICATION WITH ttStack + RESNET50 + gruatt
%               precision    recall  f1-score   support

%       normal       0.87      0.83      0.85       691
%           tt       0.88      0.91      0.90       969

%     accuracy                           0.88      1660
%   macro avg       0.88      0.87      0.87      1660
% weighted avg       0.88      0.88      0.88      1660



% MARTIN TIPS
% computational load of vgg is not mission critical as it is easily fit in the device
% network can tell you how sure of the classification 
% segmentation is a separate step
% out of distribution means outlier, a.k.a, anomalies
% the fact that is unconvinced we cannot do anything about, this is a product that is exciting to our clinical collaborator and is a it is at the moment for field testing in the next month, its going to be in field tests
% add standard deviation for the folds

% Performance on the images on app
% VGG - 
% segmentation: 278 ms
% feature extraction: 8371 ms
% total: 22,270 ms

% Resnet:
% segmentation: 269ms
% feature extraction: 5004ms
% total: 14,714ms

% Mobilenet:
% segmentation: 270ms
% feature extraction: 691 ms
% total: 8601 ms

\section{Conclusions}

In this work, we have proposed an approach to analyze high-resolution images of eyes captured by commercial phones, and identify TT---an infectious disease of the eye. By focusing the analysis only on the region of interest, we have demonstrated
that we can achieve an f1-score of 0.93 using full resolution images vs.\ 0.87 when the images are re-sampled to 512x512. This pipeline is to be deployed on mobile devices; hence, there are computational constraints and the trained models need to be as lean as possible without compromising the accuracy of the classification. Re-sampling the images to 512x512, using state-of-the-art neural networks for feature extraction and training an additional few layers yields a higher number of parameters and lower accuracy than
the proposed approach. The experiments in this paper demonstrate that using image samples
at full resolution is needed to have the best possible accuracy for TT prediction, as it may come down to 
identifying a single eyelash pointing in the wrong direction. Section~\ref{sec:appendix_params} shows a summary of the number of parameters for each neural network used in this paper. The attention weights can be used to display which frames of the sequence contribute to the final decision of the pipeline.
The results from the attention modules used in ttAttNet indicate that anomalies in the eyelid are a strong indicator for classifying normal vs.\ TT images.
Down-sampling the images to enable other architectures results
in lower accuracy. During our analysis, we also found that image quality, resolution, and TT severity played a role in the accuracy of the classification. A standardized protocol for taking the photos is followed, which includes using similar quality cameras at the same distance from the patient. In the future, we will assess the robustness of our models when the protocol is not followed. Our future work will also create classification models to predict severity of TT, include mild cases of TT, as well as images with epilation.

This paper reports the first step in demonstrating that machine learning is an achievable approach to TT identification. The MTSS trial has demonstrated that smartphone-naïve individuals can be trained to navigate a simple app and take high-quality images of the human eyelid. Future steps will require integrating the methods described here into a smartphone-ready app and then assessing app functionality in the field. Ultimately, this line of work has the potential to impact the global approach to TT case finding by increasing screening accuracy and subsequently giving patients who need TT management access to sight-saving services.

% Our approach combines techniques for detection of the region of interest and analysis at full resolution level 
% while minimizing computational costs
% VGG19 & - & 20,024,384  \\ 
% \hline
% \multirow{2}{4em}{256x256} & 2 x Dense(4096) + Dense(2) & \multirow{2}{4em}{151,003,136} \\ 
% & & \\
% \hline
% \multirow{2}{4em}{512x512} & Con2D(512, kernel=(3, 3), stride=(2,2)), 2 x Dense(4096) + Dense(2) & \multirow{2}{4em}{153,362,432} \\
% & & \\
% \hline
% Full res & STACKNET + GANET & 3,545,840 + 6,303,746 = \\
% & & 9,849,586 \\ 

% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{ We are grateful to the many study participants in Ethiopia who allowed us to collect images of their eyelids. 
Additionally, we would like to thank RTI International and the National Eye Institute for the funding that made this work possible. }

\bibliography{prieto21}

\section{Appendix}
\subsection{Early stopping and number of epochs}
\label{sec:appendix_epoch}
The number of epochs for our experiments is set
based on the early stopping criteria, \textit{i.e.}, after
the validation loss stops improving.
The number of epochs for ttVGG19$_{512}$, ttResNet50$_{512}$ and ttMobileNetV2$_{512}$ is set to 4, 27 and 6, respectively.

\subsection{Number of parameters for each neural network}
\label{sec:appendix_params}
\begin{table}
\begin{center}
\begin{tabular}{ |l|c|c| } 
\hline
NN & Parameters & Total \\
\hline
\hline
VGG19$_{Fextraction}$ & 20,024,384 & - \\
\hline
ResNet50$_{Fextraction}$ & 23,587,712 & - \\
\hline
MobileNetV2$_{Fextraction}$ & 2,257,984 & - \\
\hline
\hline
ttVGG19$_{512}$  & 153,362,432 & 173,386,816 \\
ttResNet50$_{512}$ & 37,752,832 & 61,340,544 \\
ttMobileNetV2$_{512}$ & 14,748,160 & 17,006,144 \\
\hline
\hline
ttUNETCrop & 14,160,352 & - \\
\hline
\hline
ttAttNet$_{VGG19}$ & 6,303,748 & 40,488,484 \\
ttAttNet$_{ResNet50}$ & 11,022,340 & 48,770,404 \\
ttAttNet$_{MobileNetV2}$ & 8,663,044 & 25,081,380 \\
\hline
\end{tabular}
\end{center}
\caption{Number of parameters for each architecture. The total includes the parameters from the feature extraction + the additional layers trained for each architecture. For the ttAttNet rows, it includes the parameters from ttUNETCrop as well.}
\label{tab:number_of_params}
\end{table}



\end{document}