\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\newcommand{\invalid}[1]{{\color{red}#1}}

\usepackage{mwe} % to get dummy images
\usepackage{multirow} %for table
\usepackage{booktabs} %for table
\jmlrvolume{-- 26}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}

\title[Probabilistic Segmentation Models for Improved Glaucoma Diagnosis]{Leveraging Probabilistic Segmentation Models for Improved Glaucoma Diagnosis: A Clinical Pipeline Approach}

\midlauthor{\Name{Anna M. Wundram\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{anna.wundram@student.uni-tuebingen.de}\\
\Name{Paul Fischer\midlotherjointauthor\nametag{$^{1}$}} \Email{paul.fischer@uni-tuebingen.de}\\
\Name{Stephan Wunderlich\nametag{$^{1,2}$}} \Email{stephan.wunderlich@tum.de}\\
\Name{Hanna Faber\nametag{$^{3,4,5}$}} \Email{h.faber@uke.de}\\
\Name{Lisa M. Koch\nametag{$^{6,7}$}} \Email{lisa.koch@unibe.ch}\\
\Name{Philipp Berens\nametag{$^{1,6}$}} \Email{philipp.berens@uni-tuebingen.de}\\
\Name{Christian F. Baumgartner\nametag{$^{1,8}$}} \Email{christian.baumgartner@unilu.ch}\\
\addr $^{1}$ Cluster of Excellence -- Machine Learning for Science, University of Tübingen, Germany \\
\addr $^{2}$ Ludwig-Maximilians-University of Munich, Germany \\
\addr $^{3}$ University Clinic Hamburg, Germany\\
\addr $^{4}$ University Eye Clinic Tübingen, Germany\\
\addr $^{5}$ Moorfields Eye Hospital, London, UK\\
\addr $^{6}$ Hertie Institute for AI in Brain Health,  University of Tübingen, Germany \\
\addr $^{7}$ Department of Diabetes, Endocrinology, Nutritional Medicine and Metabolism UDEM, Inselspital, Bern University Hospital, University of Bern, Switzerland \\
\addr $^{8}$ Faculty of Health Sciences and Medicine, University of Lucerne, Switzerland \\
}

\def\*#1{\mathbf{#1}}
\usepackage{gensymb} % for \degree command

\begin{document}

\maketitle

\begin{abstract}
The accurate segmentation of the optic cup and disc in fundus images is essential for diagnostic processes such as glaucoma detection. The inherent ambiguity in locating these structures often poses a significant challenge, leading to potential misdiagnosis. To model such ambiguities, numerous probabilistic segmentation models have been proposed. In this paper, we investigate the integration of these probabilistic segmentation models into a multistage pipeline closely resembling clinical practice. Our findings indicate that leveraging the uncertainties provided by these models substantially enhances the quality of glaucoma diagnosis compared to relying on a single segmentation only. 
\end{abstract}

\section{Introduction}

Glaucoma is a chronic eye disease in which nerve fibers gradually degenerate, leading to damage in the optic nerve head. It is the second leading cause of blindness worldwide affecting one in ten people over the age of eighty~\cite{Sevastopolsky2017}. Diagnosis involves the exact localization of the optic disc and cup in the fundus image. Their shape, size, and relationship to each other are crucial disease markers. Therefore, automated segmentation of the cup and disc is an important step for computer-aided diagnosis frameworks. However, delineation of those structures, in particular the optic cup, is highly challenging and subject to large uncertainties with large disagreements even among experts.

In this paper, we demonstrate that probabilistic segmentation techniques are effective at capturing uncertainties in the segmentation of the optic disc and cup. Going further, we propose a method to propagate this uncertainty through a multi-stage diagnostic pipeline. Specifically, we propose a method for incorporating the uncertainty arising in an upstream step in the final downstream task, and show that it substantially improves classification performance with respect to a deterministic baseline\footnote{The code is available at \url{https://github.com/annawundram/glaucoma-diagnosis-pipeline}}.

Several models have been proposed for optic cup and disc segmentation over the past years~\cite{sevastopolsky2019stack,tulsani2021automated,rasheed2023rimnet}. However, these methods do not take into account the inherent uncertainties of this problem. A small number of works address expert disagreements. 
\citet{edupuganti2018automatic} incorporate multi-annotator information into learning the segmentation by weighing the loss for pixels depending on the agreement or disagreement of the annotators.
\citet{cheng2020probability} approximate the joint distribution of the input fundus image and the ground truth segmentation to regularize a U-Net segmentation network.
However, estimation of the segmentation uncertainty in optic cup and disc segmentation remains unexplored.

Most recent automated glaucoma diagnosis frameworks approach the problem with black box solutions~\cite{mirzania2021applications,Singh2022,FAN2023100233,de2023airogs}. While reaching high performance, end-to-end black box models may often not be clinically desirable. In practice, systems that model the individual steps of a diagnostic pipeline are more interpretable, easier to debug, and more likely to find clinical acceptance. Moreover, data shifts can be addressed by retraining individual components rather than the whole method. Indeed, clinical pipeline approaches have shown great promise in similar areas~\cite{fauw2018clinically}. A number of works propose to first segment the cup and disc and then extract the vertical or area \emph{cup-to-disc ratio} (CDR) as a diagnostic marker~\cite{al2018dense,sevastopolsky2019stack,bi2019automated,jiang2019jointrcnn,bian2020optic,neto2022evaluations,zhang2023multiple}. The vertical CDR measures the ratio between the diameter of the cup and the disc along a vertical line through the center. The area CDR measures the ratio of pixels belonging to the cup and disc respectively. However, clinical literature shows that the \emph{rim thickness curve} (RTC) (i.e. the distance between the disc and the cup for every point) may be a more informative measure~\cite{spaeth2002disc,kumar2019rim}. In this work, we follow these insights and propose a clinically inspired multi-stage pipeline based on initial probabilistic segmentation of the cup and disc, automated RTC extraction, followed by glaucoma classification (see Fig.~\ref{fig:overview}).

Uncertainty estimation is typically studied for individual deep learning models, and has been shown to yield improved performance and robustness on individual tasks~\cite{abdar2023uncertaintyfusenet,schmidt2023probabilistic}. However, when clinical workflows consist of tasks that are arranged in a cascading sequence -- as is the case here -- it becomes crucial to understand how uncertainty in certain stages impacts subsequent tasks. Few studies have focused on uncertainty propagation in clinical pipelines. \citet{eaton2018towards} propagate tumor segmentation uncertainty to tumor volume measurement using the variance sum law. \citet{mehta2019propagating,mehta2021propagating} propose to append variance maps obtained using MC Dropout~\cite{kendall2016bayesian} in a segmentation stage as an additional input channel to a subsequent tumor detection network. To our knowledge, these are the only studies showing that incorporating uncertainty can enhance downstream task performance. However, relying on variance maps instead of the full distribution limits the downstream task to neural networks, as the variance must be added as a channel. Recently, \citet{feiner2023propagation} and \citet{fischer2023uncertainty} proposed two related approaches for propagating uncertainty arising in MR reconstruction to subsequent classification, regression, or segmentation tasks. Both these approaches directly propagate samples from the conditional distribution through the pipeline rather than relying on the variance. Similar to those works, we propose a sampling-based uncertainty propagation approach. However, we go one step further by using uncertainty to \emph{enhance} downstream performance. This is achieved by marginalizing over possible outcomes in the highly uncertain segmentation stage.

In summary, our contributions are as follows:
\begin{enumerate}
    \item An interpretable pipeline for glaucoma diagnosis which incorporates clinical knowledge and closely mirrors clinicians' reasoning. 
    \item The first application of segmentation uncertainty quantification techniques to cup-and-disc segmentation in fundus images with a comparative analysis of four widely used techniques.
    \item A sampling-based approach for propagating uncertainty through a multi-stage pipeline as well as a strategy for leveraging this approach to enhance downstream performance.
    \item Introduction of RTC as a clinically motivated alternative to CDR in computer-aided glaucoma diagnosis pipelines and empirical verification of its superior performance. 
\end{enumerate}

\section{Methods}
\label{sec:methods}

\begin{figure}[t]
    \centering
    \includegraphics[width=\textwidth]{figures/method_1.png}
    \caption{\textbf{Proposed pipeline.} a) Automatic ROI extraction from the full fundus images. b) Sampling of possible segmentations using probabilistic segmentation. c) Extraction of rim thickness curves (RTCs) for each segmentation sample. d) Classification of each RTC. e) Marginalization over possible segmentations produces the final glaucoma probability.}
    \label{fig:overview}
\end{figure}

The proposed pipeline consists of five steps as illustrated in Fig.~\ref{fig:overview}. We deterministically extract the region of interest (ROI) containing the cup and disc to obtain the close-up images $\*x$. For the highly uncertain step of optic disc and cup segmentation, we use probabilistic segmentation to approximate the distribution $p(\*s|\*x)$ of the segmentations $\*s$, and we produce segmentation samples $\*s_i$. The samples (which capture the uncertainty) are propagated through a deterministic rim thickness extraction function $g: \*s_i \mapsto \*r_i$ producing RTCs $\*r_i$. Following this, a deterministic classification function $f: \*r_i \mapsto d_i$ takes the RTCs as input and produces a predicted diagnosis $d_i \in \{0,1\}$ for each $\*r_i$. Lastly, we marginalize over the samples to obtain a final probability $p(d|\*x)$ of the image being ``glaucoma suspect''. All of these steps will be introduced in detail below.

\subsection{Automated Region of Interest Detection}
\label{sec:ROI-extraction}

The region of interest (ROI) for glaucoma diagnosis only includes a small area of the fundus containing the optic nerve head, in particular the optic disc and cup (see Fig.~\ref{fig:overview}a). 
 
Prior work has shown that a two-step segmentation approach consisting of ROI extraction followed by segmentation can improve segmentation results~\cite{app9153064, Liu2021}. Following these works, we use a U-Net for cup and disc segmentation on full-view fundus images. Next, a padded square bounding box is placed around the segmentations, and the resulting ROI is cropped and resized to $320 \times 320$ pixels. All experiments in this paper were performed with ROI images $\*x$ obtained in this manner.

For all experiments, we use the improved U-Net first employed by \citet{NEURIPS2018_473447ac}, which operates on 7 rather than 5 resolution levels and uses bilinear upsampling instead of transposed convolutions. This choice ensures consistency with the Prob. U-Net~\cite{NEURIPS2018_473447ac} and PHiSeg~\cite{phiseg} baselines described in the next section.


\subsection{Probabilistic Optic Cup and Disc Segmentation}
\label{sec:segmentation}

The cup and disc segmentation step is characterized by large uncertainties and variability even among human experts. Therefore, segmentations resulting from a deterministic approach, such as a U-Net, may be insufficient. It may for example produce segmentations with a very thin rim in a subject where a thick and a thin rim are equally likely. The full distribution of possible segmentations $p(\*s|\*x)$ matching a given image $\*x$ contains valuable information for the downstream classification task. In recent years, several techniques have been proposed that allow this conditional probability distribution to be approximated. In this work, we compare four such techniques: the probabilistic U-Net~\cite{NEURIPS2018_473447ac}, PHiSeg~\cite{phiseg}, MC Dropout~\cite{kendall2016bayesian}, and ensembles~\cite{lakshminarayanan2017simple}.

The \textbf{probabilistic U-Net}~\cite{NEURIPS2018_473447ac} is a combination of the conditional VAE~\cite{sohn2015learning} approach with a U-Net architecture. \textbf{PHiSeg} further extends the idea by a hierarchical latent space and was shown to provide closer approximations of $p(\*s|\*x)$. Both techniques estimate the \emph{aleatoric} uncertainty.

\textbf{Ensembles} are implemented by training ten standard U-Nets with different random seeds (Ensemble$_{seeds}$). We additionally create an ensemble by training a U-Net for each of the 11 expert annotators in our data (Ensemble$_{experts}$). The widely used \textbf{MC Dropout} technique produces probabilistic segmentation samples by repeatedly predicting segmentations for the same image with dropout enabled. We use a dropout rate of 0.2 on the activation maps for training and testing. Dropout is applied to all layers except the final four segmentation layers. Ensembles and MC Dropout estimate \emph{epistemic} uncertainty. We refer the reader to \cite{abdar2021review} for definitions of aleatoric and epistemic uncertainty. 

We use the improved U-Net architecture first proposed in \citet{NEURIPS2018_473447ac} for all approaches with the exception of PHiSeg. PHiSeg employs the same U-Net encoder, but requires a specific decoder. Crucially, all examined probabilistic segmentation methods allow the generation of segmentation samples $\*s_i$ from the estimated distribution of $p(\*s|\*x)$.

\subsection{Rim Thickness Curve (RTC) Extraction}
\label{sec:RTC-extraction}

Next, we extract RTCs $\*r_i$ from segmentation samples $\*s_i$. Rim thickness is defined as the width of the rim between the borders of the optic cup and disc~\cite{spaeth2002disc,hwang2012glaucoma}. To compute the RTC, a beam centered at the optic cup is rotated by 360 degrees. At every half-degree interval, the points where the beam intersects with the borders of the optic disc and optic cup are determined. The rim thickness is calculated as the Euclidean distance between these two intersections. This process results in a data vector of length 720 for each segmentation sample $\*s_i$. The resulting RTCs were visualized as polar plots (see Figs.~\ref{fig:overview} and \ref{fig:rimthickness}). We denote the rim-thickness extraction procedure as a deterministic function $g: \*s_i \mapsto \*r_i$. A visual explanation is shown in Appendix~\ref{sec:appenix-rim-thickness}.

\subsection{Glaucoma Classification}
\label{sec:glaucoma-classification}

The RTCs $\*r_i$ are classified as ``glaucoma suspect'' or ``not glaucoma suspect''. We use a logistic regression classifier since, in preliminary experiments, more powerful classifiers such as SVMs or Random Forests did not lead to improvements. To prevent overfitting, the RTC data is reduced by grouping the values into 72 bins and calculating their mean. We train an individual classifier for the RTCs obtained from each of the examined segmentation methods. The optimal decision threshold for each classifier is obtained by maximizing the Youden index ($\text{sensitivity} + \text{specificity} - 1$) on the validation set. This results in a deterministic classifier $f: \*r_i \mapsto d_i$ that maps each rim thickness sample to a binary diagnosis.

\subsection{Uncertainty Estimation and Robust Classification}
\label{sec:marginalization}

The final probability $p(d|\*x)$ of an input image $\*x$ being ``glaucoma suspect'' is obtained by marginalizing over all possible segmentations, that is $p(d|\*x) = \int p(\*s|\*x) f(g(\*s)) d\*s$. We approximate this integral using the Monte Carlo method with samples from the respective segmentation techniques. We use 100 samples for all methods, except for ensembles which are limited by the number of networks trained. 

The probability $p(d|\*x)$ is a natural measure for uncertainty as it can be interpreted as the degree of agreement or disagreement among the predictions resulting from the segmentation samples. In that sense it is comparable to expert disagreement. We obtain the final robust prediction of our pipeline by thresholding the above probability at 0.5.

\section{Experiments and Results}

\subsection{Data}
We used two publicly available fundus image datasets for experiments. The \textbf{Chákṣu dataset}~\cite{kumar2023chakṣu} contains 1345 fundus images with cup and disc annotations by five experts for each image. Additionally, each expert also provided a diagnosis of ``glaucoma suspect'' or ``not glaucoma suspect''. The dataset also contains consensus segmentations obtained using the STAPLE algorithm~\cite{warfield2004simultaneous}. The \textbf{RIGA dataset}~\cite{RIGA} consists of 750 fundus images with cup and disc annotations by six experts for each image. In contrast to Chákṣu it contains no diagnosis labels, and was only used for training the segmentation networks in our study. 

We split the training portion of the Chákṣu data into a training and validation set according to an 80/20 split. We used the official test split of the Chákṣu data for all our evaluations. Additionally, we split the RIGA dataset into a training and a validation portion according to an 80/20 split.

\subsection{Training}

We trained the segmentation-based stages of our pipelines with combined RIGA and Chákṣu datasets (see Sections~\ref{sec:ROI-extraction} \& \ref{sec:segmentation}), and the classifiers using only the Chákṣu dataset which contains glaucoma suspect labels (see Section~\ref{sec:glaucoma-classification}). More details about the training and model selection can be found in Appendix~\ref{sec:training-details}.

\begin{table}[t]
\caption{\textbf{Quantitative results.} AUROC, sensitivity and specificity refer to the glaucoma classification task. Dice is computed between the mean predicted segmentation and the consensus ground truth segmentation. The correlation coefficient (CC) is computed between the mean pipeline prediction and the mean expert prediction over all test samples.}
\begin{center}
\vspace{-4mm}
\resizebox{\textwidth}{!}{%
\begin{tabular}{llccccccccc}
& & \multicolumn{2}{c}{AUROC} & \multicolumn{2}{c}{Sensitivity} & \multicolumn{2}{c}{Specificity} & \multicolumn{1}{c}{Dice} & \multicolumn{1}{c}{CC} & \\
\cmidrule(lr){3-4} \cmidrule(lr){5-6} \cmidrule(lr){7-8} \cmidrule(lr){9-9} \cmidrule(lr){10-10}
& Segm. source & \multicolumn{1}{c}{RTC} & \multicolumn{1}{c}{CDR} & \multicolumn{1}{c}{RTC} & \multicolumn{1}{c}{CDR} & \multicolumn{1}{c}{RTC} & \multicolumn{1}{c}{CDR} & & & \\
\toprule
{\rotatebox[origin=c]{90}{\textit{\footnotesize Det.}}} & U-Net & 0.857 & 0.888 & 0.701 & 0.771 & 0.838 & 0.845 & 0.919 & - & \\
\cmidrule(lr){2-10}
\multirow{6}{*}{\rotatebox[origin=c]{90}{\textit{\footnotesize Probabilistic}}} & Ensemble$_{experts}$ & 0.863 & \textbf{0.890} & 0.719 & 0.789 & 0.802 & 0.853 & 0.907 & 0.629 & \\
& Ensemble$_{seeds}$ & \textbf{0.899} & 0.886 & 0.824 & 0.736 & 0.792 & \textbf{0.888} & \textbf{0.926} & 0.639 & \\
& Prob. U-Net & 0.876 & 0.869 & \textbf{0.877} & 0.824 & 0.749 & 0.799 & 0.871 & 0.612 & \\
& PHiSeg & 0.884 & 0.882 & 0.807 & \textbf{0.842} & 0.741 & 0.752 & 0.900 & \textbf{0.653} & \\
& MC Dropout & 0.885 & 0.887 & 0.736 & 0.736 & \textbf{0.835} & 0.874 & 0.894 & 0.592 & \\
\cmidrule(lr){2-10}
& Expert Annotations & 0.957 & 0.930 & 0.921 & 0.883 & 0.884 & 0.866 & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \\
\midrule
& ResNet50 (black box) & \multicolumn{2}{c}{0.884} & \multicolumn{2}{c}{0.754} & \multicolumn{2}{c}{0.853} & \multicolumn{1}{c}{-} & \multicolumn{1}{c}{-} & \\
\bottomrule
\end{tabular}
}

\end{center}
\label{tab:results}
\end{table}

\subsection{Findings}


\textbf{Uncertainty quantification improves downstream predictions} \\ \vspace{-4mm}

\noindent In order to show that accounting for the uncertainty in the segmentation step leads to improved performance in downstream tasks, we included a deterministic U-Net for the cup and disc segmentation as an additional baseline. Again, we used the improved architecture proposed in \citet{NEURIPS2018_473447ac}. Moreover, we included a classifier trained directly on the expert annotations, as well as a black box ResNet50 network trained on the ROIs of the Chákṣu dataset to get a sense of the maximum achievable performance. 

We observed that using probabilistic segmentation techniques consistently led to improvements in the RTC-based downstream classification performance compared to the deterministic U-Net (Tab.~\ref{tab:results}), with Ensemble$_{seeds}$ achieving the highest overall AUROC score.

Notably, the black box ResNet50 model performed slightly worse than the best probabilistic pipeline approaches. This suggests that our interpretable approach has the potential to outperform black-box models in certain settings. We note that black box approaches may perform better in a data-richer setting~\cite{de2023airogs}.

Most methods showed similar performance in the segmentation task, as indicated by the Dice scores in Table~\ref{tab:results}. However, despite its high Dice score, the deterministic U-Net fell short on AUROC scores, indicating that accurate segmentation alone is insufficient, and that considering the entire probability distribution of the upstream task leads to improvements. \\ \vspace{-2mm}

\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{figures/stripplot-grid-alpha.png}
    \caption{\textbf{Mean expert disagreement versus mean pipeline prediction for all methods.} Values close to 1 or 0 indicate large agreement for ``glaucoma suspect'' or ``not glaucoma suspect'', respectively. Values close to 0.5 denote high disagreement.}  
    \label{fig:stripplots}
\end{figure}

\noindent \textbf{RTCs outperformed CDR for glaucoma diagnosis} \\ \vspace{-4mm}

\noindent In order to confirm the hypothesis that RTCs are better suited for glaucoma diagnosis than the widely used CDR, we additionally extracted the area CDR from all segmentations and trained an additional set of logistic regression classifiers on those values. We observed that the best AUROC scores were achieved with RTC, with particularly large improvements over CDR for the highly performing Ensemble$_{seeds}$. \\ \vspace{-2mm} 

\noindent \textbf{Propagated uncertainty correlates with expert disagreement} \\ \vspace{-4mm}

\noindent We additionally calculated Pearson's correlation coefficient (CC) between the mean expert prediction (i.e. all expert predictions averaged) and the mean pipeline prediction $p(d|\*x)$, reported in the last column of Tab.~\ref{tab:results}. PHiSeg achieved the highest CC with Ensemble$_{seeds}$ also performing very well. Visual inspection of the mean expert and pipeline predictions confirmed these findings (see Fig.~\ref{fig:stripplots}). This indicates that the propagated uncertainty correlates with the expert disagreement, and thus is an informative measure for prediction uncertainty. However, further improvements may be achieved in future work by specifically optimizing downstream calibration. \\ \vspace{-2mm}

\noindent \textbf{Qualitative analysis shows good segmentation and RTC agreement with experts} \\ \vspace{-4mm}

\noindent The entropy maps and RTCs in Fig.~\ref{fig:rimthickness} confirm that the distributions of the annotator disagreement approximately matched the estimated segmentation and RTC uncertainties. We observed that the method correctly predicted higher uncertainties in areas where the rim was obscured by blood vessels. Additional samples and methods are shown in Appendix~\ref{sec:appenix-qualitative}. 

\begin{figure}[tb]
    
    \centering
    \includegraphics[width=0.95\textwidth]{figures/rim_thickness_new.pdf}
    \caption{\textbf{Entropy maps and RTCs for expert annotations as well as PHiSeg.} The mean rim-thickness (blue line) as well as the standard deviation (light blue shading) are displayed. Three different scenarios are shown: an uncertain case (top), a certain glaucoma suspect (middle) and a certain healthy eye (bottom).}
    \label{fig:rimthickness}
    \vspace{-4mm}
\end{figure}


\section{Discussion and Conclusion}

Here, we proposed a pipeline for human-interpretable glaucoma prediction. We showed that probabilistic segmentation techniques are suitable for capturing uncertainties in the location of the cup and disc, and demonstrated an approach for propagating these uncertainties through the pipeline steps to the final prediction. Knowledge about the uncertainty adds an additional level of interpretability to the individual pipeline steps. We furthermore proposed a simple strategy for obtaining robust predictions by marginalizing over the distribution of possible segmentations, and showed that accounting for the uncertainty in this manner led to improved downstream predictions. Our analysis of different probabilistic segmentation techniques revealed that a simple random seed ensemble provided the best balance. However, PHiSeg provided the best qualitative results and correlation with expert disagreements.  

A limitation of our work is the sole focus on rim thickness as a diagnostic feature. Future work will focus on incorporating additional diagnostic markers, such as the color and intensity of the optic cup and whether the fundus image corresponds to the right or left eye, into our pipeline. These features are known to be diagnostically important and may further improve performance.

% Acknowledgments---
\midlacknowledgments{This work was supported by the German Science Foundation (BE5601/8-1 and the Excellence Cluster 2064 ``Machine Learning --- New Perspectives for Science'', project number 390727645) and the Hertie Foundation. The authors thank the International Max Planck Research School for Intelligent Systems (IMPRS-IS) for supporting Paul Fischer.
}


\bibliography{midl23_26}

\newpage
\appendix

\section{Training Details}
\label{sec:training-details}

The ROI extraction U-Net network (see Sec.~\ref{sec:ROI-extraction}) was trained with the combined RIGA and Chákṣu training data sampling different expert ground truths for each batch. Model selection was performed using the Dice score on the validation set. 

The probabilistic segmentation networks described in Sec.~\ref{sec:segmentation} were trained in the same fashion based on the regions extracted in the first step. Model selection for the probabilistic U-Net, PHiSeg, and MC Dropout was performed using the generalized energy distance (GED) metric~\cite{NEURIPS2018_473447ac} between the expert annotations and the samples on the validation sets. The ensembles were analogously trained by sampling different experts for each batch, and model selection was performed based on the Dice score of the individual networks. 

We trained classifiers with RTCs resulting from each of the segmentation methods as described in Sec.~\ref{sec:glaucoma-classification}. The classifiers were trained using the Chákṣu training data, and the optimal threshold was determined using the Chákṣu validation data. We used 100 RTC samples per training image for all methods except the Ensemble$_{experts}$ and Ensemble$_{seeds}$, which were limited by design to 11 and 10 samples, respectively.

The ResNet50 baseline was initialized using ImageNet weights, and then fine-tuned on predicting the glaucoma label from the automatically extracted ROI crops of the Chákṣu dataset. During training we used random horizontal flips, and small random rotations in $[-10\degree, 10\degree]$ to augment the small dataset. Note that we did not use any augmentation for the segmentation networks, as they were additionally trained with RIGA data and segmentation tasks typically require fewer training data points due to the dense segmentation annotations. We used the AUROC computed on the validation set for model selection. In order to compute the sensitivity and specificity in Table~\ref{tab:results}, we obtained the decision threshold which maximizes the Youden index.

\section{Additional qualitative results}
\label{sec:appenix-qualitative}

In Fig.~\ref{fig:appendix-qualitative} we show additional qualitative results of the entropy for experts and models as well as the corresponding RTCs. 

\begin{figure}[!ht]
    \centering
    \includegraphics[width=\textwidth]{figures/model_entropies.png}
    \caption{Entropy maps and RTC plots for every model for three representative example subjects.}
    \label{fig:appendix-qualitative}
\end{figure}

\section{Visual explanation of RTC extraction}
\label{sec:appenix-rim-thickness}

\begin{figure}[h]
    \centering
    \includegraphics[width=0.7\textwidth]{figures/rimthicknesscalc.png}
    \caption{\textbf{Rim-thickness calculation.} A beam centered at the cup is rotated 360 degrees. At every 0.5 degrees the Euclidean distance between the cup and the disc is recorded in the rim thickness plot.}
    \label{fig:rimthicknesscalculation}
\end{figure}

Fig.~\ref{fig:rimthicknesscalculation} provides some intuition for the calculation of the RTC. 
\end{document}