\documentclass{midl} % Include author names
%
\usepackage{float}
\usepackage{wrapfig}
\usepackage{pdfpages}
\usepackage{hyperref}

\setlength{\textfloatsep}{7pt plus 0pt minus 2.0pt}


% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}

\title[Efficiently correcting patch-based segmentation errors]{Efficiently correcting patch-based segmentation errors to control image-level performance in retinal images}


\midlauthor{\noindent
 \Name{Patrick K{\"o}hler\nametag{$^{1}$}} \Email{patrick.koehler@uni-tuebingen.de}\\
 \Name{Jeremiah Fadugba\nametag{$^{2, 3}$}} \Email{jfadugba@quantumleapafrica.org}\\
 \Name{Philipp Berens\nametag{$^{1,4}$}} \Email{philipp.berens@uni-tuebingen.de}\\
 \Name{Lisa M. Koch\nametag{$^{1,5}$}} \Email{lisa.koch@uni-tuebingen.de}\\
 \vspace{0.1pt}\\
 \addr $^{1}$ Hertie Institute for AI in Brain Health, University of T{\"u}bingen, Germany\\
 \addr $^{2}$ African Institute for Mathematical Sciences (AIMS), Rwanda\\
 \addr $^{3}$ University of Ibadan, Nigeria\\
 \addr $^{4}$ T{\"u}bingen AI Center, University of T{\"u}bingen, Germany \\
 \addr $^{5}$ Department of Diabetes, Endocrinology, Nutritional Medicine and Metabolism UDEM, Inselspital, Bern University Hospital, University of Bern, Switzerland
}

\begin{document}
%
\maketitle
%
\begin{abstract}
    Segmentation models which are deployed into clinical practice need to meet a quality standard for each image.
    Even when models perform well on average, they may fail at segmenting individual images with a sufficiently high quality.
    %
    We propose a combined quality control and error correction framework to reach the desired segmentation quality in each image. 
    %
    Our framework recommends the necessary number of local patches for manual review  and estimates the impact of the intervention on the Dice Score of the corrected segmentation.
    This allows trading off segmentation quality against the time invested in manual review.
    %
    We select the patches based on uncertainty maps obtained from an ensemble of segmentation models.
    %
    We evaluated our method on retinal vessel segmentation on fundus images, where the Dice Score increased substantially after reviewing only a few patches. 
    Our method accurately estimated the review's impact on the Dice Score and we found that our framework controls the quality standard \textit{efficiently}, i.e. reviewing as little as necessary.
\end{abstract}

   
\begin{keywords}
 Quality Control, Segmentation, Retinal Blood Vessels, Fundus
\end{keywords}

\section{Introduction}
%
    Segmentation is a central task in medical image analysis, as it often builds the foundation for surgical planning \cite{li2021real}, diagnosis and disease progression monitoring \cite{soomro2019deep}.
    In ophthalmology, for example, segmenting retinal blood vessels from fundus images provides geometric characteristics such as branching angles or vessel diameters in a non-invasive fashion.
    Unfortunately, their manual segmentation requires three to five hours per image \cite{jin2022fives}, making  it unfeasible to annotate entire images routinely for every patient. 
%
    Recently, medical image segmentation algorithms have achieved a performance that is sufficient for clinical deployment \cite{isensee2021nnu}.  
    Yet, even the best models are not guaranteed to perform well on \textit{all} images and may fail silently on individual ones. 

%
    In medical contexts, quality standards are often crucial for safety, fairness and efficacy of therapeutic decisions.
    One strategy to implement such standards is to predict a quality metric such as the Dice Score Coefficient (DSC) per image without knowing the ground truth and exclude low quality segmentations from downstream analyses.
    % Suggestion
    This can be done by training an auxiliary DSC regression network in addition to the segmentation model itself \citep{robinson2018real,williams2021automatic, FOURNEL2021102213}, or using probabilistic segmentation model outputs \citep{li2022estimating}.
    % Rebuttal
    Similarly, \citet{galdran2018no} learn to predict the normalized mutual information between the ground truth (GT) and the model segmentation, leveraging manually degraded GTs as a training set.
    However, this quality control paradigm defers entire images with subpar DSC for manual review.
    This may cause more manual labour than necessary because the low performance may be caused predominantly by specific image regions.
    For a quality assessment which is more granular than image level, \citet{ZAMAN2023107324} train a model to predict segmentation error maps.
%
\begin{figure}[t!]
    \centering
    \includegraphics[width=\textwidth]{figures/overview_v1}
    \caption{Overview. We obtain multiple predictions from an ensemble (\textbf{a}-\textbf{c}) and compute an uncertainty map (\textbf{d}) from which we select patches with highest uncertainty (\textbf{e}). Afterwards, we estimate the DSC and how it would change if the patches were to be reviewed manually (\textbf{f}) to satisfy a target quality (\textbf{g}).}    
    \label{fig:overview}
\end{figure}
%
    While such approaches identify segmentation errors they do not discuss effective strategies for corrective interventions.
    This is done by interactive segmentation  methods \cite{liu2022transforming, luo2021mideepseg}, which incorporate manual annotations to refine their prediction.
    Similarly, \citet{benenson2019large} propose an error correction mechanism by training an additional network to correct the predicted segmentation. 
    This approach relies on a manually curated dataset of corrections which is representative for the model's failure modes.
    Neither of these methods has attempted to quantify the effect of the correction in advance.

    In contrast, here we propose a combined quality control and error correction framework for vessel segmentation in retinal fundus images.
    Our framework proposes local candidate regions for manual review instead of deferring entire images.
    Then, we extend a recently proposed DSC estimator to provide an estimate of the correction's impact \citep{li2022estimating}.
    This allows the annotator to assess how many patches should be reannotated  in order to meet the required segmentation quality standards.
%      
%  
%
%  
%
\section{Methods}
%
\subsection{Multi-disease dataset for challenging vessel segmentation}
\label{sec:data}
    We used the FIVES dataset \cite{jin2022fives}, comprised of 800 high-resolution retinal fundus images from the Second Affiliated Hospital of Zhejiang University (SAHZU), China. The images were taken from healthy individuals and patients with glaucoma, age-related macular degeneration (AMD) and diabetic retinopathy (DR) (200 images each). In each image, retinal blood vessels were annotated manually by two junior annotators and verified by experienced senior annotators in a standardized  procedure \cite{jin2022fives}. The presence of disease lesions made the segmentation task more challenging because these could interfere with the blood vessels. We used the original splits provided with the dataset, i.e. 600 training images, from which we used 120 for validation, and 200 test images.
    % Preprocessing
    All images were pre-processed by applying Contrast Limited Adaptive Histogram Equalization (CLAHE) \cite{pizer1987adaptive} with a clip limit of 2 and a grid size of \mbox{$8 \times 8$}. The images and segmentation masks with an original resolution of \mbox{$2048 \times 2048$} pixels were resampled to \mbox{$512 \times 512$} pixels.
%
\subsection{Probabilistic segmentation model for vessel segmentation}
\label{sec:model}
    We used the state-of-the-art vessel segmentation model FR-Unet \cite{liu2022full} to develop our framework. 
    The architecture was optimized for the intricacies of retinal vessel segmentation, namely thin foreground structures and low-contrast regions.
    The model's hidden representations expanded horizontally and vertically through a multiresolution convolution mechanism to retain the full image resolution.
    This allowed aggregating features from different scales to supplement high-level contextual information to the low-level regimes and vice versa.
    In experiments with a limited range of common datasets, FR-Unet has been shown to outperform other architectures with fewer parameters \cite{liu2022full}. 

    We used an ensemble of $m$ FR-Unets,  which were trained with different random seeds \cite{ganaie2022ensemble}. An image of size \mbox{$n \times n$} was passed through the ensemble, resulting in $m$ predicted probabilistic segmentations \mbox{$\widehat Y_p^{(1)}, \ldots, \widehat Y_p^{(m)}$} (Fig.\,\ref{fig:overview}\,\textbf{a}-\textbf{c}).
    The final probabilistic segmentation \mbox{$\widehat Y_p =  \{p_i: p_i \in [0,1], i = 1, \ldots, n^2\}$} was obtained by averaging the individual outputs.
    Thresholding $\widehat{Y}_p$ yielded the predicted binary segmentation $\widehat Y = \textbf{1}_{[\widehat{Y}_p > 0.5 ]}$.
%
\subsection{Quality control framework}
\label{sec:quality_control_framework}
%
    % Summarize Fig. 1 in 3 Sentences
    Given a fundus image and a probabilistic segmentation, our goal was to refer a minimal number of patches to manual review such that a desired segmentation quality could be guaranteed (Fig.\,\ref{fig:overview}). Our proposed framework consisted of two major components: (1) We computed pixel-wise uncertainties from the outputs of the FR-Unet ensemble to select patches as candidates for manual review (Fig.\,\ref{fig:overview}\,\textbf{a}-\textbf{e}). (2) We estimated each patch's impact on the segmentation quality if it was to be reviewed by an expert (Fig.\,\ref{fig:overview}\,\textbf{f}, \textbf{g}). This allowed us to control the desired segmentation performance efficiently, i.e. only referring as few patches for review as necessary.
%    
\subsubsection{Selecting candidate patches for manual segmentation}
\label{sec:patch_selection}
    To obtain an uncertainty map $U$, we computed the pixelwise entropy across all \mbox{$i=1, \ldots, m$} probabilistic segmentations $\widehat Y_p^{(i)}$ (Fig.\,\ref{fig:overview}\,\textbf{d}).
    We expected that high uncertainty regions correspond to erroneous segmentations and that reviewing those would lead to efficient quality improvements.  To identify local regions with potentially low segmentation quality and therefore high potential for improvement, we convolved $U$ with a square kernel of size $P \times P$, effectively computing the mean uncertainty in the patch around each pixel. We then selected the $k$ non-overlapping patches with the highest uncertainty (Fig.\,\ref{fig:overview}\,\textbf{e}).

\subsubsection{Estimating segmentation quality without ground-truth labels}
\label{sec:method-dsc-estimation}

    %% Objective of this subsection
    Having identified candidate patches, we wanted to estimate their impact on the image's segmentation quality in terms of DSC if they were to be re-segmented manually.
    The DSC is defined as 
%
\begin{align}
    \label{eq:dsc_def}
    \text{DSC} = \frac{2\text{TP}}{2\text{TP} + \text{FP} + \text{FN}} = \frac{2\text{TP}}{(\text{TP} + \text{FN}) + (\text{TP} + \text{FP})} ~,
\end{align}
%    
    where TP, FP and FN denote the numbers of true positive, false positive and false negative pixels, respectively.
    Therefore, the DSC can only be computed using the GT segmentation.
    As GT labels are not available at test time, we cannot compute Eq.\,\ref{eq:dsc_def} directly to assess the impact of reannotating a given patch. However, the DSC can be estimated with access only to the probabilistic model output $\widehat Y_p$ \citep{li2022estimating}.
    This approach relies on calibrated output probabilities, which means that for all \mbox{$\pi \in [0,1]$} exactly \mbox{$\pi \cdot 100\%$} of the pixels with predicted probability $\pi$ actually belong to the foreground, such that the predicted probabilities reflect the correctness of the prediction accurately.
    
    % AC = Acc
    If the outputs were perfectly calibrated, summing over them would yield the number of pixels that belonged to the GT foreground (by definition).
    \citet{li2022estimating} leverage this property to construct their estimator
%    
\begin{equation}
    \label{eq:dsc-est}
    \widehat{\text{DSC}} (\widehat Y_p) =  \frac{2 \sum_{i=1}^{n^2} \textbf{1}_{[p_i > 0.5 ]} p_i}{\sum_{i=1}^{n^2} p_i + \sum_{i=1}^{n^2} \textbf{1}_{[p_i > 0.5 ]}} ~,
\end{equation}
 %   
    where $\textbf{1}_{[.]}$ denotes the indicator function.
    Hence, summing over the output probabilities of all pixels that were classified as foreground yields an estimator for TP (numerator of Eq.\,\ref{eq:dsc_def}).
    Analogously, we can estimate the total number of GT foreground pixels, i.e. \mbox{TP + FN}, by summing over all output probabilities (Eq.\,\ref{eq:dsc-est}, first summand in denominator).
%
    We applied temperature scaling (TS) to the model outputs as in \citet{li2022estimating} to calibrate $p_i$.
  
\subsubsection{Estimating quality improvement after manual review}
\label{sec:method-dsc-with-patches}
%
    The DSC estimator provided a quality assessment for an individual image.
    While this may be useful for quality control in itself, it does not provide actionable insight for estimating the effect of patch-based error correction. Here, we were interested in the following question: \textit{What would the DSC be if an expert reviewed specific high-uncertainty patches} (Fig.\,\ref{fig:overview}\,\textbf{f})\textit{?}
    
    Therefore, we propose an estimator for the DSC of the corrected segmentation $\widehat{Y}_{\text{corr}}$ that is composed of two image parts: The high-confidence regions where the model output is accepted $\widehat{Y}_{\text{model}}$, and the manually reviewed patches \mbox{$\widehat{Y}_{\text{manual}}$} in the high-uncertainty regions.
    The DSC of the combined segmentation $\widehat{Y}_{\text{corr}}$ can be expressed as a linear combination of the DSC estimates of its components:
%
\begin{align}
    \label{eq:combine_dsc}
    & \widehat{\text{DSC}}(\widehat{Y}_{p,\,\text{corr}}) = w_{\text{model}} \widehat{\text{DSC}} (\widehat{Y}_{p,\,\text{model}}) + w_{\text{manual}} \widehat{\text{DSC}} (\widehat{Y}_{p,\,\text{manual}}) ~.
\end{align}
% 
    The DSC for $\widehat{Y}_{\text{model}}$ can be estimated with Eq.\,\ref{eq:dsc-est} by considering the foreground probabilities $p_i$ only for regions where the model output was accepted.
    For $\widehat{Y}_{\text{manual}}$, we assumed perfect manual segmentation performance for simplicity, i.e. $\widehat{\text{DSC}} (\widehat{Y}_{\text{manual}}) = 1$.
    %
    The weights $w_i$, with $i \in \{\text{model}, \text{manual}\}$, correspond to the fraction of the image's predicted foreground pixels that lie within $\widehat Y_i$.
    We give a theoretical justification for this choice in App.~\ref{app:proof}.     

\section{Results}
%
\subsection{Segmentation performance before error correction}
%
    We trained an ensemble of $m=5$ FR-Unets on the training fold of the FIVES dataset (see Sec. \ref{sec:model}) for 80 epochs with the DSC Binary Cross Entropy loss\footnote{Trained models and code: \href{https://github.com/berenslab/MIDL24-segmentation_quality_control}{\url{github.com/berenslab/MIDL24-segmentation_quality_control}}.}.
    This loss was a combination of the softDSC loss \cite{milletari2016v} with Cross-Entropy and has been observed to produce better generalization than softDSC alone \cite{liu2021hidden, ma2021loss, galdran2022optimal}.
    To improve generalization, we augmented the data with random flips and rotations.
    We used the Adam optimizer with a learning rate of $10^{-4}$ and a cosine annealing scheduler (number of iterations \mbox{$\leq$ 40}).
    
    Afterwards, we selected the model with the highest validation DSC, leading to an average DSC of $0.887\pm 0.094$ (mean $\pm$ SD) on the test set ($n=200$). 
    % Distribution over DSC
    For 10\% of the images, the DSC was below 0.828 or above 0.934 (Fig.\,\ref{fig:increase}\,\textbf{a}).
    % Major mistakes
    We found that many of the segmentation errors occurred in the finer vessel structures and we mainly observed discontinuities of vessels and missing segments (see Fig.\,\ref{fig:app:xamples} in App.~\ref{app:xamples}).
    The uncertainty maps provided by the FR-Unet ensemble accurately identified segmentation errors in the images (see Fig.\,\ref{fig:app:xamples} in App.~\ref{app:xamples}).
%
%
%
\subsection{Correcting high-uncertainty regions increases segmentation quality}
\label{sec:increase}
%
\begin{figure}
    \centering
    \includegraphics[width=\textwidth]{figures/increase_all_v2.png}
    \caption{Distribution over DSC in the test set (\textbf{a}) and effect of reviewing the patches \mbox{(\textbf{b}-\textbf{d})}.
    For the selected patches, we inserted the GT to model manual correction. 
    For the baseline, we chose random patches and depict only the mean and standard deviation for visual clarity.
    We split up the test set (\textbf{b}) into well (\textbf{c}, above median DSC) and poorly segmented images (\textbf{d}).}
    \label{fig:increase}
\end{figure}
%
%
%
    Manually segmenting high-uncertainty regions increased the segmentation quality (Fig.\,\ref{fig:increase}\,\textbf{b}). Here, we selected the top $k \in \{0, \dots, 5\}$ non-overlapping patches of size \mbox{81\,$\times$\,81} in the uncertainty map (as in Fig.\,\ref{fig:overview}\,\textbf{e}) and replaced the predicted output with the GT to model a ``perfect'' human annotator.
    % Rebuttal
    The impact of patch size is discussed in App.~\ref{app:patchsize}.
    
   We observed a gradual increase in DSC when correcting more patches. For example, the median DSC across all images increased by more than $0.02$ after correcting five patches. This was much more than could be achieved by a simple baseline approach of selecting random patches (from inside the retinal fundus) instead of high entropy patches, which increased segmentation quality at a lower rate (grey points in Fig.\,\ref{fig:increase}).

    For further examination, we split the test set into well-segmented images (above median DSC, Fig.\,\ref{fig:increase}\,\textbf{c}) and poorly segmented images (below median DSC, Fig.\,\ref{fig:increase}\,\textbf{d}).  The segmentation quality improved more for images which were initially poorly segmented (Fig.\,\ref{fig:increase}\,\textbf{d}, similar trends were observed in a subgroup analysis in App.\,\ref{app:subgroup}). Here, we observed a median increase in DSC of approximately 0.04 when reannotating $k=5$ patches. For some images, the segmentation quality was improved by up to 0.1. Even for images that were already well segmented the DSC could be further improved by $0.018$ on average.

    % Takeaway in context of the whole workflow
    In summary, patch-wise error correction based on uncertainty led to a substantial increase in performance on average.
    However, using this simple patch selection strategy did not by itself provide an a-priori estimate for the effect of error correction. 
%
%
%
\subsection{DSC estimation allows predicting the impact of manual patch review}
\label{sec:exp:est_acc}
%
\begin{figure}[t!]
    \centering
    \includegraphics[scale=0.4]{figures/estimation_accuracy.png}
    \caption{Accuracy of estimating DSC for test set images entirely segmented (\textbf{a}) by the model and (\textbf{b}) after manual review. Estimation error computed as \mbox{$\widehat{\text{DSC}} - \text{DSC}$}.}
    \label{fig:estimation_accuracy}
\end{figure}


\noindent
    % Motivation
    Therefore, we next predicted the effect of patch-wise error correction on the resulting segmentation quality, without access to the GT. 
    We first assessed the estimator proposed by \citet{li2022estimating} (Eq.\,\ref{eq:dsc-est}) to predict DSC before error correction.
    With a mean absolute error of $0.02$, we found that it reliably estimated the segmentation quality for individual images (Fig.\,\ref{fig:estimation_accuracy}\,\textbf{a}, red points), even though
    we observed a slight bias towards overestimating the true performance (mean error \mbox{$\widehat{\text{DSC}} -  \text{DSC} = 0.012$}).
    Moreover, accurate DSC estimation relied heavily on calibration with temperature scaling.
    For uncalibrated model outputs, \mbox{$\widehat{\text{DSC}}$} overestimated consistently (grey points in Fig.\,\ref{fig:estimation_accuracy}\,\textbf{a}).
    
    Ultimately, we were interested in predicting how the DSC changed if the segmentation in high uncertainty regions of the images was corrected.
    Hence, we evaluated our proposed DSC estimator for a patch-wise corrected segmentation (Eq.\,\ref{eq:combine_dsc}) and observed consistently low estimation errors across all numbers of patches (Fig.\,\ref{fig:estimation_accuracy}\,\textbf{b}).
    The bias towards overestimation observed in the estimator from \citet{li2022estimating} carried over to our correction estimate.
    When only one or two patches were corrected, the DSC was overestimated for more than half of the images.
    In general, we preferred conservative estimates over those liberal ones, because we rather wanted to ensure the desired quality target with a higher probability at the cost of reviewing more patches.
    Hence, we introduced a correction term in the subsequent analysis to reduce overconfidence post-hoc.
   
    In conclusion, the estimator from \citet{li2022estimating} was not only useful for its initial purpose of estimating average performance over entire datasets but also allowed us to assess how DSC would change if an expert was to review a specific set of patches.
%
%
%
%
\subsection{Adaptive patch selection leads to more efficient resource allocation}
\noindent
    % Motivation
    Reviewing segmentations with human experts is typically time intensive and costly. 
    Hence, we wanted to ensure that our method is efficient, i.e. that we request only as few human resources as necessary to reach the desired quality.
    
    % 5th percentile and epsilon correction
    To quantify robustly the quality standard that has been reached by a correction strategy we chose the 5$^\text{th}$ percentiles over the images' DSCs.
    In contrast to the minimum DSCs, this accounted for outliers for which the model's segmentation is fundamentally incorrect and cannot be fixed with partial review.
    In our framework, the remaining 5\% should be  referred to full review or repeated acquisition.
    %
    We calibrated our DSC estimates to prevent overconfidence (cf. Sec.\,\ref{sec:exp:est_acc}) by subtracting  \mbox{$\epsilon = 0.02$}.
    $\epsilon$ was optimized on the validation set such that the $5^{\text{th}}$ percentiles matched the quality target.  
\label{sec:efficient}
\begin{figure}[h!]
  \centering
  \includegraphics[width=\textwidth]{figures/efficient.png}
  \caption{Efficiency comparison of our proposed workflow (coloured) with a baseline, where for each image an equal number of patches is reviewed (grey). For our workflow, we chose three quality targets and estimated for each image individually how many patches needed to be reviewed to achieve the targets.}
  \label{fig:efficient}
\end{figure}

    
    We determined for each image individually how many patches needed to be reviewed in order to reach each of the three quality targets of \mbox{DSC = 0.88}, 0.90 and 0.92, where  0.92 was reported as human performance on this data set \cite{jin2022fives}.
    With Eq.\,\ref{eq:combine_dsc}, we calculated the estimated DSC after reviewing \mbox{$k=1, \ldots, 14$} patches and selected the lowest number of patches such that the estimated DSC exceeded the target.
    % Baseline comparison
    As a baseline, we chose a fixed number of patches which were reviewed for \textit{each} image.

    Our adaptive strategy led to 3.2 reviewed patches per image on average to reach a quality standard of almost 0.90 (Fig.\,\ref{fig:efficient}).
    The actual performance (5$^\text{th}$ percentiles were 0.87, 0.89, 0.91) differed slightly from the desired quality standards because of imperfect DSC estimation. 
    For the baseline, approximately three times more patches were reviewed per image to achieve a similar quality standard.
%    
%   
\section{Discussion and Conclusion}
    %
    % Summary
    We presented a framework to correct segmentation errors locally in order to control the segmentation quality per image.
    Correcting patches with high uncertainty led to an increase in segmentation quality for retinal blood vessels.
    Furthermore, we could accurately predict this increase using a DSC estimator which did not require access to the GT segmentation.
    Therefore, our workflow allowed us to allocate more resources to images with poor segmentations and to avoid wasting resources where outputs already satisfied the quality criterion.

    % Rebuttal: Extending to clDSC
    Our method estimates DSC, which is a measure of overlap with broad applicability in many segmentation tasks.
    However, particularly to account for topological consistency in thin vessel structures, predicting customized metrics such as the centerline DSC \cite{shit2021cldice} is an important next step for certain downstream tasks.

    In this paper, we assumed perfect expert performance for the patch review.
    While the performance may be higher than in large scale annotation settings because the reviewer can focus their attention on few small areas, this simplification ignores potential inter-rater variability. 
    One mitigation would be to evaluate our method with multi-annotator data, where the oracle patch could be provided by a different annotator. 

    We used deep ensembles to quantify uncertainty and suggest candidate patches for manual review.
    Our modular framework allows replacing this step with any other approach that generates uncertainty maps such as Monte-Carlo dropout (see \citet{fuchs2022practical} for an overview and App.\,\ref{app:uncertainties} for a comparison of uncertainty maps). 
    As uncertainty estimation typically generates computational overhead, other strategies could be pursued to suggest candidate patches purely based on image statistics, e.g. by identifying low-contrast regions.
        
    Beyond covering large demands in hospitals more efficiently, our method can be applied in clinical trials, when automatic volumetric measurements inform on effect sizes of drugs.
    In that case, our framework could improve the efficiency of drug development by ensuring that each volumetric estimate is accurate enough for the downstream analysis of interest. 
    
\midlacknowledgments{
This work was supported by the German Science Foundation (BE5601/8-1 and the Excellence Cluster 2064 ``Machine Learning --- New Perspectives for Science'', project number 390727645), the Carl Zeiss Foundation in the project ``Certification and Foundations of Safe Machine Learning Systems in Healthcare'', the Hertie Foundation and the Carnegie Corporation of New York (provided through the African Institute for Mathematical Sciences). 
}

\bibliography{midl24_010}
\newpage




\appendix


\section{Justification for the choice of $w_i$ in Equation \ref{eq:combine_dsc}}
\label{app:proof}
%
    Let us assume we are given a predicted segmentation $X$ and the respective ground truth $Y$.
    Given two disjoint subsets of those, which we denote with subscripts, we would like to combine their individual DSC linearly to obtain the overall \mbox{DSC$(X, Y)$}.
    Let us consider the DSC definition in terms of set sizes
%
\begin{align}
    \text{DSC} (X, Y) & = \frac{2 \vert X \cap Y \vert}{\vert X \vert + \vert Y \vert} \label{eq:denominator} \\
    & \stackrel{!}{=} w_1 \text{DSC}(X_1, Y_1) + w_2 \text{DSC}(X_2, Y_2)~, \label{eq:first_summand}
\end{align}
    and solve for $w_i$.
    
%
    Furthermore, let the subsets be disjoint and their union the entire image
\begin{align}
    \label{eq:assumptions}
    & Z_1 \cup Z_2 = Z~, \quad Z_1 \cap Z_2 = \varnothing \qquad \forall Z \in \{X, Y\}.
\end{align}
%
    Multiplying each of the summands in Eq. \ref{eq:first_summand} with 1 allows us to write them over the same denominator as 
    on the right side of Eq. \ref{eq:denominator}.
    For $i = 1$ set $j = 2$ and vice versa.
    Then
\begin{align}
\label{eq:by_one}
    w_i \text{DSC} (X_i, Y_i) = w_i \text{DSC} (X_i, Y_i) \frac{\frac{\vert X_1 \vert + \vert X_2 \vert + \vert Y_1 \vert + \vert Y_2 \vert}{\vert X_i \vert + \vert Y_i \vert}}{\frac{\vert X_1 \vert + \vert X_2 \vert + \vert Y_1  \vert + \vert Y_2 \vert}{\vert X_i \vert + \vert Y_i \vert}}
    = w_i \frac{2 \vert X_i \cap Y_i \vert \left(1 + \frac{\vert X_j \vert + \vert Y_j\vert}{\vert X_i \vert + \vert Y_i \vert}\right)}{\vert X_1 \vert + \vert X_2 \vert + \vert Y_1 \vert + \vert Y_2 \vert},
\end{align}
    where the denominator of the right-hand side equals $\vert X \vert + \vert Y \vert$ because of the prerequisites in Eq.~\ref{eq:assumptions}.\\
    Now, let us choose $w_i$ such that the numerator of \mbox{Eq.~\ref{eq:denominator}} equals the numerator of \mbox{Eq.~\ref{eq:first_summand}} after Eq.~\ref{eq:by_one} has been applied, i.e.
\begin{align}
\label{eq:to_solve}
    \sum_{i=1}^2 2 w_i  \vert X_i \cap Y_i \vert \left(1 + \frac{\vert X_j \vert + \vert Y_j\vert}{\vert X_i \vert + \vert Y_i \vert} \right) = 2 \vert X \cap Y \vert.
\end{align}
    Since
\begin{align}
    \sum_{i=1}^2 2  \vert X_i \cap Y_i \vert = 2 \vert X \cap Y \vert~,
\end{align}
    because of the prerequisites \ref{eq:assumptions}, solving Eq. \ref{eq:to_solve} for $w_i$ yields
\begin{align}
    w_i^* = \left(1 + \frac{\vert X_j \vert + \vert Y_j\vert}{\vert X_i \vert + \vert Y_i \vert} \right)^{-1} =  \frac{\vert X_i \vert + \vert Y_i \vert}{\vert X \vert + \vert Y \vert}.
\end{align}
In words, this is the sum of predicted foreground and GT foreground within the subset $i$ over the sum of predicted foreground and GT foreground in the entire image.
As we have no access to the amount of GT foreground we approximate it with the predicted amount of foreground.
Hence, we approximate $w_i$ with the number of predicted foreground pixels within the subset $i$ over the total number of predicted foreground pixels.
\newpage

\section{Example Images}
\label{app:xamples}
\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/grid_patches_v1.png}
    \caption{Example images drawn from the test set including the first two patches that were automatically selected as candidates for manual review (red indicates the first patch).}
    \label{fig:app:xamples}
\end{figure}

\newpage

\section{Impact of patch size}
\label{app:patchsize}
\begin{figure}[h!]
    \centering
    \includegraphics[scale=0.35]{figures/rebuttal_patchsize.png}
    \caption{Impact of patch size on the change in DSC. For the comparison of three patch sizes ($41^2$, $81^2$, $161^2$), we chose the number of patches such that all three settings select approximately the same area of the image, i.e. 1 patch of size $161^2$ vs 4 patches of size $81^2$ vs 16 patches of size $41^2$. The effect strength is comparable between 16 small patches and 4 medium-sized patches. We suppose that it is more convenient for a clinician to review fewer patches because delineating at the border of the patch requires special attention. Hence, we opt to show the results for patch size $81^2$ in the main text.}
    \label{fig:enter-label}
\end{figure}
\newpage

\section{Subgroup analysis}
\label{app:subgroup}


\begin{table}[h!]
    \centering
    \begin{tabular}{c|c|c|c}
        AMD & DR & Glaucoma & Normal \\
        \hline
        0.91 $\pm$ 0.04 & 0.89 $\pm$ 0.05 & 0.84 $\pm$ 0.16 & 0.90 $\pm$ 0.04\\
    \end{tabular}
    \caption{Mean $\pm$ SD of the DSC per subgroup on the test set. Each subgroup consists of 50 images.}
    \label{tab:subgroup}
\end{table}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\textwidth]{figures/subgroup.png}
    \caption{Effect of reviewing a specified number of patches split up by subgroup. A = Age Related Macular Degeneration, D = Diabetic Retinopathy, G = Glaucoma, N = Normal. Within the pathological subgroups, there exist more images that benefitted strongly from the review, whereas the effect size was distributed more uniformly in the subgroup of normal images.
    The effect was particularly pronounced for the Glaucoma subgroup, which had the lowest segmentation performance (Tab.~\ref{tab:subgroup}).}
    \label{fig:subgroup}
\end{figure}

\newpage

\section{Comparison of uncertainty estimates}
\label{app:uncertainties}
\begin{figure}[h]
    \centering
    \includegraphics[scale=0.3]{figures/uncertainty_comparison.png}
    \caption{Qualitative comparison of pixelwise uncertainty estimates obtained from Deep Ensembles and MC Dropout. The uncertainties are structurally very similar. The MC Dropout uncertainties are computed from a single FR-Unet with dropout rate 0.1 at each convolutional layer. Note that the computational cost to determine the optimal dropout rate and position is very high, making it not substantially cheaper than the deep ensembles.}
    \label{fig:uncertainties}
\end{figure}
\end{document}