\documentclass[]{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{float}
\usepackage{graphicx} % Required for \resizebox
\usepackage{tabularx}
\usepackage{tabulary}
\usepackage{booktabs} % For better quality table rules
\usepackage{siunitx}  % Allows for alignment at the decimal point
\usepackage{amsmath}  % For text in the table
\usepackage{multirow} % For multirow feature
\usepackage{geometry} % For page margin adjustments
\usepackage{makecell}
\usepackage{caption}  % For captions
 
% \usepackage{subcaption}
\captionsetup[figure]{font=small}
\captionsetup[table]{font=small}
% \usepackage{blindtext}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 218}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\editors{Accepted for publication at MIDL 2024}





\title[segmentation through probabilistic signed distance functions]{Uncertainty-aware retinal layer segmentation in OCT through probabilistic signed distance functions}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Mohammad Mohaiminul Islam\nametag{$^{1,2}$}} \Email{m.m.islam@uva.nl}\\
\Name{Coen {de Vente}\nametag{$^{1,2}$}} \Email{c.w.devente@uva.nl}\\
\Name{Bart Liefers\nametag{$^{3}$}} \Email{b.liefers@erasmusmc.nl}\\
\Name{Caroline Klaver\nametag{$^{3}$}} \Email{c.c.w.klaver@erasmusmc.nl}\\
\Name{Erik J Bekkers\nametag{$^{1}$}} \Email{e.j.bekkers@uva.nl}\\
\Name{Clara I. Sánchez\nametag{$^{1,2}$}} \Email{c.i.sanchezgutierrez@uva.nl}\\
\addr $^{1}$ Informatics Institute, University of Amsterdam, Netherlands.\\
\addr $^{2}$ Department of Biomedical Engineering and Physics, Amsterdam UMC, Netherlands.\\
\addr $^{3}$ Department of Ophthalmology \&  Epidemiology, Erasmus MC, Rotterdam, Netherlands.
}

\begin{document}

\maketitle 

\begin{abstract}
In this paper, we present a new approach for uncertainty-aware retinal layer segmentation in Optical Coherence Tomography (OCT) scans using probabilistic signed distance functions (SDF). Traditional pixel-wise and regression-based methods primarily encounter difficulties in precise segmentation and lack of geometrical grounding respectively. To address these shortcomings, our methodology refines the segmentation by predicting a signed distance function (SDF) that effectively parameterizes the retinal layer shape via level set. We further enhance the framework by integrating probabilistic modeling, applying Gaussian distributions to encapsulate the uncertainty in the shape parameterization. This ensures a robust representation of the retinal layer morphology even in the presence of ambiguous input, imaging noise, and unreliable segmentations. Both quantitative and qualitative evaluations demonstrate superior performance when compared to other methods. Additionally, we conducted experiments on artificially distorted datasets with various noise types—shadowing, blinking, speckle, and motion—common in OCT scans to showcase the effectiveness of our uncertainty estimation. Our findings demonstrate the possibility to obtain reliable segmentation of retinal layers, as well as an initial step towards the characterization of layer integrity, a key biomarker for disease progression. Our code is available at \url{https://github.com/niazoys/RLS_PSDF}. 

\end{abstract}

\begin{keywords}
Probabilistic signed distance function, Implicit representation, OCT
\end{keywords}


\begin{figure}[t]\vspace{-0.75cm}
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:example}
  {\vspace{-0.65cm}\caption{Left: The zero level set (green) of a signed distance function (SDF) $d$ parametrizes a retinal layer in an OCT B-scan. Right: Due to the Eikonal constraint $\lVert \tfrac{\partial}{\partial y} d(x,y) \rVert = 1$, uncertainty in the SDF $d$, here represented as displacement $\Delta d = \sigma$ in the level, induces an equal displacement $\Delta y$ of the contour.}}
  {\includegraphics[width=\linewidth,height=4.65cm]{imgs/fig1_final.pdf}}
  
\end{figure}
\vspace{-0.1cm}
\section{Introduction} \label{intro}
In recent years, deep learning methods have demonstrated remarkable success in automating the delineation and segmentation of retinal layers in Optical Coherence Tomography (OCT) scans. The task of retinal layer segmentation is primarily tackled via pixel-wise classification, an approach extensively explored in recent works \cite{li2020deepretina,liu2024simultaneous,kugelman2022comparison, sousa2021automatic,lou2020fast,chen2019automated,kugelman2018automatic,santos2019diagnostico}. This technique, however, is known to have challenges in precisely delineating the small structures and nuanced contours when employing cross-entropy or dice loss \cite{abraham2019novel}. The issue is further exacerbated when the layer targeted for segmentation is exceptionally thin, often reduced to a one-pixel width line for layer boundaries or as a consequence of disease manifestation, resulting in inaccurate \cite{lin2023dtu}, and disconnected boundaries (see Fig. \ref{disconnected}) \cite{lan2020elastic,sousa2021automatic}. 

The alternative is to switch to a regression-based approach, predicting layer coordinates per vertical scan (A-scan) line, which addresses the problem of disconnected layer boundaries. Notably, deep learning models that take a two-dimensional B-scan as input and directly output a coordinate for each A-scan \cite{Liefers2019-md,ngo2019deep}, and models that directly predict the height of each layer, ensuring a correct layer ordering \cite{morelle2023accurate}, have been proposed. However, the regression-based approach lacks mechanistic interpretability as the predictions are not in one-to-one correspondence from input to output, i.e. it reduces all pixel information in a vertical column to a single coordinate, making the relationship between input and anticipated boundary ambiguous. Due to this reduction, the geometric grounding of the predictions is challenged, limiting the neural network's ability to exploit geometric priors and regularities. 


We present a new level set approach that overcomes both the challenges of thin layer segmentation and the lack of geometric grounding. Our approach involves predicting an SDF that parametrizes the layer boundary via a level set. The predicted SDF promotes an enhanced spatial congruence between the input and output via a one-to-one pixel correspondence, thereby delivering more robust supervisory signals for the learning algorithm. Additionally, our approach is amenable to probabilistic modeling through a Gaussian likelihood assumption. 


SDFs have seen application in broader medical image segmentation contexts \cite{xue2020shape,raju2022deep,bogensperger2023score}, but their utility in retinal layer segmentation remains unexplored. On the other hand, the literature on aleatoric uncertainty estimation via Gaussian likelihood in medical image segmentation is extensive \cite{abdar2021review,gawlikowski2021survey}. Although \cite{dong2018psdf} introduced a probabilistic framework for SDF using a Normal-Beta mixture distribution, their approach does not extend to the uncertainty in segmentation or shape inference challenges. To the best of our knowledge, this research represents the first effort to employ SDFs to address the challenge of segmenting thin retinal layer boundaries and to incorporate Gaussian uncertainty in SDFs for segmentation tasks.






Following are the key contributions of this work:
\begin{itemize}
 \setlength\itemsep{-0.25em}

    \item We present a redefined approach to thin layer boundary segmentation with SDF. 
    \item We extend Gaussian modeling for aleatoric uncertainty estimation in our framework.
    \item We show that the proposed uncertainty estimation can offer a means to flag unreliable segmentation, particularly under compromised layer visibility or integrity.
\end{itemize}



\vspace{-.55cm}
\section{Method}\label{method}
Consider an OCT B-scan, or image, $I: \mathcal{X} \times \mathcal{Y} \rightarrow \mathbb{R}$ that assigns to each coordinate $x,y$ an intensity value $I(x,y)$, with $x \in \mathcal{X}$ the horizontal coordinate and $y \in \mathcal{Y}$ the vertical coordinate. Let $a_x: \mathcal{Y} \rightarrow \mathbb{R}$ denote the single A-scan of $I$ at horizontal location $x$, defined by $a_x(y):=I(x,y)$. The ground truth of a retinal layer can be parametrized as a function ${y}^{gt}:\mathcal{X}\rightarrow\mathcal{Y}$ that assigns to every A-scan a corresponding vertical $y$ coordinate. The $y$ location in a single A-scan $a_x$ may then be denoted as $y^{gt}_x:=y^{gt}(x)$. With this notation in place, we present three methods for retinal layer segmentation: the pixel-wise, regression, and signed distance function (SDF) approaches. 

% While the pixel-wise method is mentioned for completeness, our experimental focus is on the regression and SDF approaches.


\vspace{0.1cm}

\textbf{Pixel-wise approach}: A neural network $f^{segm}$ predicts a segmentation function $s: \mathcal{X}\times\mathcal{Y} \rightarrow [0,1]$ that assigns to each position a probability of belonging to the layer. Specifically, a network with parameters $\theta$ generates $s=f^{segm}[I;\theta]$, such that ideally $s(x,y^{gt}_x) = 1$ at the location of the layer, and zero otherwise. Typically, the pixel-wise segmentation predictions are parametrized with Bernoulli distributions, giving a probability for object presence and uncertainty that is directly reflected by the entropy (i.e., $-s \log(s) - (1-s) \log(1 - s)$) of the distribution. Notably, this approach can only express uncertainty in the presence of the structure, but not its shape.


\vspace{0.1cm}


\textbf{Regression approach (\textit{REGR} \& \textit{pREGR})}: A neural network $f^{regr}$  predicts a regression function $y^{regr}: \mathcal{X} \rightarrow \mathcal{Y}$ that parametrizes the layer similarly as done in the ground truth, generating ${y}^{regr} = f^{regr}[I; \theta]$ such that $\forall_{x}: {y}^{regr}_x \approx y^{gt}_x$. Such a model will be referred to with label \textit{REGR}.
To incorporate uncertainty in the predicted location we let the neural network parametrize a predictive distribution 
\begin{equation}
\label{eq:predict_regr}
p( y(x) \mid I, \theta) = \mathcal{N}(\mu(x), \sigma(x)^2) \, ,
\end{equation}
which we model as a Gaussian distribution with mean $\mu(x) = f^{regr}_{\mu}[I;\theta](x)$ and standard deviation $\sigma(x) = f^{regr}_{\sigma}[I;\theta](x)$. Such a probabilistic approach will be referred to with \textit{pREGR}.

\textbf{Signed distance function approach (\textit{SDF})}: A neural network $f^{sdf}$ predicts a signed distance function $d:\mathcal{X}\times\mathcal{Y} \rightarrow \mathbb{R}$ with the property that it satisfies the Eikonal constraint, that is $\forall_{x,y}: \lVert \frac{\partial}{\partial y} d(x,y)\rVert = 1$, and $d(x,y^{gt}_x)=0$, such that $d$ can be interpreted as a distance to the layer. The Eikonal constraint ensures that $d$ linearly increases when moving away from the boundary. Note that in our method the Eikonal constraint is enforced by explicit construction of the ground truth to train the model. Then from a predicted SDF $d$, the actual location of the layer is obtained by solving 
\begin{equation}
\label{eq:zerocrossing}
d(x,y)=0 \, ,
\end{equation}
i.e., by finding the zero-level set of $d$, where the level set of a function $d$ is defined as the set of coordinates $S^l$ for which it equals the given level $l$. In our setting, this means that $
S^l := \{ (x,y) \in \mathcal{X} \times \mathcal{Y} \mid d(x,y)=l\}
$. Since we can define the SDF per A-scan as $d_x(y) := d(x,y)$, we can also define the per-A-scan level set as $
S_x^l := \{ y \in \mathcal{Y} \mid d_x(y) = l \} \, .$

Due to the laminar structure of the retina, and the Eikonal constraint, we can assume that the per-A-scan zero level set consists of only a single element, which we denote with $y^{sdf}(x)$ and which is the unique solution of (\ref{eq:zerocrossing}). We thus have $S_x^0=\{y^{sdf}(x)\}$. In contrast to the regression method which \textit{explicitly} (directly) parametrizes a layer through a contour function $y^{regr}:\mathcal{X}\rightarrow \mathcal{Y}$, the signed distance function approach \textit{implicitly} parametrizes $y^{sdf}:\mathcal{X}\rightarrow \mathcal{Y}$ as the zero level set of $d$.  We label the latter method with \textit{SDF}.

\textbf{Probabilistic SDF-based regression (\textit{pSDF})}: We model uncertainty on the location of the layer by letting the neural network predict the expected distance $\mu(x,y)=f^{sdf}_{\mu}[I;\theta](x,y)$ as well as an uncertainty for that prediction $\sigma(x,y)=f^{sdf}_{\sigma}[I;\theta](x,y)$, to parametrize a predictive distribution for the SDF $d$ via
\begin{equation}
\label{eq:predict_sdf}
p(d(x,y) \mid I,\theta) = \mathcal{N}(\mu(x,y), \sigma(x,y)^2) \, .
\end{equation}
The uncertainty in the predicted SDF value is directly related to uncertainty in the actual contour location. Namely, if the SDF is normally distributed as above, then the uncertainty $\sigma(x,y)$ on the SDF value translates to uncertainty on the vertical location via
\begin{equation}
\label{eq:predict_sdf_to_regr}
p( y(x) \mid I, \theta) =  \mathcal{N}( {y}^{sdf}(x), \sigma(x,{y}^{sdf}(x))^2) \, ,
\end{equation}
where $y^{sdf}(x)$ is the solution of (\ref{eq:zerocrossing}) at $x$. This result follows from the fact that by the Eikonal constraint, we have $d(x,y+\sigma)= d(x,y) + \sigma$, see Fig.~\ref{fig:example}. We label this probabilistic SDF-based regression approach with \textit{pSDF}.

\vspace{0.1cm}

\textbf{Optimization objective}: With these probabilistic models in place, we take as optimization objective the negative log-likelihood (NLL) of the ground truth under the probabilistic models. For the \textit{SDF} approach, we construct the ground-truth $d^{gt}(x,y)$ following the process detailed in Appendix~\ref{target_space}. Its log-likelihood at a given pixel $(x,y)$ is given by
\begin{equation} 
\label{eq:nll}
\small
\ln p(d^{gt}(x,y) \mid I, \theta) = -\frac{1}{2} \left[ \frac{(d^{gt}(x,y) - f^{sdf}_{\mu}[I;\theta](x,y))^2}{f^{sdf}_{\sigma}[I;\theta]^2(x,y)} + \ln f^{sdf}_{\sigma}[I;\theta]^2(x,y) + \ln(2\pi) \right] \, ,
\end{equation}
which under i.i.d. assumption leads to the overall loss (summed over all pixels):
\begin{equation}
\label{eq:nll_full}
\mathcal{L}(\theta) = -\sum_{x,y} \ln p(d^{gt}(x,y) \mid I, \theta) \, .
\end{equation}
The NLL becomes a least squares loss if $\sigma=1$, which we use for \textit{REGR} and \textit{SDF}.




\begin{minipage}{0.49\textwidth}
    \vspace{10pt}
    \fontsize{9pt}{9pt}\selectfont
    \centering
    \captionof{table}{Layer segmentation performance (measured with mean absolute error (MAE) in pixels).}
    \label{tab01}
    \begin{tabular}{llll}
    Method & Layer & \multicolumn{2}{c}{MAE} \\
    & & (internal) & (external) \\
    \toprule
              & ILM  & $0.97_{\pm 0.72}$ & $2.37_{\pm 1.83}$ \\
    REGR      & RPE  & $1.32_{\pm 0.51}$ & $2.54_{\pm 2.14}$   \\
              & BM   & $1.76_{\pm 1.48}$ & $5.28_{\pm 2.94}$   \\ 
              & Avg. & 1.35              & 3.40 \\ 
    \midrule
              & ILM  & $0.85_{\pm 0.71}$ & $2.98_{\pm 2.41}$   \\
    pREGR     & RPE  & $1.28_{\pm 0.82}$ & $2.71_{\pm 2.00}$     \\
              & BM   & $1.80_{\pm 1.50}$ & $3.69_{\pm 1.78}$       \\ 
              & Avg. & 1.31              & 3.12  \\
    \midrule
              & ILM  & $0.33_{\pm 0.42}$ & $1.61_{\pm 0.75}$  \\
    SDF (Our) & RPE  & $0.60_{\pm 0.41}$ & $1.02_{\pm 1.64}$     \\
              & BM   & $0.79_{\pm 0.42}$ & $1.39_{\pm 1.01}$       \\ 
              & Avg. & 0.57              & 1.34  \\
    \midrule
              & ILM  & $0.32_{\pm 0.44}$ & $1.63_{\pm 0.78}$ \\
    pSDF (Our)& RPE  & $0.59_{\pm 0.40}$ & $1.02_{\pm 1.47}$     \\
              & BM   & $0.75_{\pm 0.29}$ & $1.27_{\pm 0.82}$       \\ 
              & Avg. & \textbf{0.55}     & \textbf{1.30}  \\
    \bottomrule
    \end{tabular}
\end{minipage}
\hfill
\begin{minipage}{0.47\textwidth}
    \centering
    \vspace{1.1cm}
    \includegraphics[width=\linewidth]{imgs/MAE.png}
    \captionof{figure}{Layer segmentation performance summary on \textit{internal} and \textit{external} test set.}
    \label{fig2}
\end{minipage}
\vspace{10pt}






\vspace{-0.75cm}
\section{Experimental Setup}\label{experimental_setup}
\par \textbf{Datasets.} We utilized the public dataset (\textit{internal}) from \cite{farsiu2014quantitative}, comprising 384 OCT volumes from AMD and control participants, with three manually annotated retinal layers: the inner limiting membrane (ILM), the retinal pigment epithelium (RPE), and Bruch's membrane (BM). OCT volumes were acquired using Bioptigen SD-OCT scanners, covering a \(6.7 \text{ mm} \times 6.7 \text{ mm}\) area centered on the fovea, producing volumetric scans of \(1{,}000 \times 512 \times 100\) dimensions. Additionally, we used an external validation dataset (\textit{external}) of 458 B-scans from 159 unique participants from the Rotterdam Study \cite{hofman2007rotterdam}. This set was acquired using a Topcon system, obtaining 512 $\times$ 885 pixels or 512 $\times$ 650 pixels and covering a \(6 \text{ mm} \times 6 \text{ mm}\) area centered on the macula. 





\begin{figure}[t]\vspace{-1cm}
\centering

% First image
\includegraphics[width=\textwidth,height=1.95cm]{imgs/AMD_Subject_1137_slice_48.png}
% \caption{imgs/unp1.png}
% \label{fig:image1}

\vspace{-0.23cm} % Adds vertical space between the images

% Second image
\includegraphics[width=\textwidth,height=1.95cm]{imgs/AMD_Subject_1250_slice_48.png}
% \caption{imgs/unp2.png}
% \label{fig:image2}

\vspace{-0.24cm}


% Second image
\includegraphics[width=\textwidth,height=1.95cm]{imgs/AMD_Subject_1049_slice_52.png}
% \caption{imgs/unp2.png}
% \label{fig:image2}

\vspace{-0.24cm}  % Adds vertical space between the images

% Third image
\includegraphics[width=\textwidth,height=1.95cm]{imgs/AMD_Subject_1046_slice_38.png}
% \caption{imgs/unp3.png}
% \label{fig:image3}

\vspace{-0.24cm}  % Adds vertical space between the images

% Fourth image
\includegraphics[width=\textwidth,height=2.45cm]{imgs/AMD_Subject_1003_slice_76.png}

\vspace{-0.2cm}  % Adds vertical space between the images

\caption{Segmentation examples for \textit{REGR, pREGR, SDF, and pSDF}. The first and second rows contain structural deformation due to AMD, and the third row presents a case of closely packed layers. Finally, the last two rows show instances of Pigment Epithelial Detachment.}
\label{fig3}
\end{figure}





 \textbf{Training and Inference.} The \textit{internal} dataset was divided into distinct sets for training (179 AMD, 71 normal), validation (10 AMD, 10 normal), and testing (80 AMD, 34 normal). A comprehensive suite of randomized data augmentations was applied, including shearing, rotation, zooming, translation, intensity scaling, shifting, contrast adjustment, and the addition of Gaussian noise. For the \textit{REGR} and \textit{pREGR} methods, training uses the ground truth coordinates in the 2D image plane as provided by the dataset. The \textit{SDF} and \textit{pSDF}  methods were trained using signed distance functions, which were constructed from these coordinates (see \ref{target_space}). We use ResUnet++ \cite{8959021} as the backbone architecture for all the SDF models (see \ref{dl_model} \& fig. \ref{schematics} for details). The SDF models were trained with the negative log-likelihood loss with the addition of a clamping function (see \ref{app_loss} for details). For robust extraction of the boundary from the SDFs, we employ a soft boundary extraction mechanism (see \ref{boundryExtract}) instead of directly solving for the zero-level.




\textbf{Experimental Validation.} To validate the effectiveness of our proposed approach, we evaluate the layer segmentation performance of \textit{REGR/pREGR} and \textit{SDF/pSDF} on both \textit{internal} and \textit{external} test sets using the Mean Absolute Error (MAE) in pixels. We  compare our methods against previous studies \cite{sousa2021automatic, lou2020fast, chen2019automated, kugelman2018automatic, santos2019diagnostico,morelle2023accurate}, on the \textit{internal} dataset. Additionally, in our framework, the estimation of uncertainty is geared towards gauging \textit{aleatoric uncertainty}, focusing predominantly on the integrity and visibility of the segmented retinal layers. To validate this, we compute the correlation of the calculated uncertainty with layer integrity and visibility. We expect a higher value of uncertainty in regions where the layer integrity is compromised or visibility is poor. To this end, we measure the average variance for each A-scan in randomly selected regions before and after being synthetically corrupted with noise such as shadow, blinking, speckle, and motion, naturally occurring in OCT scans due to various factors including disease progression and imaging noise. More about the noisy sample generation is detailed in \ref{noise_gen}.

\vspace{-0.30cm}
\section{Result and Discussion}


\vspace{-0.15cm}
\subsection{Layer Segmentation}
 The segmentation performance of the different methods is detailed in Table~\ref{tab01}. Our method, both in its probabilistic (\textit{pSDF}) and non-probabilistic (\textit{SDF}) forms, significantly boosted the performance over the \textit{REGR/pREGR} methods, as evidenced by the improvement of the averaged MAE---approximately $2.38\times$ for the \textit{internal} and $2.4\times$ for the \textit{external} test set in the probabilistic version, and similar improvements in the non-probabilistic one. Moreover, the lower standard deviation suggests improved segmentation precision and consistency across B-scans. We also note that the introduction of uncertainty estimation leads to slightly better segmentation performance. This improvement can be attributed to how uncertainty prediction modulates the residual loss in Eq.~\eqref{eq:nll}. Effectively functioning as a data-adaptive regression modifier, it allows the network to adjust the impact of residuals \cite{Kendall2017-fa}. This is particularly beneficial for reducing the influence of erroneous labels, a common challenge in segmentation tasks, and, consequently, for making the model more robust to noisy data.
 
 
 Fig.~\ref{compare_all} in the appendix compares our method with previous studies; our method surpasses their reported performance. It is important to note, however, that these studies employed different training and test data partitions, so direct comparisons are not possible. Additionally, we present a comparative analysis featuring a pixel-wise segmentation method, alongside our approach, in Appendix~\ref{pixel_seg}.



     Finally, Fig.~\ref{fig3} presents a series of five distinct examples. It is evident that the \textit{SDF/pSDF} approach enhances segmentation precision, particularly in the presence of AMD-related abnormalities. As shown in the first two rows of Fig.~\ref{fig3}, intricate structural deformations (such as drusen) were more accurately captured by \textit{SDF/pSDF}. The third row demonstrates our method's proficiency in handling cases where the retinal layers are closely packed, showcasing enhanced delineation in such challenging scenarios. Furthermore, the last two rows present cases of Retinal Pigment Epithelial Detachment (PED), a pathological condition that appears as domed elevations of the RPE layer \cite{todorich2012treatment}, posing significant segmentation challenges. Our approach handles such cases well compared to \textit{REGR/pREGR}, demonstrating its robustness and improved accuracy.




\begin{figure}[t]\vspace{-1cm}
    \centering
    \includegraphics[width=\textwidth]{imgs/combo_fig_2.png}
    \vspace{-1cm}
    \caption{(a) Average per-A-scan variance with \textit{pSDF} for different types of noise and their non-noisy counterparts. (b) Uncertainty on pathological and noisy conditions with \textit{pSDF}. The dotted lines show $\pm 1$ standard deviation.}
    \label{fig4}
\end{figure}




\vspace{-0.25cm}
\subsection{Uncertainty Estimation}

The quantitative results of our experiment with artificially distorted B-scans are shown in Fig.~\ref{fig4}(a). These findings highlight that our approach consistently registers high uncertainty in the presence of all types of noise. Illustrative examples of such distortion are provided in the Appendix (Fig.~\ref{fig:art_sample}). Additionally, a series of examples in Fig.~\ref{fig4}(b) shows naturally occurring phenomena that affect layer integrity and visibility (the first and second rows contain instances of macular holes; the third and fourth rows show partial layer invisibility). As shown, our model accurately produces larger uncertainty for disrupted or ambiguous regions. We also show a comparison with other popular uncertainty estimation approaches (namely Monte Carlo Dropout \& Deep Ensemble), highlighting the superiority of our approach (see \ref{un_comparison}). These results suggest that the quantified uncertainty in our model transcends mere segmentation confidence, potentially flagging pathological alterations tied to layer integrity. It can be particularly valuable for diseases affecting structural integrity, such as retinitis pigmentosa (RP) and geographic atrophy (GA), which especially cause disruption of the ellipsoid zone and thinning or attenuation of the retinal pigment epithelium, respectively. This uncertainty measurement can effectively highlight areas where the retinal structure is compromised, offering deep insights into disease progression.






\section{Conclusion}
In this paper, we concentrate on advancing retinal layer segmentation in OCT B-scans by focusing on two main aspects: improved segmentation of thin layers and enhanced uncertainty quantification. We introduce an SDF-based approach to achieve better layer shape representation, addressing the shortcomings of previous approaches and leading to improved segmentation. This representation also inherently leads to a more accurate estimation of the uncertainty concerning the layer shape, offering a deeper understanding of layer integrity and visibility in pathological and noisy conditions.





\bibliography{midl24_218}


\appendix
\newpage
\section{Additional Materials}
\subsection{Constructing Target 
Space}\label{target_space}






The ground truth distance function, which serves as the target for the neural network (NN) training, is generated from the segmented layers within 2D B-scans provided with the \textit{internal} dataset. These layers, delineated by lines that denote boundaries, are transformed into distance functions. Since calculating the signed distance function (SDF) given a segmentation mask is non-trivial \cite{zhao2005fast}, we use Danielsson's algorithm \cite{danielsson1980euclidean} to accomplish it. Note that by using Danielsson's algorithm for converting binary masks into signed distance maps, we ensure adherence to the Eikonal constraint, $\forall_{x,y}: \lVert \frac{\partial}{\partial y} d(x,y)\rVert = 1$, where $d$ represents the distance function. This algorithm computes the Euclidean distance from each pixel to the nearest object boundary, satisfying the Eikonal equation by maintaining a gradient magnitude of one in proximity to boundaries. Hence, our model implicitly learns to uphold the constraint, as the training data generation process inherently respects this fundamental geometric condition.
 The process is defined as follows:

In the domain $\Omega \subset \mathbb{R}^2$, the distance function $d^* : \mathbb{R}^2 \rightarrow \mathbb{R}$ assigns each pixel $p \in \mathbb{R}^2 $ the shortest distance to the boundary $\partial\Omega $. For a binary mask $M$ indicating segmented layers by line pixels $\partial\Omega \subset M$, the function $d^*$ assigns each $p \in M$ the distance to its nearest point in $\partial\Omega$. Formally, this is given by $ d^*(p) = \min_{q \in \partial\Omega} \| p - q \|$, where \( \| \cdot \| \) denotes the Euclidean norm. Since the structures delineated by \(\partial \Omega \) do not enclose any area, there are no inherent `interior' or `exterior' regions, and the initial computation therefore yields no signed distances: all distances are non-negative. To incorporate the notion of sign, we introduce an orientation to the line \( \partial\Omega \) by defining a directional vector \( \vec{v} \). This vector establishes a `positive' direction above the line and a `negative' direction below the line. The signed distance function \( d^{gt} \) is then constructed by assigning a sign to the distances in \( d^* \) based on the position of each pixel relative to \( \partial\Omega \). This is expressed as:

\begin{equation}
d^{gt}(p) = 
\begin{cases}
d^{*}(p) & \text{if } (p - q) \cdot \vec{v} \geq 0 \text{ for the closest } q \in \partial\Omega,\\
-d^{*}(p) & \text{otherwise}.
\end{cases}
\end{equation}

In this way, \( d^{gt}(p) \) represents the SDF, with the zero level set corresponding precisely to the line \( \partial\Omega \), pixels above the line having positive values, and those below having negative values. The NN is thus trained to approximate this SDF, learning the spatial relations encoded within the signed distance function.


\begin{figure}[]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {schematics}
  {\caption{Schematics for regression-based and our approach.}}
  {\includegraphics[width=\linewidth]{imgs/supp_fig_rgb2_biggerfont_c.pdf}}
\end{figure}



\subsection{Soft Boundary Extraction} \label{boundryExtract}
Now having the SDF $d$ predicted by the network, we retrieve the retinal layer boundary coordinate by first applying a Gaussian mask to generate a weight mask \( W \) for the whole OCT B-scan. The mask is calculated using the formula:

\begin{equation}
W_{xy} = \exp\left(-\frac{1}{2} \left(\frac{d(x,y) - \textit{level}}{s_{const}}\right)^2\right),
\end{equation}

\noindent
where \textit{level} is the SDF level that defines the boundary (typically zero) and \(s_{const}\) is a predefined value. This masking operation serves to highlight the regions in the  B-scan where the retinal layer boundary is likely to be located. Then the boundary is extracted by applying a weighted averaging scheme to the pixel indices, using \(W\). For each column \( x \) (corresponding to each A-scan), the boundary coordinate \( y^{sdf}_x \) is computed as:

\begin{equation}
y^{sdf}_x = \frac{\sum_{y} y \cdot W_{xy}}{\sum_{y} W_{xy}},
\end{equation}

\noindent where \( y \) runs over all pixel indices in the column. This essentially transforms the implicit SDF representation into explicit coordinates, pinpointing the retinal layer boundaries.



 \subsection{Deep Learning Model} \label{dl_model} 
 For the \textit{REGR} approach, we modify the ResUnet++ architecture to regress the retinal layer boundary coordinates, producing an output of \(3 \times 512\) for \(512 \times 512\) input B-scans, as shown in Fig.~\ref{schematics}(a). This is primarily done by a non-expanding decoder for one of the spatial dimensions as explained by \cite{Liefers2019-md}. We use a significantly smaller model of \(70\) million parameters, in contrast to the 187 million parameters in \cite{Liefers2019-md}, yet experiments with their proposed architecture yield comparable results. In the \textit{SDF} approach, we adhere closely to the original ResUnet++ configuration, except for outputting three signed distance functions that mirror the input's spatial dimensions. For the uncertainty-aware variants, \textit{pREGR} and \textit{pSDF}, a new prediction head is integrated to predict the mean and variance for each column and pixel location, as shown in Fig.~\ref{schematics}(b) and Fig.~\ref{schematics}(d), respectively. Despite the architectural variations in \textit{REGR/pREGR} and \textit{SDF/pSDF}, we try to keep the models as close as possible to facilitate a fair comparison between methods.




\subsection{Loss Function}\label{app_loss}
In alignment with our methodology, we optimize the Negative Log-Likelihood (NLL) across all models, both probabilistic and deterministic. For the \textit{REGR} approach, the NLL manifests simply as mean squared error (MSE) under the assumption of a Gaussian distribution with fixed variance. For \textit{SDF}, however, we slightly deviate since empirical observations indicate that an L1 loss results in superior segmentation fidelity compared to an L2 loss. Additionally, to concentrate the capacity of the network on details near the surface~\cite{park2019deepsdf}, we introduce a `clamp' function, restricting the error within a specified range:
\begin{equation}
    \mathcal{L}_{\text{SDF}}(\theta) = \sum_{n,x,y} | \text{clamp}(d^{gt}(x,y),\delta) - \text{clamp}(f^{sdf}[I_n;\theta](x,y),\delta) |,
\end{equation}
The function $ \text{clamp}(x, \delta) := \min(\delta, \max(-\delta, x)) $ introduces the parameter $\delta$ to control the distance from the surface over which we expect to maintain a metric SDF.





\begin{figure}[t]\vspace{-1cm}
    \centering
    \includegraphics[width=\textwidth]{imgs/disconnected.png}
    \caption{Examples of prediction with the pixel-wise segmentation approach.}
    \label{disconnected}
\end{figure}




 In the probabilistic counterpart, we optimize the NLL given in Eq.~\eqref{eq:nll}. Again, for \textit{pSDF}, we adapt it to its clamped version as follows:
\begin{equation}
\footnotesize
    \mathcal{L}_{\text{pSDF}}(\theta) =\sum_{n,x,y} \frac{1}{2} \left[ \frac{(\text{clamp}(d^{gt}(x,y),\delta) - \text{clamp}(f^{sdf}_{\mu}[I_n;\theta](x,y),\delta))^2}{f^{sdf}_{\sigma}[I_n;\theta]^2(x,y)} + \ln f^{sdf}_{\sigma}[I_n;\theta]^2(x,y) \right],
\end{equation}


\noindent Here the chosen $\delta$ values are typically between 28 and 30. We empirically find that a larger clamp value results in less precise segmentation boundaries, whereas a smaller value leads to a generalization problem resulting in overall reduced performance, especially on the \textit{external} dataset.








\subsection{Noisy Sample Generation} \label{noise_gen}
Here we explain in detail how we generate the noisy samples with different types of noise and which naturally occurring phenomena in OCT are analogous to these noises.

\begin{figure}[t]\vspace{-1cm}
    \centering
    \includegraphics[width=\textwidth]{imgs/A_scan_norm.png}
    \caption{Examples of prediction with the pixel-wise segmentation approach. Predicted probabilities are normalized along each A-scan.}
    \label{a_scan_norm}
\end{figure}

\vspace{0.1cm}

\textbf{Shadow artifact.}  Shadowing can naturally occur due to blockages or opacities in ocular media, often caused by cataracts or hemorrhages, leading to darkened or obscured regions in the scans \cite{duker2021handbook}. 

To add the shadow artifact within a specific B-scan, we adhere to the procedure outlined by \cite{de2023uncertainty}. For any given B-scan, each constituent A-scan \( a_x \) undergoes an individualized modification process. Here, \( x \) spans the totality of A-scans, collectively numbered as \( \mathcal{X} \). The transformation of each A-scan \( a_x \) is governed by the shadow function \( S \), defined as \( S(a_x) = a_x \cdot (1 - s(x)) \), where the term \( s(x) \) is derived from a Gaussian probability density function: 
$ s(x) = \frac{1}{\sigma \sqrt{2\pi}} e^{-\frac{1}{2} \left(\frac{x - \mu}{\sigma}\right)^2}.$ Parameters \( \mu \) and \( \sigma \) are randomly determined within the bounds [0, \( \mathcal{X} \)] and \( \left[\frac{\mathcal{X}}{4}, \frac{3\mathcal{X}}{4}\right] \) respectively.


\vspace{0.1cm}



\begin{figure}[t]\vspace{-1cm}
    \centering
    \includegraphics[width=\textwidth]{imgs/mcdo_ensemble_uncertainty_example.png}
    \caption{Uncertainty estimation examples for MCDO, Deep ensemble, and pSDF (ours). The dotted line
shows $\pm 1$ standard deviation. In the first row, we expect to see high uncertainty only in the region where the macular hole (layer not present) is. However, MCDO shows high uncertainty in a region that is substantially smaller than the size of the macular hole, while the Deep Ensemble method exhibits overestimation (highlighted with yellow arrows). Conversely, the region with high uncertainty from our approach is accurately co-located with the macular hole. Furthermore, the Deep Ensemble introduces unwanted high uncertainty areas within the BM and RPE layers---marked by a red arrow---despite clear imaging features. This issue recurs in the second and fourth rows (yellow arrows). In the third row, a segment of the ILM near the yellow marker is obscured due to inferior image quality, where a consistently high uncertainty was anticipated. Instead, both Deep Ensemble and MCDO exhibit alternating high and low uncertainty patterns. Similarly, the fifth row evidences unexpected high uncertainty regions produced by both MCDO and Deep ensemble (yellow arrows).}
    \label{uncertain_example}
\end{figure}


\textbf{Blinking artifact:} Blinking artifacts are analogous to motion artifacts caused by patient movement or blinking during the scan, resulting in discontinuities or stripes in the B-scan~\cite{chhablani2014artifacts}.

To introduce artificial blinking artifacts into B-scans, we commence with a B-scan where all pixel intensities are set to zero. Subsequently, we superimpose additive Gaussian noise onto this B-scan. The mean of the Gaussian distribution is set to the median pixel value observed across the entire OCT scan, while the standard deviation is matched to the standard deviation of the OCT scan itself.

\vspace{0.1cm}
\textbf{Speckle artifact:} Speckle is an inherent property of OCT imaging, arising from the coherent nature of the light source and the interference of backscattered light, manifesting as granular noise that can mask underlying retinal structures~\cite{curatolo2013speckle}.

To replicate the characteristic granularity of speckle noise in B-scans, a binary modification of pixel intensities is employed. A probability \( p \), uniformly selected from the interval [0.4, 0.6], determines the binary state of each pixel. Accordingly, pixels are stochastically assigned either the maximum or minimum intensity values present in the  B-scan, thereby producing the sharp, speckled contrasts characteristic of speckle noise.

\vspace{0.1cm}
\textbf{Motion artifact:} Similar to blinking, motion noise in OCT can be attributed to micro-movements of the patient or the eye during scanning, leading to blurring or smearing of the  B-scan \cite{chhablani2014artifacts}.

To simulate motion artifacts in B-scans, we implement a localized pixel displacement. For each horizontal cross-section of the B-scan, a random shift is applied, with the shift magnitude uniformly drawn from the interval \([- \Delta_{max}, \Delta_{max}]\), where \( \Delta_{max} \) denotes the maximum allowed pixel shift. This shifting process introduces a realistic representation of motion blur, akin to the motion artifacts.



\begin{figure}[t]\vspace{-0.5cm}
    \centering
    % \includegraphics[width=\textwidth]{imgs/mcdo_ensemble_uncertainty_bar.png}
    \includegraphics[width=\textwidth]{imgs/alluncmethods_uncertainty.pdf}
    \caption{Avg. per A-scan variance for different types of noise and their non-noisy counterparts, shown for the three different uncertainty estimation techniques (a) MCDO, (b) Deep Ensemble, and (c) pSDF.}
    \label{uncertain_bar}
\end{figure}



\section{Additional Results}

\subsection{Pixel-wise Segmentation}\label{pixel_seg}
A pixel-wise model has also been trained to evaluate performance against our proposed approach. We use the same architecture described for the \textit{SDF} approach and a cross-entropy loss to train the model. However, this immediately leads to the problem shown in Fig.~\ref{disconnected}, as some of the pixels that belong to the layer boundaries are classified as background pixels. The problem highlights the fact that, due to the thin layer structure, a pixel-wise cross-entropy loss fails to provide a robust supervisory signal to train the model properly, as discussed in Section~\ref{intro}. The problem of disconnected lines can partially be alleviated by simply applying softmax to each A-scan (per column of the 2D image) and taking the coordinate with the highest probability. This ensures a single coordinate for each layer per A-scan and allows us to compute the mean absolute error to compare with other approaches. However, this approach is also not stable in terms of prediction, as it shows the oscillating behavior shown in Fig.~\ref{a_scan_norm}. Tab.~\ref{tab02} outlines the performance of this approach, which performed $\approx$7$\times$ and $\approx$9$\times$ worse than our proposed method on the \textit{internal} and \textit{external} datasets, respectively.



\subsection{Uncertainty Comparison}\label{un_comparison}
We compare our uncertainty estimation with more widely used approaches such as Monte Carlo Dropout (MCDO) \cite{kendall2017uncertainties} and Deep Ensemble \cite{lakshminarayanan2017simple}. We again use the same experimental setup as our proposed approach. Additionally, to estimate the performance we consider 5 ensemble members trained from scratch for deep ensemble, and we sample 15 times for each input image for MCDO. We make the following observations when we compare to our method.


\textbf{Segmentation Performance.} The segmentation performance of the Deep Ensemble aligns closely with that of our method, as detailed in Tab. \ref{tab02}. Conversely, the Monte Carlo Dropout (MCDO) approach experienced a minor decline in performance across both internal and external datasets. 


\begin{minipage}{\textwidth}
    \vspace{30pt}
    \fontsize{9.5pt}{9.5pt}\selectfont
    \centering
    \captionof{table}{Layer segmentation performance for additional methods (measured with mean absolute error (MAE) in pixels).}
    \label{tab02}
    \begin{tabular}{llll}
    Method & Layer & \multicolumn{2}{c}{MAE} \\
    & & (internal) & (external) \\
    \toprule
              & ILM  & $1.47_{\pm 2.93}$ & $8.55_{\pm 6.47}$ \\
    Pixel-wise& RPE  & $2.51_{\pm 3.77}$ & $16.39_{\pm 11.09}$     \\
              & BM   & $6.86_{\pm 10.54}$ & $10.01_{\pm 6.57}$       \\ 
              & Avg. & 3.62                 & 11.65  \\
    \midrule
              & ILM  & $0.32_{\pm 0.43}$ & $1.10_{\pm 1.10}$   \\
    Deep ensemble & RPE  & $0.58_{\pm 0.36}$ & $1.60_{\pm 1.46}$     \\
              & BM   & $0.77_{\pm 0.36}$ & $1.39_{\pm 0.96}$       \\ 
              & Avg. & 0.56              & 1.37  \\
    \midrule
              & ILM  & $0.35_{\pm 0.40}$ & $1.25_{\pm 0.70}$  \\
    MCDO      & RPE  & $0.63_{\pm 0.41}$ & $1.52_{\pm 1.33}$     \\
              & BM   & $0.79_{\pm 0.32}$ & $1.36_{\pm 0.77}$       \\ 
              & Avg. & 0.59              & 1.38  \\
    \midrule
              & ILM  & $0.32_{\pm 0.44}$ & $1.63_{\pm 0.78}$ \\
    pSDF (Our)& RPE  & $0.59_{\pm 0.40}$ & $1.02_{\pm 1.47}$     \\
              & BM   & $0.75_{\pm 0.29}$ & $1.27_{\pm 0.82}$       \\ 
              & Avg. & \textbf{0.55}     & \textbf{1.30}  \\
    \bottomrule
    \end{tabular}
\end{minipage}




\begin{figure}[]
    \centering
    \includegraphics[width=0.8\textwidth]{imgs/mae_comparison_sidebar.png}
    \caption{Layer segmentation performance comparison with other studies, measured with mean absolute error (MAE) in pixels. FCN-GS \cite{kugelman2018automatic}, CapsNet \cite{santos2019diagnostico}, DeepForest \cite{chen2019automated}, Wave \cite{lou2020fast}, DexiNed \cite{sousa2021automatic}, REGR \cite{Liefers2019-md}, OCRegression \cite{morelle2023accurate}.}
    \label{compare_all}
\end{figure}

\vspace{0.5cm}

\textbf{Better Uncertainty.} To compare the uncertainty with our approach, we measure the average variance for each A-scan in randomly selected regions before and after being synthetically corrupted with noise patterns, as explained in Section~\ref{experimental_setup}. The results can be found in Fig.~\ref{uncertain_bar}(a--b). We note here that both MCDO and Deep ensemble remain very sensitive to shadow and blinking noise but insensitive to other types of noise, leading to overall unbalanced uncertainty estimation. In contrast, our method produces more balanced uncertainty across the board, as shown in Fig.~\ref{uncertain_bar}(c). Further qualitative evaluation also reveals that both MCDO and Deep ensemble often produce erroneous uncertainty estimates (Fig.~\ref{uncertain_example}). For example, in cases where the layers are well visible, high uncertainties can be produced by these approaches. Furthermore, the extent of regions in which higher uncertainty is expected due to certain pathologies or imaging artifacts is often underestimated or overestimated.



\textbf{Computational Cost.} For the Deep Ensemble, we deploy an ensemble of five members, each requiring independent training. This necessitates executing five forward passes during inference, quintupling the computational demand for both the training and inference phases. Additionally, the requirement to store the weights of five distinct models increases the memory footprint. The MCDO technique, in turn, predominantly inflates the computational cost during inference, which in our implementation increases fifteenfold as we perform fifteen sampling operations for each input image.



\begin{figure}[t]
\centering

% First image
\vspace{-0.5cm}
\includegraphics[width=\textwidth]{imgs/shadow.png}
 \vspace{-0.25cm}
\includegraphics[width=\textwidth]{imgs/motion.png}
 \vspace{-0.25cm}
\includegraphics[width=\textwidth]{imgs/blinking.png}
 \vspace{-0.25cm}
\includegraphics[width=\textwidth]{imgs/speckle.png}



\caption{Prediction with uncertainty for artificially distorted samples (MAE = mean absolute error).}
\label{fig:art_sample}
\end{figure}

\end{document}
