\documentclass{midl} 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% IMPORTS
\usepackage[nohyperlinks,nolist]{acronym}
\usepackage{booktabs}
\usepackage{multirow}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 396}
\editors{Accepted for publication at MIDL 2026}

\title[Heteroscedastic Regression for Reliable Pectoral Muscle Segmentation]{Heteroscedastic Heatmap Regression for Reliable Pectoral Muscle Segmentation in Mammography}


\midlauthor{\Name{Paul Zech\nametag{$^{1,2}$}} \orcid{0009-0005-9342-3225} \Email{p.zech@siemens-healthineers.com}\\
\Name{Christian H\"{u}mmer\nametag{$^{1}$}} \Email{christian.huemmer@siemens-healthineers.com}\\
\Name{Benjamin El-Zein\nametag{$^{1,2}$}} \Email{benjamin.el-zein@siemens-healthineers.com}\\
\Name{Christopher Syben\nametag{$^{1}$}} \Email{christopher.syben@siemens-healthineers.com}\\
\Name{Ludwig Ritschl\nametag{$^{1}$}} \Email{ludwig.ritschl@siemens-healthineers.com}\\
\Name{Steffen Kappler\nametag{$^{1}$}} \Email{steffen.kappler@siemens-healthineers.com}\\
\Name{Sebastian Stober\nametag{$^{2}$}} \Email{stober@ovgu.de}\\
\addr $^{1}$ Siemens Healthineers AG, Forchheim, Germany \\
\addr $^{2}$ Otto-von-Guericke University, Magdeburg, Germany
}

\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% define acronyms
\begin{acronym}
    \acro{FCN}{fully-convolutional network}
    \acro{LL}{Log-Likelihood}
    \acro{MC}{Monte-Carlo}
    \acro{MLO}{mediolateral oblique}
    \acro{PM}{pectoral muscle}
    \acro{I2I}{image-to-image}
    \acro{CI}{column-index}
    \acro{NLL}{negative log-likelihood}
    \acro{MSE}{mean-squared-error}
    \acro{MAE}{mean-absolute-error}
    \acro{RMSE}{root-mean-squared-error}
    \acro{MSTD}{mean-standard-deviation}
\end{acronym}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\maketitle

\begin{abstract}
Breast cancer remains a leading cause of mortality worldwide, making accurate mammography screening essential for early detection. An important preprocessing step in mammography is the accurate segmentation of the pectoral muscle, as it affects downstream tasks such as breast density estimation or automated exposure control. Existing automated segmentation methods, both traditional and deep learning-based, often lack reliable confidence measures, which becomes especially problematic in the presence of occlusions or visually confounding structures such as skin folds or other muscle fibers. To address this limitation, we propose a probabilistic framework that combines heatmap-based boundary regression with heteroscedastic uncertainty estimation to capture input-dependent variability. Our approach not only predicts the pectoral muscle boundary but also quantifies the associated uncertainty. While mainly producing unimodal predictions, the probabilistic heatmaps reveal multimodal patterns for confounding structures, further enhancing transparency in challenging cases. We demonstrate that our method provides robust and transparent means to achieve accurate segmentation while producing meaningful uncertainty estimates.
\end{abstract}

\begin{keywords}
Pectoral muscle segmentation, heteroscedastic regression, aleatoric uncertainty
\end{keywords}

\section{Introduction}
Breast cancer remains one of the most prominent cancer types, especially in young and middle-aged women \citep{siegel2016cancer, Ren_Chen_Qiao_Zhao_2022}. To improve early detection, the World Health Organization recommends regular mammography screening in this population \citep{World-Health-Organization_2014}. One standard projection for breast tissue characterization is the \ac{MLO} view in mammograms. Besides breast tissue, \ac{MLO} projections also capture the \ac{PM}, which typically appears as a bright and dense region in the upper corner of the image. Accurate segmentation of the \ac{PM} constitutes an essential preprocessing step in the analysis of mammography images. For instance, breast positioning control systems use the \ac{PM} as a key anatomical landmark \citep{brahim_automated_2022}. Moreover, \ac{PM} segmentation is important for automated patient-specific calibration of the radiation exposure, which is computed on a previously acquired low-dose preshot. Accurate exclusion of the \ac{PM} is essential to prevent the dose from being calibrated to the dense \ac{PM} tissue, which would otherwise result in elevated radiation and overexposure. To automate the \ac{PM} segmentation task, many different traditional and deep learning-based algorithms have been developed. However, the proposed solutions provide no or insufficient measures of confidence for their predictions \citep{rampun_breast_2019, ma_automated_2019}. This is particularly problematic in \ac{PM} segmentation, as the \ac{PM} may be obscured by dense glandular tissue or confused with structurally similar skin folds and other muscle fibers. In such ambiguous cases, uncertainty modeling is essential to identify potentially unreliable local segmentation results, especially in high-noise environments such as low-dose preshots. 

To address these challenges, we propose a novel framework for \ac{PM} segmentation that explicitly accounts for multiple sources of input noise and ambiguities. We reformulate the task as contour regression, modeling the contour positions as probability distributions that capture both the expected contour position and its associated uncertainty. We predict these distributions jointly by means of a single probabilistic heatmap using an \ac{I2I} architecture. Our method builds on previous work \citep{huemmer_improved_2024} with substantial technical improvements, which we validate through a comprehensive experimental evaluation.

\section{Related work}
\label{Sec:rel_work}

\textbf{\ac{PM} segmentation: } Early developments leverage prior knowledge about the shape of the \ac{PM} boundary by detecting a straight line \citep{Karssemeijer_1998} or fitting active contours to the \ac{PM} boundary \citep{ferrari2004identification, Wang_Zhu_Deng_Yuan_2010}. Other traditional algorithms can be categorized into line detection, intensity-based, wavelet-based, and statistical techniques as summarized in \citet{Ganesan_Acharya_Chua_Min_Abraham_2013}. However, these methods rely on extensive pre- and postprocessing as well as feature engineering. 
 
Recent advances in deep learning overcome these limitations by learning hierarchical representations directly from data using deep neural networks. Some approaches address the task through pixel-wise classification of the \ac{PM} tissue. Architectures such as U-Net \citep{navab_u-net_2015} and its variants have been widely adopted for this purpose \citep{ma_automated_2019, liu_utilizing_2020}, with extensions that incorporate adversarial training to improve robustness and anatomical plausibility \citep{guo_automatic_2020}. While pixel-wise classification can model highly complex shapes, this flexibility is not required for \ac{PM} segmentation in \ac{MLO} images: since the muscle occupies a well-defined position in the upper image corner, its area is fully 
determined by the outer contour alone. Therefore, contour-based methods can focus 
on the clinically challenging part of delineating the \ac{PM} boundary, which may 
overlap with dense breast tissue. These methods are based on edge detection or boundary-aware strategies, such as VGG16-based contour detection \citep{soleimani_segmentation_2020}, U-Net adaptations for boundary extraction \citep{angelone_u-net_2025}, and Holistically-Nested Edge Detection \citep{rampun_breast_2019}. By restricting the prediction task to the \ac{PM} boundary, the solution space is reduced, simplifying the learning problem while still fully defining the muscle region \citep{angelone_u-net_2025}. Building on this idea, \citet{huemmer_improved_2024} exploited anatomical prior 
knowledge by showing that the \ac{PM} boundary admits a unique functional mapping from image rows to column indices. This reformulation enables segmentation to be performed as a row-wise \ac{CI} regression, inherently encouraging continuity and structural consistency of the predicted \ac{PM} boundary. The authors were able to show that this approach notably outperforms a standard pixel-wise classification baseline in terms of segmentation performance, parameter efficiency, and inference time. However, despite their high accuracy, the lack of a robust measure of uncertainty remains a key limitation of these solutions. 


\textbf{Uncertainty quantification:} Following \citet{kendall_what_2017}, uncertainty arises from two sources: (1) epistemic uncertainty, due to limited training data and uncertain model parameters, and (2) aleatoric uncertainty, caused by noise or ambiguity in the input. Aleatoric uncertainty can further be classified as homoscedastic (constant across inputs) or heteroscedastic (input-dependent). 
However, in medical imaging, most methods estimate the overall predictive uncertainty rather than explicitly modeling either one \citep{lambert_trustworthy_2024}. Common strategies involve modeling a predictive distribution through \ac{MC} dropout with multiple stochastic forward passes at test time \citep{pmlr-v48-gal16, crimi_towards_2018}, or through ensembles of differently initialized models \citep{mehrtash_confidence_2020}. In \ac{PM} segmentation, similar strategies have been applied using \ac{MC} dropout \citep{klanecek_uncertainty_2023} or deep ensembles, either from model snapshots along the training trajectory \citep{tang_uncertainty-aware_2025} or from models trained on different data distributions \citep{huemmer_improved_2024}. However, these methods do not model input-dependent, heteroscedastic uncertainty to capture the aforementioned input ambiguities. 
A straightforward approach to model this uncertainty is to use the inter-rater variability as ground truth for the uncertainty in a supervised setting as done in \citet{crimi_meta-learning_2022}. A more scalable approach is to implicitly learn heteroscedastic uncertainty from the data itself. The underlying idea is to predict the mean and variance of the predictive distribution, which is learned by maximizing the \ac{LL} within a heteroscedastic framework \citep{lambert_trustworthy_2024}. This is usually achieved by adding the variance as a separate output to the mean predictions and has been successfully applied to segmentation \citep{sudre_hierarchical_2020} and regression tasks \citep{seitzer_pitfalls_2022}. Nevertheless, to the best of our knowledge, heteroscedastic uncertainty modeling has not been applied to \ac{PM} segmentation yet. 

\textbf{Heteroscedastic heatmap regression:} The detection of a contour is closely related to coordinate regression, as a contour can be represented by a finite set of points. A common strategy for coordinate regression is heatmap-based regression, in which coordinate locations are encoded as Gaussian distributions within spatial heatmaps. In its simplest form, the network is trained to regress these target Gaussians centered at the ground truth coordinates using a heatmap matching objective \citep{Zhang_Hu_Feng_2020}. This approach has also been applied in the context of heteroscedastic uncertainty modeling. For instance, \citet{thaler_modeling_2021} demonstrated that the learned heatmaps can be interpreted as pseudo-probability distributions that can be used to quantify uncertainty. However, their approach does not explicitly model heteroscedasticity during training, but instead fits a Gaussian model to the learned heatmaps during inference. Most other methods that model heteroscedastic uncertainty do not represent uncertainty as a full probability distribution over the heatmap. Instead, they predict parameterized Gaussians through mean and variance \citep{seitzer_pitfalls_2022, shukla_tic-tac_2024}, thereby imposing a strong unimodal Gaussian assumption. However, this assumption is not necessarily valid in \ac{PM} segmentation, where confounding structures such as skin folds can lead to multimodal uncertainty patterns. To our knowledge, only \citet{kumar_luvli_2020} attempted to model full heteroscedastic distributions directly in the heatmap in the context of landmark detection, but reported instabilities due to the limited spatial resolution of current heatmap-based frameworks.

\textbf{Our contribution:} In this study, we advance the idea of \citet{huemmer_improved_2024}, who modeled \ac{PM} segmentation as row-wise \ac{CI} regression of the \ac{PM} boundary. To allow uncertainty in the contour predictions, we replace the discrete \ac{CI} vector with a probabilistic heatmap where each row represents a full probability distribution over possible contour positions. We demonstrate that the probabilistic heatmaps can be learned using an \ac{I2I} network trained with a heteroscedastic loss function and show that robust uncertainty estimates can be derived from the predicted heatmaps' distributions. We further show that the prediction of full row-wise distributions enables the identification of uncertain cases where the underlying assumption of unimodality on the errors is violated. Last, we perform a rigorous evaluation in terms of method configuration, segmentation performance, and ability to quantify uncertainty. A preliminary version of this work was presented in \citet{zech2025uncertaintyaware}; this paper presents the full, extended study. 


\section{Methods}
An overview of our method is depicted in Figure~\ref{fig:pipeline}. Input images are first processed by a U-Net to produce a spatial heatmap, which is then converted into a probabilistic heatmap using a row-wise softmax operation. The contour prediction and its associated uncertainty are subsequently extracted as mean and variance, which are used for training with a heteroscedastic loss function. An additional regularizer ensures that the model produces valid probability distributions. All steps are detailed below. 
\begin{figure}[ht]
    \centering
    \includegraphics[width=1.0\textwidth]{figures/Pipeline_n.pdf}
    \caption{Overview of our method, with label-extrapolated input (left), predictive heatmap (center) and boundary prediction with uncertainty band (right).}
    \label{fig:pipeline}
\end{figure}

\textbf{Probabilistic heatmap regression:} First, the input image $\mathbf{I} \in \mathbb{R}^{H \times W}$ is processed by an \ac{I2I} network to generate a spatial heatmap $h$. For this task, we employ a U-Net as described in the original publication \citep{navab_u-net_2015}, as it is widely adopted and well established in medical image analysis. To obtain the probabilistic heatmap, we convert each row into a pseudo-probability distribution by applying a row-wise softmax operation
\begin{equation}
    \hat{h}_{i,j} = \text{Softmax}(h_{i,j}) = \frac{\exp(h_{i,j})}{\sum_{w=0}^{W-1} \exp(h_{i,w})}, 
\label{equ:softmax}
\end{equation}
where $\hat{h}_{i,j}$ is the softmax-activated heatmap $h$ at row $i \in \{0, \ldots, H-1\}$ 
and column $j \in \{0, \ldots, W-1\}$, and index $w$ runs over all columns of row $i$. In this probabilistic heatmap, we define the \ac{PM} contour prediction as the row-wise mean of the learned probability distributions. Accordingly, we define the uncertainty as a row-wise variance over the predicted boundary positions. The row-wise mean and variance are computed as first- and second-order moments directly from the probabilistic heatmap as
\begin{equation}
    \hat{\mu}_i = 
     \sum_{j=0}^{W-1}  
     j\cdot \hat{h}_{i,j}, \qquad
    \hat{\sigma}^2_i = \sum_{j=0}^{W-1}  
    \hat{h}_{i,j} \cdot (j - \hat{\mu}_i)^2
\label{equ:meanvar}
\end{equation}
with mean $\hat{\mu}_i$ as the \ac{PM} boundary prediction and variance $\hat{\sigma}^2_i$ as a measure of the uncertainty for row $i$. The mean computation in Equation~\eqref{equ:meanvar} corresponds to the soft-argmax operation \citep{luvizon_human_2017}, which has proven effective in different scenarios, as it leverages the spatial generalization of \acp{FCN} \citep{nibali_numerical_2018}. 

To model heteroscedastic uncertainty in the probabilistic heatmap, we train the network by minimizing the \ac{NLL} derived from either a Laplace ($\mathcal{L}$) or a Normal ($\mathcal{N}$) distribution, both well known for this purpose \citep{kumar_luvli_2020}, with
\begin{align}
  \text{NLL}_{\mathcal{N}} := \frac{1}{H} \sum_{i=0}^{H-1} \left( \frac{(y_i - \hat{\mu}_i)^2}{2\hat{\sigma}^2_i} + \frac{1}{2}\log(2\pi\hat{\sigma}^2_i) \right) \quad \text{and}
\label{equ:loss_normal}
\end{align}
\begin{align}
  \text{NLL}_{\mathcal{L}} := \frac{1}{H} \sum_{i=0}^{H-1} \left( \frac{|y_i - \hat{\mu}_i|}{\hat{b}_i} + \log(2\hat{b}_i) \right),
\quad \text{where } \hat{b}_i =    \sqrt{\frac{\hat{\sigma}^2_i}{2}}.
\label{equ:loss}
\end{align}

Here, $y_i$ refers to the ground truth contour position at row $i$. In this setting, the shape of the heatmaps' distributions is only weakly supervised, since infinitely many different distributions can yield the same first- and second-order moments, which may lead to unstable training behavior. To address this, we extend the loss function with a regularization term that enforces a Gaussian or Laplacian shape on the predicted distributions, thereby stabilizing soft-argmax-based training as demonstrated by \citet{nibali_numerical_2018}. Unlike static variance regularization, our approach employs a variable regularization where both the mean and variance are controlled by the heteroscedastic loss (see Figure~\ref{fig:pipeline}). In more detail, we define the regularizer as 
\begin{align}
  \mathcal{L}_{\text{reg}} &=  \frac{1}{H} \sum_{i=0}^{H-1} \mathcal{D}_{\text{JS}}(\mathbf{\hat{h}}_i\|\ \mathcal{Q}_i) 
  \quad \text{with} \quad \mathcal{Q}_i =
  \begin{cases} 
    \mathcal{N}(\hat{\mu}_i, \hat{\sigma}^2_i) & \text{(Gaussian)} \\[8pt]
    \mathcal{L}(\hat{\mu}_i, \hat{b}_i) & \text{(Laplace),}
  \end{cases}
\label{equ:reg}
\end{align}
 where $\mathcal{D}_{\text{JS}}(\mathbf{\hat{h}}_i\|\ \mathcal{Q}_i)$ denotes the Jensen--Shannon divergence between $\mathbf{\hat{h}}_i$, defined as the softmax-activated heatmap in row $i$, and a template distribution $\mathcal{Q}_i$, which is constructed from the predicted heatmaps' statistics in Equation~\eqref{equ:meanvar}. The regularizer, scaled by a constant factor $\lambda$ to control its strength, is added to the heteroscedastic loss to encourage distributions that align with the probabilistic assumptions of the loss function. 

 \textbf{Detecting multimodality:} As previously discussed, many uncertainty modeling approaches assume unimodal error distributions (Sec.~\ref{Sec:rel_work}). However, this assumption might not hold for all anatomical situations. For instance, confounding structures can induce bi- or multimodal patterns in the predictive distributions, as multiple structures may represent plausible anatomical interpretations of the \ac{PM}. Since our method predicts full probability distributions, it enables identifying such cases where the underlying unimodal assumption is violated. This is achieved by classifying whether there exists at least one row in the predicted heatmap that contains more than one peak.
In this setting, the regularizing term serves as a soft constraint that controls the trade-off between enforcing row-wise unimodal predictions for accurate training and allowing the model to express genuine multimodality in the predictions.

\textbf{Label extrapolation:} 
Representing the \ac{PM} target contour as a \ac{CI} vector requires defining a contour position for every image row. In the lower part of the image, where the muscle is no longer visible (see Figure~\ref{fig:pipeline}), \citet{huemmer_improved_2024} set the target \ac{CI} to the image boundary. This introduces artificial flat segments, which are unproblematic for standard regression approaches that focus solely on point estimates. However, when modeling uncertainty, this leads to systematic overconfidence near the image edges. To avoid this, we pad the image by one-quarter of its width and linearly extrapolate the label vector as illustrated in Figure~\ref{fig:pipeline}. This preserves a smooth muscle shape beyond the visible image area and provides a more robust basis for uncertainty quantification. Note that label extrapolation is used only for training with heteroscedastic losses to prevent overconfidence. For the baselines, we create the target \ac{CI} vectors as described in \citet{huemmer_improved_2024}.
%, while evaluation remains restricted to the visible muscle.

%--------------------------------------------------------------------------------

\section{Experiments and results}


\textbf{Dataset and labels:} To evaluate the proposed approach, we extracted $2{,}847$ unprocessed \ac{MLO}-view mammograms from the MBTST dataset~\citep{dahlblom}. Segmentation labels were provided by clinical experts as binary masks. To obtain contour labels, we converted the segmentation masks into a \ac{CI} target vector as follows: for each image row, the corresponding entry of the \ac{CI} vector was extracted as the first non-zero pixel in the respective binary mask. The dataset was split into training, validation, and test sets using a ratio of $75\%/15\%/10\%$ with a uniform distribution of breast densities. This dataset split was kept consistent across all experiments.\\
\textbf{Image processing:} The images were cropped to the region of interest, resized to a resolution of $256 \times 256$, and subsequently Z-normalized.\\
\textbf{Augmentation:} For data augmentation, we employed the image processing pipeline of \citet{Eckert2024}. The pipeline processes raw mammograms using a linear workflow comprising a Neg-Log transform, background segmentation, Laplacian-pyramid-based frequency band manipulation, and window leveling. All configurable parameters are mapped to three normalized values in $[0,1]$, representing realistic imaging styles. During training, we sampled these parameters uniformly, while during inference we kept them fixed to $[1.0, 0.5, 0.5]$\footnote{The first parameter controls window leveling, where $1.0$ corresponds to the largest possible window.}. For detailed information about the image processing pipeline, refer to \citet{Eckert2024}.\\
\textbf{Training:} As the optimizer, we used AdamW with weight decay $10^{-2}$ and an initial learning rate of $10^{-5}$, which was reduced by a factor of $0.1$ when the loss plateaued for more than $10$ epochs. For all experiments, we present the average metrics across three independent training runs. Throughout the experiments, uncertainty is quantified by computing the row-wise standard deviation $\hat{\sigma}_i$ from the variance defined in Equation~\eqref{equ:meanvar} and subsequently averaging it over all rows, with the resulting \ac{MSTD} reported as the uncertainty metric.

%--------------------------------------------------------------------------------


\subsection{Heatmap regression configuration}
\label{sec:Config}

To determine an effective heteroscedastic training setup, we systematically compare different loss formulations and regularization strengths within our heatmap regression framework. Specifically, we evaluate the heteroscedastic loss functions, $\mathrm{NLL}_{\mathcal{N}}$ in Equation~\eqref{equ:loss_normal} and $\mathrm{NLL}_{\mathcal{L}}$ in Equation~\eqref{equ:loss}, against two regression baseline losses, \ac{MAE} and \ac{MSE}, for different regularization strengths $\lambda$. Since \ac{MAE} and \ac{MSE} provide no variance supervision, the fixed variance $\sigma^2_i = 10$ was empirically chosen as a target for the template distributions $\mathcal{Q}_i$ in the regularizer (Equation~\eqref{equ:reg}). For evaluation, we choose \ac{MAE} and \ac{RMSE} as standard regression metrics, while the \ac{LL} is used to assess the quality of the learned predictive distributions. All metrics are computed solely within the muscle region and summarized in Table~\ref{tab:Configuration}.

\begin{table}[ht]
\centering
\caption{Evaluation of models trained with two heteroscedastic losses ($\mathrm{NLL}_{\mathcal{L}}$, $\mathrm{NLL}_{\mathcal{N}}$) and two baseline losses ($\mathrm{MAE}$, $\mathrm{MSE}$) across three regularization strengths $\lambda$. Reported metrics are \ac{MAE}, \ac{RMSE}, and \ac{LL}, given as mean $\pm$ standard deviation over 3 runs.}
\label{tab:Configuration}
\begin{tabular}{lcccc}

\toprule
 &  & \multicolumn{3}{c}{\textbf{Metric}} \\
\cmidrule(lr){3-5}
\textbf{Loss} & \textbf{$\lambda$} 
& \textbf{\ac{MAE}} $\downarrow$ 
& \textbf{\ac{RMSE}} $\downarrow$ 
& \textbf{\ac{LL}} $\uparrow$ \\
\midrule

\multirow{3}{*}{$\mathrm{MAE}$} 
    & 0   & $1.97 \pm 0.04$ & $2.46 \pm 0.05$ & $-3.55 \pm 0.19$ \\
    & 10  & $1.94 \pm 0.02$ & $2.40 \pm 0.03$ & $-2.55 \pm 0.07$ \\
    & 100 & $1.90 \pm 0.04$ & $2.35 \pm 0.04$ & $-2.38 \pm 0.03$ \\
\midrule

\multirow{3}{*}{$\mathrm{MSE}$} 
    & 0   & $1.95 \pm 0.07$ & $2.43 \pm 0.06$ & $-3.85 \pm 0.21$ \\
    & 10  & $1.93 \pm 0.02$ & $2.41 \pm 0.04$ & $-3.43 \pm 0.36$ \\
    & 100 & $1.90 \pm 0.01$ & $2.38 \pm 0.03$ & $-2.83 \pm 0.19$ \\
\midrule

\multirow{3}{*}{$\mathrm{NLL}_{\mathcal{L}}$} 
    & 0   & $2.01 \pm 0.00$ & $2.58 \pm 0.02$ & $-2.30 \pm 0.02$ \\
    & 10  & $1.92 \pm 0.04$ & $2.42 \pm 0.05$ & $-2.28 \pm 0.02$ \\
    & 100 & $1.88 \pm 0.04$ & $2.34 \pm 0.05$ & $-2.26 \pm 0.02$ \\
\midrule

\multirow{3}{*}{$\mathrm{NLL}_{\mathcal{N}}$} 
    & 0   & $2.03 \pm 0.03$ & $2.59 \pm 0.10$ & $-2.31 \pm 0.01$ \\
    & 10  & $1.98 \pm 0.01$ & $2.51 \pm 0.01$ & $-2.32 \pm 0.08$ \\
    & 100 & $1.94 \pm 0.09$ & $2.42 \pm 0.11$ & $-2.30 \pm 0.07$ \\
\bottomrule

\end{tabular}
\end{table}

The results show that the introduction of the regularization term leads to a slight performance improvement, as both \ac{MAE} and \ac{RMSE} decrease with increasing regularization strength across all used loss functions. A similar trend is observed for the \ac{LL} values, which generally increase with stronger regularization; the only exception is $\mathrm{NLL}_{\mathcal{N}}$, for which the \ac{LL} remains essentially constant within the run-to-run standard deviation. Furthermore, both heteroscedastic loss formulations achieve \ac{MAE} and \ac{RMSE} values comparable to the non-heteroscedastic loss functions. At the same time, the heteroscedastic models show high \ac{LL} values across all regularization strengths. These findings suggest that incorporating heteroscedastic uncertainty modeling and label extrapolation does not compromise predictive performance while it allows for learning stable predictive distributions across different regularization strengths. Further, it is demonstrated that the regularization term stabilizes training notably, as evidenced by the largely consistent improvement across metrics and loss functions for higher regularization strengths. Among the evaluated configurations, the model trained with $\mathrm{NLL}_{\mathcal{L}}$ achieves the lowest errors and highest \acp{LL}, indicating that the Laplace distribution is better suited to model the underlying heteroscedastic uncertainty. It is therefore selected as the best configuration, used in all subsequent experiments, and referred to as the $\mathrm{NLL}_{\mathcal{L}}$-model. 
%--------------------------------------------------------------------------------

\subsection{Performance comparison}
\label{sec:Performance}
To verify that our approach does not compromise segmentation performance, we compare it against a pixel-wise classification baseline. For this, we choose a classical U-Net \citep{navab_u-net_2015}, trained to predict a binary segmentation mask using a binary cross-entropy loss. We assess the segmentation performance in terms of Dice across different network sizes in Figure~\ref{fig:dice}. 
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.6\linewidth]{figures/dice_vs_architecture.pdf}
    \caption{Segmentation performance (Dice) of our method (orange) against a U-Net baseline (blue) across network sizes [depth, filters], where depth is the number of U-Net stages and filters the number of first-layer filters (doubled each stage).}
    \label{fig:dice}
\end{figure}

For very small networks, the segmentation performance of our method is noticeably lower than that of the baseline. This indicates that the model capacity is insufficient to handle the increased complexity of the task, which involves modeling both the contour and the associated uncertainty, as well as extrapolating the contour into regions where the muscle is not visible. In contrast, for medium and large network configurations, performance is on par with the baseline, while the model simultaneously provides uncertainty estimates with high \acp{LL} as highlighted in Table~\ref{tab:LL_table}.
\begin{table}[h!]
\centering
\caption{\ac{LL} achieved by our method for different network configurations (\text{filters} $\backslash$ \text{depth}), reported as mean $\pm$ standard deviation over three runs.}
\begin{tabular}{ccccc}
\toprule
\ac{LL} (filters $\backslash$ depth) & \textbf{2} & \textbf{3} & \textbf{4} & \textbf{5} \\
\midrule
\textbf{3} & $-5.10 \pm 0.01$ & $-4.95 \pm 0.02$ & $-2.27 \pm 0.01$ & $-2.25 \pm 0.01$ \\
\textbf{5} & $-5.06 \pm 0.00$ & $-2.51 \pm 0.02$ & $-2.28 \pm 0.04$ & $-2.26 \pm 0.02$ \\
\bottomrule
\end{tabular}
\label{tab:LL_table}
\end{table}

%--------------------------------------------------------------------------------
\subsection{Uncertainty quantification}
\label{sec:Uncertainty}

In this section, we examine the method's ability to quantify uncertainty. We evaluate the model's response to (1) inherent ambiguities in the dataset, such as occlusions or confounders, and (2) artificially introduced unseen noise corrupting the input images. In practice, we assess how well the uncertainty correlates with model errors by means of mean residuals (\ac{MAE}) and the predicted \ac{MSTD}. The experiments are conducted on the $\mathrm{NLL}_{\mathcal{L}}$-model trained with $\lambda=100$ (Sec.~\ref{sec:Config}) and the results are shown in Figure~\ref{fig:correlation_plots}.  

\textbf{(1) Dataset-intrinsic uncertainty: } To analyze heteroscedastic uncertainty originating from the dataset itself, we employ a similar strategy to \citet{kumar_luvli_2020}: We perform the model inference on the test set and collect all row-wise residual errors and predicted heatmap standard deviations for every image. We then sort these tuples by standard deviation and group them into bins of size $N_{\text{bin}} = 50$. Within each bin, we compute the average residual error (\ac{MAE}) and the \ac{MSTD}; each bin therefore corresponds to a single point in Figure~\ref{fig:inherent_noise}.  

\textbf{(2) Uncertainty under unseen noise: } To evaluate whether the model's uncertainty estimates generalize to unseen input noise, we add noise to the unprocessed input images during inference. The noise is modeled by artificially reducing the photon counts and adding X-ray-typical Poisson noise to the input images using the procedure described by \citet{eckert_deep_2019}\footnote{Only the Anscombe transformation is omitted as it is not required for noise simulation.}. To this end, we linearly reduce the effective dose from $100\%$ to $25\%$ in $50$ steps while producing $3$ realizations with different seeds for each dose level. Finally, the model generates predictions for all noisy image realizations and the results are aggregated over all images. Hence, each dot in Figure~\ref{fig:unseen_noise} represents \ac{MAE} and \ac{MSTD}, aggregated over the whole test set at the respective noise level.
\begin{figure}[ht]
    \centering
    \subfigure[Dataset-intrinsic uncertainty.]{
        \includegraphics[height=4.5cm]{figures/uncertainty_vs_error_scatter_inner_eval-refined.pdf}
         \label{fig:inherent_noise}
    }
    \hspace{0.5cm} % small fixed space between subfigures
    \subfigure[Uncertainty under unseen noise.]{%
        \includegraphics[height=4.5cm]{figures/uncertainty_vs_error_scatter_merged_251205-002354.pdf}
        \label{fig:unseen_noise}
    }
    \caption{Correlation between \ac{MAE} and \ac{MSTD} for inherent noise in the dataset (\ref{fig:inherent_noise}) and unseen noise distorting the input (\ref{fig:unseen_noise}) for the $\mathrm{NLL}_{\mathcal{L}}$-model ($\lambda=100$).}
    \label{fig:correlation_plots}
\end{figure}

For the uncertainty within the test set in Figure~\ref{fig:inherent_noise}, the results show a strong linear correlation between the \ac{MAE} and \ac{MSTD}, with the \ac{MSTD} consistently matching or slightly exceeding the \ac{MAE}. A similar behavior can be observed for the model's response to previously unseen noise in Figure~\ref{fig:unseen_noise}. Artificially reducing the dose leads to a continuous increase in model error to which the model responds with a steady increase in the \ac{MSTD} of the predicted heatmap. These results indicate that the standard deviations computed from the model's predictive distributions are highly predictive of model error for both inherent heteroscedastic noise within the dataset and previously unseen noise. It should be noted that this finding holds as an aggregated behavior, i.e., when averaging over bins of $N_{\text{bin}}$ rows for Figure~\ref{fig:inherent_noise} or over the whole test set for Figure~\ref{fig:unseen_noise}.

\begin{figure}[ht]
    \centering
    \includegraphics[width=0.6\linewidth]{figures/uncertainty_panel_234.pdf}
    \caption{Predictions of the $\mathrm{NLL}_{\mathcal{L}}$-model ($\lambda=100$) for an example case from the test set across different noise levels. The row-wise mean $\hat{\mu}_i$ is depicted as a thick red line and the corresponding standard deviation $\hat{\sigma}_i$ as a light red area around the mean. }
    \label{fig:noisy_samples}
    %Standard_OV2_MBTST_002479_R_MLO_Processing_0003_61438 --> 251205-002354-1
\end{figure}

To complement the quantitative analysis, we examine the model's behavior at the image level using an exemplary image under varying noise levels shown in Figure~\ref{fig:noisy_samples}. Down to $50\%$ of the original dose, the model produces accurate predictions with low uncertainty along the entire contour. When further reducing the dose to $35\%$, the model's uncertainty starts to increase locally in the lower part where the muscle is originally slightly occluded, while still producing a stable contour prediction. At $25\%$ dose, the contour prediction degrades notably in the lower part of the muscle, but at the same time the model increases its uncertainty significantly, indicating a prediction failure. The observations suggest that the model is able to adjust uncertainty locally in areas of reduced visibility, while maintaining stable and confident predictions in regions where the muscle remains clearly visible.

\subsection{Detecting multimodality in the predictive distributions}
This section evaluates the method's capability to reveal multimodal patterns in the learned distributions. Further, we evaluate how the regularization term affects the number of detected cases that exhibit multimodal distributions in the predicted heatmaps. To this end, we compare the proportion of detected cases that exhibit multimodal distributions across different regularization strengths for the $\mathrm{NLL}_{\mathcal{L}}$-model and evaluate the performance in terms of \ac{MAE} and \ac{LL} within the classified subgroups of uni- and multimodal cases. The results are summarized in Table~\ref{tab:nll_lambda_modality}. 
\begin{table}[ht]
    \centering
    \caption{Subgroup analysis of the $\mathrm{NLL}_{\mathcal{L}}$-model based on predicted heatmaps: unimodal vs.\ multimodal. \ac{MAE} and \ac{LL} are shown for each subgroup across different $\lambda$, with proportions of each class. Metrics are mean $\pm$ standard deviation over 3 runs.}
    \label{tab:nll_lambda_modality}
    \begin{tabular}{llccc}
        \toprule
        \textbf{Metric} & \textbf{Modality} & $\lambda=0.0$ & $\lambda=10.0$ & $\lambda=100.0$ \\
        \midrule
        \multirow{2}{*}{\ac{MAE} $\downarrow$}
        & Multimodal & $3.49 \pm 0.11$ & $3.82 \pm 0.03$ & $6.08 \pm 1.32$ \\
        & Unimodal   & $1.72 \pm 0.05$ & $1.78 \pm 0.05$ & $1.85 \pm 0.03$ \\
        \multirow{2}{*}{\ac{LL} $\uparrow$}
        & Multimodal & $-2.77 \pm 0.06$ & $-2.84 \pm 0.03$ & $-3.57 \pm 0.66$ \\
        & Unimodal   & $-2.21 \pm 0.03$ & $-2.24 \pm 0.03$ & $-2.25 \pm 0.02$ \\
        \midrule
        \multirow{2}{*}{\textbf{Proportion [\%]}}
        & Multimodal & $16.15$ & $7.05$ & $0.91$ \\
        & Unimodal   & $83.85$ & $92.95$ & $99.09$ \\
        \bottomrule
    \end{tabular}
\end{table}

The results show that for stronger regularization, more cases are classified as unimodal. At the same time, \ac{MAE} and \ac{LL} worsen notably within the multimodal class while only marginally deteriorating for the unimodal class. This suggests that the regularization term substantially stabilizes the predicted heatmaps towards unimodal distributions, forcing the model to suppress smaller confounding structures. As a result, only large confounders remain, which are responsible for the larger prediction errors. To further support this, we analyze two qualitative examples of high uncertainty from both classes for $\lambda=0$ and $\lambda=100$, depicted in Figure~\ref{fig:sample_images}.
\begin{figure}[ht]
    \centering
    \subfigure[Unimodal case: muscle occluded by breast tissue.]{%
        \includegraphics[width=0.48\textwidth]{figures/sample1_comparison.pdf}%
        \label{fig:occlusion}
    }
    %\hspace{0.5cm}
    \subfigure[Multimodal case: confounding structures.]{%
        \includegraphics[width=0.48\textwidth]{figures/sample2_comparison.pdf}%
        \label{fig:confounding}
    }
\caption{Qualitative examples illustrating (a) unimodal and (b) multimodal predictive distributions for the $\mathrm{NLL}_{\mathcal{L}}$-model for two regularization strengths $\lambda$.}
    \label{fig:sample_images}
\end{figure}
For the unimodal case in Figure~\ref{fig:occlusion}, it is observed that the muscle shows a clear edge in the upper part while being occluded by breast tissue in the lower part of the muscle, to which both models react with increased uncertainty. For the multimodal case in Figure~\ref{fig:confounding}, there are multiple confounding anatomical structures in the image, leading to multimodal predictive distributions in the heatmaps. This aligns with our initial assumption that in the confounding case the underlying estimation problem becomes inherently multimodal, whereas in the occluded scenario, the potential contour location can be adequately represented by a unimodal distribution. The results indicate that the model can capture multimodal patterns in the distributions, with the sensitivity adjustable via the regularization term. Finally, the results show that the regularizer notably stabilizes the heatmaps towards smooth unimodal distributions for larger regularization strengths, especially for the confounding structure in Figure~\ref{fig:confounding}. 

\subsection{Limitations}
In this section, we discuss the limitations regarding the evaluation of our method. First, all results are averaged over three independent model runs, chosen due to resource constraints. While this allows certain insights into the stability of the results with respect to different model initializations, three runs are not sufficient to compute valid statistics. However, we report the mean and standard deviation across runs to illustrate the consistency of the observed trends. 

Further, in Section~\ref{sec:Uncertainty}, the uncertainty estimates are aggregated over bins of size $N_{\text{bin}} = 50$ rows and over the entire dataset, respectively. While this aggregation captures global uncertainty trends, it does not allow conclusions about the model's local or spatially resolved uncertainty behavior, which is only assessed qualitatively in this study.

Finally, the study focuses on one representative model architecture and one dataset. While this allows a focused evaluation, it may limit the generalizability of the findings across different architectures and datasets. 


\section{Conclusion and Outlook}

In this work, we present a novel method for modeling input-dependent uncertainty in \ac{PM} segmentation using a heatmap-based heteroscedastic regression framework. We show that robust mean and variance estimates can be derived from learned probabilistic heatmaps to jointly model the \ac{PM} boundary and the associated predictive uncertainty. Furthermore, by representing uncertainty directly in probabilistic heatmaps, the method provides richer information than approaches that output mean and variance as isolated numerical values, as it allows detecting inherent multimodality in the predictive distributions and controlling this behavior through a dedicated regularizer. At the same time, we show that our method does not compromise segmentation performance as we achieve on-par performance with a binary segmentation baseline. Last, we show that the model's uncertainty estimates correlate with model error in a global trend and demonstrate that the model reacts appropriately to previously unseen noise, increasing its predicted uncertainty when reduced visibility of the \ac{PM} leads to erroneous predictions.

Although the current framework mainly models unimodal distributions, it establishes a strong foundation for future research on extending the approach to explicitly model multimodal predictive distributions. Moreover, a systematic evaluation against multi-reader annotations would provide valuable insights into the method's ability to capture localized uncertainty. Such an analysis, combined with validation on larger and more diverse datasets, would be essential to ensure robustness in clinical practice.

\noindent\textbf{Disclaimer: } The methods presented in this paper are not commercially available and their future availability cannot be guaranteed.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{We thank a bunch of people.}


\bibliography{midl26_396}


\end{document}
