\section{Results}\label{sec:results}

\begin{table}[t]
\centering
\caption{Quantitative performance comparison on PanNuke and Ki-67.
Per-class F1 scores are reported for dataset-specific semantic classes, together with instance-level Detection F1 (Det.) and segmentation Dice.}
\rowcolors{2}{gray!15}{white}
\resizebox{\linewidth}{!}{%
\begin{tabular}{lcccccc|c}
\hline
\rowcolor{gray!30}
\textbf{Model} &
\multicolumn{6}{c|}{\textbf{Classification and Detection} $\uparrow$} &
\multicolumn{1}{c}{\textbf{Segmentation} $\uparrow$} \\

\hline
\rowcolor{gray!30}
\multicolumn{8}{c}{\textbf{PanNuke}} \\
\hline
\rowcolor{gray!30}
&
\textbf{Neo.} &
\textbf{Epi.} &
\textbf{Inflam.} &
\textbf{Conn.} &
\textbf{Dead} &
\textbf{Det.} &
\textbf{Dice} \\
\hline
 Ours &
 $0.667_{\pm 0.007}$ &
 $0.675_{\pm 0.002}$ &
 $0.557_{\pm 0.016}$ &
 $0.494_{\pm 0.007}$ &
 $0.001_{\pm 0.002}$ &
 $0.799_{\pm 0.002}$ &
 $0.753_{\pm 0.005}$ \\
\hline
Ours w &
$0.663_{\pm 0.010}$ &
$0.649_{\pm 0.030}$ &
$0.559_{\pm 0.002}$ &
$0.482_{\pm 0.009}$ &
$0.144_{\pm 0.031}$ &
\textbf{0.812}$_{\pm 0.003}$ &
$0.761_{\pm 0.007}$ \\
\hline
Base &
$0.666_{\pm 0.014}$ &
$0.680_{\pm 0.004}$ &
$0.575_{\pm 0.011}$ &
$0.521_{\pm 0.006}$ &
$0.243_{\pm 0.172}$ &
$0.798_{\pm 0.002}$ &
$0.755_{\pm 0.007}$ \\
\hline
DE &
\textbf{0.687}$_{\pm 0.014}$ &
\textbf{0.705}$_{\pm 0.011}$ &
\textbf{0.594}$_{\pm 0.009}$ &
\textbf{0.542}$_{\pm 0.002}$ &
\textbf{0.382}$_{\pm 0.051}$ &
$0.809_{\pm 0.003}$ &
\textbf{0.766}$_{\pm 0.008}$ \\
\hline
MCD &
$0.525_{\pm 0.025}$ &
$0.328_{\pm 0.048}$ &
$0.472_{\pm 0.015}$ &
$0.429_{\pm 0.007}$ &
$0.024_{\pm 0.021}$ &
$0.768_{\pm 0.009}$ &
$0.738_{\pm 0.004}$ \\
\hline
\rowcolor{gray!30}
\multicolumn{8}{c}{\textbf{Ki-67}} \\

\hline
\rowcolor{gray!30}
&
\textbf{Pos.} &
\textbf{Neg.} &
\textbf{Stroma} &
\multicolumn{2}{c}{} &
\textbf{Det.} &
\textbf{Dice} \\
\hline
Ours &
$0.544_{\pm 0.151}$ &
$0.655_{\pm 0.105}$ &
$0.432_{\pm 0.053}$ &
\multicolumn{2}{c}{} &
$0.809_{\pm 0.042}$ &
$0.838_{\pm 0.031}$ \\
\hline
Ours w &
\textbf{0.598}$_{\pm 0.136}$ &  % Positive
$0.683_{\pm 0.069}$ &  % Negative
$0.461_{\pm 0.074}$ &  % Stroma
\multicolumn{2}{c}{} &
$0.819_{\pm 0.042}$ &  % Det. F1
$0.827_{\pm 0.038}$ \\ % Dice
\hline
Base &
$0.531_{\pm 0.186}$ &
$0.683_{\pm 0.076}$ &
$0.437_{\pm 0.065}$ &
\multicolumn{2}{c}{} &
$0.809_{\pm 0.035}$ &
$0.825_{\pm 0.045}$ \\
\hline
DE &
$0.574_{\pm 0.162}$ &
\textbf{0.688}$_{\pm 0.103}$ &
\textbf{0.476}$_{\pm 0.070}$ &
\multicolumn{2}{c}{} &
\textbf{0.822}$_{\pm 0.038}$ &
\textbf{0.845}$_{\pm 0.032}$ \\
\hline
MCD &
$0.553_{\pm 0.067}$ &
$0.626_{\pm 0.059}$ &
$0.390_{\pm 0.075}$ &
\multicolumn{2}{c}{} &
$0.797_{\pm 0.045}$ &
$0.804_{\pm 0.043}$ \\
\hline
\end{tabular}}
\label{tab:performance}
\end{table}




\paragraph{Experiments and implementation details.}
We follow the PanNuke three-fold cross-validation protocol and use leave-one-patient-out cross-validation for Ki-67. Following the original DualU-Net training scheme, all models use a ResNeXt-50 32$\times$4d~\cite{xie2017aggregated} encoder, and Gaussian centroid maps are generated using a fixed standard deviation $\sigma = 5$.
Starting from this baseline, we apply two minor modifications: (i) Gaussian centroid maps are scaled by a factor of 100 to improve numerical stability~\cite{xie2018microscopy}; and (ii) training is performed for 200 epochs with constant learning rates ($2\times10^{-4}$ for PanNuke and $1\times10^{-4}$ for Ki-67) and batch sizes of 64 and 8, respectively.
For centroid uncertainty, we use fixed weights $\lambda_{\mathrm{mass}}=0.6$ and $\lambda_{\mathrm{peak}}=0.3$ to form the combined score $u_{\mathrm{cent}}$.
We include three segmentation-uncertainty baselines: (i) the original DualU-Net using Shannon entropy of the softmax output, (ii) Monte Carlo Dropout (MCD), implemented by applying spatial dropout ($p=0.1$) after the last two blocks of the segmentation decoder and computing uncertainty over $T=30$ stochastic forward passes, and (iii) a ten-model deep ensemble (DE) using the entropy of the ensemble-averaged predictions. We focus on MCD and DE as uncertainty baselines that can be integrated into the DualU-Net architecture with minimal structural changes.
We consider two evidential variants: \emph{Ours} with unweighted Dice and loss weights $\lambda_{\mathrm{seg}}=1$, $\lambda_{\mathrm{dice}}=0.4$, $\lambda_{\mathrm{cent}}=0.7$, $\lambda_{\mathrm{kl}}=0.4$, and \emph{Ours w} with class-weighted Dice and $\lambda_{\mathrm{kl}}=0.2$, both using a 40-epoch warm-up for $\lambda_{\mathrm{kl}}$. All hyperparameters have been selected on PanNuke validation folds and reused on Ki-67 without further tuning.


\paragraph{Performance evaluation.}
Performance results are reported in Table~\ref{tab:performance}. Using paired two-sided $t$-tests across folds, we observe no statistically significant differences between our evidential approaches (\emph{Ours}, \emph{Ours w}) and the three considered baselines (Base, DE, and MCD) for any of the primary metrics, including Detection F1, Dice, and per-class F1 scores ($p > 0.05$ in all cases). A significant difference appears only for the rare \emph{Dead} (necrotic) class in PanNuke, where \emph{Ours w} achieves higher performance than \emph{Ours} ($p=0.015$). No such exception is observed on Ki-67, where no statistically significant differences are found for any metric or method pair.









\paragraph{Evaluation metrics.}
We evaluate uncertainty quality using Adaptive Calibration Error (ACE)~\cite{nixon19} and its maximum (MCE), as well as the Adaptive Uncertainty Calibration Error (A-UCE) and its maximum (M-UCE) using quantile-based binning~\cite{corr1019Laves}. Error--uncertainty separability is quantified using the Kolmogorov--Smirnov (KS) statistic~\cite{uncertainyerror2025Tan} and AUROC, computed between continuous uncertainty values and binary correctness indicators. Calibration metrics (ACE, MCE, A-UCE, M-UCE) are reported only for the segmentation head, whose evidential formulation yields probabilistic class predictions. For the centroid head, uncertainty derives from geometric cues rather than calibrated probabilities; accordingly, only KS and AUROC are evaluated, as these measure how well uncertainty ranks correct versus incorrect detections.

\begin{table}[t]
\centering
\caption{Quantitative uncertainty evaluation on PanNuke and Ki-67.
\emph{Left:} segmentation-head uncertainty and calibration results (EDL head) compared with Deep Ensembles (DE), Monte Carlo Dropout (MCD) and the deterministic DualU-Net baseline.  
\emph{Right:} centroid-head uncertainty results. Complete centroid histograms and eCDF plots are provided in Appendix~\ref{ap:centroids}.}

\begin{minipage}[t]{0.65\textwidth}
\centering

\rowcolors{2}{gray!15}{white}
\resizebox{\linewidth}{!}{%
\begin{tabular}{llcccccc}
\hline
\rowcolor{gray!30}
\textbf{M} & \textbf{UM} & \textbf{ACE $\downarrow$} & \textbf{MCE $\downarrow$} &
\textbf{A-UCE $\downarrow$} & \textbf{M-UCE $\downarrow$} &
\textbf{KS $\uparrow$} & \textbf{AUROC $\uparrow$} \\
\hline
\rowcolor{gray!30}
\multicolumn{8}{c}{\textbf{PanNuke}}\\
\hline
\textit{Ours} & $u_{\text{ale}}$
    & \textbf{0.061}$_{\pm0.004}$
    & 0.289$_{\pm0.010}$
    & 0.157$_{\pm0.010}$
    & 0.326$_{\pm0.025}$
    & 0.392$_{\pm0.003}$
    & 0.759$_{\pm0.003}$ \\
& $u_{\text{epi}}$
    & \textbf{0.061}$_{\pm0.004}$
    & 0.289$_{\pm0.010}$
    & 0.100$_{\pm0.004}$
    & 0.251$_{\pm0.017}$
    & 0.392$_{\pm0.003}$
    & 0.759$_{\pm0.003}$ \\
& $u_{\text{vac}}$
    & \textbf{0.061}$_{\pm0.004}$
    & 0.289$_{\pm0.010}$
    & 0.054$_{\pm0.004}$
    & 0.246$_{\pm0.016}$
    & 0.391$_{\pm0.003}$
    & 0.758$_{\pm0.003}$ \\
\hline
\textit{Ours w} & $u_{\text{ale}}$ 
    & 0.095$_{\pm0.003}$ 
    & 0.383$_{\pm0.005}$ 
    & 0.175$_{\pm0.005}$ 
    & 0.382$_{\pm0.008}$ 
    & 0.442$_{\pm0.005}$ 
    & 0.791$_{\pm0.002}$ \\
& $u_{\text{epi}}$ 
    & 0.095$_{\pm0.003}$ 
    & 0.383$_{\pm0.005}$ 
    & 0.113$_{\pm0.002}$ 
    & 0.333$_{\pm0.002}$ 
    & \textbf{0.442}$_{\pm0.005}$ 
    & \textbf{0.796}$_{\pm0.003}$ \\
& $u_{\text{vac}}$ 
    & 0.095$_{\pm0.003}$ 
    & 0.383$_{\pm0.005}$ 
    & 0.080$_{\pm0.003}$ 
    & 0.321$_{\pm0.003}$ 
    & 0.441$_{\pm0.005}$ 
    & 0.796$_{\pm0.003}$ \\
\hline
Base & $u_s$      
    & 0.234$_{\pm0.004}$ & 0.417$_{\pm0.027}$ & 0.198$_{\pm0.004}$ &
      0.353$_{\pm0.027}$ & 0.287$_{\pm0.016}$  & 0.692$_{\pm0.010}$ \\
\hline
DE         
    & & 0.131$_{\pm0.001}$ & 0.220$_{\pm0.019}$ & 0.085$_{\pm0.001}$ &
      0.159$_{\pm0.013}$ & 0.344$_{\pm0.006}$ & 0.721$_{\pm0.003}$\\
\hline
MCD &
    & 0.136$_{\pm0.014}$ & \textbf{0.194}$_{\pm0.027}$ &
      \textbf{0.051}$_{\pm0.017}$ & \textbf{0.088}$_{\pm0.025}$ &
      0.144$_{\pm0.071}$ & 0.602$_{\pm0.047}$ \\
\hline
\hline
\rowcolor{gray!30}
\multicolumn{8}{c}{\textbf{Ki-67}}\\
\hline
\textit{Ours} & $u_{\text{ale}}$
    & \textbf{0.106}$_{\pm0.048}$ 
    & \textbf{0.161}$_{\pm0.040}$ 
    & 0.217$_{\pm0.080}$ 
    & 0.287$_{\pm0.069}$ 
    & 0.452$_{\pm0.147}$ 
    & 0.786$_{\pm0.088}$ \\
& $u_{\text{epi}}$ 
    & \textbf{0.106}$_{\pm0.048}$ 
    & \textbf{0.161}$_{\pm0.040}$ 
    & \textbf{0.096}$_{\pm0.046}$ 
    & \textbf{0.173}$_{\pm0.068}$ 
    & 0.450$_{\pm0.146}$ 
    & 0.787$_{\pm0.088}$ \\
& $u_{\text{vac}}$ 
    & \textbf{0.106}$_{\pm0.048}$ 
    & \textbf{0.161}$_{\pm0.040}$ 
    & 0.111$_{\pm0.036}$ 
    & 0.175$_{\pm0.027}$ 
    & 0.446$_{\pm0.148}$ 
    & 0.786$_{\pm0.089}$ \\
\hline
\textit{Ours w} & $u_{\text{ale}}$ 
    & 0.132$_{\pm0.053}$ 
    & 0.220$_{\pm0.056}$ 
    & 0.201$_{\pm0.086}$ 
    & 0.258$_{\pm0.078}$ 
    & 0.470$_{\pm0.156}$ 
    & 0.796$_{\pm0.090}$ \\
& $u_{\text{epi}}$ 
    & 0.132$_{\pm0.053}$ 
    & 0.220$_{\pm0.056}$ 
    & 0.122$_{\pm0.095}$ 
    & 0.222$_{\pm0.123}$ 
    & \textbf{0.471}$_{\pm0.157}$ 
    & \textbf{0.796}$_{\pm0.090}$ \\
& $u_{\text{vac}}$ 
    & 0.132$_{\pm0.053}$ 
    & 0.220$_{\pm0.056}$ 
    & 0.131$_{\pm0.059}$ 
    & 0.195$_{\pm0.086}$ 
    & 0.471$_{\pm0.159}$ 
    & 0.796$_{\pm0.090}$ \\
\hline
Base & $u_s$
    & 0.286$_{\pm0.153}$ & 0.430$_{\pm0.120}$ & 0.207$_{\pm0.144}$ &
      0.226$_{\pm0.119}$ & 0.252$_{\pm0.113}$ & 0.663$_{\pm0.071}$ \\
\hline
DE &
    & 0.159$_{\pm0.120}$ & 0.283$_{\pm0.179}$ &
      0.112$_{\pm0.065}$ & 0.252$_{\pm0.108}$ &
      0.311$_{\pm0.126}$ & 0.690$_{\pm0.076}$ \\
\hline
MCD &
    & 0.203$_{\pm0.141}$ & 0.325$_{\pm0.179}$ &
      0.121$_{\pm0.076}$ & 0.214$_{\pm0.137}$ &
      0.226$_{\pm0.135}$ & 0.633$_{\pm0.135}$ \\
\hline


\end{tabular}}
\end{minipage}
\hfil
\begin{minipage}[t]{0.30\textwidth}
\centering
\rowcolors{2}{gray!15}{white}
\resizebox{\linewidth}{!}{%
\begin{tabular}{llcc}
\hline
\rowcolor{gray!30}
\textbf{M} & \textbf{UM} & \textbf{KS $\uparrow$} & \textbf{AUROC $\uparrow$}  \\
\hline
\rowcolor{gray!30}
\multicolumn{4}{c}{\textbf{PanNuke}}\\
\hline
\textit{Ours} & $u_{\text{cent}}$
    & 0.429$_{\pm0.006}$ & 0.782$_{\pm0.003}$ \\
& $u_{\text{mass}}$
    & 0.410$_{\pm0.005}$ & 0.767$_{\pm0.003}$ \\
& $u_{\text{peak}}$
    & 0.338$_{\pm0.025}$ & 0.712$_{\pm0.016}$ \\
\hline
\textit{Ours w} & $u_{\text{cent}}$
    & \textbf{0.461}$_{\pm0.010}$ & \textbf{0.801}$_{\pm0.009}$ \\
& $u_{\text{mass}}$
    & 0.448$_{\pm0.007}$ & 0.787$_{\pm0.009}$ \\
& $u_{\text{peak}}$
    & 0.361$_{\pm0.015}$ & 0.723$_{\pm0.008}$ \\
\hline\hline
\rowcolor{gray!30}
\multicolumn{4}{c}{\textbf{Ki-67}}\\
\hline
\textit{Ours} & $u_{\text{cent}}$
    & 0.591$_{\pm0.113}$ & 0.862$_{\pm0.058}$ \\
& $u_{\text{mass}}$
    & 0.575$_{\pm0.121}$ & 0.851$_{\pm0.063}$ \\
& $u_{\text{peak}}$
    & 0.520$_{\pm0.096}$ & 0.823$_{\pm0.052}$ \\
\hline
\textit{Ours w} & $u_{\text{cent}}$
    & \textbf{0.612}$_{\pm0.092}$ & \textbf{0.875}$_{\pm0.047}$ \\
& $u_{\text{mass}}$
    & 0.596$_{\pm0.099}$ & 0.863$_{\pm0.056}$ \\
& $u_{\text{peak}}$
    & 0.543$_{\pm0.058}$ & 0.843$_{\pm0.033}$ \\
\hline
\end{tabular}}
\end{minipage}

\label{tab:segmentation_and_centroid_uncertainty}
\end{table}

\paragraph{Segmentation uncertainty.}
Table~\ref{tab:segmentation_and_centroid_uncertainty} (left) summarizes calibration and error-separation metrics for segmentation-head uncertainties. Across both datasets, the evidential formulation (\textit{Ours} and \textit{Ours w}) consistently improves the separation between correct and incorrect predictions. On PanNuke, all evidential uncertainties achieve substantially higher KS and AUROC than the deterministic baseline, with improvements that are highly statistically significant ($p<10^{-6}$). Compared to MC Dropout, both evidential variants attain significantly higher KS and AUROC ($p<0.05$), while differences with Deep Ensembles are not statistically significant ($p>0.05$). The three evidential uncertainty measures behave similarly, with no statistically significant differences between them ($p>0.1$). Distribution histograms and eCDF plots further confirm a clearer separation for evidential measures compared to all baselines (Figure~\ref{fig:segmentation_results}). The weighted variant (\textit{Ours w}) yields a statistically significant improvement over the unweighted model on PanNuke ($p<0.05$). On Ki-67, both evidential variants outperform the deterministic baseline, Deep Ensembles, and MC Dropout in terms of KS and AUROC. Improvements over the baseline are statistically significant ($p<10^{-4}$), while gains over Deep Ensembles are consistent but not statistically significant ($p>0.1$). Compared to MC Dropout, \textit{Ours w} achieves a statistically significant improvement in AUROC ($p<0.05$), whereas KS differences do not reach significance ($p>0.1$); differences between \textit{Ours} and MC Dropout are not statistically significant for either metric ($p>0.1$). As on PanNuke, the three evidential uncertainties remain statistically indistinguishable ($p>0.1$), and the weighted variant shows a consistent but not statistically significant improvement over the unweighted model.


\begin{figure}[t]
\centering
\setlength{\tabcolsep}{2pt}  % spacing between images
\begin{tabular}{ccccc}

% --- Row 1 ---
\includegraphics[width=0.19\columnwidth]{figures/plots/seg_ins_pannuke/hist_edl_ale.png} &
\includegraphics[width=0.19\columnwidth]{figures/plots/seg_ins_w_pannuke/hist_edl_epi.png} &
\includegraphics[width=0.19\columnwidth]{figures/plots/base/resnext_pannuke_31_seed_42_base_g100_no_norm_epoch_200_hist_seg_entropy.jpg} &
\includegraphics[width=0.19\columnwidth]{figures/plots/de/hist.jpg} &
\includegraphics[width=0.19\columnwidth]{figures/plots/mcd/hist.jpg} \\


% --- Row 2 ---
\includegraphics[width=0.19\columnwidth]{figures/plots/seg_ins_pannuke/ecdf_edl_ale.png} &
\includegraphics[width=0.19\columnwidth]{figures/plots/seg_ins_w_pannuke/ecdf_edl_epi.png} &
\includegraphics[width=0.19\columnwidth]{figures/plots/base/resnext_pannuke_31_seed_42_base_g100_no_norm_epoch_200_ecdf_seg_entropy.jpg} &
\includegraphics[width=0.19\columnwidth]{figures/plots/de/ecdf.jpg} &
\includegraphics[width=0.19\columnwidth]{figures/plots/mcd/ecdf.jpg} \\


\end{tabular}

\caption{Segmentation-head uncertainty histograms (top) and eCDFs (bottom). Errors in red, correct instances in blue. Columns: \textit{Ours}, \textit{Ours w}, Base, DE, MCD. For evidential models we plot the best separator (\(u_{\mathrm{ale}}\) for \textit{Ours}, \(u_{\mathrm{epi}}\) for \textit{Ours w}). See additional histogram and eCDF analyses in Appendix~\ref{ap:plots_seg}.
}


\label{fig:segmentation_results}
\end{figure}


\paragraph{Segmentation calibration.}
Across both datasets, our evidential variants show significantly improved calibration compared to the deterministic baseline, with all gains confirmed by statistical testing ($p<10^{-4}$). Their calibration is statistically indistinguishable from Deep Ensembles and MC Dropout ($p>0.1$), indicating ensemble-level performance. For MCE, both evidential variants significantly outperform the baseline, with stronger evidence for \textit{Ours} ($p<10^{-3}$) and a smaller but still significant effect for \textit{Ours w} ($p<0.05$), while Deep Ensembles remain the best-performing method. Compared to MC Dropout, the evidential variants exhibit higher miscalibration on PanNuke, with MC Dropout achieving significantly lower MCE and UCE-style errors ($p<0.05$). On Ki-67, higher variance prevents statistically significant differences between methods ($p>0.15$); nevertheless, the evidential models remain at least as well calibrated as Deep Ensembles, outperform the deterministic baseline, and achieve stronger calibration than MC Dropout in terms of ACE and MCE across folds ($p<0.05$).

\paragraph{Centroid uncertainty.}
Table~\ref{tab:segmentation_and_centroid_uncertainty} (right) reports KS and AUROC for the centroid-head uncertainties (\(u_{\text{peak}},u_{\text{mass}},u_{\text{cent}}\)). On PanNuke, the proposed geometric cues provide clear discrimination, with the combined centroid score consistently outperforming the individual components. The weighted variant (\textit{Ours w}) yields a statistically significant improvement in KS over the unweighted model (\(p<0.05\)), while differences in AUROC remain within fold-to-fold variability (\(p>0.1\)). Among the individual cues, the mass-based uncertainty is the most informative, followed by the peak-based cue, and their combination produces the strongest overall signal. On Ki-67, centroid uncertainties are even more discriminative. Both evidential variants achieve strong error separation across all centroid metrics, but no statistically significant differences are observed between \textit{Ours} and \textit{Ours w} (\(p>0.1\)). As in PanNuke, the mass-based cue remains the most informative individual component, while the combined centroid uncertainty provides the most robust and stable separation.



\paragraph{Qualitative results.}
Figure~\ref{fig:qualitative_results} illustrates qualitative examples from a representative PanNuke fold using the \textit{Ours w} configuration. Across the examples, nuclei highlighted with high segmentation-head or centroid-head uncertainty consistently correspond to meaningful failure modes: clear classification mistakes, missed or imprecise detections, or instances that, despite being labeled as correct, exhibit ambiguous morphology or borderline staining and could warrant ground-truth revision. 









\begin{figure}[t]
\centering
\setlength{\tabcolsep}{2pt}  
\begin{tabular}{ccccc}
\includegraphics[width=0.19\columnwidth]{figures/plots/seg_ins_pannuke/ece_adaptive.png} &
\includegraphics[width=0.19\columnwidth]{figures/plots/seg_ins_w_pannuke/ece_adaptive.png} &
\includegraphics[width=0.19\columnwidth]{figures/plots/base/resnext_pannuke_31_seed_42_base_g100_no_norm_epoch_200_reliability_instance_adaptive.jpg} &
\includegraphics[width=0.19\columnwidth]{figures/plots/de/reliability_instance_adaptive.jpg} &
\includegraphics[width=0.19\columnwidth]{figures/plots/mcd/reliability_instance_adaptive.jpg} \\

\end{tabular}
\caption{ACE plots for the segmentation head. 
Left to right: \textit{Ours}, \textit{Ours w}, Base, DE, MCD.
}

\label{fig:segmentation_calibration}
\end{figure}




\begin{figure}
    \centering
    \includegraphics[width=0.9\linewidth]{figures/plots/exemples_all_space.jpg}
    \caption{
        Qualitative uncertainty examples on PanNuke using the \textit{Ours w} configuration.
        We show the original patch, ground-truth labels, predictions, and the three uncertainty measures for each head ($u_{\mathrm{epi}}$, $u_{\mathrm{ale}}$, $u_{\mathrm{vac}}$ for segmentation and $u_{\mathrm{cent}}$, $u_{\mathrm{mass}}$, $u_{\mathrm{peak}}$ for detection).  
        For segmentation, red circles mark class-mismatch errors, blue false-positive nuclei, and green correctly predicted but ambiguous cases.  
        For centroids, red circles highlight missed or imprecise detections, and green circles indicate correct detections with residual uncertainty.  
        These examples illustrate how segmentation- and centroid-based uncertainties jointly identify unreliable instances.
        }

    \label{fig:qualitative_results}
\end{figure}


