\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
%\usepackage{hyperref}

% My packages
%% Nice tables
\usepackage{framed,multirow,array}
\usepackage{booktabs}
\usepackage{xcolor}

%% The amssymb package provides various useful mathematical symbols
\usepackage{amssymb}
\usepackage{latexsym}
\usepackage{amsmath}

%% Other packages
\usepackage{comment}
\usepackage{graphicx}

% \usepackage{mwe} % to get dummy images
% \jmlrvolume{-- Under Review}
% \jmlryear{2026}
% \jmlrworkshop{Full Paper -- MIDL 2026 submission}
% \editors{Under Review for MIDL 2026}

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

\title[IB U-Nets]{Learning Robust Medical Image Segmentation with Inductive Bias}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }
%\footnotetext[1]{Contributed equally}
% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Shrajan Bhandary\nametag{$^{1}$}}  \Email{shrajan.bhandary@tuwien.ac.at}\\
\Name{Dejan Kuhn\nametag{$^{2}$}} \Email{dejan.kostyszyn@uniklinik-freiburg.de}\\
\Name{Zahra Babaiee\nametag{$^{1}$}} \Email{zahra.babaiee@tuwien.ac.at}\\
\Name{Tobias Fechter\nametag{$^{2}$}} \Email{tobias.fechter@uniklinik-freiburg.de}\\
\Name{Anca{-}Ligia Grosu\nametag{$^{2}$}} \Email{anca.grosu@uniklinik-freiburg.de}\\
\Name{Radu Grosu\nametag{$^{1}$}} \Email{radu.grosu@tuwien.ac.at}\\
\addr $^{1}$ Cyber-Physical Systems Research Unit, Technische Universit\"at Wien, Austria \\
\addr $^{2}$ Department of Radiation Oncology, University Medical Center Freiburg, Germany
}

\begin{document}

\maketitle

\begin{abstract}
Despite the success of transformer-based and convolutional neural networks in 3D medical image segmentation, current architectures exhibit limited generalisation on small datasets and under distribution shifts, especially when high-quality examples are scarce for specific structures. We introduce IB-nnU-Nets, a family of U-Net variants augmented with inductively biased filters inspired by vertebrate visual processing. Starting from a 3D U-Net backbone, we insert two 3D residual components into the second encoder block that implement on- and off-centre-surround convolutions with fixed, pre-computed weights and act as complementary edge detectors. Across multiple organ and tumour segmentation tasks, we show that equipping state-of-the-art 3D U-Nets with an IB block improves accuracy and robustness, with the strongest gains in small-data and out-of-distribution settings. The framework and trained IB-nnU-Net models are publicly available.
\end{abstract}

\begin{keywords}
3D segmentation, inductive bias, limited data, out-of-distribution robustness
\end{keywords}


\section{Introduction}
\label{Introduction}

Manual segmentation of organs and tumours from medical images is essential for diagnosis, treatment planning, and disease monitoring~\cite{goldenberg_NATURE_2019, thno61207, 3d_survey_singh_2020, MDC_2022, nnU_Net_2020}. However, manual delineation is time-consuming and subject to substantial interobserver variability~\cite{rischke_2013, steenbergen_2015}. Deep learning offers powerful alternatives for automatic segmentation. Since AlexNet~\cite{alexnet_2012}, numerous convolutional neural network (CNN) architectures have been explored~\cite{3d_survey_singh_2020}, with U-Nets~\cite{ronneberger_u_net_2015, cicek_3d_2016} becoming the dominant choice across many benchmarks.

\begin{figure}[t]
\centering
\includegraphics[scale=0.4]{IB_encoder_block.png}
\caption{Extending U-Net variants with two 3D inductive-bias kernels (on- and off-centre-surround convolutions) in the second encoder block.}
\label{fig:oocsArchitecture}
\vspace{-6mm}
\end{figure}

Despite this progress, U-Net variants remain sensitive to imaging heterogeneity, shape variability across patients, and low tissue contrast~\cite{gillespie2020deep}. Even state-of-the-art frameworks such as nnU-Net~\cite{nnU_Net_2020} show limited generalisation when training data are scarce or when deployed on out-of-distribution acquisitions~\cite{litjens_2017, nnU_Net_2020}. Scaling up models or datasets~\cite{liu2023clip, ulrich2023multitalent} can help, but requires substantial computational resources and curated data that are often unavailable in clinical settings. Moreover, recent work has shown that architectural innovations yield diminishing returns on large datasets: properly configured U-Nets remain competitive with transformer-based alternatives under matched training budgets~\cite{nnunet_revisited_isensee}. This motivates our focus on regimes where improvements are still achievable: small-data and out-of-distribution scenarios.

In this work, we introduce a biologically inspired inductive bias into U-Net architectures without increasing trainable parameters. Our approach draws on on- and off-centre-surround receptive fields in the vertebrate retina, modelled by difference-of-Gaussians (DoG) filters. We adapt previous 2D work on such inductive biases~\cite{pmlr-v139-babaiee21a} to 3D, designing spherical kernels and integrating them into the encoder of 3D U-Nets. Concretely, we add two 3D residual components with fixed on- and off-centre-surround convolutions to the second encoder block (Figure~\ref{fig:oocsArchitecture}), encouraging feature representations that emphasise edges and local contrast. We extend several architectures---Attention U-Net~\cite{attention_unet_2018}, SegResNet~\cite{segresnet2018}, TransUNet~\cite{chen2024transunet}, and nnU-Net~\cite{nnU_Net_2020}---with these inductive-bias (IB) kernels and evaluate their performance on multiple organ and tumour segmentation tasks.

Our experiments show that IB-extended U-Nets are particularly beneficial in small-data and out-of-distribution scenarios, while maintaining strong performance on larger datasets. The 3D IB filters are modular, backbone-agnostic, and can be inserted into existing U-Net architectures without adding learnable parameters. In summary, our contributions are:
\begin{itemize}
\item We introduce a 3D inductive bias based on spherical on/off centre-surround kernels and show how to integrate it into U-Net variants for 3D medical image segmentation, analysing kernel shape and encoder placement.
\item We extend four architectures (nnU-Net, Attention U-Net, TransUNet, SegResNet) with this IB and demonstrate consistent robustness gains, with the strongest improvements in small-data and out-of-distribution settings.
\end{itemize}

\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{attention_maps.pdf}
\caption{Qualitative comparison of attention maps for nnU-Net and IB-nnU-Net. (a) Raw MR and CT images with ground-truth annotations. (b) Attention maps of nnU-Net showing spurious and missed regions. (c) Attention maps of IB-nnU-Net showing robust segmentation performances. (d) Final predictions: green and yellow contours denote nnU-Net and IB-nnU-Net outputs, respectively.}
\label{fig:attention_maps}
\vspace{-4mm}
\end{figure}

\section{Related Work}
\label{related_work}

\subsection{U-Net Variants}

U-Nets~\cite{ronneberger_u_net_2015,cicek_3d_2016} and their derivatives form the backbone of most biomedical segmentation pipelines. They employ an encoder–decoder structure with skip connections that preserve spatial information and support high-resolution prediction. Attention U-Net~\cite{attention_unet_2018} introduces attention gates that emphasise relevant structures while suppressing irrelevant regions. SegResNet~\cite{segresnet2018} replaces pooling with strided convolutions and adds residual connections. The nnU-Net framework~\cite{nnU_Net_2020} is a self-configuring pipeline that adapts to each dataset and has achieved top performance in multiple segmentation challenges.

\subsection{Medical Image Segmentation}

Public datasets such as the Medical Segmentation Decathlon (MSD)~\cite{MDC_2019}, Beyond the Cranial Vault (BTCV)~\cite{btcv_gibson_eli_2018}, and AMOS-2022~\cite{ji2022amos} have accelerated progress in medical image analysis, with nnU-Net~\cite{nnU_Net_2020} achieving top ranks on many benchmarks. Transformer-based models have recently gained traction: UNETR~\cite{hatamizadeh2022unetr}, Swin UNETR~\cite{hatamizadeh2022swin}, and TransUNet~\cite{chen2024transunet} integrate transformer modules into the architectures.

However,~\citet{nnunet_revisited_isensee} showed that many claims of transformer superiority over U-Nets do not hold under rigorous, controlled comparisons, with well-configured CNNs remaining state-of-the-art when training data and computational budgets are matched. Data efficiency and robustness therefore remain central challenges. For organ-specific segmentation,~\citet{BHANDARY2023102241} observed that robustness degrades substantially as dataset size decreases, underlining the need for methods that perform well with limited training data.

\subsection{Biologically Inspired Architectures}

Early vision models were heavily influenced by neuroscience and psychology, and biologically inspired ideas remain promising for artificial intelligence~\cite{HASSABIS2017245}. Recent work revisits neuroscience-motivated modifications of CNNs to improve robustness and interpretability~\cite{nayebiNeurips2018, Dapello2020, pmlr-v139-babaiee21a}. For example,~\citet{Dapello2020} proposed a CNN architecture aligned with primate primary visual cortex that exhibits increased robustness to adversarial perturbations. Most relevant to our work,~\citet{pmlr-v139-babaiee21a} added 2D on/off centre-surround pathways, modelled by DoG filters, to CNNs and demonstrated improved robustness across several image classification benchmarks. We build on this foundation by extending the approach to 3D, designing spherical centre-surround kernels suited to volumetric medical images and adapting the integration strategy for U-Net-style segmentation architectures.


\section{Materials and Methods}
\label{Materials_and_Methods}

\subsection{Datasets}
\label{Datasets}

We evaluate our approach on multiple public and private 3D datasets spanning different organs, modalities, and task difficulty levels.~\textit{AMOS-2022} is an abdominal CT dataset comprising 500 scans (300 training, 200 testing) with 15 organ annotations~\cite{ji2022amos}. We use the training split for 5-fold cross-validation and the spleen annotations from the held-out test set as an out-of-distribution target for models trained on MSD-spleen.~\textit{PROMISE-12} is a prostate MRI challenge dataset with 80 T2-weighted MR volumes (50 training, 30 testing) from multiple centres using heterogeneous acquisition protocols~\cite{LITJENS2014359}. The data exhibit substantial variability in voxel spacing, image quality, and prostate appearance.~\textit{MSD} refers to three organ segmentation tasks from the Medical Segmentation Decathlon~\cite{MDC_2019,MDC_2021,MDC_2022}: MSD-hippocampus (394 volumes), MSD-prostate (48 multi-modal MRI scans), and MSD-spleen (61 CT scans).~\textit{PROSTATEx} comprises 204 prostate MRI studies acquired on two scanners~\cite{prostatex_Litjens_2014}.~\textit{Prostate158} is a multi-modal MRI dataset with 158 volumes (139 training, 19 testing) including prostate and tumour annotations~\cite{keno_bressem_2022_6481141}.

For tumour segmentation, along with the tumour subsets of the MSD dataset, we use two in-house PSMA-PET cohorts with different tracers: $^{68}$Ga-PSMA-11 (68 scans: 51 training, 17 test) and $^{18}$F-piflufolastat (65 scans: 45 training, 20 test). Annotations were obtained by expert consensus using fixed SUV windows tailored to each tracer ($^{68}$Ga: 0–5; $^{18}$F: 0–10).

\subsection{U-Net Variants}
\label{Networks}

We consider Attention U-Net (IB-Att-U-Net), SegResNet (IB-SegResNet), TransUNet (IB-TransUNet), and nnU-Net (IB-nnU-Net). These architectures represent diverse design choices---attention mechanisms, residual connections, CNN–transformer hybrids, and self-configuring pipelines---and are widely used in medical image segmentation. UNETR and Swin UNETR employ transformer encoders without convolutional backbones, making our fixed convolutional IB kernels difficult to integrate in a principled manner. We therefore focus on architectures with convolutional encoder blocks, where the IB can be inserted directly.

\subsection{Inductive Biases in U-Nets}
\label{Inductive_Biases_in_U_Net_Variations}

We combine biologically motivated filters~\cite{pmlr-v139-babaiee21a} with U-Net variants to obtain more robust 3D architectures. This section describes the construction of our 3D IB kernels and how we integrate them into U-Nets.

\subsubsection{Design of the 3D IB Kernels}

Retinal receptive fields in primates can be modelled by a difference of Gaussians (DoG)~\cite{rodieck_1965}. For 2D kernels, a formulation by~\citet{kruizinga_petkov_2000,Petkov2005ModificationsOC} defines the centre and surround weights as:


\begin{equation}
\label{eq1:petrov}
DoG_{\sigma,\rho}(x,y) =  \frac{A_c}{\rho^2} e^{-\frac{x^2+y^2}{2\rho^2\sigma^2}} - A_s e^{-\frac{x^2+y^2}{2\sigma^2}}
\end{equation}
where $\rho<1$ is the ratio of the centre radius to the surround radius, $\sigma$ is the standard deviation of the surround Gaussian, and $A_c$, $A_s$ are the centre and surround coefficients.

Naively extending this to 3D by stacking along the z-axis yields a cylindrical centre-surround structure, which is suboptimal for 3D medical images. Instead, we construct a spherical 3D centre-surround kernel:
\begin{equation}
\label{eq:petrov_3d}
DoG_{\sigma,\rho}(x,y,z) =  \frac{A_c}{\rho^3} e^{-\frac{x^2+y^2+z^2}{2\rho^2\sigma^2}} - A_s e^{-\frac{x^2+y^2+z^2}{2\sigma^2}}
\end{equation}

To balance excitation and inhibition while maintaining sufficiently large kernel weights, we enforce
\begin{equation}
\int [DoG_{\sigma,\rho}(x,y,z)]^{+} dx\, dy\, dz = c, \quad
\int [DoG_{\sigma,\rho}(x,y,z)]^{-} dx\, dy\, dz = -c
\end{equation}
where $[w]^{+}=\max(0,w)$ and $[w]^{-}=\min(0,w)$. In the continuous, infinite-support case, this implies $A_c = A_s$ (see appendix). For discrete kernels, we approximate the standard deviation as:
\begin{equation}
\label{eq:sigma2}
\sigma \approx \frac{r}{\rho}\sqrt{\frac{1-\rho^2}{-6\ln{\rho}}}
\end{equation}
where $r$ is the centre radius, i.e.\ the distance from the kernel centre at which $DoG_{\sigma,\rho}$ changes sign.

\subsubsection{Extending U-Net Architectures with IB Kernels}
\label{sec:TheArchitectureOfSecondEncoderBlock}

Using Equation~\eqref{eq:petrov_3d}, we compute fixed kernel weights for the On and Off pathways; the Off kernel is the sign-inverted version of the On kernel. 
%In all experiments, we use $5\times5\times5$ IB kernels with $\rho=2/3$ and $r=2$.
For an input volume $\chi$, the on and off responses are obtained by convolving with the corresponding kernels:
\begin{equation}
\begin{array}{c@{\ }c@{\ }l}
\chi_{\mathrm{On}}[x,y,z] &=& (\chi * DoG[r,\rho, c]^{+})[x,y,z],\\[1mm]
\chi_{\mathrm{Off}}[x,y,z] &=& (\chi * DoG[r,\rho, c]^{-})[x,y,z].
\end{array}
\end{equation}

To integrate these IB kernels into the second encoder block, we split its convolutional layers into two parallel pathways, each using half of the original filters, mirroring retinal on/off pathways. We add the 3D on and off responses to the activation maps of the first convolutional layer before max-pooling, using stride-2 IB convolutions to match downsampling; adding IBs after max-pooling yields inferior performance. We then concatenate the activation maps from the two pathways, producing an output with the same shape as the original block. Any 3D U-Net variant can be extended to an IB-augmented version by replacing its second encoder block with an IB encoder block (Figure~\ref{fig:oocsArchitecture}). This modification introduces no additional trainable parameters: IB-nnU-Net has identical parameter count to nnU-Net.

\subsection{Implementation Details}

All models were trained and evaluated using nnU-Net~\cite{nnU_Net_2020}. IB hyperparameters $(k=5, r=2, \rho=2/3)$ were chosen based on the properties of the spherical 3D kernels and validated on a small development subset ($n=8$) from PROMISE-12, then fixed for all subsequent experiments. We trained all models from scratch under identical settings: the same loss function, optimiser, learning rate schedule, augmentation, and hardware. % - differing only in the presence of the IB block in the second encoder. %The same protocol was followed for all other architecture pairs (e.g., Attention U-Net vs IB-Att-U-Net). 
We used combined cross-entropy and Dice loss, stochastic gradient descent with initial learning rate $10^{-2}$, a polynomial scheduler for 1000 epochs, and L2 regularisation ($10^{-5}$). Inference used sliding-window evaluation with 1/2 overlap. We report Dice similarity coefficient (DSC), surface Dice coefficient (SDC), and 95th percentile Hausdorff distance (HD-95). The IB kernels add no trainable parameters. VRAM usage increases by at most 1\%, and training time by at most 2 seconds per epoch relative to the corresponding baseline U-Net. Statistical significance is assessed with the Wilcoxon signed-rank test; IB-nnU-Net improvements that are significant at $p \le 0.05$ are marked with an asterisk (*).



% TABLE 1
\begin{table}[t]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
Comparison Type & Model Name/Setting & DSC ($\uparrow$) & HD-95 ($\downarrow$) & SDC ($\uparrow$) \\
\hline
\multirow{4}{*}{Model variants}
 & nnU-Net                 & 0.728 & 12.592 & 0.647 \\
 & Cylindrical IB-nnU-Net (k=3)  & 0.686 & 15.626 & 0.581 \\
 & Cylindrical IB-nnU-Net (k=5)  & 0.715 & 13.256 & 0.600 \\
 & Spherical IB-nnU-Net (k=5)  & \textbf{0.742} & \textbf{11.125} & \textbf{0.670} \\
\hline
\multirow{3}{*}{IB filter placement location}
 & Symmetric Encoder–Decoder           & 0.706 & 13.830 & 0.616 \\
 & IB filters in all encoders            & 0.681 & 20.790 & 0.571 \\
 & IB filters in only Encoder 2          & \textbf{0.742} & \textbf{11.125} & \textbf{0.670} \\
\hline
\multirow{4}{*}{Different IB kernel parameters}
 & $k{=}3,\,r{=}1,\,\rho{=}1/2$ & 0.711 & 15.949 & 0.638 \\
 & $k{=}5,\,r{=}2,\,\rho{=}2/3$ & \textbf{0.742} & \textbf{11.125} & \textbf{0.670} \\
 & $k{=}7,\,r{=}3,\,\rho{=}3/4$ & 0.738 & 11.581 & 0.659 \\
 & $k{=}9,\,r{=}4,\,\rho{=}4/5$ & 0.740 & 11.362 & 0.660 \\
\hline
\end{tabular}
\caption{Ablation study on IB-nnU-Net design choices using a development subset of size 8 from PROMISE-12.}
\label{table:Various_IB_UNets}
\vspace{-4mm}
\end{table}


\section{Experiments and Results}

We evaluate IB-extended U-Nets on challenging datasets, with emphasis on small training sets, noisy acquisitions, and out-of-distribution scenarios. First, we assess performance with limited training data by constructing subsets from MSD-hippocampus, MSD-prostate, MSD-spleen, and PROMISE-12. We randomly sample subsets of size 8, 16, and 24 from each training cohort and treat the remaining training images as test sets. Due to the small size of MSD-heart, we use subsets of 8 and 16. For PROMISE-12, we reserve a subset of size 8 as a development set for IB hyperparameter selection and ablations (Section~\ref{Inductive_Biases_in_U_Net_Variations}).

% TABLE 2
\begin{table}[t]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|c|c|c|}
\hline
Size & Model &
\multicolumn{2}{c|}{MSD-Hippocampus} &
\multicolumn{2}{c|}{MSD-Prostate} &
\multicolumn{2}{c|}{MSD-Spleen} \\
\cline{3-8}
 &  & DSC$(\uparrow)$ & HD-95$(\downarrow)$ & DSC$(\uparrow)$ & HD-95$(\downarrow)$ & DSC$(\uparrow)$ & HD-95$(\downarrow)$ \\
\hline
\multirow{8}{*}{$8$} & SegResNet       & 0.580 & 43.443 & 0.555 & 81.141 & 0.601 & 61.078 \\
& IB-SegResNet     & 0.598 & 34.579 & 0.560 & 45.730 & 0.621 & 43.575 \\
& Attention U-Net  & 0.644 & 21.012 & 0.456 & 94.860 & 0.615 & 51.639 \\
& IB-Att-U-Net     & 0.652 & 20.283 & 0.657 & 15.365 & 0.630 & 33.087 \\
& TransUNet        & 0.657 & 19.500 & 0.632 & 18.500 & 0.650 & 31.000 \\
& IB-TransUNet     & 0.663 & 17.800 & 0.638 & 17.500 & 0.659 & 29.000 \\
& nnU-Net          & 0.660 & 18.360 & 0.705 & 12.500 & 0.652 & 32.377 \\
& IB-nnU-Net       & \textbf{0.670*} & \textbf{17.415*} & \textbf{0.720*} & \textbf{11.346*} & \textbf{0.665*} & \textbf{27.522*} \\
\hline
\multirow{8}{*}{$16$} & SegResNet       & 0.694 & 14.744 & 0.705 & 27.255 & 0.704 & 16.513 \\
& IB-SegResNet     & 0.709 & 13.574 & 0.754 & 13.992 & 0.715 & 14.331 \\
& Attention U-Net  & 0.743 & 11.115 & 0.682 & 32.189 & 0.728 & 12.764 \\
& IB-Att-U-Net     & 0.763 &  9.696 & 0.779 &  9.846 & 0.739 & 11.979 \\
& TransUNet        & 0.750 &  9.730 & 0.782 &  9.900 & 0.745 & 10.400 \\
& IB-TransUNet     & 0.810 &  9.250 & 0.793 &  9.800 & 0.747 & 10.300 \\
& nnU-Net          & 0.762 &  9.763 & 0.804 &  9.714 & 0.752 & 10.274 \\
& IB-nnU-Net       & \textbf{0.818*} & \textbf{9.177*} & \textbf{0.821*} & \textbf{9.672*} & \textbf{0.756*} & \textbf{10.181*} \\
\hline
\multirow{8}{*}{$24$} & SegResNet       & 0.809 &  9.993 & 0.805 & 11.741 & 0.818 &  7.954 \\
& IB-SegResNet     & 0.831 &  8.990 & 0.808 & 11.213 & 0.819 &  8.252 \\
& Attention U-Net  & 0.851 &  8.660 & 0.816 & 10.972 & 0.841 &  9.725 \\
& IB-Att-U-Net     & 0.862 &  8.604 & 0.819 & 10.106 & 0.843 &  8.920 \\
& TransUNet        & 0.850 &  8.800 & 0.816 & 10.100 & 0.866 &  8.900 \\
& IB-TransUNet     & 0.857 &  8.700 & 0.822 & 10.000 & 0.884 &  8.700 \\
& nnU-Net          & 0.870 &  7.812 & 0.823 &  9.981 & 0.870 &  8.852 \\
& IB-nnU-Net       & \textbf{0.879*} & \textbf{7.668*} & \textbf{0.831*} & \textbf{9.671*} & \textbf{0.888*} & \textbf{8.568*} \\
\hline
\end{tabular}
\caption{Accuracy and robustness of U-Net variants on MSD-hippocampus, MSD-prostate, and MSD-spleen for training subset sizes 8, 16, and 24.}
\label{table:Performance-Table-Metrics-MSD-Only}
\end{table}
\vspace{-1mm}


% TABLE 3
\begin{table}[t]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|c|c|c|}
\hline
Size & Model &
\multicolumn{2}{c|}{No noise} &
\multicolumn{2}{c|}{Gaussian blur} &
\multicolumn{2}{c|}{Random Gaussian noise} \\
\cline{3-8}
 &  & DSC$(\uparrow)$ & HD-95$(\downarrow)$
    & DSC$(\uparrow)$ & HD-95$(\downarrow)$
    & DSC$(\uparrow)$ & HD-95$(\downarrow)$ \\
\hline
\multirow{8}{*}{$8$}
& SegResNet       & 0.559 & 55.413 & 0.392 & 127.450 & 0.510 & 90.016 \\
& IB-SegResNet    & 0.624 & 29.484 & 0.495 &  66.931 & 0.639 & 22.567 \\
& Attention U-Net & 0.512 & 89.027 & 0.479 &  44.768 & 0.508 & 87.453 \\
& IB-Att-U-Net    & 0.674 & 14.007 & 0.623 &  26.420 & 0.662 & 15.149 \\
& TransUNet       & 0.722 & 13.200 & 0.701 &  15.462 & 0.716 & 13.560 \\
& IB-TransUNet    & 0.730 & 12.400 & 0.713 &  14.167 & 0.718 & 13.464 \\
& nnU-Net         & 0.728 & 12.592 & 0.707 &  14.750 & 0.722 & 12.935 \\
& IB-nnU-Net      & \textbf{0.742*} & \textbf{11.125*} & 0.725 & 12.710 & 0.730 & 12.080 \\
\hline
\multirow{8}{*}{$16$}
& SegResNet       & 0.618 & 42.979 & 0.603 &  56.850 & 0.611 & 44.502 \\
& IB-SegResNet    & 0.705 & 18.748 & 0.650 &  47.239 & 0.678 & 24.019 \\
& Attention U-Net & 0.582 & 60.322 & 0.241 & 196.081 & 0.589 & 50.391 \\
& IB-Att-U-Net    & 0.622 & 38.066 & 0.461 & 113.534 & 0.617 & 41.565 \\
& TransUNet       & 0.753 & 11.400 & 0.704 &  14.160 & 0.747 & 12.878 \\
& IB-TransUNet    & 0.763 & 11.000 & 0.728 &  12.866 & 0.751 & 13.834 \\
& nnU-Net         & 0.759 & 11.207 & 0.710 &  13.920 & 0.753 & 12.660 \\
& IB-nnU-Net      & \textbf{0.796*} & \textbf{8.272*} & 0.760 &  9.675 & 0.783 & 10.403 \\
\hline
\multirow{8}{*}{$24$}
& SegResNet       & 0.645 & 30.436 & 0.532 &  56.045 & 0.594 & 56.915 \\
& IB-SegResNet    & 0.768 &  9.315 & 0.596 &  39.574 & 0.723 & 22.853 \\
& Attention U-Net & 0.690 & 24.905 & 0.449 & 101.603 & 0.640 & 44.640 \\
& IB-Att-U-Net    & 0.737 & 11.955 & 0.487 &  90.554 & 0.645 & 33.852 \\
& TransUNet       & 0.800 &  9.150 & 0.791 &   8.624 & 0.797 &  8.939 \\
& IB-TransUNet    & 0.802 &  9.050 & 0.791 &   8.566 & 0.793 &  8.420 \\
& nnU-Net         & 0.803 &  8.938 & 0.794 &   8.424 & 0.800 &  8.732 \\
& IB-nnU-Net      & \textbf{0.811*} & \textbf{8.863*} & 0.800 &   8.389 & 0.802 &  8.246 \\
\hline
\end{tabular}
\caption{Accuracy and robustness of U-Net variants on PROMISE-12 under no noise, Gaussian blur, and additive Gaussian noise.}
\label{table:Performance-Table-Metrics-PROMISE12-Only}
\vspace{-4mm}
\end{table}

Second, we evaluate performance using all available training volumes via 5-fold cross-validation on MSD-heart, MSD-hippocampus, MSD-prostate, MSD-spleen, PROMISE-12, AMOS-2022, and the two PSMA-PET datasets. In addition, we use AMOS-2022, PROSTATEx, and Prostate158 as out-of-distribution test sets. All experiments were conducted on an NVIDIA Titan RTX GPU (24 GB); adding IB kernels increased VRAM usage by less than 1\% and training time by at most 2 seconds per epoch.

\paragraph{Cylindrical versus Spherical IB Kernels: }Table~\ref{table:Various_IB_UNets} compares cylindrical and spherical IB kernels for IB-nnU-Net on a PROMISE-12 development subset. Spherical kernels substantially outperform cylindrical ones, whereas cylindrical IB variants can underperform the baseline nnU-Net, particularly when capturing 3D contours. Cylindrical kernels overemphasise planar structures and struggle with anisotropic voxel spacing and artefacts, supporting the necessity of the spherical 3D design.

\paragraph{IB-nnU-Net Variants: }We conducted ablations to determine an effective IB-nnU-Net configuration (Figure~\ref{fig:oocsArchitecture}). We explored adding IB layers after max-pooling (yielding only marginal improvements), introducing IBs symmetrically in encoder and decoder blocks, and inserting IB kernels in all encoder blocks (which resulted in overfitting). The most effective configuration places the IB block only in the second encoder block, with input taken before max-pooling and IB convolutions using stride 2. As shown in Table~\ref{table:Various_IB_UNets}, this configuration yields the best trade-off between accuracy and robustness. We also experimented with kernel sizes $k \in \{3, 5, 7, 9\}$ and associated parameters $(r, \rho)$. A kernel size of $k=3$ did not improve over nnU-Net on the development subset (Table~\ref{table:Various_IB_UNets}) and was less stable across tasks. Larger kernels increased memory and computation with no consistent benefit over $k=5$, which is the default.

\paragraph{Robustness on Small Training Subsets: }Tables~\ref{table:Performance-Table-Metrics-MSD-Only} and~\ref{table:Performance-Table-Metrics-PROMISE12-Only} summarise the segmentation performance of all four original U-Nets and their IB-extended variants on small subsets of MSD-hippocampus, MSD-prostate, MSD-spleen, and PROMISE-12. We report DSC and HD-95; SDC results are provided in the appendix. The IB extensions consistently outperform their corresponding baselines across nearly all settings. Among all variants, IB-nnU-Net achieves the highest accuracy and robustness, with pronounced gains for training sizes of 8 and 16. Figure~\ref{fig:attention_maps} illustrates that IB-nnU-Net exhibits fewer spurious activations and more accurate delineation, especially in challenging PROMISE-12 and MSD-spleen scans. Figure~\ref{fig:featuremaps} shows that IB kernels act as effective edge detectors, retaining sharper boundary information in early encoder blocks. 

\paragraph{Robustness on Noisy Data: }PROMISE-12 is challenging due to anisotropic voxel spacing, heterogeneous acquisition protocols, and variable prostate appearance. To further probe robustness, we introduce Gaussian blur and additive Gaussian noise at test time. Table~\ref{table:Performance-Table-Metrics-PROMISE12-Only} reports performance across noise conditions and training subset sizes. In all settings, IB-nnU-Net matches or outperforms nnU-Net, with the largest gains in HD-95. The IB extensions confer similar robustness benefits to SegResNet, Attention U-Net, and TransUNet, indicating that IB kernels enhance resilience to acquisition artefacts and noise without sacrificing accuracy on clean data.

\paragraph{Performance on Full Datasets: }We next evaluate nnU-Net and IB-nnU-Net using all available training volumes for MSD-heart, MSD-hippocampus, MSD-prostate, MSD-spleen, PROMISE-12, and AMOS-2022 via 5-fold cross-validation (Table~\ref{table:CV_All_Metrics}). As expected for larger datasets, absolute DSC improvements are modest~\cite{nnunet_revisited_isensee}, but IB-nnU-Net consistently matches or slightly surpasses nnU-Net, with more noticeable gains in HD-95, reflecting improved boundary localisation. On AMOS-2022, IB-nnU-Net achieves a mean DSC of 89.59\% vs 88.64\% for nnU-Net. On the official MSD test set (public leaderboard), IB-nnU-Net surpasses nnU-Net when both models are trained exclusively on the MSD training data. %Higher-ranking leaderboard methods typically rely on larger external datasets, a validation pitfall noted by~\citet{nnunet_revisited_isensee}. 
On PROMISE-12, IB-nnU-Net achieves a challenge score of 89.69 versus 89.65 for nnU-Net. Table~\ref{table:CV_All_Metrics} reports prostate tumour segmentation from PET images, and IB-nnU-Net improves over nnU-Net for both tracers. This indicates that the proposed IB is also beneficial for challenging tumour segmentation in PET, characterised by noisy, low-resolution signals and heterogeneous tracer uptake.

\begin{table}[t]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\hline                  
Metric & Model & MSD & MSD & PROMISE & MSD & AMOS &  PET & PET \\ 
& & hippo. & prostate & prostate & spleen & 2022 & $^{68}$Ga & $^{18}$F \\
 &  & ($n$=260) & ($n$=32) & ($n$=50) & ($n$=41) & ($n$=300) & ($n$=68) & ($n$=65) \\
\hline
DSC & nnU-Net & $0.909$ & $0.882$ & $0.890$ & $0.966$ & $0.886$ & $0.711$ & $0.768$ \\
($\uparrow$) & IB-nnU-Net & $\mathbf{0.910}$ & $\mathbf{0.895}$ & $\mathbf{0.902}$ & $\mathbf{0.970}$ & $\mathbf{0.896}$ & $\mathbf{0.749}$ & $\mathbf{0.777}$ \\
\hline
HD-95 & nnU-Net & $1.068$ & $1.980$ & $1.735$ & $1.640$ &  $1.932$  & $12.458$ & $10.764$ \\
($\downarrow$) & IB-nnU-Net & $\mathbf{1.064}$ & $\mathbf{1.738}$ & $\mathbf{1.237}$ & $\mathbf{1.189}$ &  $\mathbf{1.625}$  & $\mathbf{11.041}$ & $\mathbf{10.038}$ \\
\hline
\end{tabular}
\caption{Average 5-fold cross-validation accuracy of nnU-Net and IB-nnU-Net on full datasets. The standard deviation values are listed in the appendix.}
\label{table:CV_All_Metrics}
\vspace{-2mm}
\end{table}

\begin{table}[t]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|c|}
\hline
\multirow{2}{*}{Training dataset} & \multirow{2}{*}{Testing dataset}
& \multicolumn{2}{c|}{DSC $(\uparrow)$}
& \multicolumn{2}{c|}{HD-95 $(\downarrow)$} \\
\cline{3-6}
 &  & nnU-Net & IB-nnU-Net & nnU-Net & IB-nnU-Net \\
\hline
PROMISE-12   & Prostate158 (Prostate)   & 0.812 & \textbf{0.831} & 4.450 & \textbf{3.450} \\
MSD-prostate & Prostate158 (Prostate)   & 0.826 & \textbf{0.845} & 4.050 & \textbf{3.150} \\
PROMISE-12   & PROSTATEx (Prostate)     & 0.922 & \textbf{0.931} & 2.000 & \textbf{1.700} \\
MSD-spleen   & AMOS-2022 (Spleen)       & 0.927 & \textbf{0.940} & 1.800 & \textbf{1.350} \\
\hline
\end{tabular}
\caption{Out-of-distribution performance of nnU-Net and IB-nnU-Net.}
\label{table:Performance-Out-Of-Distribution}
\vspace{-2mm}
\end{table}

\paragraph{Out-of-Distribution Generalisation:} We examine cross-dataset generalisation without fine-tuning. Models trained on PROMISE-12 or MSD-prostate are evaluated on Prostate158 and PROSTATEx; models trained on MSD-spleen are evaluated on AMOS-2022 spleen cases (Table~\ref{table:Performance-Out-Of-Distribution}). IB-nnU-Net consistently outperforms nnU-Net across all out-of-distribution scenarios in both DSC and HD-95. These gains are practically relevant, as clinical deployment often involves datasets acquired with different scanners, protocols, or patient populations than those seen during training.



\begin{figure}[t]
\centering
\includegraphics[width=\linewidth]{FeatureMaps.pdf}
\caption{Feature maps from early encoder stages of IB-nnU-Net (left) and nnU-Net (right) trained for prostate segmentation. IB-nnU-Net retains sharper boundary information and reduces irrelevant activations.}
\label{fig:featuremaps}
\vspace{-3mm}
\end{figure}

\begin{table}[t]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
Comparison Type & Model Name/Setting & DSC ($\uparrow$) & HD-95 ($\downarrow$) & SDC ($\uparrow$) \\
\hline
\multirow{4}{*}{Different IB kernel parameters}
 & $k{=}3,\,r{=}1,\,\rho{=}1/2$ & 0.640 & 29.318 & 0.639 \\
 & $k{=}5,\,r{=}2,\,\rho{=}2/3$ & \textbf{0.665} & 27.522 & 0.651 \\
 & $k{=}7,\,r{=}3,\,\rho{=}3/4$ & \textbf{0.665} & \textbf{27.510} & \textbf{0.652} \\
 & $k{=}9,\,r{=}4,\,\rho{=}4/5$ & 0.660 & 28.591 & 0.650 \\
\hline
\end{tabular}
\caption{Ablation study on IB-nnU-Net design choices using a development subset of size 8 from MSD-Spleen. The aim is to determine whether tuning the IB parameters on MSD-Spleen itself produces different results from those obtained on the PROMISE-12 subset.}
\label{table:Various_IB_UNets_MSDSpleen}
\vspace{-4mm}
\end{table}



\begin{table}[t]

\fontsize{9}{10}\selectfont
\centering
%%\vspace0.1in}
\begin{tabular}{|c|c||c|c|c||c|c|c|}
\hline
Size  & Model Name
& DSC$(\uparrow)$ & HD95$(\downarrow)$ & SDC$(\uparrow)$
& $\Delta$DSC$(\uparrow)$ & $\Delta$HD95$(\downarrow)$ & $\Delta$SDC$(\uparrow)$ \\
\hline

\multicolumn{8}{|c|}{\textit{Different 16-sample subset from MSD-Spleen (trained with fixed IB kernels)}} \\
\hline
\multirow{8}{*}{$16$}
& SegResNet      & $0.693$ & $20.695$ & $0.671$ & $-0.011$ & $+4.182$ & $-0.029$ \\
& IB-SegResNet   & $0.697$ & $20.152$ & $0.682$ & $-0.018$ & $+5.821$ & $-0.037$ \\
& Att-U-Net      & $0.696$ & $18.098$ & $0.681$ & $-0.032$ & $+5.334$ & $-0.036$ \\
& IB-Att-U-Net   & $0.698$ & $17.302$ & $0.687$ & $-0.041$ & $+5.323$ & $-0.038$ \\
& Trans-U-Net    & $0.706$ & $14.120$ & $0.674$ & $-0.039$ & $+3.720$ & $-0.062$ \\
& IB-Trans-U-Net & $0.704$ & $13.800$ & $0.699$ & $-0.043$ & $+3.500$ & $-0.040$ \\
& nnU-Net        & $0.701$ & $14.409$ & $0.696$ & $-0.051$ & $+4.135$ & $-0.053$ \\
& IB-nnU-Net     & $\mathbf{0.725*}$ & $\mathbf{12.102*}$ & $\mathbf{0.706*}$ & $-0.031$ & $+1.921$ & $-0.056$ \\
\hline\hline

\multicolumn{8}{|c|}{\textit{Learnable IB kernels using the original 16 samples from MSD-Spleen}} \\
\hline
\multirow{4}{*}{$16$}
& IB-SegResNet   & $0.714$ & $14.303$ & $0.716$ & $-0.001$ & $-0.028$ & $-0.003$ \\
& IB-Att-U-Net   & $0.737$ & $11.952$ & $0.724$ & $-0.002$ & $-0.027$ & $-0.001$ \\
& IB-Trans-U-Net & $0.749$ & $10.223$ & $0.738$ & $+0.002$ & $-0.077$ & $-0.001$ \\
& IB-nnU-Net     & $\mathbf{0.755}$ & $\mathbf{10.152}$ & $\mathbf{0.761}$ & $-0.001$ & $-0.029$ & $-0.001$ \\
\hline
\end{tabular}
\caption{Evaluation of the IB-extended U-Nets on a different subset of samples (top) and with learnable, rather than fixed, IB kernels (bottom). For DSC and SDC, a positive $\Delta$ indicates that the new configuration performed better than the corresponding model trained on the original subset with fixed IB kernels; for HD-95, a negative $\Delta$ indicates an improvement.}
\label{table:AdditionalExperiments}
\vspace{-2mm}
\end{table}

\section{Additional Analyses: Hyperparameter Transfer, Subset Sensitivity, and Learnable IB Kernels}
\label{subsec:additional_analyses}

In this section we analyse the generalisability of our fixed IB hyperparameters, the sensitivity of limited-data results to subset sampling, and the impact of allowing IB kernels to be learnable. We conducted three additional analyses on MSD-Spleen and improved variance reporting for cross-validation results.

\paragraph{Dataset-specific hyperparameter adaptation on MSD-Spleen:} While our default IB configuration $(k{=}5, r{=}2, \rho{=}2/3)$ was selected on a small PROMISE-12 development subset and then held fixed across all experiments, we additionally performed a dataset-specific hyperparameter ablation on a small MSD-Spleen development subset of size 8. Table~\ref{table:Various_IB_UNets_MSDSpleen} shows that the best-performing settings on MSD-Spleen are close to our default choice: $k{=}5$ yields the highest DSC, while $k{=}7$ achieves a marginally lower HD-95 and slightly higher SDC with essentially identical DSC. Overall, these results indicate that adapting the IB parameters to a different dataset does not change the main conclusion: the PROMISE-12-selected configuration transfers well, and the method is not brittle to modest changes in kernel size and associated parameters.

\paragraph{Robustness to limited-data subset sampling:} For the limited-data experiments, subset construction can introduce sampling variance. To quantify this effect, we trained and evaluated all U-Net variants on a different randomly drawn 16-sample subset from MSD-Spleen using the same protocol and fixed IB kernels. The top block of Table~\ref{table:AdditionalExperiments} reports absolute metrics on this alternative subset alongside $\Delta$ values relative to the corresponding results obtained with the original subset. As expected, absolute performance shifts across architectures due to the changed subset; however, the overall pattern remains stable, with IB-extended variants remaining competitive and IB-nnU-Net continuing to provide strong accuracy/robustness. This supports the conclusion that the observed improvements are not an artefact of a single subset draw.

\paragraph{Fixed versus learnable IB kernels:} Here we evaluate whether fixing the IB kernel weights is necessary, or whether allowing them to be learnable (starting from the same DoG initialisation) provides meaningful gains. We therefore repeated training on the original 16-sample MSD-Spleen subset while allowing the IB kernel weights to update during training. The bottom block of Table~\ref{table:AdditionalExperiments} shows that learnable-IB performance is extremely close to the fixed-IB setting across backbones, with only minimal changes in DSC/SDC and HD-95. This suggests that the fixed kernels already capture most of the benefit of the inductive bias in this regime, supporting our design choice to keep the kernels fixed and parameter-neutral.

\paragraph{Cross-validation variance reporting:} Finally, to improve statistical reporting for full-data experiments, we now explicitly note in the main cross-validation table (Table~\ref{table:CV_All_Metrics}) that standard deviations across the five folds are provided in the appendix. This complements the mean performance values and helps assess variability across folds.


\section{Discussion and Conclusion}
\label{Discussion_and_Conclusion}

We introduced two fixed 3D kernels inspired by on/off centre-surround pathways in the vertebrate retina and integrated them as inductive biases into 3D U-Net variants. These kernels act as complementary edge detectors with pre-computed weights and add no learnable parameters. When inserted into the second encoder block, they improve boundary representations and enhance robustness, particularly on small datasets and in out-of-distribution scenarios. Our experiments show that IB-extended U-Nets provide the strongest relative gains when training data are limited or when test data differ substantially from the training distribution, as reflected in improved HD-95 and SDC scores and qualitative visualisations (Figures~\ref{fig:attention_maps} and~\ref{fig:featuremaps}). For large datasets with high-contrast structures, improvements over nnU-Net are smaller, consistent with performance saturation on well-curated benchmarks~\cite{nnunet_revisited_isensee}. These benchmark experiments~\cite{nnunet_revisited_isensee} were conducted by the original authors of the nnU-Net framework, where they showcased the robustness of CNN-based architectures over recent ones such as transformers, Mamba, etc. This is one of the main reasons we limited our model selections mainly to CNN-based (except TransUNet) architectures.

\textbf{Limitations.} IB kernels do not guarantee large gains in every setting. For large organs with clear boundaries and abundant training data ($>50$), the inductive bias becomes less critical. Our ablations also show that naively placing IB kernels in all encoder blocks or symmetrically in encoder–decoder fashion can lead to overfitting or negligible benefits, underscoring the importance of the chosen configuration (second encoder block, $k=5$). 

\textbf{Future work.} Adapting the IB concept to transformer-based encoders---for example through attention-based analogues of centre-surround processing---could extend the benefits to a broader class of architectures. Allowing the IB kernel parameters to be lightly learned, but regularised toward the biologically motivated initialisation, may enable task-specific adaptation without sacrificing robustness. Systematic evaluation on additional modalities such as ultrasound and histopathology would further test generality.

The IB kernels introduce negligible computational overhead and retain the parameter count of the original U-Net variants, making them attractive as a drop-in modification for existing 3D segmentation pipelines. The improvements in boundary quality and robustness---particularly for small datasets, anisotropic acquisitions such as PROMISE-12, PET tumour segmentation, and cross-dataset transfer to Prostate158, PROSTATEx, and AMOS-2022---suggest that such inductive biases are practically useful in clinical scenarios where collecting large amounts of labelled data is difficult.

In summary, equipping U-Net-style architectures with biologically inspired 3D IB kernels yields consistent robustness gains at virtually no parameter cost. IB-nnU-Net and related variants offer a simple yet effective approach to improving 3D medical image segmentation in clinically relevant regimes characterised by limited data and distribution shifts.



\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\newpage
\midlacknowledgments{This research was funded in part by the Austrian Science Fund (FWF) [grant number: I 6605], and the German Federal Ministry of Education and Research (BMBF) [grant number: 01KT2325] under the ERA-NET TRANSCAN-3 initiative project MATTO-GBM. The authors declare that there are no conflicts of interest.}


\bibliography{midl26_311}

\newpage
\appendix

\section{Additional Experimental Details and Results}

\begin{figure}[!htbp]
\centering
\includegraphics[width=\linewidth]{IB_kernels.pdf}
%%\vspace{0.25ex}
\caption{Geometrical representations of the IB kernels. (a) Spherical On and Off 3D centre-surround receptive fields. (b) The 3D IB-On cubic kernel. The 3D IB-Off cubic kernel is complementary: all its signs are inverted.}
\label{fig:kernelModels}
\vspace{-2mm}
\end{figure}

\begin{table}[!htbp]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|}
\hline
Dataset & Patch Size & Batch \\
\hline
MSD-Heart & $160{\times}160{\times}64$ & 2 \\
MSD-Hippocampus & $160{\times}160{\times}96$ & 2 \\
MSD-Prostate & $160{\times}160{\times}64$ & 2 \\
PROMISE-12 & $128{\times}128{\times}64$ & 2 \\
MSD-Spleen & $192{\times}160{\times}64$ & 2 \\
Private PET & $128{\times}128{\times}128$ & 2 \\
\hline
\end{tabular}
\caption{Model training patch sizes and batch sizes.}
\label{tab:patchsizes}
\vspace{-4mm}
\end{table}

% Appendix table: SDC only
\begin{table}[!htbp]
\fontsize{8}{9}\selectfont
\centering
\begin{tabular}{|c|c|c|c|}
\hline
\multirow{2}{*}{Training dataset} & \multirow{2}{*}{Testing dataset}
& \multicolumn{2}{c|}{SDC $(\uparrow)$} \\
\cline{3-4}
 &  & nnU-Net & IB-nnU-Net \\
\hline
PROMISE-12   & Prostate158 (Target: Prostate) & $0.822 \pm 0.133$ & $\mathbf{0.841 \pm 0.080}$ \\
MSD-prostate & Prostate158 (Target: Prostate) & $0.836 \pm 0.093$ & $\mathbf{0.855 \pm 0.069}$ \\
PROMISE-12   & PROSTATEx (Target: Prostate)   & $0.932 \pm 0.050$ & $\mathbf{0.941 \pm 0.044}$ \\
MSD-spleen   & AMOS-2022 (Target: Spleen)     & $0.933 \pm 0.130$ & $\mathbf{0.946 \pm 0.081}$ \\
\hline
\end{tabular}
\caption{SDC results of the nnU-Net and IB-nnU-Net on out-of-distribution samples.}
\label{table:Performance-Out-Of-Distribution-SDC}
\vspace{-4mm}
\end{table}

\begin{table}[!htbp]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
Size & Model &
\multicolumn{1}{c|}{MSD-Hippocampus} &
\multicolumn{1}{c|}{MSD-Prostate} &
\multicolumn{1}{c|}{MSD-Spleen} \\
\cline{3-5}
 & Name & SDC$(\uparrow)$ & SDC$(\uparrow)$ & SDC$(\uparrow)$ \\
\hline
\multirow{8}{*}{$8$} & SegResNet       & $0.568$ & $0.523$ & $0.602$ \\
& IB-SegResNet     & $0.585$ & $0.605$ & $0.607$ \\
& Attention U-Net  & $0.630$ & $0.490$ & $0.613$ \\
& IB-Att-U-Net     & $0.636$ & $0.578$ & $0.620$ \\
& Trans-U-Net      & $0.642$ & $0.578$ & $0.638$ \\
& IB-Trans-U-Net   & $0.645$ & $0.665$ & $0.667$ \\
& nnU-Net          & $0.643$ & $0.614$ & $0.655$ \\
& IB-nnU-Net       & $\mathbf{0.655*}$ & $\mathbf{0.699*}$ & $\mathbf{0.658*}$ \\
\hline
\multirow{8}{*}{$16$} & SegResNet       & $0.680$ & $0.699$ & $0.700$ \\
& IB-SegResNet     & $0.694$ & $0.740$ & $0.719$ \\
& Attention U-Net  & $0.733$ & $0.684$ & $0.717$ \\
& IB-Att-U-Net     & $0.745$ & $0.756$ & $0.725$ \\
& Trans-U-Net      & $0.733$ & $0.751$ & $0.736$ \\
& IB-Trans-U-Net   & $0.756$ & $0.796$ & $0.739$ \\
& nnU-Net          & $0.751$ & $0.758$ & $0.749$ \\
& IB-nnU-Net       & $\mathbf{0.800*}$ & $\mathbf{0.797*}$ & $\mathbf{0.762*}$ \\
\hline
\multirow{8}{*}{$24$} & SegResNet       & $0.792$ & $0.799$ & $0.822$ \\
& IB-SegResNet     & $0.813$ & $0.809$ & $0.812$ \\
& Attention U-Net  & $0.838$ & $0.799$ & $0.828$ \\
& IB-Att-U-Net     & $0.851$ & $0.823$ & $0.833$ \\
& Trans-U-Net      & $0.831$ & $0.813$ & $0.863$ \\
& IB-Trans-U-Net   & $0.839$ & $0.818$ & $0.866$ \\
& nnU-Net          & $0.852$ & $0.820$ & $0.856$ \\
& IB-nnU-Net       & $\mathbf{0.864*}$ & $\mathbf{0.829*}$ & $\mathbf{0.877*}$ \\
\hline
\end{tabular}
\caption{SDC results of U-Net variants on MSD-Hippocampus, MSD-Prostate, and MSD-Spleen datasets.}
\label{tab:sdc-appendix}
\vspace{-4mm}
\end{table}

\begin{table}[!htbp]
\fontsize{9}{10}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|}
\hline
Size & Model Name &
\multicolumn{1}{c|}{No-Noise} &
\multicolumn{1}{c|}{Gaussian Blur} &
\multicolumn{1}{c|}{Random Gaussian} \\
\cline{3-5}
 &  & SDC$(\uparrow)$ & SDC$(\uparrow)$ & SDC$(\uparrow)$ \\
\hline
\multirow{8}{*}{$8$}
& SegResNet       & 0.551 & 0.386 & 0.566 \\
& IB-SegResNet    & 0.579 & 0.494 & 0.594 \\
& Attention U-Net & 0.455 & 0.461 & 0.445 \\
& IB-Att-U-Net    & 0.638 & 0.526 & 0.615 \\
& Trans-U-Net     & 0.655 & 0.563 & 0.666 \\
& IB-Trans-U-Net  & 0.658 & 0.585 & 0.646 \\
& nnU-Net         & 0.647 & 0.556 & 0.658 \\
& IB-nnU-Net      & $\mathbf{0.670*}$ & 0.596 & 0.658 \\
\hline
\multirow{8}{*}{$16$}
& SegResNet       & 0.573 & 0.472 & 0.507 \\
& IB-SegResNet    & 0.597 & 0.608 & 0.565 \\
& Attention U-Net & 0.608 & 0.322 & 0.642 \\
& IB-Att-U-Net    & 0.649 & 0.482 & 0.689 \\
& Trans-U-Net     & 0.699 & 0.664 & 0.685 \\
& IB-Trans-U-Net  & 0.701 & 0.663 & 0.645 \\
& nnU-Net         & 0.713 & 0.677 & 0.699 \\
& IB-nnU-Net      & $\mathbf{0.762*}$ & 0.721 & 0.701 \\
\hline
\multirow{8}{*}{$24$}
& SegResNet       & 0.665 & 0.543 & 0.662 \\
& IB-SegResNet    & 0.744 & 0.661 & 0.683 \\
& Attention U-Net & 0.625 & 0.475 & 0.604 \\
& IB-Att-U-Net    & 0.652 & 0.646 & 0.638 \\
& Trans-U-Net     & 0.766 & 0.714 & 0.739 \\
& IB-Trans-U-Net  & 0.771 & 0.725 & 0.734 \\
& nnU-Net         & 0.794 & 0.740 & 0.766 \\
& IB-nnU-Net      & $\mathbf{0.810*}$ & 0.762 & 0.771 \\
\hline
\end{tabular}
\caption{SDC results of U-Net variants on PROMISE-12 under No-Noise, Gaussian Blur Noise, and Random Gaussian Noise.}
\label{tab:PROMISE12-SDC-appendix}
\vspace{-2mm}
\end{table}

\begin{table}[!htbp]
\fontsize{8}{9}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|c|}
\hline                  
Metric & Model & MSD-hippocampus & MSD-prostate & PROMISE-12  & MSD-spleen  \\ 
 & Name & ($260$) & ($32$) & ($50$) & ($41$) \\
\hline
SDC & nnU-Net & $0.986 \pm 0.016$ & $0.876 \pm 0.117$ & $0.910 \pm 0.121$ & $0.976 \pm 0.052$ \\
($\uparrow$) & IB-nnU-Net & $\mathbf{0.986 \pm 0.015}$ & $\mathbf{0.879 \pm 0.078}$ & $\mathbf{0.920 \pm 0.093}$ & $\mathbf{0.982 \pm 0.026}$ \\
\hline
\end{tabular}
\caption{SDC results (mean $\pm$ standard deviation) of nnU-Net and IB-nnU-Net on the full datasets.}
\label{table:CV_All_SDC}
\vspace{2mm}
\end{table}


\begin{table}[!htbp]
\fontsize{8}{9}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\hline
Metric (SD) & Model & MSD & MSD & PROMISE & MSD & AMOS & PET & PET \\
& & hippo. & prostate & prostate & spleen & 2022 & $^{68}$Ga & $^{18}$F \\
& & ($n$=260) & ($n$=32) & ($n$=50) & ($n$=41) & ($n$=300) & ($n$=68) & ($n$=65) \\
\hline
DSC SD & nnU-Net    & $0.028$ & $0.118$ & $0.101$ & $0.044$ & $0.070$ & $0.057$ & $0.061$ \\
($\uparrow$) & IB-nnU-Net & $0.028$ & $0.042$ & $0.036$ & $0.017$ & $0.032$ & $0.026$ & $0.027$ \\
\hline
HD-95 SD & nnU-Net    & $0.177$ & $1.203$ & $0.565$ & $4.708$ & $0.730$ & $4.963$ & $4.512$ \\
($\downarrow$) & IB-nnU-Net & $0.166$ & $1.122$ & $0.556$ & $1.914$ & $0.629$ & $4.057$ & $3.505$ \\
\hline
\end{tabular}
\caption{Fold-wise standard deviation (SD) across 5-fold cross-validation for the full-dataset results in Table~\ref{table:CV_All_Metrics}.}
\label{tab:appendix_cv_sd}
\vspace{-4mm}
\end{table}

\begin{table}[!htbp]
\fontsize{8}{9}\selectfont
\centering
\begin{tabular}{|c|c|c|c|c|c|}
\hline
\multirow{2}{*}{Training dataset} & \multirow{2}{*}{Testing dataset}
& \multicolumn{2}{c|}{DSC SD $(\uparrow)$}
& \multicolumn{2}{c|}{HD-95 SD $(\downarrow)$} \\
\cline{3-6}
 &  & nnU-Net & IB-nnU-Net & nnU-Net & IB-nnU-Net \\
\hline
PROMISE-12   & Prostate158 (Prostate)   & $0.131$ & $\mathbf{0.079}$ & $1.873$ & $\mathbf{1.772}$ \\
MSD-prostate & Prostate158 (Prostate)   & $0.092$ & $\mathbf{0.068}$ & $1.372$ & $\mathbf{1.172}$ \\
PROMISE-12   & PROSTATEx (Prostate)     & $0.049$ & $\mathbf{0.044}$ & $0.479$ & $\mathbf{0.229}$ \\
MSD-spleen   & AMOS-2022 (Spleen)       & $0.129$ & $\mathbf{0.081}$ & $0.653$ & $\mathbf{0.624}$ \\
\hline
\end{tabular}
\caption{Standard deviation (SD) estimates for the out-of-distribution metrics in Table~\ref{table:Performance-Out-Of-Distribution}.}
\label{tab:appendix_ootd_sd}
\vspace{-4mm}
\end{table}

\end{document}
