\appendix

\counterwithin{figure}{section}
\counterwithin{table}{section}
\renewcommand{\thefigure}{\thesection\arabic{figure}}
\renewcommand{\thetable}{\thesection\arabic{table}}

\section{Robustness Evaluation: Noise Curves}
\label{ap:robust_noise_curves}
Figure~\ref{fig:dsc_busi} shows DSC degradation curves across noise intensities for the BUSI dataset. Hyperbolic U-Net/U-Net++ exhibit almost no degradation even at high noise levels, while the performance of Euclidean U-Net/U-Net++ degrades quickly. Although the performance of nnU-Net stays relatively stable for Poisson and Speckle noise, it degrades for the other noise types and for brightness and contrast shifts.

\begin{figure}[h]
  \includegraphics[width=\linewidth]{figs/busi_unet_unetpp_unetpp_nnunet2_cv_hyp_dice_score_1.pdf}
  \caption{DSC degradation curves across noise intensities for BUSI dataset.}
  \label{fig:dsc_busi}
\end{figure}

Figure~\ref{fig:dsc_kvasir} shows DSC degradation curves across noise intensities for the KVASIR dataset. Hyperbolic U-Net/U-Net++ and nnU-Net show relatively less degradation overall compared to Euclidean U-Net/U-Net++, which degrade quickly. Hyperbolic U-Net/U-Net++ and nnU-Net perform similarly across Gaussian, Poisson and Speckle noise. However, Hyperbolic U-Net/U-Net++ remain robust to all perturbations.

\begin{figure}[h]
  \includegraphics[width=\linewidth]{figs/kvasir_unet_unetpp_unetpp_nnunet_hyp_dice_score_1.pdf}
  \caption{DSC degradation curves across noise intensities for KVASIR dataset.}
  \label{fig:dsc_kvasir}
\end{figure}

\section{Robustness Evaluation: Qualitative Results}
\label{ap:robust_qual_res}
Figures~\ref{fig:busi_qual} and~\ref{fig:kvasir_qual} show qualitative comparisons of Hyperbolic U-Net and Euclidean U-Net on the BUSI and KVASIR datasets, respectively. We report the predictions on a sample from each dataset under the following perturbations: Gaussian ($\sigma_{g}=0.2$), Speckle ($\sigma_{s}=0.3$), Poisson ($\lambda=10$), Rician ($\sigma_{r}=0.2$), Brightness ($\Delta_{b}=0.5$), Contrast ($\Delta_{c}=0.3$). The segmentation predictions of Hyperbolic U-Net remain relatively stable under mid-high levels of noise compared to those of Euclidean U-Net.
\begin{figure}[t]
\floatconts
  {fig:busi_qual}
  {\caption{\textbf{Qualitative example of Hyperbolic versus Euclidean U-Net on a breast ultrasound image for mid-high level noise types.} We report the predictions on a BUSI dataset sample on the following perturbations: Gaussian ($\sigma_{g}=0.2$), Speckle ($\sigma_{s}=0.3$), Poisson ($\lambda=10$), Rician ($\sigma_{r}=0.2$), Brightness ($\Delta_{b}=0.5$), Contrast ($\Delta_{c}=0.3$).}}
  {\includegraphics[width=\linewidth]{figs/busi_qualitative.pdf}}
\end{figure}

\begin{figure}[t]
%
%
\floatconts
  {fig:kvasir_qual}
  {\caption{\textbf{Qualitative example of Hyperbolic versus Euclidean U-Net for mid-high level noise types.} We report the predictions on a KVASIR dataset sample on the following perturbations: Gaussian ($\sigma_{g}=0.2$), Speckle ($\sigma_{s}=0.3$), Poisson ($\lambda=10$), Rician ($\sigma_{r}=0.2$), Brightness ($\Delta_{b}=0.5$), Contrast ($\Delta_{c}=0.3$).}}
  {\includegraphics[width=\linewidth]{figs/kvasir_qualitative.pdf}}
\end{figure}

Figure~\ref{fig:nnunet_vs_hypunet_isic16} shows qualitative comparisons of Hyperbolic U-Net and nnU-Net under high levels of noise and perturbations. Despite being trained with multiple data augmentation transformations, nnU-Net is not as robust as Hyperbolic U-Net.

\begin{figure}[t]
%
%
\floatconts
  {fig:nnunet_vs_hypunet_isic16}
  {\caption{\textbf{Qualitative example of Hyperbolic U-Net versus nnU-Net on a skin lesion image for heavy noise types.} We report the predictions on an ISIC16 dataset sample on the following perturbations: Gaussian ($\sigma_{g}=0.5$), Speckle ($\sigma_{s}=0.5$), Poisson ($\lambda=5$), Rician ($\sigma_{r}=0.5$), Brightness ($\Delta_{b}=0.8$), Contrast ($\Delta_{c}=0.2$).}}
  {\includegraphics[width=\linewidth]{figs/nnunet_vs_hypunet_isic16.pdf}}
\end{figure}

\section{Evaluation: Hyperbolic U-Net}
\label{ap:full_results}

Table~\ref{tab:full_results} summarizes the evaluation of Hyperbolic U-Net and U-Net on Dice score (DSC), mean Intersection over Union (mIoU), dataset IoU (dIoU), Hausdorff Distance (HD) and 95th-percentile Hausdorff Distance (HD95). We report the evaluation metrics of Hyperbolic U-Net versus a standard U-Net with the same architecture and number of parameters on seven datasets. Both networks achieve similar scores across most of the metrics.
\begin{table}[t]
\centering
\small
\resizebox{\linewidth}{!}{%
\begin{tabular}{llccccc}
\toprule
 \textbf{Dataset} & \textbf{Model} & \textbf{DSC$\uparrow$} & \textbf{mIoU$\uparrow$} & \textbf{dIoU$\uparrow$} & \textbf{HD$\downarrow$} & \textbf{HD95$\downarrow$} \\
\midrule
\multirow{2}{*}{ISIC16}
& U-Net & \textbf{0.92} & 0.91 & 0.91 & 1.43 & 0.09\\
& \cellcolor{Gray} Hyp U-Net & \cellcolor{Gray} 0.91 & \cellcolor{Gray} \textbf{0.93} & \cellcolor{Gray} \textbf{0.93} & \cellcolor{Gray} \textbf{1.18} & \cellcolor{Gray} \textbf{0.06} \\
\hline
\multirow{2}{*}{ISIC18}
& U-Net & \textbf{0.89} & \textbf{0.91} & \textbf{0.90} & \textbf{0.36} & \textbf{0.04} \\
& \cellcolor{Gray} Hyp U-Net & \cellcolor{Gray} 0.87 & \cellcolor{Gray} \textbf{0.91} & \cellcolor{Gray} \textbf{0.90} & \cellcolor{Gray} 0.47 & \cellcolor{Gray} 0.05\\
\hline
\multirow{2}{*}{BUSI}
& U-Net & \textbf{0.82} & 0.88 & 0.87 & 13.89 & 4.74\\
& \cellcolor{Gray} Hyp U-Net & \cellcolor{Gray} 0.80 & \cellcolor{Gray} \textbf{0.89} & \cellcolor{Gray} \textbf{0.88} & \cellcolor{Gray} \textbf{9.56} & \cellcolor{Gray} \textbf{3.30} \\
\hline
\multirow{2}{*}{SANET}
& U-Net & \textbf{0.78} & 0.88 & 0.86 & 1.30 & 0.49\\
& \cellcolor{Gray} Hyp U-Net & \cellcolor{Gray} 0.76 & \cellcolor{Gray} \textbf{0.89} & \cellcolor{Gray} \textbf{0.87} & \cellcolor{Gray} \textbf{0.87} & \cellcolor{Gray} \textbf{0.27}\\
\hline
\multirow{2}{*}{KVASIR}
& U-Net & \textbf{0.86} & \textbf{0.93} & \textbf{0.93} & 9.91 & \textbf{0.42}\\
& \cellcolor{Gray} Hyp U-Net & \cellcolor{Gray} 0.83 & \cellcolor{Gray} \textbf{0.93} & \cellcolor{Gray} \textbf{0.93} & \cellcolor{Gray} \textbf{9.88} & \cellcolor{Gray} 0.50\\
\hline
\multirow{2}{*}{ACTA}
& U-Net & 0.50 & \textbf{0.98} & \textbf{0.98} & 35719.49 & 35719.11\\
& \cellcolor{Gray} Hyp U-Net & \cellcolor{Gray} \textbf{0.54} & \cellcolor{Gray} \textbf{0.98} & \cellcolor{Gray} \textbf{0.98} & \cellcolor{Gray} \textbf{10.47} & \cellcolor{Gray} \textbf{9.16} \\
\hline
\multirow{2}{*}{DCBR}
& U-Net & 0.62 & \textbf{0.95} & \textbf{0.95} & \textbf{10.08} & 6.74\\
& \cellcolor{Gray} Hyp U-Net & \cellcolor{Gray} \textbf{0.65} & \cellcolor{Gray} \textbf{0.95} & \cellcolor{Gray} \textbf{0.95} & \cellcolor{Gray} 12.25 & \cellcolor{Gray} \textbf{4.75} \\
\bottomrule
\end{tabular}}
\caption{Evaluation of Hyperbolic U-Net and U-Net on Dice score (DSC), mean Intersection over Union (mIoU), dataset IoU (dIoU), Hausdorff Distance (HD) and 95th-percentile Hausdorff Distance (HD95).}
\label{tab:full_results}
\end{table}

\section{Evaluation: Robustness Dice Scores}
\label{ap:robust_unetpp}

Table~\ref{tab:robustness_unetpp} presents the DSC of Hyperbolic U-Net++ versus U-Net++ on clean data and under mid-high noise levels. The trends are consistent with those in Table~\ref{tab:robustness}. Under noise, Hyperbolic U-Net++ significantly outperforms Euclidean U-Net++ across all datasets and noise types, with an average improvement of 32\% on Gaussian noise, 27\% on Speckle noise, 26\% on Poisson noise, 37\% on Rician noise, 15\% on Brightness shift and 21\% on Contrast variation.

\begin{table}[t]
\centering
\small
\resizebox{\linewidth}{!}{%
\begin{tabular}{llccccccc}
\toprule
 \textbf{Dataset} & \textbf{Model} & \textbf{Clean} & \textbf{Gaussian} & \textbf{Speckle} & \textbf{Poisson} & \textbf{Rician} & \textbf{Brightness} & \textbf{Contrast} \\
\midrule
\multirow{2}{*}{ISIC16}
& U-Net++ & \emph{0.92} & 0.72 & 0.84 & 0.78 & 0.80 & 0.76 & 0.80 \\
& \cellcolor{Gray} Hyp U-Net++ &  \emph{0.92} & \cellcolor{Gray} \textbf{0.90} & \cellcolor{Gray} \textbf{0.90} & \cellcolor{Gray} \textbf{0.91} & \cellcolor{Gray} \textbf{0.91} & \cellcolor{Gray} \textbf{0.85} & \cellcolor{Gray} \textbf{0.89} \\
\hline
\multirow{2}{*}{ISIC18}
& U-Net++ & \emph{0.89} & 0.54 & 0.55 & 0.59 & 0.46 & 0.79 & 0.73 \\
& \cellcolor{Gray} Hyp U-Net++ & \emph{0.87} & \cellcolor{Gray} \textbf{0.86} & \cellcolor{Gray} \textbf{0.86} & \cellcolor{Gray} \textbf{0.86} & \cellcolor{Gray} \textbf{0.86} & \cellcolor{Gray} \textbf{0.84} & \cellcolor{Gray} \textbf{0.79} \\
\hline
\multirow{2}{*}{BUSI}
& U-Net++ & \emph{0.81} & 0.54 & 0.58 & 0.59 & 0.47 & 0.58 & 0.53 \\
& \cellcolor{Gray} Hyp U-Net++ & \emph{0.82} & \cellcolor{Gray} \textbf{0.80} & \cellcolor{Gray} \textbf{0.80} & \cellcolor{Gray} \textbf{0.81} & \cellcolor{Gray} \textbf{0.79} & \cellcolor{Gray} \textbf{0.80} & \cellcolor{Gray} \textbf{0.78} \\
\hline
\multirow{2}{*}{SANET}
& U-Net++ & \emph{0.80} & 0.49 & 0.49 & 0.49 & 0.49 & 0.58 & 0.62 \\
& \cellcolor{Gray} Hyp U-Net++ & \emph{0.78} & \cellcolor{Gray} \textbf{0.59} & \cellcolor{Gray} \textbf{0.58} & \cellcolor{Gray} \textbf{0.56} & \cellcolor{Gray} \textbf{0.60} & \cellcolor{Gray} \textbf{0.67} & \cellcolor{Gray} \textbf{0.69} \\
\hline
\multirow{2}{*}{KVASIR}
& U-Net++ & \emph{0.84} & 0.47 & 0.49 & 0.47 & 0.48 & 0.58 & 0.53 \\
& \cellcolor{Gray} Hyp U-Net++ & \emph{0.85} & \cellcolor{Gray} \textbf{0.68} & \cellcolor{Gray} \textbf{0.71} & \cellcolor{Gray} \textbf{0.68} & \cellcolor{Gray} \textbf{0.70} & \cellcolor{Gray} \textbf{0.71} & \cellcolor{Gray} \textbf{0.81} \\
\hline
\multirow{2}{*}{ACTA}
& U-Net++ & \emph{0.51} & 0.50 & 0.50 & 0.50 & 0.50 & 0.50 & 0.50 \\
& \cellcolor{Gray} Hyp U-Net++ & \emph{0.55} & \cellcolor{Gray} \textbf{0.51} & \cellcolor{Gray} \textbf{0.54} & \cellcolor{Gray} \textbf{0.53} & \cellcolor{Gray} \textbf{0.52} & \cellcolor{Gray} \textbf{0.52} & \cellcolor{Gray} \textbf{0.52} \\
\hline
\multirow{2}{*}{DCBR}
& U-Net++ & \emph{0.69} & 0.51 & 0.53 & 0.52 & 0.51 & 0.53 & 0.52 \\
& \cellcolor{Gray} Hyp U-Net++ & \emph{0.64} & \cellcolor{Gray} \textbf{0.62} & \cellcolor{Gray} \textbf{0.63} & \cellcolor{Gray} \textbf{0.62} & \cellcolor{Gray} \textbf{0.60} & \cellcolor{Gray} \textbf{0.57} & \cellcolor{Gray} \textbf{0.58} \\
\bottomrule
\end{tabular}}
\caption{\textbf{Robust medical image segmentation Dice scores.} We report the effect of hyperbolic U-Net++ versus U-Net++, with the same architecture and number of parameters, on seven datasets and six noise types. We use the following settings for all: Gaussian ($\sigma_{g}=0.2$), Speckle ($\sigma_{s}=0.3$), Poisson ($\lambda=10$), Rician ($\sigma_{r}=0.2$), Brightness ($\Delta_{b}=0.5$), Contrast ($\Delta_{c}=0.3$). We find that a hyperbolic U-Net++ is much more robust to noise as well as brightness and contrast shifts.}
\label{tab:robustness_unetpp}
\end{table}


\section{Experimental Setup: Datasets}
\label{ap:exp_data}
We evaluate on seven diverse medical imaging datasets spanning different modalities and anatomical regions: 
\begin{itemize}
    \item \textbf{Skin lesion:} ISIC 2016 (900 train, 379 test) and ISIC 2018 (2594 train, 1000 test) images.
    \item \textbf{Breast ultrasound:} BUSI (702 train, 78 test) images.
    \item \textbf{Polyp:} KVASIR (900 train, 100 test) and SANET (1450 train, 798 test) images.
    \item \textbf{Dental caries:} ACTA (2043 train, 227 test) and DCBR (1142 train, 127 test) images.
\end{itemize}

We use the official train/test splits where available and set aside 10\% from the training split for testing in other cases. We also create a validation split during training, where we randomly set aside 10\% of the data and treat it as a held-out set.

\textbf{Preprocessing. } We first zero-pad the images wherever the height and the width of the image differ. Then, we resize the images to a $256 \times 256$ resolution in order to reduce the computational costs (crucial in the case of hyperbolic U-Net). All RGB images are then rescaled to lie in $[0,1]$. For the grayscale images, we use a z-score normalization similar to the one performed in nnU-Net~\cite{isensee2021nnu}.

\section{Experimental Setup: Test Perturbations}
\label{ap:exp_noise}

To evaluate robustness under realistic acquisition imperfections, we apply a set of controlled perturbations representing common noise processes and intensity variations encountered in medical imaging. All perturbations are applied independently at test time and swept across multiple severity levels.

\subsection{Noise Models}

\paragraph{Channel-wise Gaussian Noise.}
We apply additive white Gaussian noise (AWGN) independently per channel:
\[
I'_{c} = I_{c} + n_{c}, \quad n_{c} \sim \mathcal{N}(0, \sigma_{c}^2),
\]
with channel-wise standard deviations
\[
\sigma_{g} \in
\begin{Bmatrix}
[0.00,\, 0.00,\, 0.00], \\
[0.12,\, 0.15,\, 0.18], \\
[0.15,\, 0.20,\, 0.25], \\
[0.18,\, 0.22,\, 0.28], \\
[0.20,\, 0.25,\, 0.30], \\
[0.25,\, 0.30,\, 0.35], \\
[0.30,\, 0.35,\, 0.40], \\
[0.35,\, 0.40,\, 0.45], \\
[0.40,\, 0.45,\, 0.50], \\
[0.45,\, 0.50,\, 0.55], \\
[0.50,\, 0.55,\, 0.60]
\end{Bmatrix}.
\]

\paragraph{Poisson Noise.}
Photon-counting noise is simulated as
\[
I' \sim \text{Poisson}(\lambda I),
\]
where the peak count parameter is varied over
\[
\lambda \in \{5,\, 10,\, 20,\, 50,\, 100,\, 200,\, 500,\, 10000\}.
\]
These values span extremely noisy low-dose conditions to near noise-free acquisition.

\paragraph{Channel-wise Speckle Noise.}
Speckle noise is applied multiplicatively per channel:
\[
I'_{c} = I_{c} \cdot (1 + n_{c}), \quad n_{c} \sim \mathcal{N}(0, \sigma_{c}^2),
\]
with the same per-channel standard-deviation levels as for Gaussian noise:
\[
\sigma_{s} \in
\begin{Bmatrix}
[0.00,\, 0.00,\, 0.00], \\
[0.12,\, 0.15,\, 0.18], \\
[0.15,\, 0.20,\, 0.25], \\
[0.18,\, 0.22,\, 0.28], \\
[0.20,\, 0.25,\, 0.30], \\
[0.25,\, 0.30,\, 0.35], \\
[0.30,\, 0.35,\, 0.40], \\
[0.35,\, 0.40,\, 0.45], \\
[0.40,\, 0.45,\, 0.50], \\
[0.45,\, 0.50,\, 0.55], \\
[0.50,\, 0.55,\, 0.60]
\end{Bmatrix}.
\]

\paragraph{Rician Noise.}
MRI-specific noise is modeled as the magnitude of complex Gaussian components:
\[
I' = \sqrt{(I + n_1)^2 + n_2^2}, \quad n_1, n_2 \sim \mathcal{N}(0, \sigma_{r}^2),
\]
with
\[
\sigma_{r} \in \{0.00,\, 0.12,\, 0.15,\, 0.18,\, 0.20,\, 0.25,\, 0.30,\, 0.35,\, 0.40,\, 0.45,\, 0.50\}.
\]

\subsection{Brightness and Contrast Perturbations}

\paragraph{Brightness Shifts.}
Brightness is modified as
\[
I' = I + \Delta_b,
\]
with shift values
\[
\Delta_b \in \{0.0,\, 0.2,\, 0.3,\, 0.4,\, 0.5,\, 0.6,\, 0.7,\, 0.8,\, 0.9,\, 1.0\}.
\]

\paragraph{Contrast Scaling.}
Contrast is varied using the affine transformation
\[
I' = \Delta_c I + (1 - \Delta_c)\mu_I,
\]
where \(\mu_I\) is the image mean and
\[
\Delta_c \in \{0.0,\, 0.2,\, 0.3,\, 0.4,\, 0.5,\, 0.6,\, 0.7,\, 0.8,\, 0.9,\, 1.0\}.
\]


\section{Transposed Convolution vs. Bilinear Upsampling}

Table~\ref{tab:tc_vs_bi} compares the test Dice scores and memory consumption of the proposed Hyperbolic U-Net when employing either transposed convolution or bilinear upsampling within the decoder blocks. Across all seven datasets, the two variants exhibit nearly identical segmentation performance, indicating that the choice of upsampling operation has minimal impact on accuracy. This suggests that the performance gains of our architecture stem primarily from its hyperbolic representation and not from a specific decoder upsampling strategy.

\label{ap:tc_vs_bi}
\begin{table}[h!]
\centering
\begin{tabular}{l|cc|cc}
\hline
\multirow{2}{*}{\textbf{Dataset}} 
  & \multicolumn{2}{c}{\textbf{Transposed Convolution}} 
  & \multicolumn{2}{c}{\textbf{Bilinear Upsample}} \\
\cline{2-5}
 & \textbf{Dice score} & \textbf{Mem. Util.} 
 & \textbf{Dice score} & \textbf{Mem. Util.} \\
\hline
ISIC16 & 0.91 & 1.39 GB & 0.91 & 1.39 GB \\
ISIC18 & 0.87 & 1.39 GB & 0.88 & 1.39 GB \\
BUSI & 0.80 & 1.40 GB & 0.79 & 1.39 GB \\
SANET & 0.75 & 1.40 GB & 0.75 & 1.39 GB \\
KVASIR & 0.83 & 1.40 GB & 0.84 & 1.40 GB \\
ACTA & 0.74 & 1.40 GB & 0.75 & 1.39 GB \\
DCBR & 0.68 & 1.39 GB & 0.67 & 1.38 GB \\
\hline
\end{tabular}
\caption{Dice score and memory utilization comparison between Hyperbolic U-Net with transposed convolution and bilinear upsampling layer.}
\label{tab:tc_vs_bi}
\end{table}


\section{Derivation of Hyperbolic Bilinear Upsampling}
\label{ap:hyp_bi_up}

We derive the proposed hyperbolic bilinear upsampling by drawing a direct analogy to classical Euclidean bilinear interpolation and replacing Euclidean geometric primitives with their Riemannian counterparts.

\subsection{Euclidean Bilinear Interpolation}

Let the four corners of a unit square be given by vectors
\[
\boldsymbol{x}_{00}, \boldsymbol{x}_{10}, \boldsymbol{x}_{01}, \boldsymbol{x}_{11} \in \mathbb{R}^d,
\]
located at Euclidean coordinates $(0,0)$, $(1,0)$, $(0,1)$, and $(1,1)$, respectively. For a point $(s,t) \in [0,1]^2$, Euclidean bilinear interpolation is defined as
\begin{equation}
\label{eq:bilinear_euclidean}
\boldsymbol{h}(s,t)
= (1-t)\big[(1-s)\boldsymbol{x}_{00} + s\boldsymbol{x}_{10}\big]
+ t\big[(1-s)\boldsymbol{x}_{01} + s\boldsymbol{x}_{11}\big].
\end{equation}

Equivalently, this operation can be written as two successive linear interpolations. First, interpolate along the horizontal direction:
\begin{align}
\boldsymbol{h}_0(s) &= (1-s)\boldsymbol{x}_{00} + s\boldsymbol{x}_{10}, \\
\boldsymbol{h}_1(s) &= (1-s)\boldsymbol{x}_{01} + s\boldsymbol{x}_{11},
\end{align}
followed by interpolation along the vertical direction:
\begin{equation}
\boldsymbol{h}(s,t) = (1-t)\boldsymbol{h}_0(s) + t\boldsymbol{h}_1(s).
\end{equation}

Geometrically, each linear interpolation corresponds to moving along a straight line segment between two points in $\mathbb{R}^d$.

\subsection{From Straight Lines to Geodesics}

In a Riemannian manifold $(\mathcal{M}, \mathfrak{g})$, a smooth path $\gamma$ of minimal length between two points $\boldsymbol{a}$ and $\boldsymbol{b}$ is called a geodesic, and can be seen as the generalization of a straight line in Euclidean space. The geodesic $\gamma$ connecting $\boldsymbol{a}$ and $\boldsymbol{b}$ can be expressed using the exponential and logarithmic maps as
\[
\gamma(\boldsymbol{a}, \boldsymbol{b}; t)
= \exp_{\boldsymbol{a}}\big(t \log_{\boldsymbol{a}}(\boldsymbol{b})\big), 
\quad t \in [0,1].
\]

\subsection{Hyperbolic Bilinear Interpolation}

We now extend bilinear interpolation to the Poincar\'e ball model $\mathbb{B}_c^d$ by replacing each Euclidean linear interpolation with its geodesic counterpart.

Let $
\boldsymbol{x}_{00}, \boldsymbol{x}_{10}, \boldsymbol{x}_{01}, \boldsymbol{x}_{11} \in \mathbb{B}_c^d
$
be four neighboring representations. For interpolation weights $(s,t) \in [0,1]^2$, we first interpolate along the horizontal direction using geodesics:
\begin{align}
\boldsymbol{h}_0(s) &= \gamma\big(\boldsymbol{x}_{00}, \boldsymbol{x}_{10}; s\big), \\
\boldsymbol{h}_1(s) &= \gamma\big(\boldsymbol{x}_{01}, \boldsymbol{x}_{11}; s\big).
\end{align}

We then interpolate between these intermediate points along the vertical direction:
\begin{equation}
\label{eq:bilinear_hyperbolic}
\boldsymbol{h}(s,t)
= \gamma\big(\boldsymbol{h}_0(s), \boldsymbol{h}_1(s); t\big).
\end{equation}

This construction exactly mirrors Euclidean bilinear interpolation, with straight lines replaced by geodesics and linear interpolation replaced by geodesic interpolation.

\subsection{Consistency and Validity}

The proposed hyperbolic bilinear interpolation satisfies the following properties:
\begin{itemize}
    \item \textbf{Manifold closure:} Each interpolation step follows a geodesic in $\mathbb{B}_c^d$, ensuring that the output remains within the Poincar\'e ball.
    \item \textbf{Reduction to the Euclidean case:} As the curvature parameter $c \rightarrow 0$, the exponential and logarithmic maps converge to their Euclidean counterparts, and Eq.~\eqref{eq:bilinear_hyperbolic} reduces to classical bilinear interpolation.
    \item \textbf{Geometric consistency:} The construction replaces Euclidean straight-line interpolation with geodesic interpolation, yielding an operation that is consistent with the underlying Riemannian geometry.
\end{itemize}

Therefore, hyperbolic bilinear upsampling arises as the natural Riemannian generalization of Euclidean bilinear interpolation.

\section{Derivation of Newton-Scaled Hyperbolic Weight Initialization}

We derive the Newton-scaled hyperbolic weight initialization by extending variance-preserving Euclidean initialization schemes to the hyperbolic setting, where we instead preserve the expected magnitude of representations as measured by the hyperbolic distance.

\subsection{Motivation: Hyperbolic Norm Preservation}

In Euclidean neural networks, common initialization schemes such as Kaiming or orthogonal initialization aim to preserve the expected squared $\ell_2$ norm of activations across layers. However, in hyperbolic space, the Euclidean norm is no longer geometrically meaningful. Instead, the notion of feature magnitude is given by the hyperbolic distance to a reference point, typically the origin.

Given a Poincar\'e fully connected layer $
\boldsymbol{y} = \mathcal{F}^c(\boldsymbol{x}; Z, \boldsymbol{r}),
$
we therefore seek to preserve the expected squared hyperbolic distance
$
\mathbb{E}_{\boldsymbol{x}}\big[d_c(\boldsymbol{y}, 0)^2\big]
$
across layers. This motivates enforcing the condition
\begin{equation}
\label{eq:hyperbolic_norm_condition}
\mathbb{E}_{\boldsymbol{x}}\big[d_c(\mathcal{F}^c(\boldsymbol{x}; Z, \boldsymbol{r}), 0)^2\big]
\approx
\mathbb{E}_{\boldsymbol{x}}\big[d_c(\boldsymbol{x}, 0)^2\big].
\end{equation}

\subsection{Reduction to a Scalar Scaling Problem}

We begin by initializing the Euclidean parameter matrix $Z$ using a standard initialization scheme (e.g., Kaiming or orthogonal). Rather than modifying the structure of the layer, we introduce a scalar rescaling $s > 0$ applied uniformly to $Z$, yielding the scaled mapping
\[
\boldsymbol{y}(s) = \mathcal{F}^c(\boldsymbol{x}; s Z, \boldsymbol{r}).
\]

This reduces the norm preservation condition in Eq.~\eqref{eq:hyperbolic_norm_condition} to finding a scalar root of the function
\begin{equation}
\label{eq:gs_def}
g(s)
=
\mathbb{E}_{\boldsymbol{x}}\big[d_c(\boldsymbol{y}(s), 0)^2\big]
-
\mathbb{E}_{\boldsymbol{x}}\big[d_c(\boldsymbol{x}, 0)^2\big].
\end{equation}

Importantly, $g(s)$ is a one-dimensional, continuously differentiable function of $s$, since $\mathcal{F}^c$ is smooth with respect to its parameters.

\subsection{Newton's Method for Scalar Root Finding}

Since $g(s)$ admits no closed-form root due to the nonlinear nature of hyperbolic operations, we approximate a solution using Newton's method. Starting from an initial guess $s_0 = 1$, the iteration proceeds as
\begin{equation}
\label{eq:newton_update}
s_{t+1} = s_t - \frac{g(s_t)}{g'(s_t)}.
\end{equation}

The derivative $g'(s)$ is computed efficiently using automatic differentiation, as $\mathcal{F}^c$ is fully differentiable with respect to $s$. In practice, the expectations in Eq.~\eqref{eq:gs_def} are approximated using a randomly sampled batch of inputs. We found that Newton's method converged rapidly in all encountered configurations.

\subsection{Implementation}

The initialization procedure is applied layer-wise, starting from the first layer, as follows:
\begin{enumerate}
    \item Initialize $Z$ using a standard Euclidean scheme.
    \item Perform a single forward pass up to the layer that is to be initialized with a randomly sampled batch to collect layer inputs.
    \item Compute $s$ by applying Newton's method to Eq.~\eqref{eq:gs_def}.
    \item Rescale the weight matrix as $Z \leftarrow s Z$.
\end{enumerate}

This procedure yields stable initializations even in architectures involving dimensionality expansion, such as transposed convolutions.

\section{Feature Space Separation Analysis under Noise}
\label{ap:inter_class_distances}

To understand the source of the improved robustness observed in Hyperbolic U-Net under increasing noise levels, we analyze the geometric structure of the learned feature representations. In particular, we study how well different semantic classes are separated in the feature space of hyperbolic and Euclidean models, and how this separation evolves as a function of noise degradation.

For both Hyperbolic U-Net and Euclidean U-Net, we extract intermediate feature embeddings from the final decoder stage before the prediction layer. These embeddings correspond to spatial feature vectors associated with each pixel location in the input image. 

We quantify feature separation by measuring:
\begin{itemize}
    \item \textbf{Inter-class distances:} distances between feature embeddings corresponding to pixels belonging to different classes (e.g., foreground class vs. background class).
    \item \textbf{Intra-class distances:} distances between feature embeddings corresponding to pixels within the same class.
\end{itemize}

The distances between feature embeddings are computed using the geometry of the corresponding model: for the hyperbolic model, pairwise distances are computed using the hyperbolic distance in the Poincar\'e ball, whereas distances for the Euclidean embeddings are computed using the standard Euclidean distance. Due to the high spatial resolution of the images $(256 \times 256)$, we randomly subsample 1024 feature pairs per image to compute these distances. This subsampling strategy is applied consistently across all models, datasets, and noise levels.

\subsection{Separation Ratio}

To summarize the relative separation between classes, we define a separation ratio as:
\[
\text{Separation Ratio} = \frac{\text{Mean Inter-class Distance}}{\text{Mean Intra-class Distance}}.
\]
A higher separation ratio indicates stronger relative class separation, corresponding to a larger margin between classes in the feature space.

\subsection{Results}

We analyze the separation of learned feature representations for the KVASIR dataset under varying noise conditions similar to our previous robustness experiments. The trends are consistent across different datasets. Figure~\ref{fig:inter_intra_distances} shows the evolution of inter-class and intra-class distances as a function of noise intensity. Note that absolute distance values are not directly comparable between models, as they are computed in different geometric spaces. The key comparison is the relative separation (shaded area) within each model's geometry.

\begin{figure}[t]
\floatconts
  {fig:inter_intra_distances}
  {\caption{Evolution of feature space separation under increasing noise perturbations. Solid lines represent mean inter-class distances, while dashed lines represent mean intra-class distances for Euclidean U-Net (gray) and Hyperbolic U-Net (red). The shaded regions indicate the separation margin between classes. A larger shaded area corresponds to better class discriminability. Hyperbolic U-Net maintains consistently wider margins across all perturbation types and intensities, indicating superior preservation of geometric structure under noise degradation.}}
  {\includegraphics[width=\linewidth]{figs/kvasir_euc_hyp_inter_class_mean_intra_class_mean.pdf}}
\end{figure}

\textbf{Consistent separation advantage.} Across all six perturbation types (Gaussian noise, Poisson noise, speckle noise, Rician noise, brightness, and contrast), Hyperbolic U-Net exhibits larger separation margins (shaded regions) between inter-class and intra-class distances compared to Euclidean U-Net. This larger margin indicates that hyperbolic features maintain better class discriminability in the learned representation space.

To quantify the separation behavior with a single metric, we visualize the mean separation ratio (Figure~\ref{fig:separation_ratio}). Higher separation ratios indicate stronger relative class separation. Hyperbolic U-Net consistently achieves separation ratios that are 18--200\% higher than those of Euclidean U-Net across all perturbation types, with a mean improvement of 97\%.


\begin{figure}[t]
\floatconts
  {fig:separation_ratio}
  {\caption{Separation ratio (inter-class distance / intra-class distance) as a function of noise intensity across six perturbation types. Higher values indicate better relative class separation in the learned feature space. Hyperbolic U-Net consistently achieves higher separation ratios than Euclidean U-Net, particularly under severe noise conditions, demonstrating its ability to maintain discriminative feature representations when input quality degrades.}}
  {\includegraphics[width=\linewidth]{figs/kvasir_euc_hyp_separation_ratio_mean.pdf}}
\end{figure}


These results suggest that the hyperbolic geometry provides well-separated feature representations. The inherent exponential growth of volume in hyperbolic space may enable the model to maintain larger margins between semantic classes while keeping within-class features compact, even when the input signal is corrupted. This geometric advantage translates directly into improved segmentation performance under noise, as evidenced by the Dice score improvements reported in the main paper.

\section{Curvature Ablation Study}

To study the effect of curvature on model performance and stability, we conducted an ablation over both fixed and trainable curvature settings in the Poincar\'e ball model. We evaluated two commonly used initial curvature values, $c = 0.1$ and $c = 1.0$, consistent with prior work such as Poincar\'e ResNet \cite{van2023poincare}. For all settings, the remaining architecture, optimization parameters, and training protocol were kept identical.

\subsection{Fixed vs. Trainable Curvature}

Across all datasets and architectures, we did not observe any strong differences in training stability or segmentation performance between fixed and trainable curvature settings. All models converged reliably without exhibiting numerical instabilities.

When curvature was set to be trainable, initializing at $c=0.1$ yielded slightly higher Dice scores on average compared to other configurations. Based on this observation, we adopt a trainable curvature initialized at $c=0.1$ in our main experiments.

\subsection{Learned Curvature Values}

When curvature was trainable, the final learned curvature varied depending on several factors, including network depth, initial number of feature channels, dataset, decoder design (transposed convolution vs. bilinear upsampling), and weight initialization. Across all experimental settings, the learned curvature values consistently converged to the range $[0.1, 1.5]$. Importantly, no systematic drift toward extreme curvature values was observed.

\subsection{Performance Sensitivity to Curvature}

Overall, segmentation performance was not highly sensitive to the choice of initial curvature. Models initialized with different curvature values achieved comparable Dice scores, and all configurations converged reliably.

This suggests that the Poincar\'e ball formulation is robust to curvature initialization, particularly when curvature is allowed to be trainable. The ability to adapt curvature during training enables the model to adjust the effective geometry of the representation space to the data and task.

\subsection{Summary}

These results indicate that while curvature influences the geometric structure of the embedding space, the proposed Hyperbolic U-Net is not critically dependent on a specific curvature value. A trainable curvature initialized at $c = 0.1$ provides a stable and slightly favorable default, and is therefore used throughout the paper.

\section{Hyperbolic U-Net Architecture}

Here, we detail the architecture of the proposed Hyperbolic U-Net, including its building blocks and the handling of skip connections and norm stability. The architecture is an analogue of the standard Euclidean U-Net, where all operations are replaced by their hyperbolic counterparts while preserving the overall encoder--decoder structure.

\subsection{Overall Architecture}

The Hyperbolic U-Net architecture consists of:
\begin{itemize}
    \item an exponential map operation that embeds the Euclidean pixel vectors into hyperbolic space,
    \item an encoder composed of repeated Down blocks,
    \item a decoder composed of repeated Up blocks with skip connections, and
    \item a logarithmic map operation that maps the logits back into Euclidean space.
\end{itemize}
All feature representations are maintained on the Poincar\'e ball throughout the network.

\subsection{DoubleConv Block}

The fundamental building block of the architecture is the DoubleConv module, which mirrors the Euclidean U-Net design. Each DoubleConv block applies a convolution--normalization--activation sequence twice in succession:
\[
(\text{Hyperbolic Convolution} \rightarrow \text{Hyperbolic Batch Normalization} \rightarrow \text{Hyperbolic ReLU}) \times 2.
\]
All operations are defined on the Poincar\'e ball and follow the formulations introduced in Poincar\'e ResNet~\cite{van2023poincare}.

\subsection{Down Block (Encoder)}

Each Down block in the encoder consists of:
\begin{enumerate}
    \item Hyperbolic Max Pooling to reduce spatial resolution, followed by
    \item a DoubleConv block to increase feature expressivity.
\end{enumerate}
This structure allows the encoder to progressively aggregate global context while remaining entirely within hyperbolic space.

\subsection{Up Block (Decoder)}

Each Up block in the decoder performs spatial upsampling followed by feature fusion via skip connections. Two decoder variants are supported:
\begin{itemize}
    \item Hyperbolic Transposed Convolution, or
    \item Hyperbolic Bilinear Upsampling
\end{itemize}

In both cases, upsampling is followed by:
\begin{enumerate}
    \item Poincar\'e concatenation between the upsampled decoder features and the corresponding encoder features (\textbf{skip connection}), and
    \item a DoubleConv block.
\end{enumerate}

The Poincar\'e concatenation operation, introduced in Hyperbolic Neural Networks++~\cite{shimizu2021hyperbolic}, ensures that feature fusion preserves the manifold constraints and remains within the Poincar\'e ball.

\subsection{Summary}

The Hyperbolic U-Net is a reformulation of the standard U-Net architecture, where all operations are replaced by their hyperbolic counterparts while preserving the original architectural design. This enables a direct comparison between Euclidean and hyperbolic models while ensuring that all computations respect the underlying manifold structure.

\section{nnU-Net Baseline Implementation Details}
\label{ap:nnunet}

All baseline experiments are conducted using the official nnU-Net v2 implementation, obtained from the publicly released nnU-Net repository maintained by the original authors~\cite{isensee2021nnu, isensee2024nnu}. This repository corresponds to the nnU-Net v2 framework and includes the updated training, inference, and data preprocessing pipelines.

\subsection{Training and Inference Pipeline}

We use the standard nnU-Net v2 training and inference pipelines without modification, including automatic data preprocessing, default loss functions, default optimizer and learning rate schedules, and default data augmentation strategies.

\subsection{Architectural Configuration}

To ensure a fair architectural comparison between nnU-Net and our Hyperbolic U-Net models, we align the network depth and initial feature dimensionality across methods. Specifically, the nnU-Net architecture is configured with 4 levels and an initial feature dimension of 8 channels. All other architectural and hyperparameter settings remain at their default nnU-Net v2 values. No additional tuning or customization is applied.

\section{Inference-Time Analysis}

We evaluate inference-time performance using a batch size of 8 under identical hardware and an input resolution of $(256 \times 256)$. Hyperbolic U-Net incurs a substantially higher inference-time cost than Euclidean U-Net, corresponding to an approximately $12\times$ increase in batch inference time (Table~\ref{tab:inference_time}). This results in a significantly lower throughput in terms of images processed per second. The overhead arises from repeated hyperbolic operations (e.g., exponential and logarithmic maps), which are currently not supported by optimized GPU kernels and are executed at a higher computational cost. We emphasize that this work focuses on representational robustness rather than inference efficiency, and that improving the efficiency of hyperbolic operations is an important direction for future work.

\begin{table}[h!]
\centering
\begin{tabular}{lcc}
\toprule
\textbf{Model} & \textbf{Batch Time (ms) $\downarrow$} & \textbf{Throughput (img/s) $\uparrow$} \\
\midrule
Euclidean U-Net   & $1553.34 \pm 112.17$ & $5.15$\\
Hyperbolic U-Net & $18493.52 \pm 332.01$ & $0.43$\\
\bottomrule
\end{tabular}
\caption{Inference-time comparison between Euclidean U-Net and Hyperbolic U-Net using a batch size of 8. Inference time is reported as mean $\pm$ standard deviation over multiple forward passes on the same GPU. Throughput is computed as images processed per second.}
\label{tab:inference_time}
\end{table}

% NOTE: A verbatim duplicate of the preceding "Inference-Time Analysis" section
% (text, table, and \label{tab:inference_time}) was removed here; the duplicate
% label caused a "multiply defined label" warning.

\section{Ablation on Newton-Scaled Weight Initialization}

To evaluate the effect of the proposed Newton-scaled weight initialization, we perform an ablation study comparing three initialization strategies: Newton-scaled initialization, Shimizu initialization, and identity initialization, across seven datasets. Table~\ref{tab:init_ablation} reports the Dice scores obtained using each initialization under identical training settings.

Newton-scaled initialization consistently achieves comparable or improved Dice scores across most datasets, with more stable optimization during early training as shown in the main paper. These results indicate that the proposed initialization provides a favorable starting point for hyperbolic optimization, though final performance remains largely comparable across initialization schemes.

Our initialization strategy follows the same design principle as standard Euclidean initializations (e.g., Xavier or Kaiming), where weights are constrained only at initialization to ensure stable signal propagation at the start of training. In practice, we observe that subsequent training dynamics, combined with hyperbolic batch normalization, are sufficient to prevent uncontrolled norm growth, without requiring explicit norm clamping during training.

\begin{table}[h!]
\centering
\resizebox{\linewidth}{!}{%
\begin{tabular}{lccccccc}
\toprule
\textbf{Initialization} & \textbf{ISIC16} & \textbf{ISIC18} & \textbf{BUSI} & \textbf{SANET} & \textbf{KVASIR} & \textbf{ACTA} & \textbf{DCBR} \\
\midrule
Identity Init        & 0.84 & 0.85 & 0.76 & 0.73 & 0.80 & 0.53 & 0.62 \\
Shimizu Init         & 0.89 & 0.87 & 0.78 & 0.74 & 0.82 & 0.54 & 0.63 \\
Newton-Scaled Init   & \textbf{0.91} & \textbf{0.87} & \textbf{0.80} & \textbf{0.76} & \textbf{0.83} & \textbf{0.54} & \textbf{0.65} \\
\bottomrule
\end{tabular}}
\caption{Ablation study on weight initialization strategies. Dice scores are reported for Hyperbolic U-Net trained with different initializations across seven datasets.}
\label{tab:init_ablation}
\end{table}