\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{color,soul}
\usepackage{booktabs} % table lines
\jmlrvolume{}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[GAMBAS]{GAMBAS: Generalised-Hilbert Mamba for Super-resolution of Paediatric Ultra-Low-Field MRI}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Levente Baljer\midljointauthortext{Corresponding author}\nametag{$^{1,2}$}} \Email{levente.1.baljer@kcl.ac.uk}\\
\Name{Ula Briski\nametag{$^{1}$}} \Email{ula.briski@kcl.ac.uk}\\
\Name{Robert Leech\nametag{$^{1}$}} \Email{robert.leech@kcl.ac.uk}\\
\Name{Niall J Bourke\nametag{$^{1}$}} \Email{niall.bourke@kcl.ac.uk}\\
\Name{Kirsten A Donald\nametag{$^{3}$}} \Email{kirsten.donald@uct.ac.za}\\
\Name{Layla E Bradford\nametag{$^{3}$}} \Email{layla.bradford@uct.ac.za}\\
\Name{Simone R Williams\nametag{$^{3}$}} \Email{simone.williams@uct.ac.za}\\
\Name{Sadia Parkar\nametag{$^{4}$}} \Email{sadia.parkar@aku.edu}\\
\Name{Sidra Kaleem\nametag{$^{4}$}} \Email{sidra.kaleem@aku.edu}\\
\Name{Salman Osmani\nametag{$^{4}$}} \Email{salman.osmani@aku.edu}\\
\Name{Sean CL Deoni\nametag{$^{5}$}} \Email{sean.deoni@gatesfoundation.org}\\
\Name{Steven CR Williams\nametag{$^{1}$}} \Email{steve.williams@kcl.ac.uk}\\
\Name{Rosalyn J Moran\midljointauthortext{Joint senior authors}\nametag{$^{1}$}} \Email{rosalyn.moran@kcl.ac.uk}\\
\Name{Emma C Robinson{\normalfont\textsuperscript{\textdagger}}\nametag{$^{2}$}} \Email{emma.robinson@kcl.ac.uk}\\
\Name{František Váša{\normalfont\textsuperscript{\textdagger}}\nametag{$^{1}$}} \Email{frantisek.vasa@kcl.ac.uk}\\
\addr $^{1}$ Department of Neuroimaging, King's College London \\
\addr $^{2}$ Department of Biomedical Computing, King's College London \\
\addr $^{3}$ Department of Paediatrics and Child Health, University of Cape Town \\
\addr $^{4}$  Paediatrics and Child Health, Aga Khan University Hospital, Karachi \\
\addr $^{5}$ Bill \& Melinda Gates Foundation, Seattle, Washington, USA
}

\begin{document}

\maketitle

%\vspace{-0.7cm}

\begin{abstract}
Magnetic resonance imaging (MRI) is critical for neurodevelopmental research, however access to high-field (HF) systems in low- and middle-income countries is severely hindered by their cost. Ultra-low-field (ULF) systems mitigate such issues of access inequality, however their diminished signal-to-noise ratio limits their applicability for research and clinical use. Deep-learning approaches can enhance the quality of scans acquired at lower field strengths at no additional cost. For example, Convolutional neural networks (CNNs) fused with transformer modules have demonstrated a remarkable ability to capture both local information and long-range context. Unfortunately, the quadratic complexity of transformers leads to an undesirable trade-off between long-range sensitivity and local precision. We propose a hybrid CNN and state-space model (SSM) architecture featuring a novel 3D to 1D serialisation (GAMBAS), which learns long-range context without sacrificing spatial precision. We exhibit improved performance compared to other state-of-the-art medical image-to-image translation models. Our code is made publicly available at \url{https://github.com/levente-1/GAMBAS}.
\end{abstract}

\begin{keywords}
Mamba, Hilbert curve, MRI, Deep Learning, Image Quality Transfer.
\end{keywords}

\section{Introduction}

High-field (HF) MRI (\textgreater1 T) is vital for radiation-free neurodevelopmental research and clinical assessment. Unfortunately, the high cost of scanners ($\sim$\$1,000,000 per Tesla; \citealt{https://doi.org/10.1002/jmri.28408}) and corresponding infrastructural requirements result in significant access inequalities across the globe. When comparing access to imaging via MRI units per million population, over a hundred-fold difference is exhibited between high-income countries (HICs) and low- and middle-income countries (LMICs) (e.g. 51.67 units/million in the US vs 0.48 units/million in Ghana; \citealt{https://doi.org/10.1002/nbm.5022}). 

Ultra-low-field (ULF) imaging systems ($<$0.1T) present a potential solution to issues of access inequality. By relying on large, permanent magnets, they are significantly cheaper and easier to purchase and run than HF systems \cite{doi:10.1148/radiol.2019190452}. Additionally, their reduced acoustic noise and open design increase patient compliance, especially in paediatric populations in whom the risk of head motion is naturally higher \cite{https://doi.org/10.1002/mrm.29509}. Unfortunately, these benefits come at the cost of reduced signal-to-noise ratio (SNR) and resolution, which renders images suboptimal for visual reading by radiologists or for many currently available processing methods.

Super-resolution (SR) techniques may help bridge the gap between ULF and HF, with deep learning being particularly well-suited to handle the nonlinear differences in tissue contrast between field strengths. Convolutional neural networks (CNNs) have historically dominated the field of medical image translation \cite{CNNreview}, with adversarial training schemes using an added discriminator further enhancing the perceptual quality of outputs \cite{MedGAN}. CNNs exploit 
correlations among neighbouring voxels by extracting local features with compact filters. This reduces model complexity, however it limits the ability to model long-range spatial dependencies present in spatially-distributed brain structures, such as white matter or cerebrospinal fluid \cite{CNNinductivebias}. 

Vision transformers (ViTs) have shown an increased capacity to capture long-range context. By tokenizing an image into a sequence of patches and computing inter-patch similarity via attention operators, transformers learn to capture spatial correlations among distant regions \cite{ViT}. Unfortunately, transformers exhibit quadratic model complexity with respect to sequence length (i.e. number of patches), impeding the use of patches small enough to maintain high spatial precision. State-space models (SSMs) offer a more efficient alternative for capturing long-range dependencies. They replace computationally expensive self-attention with linear recurrence, permitting images to be fed into the model as a sequence of individual voxels. Mamba, a variant of traditional SSMs with a powerful selection mechanism, has demonstrated impressive performance in a variety of medical imaging tasks, including segmentation, registration and classification \cite{mambasurvey}. 

Here we introduce GAMBAS, \textbf{G}eneralised-Hilbert M\textbf{amba} \textbf{S}uper-resolution, trained on paired paediatric ULF and HF scans (64mT and 3T, respectively). We fuse elements of CNNs with those of Mamba to increase model sensitivity to local and global context, and add adversarial learning to further increase output realism. We additionally present a novel serialisation approach that mitigates the loss of spatial proximity in Mamba layers. Our results indicate that GAMBAS offers superior performance against state-of-the-art (SOTA) transformer- and SSM-based models.


\section{Related Work}

Several works investigate translation between anisotropic 64mT and isotropic \textgreater1T brain MRI. SynthSR \cite{SynthSR} employs a U-Net, trained using synthetic low-resolution scans generated from HF segmentations, to super-resolve images of any contrast and resolution, whereas LF-SynthSR \cite{SynthSRulf} is a dedicated ULF version of the same network. LoHiResGAN \cite{LoHiResGAN} is an adversarial model with a ResNet-based generator \cite{ResNet}, trained on 37 paired 64mT and 3T scans (T\textsubscript{1}w and T\textsubscript{2}w). SFNet \cite{SFNet} is another adversarial model, using a SwinUNETR-V2 \cite{SwinUnetrV2} generator trained on 30 empirical ULF/HF pairs combined with 60 synthetic pairs generated using a two-channel latent diffusion model. 

\section{Methods}
\subsection{Preliminaries}
\subsubsection{State-space models}

The state space model (SSM) is a continuous-time latent state model, which maps a 1D input to a 1D output via a linear ordinary differential equation (ODE). Mamba is a specific SSM variant that discretises the ODE using the zero-order hold (ZOH) technique to produce an efficient recurrent formulation, and adds a selective scan algorithm to filter relevant/irrelevant information (see Appendix \ref{S4andMamba} for a derivation of Mamba from continuous-time SSMs). Importantly, Mamba demonstrates an ability to model long-range dependencies on par with that of Transformers, while reducing computational complexity \cite{mamba}. 

As Mamba is designed to process 1D data, its application to vision tasks requires image features of shape $(B, C, H, W, D)$ to be flattened and transposed to $(B, L, C)$, where $L = H$×$W$×$D$. Unlike input sequences in natural language processing tasks, images have no inherent directionality. As such, the approach used to arrange voxels into a 1D sequence determines the way Mamba processes the image and, in turn, affects overall model performance \cite{MambaScanning}.  


\subsubsection{Space-filling curves}

Many approaches to flatten an image rely on a simple ‘Raster’ scan, which can result in discontinuities when the scan reaches the boundaries of the image. An alternative approach involves the use of space-filling curves, e.g. Z-order curve \cite{z-order} or Hilbert curve (\citealt{Hilbert1891UeberDS}; see Figure \ref{fig:HilbertFigure} and Appendix \ref{HilbertAppendix}). Both curves provide a linear mapping between 3D and 1D space that preserves spatial locality \cite{908985}, with the Hilbert curve demonstrating superior order-preserving behaviour \cite{hilbertzorder}. Because of this property, it has been applied to point-cloud processing in conjunction with transformers \cite{HilbertNet} and, more recently, with Mamba \cite{VoxelMamba}. Despite these successes, the fusion of the Hilbert curve with Mamba has not yet been demonstrated for medical image applications. Here, we apply a generalised variant of the Hilbert curve (“Gilbert”; \citealt{gilbert}), capable of handling evenly-sided 3D data of any size, to serialise structural MRI input into a 1D vector. 

\begin{figure}[htb]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\centering
\includegraphics[width=1.0\linewidth]{figures/Hilbert3D.jpeg}
\caption{\textbf{Left}: [A] 2D Raster trajectory and [B] 2D Hilbert trajectory, with corresponding 2D $\rightarrow$ 1D serialisation of highlighted region shown below.\\
\textbf{Right}: [C] 3D Raster trajectory and [D] 3D Generalised-Hilbert trajectory.}
\label{fig:HilbertFigure}
\end{figure}

\subsection{Architecture}
\subsubsection{GAMBAS}

We use a 3D counterpart of the 9-block ResNet architecture, originally introduced in \citealt{CycleGAN}, as the backbone of GAMBAS. The model follows an encoder-decoder pathway (composed of 3 layers each) with an extended central bottleneck (consisting of 9 layers). The order of operations across layers is kept identical to the original paper, however all 2D convolutions are switched to their 3D counterparts, and batch normalisation is substituted with instance normalisation. Inspired by earlier work incorporating SSMs for 2D multi-modal medical image synthesis \cite{I2I-Mamba}, we insert Mamba blocks into the 9-layer bottleneck, with the positioning determined via ablations studies (see Figure \ref{fig:ModelArchitecture} and Appendix \ref{Ablations}). Through these blocks, encoder outputs are serialised via a generalised Hilbert trajectory, then fed into bidirectional Mamba modules. Outputs from Mamba are then concatenated with earlier CNN outputs and fed into channel-mixing blocks \cite{ResViT} to consolidate contextual features from Mamba with structural features from CNNs. Channel-mixing blocks consist of two parallel convolutional branches of varying kernel sizes, mixing features at both identical and differing spatial locations \cite{MLPmixer}.

\subsubsection{Training objective}

We implement GAMBAS as an adversarial model, using PatchGAN as our discriminator \cite{Pix2Pix}. Once again, we keep operations identical to the original paper, switching only 2D convolutions to 3D convolutions and batch normalisation to instance normalisation. The discriminator has two input channels, taking in both an ULF scan and either a real or synthetic HF scan at each training step to determine whether the input pair is real. As such, our model follows the conditional GAN (cGAN) training objective, summarised by:

\begin{equation}
\begin{aligned}
\mathcal{L}_{_{c}GAN}(G,D) & = \mathbb{E}_{x,y}[\mbox{log}\:D(x, y)] + \mathbb{E}_{x}[\mbox{log}\:1-D(x, (G(x))]\\
\end{aligned}
\label{eq:loss-cGAN}
\end{equation}

where generator $G$ and discriminator $D$ aim to a minimise and maximise expectation $\mathbb{E}$, respectively. To ensure subject anatomy is maintained on conditional image generation, $G$  is tasked with an additional loss of minimising the L1 distance between model outputs and reference HF images:

\begin{equation}
\begin{aligned}
\mathcal{L}_{L1}(G) & = \mathbb{E}_{x,y}[\lVert y - G(x) \rVert_{1}]
\end{aligned}
\label{eq:loss-L1}
\end{equation}

Combining Eq.\eqref{eq:loss-cGAN} and Eq.\eqref{eq:loss-L1} yields the following minimax objective:

\begin{equation}
\begin{aligned}
G^{*} & = \:\mbox{arg} \:\underset{G}{\min} \:\underset{D}{\max} \:\lambda_{adv}\mathcal{L}_{_{c}GAN}(G,D) + \lambda_{L1}\mathcal{L}_{L1}(G)\\
\end{aligned}
\label{eq:minimax}
\end{equation}

Here, $\lambda_{adv}$ = 1 and $\lambda_{L1}$ = 100 are loss weighting terms selected based on favourable results from earlier research \cite{Pix2Pix}, \cite{ResViT}.

\begin{figure}[htb]
\centering
\includegraphics[width=1.0\linewidth]{figures/GAMBAS_architecture.jpg}
\caption{Generator Architecture. Bottleneck layers 1, 5 and 9 (light blue) include forward and reverse Mamba blocks, channel compression and residual blocks. Remaining bottleneck layers (light green) are comprised of a residual block only.}
\label{fig:ModelArchitecture}
\end{figure}

\subsection{Experiments}

\subsubsection{Data and Training}

MRI data used in this paper stem from two separate studies on neurodevelopment over the first 3 years of life in LMICs. The Khula Study \cite{Khula} investigates the development of executive function in children based in South Africa and Malawi. Children with no known neurological abnormalities were scanned at ages of 3, 6, 12, 18, or 24 months. Subjects attended two scanning sessions, with HF T\textsubscript{2}w scans (1×1×1mm) acquired using a Siemens 3T scanner (Erlangen, DE) and ULF T\textsubscript{2}w scans (1.5×1.5×5mm) acquired using a Hyperfine Swoop 64mT system (Guildford, CT). MINE (Maternal and environmental Impact assessment on Neurodevelopment in Early childhood years; \citealt{MiNE}) assesses children based in Karachi, Pakistan, beginning at ages of 1, 3, or 6 months and continuing via yearly follow-ups. Subjects also attended two scanning sessions, with HF T\textsubscript{2}w scans (0.5×0.5×0.5mm) acquired using a Toshiba 3T scanner (Otawara, JP) and ULF T\textsubscript{2}w scans (1.6×1.6×5mm) also acquired using a Hyperfine Swoop 64mT system (Guildford, CT).

Images were assessed for motion or banding artifacts; those with severe distortions were excluded, resulting in a total of 215 paired ULF and HF scans (116 Male, 99 Female; mean age 10.9 ± 6.7 months). Pre-processing involved rigid registration \cite{ANTs} of all HF scans to a custom age-specific HF template, and of each subject's ULF scan to the corresponding pre-registered HF scan. We additionally resampled all ULF and HF scans to 1×1×1mm resolution to ensure consistency across scanners and sites. The data were split into training/validation/test sets of 137/35/43 scans (i.e. $\sim$64/16/20\%), balancing for age and sex, and our model was trained for 1000 epochs on a Nvidia RTX 3090 24GB GPU. We used the Adam optimizer with $\beta_1$ = 0.5, $\beta_2$ = 0.999 and a learning rate of 0.0002. Patch-wise training was conducted, extracting random 128×128×128 patches from input and target images at each training step, and batch size was set to 1. During training, image augmentation was carried out on the fly, including random 3D rotation, BSpline deformation, random flip, and contrast and brightness adjustment.

\subsubsection{Competing Methods}

To compare performance of our model to similar generator architectures, we trained three additional models from scratch using our data: SwinUNETR-V2 \cite{SwinUnetrV2}, ResViT \cite{ResViT}, and U-Mamba \cite{U-Mamba}. These networks all combine local and global context; SwinUNETR-V2 merges the U-Net architecture with Swin-transformers, ResViT adds pre-trained vision transformers into bottleneck layers of a 9-block ResNet, and U-Mamba adds SSM blocks to each encoding layer of the U-Net. As ResViT was not designed to handle 3D data, we provide a 3D implementation here (details in Appendix \ref{CompetingMethods}). For a fair comparison, all competing methods were implemented as adversarial models with a PatchGAN discriminator, trained via the loss function described in Eq.\eqref{eq:minimax}. The quality of model outputs relative to ground-truth HF images is evaluated via normalised root mean squared error (NRMSE), peak signal-to-noise ratio (PSNR), structural similarity index measure (SSIM), and learned perceptual image patch similarity (LPIPS).

We additionally benchmark against other existing ULF SR models; LF-SynthSR and SFNet. LoHiResGAN does not have publicly available weights for a model pre-trained on T\textsubscript{2}w scans, and thus is excluded from benchmarking. As LF-SynthSR exclusively outputs T\textsubscript{1}w MRI scans regardless of the contrast of the input, we compare outputs of GAMBAS and SFNet with those of LF-SynthSR via Dice overlap of segmented brain regions, using segmentations obtained via SynthSeg \cite{SynthSeg}, a toolkit that is agnostic to contrast and resolution. Segmentation outputs for subjects at 3 months of age and younger (n=10) were highly inconsistent, therefore these were excluded from analyses. Unless otherwise specified, all significance testing was carried out via the Wilcoxon signed-rank test.

\begin{figure}[htb]
\centering
\includegraphics[width=1.0\linewidth]{figures/GAMBAS_voxelwise_v2.jpg}
\caption{Model outputs and error maps from a single test subject. Left to right: raw ULF scan, SwinUNETR-V2, ResViT, U-Mamba, GAMBAS, reference HF scan.}
\label{fig:ModelOutputs}
\end{figure}

\section{Results}

From our comparative analyses, we demonstrate that GAMBAS produces visually superior outputs compared to other models assessed (Figure \ref{fig:ModelOutputs}). We highlight this with improved performance in all voxelwise metrics compared to SwinUNETR-V2, ResViT, and U-Mamba (Table \ref{tab:Voxel-scores}), with GAMBAS improving ULF scores on average by 23.9\%. Taking a more overarching view of the results, we additionally observe a pattern whereby Mamba-infused models (U-Mamba and GAMBAS) outperform transformer-infused models (SwinUNETR-V2 and ResViT), potentially signifying that Mamba is better suited for capturing global context in this image translation task. When assessing our performance against existing ULF SR models, segmentation-based analyses reveal that GAMBAS outperforms the only publicly available SR toolkits for use on 64mT ULF scans (LF-SynthSR and SFNet). Across all four tissue types (GMC, GMS, WM, CSF), GAMBAS demonstrates closest correspondence in Dice overlap to HF scans ($\mu$=0.788); higher than both SFNet ($\mu$=0.691) and LF-SynthSR ($\mu$=0.693); see Table \ref{tab:Dice-scores} and Appendix \ref{Segmentations}.

\begin{table}[ht]
\caption{Image quality metrics for ULF scans and all SR methods trained using our dataset. Asterisk reflects a significant difference compared to all previous methods.}
\begin{tabular*}{\textwidth}{@{\extracolsep\fill}lcccc}
\toprule%
\textbf{Model} & \textbf{NRMSE ($\downarrow$)} & \textbf{PSNR ($\uparrow$)} & \textbf{SSIM ($\uparrow$)} & \textbf{LPIPS ($\downarrow$)}\\
\midrule
none [ULF] & 0.441$\pm$0.038 & 26.772$\pm$1.372 & 0.896$\pm$0.021 & 0.0402$\pm$.0075\\
\midrule
SwinUNETR-V2 & 0.315$\pm$0.044 & 29.750$\pm$1.827 & 0.911$\pm$0.017 & 0.0231$\pm$.0047\\
ResViT & 0.310$\pm$0.046 & 29.885$\pm$1.811 & 0.911$\pm$0.016 & 0.0235$\pm$.0046\\
U-Mamba & 0.294$\pm$0.043 & 30.336$\pm$1.868 & 0.914$\pm$0.016 & 0.0226$\pm$.0050\\
GAMBAS & \textbf{0.290$\pm$0.044} & \textbf{30.469$\pm$1.919} & \textbf{0.916$\pm$0.016*} & \textbf{0.0220$\pm$.0051*}\\
\bottomrule
\label{tab:Voxel-scores}
\end{tabular*}
\end{table}

\begin{table}[ht]
\caption{Dice coefficients for four tissue types: cortical grey matter (GMC), subcortical grey matter (GMS), white matter (WM) and cerebrospinal fluid (CSF), for input ULF scans, SynthSR and GAMBAS.}
\begin{tabular*}{\textwidth}{@{\extracolsep\fill}lcccc}
\toprule%
\textbf{Model} & \textbf{GMC} & \textbf{GMS} & \textbf{WM} & \textbf{CSF}\\
\midrule
none [ULF] & 0.682$\pm$0.055 & 0.680$\pm$0.216 & 0.698$\pm$0.097 & 0.423$\pm$0.069\\
\midrule
LF-SynthSR & 0.716$\pm$0.032 & 0.840$\pm$0.102 & 0.778$\pm$0.025 & 0.439$\pm$0.062\\
SFNet & 0.731$\pm$0.036 & 0.794$\pm$0.083 & 0.759$\pm$0.042 & 0.478$\pm$0.061\\
GAMBAS & \textbf{0.806$\pm$0.021*} & \textbf{0.899$\pm$0.014*} & \textbf{0.815$\pm$0.017*} & \textbf{0.631$\pm$0.051*}\\
\bottomrule
\label{tab:Dice-scores}
\end{tabular*}
\end{table}

\section{Discussion}

Our novel GAMBAS network presents an effective and reliable method for translation of ULF to HF MRI, underscoring the potential for deep learning tools in widening accessibility to MRI in resource-limited settings. We demonstrate improved performance compared to other SOTA methods, highlighting the benefits conferred by our approach of fusing novel Generalised-Hilbert Mamba blocks with CNNs for image generation. Although U-Mamba also presents a network combining CNN and Mamba modules, they rely on a simple 'Raster' trajectory for 1D serialisation and do not include channel mixing blocks between CNN and Mamba outputs. 

By training models with a paediatric dataset, we tackle a particularly challenging image translation task. Our data includes children scanned at various stages of brain development, from early stages of myelination with inverted GM:WM signal intensities relative to adult contrast \cite{Myelination}, through a period of strong overlap in tissue intensity \cite{Myelination2}, to later stages of myelination (1 year and above) when adult contrast is established. In spite of this, we demonstrate improved NRMSE, PSNR, SSIM, and LPIPS on a test set spanning subjects across all stages. 
One notable limitation of our study stems from our use of SynthSeg \cite{SynthSeg} for segmentation. It was chosen as it is the only widely available segmentation toolkit agnostic to contrast and resolution (allowing direct comparison between anisotropic T\textsubscript{2}w ULF scans, 1mm isotropic T\textsubscript{2}w GAMBAS/SFNet outputs, and 1mm isotropic T\textsubscript{1}w SynthSR outputs). However, as SynthSeg was trained on adult data, it performs poorly on scans deviating too much from the normal adult contrast, necessitating the exclusion of 3-month-old subjects from segmentation-based analyses. 

In summary, GAMBAS shows great promise for SR of ULF MRI. Future work will explore its application to other datasets, including adult and/or clinical populations.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We would like to acknowledge funding from Artificial Intelligence Methods for Low Field MRI Enhancement, Bill and Melinda Gates Foundation (INV-032788) and Wellcome Leap 1kD programme (The First 1000 Days) [222076/Z/20/Z].}

\bibliography{midl25_135}

\newpage
\appendix
\section{S4 and Mamba}
\label{S4andMamba}

The state space model (SSM) is a continuous-time latent state model, which maps a 1D input $x(t) \in \mathbb{R}^{L}$ to an output $y(t) \in \mathbb{R}^{L}$ through hidden state $h(t) \in \mathbb{R}^{N}$ via the following linear ODE:

\begin{equation}
\begin{aligned}
h'(t) & = \mathbf{A}h(t) + \mathbf{B}x(t),\\
y(t) & = \mathbf{C}h(t) + \mathbf{D}x(t)\\
\end{aligned}
\label{eq:ssm}
\end{equation}


Here, $\mathbf{A} \in \mathbb{R}^{N \times N}$, $\mathbf{B} \in \mathbb{R}^{N \times 1}$, and $\mathbf{C} \in \mathbb{R}^{1 \times N}$ are learnable parameters and $\mathbf{D} \in \mathbb{R}^{1}$ represents a residual connection. The continuous-time SSM has prohibitive computation and memory requirements for deep learning applications, however, it can be discretized using a timescale parameter $\mathbf{\Delta}$. For this, the zero-order hold (ZOH) technique is employed, whereby continuous parameters $\mathbf{A}$, $\mathbf{B}$ are discretized as $\mathbf{\overline{A}} = \mbox{exp}(\mathbf{\Delta}\mathbf{A})$, $\mathbf{\overline{B}} = (\mathbf{\Delta}\mathbf{A})^{-1}(\mbox{exp}(\mathbf{\Delta}\mathbf{A})-\mathbf{I}) \cdot\mathbf{\Delta}\mathbf{B}$. By substituting these into Eq.(\ref{eq:ssm}), we obtain the structured state-space sequence model, or S4 \cite{s4}, represented in the following recurrent form:

\begin{equation}
\begin{aligned}
h_t & = \mathbf{\overline{A}}h_{t-1} + \mathbf{\overline{B}}x_t,\\
y_t & = \mathbf{C}h_t + \mathbf{D}x_t\\
\end{aligned}
\label{eq:ssm-discrete}
\end{equation}

Given an input ${x} \in \mathbb{R}^{1 \times L}$, where ${L}$ is the sequence length, the model can also be computed by convolving a structured convolution kernel  $\mathbf{\overline{K}} = (\mathbf{C}\mathbf{\overline{B}}, \mathbf{C}\mathbf{A}\mathbf{\overline{AB}},...,\mathbf{C}\mathbf{\overline{A}}^{\mathbf{L}}\mathbf{\overline{B}})$ across ${x}$. Although efficient for training, S4 is unable to select data in an input-dependent manner. 

Mamba overcomes this hurdle and distinguishes itself from earlier SSMs by setting the model parameters to be input-dependent, endowing the model with a selection mechanism that can filter out irrelevant information and remember relevant information indefinitely \cite{mamba}. This no longer allows for the convolutional formulation to be applied (as input-dependent parameters prevent the use of a single structured kernel), however efficient training is ensured via a hardware-aware algorithm that computes model outputs using a parallel scan operation. 

\section{Hilbert curve and implementation}
\label{HilbertAppendix}

The Hilbert curve can be expressed concisely using a Lindenmayer system, or L-system. L-systems consist of three components: 1) an “alphabet” of symbols that form a string, 2) an initial “axiom” string, and 3) a set of “production rules” that describe how symbols are replaced to form a larger string. Crucially, the “alphabet” can be sub-divided into “variables” (symbols that can be replaced) and “constants” (symbols that cannot be replaced). Once a string evolves through a number of successive iterations from the axiom via the production rules, the output is used as a set of instructions for generating a geometric structure. The L-system describing the Hilbert curve can be summarised as:

\begin{equation}
\begin{aligned}
\mathbf{Variables} & = A, B\\
\mathbf{Constants} & = F, +, -\\ 
\mathbf{Axiom} & = A\\ 
\mathbf{Production} \: \mathbf{rules} & = A \rightarrow +BF-AFA-FB+, \\
& \quad\: B \rightarrow -AF+BFB+FA-\\
\end{aligned}
\label{eq:Hilbert-L}
\end{equation}

where ${F}$ is forward, $-$ and $+$ indicate a 90° turn clockwise and counterclockwise, respectively, and ${A}$ and ${B}$ are ignored during drawing. The first three iterations of this L-system are depicted in Figure \ref{fig:Hilbert3iterations}.

\begin{figure}[htbp]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\centering
\includegraphics[width=1.0\linewidth]{figures/HilbertAppendix_v2.jpeg}
\caption{[A] Iteration 1, [B] iteration 2, and [C] iteration 3 of the Hilbert curve, following starting axiom $A$. The output string is shown on the bottom, and the corresponding geometric structure on top.}
\label{fig:Hilbert3iterations}
\end{figure}

The original formulation of the Hilbert curve only produces structures composed of $4^n$ vertices to fill regions in 2D space (see Figure \ref{fig:Hilbert3iterations}). By using a generalised formulation of the Hilbert curve \cite{gilbert}, we instead are able to handle 3D image features of any even dimensions. This implementation uses a recursive mechanism to split a cuboid into multiple regions, with the function calling on itself until a trivial path can be computed. Calling this function in each training step would compromise efficiency, however we circumvent this by only using the function once upon model initialization, at which point a tensor is stored with indices that delineate the serialisation of 3D inputs with pre-specified dimensions. We operate with patches of size 128×128×128, which are then downsampled through our encoder blocks to be of size 32×32×32. As such, a Generalised Hilbert-curve for a cube with height, width and depth of 32 is computed and stored upon model initialisation, and is used in each training step to serialise 3D image features before being fed into forward and reverse Mamba blocks (Figure \ref{fig:ModelArchitecture}).


\section{Implementation of competing methods}
\label{CompetingMethods}
\subsection{SwinUNETR-V2}

SwinUNETR-V2 was implemented using the MONAI framework \cite{MONAI}, with all parameters (i.e. network feature size, number of layers in each stage and number of attention heads) set to default settings, in line with the original paper \cite{SwinUnetrV2}.

\subsection{U-Mamba}

As U-Mamba is implemented in the self-adapting nnU-Net \cite{nnU-Net} framework, which is tailored for segmentation data, we examined the network configurations for various datasets listed in the original paper \cite{U-Mamba} and selected the one most closely aligned with our own data (3D Abdomen MRI). Consequently, we used the “U-Mamba\_Enc” variant (employing Mamba blocks in all encoder layers), with 6 encoder/decoder stages and pooling per axis set at (3, 5, 5). 

\subsection{ResViT}
ResViT used an identical 9-block ResNet backbone as GAMBAS, however transformer modules were inserted into bottleneck layers 1 and 4, in accordance with the original paper. These modules used weights from transformer component of the ImageNet-pretrained model R50+ViT-B/16 (\url{https://github.com/google-research/vision_transformer}). As the pre-trained transformer expects 2D inputs of size 16×16, we downsampled image features in transformer blocks to match the desired height and width, and performed patch embedding across the depth dimension.

\section{Ablation studies}
\label{Ablations}
\subsection{Insertion of Mamba blocks}

\begin{table}[ht]
\caption{Ablations on the number of mamba blocks inserted into the bottleneck layers of GAMBAS. (1,5,9) denotes mamba blocks in the first, fifth, and ninth layers.}
\begin{tabular*}{\textwidth}{@{\extracolsep\fill}lcccc}
\toprule%
\textbf{Model} & \textbf{NRMSE ($\downarrow$)} & \textbf{PSNR ($\uparrow$)} & \textbf{SSIM ($\uparrow$)} & {$\boldsymbol{\mu}$} \textbf{Dice ($\uparrow$)}\\
\midrule
none & 0.314$\pm$0.053 & 30.185$\pm$1.403 & 0.7795$\pm$.0141 & 0.7795$\pm$.0339\\
(5) & 0.312$\pm$0.054 & 30.254$\pm$1.489 & 0.9149$\pm$.0143 & 0.7766$\pm$.0352\\
(1,5,9) & \textbf{0.311$\pm$0.055} & \textbf{30.285$\pm$1.482} & \textbf{0.9162$\pm$.0140} & \textbf{0.7802$\pm$.0363}\\
(1,3,5,7,9) & 0.315$\pm$0.057 & 30.116$\pm$1.518 & 0.9153$\pm$.0141 & 0.7760$\pm$.0336\\
(1,2,3,...9) & 0.312$\pm$0.052 & 30.185$\pm$1.432 & 0.9160$\pm$.0139 & 0.7782$\pm$.0349\\
\bottomrule
\label{tab:ablation-mambablocks}
\end{tabular*}
\end{table}

\newpage
\subsection{Scanning trajectory}

\begin{table}[ht]
\caption{Ablation on scanning trajectory used by bidirectional mamba blocks in first, middle, and final bottleneck layers.}
\begin{tabular*}{\textwidth}{@{\extracolsep\fill}lcccc}
\toprule%
\textbf{Serialisation} & \textbf{NRMSE ($\downarrow$)} & \textbf{PSNR ($\uparrow$)} & \textbf{SSIM ($\uparrow$)} & {$\boldsymbol{\mu}$} \textbf{Dice ($\uparrow$)}\\
\midrule
Raster & 0.315$\pm$0.061 & 30.195$\pm$1.598 & 0.9157$\pm$.0141 & 0.7782$\pm$.0350\\
Generalised-Hilbert & \textbf{0.311$\pm$0.055} & \textbf{30.285$\pm$1.482} & \textbf{0.9162$\pm$.0140} & \textbf{0.7802$\pm$.0363}\\
\bottomrule
\label{tab:scanning-trajectory}
\end{tabular*}
\end{table}

\subsection{Unidirectional vs bidirectional mamba}

\begin{table}[ht]
\caption{Ablation on the use of bidirectional mamba for mamba blocks in first, middle, and final bottleneck layers.}
\begin{tabular*}{\textwidth}{@{\extracolsep\fill}lcccc}
\toprule%
\textbf{Model} & \textbf{NRMSE ($\downarrow$)} & \textbf{PSNR ($\uparrow$)} & \textbf{SSIM ($\uparrow$)} & {$\boldsymbol{\mu}$} \textbf{Dice ($\uparrow$)}\\
\midrule
Unidirectional & \textbf{0.311$\pm$0.052} & 30.265$\pm$1.458 & 0.9160$\pm$.0140 & 0.7792$\pm$.0353\\
Bidirectional & \textbf{0.311$\pm$0.055} & \textbf{30.285$\pm$1.827} & \textbf{0.9162$\pm$.0140} & \textbf{0.7802$\pm$.0363}\\
\bottomrule
\label{tab:uni-bi}
\end{tabular*}
\end{table}


\newpage
\section{Segmentations}
\label{Segmentations}

\begin{figure}[htb]
\centering
\includegraphics[width=1.0\linewidth]{figures/GAMBAS_segmentations_v2.jpg}
\caption{Model outputs and segmentations from a single test subject. Left to right: raw ULF scan, SFNet, GAMBAS, reference T\textsubscript{2}w HF scan, SynthSR, reference T\textsubscript{1}w HF scan.}
\label{fig:Segmentations}
\end{figure}

\section{Bias across sites}
\label{bias}

\begin{table}[ht]
\caption{Model performance across the two imaging sites: South Africa (Khula study) and Pakistan (MINE study). Significance testing, carried out via the Mann-Whitney U-Test, showed no significant differences.}
\begin{tabular*}{\textwidth}{@{\extracolsep\fill}lcccc}
\toprule%
\textbf{Imaging site} & \textbf{NRMSE ($\downarrow$)} & \textbf{PSNR ($\uparrow$)} & \textbf{SSIM ($\uparrow$)} & {$\boldsymbol{\mu}$} \textbf{Dice($\uparrow$)}\\
\midrule
Khula & 0.298$\pm$0.051 & 30.146$\pm$2.462 & 0.913$\pm$0.016 & \textbf{0.789$\pm$0.024}\\
MINE & \textbf{0.282$\pm$0.031} & \textbf{30.884$\pm$1.272} & \textbf{0.917$\pm$0.015} & 0.786$\pm$0.018\\
p-value & 0.312 & 0.473 & 0.551 & 0.503\\
\bottomrule
\label{tab:site bias}
\end{tabular*}
\end{table}

\end{document}
