\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{booktabs}
\usepackage{commath}
\usepackage{pifont}
\usepackage{siunitx}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\usepackage{mwe} % to get dummy images
\usepackage{gensymb}
\usepackage{graphicx} % For \scalebox

% Define a shorter minus sign
\newcommand{\shortminus}{\scalebox{0.75}[1.0]{\( - \)}}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 091}
\editors{Accepted for publication at MIDL 2024}
\title[Fast physics-informed neural fields for CT perfusion analysis]{Accelerating physics-informed neural fields for fast CT perfusion analysis in acute ischemic stroke}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Bart J. Emmer} \Email{b.emmer@amsterdamumc.nl}\and
 %  \Name{Charles B.L.M. Majoie} \Email{c.b.majoie@amsterdamumc.nl}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Lucas {de Vries}\nametag{$^{1,2,3}$}} \Email{lucas.devries@amsterdamumc.nl}\\
\Name{Rudolf L. M. {van Herten}\nametag{$^{2,3}$}} \Email{r.l.m.vanherten@amsterdamumc.nl}\\
\Name{Jan W. Hoving\nametag{$^{1}$}} \Email{j.w.hoving@amsterdamumc.nl}\\
\Name{Ivana I\v{s}gum\nametag{$^{1,2,3}$}} \Email{i.isgum@amsterdamumc.nl}\\
\Name{Bart J. Emmer\nametag{$^{1}$}} \Email{b.j.emmer@amsterdamumc.nl}\\
\Name{Charles B. Majoie\nametag{$^{1}$}} \Email{c.b.majoie@amsterdamumc.nl}\\
\Name{Henk A. Marquering\nametag{$^{1,2}$}} \Email{h.a.marquering@amsterdamumc.nl}\\
\Name{Efstratios Gavves\nametag{$^{3}$}} \Email{e.gavves@uva.nl}\\
\\
\addr $^{1}$ Amsterdam UMC location University of Amsterdam, Radiology and Nuclear Medicine, Meibergdreef~9, Amsterdam, 1105 AZ, The Netherlands \\
\addr $^{2}$ Amsterdam UMC location University of Amsterdam, Biomedical Engineering and Physics, Meibergdreef~9, Amsterdam, 1105 AZ, The Netherlands \\
\addr $^{3}$ Informatics Institute, University of Amsterdam, Amsterdam, The Netherlands}

% \Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
% \addr $^{3}$ Address 3 \AND
% \Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
% \addr $^{4}$ Address 4

\begin{document}
\newcommand{\AIF}{$C_{\textsc{aif}}(t)$}
\newcommand{\VOF}{$C_{\textsc{vof}}(t)$}
\newcommand{\TAC}{$C_{\textsc{tac}}(t)$}
\newcommand{\TACx}{$C_{\textsc{tac}}(\mathbf{x},t)$}

\newcommand{\TACv}{$C_{\textsc{tac}, v}(t)$}
\newcommand{\IRF}{$IRF(t)$}
\newcommand{\IRFv}{$IRF_{v}(t)$}
\newcommand{\delay}{$t_d$}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\aifnet}{$f_{\textsc{aif}}(t;\theta)$}
\newcommand{\tacnet}{$f_{\textsc{tac}}(t;\phi)$}
\newcommand{\tacnetst}{$f_{\textsc{tac}}(\mathbf{x},t;\phi)$}
\newcommand{\odenet}{$f_{\textsc{ode}}(\mathbf{x};\xi)$}
\newcommand{\sv}{\textsc{Syngo.via}}
\newcommand{\rapid}{\textsc{rapid}}
\newcommand{\strokeviewer}{\textsc{StrokeViewer}}

\newcommand{\sppinn}{\textsc{sppinn}}
\newcommand{\resppinn}{\textsc{r}e\textsc{sppinn}}
\maketitle

\begin{abstract}
Spatio-temporal perfusion physics-informed neural networks were introduced as a new method (\sppinn{}) for CT perfusion (CTP) analysis in acute ischemic stroke.
\sppinn{} leverages physics-informed learning and neural fields to perform a robust analysis of noisy CTP data.
However, \sppinn{} faces limitations that hinder its application in practice, namely its implementation as a slice-based (2D) method, lengthy computation times, and the lack of infarct core segmentation.
To address these challenges, we introduce a new approach to accelerate physics-informed neural fields for fast, volume-based (3D), CTP analysis including infarct core segmentation: \resppinn{}.
To accommodate 3D data while simultaneously reducing computation times, we integrate efficient coordinate encodings.
Furthermore, to ensure even faster model convergence, we use a meta-learning strategy.
In addition, we segment the infarct core.
We employ acute MRI reference standard infarct core segmentations to evaluate \resppinn{} and we compare the performance with two commercial software packages.
We show that meta-learning allows for full-volume perfusion map generation in 1.2 minutes without compromising quality, compared to over 40 minutes required by \sppinn{}.
Moreover, \resppinn{}'s infarct core segmentation outperforms commercial software.
\end{abstract}

\begin{keywords}
neural fields, CT perfusion, physics-informed, acute ischemic stroke
\end{keywords}

\section{Introduction}
CT perfusion (CTP) imaging is often part of the imaging work-up of patients suffering acute ischemic stroke for treatment decision support.
CTP is the sequential acquisition of CT images after contrast agent administration. These images are subsequently processed for the generation of so-called \emph{perfusion maps} \citep{Konstas2009}.
These maps depict, for example, the cerebral blood flow and time-to-maximum contrast attenuation, which are crucial in estimating the infarct core (irreversibly damaged tissue) and penumbra (hypoperfused but salvageable tissue).
Typical commercial CTP software thresholds the perfusion maps to determine these two regions \citep{Demeestere2020ReviewTissue}.

In recent years, many deep learning-based studies focused on performing infarct core segmentation from commercial vendor CTP perfusion maps \citep{10.1007/978-3-030-37969-8_3, CLERIGUES2019103487, 10.1007/978-3-030-11723-8_36}, infarct core segmentation directly from CTP source data \citep{Bertels2019, Robben2020, DEVRIES2023102749}, or a combination of both~\citep{Wang2020}.
On the other hand, \citet{DEVRIES2023102971} used deep learning for the generation of perfusion maps and introduced \sppinn{}, a novel approach to CTP analysis using spatio-temporal physics-informed neural networks, which showed improved accuracy for estimating perfusion parameters particularly when data are noisy.
\sppinn{} infers the perfusion parameters by learning coordinate-based neural networks, or \emph{neural fields}, that represent the perfusion parameters and observed data, guided by a loss function formulated as the residual of a differential equation corresponding to the dynamics of CTP.

\sppinn{}'s application in clinical settings is considerably limited due to (i) the method operating on 2D axial slices rather than the full 3D volume, (ii) the computation time being too long for clinical use, and (iii) the lack of infarct core segmentation.
To address these limitations, we introduce \resppinn{} for fast, volume-based (3D), CTP analysis with physics-informed neural fields including infarct core segmentation.
To adapt from a slice-based to a volume-based method and at the same time reduce convergence time, we use coordinate encodings.
Additionally, we learn network initializations through meta-learning to further reduce computation time.
To expand the potential of \resppinn{} in a clinical setting, we incorporate infarct core segmentation.
We evaluate our method on two levels.
Initially, we assess the segmentation and detection performance of \resppinn{} against that of two commercial CTP software packages.
Subsequently, we examine if the perfusion maps generated by \resppinn{} maintain the same level of accuracy for infarct core segmentation as perfusion maps which were not subject to acceleration.
We achieve this by comparing its derived infarct core segmentation to acute MRI reference standard infarct core segmentations.

\section{Method}
In the following, we introduce \resppinn{}. \figureref{fig:example} presents an overview of our method.
\subsection{Physics-informed neural fields for CTP analysis}
In CTP source data, the arterial input function \AIF{} is the contrast attenuation curve in one of the main supplying arteries, and \TACv{} is the tissue contrast attenuation in voxel $v$.
\begin{figure}[htbp]
\floatconts
  {fig:example}
  {\caption{\resppinn{} learns neural fields \tacnetst{} and \aifnet{} of the observed tissue attenuation and arterial input data, and \odenet{} for the perfusion parameters. With meta-learning, we learn neural field initializations and at test time we tune them to new patient data. We use the CBF map for infarct core segmentation.}}
  {\includegraphics[width=\linewidth]{figures/reptile_sppinnv2.drawio.pdf}}
\end{figure}
The approach we follow to obtain the perfusion parameters from CTP source data is to infer the parameters of the following differential equation \citep{Bennink2016}:
\begin{equation}\label{eqn:deq}
    \frac{dC_{\textsc{tac}, v}(t)}{dt} = \textrm{CBF}_v\cdot [C_{\textsc{aif}}(t-t_{d,v})-C_{\textsc{aif}}(t-t_{d,v}-\textrm{MTT}_v)],
\end{equation}
for each spatial voxel $v$.
The cerebral blood flow ($\textrm{CBF}_v$), delay ($t_{d,v}$), and mean transit time ($\textrm{MTT}_v$) are the parameters to be inferred for voxel $v$.
The cerebral blood volume is $\textrm{CBV}_v = \textrm{CBF}_v\times \textrm{MTT}_v$ and time-to-maximum results from $\textrm{Tmax}_v = t_{d,v} + \frac{1}{2} \textrm{MTT}_v$.
To solve for the perfusion parameters of voxel $v$, we derive the residual form of \equationref{eqn:deq}:
\begin{equation}\label{eqn:deq_res}
    r(t)= \frac{dC_{\textsc{tac}, v}(t)}{dt} - \textrm{CBF}_v\cdot [C_{\textsc{aif}}(t-t_{d,v})-C_{\textsc{aif}}(t-t_{d,v}-\textrm{MTT}_v)].
\end{equation}
We can define an objective function, e.g.\ $\loss=r(t)^2$, and use an optimization method to minimize $\loss{}$ and infer the parameters of the differential equation.
This is not straightforward, however, since the temporal data for the tissue attenuation and arterial input function have a low temporal resolution (often 1--2 seconds), the attenuation curves are noisy, and $\frac{dC_{\textsc{tac}, v}(t)}{dt}$ is not well-defined.
We approach this problem by using neural fields and physics-informed learning \citep{Raissi2019Physics-informedEquations}.
A field $f: \mathbb{R}^d \rightarrow \mathbb{R}^n$ is a scalar ($n=1$) or vector ($n>1$) quantity defined over the spatial, temporal, or spatio-temporal domain ($d=1, 2, 3, 4$), e.g.\ the Hounsfield units (the quantity) in a CT scan (the domain).
In the case of CTP, the observed data are discretely sampled at coordinates $\mathbf{x} = (x, y, z)$ on the voxel grid and $t\in[t_0, \ldots, t_T]$ in the temporal domain $T$.
Using fields we can rewrite \equationref{eqn:deq_res}:
\begin{equation}\label{eqn:deq_res_fields}
    r(\mathbf{x}, t)= \frac{\partial C_{\textsc{tac}}(\mathbf{x}, t)}{\partial t} - \textrm{CBF}(\mathbf{x})\cdot [C_{\textsc{aif}}(t-t_{d}(\mathbf{x}))-C_{\textsc{aif}}(t-t_{d}(\mathbf{x})-\textrm{MTT}(\mathbf{x}))].
\end{equation}
A \emph{neural} field is a neural network $f_{\theta}$ that parameterizes a field $f$ \citep{10.1111:cgf.14505}.
We learn neural fields of the arterial input function data: $f_{\textsc{aif}}(t)$, and tissue attenuation data: $f_{\textsc{tac}}(\mathbf{x}, t)$, but also the perfusion parameters: $f_{\textsc{cbf}}(\mathbf{x})$, $f_{\textsc{mtt}}(\mathbf{x})$, $f_{t_{d}}(\mathbf{x})$.
Since neural fields are fully differentiable, we can compute the continuous derivative $\frac{\partial C_{\textsc{tac}}(\mathbf{x}, t)}{\partial t}$.
This allows us to rewrite \equationref{eqn:deq_res_fields} with neural fields:
\begin{equation}\label{eqn:nn_deq_res}
    r(\mathbf{x}, t) = \frac{\partial f_{\textsc{tac}}(\mathbf{x},t)}{\partial t} - f_{\textsc{cbf}}(\mathbf{x})\cdot [ f_{\textsc{aif}}(t-f_{t_{d}}(\mathbf{x}))-f_{\textsc{aif}}(t-f_{t_{d}}(\mathbf{x})-f_{\textsc{mtt}}(\mathbf{x}))].
\end{equation}
In the same fashion as \equationref{eqn:deq_res}, we can define an objective function or \emph{physics-informed} loss function, e.g.\ $\loss{}_{ODE}=r(\mathbf{x},t)^2$, and use stochastic gradient descent to minimize the loss and train the neural fields for the perfusion parameters.
After training, we sample the perfusion parameter neural fields at the spatial domain to obtain the perfusion maps.

\paragraph{Neural field definitions and training} \aifnet{}$:\mathbb{R} \rightarrow \mathbb{R}$ and \tacnetst{}$:\mathbb{R}^4 \rightarrow \mathbb{R}$ are parameterized with sets of parameters $\theta$ and $\phi$. In practice, we estimate the perfusion parameters with a single neural field \odenet{}$:\mathbb{R}^3~\rightarrow~\mathbb{R}^3$ that is only a function of the spatial coordinates.
Hence, the parameters of $f_{\textsc{cbf}}(\mathbf{x})$, $f_{\textsc{mtt}}(\mathbf{x})$, $f_{t_{d}}(\mathbf{x})$ are shared.
We use mini-batches $(\mathbf{x}, t)$ to optimize \aifnet{} and \tacnetst{}  with norm-based loss functions $\loss_{AIF}$ and $ \loss_{TAC}$, supervised with the observed data.
We sample sets of continuous collocation points $(\mathbf{x}, \tau)$ with $\tau \in T_c$ in the same range as $T$ to minimize $\loss{}_{ODE}$ and to ensure smooth derivatives between time points.
The total loss is:
\begin{align}\label{eqn:loss}
\loss = \loss_{AIF} +\loss_{TAC} + \loss_{ODE}.
\end{align}
For the specific implementation details and loss functions, we refer to Appendix~\ref{app:implementation}.
\subsection{Coordinate encoding}
\citet{10.5555/3495724.3496356} showed that simply using coordinates as input to the neural fields limits the capacity of neural fields to fit high-frequency details.
The difficulty of fitting such details increases with the dimensionality of the problem, which particularly hampers extending slice-based \sppinn{} to a full 3D(+T) approach.
\citet{10.5555/3495724.3496356}, therefore, proposed encoding the coordinates into a higher dimensional space to accelerate convergence.
We follow the multi-resolution hash-encodings, in short, \emph{hash-encodings} or $h(\vec{\cdot})$, proposed by \citet{mueller2022instant} to encode our spatial coordinates into an efficient higher dimensional space.
Hash-encodings define $L$ multi-resolution grids over the input domain with $d$ learnable weights at each grid point at each resolution.
For a coordinate $\mathbf{x}$, the encoding determines the closest grid points per resolution and linearly interpolates the weights at these grid points to obtain embedding $e\in\mathbb{R}^{Ld}$.
We share the encoding layer between \odenet{} and \tacnetst{}. For \aifnet{} we do not require encoding, since approximating the one-dimensional \AIF{} is already efficient without hashing.
The hash-encoding $h(\vec{\cdot})$ lacks global differentiability due to its discontinuities at hash grid boundaries and the discontinuous derivative of its linear interpolation.
Therefore, to keep the temporal derivatives well-defined, we only encode spatial coordinates and not the temporal coordinates.
This hash-encoding allows \resppinn{} to use small architectures (3 layers, 16 neurons) for 3D+T and 3D neural fields.

\subsection{Learning initializations}
Learning neural fields from random network initializations is inefficient.
\citet{9578751} showed that neural fields can be trained with considerably fewer steps when the initialization facilitates fast convergence.
We use the \emph{Reptile} meta-learning algorithm \citep{DBLP:journals/corr/abs-1803-02999} to learn an optimal initialization for \aifnet{}, \tacnetst{}, and \odenet{} using training data.
Let us consider the neural field \tacnetst{} with parameters $\phi$.
Reptile meta-learning consists of an outer and an inner loop, as described in Algorithm~\ref{alg:reptile}.
In the outer loop, we learn the initialization of the neural fields in $N_{out}$ iterations.
In the inner loop, we optimize the neural field \tacnetst{} for an instance in the training data for $N_{in} > 1$ iterations starting from the current initialization $\phi$ and obtain $\phi^*$.
We set the difference between the parameters $\phi$ and  $\phi^*$, scaled by $\epsilon$, as the gradient for the neural field.
After each inner loop, we run gradient descent to update the neural field parameters.
We select the network obtained after $N_{out}$ iterations to use for inference.
We empirically set $N_{out} =7500$ in our experiment. 
Without meta-learned initialization, \resppinn{} empirically shows convergence after 5000 iterations.
We, therefore, set $N_{in} = 500$ to achieve a factor 10 speed-up.

\begin{algorithm2e}
\caption{Reptile meta-learning for \resppinn{}'s \tacnetst{}.}
\label{alg:reptile}
\DontPrintSemicolon
\KwData{Initialize \(\phi\), the initial parameter vector for neural field $f$}
\For{iteration \(1, 2, 3, \ldots, N_{out}\)}{
    Randomly sample a patient \(P\)\;
    Perform \(N_{in} > 1\) steps on patient \(P\), starting with parameters \(\phi\), resulting in \(\phi^*\)\;
    Update: \(\phi \leftarrow \phi + \epsilon (\phi^* - \phi)\)\;
}
\Return{\(\phi\)}\;
\end{algorithm2e}
\subsection{Infarct core segmentation}\label{sec:unet}
Consistent with the methodologies of commercial CTP software, we use the CBF map for infarct core segmentation.
Specifically, we calculate a relative CBF map by scaling it to the median CBF value of the healthy hemisphere.
This relative map is the input to a U-Net \citep{2015arXiv150504597R} and we use infarct segmentations from co-registered reference standard acute MRI for supervision.
Appendix~\ref{app:implementation} presents more implementation details.

\subsection{Baseline}
We compare \resppinn{} to baseline \sppinn{}.
% We re-implemented \sppinn{} to fit our new framework for fair comparison.
We re-implemented \sppinn{} to align with our new approach for a fair comparison.
\sppinn{} has two-dimensional inputs, no coordinate encoding scheme, no meta-learned initializations, and larger network architectures for \tacnetst{} (3 layers, 128 neurons) and \odenet{} (3 layers, 64 neurons).
For a review of the vanilla \sppinn{} implementation and its quantitative performance, we refer to \citet{DEVRIES2023102971}.
\subsection{Datasets}
We use data from the CLEOPATRA health care evaluation study \citep{Koopman2022Cost-effectivenessStudy}.
We included 898 patients who received CTP at baseline and for which the CTP scan was processed with CTP software \strokeviewer{} (version 3.2.11; Nicolab, Amsterdam, The Netherlands).
For training the infarct core segmentation model, we allocated 15 patients who also underwent Diffusion-Weighted Imaging (DWI) MRI at baseline, only including imaging with an interval between CTP and DWI $<4.5$ hours to limit the effect of infarct growth \citep{Bala2021InfarctMeta-analysis.}.
The median (IQR) interval was 56 (41--70) minutes.
Using a semi-automated method \citep{Tolhuisen2022OutcomeStroke, Kamnitsas2017EfficientSegmentation}, we obtained the ground truth infarct core segmentations after co-registration to the CTP.
We manually corrected the results as necessary.
Those 15 patients also had results from commercial software \sv{} CT Neuro Perfusion (version VB40; Siemens Healthcare, Erlangen, Germany) available.
In our analysis, we use the CTP source data pre-processing (motion reduction, smoothing) and AIF from \strokeviewer{}.
We aligned all scans to a standard coordinate frame of size $256\times 256\times 32$ with spacing $0.91\ \textrm{mm} \times 0.91\ \textrm{mm} \times 5.00\ \textrm{mm}$ to ensure that the midline was properly centered.

\section{Experiments}
\paragraph{Effectiveness of hash-encodings}
Preliminary experiments showed that using full CTP volumes causes problems in fitting the high-frequency details with \sppinn{}.
We, therefore, investigate \resppinn{}'s convergence speed with hash-encodings versus slice-based \sppinn{}.
For comparison, we use \resppinn{} without learned initialization (\resppinn{}-no-init).
We train volume-based \resppinn{} for 5000 iterations and use 5000 and 10000 iterations per slice for \sppinn{}.
We compare $\loss_{TAC}$ for \resppinn{} to the average loss over all slices for \sppinn{}.
\paragraph{Accelerating convergence speed with Reptile meta-learning}
We compare $\loss_{TAC}$ and $\loss_{ODE}$ for \resppinn{} trained for 5000 iterations without meta-learned initialization (\resppinn{}-no-init@5000) and \resppinn{}@500, trained with only 10\% of the iterations with initialization.
We exclude $\loss_{AIF}$ from evaluation as \aifnet{} fits \AIF{} fast regardless of initialization.
% For comparison, we also show the loss curves of \resppinn{}-no-init@500.
Furthermore, we investigate the computation time gain achieved by Reptile meta-learning and compare the total computation time to baseline \sppinn{}.
\paragraph{Infarct core segmentation}
We use the CBF map as input to a segmentation model (Section~\ref{sec:unet}).
We train the same model for both \resppinn{}@500 and \resppinn{}-no-init@5000 CBF maps to investigate if the model with \resppinn{}@500 perfusion maps achieves similar performance for the downstream segmentation task, compared to using \resppinn{}-no-init@5000 maps.
We train and evaluate through leave-one-out cross-validation on all patients (14 training, 1 test). We use the first fold to define the hyperparameters and exclude this fold from all evaluations.
We measure the average Dice score, mean volumetric difference (MVD), and absolute volumetric difference (AVD), between the reference and automatic segmentations, and the false negative rate (FNR) for infarct detection.
We compare \resppinn{}'s infarct core segmentation results with those from two commercial vendors.
\section{Results}
\paragraph{Effectiveness of hash-encodings}
\figureref{fig:losses} (left) shows the tissue attenuation loss for one patient for \sppinn{} (in grey) and \resppinn{}-no-init (in blue).
The hash-encodings allow \resppinn{}-no-init to fit the full-volume tissue attenuation data in 5000 iterations.
Unlike the proposed method, \sppinn{} is unable to fit the high-frequency data in 5000 iterations per slice.
The obtained data representation is less detailed, which causes overly smooth perfusion maps.
Training \sppinn{} for more iterations will further reduce the loss but also increase the computation time, which is undesirable.
Optimization differences between \sppinn{}@5000 and @10000 stem from our iteration-based learning rate scheduler.
\figureref{fig:segmentation} shows that \sppinn{} maps are less detailed compared to \resppinn{}, even after 10000 iterations.
\begin{figure}[htbp]
\floatconts
  {fig:losses}
  {\caption{Loss curves and computation time for \sppinn{} and \resppinn{}. The losses are for a single patient and the computation times are averages over the validation set.}}
  {\includegraphics[width=\linewidth]{figures/losses_no500.pdf}}
\end{figure}
\paragraph{Accelerating convergence speed with meta-learning}
\figureref{fig:losses} shows $\loss_{TAC}$ (left) and $\loss_{ODE}$ (center) for one patient for \resppinn{}-no-init@5000 (in blue) and the proposed \resppinn{}@500 (in red).
We observe rapid and stable convergence for \resppinn{}@500, for both $\loss_{TAC}$ and $\loss_{ODE}$.
\figureref{fig:losses} shows the full-volume computation time (right) on an Nvidia V100 GPU.
Training \resppinn{}-no-init@5000 until convergence takes approximately 12 minutes.
Meta-learning allows for fast convergence in 1.2 minutes on average.
\sppinn{} processes each slice in 1.7 minutes for 5000 iterations and 3.5 minutes for 10000 iterations, culminating in full-volume computation times of 40--50 and 60--100 minutes, respectively.

\paragraph{Infarct core segmentation}
\figureref{fig:segmentation} shows \resppinn{} and commercial CBF maps for one patient, including the infarct core segmentations (in red) for these methods, and the DWI reference.
The \resppinn{} CBF map shows a low CBF region at the location of the infarct.
The visual differences between \resppinn{}-no-init@5000 and \resppinn{}@500 are marginal.
The CBF map generated by \sv{} shows irregular patterns and noticeable visual differences, primarily due to the exclusion of vessels.
On the other hand, \strokeviewer{} generates results that are more similar to our method, but the perfusion map appears somewhat smoother and displays slightly elevated CBF within the infarcted area compared to \resppinn{}.
By visual inspection, \resppinn{} segmentations closely align with the DWI reference segmentation.
Appendix~\ref{app:qualitative_results} presents more qualitative results.
\begin{figure}
\floatconts
  {fig:segmentation}
  {\caption{The CBF perfusion maps and DWI for one patient. The infarct core segmented by each method and the DWI reference segmentation are outlined in red.}}
  {\includegraphics[width=0.9\linewidth]{figures/example.pdf}}
\end{figure}
\tableref{tab:results} lists the average Dice score and volumetric agreement with acute DWI reference segmentations for \resppinn{} and the two commercial software packages.
Dice scores of \resppinn{} demonstrate a significant improvement compared to \sv{} and \strokeviewer{}.
Unlike \strokeviewer{} and \sv{}, which missed several smaller infarcts, our method successfully detected each one.
Furthermore, there is only a marginal decline in the Dice score ($-0.02$) when using the \resppinn{}@500 CBF map compared to \resppinn{}-no-init@5000.
\resppinn{}@500 outperforms \sv{} and \strokeviewer{}, but also \resppinn{}-no-init@5000, in terms of volumetric agreement, with a mean difference closer to zero and a reduced absolute difference.
Appendix~\ref{app:bafigs} presents an analysis of Bland-Altman figures supporting these results.

\begin{table}[htbp]
\floatconts{tab:results}{\caption{Infarct core segmentation results. Dice and mean or absolute volumetric difference (MVD, AVD), and false negative rate (FNR). We report mean (standard deviation) for 5 seeds. Symbols indicate if larger $(\uparrow)$, smaller $(\downarrow)$, or close to zero $(0)$ values denote better performance.}}{
\begin{tabular}{lS[table-number-alignment=left, table-format=$1.2$, table-alignment-mode=format]S[table-number-alignment=left, table-format=$2.1$, table-alignment-mode=format]S[table-number-alignment=left, table-format=$2.1$, table-alignment-mode=format]S[table-number-alignment=left, table-format=$1.2$, table-alignment-mode=format]}
\toprule
{\bfseries Method} & {\bfseries Dice $(\uparrow)$\hspace{0.3cm}}& {\bfseries MVD, ml $(0)$} & {\bfseries AVD, ml $(\downarrow)$}& {\bfseries FNR $(\downarrow)$} \\
\midrule
\sv{} & $0.27$ & $-24.1$ & $28.6$ & $0.14$ \\
\strokeviewer{} & $0.26$ & $-23.3$ & $26.3$ & $0.43$ \\
\midrule
\resppinn{}-no-init@5000 & $0.51{(0.01)}$ & $9.2{(2.6)}$ & $14.4{(1.7)}$ & $0.00{(0.00)}$ \\
\resppinn{}@500 & $0.49{(0.02)}$ & $-0.2{(2.2)}$ & $14.0{(1.1)}$& $0.00{(0.00)}$ \\
\bottomrule
\end{tabular}
}
\end{table}

\section{Discussion and Conclusion}
We presented \resppinn{}, a method for fast volume-based CT perfusion analysis using physics-informed neural fields.
Our experiments show that hash-encodings help the neural fields to rapidly fit high-frequency details in tissue attenuation data and produce accurate perfusion maps.
\sppinn{} operates on 2D axial slices rather than the entire volume and takes more than 40 minutes for full-volume perfusion map generation.
\resppinn{}-no-init, on the other hand, achieves accurate perfusion maps in 12 minutes.
Using the proposed meta-learned initialization, the networks converge faster and with greater stability, allowing for full-volume perfusion map generation within 1.2 minutes, which makes \resppinn{} suitable to be used in clinical practice.
The strong bias introduced by the meta-learned initialization could be a disadvantage since \resppinn{} will be more likely to converge to a local rather than a global minimum.
The segmentation performance and the strong visual agreement between the CBF from \resppinn{}@500 and \resppinn{}-no-init@5000 suggest, however, that acceleration does not harm the perfusion map quality and infarct detection accuracy, and only marginally affects the segmentation results in practice.
Lastly, we show that the proposed method achieves improved infarct core segmentation performance compared to commercial software.
Using a standard U-Net, our performance is on par with the top methods in the Ischemic Stroke Lesion Segmentation 2018 challenge, where the leading model achieved a Dice of 0.51 \citep{Hakim2021PredictingLearning} (see also Appendix~\ref{app:isles}).
We use U-Net since we aimed to show that \resppinn{} perfusion maps are effective for infarct core segmentation.
Future work could investigate whether other approaches could further enhance the infarct core segmentation performance.

The main limitation of this study is the limited acute DWI data that was available for training and evaluation.
Acute DWI is not often acquired and therefore such reference segmentations are hard to come by.
Another limitation is that the initialization works most efficiently if the data are pre-registered to a standard coordinate system.

In conclusion, we showed that meta-learning allows \resppinn{} to achieve rapid full-volume perfusion map generation in 1.2 minutes without compromising map quality.
This computation time is brief enough to potentially enable future clinical use.
Moreover, \resppinn{} achieves accurate infarct core segmentation outperforming commercial software.
\clearpage

\midlacknowledgments{%
This work was part of the Artificial Intelligence for Early Imaging-Based Patient Selection in Acute Ischemic Stroke (AIRBORNE) project.
This project was supported by Top Sector Life Sciences \& Health and Nicolab B.V.
The CLEOPATRA healthcare evaluation study was funded by Leading the Change (LtC).
LtC is financed by Zorgverzekeraars Nederland (ZN) and supports various healthcare evaluations in the Netherlands as part of the Healthcare Evaluation Netherlands project and the CONTRAST consortium.
The CONTRAST consortium acknowledges the support from the Netherlands Cardiovascular Research Initiative, an initiative of the Dutch Heart Foundation (CVON2015-01: CONTRAST), and from the Brain Foundation Netherlands (HA2015.01.06).
The collaboration project is additionally financed by the Ministry of Economic Affairs by means of the PPP Allowance made available by the Top Sector Life Sciences \& Health to stimulate public-private partnerships (LSHM17016).
The CONTRAST consortium was funded in part through unrestricted funding by Stryker, Medtronic and Cerenovus.
The funding sources were not involved in study design, monitoring, data collection, statistical analyses, interpretation of results, or manuscript writing.}
\bibliography{midl24_091}

\appendix
\section{Implementation details}\label{app:implementation}
\paragraph{SPPINN and ReSPPINN}
\tableref{tab:settings_sppinn} lists implementation details for \sppinn{} and our proposed \resppinn{}.
% In our implementation, we normalize the spatial coordinates between 0 and 1 to fit the requirements of the hash-encoding strategy.

\paragraph{Loss functions}
\tableref{tab:settings_sppinn_loss} lists a description of the loss functions $\loss{}_{TAC}$, $\loss{}_{AIF}$, and $\loss{}_{ODE}$.
% We employ $l1$-loss for $\loss{}_{TAC}$, $\loss{}_{AIF}$, and $\loss{}_{ODE}$.
% We calculate the loss $\loss{}_{TAC}$ as the average over the spatio-temporal coordinates $(\mathbf{x}, t)$ in the domain, that is, all $t\in T$ and $v\in\Omega$.
% For $\loss{}_{AIF}$, we only consider the temporal domain $T$.
% We randomly generate a set of collocation points at each iteration to minimize $\loss{}_{ODE}$.
% Specifically, we generate collocation points $\tau$ continuously sampled in the same range as $T$ and denote this continuous domain as $T_c$.
% We then calculate the loss over $(\mathbf{x}, \tau)$ in the spatio-temporal collocation domain, that is, all $\tau \in T_c$ and $c\in\Omega$.
$\loss{}_{TAC}$, $\loss{}_{AIF}$, and $\loss{}_{ODE}$ are equally weighted in the total loss function.
In preliminary experiments, we empirically set the batch size to 25000. Adjusting the batch size primarily influences compute time, with minimal visual impact on the quality of the perfusion maps.

\paragraph{Hash-encoding}
\figureref{fig:hash-encoding} presents an illustrative example of the hash-encoding operation.
In the example, we demonstrate hash-encoding with two resolutions ($L=2$) in a two-dimensional setting, while our method actually utilizes $L=16$ resolutions within the three-dimensional domain.
The multi-resolution hash-encoding operation divides the domain into multiple grids, with each grid point indexed by an integer.
For each grid point, we assign $d=2$ learnable weights in a hash table per resolution, which can be retrieved by looking up the integer index.
For a spatial coordinate $\mathbf{x}$, the hash-encoding operation identifies the weights for the nearest grid points in the hash table for each resolution.
Per resolution, the weights corresponding to the grid points are then linearly interpolated according to the relative positions of the coordinate with respect to the grid points.
The interpolated weights for each resolution are subsequently stacked to obtain an embedding $e\in\mathbb{R}^{Ld}$.
This embedding is then used as the input to the networks.
We used a PyTorch implementation of multi-resolution hash-encodings \citep{Hsiao2023HashGridEncoding}.
\begin{figure}[htbp]
\floatconts
  {fig:hash-encoding}
  {\caption{Illustrative example of the hash-encoding operation. Figure inspired by \citet{mueller2022instant}.}}{\includegraphics[width=\linewidth]{figures_appendix/hashgrid.drawio.pdf}}
\end{figure}
\paragraph{Reptile meta-learning}
\tableref{tab:settings_reptile} lists the implementation details for Reptile meta-learning.
\figureref{fig:meta-init} presents the meta-learned initializations for \tacnetst{} at two time points and for \odenet{} for the CBF perfusion parameter.
We note that the initializations already resemble brain-like attenuation or blood flow values. For instance, there is a noticeably increased attenuation at the later time point for \tacnetst{}'s initialization. Similarly, the initial CBF pattern shows characteristic features, such as increased flow near supplying arteries.
\begin{figure}[htbp]
\floatconts
  {fig:meta-init}
  {\caption{Meta-learned initializations for \tacnetst{} at two time points and for \odenet{} (CBF is shown).}}{\includegraphics[width=0.7\linewidth]{figures_appendix/meta.pdf}}
\end{figure}
\figureref{fig:meta-train} presents the output of \odenet{} for the CBF at various iterations for \resppinn{}@500 starting from the meta-learned initialization. Moreover, it shows the CBF for \resppinn{}-no-init@5000 for visual comparison. It illustrates \odenet{}'s iterative progress in achieving a closer fit to the data and accurately determining the perfusion parameters.
\begin{figure}[htbp]
\floatconts
  {fig:meta-train}
  {\caption{Starting from the meta-learned initialization, \resppinn{}@500 iteratively learns the perfusion map. \resppinn{}-no-init@5000 is shown for comparison.}}{\includegraphics[width=0.7\linewidth]{figures_appendix/learning.pdf}}
\end{figure}
\paragraph{Segmentation model}
\tableref{tab:settings_unet} lists the implementation details for training the U-Net segmentation model.
We use the U-Net architecture within the \textsc{monai} framework \citep{2022arXiv221102701C}.
Furthermore, we employ augmentations and loss functions from \textsc{monai}.
Because we select the model based on only one validation scan, we keep a running mean over the last ten epochs and use the mean Dice score to select our best model.
%We repeat all experiments with three different seeds.


\section{Additional qualitative results}\label{app:qualitative_results}
\figureref{fig:qualitative_results} presents qualitative results for five patients.
It includes CBF maps for \sppinn{}@5000 and \resppinn{}@500, along with DWI images, core segmentations from \resppinn{}, and reference segmentations.
Similar to \figureref{fig:segmentation}, there are significant differences between the \sppinn{} and \resppinn{} CBF maps.
The infarct is visible on \sppinn{}'s smoother perfusion maps, but distinguishing brain structures is more challenging.
The infarct is also visible on \resppinn{}'s CBF map and the qualitative structural similarity with the DWI is larger than for \sppinn{}.
The predicted infarct core segmentation and reference show generally good overlap.
The results for two patients with small infarcts are displayed in the second and last rows of \figureref{fig:qualitative_results}.
Our approach did not miss a single infarct, unlike \strokeviewer{} and \sv{}, which failed to identify several infarcts.
This is also reflected by the false negative rate in \tableref{tab:results}.
Specifically, \sv{} missed 2 out of 14 infarcts, and \strokeviewer{} missed 6 out of 14, often failing to identify particularly small infarcts.
This underscores our model's strength in detecting smaller infarcts, which are often overlooked by alternative approaches.

\begin{figure}[htbp]
\floatconts{fig:qualitative_results}
  {\caption{Comparison of CBF maps, DWI, and core segmentations for five patients, showcasing the differences between \sppinn{}@5000 and \resppinn{}@500.}}{\includegraphics[width=\linewidth]{figures_appendix/mosaic.pdf}}
\end{figure}

\section{Bland--Altman}\label{app:bafigs}
\begin{figure}[htbp]
\floatconts
  {fig:blandaltman}
  {\caption{Bland-Altman figures for \resppinn{} with and without initialization and the two commercial software packages.}}
  {\includegraphics[width=0.75\linewidth]{figures_appendix/bas_cbf_rebuttal.drawio.pdf}}
\end{figure}
\figureref{fig:blandaltman} shows Bland-Altman figures for the two commercially available CTP software packages and \resppinn{} with and without Reptile acceleration.
For both commercial methods, the average difference between the predicted and the reference standard infarct volume is larger than for \resppinn{}.
For the commercial methods, we observe a negative bias, suggesting that these methods underestimate the infarct core volume.
The negative bias seems primarily due to measurements over 50 ml since we observe a negative trend for larger infarct volumes.
For \resppinn{}@500, we observe little to no bias, also for larger infarct volumes.
\resppinn{}-no-init@5000, however, shows a larger bias, probably caused by an outlier with a considerable overestimation of the infarct core volume.
For all methods, we observe increased variability in the difference as the mean infarct volume grows.
The limits of agreement are narrowest for \resppinn{}@500.
In conclusion, \resppinn{}@500 has the best volumetric correspondence to reference standard DWI infarct core volume estimations.

\begin{table}[htbp]
\floatconts{tab:settings_sppinn}{\caption{Implementation configuration for \sppinn{} and \resppinn{}.}}{
\footnotesize{
\begin{tabular}{@{}lll@{}}
\toprule
                                                   & SPPINN                      & ReSPPINN                    \\ \midrule
config                                              & value                       & value                       \\ \midrule
spatial input dimensions                            & 2D                          & 3D                          \\
number of layers                   &                             &                             \\
\hspace*{1em}\tacnetst{}           & 3                           & 3                           \\
\hspace*{1em}\aifnet{}             & 3                           & 3                           \\
\hspace*{1em}\odenet{}              & 3                           & 3                           \\
neurons per layer                  &                             &                            \\
\hspace*{1em}\tacnetst{}             & 128                         & 16                          \\
\hspace*{1em}\aifnet{}          & 16                          & 16                          \\
\hspace*{1em}\odenet{}             & 64                          & 16                          \\
activation function                &                             &  \\
\hspace*{1em}\tacnetst{}             & Siren \( w = 15, w_0 = 15\) & Siren \( w = 15, w_0 = 15\)  \\
\hspace*{1em}\aifnet{}             & Siren \( w = 1, w_0 = 1\)   & Siren \( w = 1, w_0 = 1\)   \\
\hspace*{1em}\odenet{}              & Siren \( w = 15, w_0 = 15\)  & Siren \( w = 15, w_0 = 15\)  \\
\hspace*{1em}\odenet{}, last layer    & Exp                         & Exp                         \\

optimizer                                           & Adam                        & Adam                        \\
base learning rate                                  & 1e-3   & 1e-3  \\
learning rate schedule                              & OneCycleLR                  & OneCycleLR                  \\
warm-up iterations                                  & 0                           & 0                           \\
batch size $(B)$                                           & 25000                       & 25000                       \\
hash-encoding                                       & \xmark                     &      \cmark                \\
\hspace*{1em}\textrm{levels}                                     & -                           & 16                          \\
\hspace*{1em}\textrm{features per level}                         & -                           & 2                           \\
\hspace*{1em}\textrm{\( \log_2 \) hashmap size}                  & -                           & 15                          \\
\hspace*{1em}\textrm{base resolution}                            & -                           & 16                          \\
\hspace*{1em}\textrm{finest resolution}                          & -                           & 4096                        \\
GPU memory requirement (MB)  &1134       & 1670                     \\ \bottomrule
\end{tabular}
}
}
\end{table}

\begin{table}[htbp]
\floatconts{tab:settings_sppinn_loss}{\caption{Description and implementation of the loss function.}}{
\footnotesize{
\begin{tabular}{@{}llp{8.0cm}@{}}
\toprule
loss                                              & function                       & description  \\ \midrule
\(\loss{}_{TAC}\)&  $\frac{1}{|B|}\sum_{(\mathbf{x}, t)}\norm{f_{TAC}(\mathbf{x},t)-C_{TAC}(\mathbf{x},t)}$ & The goal for \tacnetst{} is to fit the observed tissue attenuation $C_{TAC}(\mathbf{x},t)$ at all voxel locations. \(\loss{}_{TAC}\), therefore, is the \(l_1\)-norm of the difference between the predicted and observed tissue attenuation. The observed data are discretely sampled at spatio-temporal coordinates $(\mathbf{x}, t)$. At each iteration, we sample a subset $B$ of coordinates from all spatio-temporal coordinates to compute \(\loss{}_{TAC}\). \\ \midrule

\(\loss{}_{AIF}\)& $\frac{1}{|B|}\sum_{t}\norm{f_{\text{AIF}}(t)-C_{\text{AIF}}(t)}$&The goal for \aifnet{} is to fit the observed arterial input function \AIF{}. \(\loss{}_{AIF}\), therefore, is the \(l_1\)-norm of the difference between the predicted and observed arterial input function. The observed data are discretely sampled at temporal coordinates $t$. At each iteration, we sample a subset $B$ of spatio-temporal coordinates from all spatio-temporal coordinates and use only the sampled temporal coordinates to compute \(\loss{}_{AIF}\).
\\\midrule
\(\loss{}_{ODE}\) & $\frac{1}{|B|}\sum_{(\mathbf{x}, \tau)}\norm{r(\mathbf{x}_v,\tau)}$&The goal for \odenet{} is to obtain the best estimate of the perfusion parameters at all voxel locations. \(\loss{}_{ODE}\), therefore, is the \(l_1\)-norm of the residual in \equationref{eqn:nn_deq_res}. The observed tissue intensity data are discretely sampled at spatio-temporal coordinates $(\mathbf{x}, t)$. For \(\loss{}_{ODE}\), however, we \emph{continuously} sample temporal coordinates $(\mathbf{x}, \tau)$ within the temporal domain. At each iteration, we use the subset $B$ of coordinates $(\mathbf{x}, \tau)$ to compute \(\loss{}_{ODE}\). \\ \bottomrule
\end{tabular}
}
}


\end{table}
\begin{table}[htbp]
\floatconts{tab:settings_reptile}{\caption{Implementation configuration for Reptile meta-learning.}}{
\footnotesize{
\begin{tabular}{@{}ll@{}}
\toprule
config                                              & value                                         \\ \midrule
outer loop & \\
\hspace*{1em}\textrm{optimizer}  & Adam\\
\hspace*{1em}\textrm{base learning rate}  & 1e-2\\
\hspace*{1em}\textrm{learning rate schedule}  & OneCycleLR\\
\hspace*{1em}\textrm{warm-up iterations}  & 30\%  \\
\hspace*{1em}$N_{out}$\textrm{ iterations}  & 7500\\
\hspace*{1em}$\epsilon$  & 0.1\\\midrule
inner loop & \\
\hspace*{1em}\textrm{optimizer}  & Adam\\
\hspace*{1em}\textrm{base learning rate}  & 1e-3\\
\hspace*{1em}\textrm{learning rate schedule}  & OneCycleLR\\
\hspace*{1em}\textrm{warm-up iterations}  & 0\%  \\
\hspace*{1em}$N_{in}$\textrm{ iterations}  & 500\\ \bottomrule
\end{tabular}}
}
\end{table}

\begin{table}[htbp]
\floatconts{tab:settings_unet}{\caption{Implementation configuration for the proposed U-Net.}}{
\footnotesize{
\begin{tabular}{@{}ll@{}}
\toprule
config                                              & value                                         \\ \midrule
features per layer	& 16, 16, 32, 32, 64 \\
augmentations	& RandFlip, RandRotate, RandZoom \\
optimizer	&Adam \\
base learning rate	&5e-5 \\
learning rate schedule	&OneCycleLR \\
warm-up iterations	&0 \\
loss function	&DiceCELoss(lambda\_dice=1, lambda\_ce=1) \\
batch size	&1 \\
epochs	&500 \\
patch size	&256x256x20 \\
patch stitching	&SlidingWindowInferer(mode=Gaussian, overlap=0.5) \\
post-processing	& Infarct core restricted to affected hemisphere \\\bottomrule
\end{tabular}}
}
\end{table}

\section{Segmentation results with slice-based SPPINN perfusion maps}
\begin{table}[htbp]
\floatconts{tab:sppinn2d}{\caption{Infarct core segmentation results for the segmentation model using \sppinn{} and \resppinn{} perfusion maps. We report the Dice score, mean and absolute volumetric difference (MVD, AVD), and false negative rate (FNR) as mean (standard deviation) over 3 seeds. Symbols indicate if larger $(\uparrow)$, smaller $(\downarrow)$, or close to zero $(0)$ values denote better performance.}}{
  \begin{tabular}{lS[table-number-alignment=left, table-format=1.2, table-alignment-mode=format]S[table-number-alignment=left, table-format=2.1, table-alignment-mode=format]S[table-number-alignment=left, table-format=2.1, table-alignment-mode=format]S[table-number-alignment=left, table-format=1.2, table-alignment-mode=format]}
    \toprule
    {\bfseries Method} & {\bfseries Dice $(\uparrow)$\hspace{0.3cm}}& {\bfseries MVD, ml $(0)$} & {\bfseries AVD, ml $(\downarrow)$}& {\bfseries FNR $(\downarrow)$} \\
    \midrule
    \sppinn{}@5000 (slice-based) & 0.50{(0.01)} & 10.2{(1.4)} & 17.3{(0.7)} & 0.00{(0.00)} \\
    \resppinn{}@500 (volume-based) & 0.49{(0.02)} & -0.2{(2.2)} & 14.0{(1.1)}& 0.00{(0.00)} \\
    \bottomrule
  \end{tabular}
}
\end{table}

\tableref{tab:sppinn2d} lists the infarct core segmentation results using the perfusion maps generated by the original slice-based \sppinn{}. This experiment resulted in a Dice score of 0.50, an MVD of 10.2 ml, an AVD of 17.3 ml, and an FNR of 0.00. Though the performance is similar to the proposed \resppinn{} approach in terms of Dice score and FNR, the volumetric agreement with the reference DWI segmentation is significantly worse, suggesting an overestimation of the infarct core. This may be a result of a higher level of smoothness in the slice-based perfusion maps, leading to poorly defined segmentation boundaries. Alternatively, the absence of 3D context, and therefore, structural differences between slices, may affect results.

It should further be noted that the large difference in computational time (43 minutes for \sppinn{}, 1.2 minutes for \resppinn{}) makes \sppinn{} unusable in clinical practice. The slice-wise approach further ignores 3D context, which is supported by our proposed method. For a slice-based approach, fitting perfusion values in the brain's top and bottom parts is challenging and prone to suboptimal optimization, due to the smaller brain area in those regions and the fact that those regions are more prone to image artifacts. A full-volume approach mitigates this by calculating loss across the entire volume, enhancing stability and guiding optimization. The volume-based approach therefore offers significant advantages.
\section{Ischemic Stroke Lesion Segmentation Challenge 2018 results}\label{app:isles}
\tableref{tab:isles} lists the top-five results of the Ischemic Stroke Lesion Segmentation Challenge (ISLES) 2018 \citep{Hakim2021PredictingLearning}, as detailed on the official leaderboard at \url{https://www.smir.ch/ISLES/Start2018}. The goal of the challenge was to segment the acute phase DWI MRI reference infarct core from CT and CTP imaging. The available data (63 patients for training, 40 patients for testing) comprised baseline non-contrast CT, CTP source data, and four perfusion maps (CBF, CBV, MTT, Tmax) generated by \textsc{rapid} (iSchemaView, Menlo Park, CA). The reference standard was segmented on acute phase DWI MRI with a median time between CT and MRI of 36 minutes. Most participants of the challenge employed the non-contrast CT and four perfusion maps (CBF, CBV, MTT, Tmax) as inputs to the segmentation models. We refer to \citet{Hakim2021PredictingLearning} for a full overview of the challenge and main results. The Dice score of \resppinn{} is competitive, aligning with the top-five methods featured on the ISLES 2018 leaderboard. Notably, for \resppinn{}, the absolute volumetric difference is approximately 3--5~ml higher.

\begin{table}[htbp]
\floatconts{tab:isles}{\caption{Top-five results of the Ischemic Stroke Lesion Segmentation Challenge (ISLES) 2018 \citep{Hakim2021PredictingLearning}.}}{
\begin{tabular}{@{}llS[table-number-alignment=left, table-format=1.2, table-alignment-mode=format]S[table-number-alignment=left, table-format=2.2, table-alignment-mode=format]}
\toprule
{\bfseries Team} & {\bfseries Reference} & {\bfseries Dice $(\uparrow)$} & {\bfseries AVD, ml $(\downarrow)$} \\ \midrule
songt1  & \citet{10.1007/978-3-030-11723-8_31} & 0.51 &  10.24 \\
clera2 & \citet{CLERIGUES2019103487} & 0.49 & 12.18 \\
ghosp1 & N/A & 0.49 & 9.30 \\
zhans10 & N/A & 0.49 & 9.81 \\
pengl1 & \citet{10.1007/978-3-030-11723-8_25} & 0.49 & 10.08 \\ \bottomrule
\end{tabular}}

\end{table}
\end{document}
