\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{float}
\usepackage{caption}

% Header for extended abstracts
\jmlryear{2026}

% to be uncommented for submissions under review
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 117}
\editors{Accepted for publication at MIDL 2026}

\title[SegMaST: A Mamba Spatio-Temporal Model for Longitudinal Segmentation]{SegMaST: Mamba-based Spatio-Temporal Modeling to Improve Longitudinal Disease Detection and Segmentation}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally} 

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{%
\Name{Aswathi Varma\nametag{$^{1,2,3}$}} \Email{aswathi@tum.de}\\
\Name{Jonas Weidner\nametag{$^{1,3}$}} \Email{j.weidner@tum.de}\\
\Name{Laurin Lux\nametag{$^{2,3}$}} \Email{laurin.lux@tum.de}\\
\Name{Cosmin Bercea\nametag{$^{3,4,5}$}} \Email{cosmin.bercea@tum.de}\\
\Name{Mark M{\"{u}}hlau\nametag{$^{6}$}} \Email{mark.muehlau@tum.de}\\
\Name{Jan Kirschke\nametag{$^{7}$}} \Email{jan.kirschke@tum.de}\\
\Name{Benedikt Wiestler\midljointauthortext{Contributed equally as senior authors}\nametag{$^{1,3}$}} \Email{b.wiestler@tum.de}\\
\Name{Daniel Rueckert\midlotherjointauthor\nametag{$^{2,3,8}$}} \Email{daniel.rueckert@tum.de}\\[0.5em]
\addr $^{1}$ AI for Image-Guided Diagnosis and Therapy, TUM, Munich, Germany\\
\addr $^{2}$ Chair for AI in Healthcare and Medicine, Technical University of Munich (TUM) and TUM University Hospital, Munich, Germany\\
\addr $^{3}$ Munich Center for Machine Learning, Munich, Germany\\
\addr $^{4}$ Technical University of Munich, Munich, Germany\\
\addr $^{5}$ Helmholtz AI and Helmholtz Center Munich, Munich, Germany\\
\addr $^{6}$ Department of Neurology, TUM University Hospital, Munich, Germany\\
\addr $^{7}$ Department of Neuroradiology, TUM University Hospital, Munich, Germany\\
\addr $^{8}$ Department of Computing, Imperial College London, London, UK\\
}

\begin{document}

\maketitle
\begin{abstract}
Longitudinal medical image segmentation is fundamental for quantifying disease progression and evaluating treatment efficacy. However, two critical challenges persist: First, methods that jointly segment baseline and follow-up images remain underexplored, often missing the contextual benefits of simultaneous assessment and lacking longitudinal consistency. Second, real-world datasets typically exhibit severe class imbalance, as scans showing actual disease progression are far rarer than those showing stable anatomy, an issue frequently neglected by existing models. To address these limitations, we propose \emph{SegMaST}, a novel \emph{Mamba}-based spatio-temporal framework. Unlike conventional approaches that treat timepoints in isolation, \emph{SegMaST} leverages cross-temporal information and spatial correspondences to jointly segment the initial baseline mask and explicitly localize new or progressive pathologies in follow-up scans. Additionally, we introduce an imbalance-aware loss accumulation strategy to enhance robustness in realistic clinical settings. On longitudinal cohorts of patients with Multiple Sclerosis (MS) and glioma, \emph{SegMaST} outperforms established CNN- and attention-based baselines for follow-up segmentation (mean follow-up Dice MS \emph{in-house} 0.536, \emph{MSSEG-2} 0.620, and glioma 0.631) and lesion detection (F1 \emph{in-house} 0.688, \emph{MSSEG-2} 0.723), while maintaining state-of-the-art accuracy in baseline segmentation (Dice: 0.617 MS, 0.844 glioma).
\end{abstract}

\begin{keywords}
Longitudinal Segmentation, State Space Models, Mamba, Imbalance Aware Loss
\end{keywords}

\section{Introduction}
Longitudinal Magnetic Resonance Imaging (MRI) is the cornerstone of disease assessment for neurological pathologies. Unlike single-timepoint analysis, longitudinal imaging captures disease progression and therapy response over time \cite{marti2020survey}. This is particularly critical in \textit{Multiple Sclerosis} (MS), where the detection of new and evolving lesions serves as the primary biomarker for modifying treatment plans. Similarly, in \textit{diffuse glioma}, precise follow-up segmentation is required to distinguish tumor recurrence from treatment effects. While manual volumetry remains the gold standard \cite{barkhof20252024, wen2023rano}, it is labor-intensive and prone to inter- and intra-rater variability. Deep learning offers a scalable solution to this bottleneck; however, popular architectures like \emph{UNet} \cite{ronneberger2015u} and \emph{Swin-UNETR} \cite{hatamizadeh2021swin} typically process time-points independently, ignoring the rich temporal correlations in serial imaging. This risks \emph{longitudinal inconsistency}, where lesions appear and disappear between scans due to noise, whereas deformable registration introduces alignment errors that obscure progression.

To ensure longitudinal consistency, recent architectures explicitly model temporal dependencies. Simple input concatenation, as in \emph{Neuropoly} \cite{macar2021team}, often fails to separate static anatomy from true temporal change. In contrast, \emph{SNAC} \cite{cabezas2021estimating} uses parallel encoders to compare features across multiple resolutions, offering clearer temporal cues. Advanced methods explicitly model scan differences to reduce longitudinal inconsistency: \emph{CoactSeg} \cite{wu2023coactseg} uses voxel-wise difference maps, and the \emph{Temporal Difference Weighting (TDW)} block \cite{rokuss2024longitudinal} performs subtraction on latent features to amplify evolving regions.

Architectural design is often shaped by the available annotation scheme. One common strategy fully annotates both time points, enabling temporally consistent learning \cite{carass2017longitudinal, wei2021consistent} but at the cost of extensive manual labelling. Another approach reduces this burden by annotating only new or progressing abnormalities at follow-up \cite{commowick2021msseg}, though models trained under this setting \cite{macar2021team, cabezas2021estimating} often lack the anatomical context needed to interpret changes. A third annotation configuration, followed in our work, strikes a balance by providing a full baseline segmentation while annotating only new or enlarging regions in the follow-up scans. However, prior efforts often suffer from significant limitations. Some rely on cumbersome auxiliary inputs \cite{seghedplus}, while others require labor-intensive, full-mask supervision for every longitudinal scan \cite{denner2020spatio}. Consequently, the hybrid setting remains underexplored.

Furthermore, real-world longitudinal data typically exhibit a strong class imbalance between stable and progressive cases. This imbalance drives models toward two extremes: majority-class bias (missing subtle progression) or over-sensitivity (falsely flagging stable patients)---both of which challenge the safe clinical use of such models. To mitigate this issue, strategies range from augmenting training sets with synthetic abnormalities \cite{tahghighi2024enhancing} to leveraging abundant cross-sectional data to compensate for limited longitudinal pairs \cite{wu2023coactseg}. However, ensuring robustness across both progressing and non-progressing cases remains an open problem. Additionally, enhancing baseline segmentation alongside progression detection ensures more reliable longitudinal analysis. To this end, our contributions are as follows:
\begin{enumerate}[itemsep=0pt] \item We introduce \emph{SegMaST}, a \emph{Mamba}-based spatio-temporal framework that exploits longitudinal dependencies through efficient state-space modeling to simultaneously refine baseline anatomical segmentation and precisely localize new or enlarging abnormalities in follow-up scans.

\item  We address the clinically relevant issue that many scans show no disease activity at follow-up, by employing an \textit{imbalance-aware loss accumulation}. This ensures robust performance on real-world clinical data where disease activity is intermittent.

\item We extensively validate \emph{SegMaST} on two distinct pathologies (MS and glioma), demonstrating that it significantly reduces longitudinal inconsistency compared to both cross-sectional and longitudinal baselines, effectively distinguishing true disease progression from noise in stable and active cases. \end{enumerate}

\section{Methodology}
The above contributions motivate a methodological design that combines longitudinal feature modeling with stable learning in non-progressing cases, along with a joint prediction strategy for segmenting baseline regions and identifying follow-up abnormalities. Following this design, \emph{SegMaST} processes baseline and follow-up scans within one coherent pipeline to generate both masks (see Figure~\ref{model}(a)). To balance performance and efficiency, we adopt a 2.5D architecture rather than a resource-intensive full-3D model, preserving contextual information while significantly lowering resource demands.

To effectively capture nuanced spatio-temporal interactions within this framework, our pipeline leverages \emph{Mamba}-based state-space modeling. This approach addresses the computational bottlenecks inherent in standard spatial self-attention, which suffers from quadratic complexity ($O(N^2)$) in terms of sequence length. By strictly utilizing a selective state-space model (SSM) with linear complexity ($O(N)$) \cite{gu2023mamba}, we overcome the challenges that arise when flattening spatial patches results in sequence lengths that are prohibitive for explicit attention matrices. Consequently, we achieve global receptive fields comparable to \emph{SegFormer} \cite{xie2021segformer} while maintaining high throughput. We realize this design through a modular pipeline consisting of a hierarchical encoder, where Spatio-Temporal (ST) blocks capture longitudinal disease activity at multiple scales, and a dual-head decoder, detailed as follows.

\paragraph{Hierarchical Encoder.} To capture features ranging from fine-grained lesion boundaries to global semantic context, \emph{SegMaST} employs a four-layer hierarchical encoder. For an input pair with spatial dimensions $ H \times W $ and channels $ C $, we define the input tensor as $ X \in \mathbb{R}^{N \times C \times H \times W} $, where $ N=2 $ represents the timepoints. We first apply a convolutional patch embedding layer to tokenize the input while preserving local spatial continuity, yielding $ X_1 \in \mathbb{R}^{N \times C_1 \times \frac{H}{4} \times \frac{W}{4}} $. In subsequent stages, we progressively increase the receptive field, such that the feature tensor at layer $ i $ follows $ X_i \in \mathbb{R}^{N \times C_i \times \frac{H}{2^{i+1}} \times \frac{W}{2^{i+1}}} $.

\paragraph{ST Block and MaST Module.} We employ ST blocks to capture joint dynamics from the encoder features. Central to this block is the \emph{Mamba} Spatio-Temporal (\emph{MaST}) module, which processes the input by layer normalizing and reshaping it into two complementary sequences: (1) \textbf{Temporal-first} ($ X_{i,t} \in \mathbb{R}^{C_i \times N (H/2^{i+1} W/2^{i+1})} $), where spatial patches are unfolded and concatenated along the time axis; (2) \textbf{Spatial-first} ($ X_{i,s} \in \mathbb{R}^{C_i \times (H/2^{i+1} W/2^{i+1}) N} $), where patches are stacked to preserve spatial correspondence across time. These flattened sequences, which create contexts too long for standard attention, are efficiently processed by the \emph{Mamba} SSM \cite{gu2023mamba}. By compressing context into a hidden state rather than calculating pairwise interactions, the SSM enables the effective learning of spatio-temporal dependencies with linear complexity. We repeat this ST block $ M = 4 $ times before applying overlapped patch merging to downsample features for the next stage.

\paragraph{Decoder.} We utilize a \emph{CNN}-based decoder to generate segmentation masks, adopting the lightweight design of \cite{yang2024vivim,xie2021segformer}. First, an \emph{MLP} unifies the multi-level features from the encoder along the channel dimension. These unified representations are upsampled to a common spatial resolution and concatenated. A subsequent \emph{MLP} fuses this aggregated representation, projecting the concatenated tensor $(N, 4C, H/4, W/4)$ to a lower embedding dimension $C$. The resulting features are then reshaped to recover the temporal structure, separating the latent representations of the two time points. Finally, a dual-head prediction module applies a $1 \times 1$ convolution to the respective time steps, producing the baseline segmentation $\hat{Y}_1$ from the first time point and the progression segmentation $\hat{Y}_P$ from the second time point.
\begin{figure}[ht]
\includegraphics[width=1.05\textwidth,
    trim={0 20 0 20}]{images/model.pdf}
\caption{\small (a) \emph{SegMaST} Architecture. Longitudinal image pairs are processed by a hierarchical encoder utilizing Spatio-Temporal (ST) blocks, where a \emph{Mamba}-based module (MaST) efficiently captures spatial and temporal dependencies. Multi-scale features are aggregated by a \emph{CNN}-based decoder, which feeds a dual-head module to predict baseline ($\hat{Y}_1$) and progression ($\hat{Y}_P$) masks. (b) \emph{Imbalance-Aware Loss Accumulation}. The total loss combines the baseline term ($\mathcal{L}_1$) with a filtered progression term ($\mathcal{L}_p^{\text{filtered}}$). This mechanism selectively retains a balanced subset of zero-mask samples to prevent the high prevalence of non-progression from biasing the model.
} 
\label{model}
\end{figure}
\paragraph{Imbalance-Aware Loss Accumulation.} In clinical settings, follow-up datasets are dominated by stable cases. These cases have \textit{empty} progression masks. Progressive cases, which contain \textit{non-empty} masks, are comparatively rare. This imbalance skews model predictions and reduces sensitivity to disease evolution \cite{karimian2018gd}.
 We address this through a combination of an \textit{imbalance-aware} loss accumulation that filters some empty masks and a sampling strategy that maintains a balanced progression-to-no-progression ratio per batch. This prevents gradients from being dominated by empty masks while preserving useful learning signals.

Given a batch of segmentation outputs, let \( Y_1, \hat{Y}_1 \) denote the ground truth and predicted baseline masks, and \( Y_P, \hat{Y}_P \) denote the ground truth and predicted progression masks. We use the \textit{Dice-Focal} loss, computed separately for the baseline and progression heads, and sum them to obtain the final loss. To prevent trivial cases from dominating training, we apply a filtering mechanism for progression head loss, as many samples have no progression (\( Y_P = 0 \)).
\paragraph{Filtering Mechanism. }  
The total loss consists of two terms: the baseline head loss (\( \mathcal{L}_1 \)) and a subset of the progression head loss (\( \mathcal{L}_p \)). We define a binary indicator \( M \), indicating whether a sample contains progression. Using this value, we separate the progression loss values into:  
\[
\mathcal{L}_p^{+} = \mathcal{L}_p[M], \quad \mathcal{L}_p^{0} = \mathcal{L}_p[\neg M]
\]  
Here \( \mathcal{L}_p^{+} \) represents the loss for samples with progression (\( M = 1 \)), while \( \mathcal{L}_p^{0} \) corresponds to the loss for no-progression samples (\( M = 0 \)). Since we want the model to learn from no-progression cases while preventing imbalance, we randomly sample a subset from \( \mathcal{L}_p^{0} \), matching the number of progression cases in the batch. This selected subset is included in the final loss term, denoted as \( \mathcal{L}_p^{\emptyset} \). The filtered progression loss is obtained by concatenating the two:  
\[
\mathcal{L}_p^{\text{filtered}} = \textit{Concat}(\mathcal{L}_p^{+}, \mathcal{L}_p^{\emptyset}).
\]  
The final loss is computed as the sum of the two per-head mean terms, as shown in Figure~\ref{model}(b):
\[
\mathcal{L} = \frac{1}{B} \sum_{b=1}^{B} \mathcal{L}_1[b] + \frac{1}{B'} \sum_{b=1}^{B'} \mathcal{L}_p^{\text{filtered}}[b],
\]
where \( B \) is the original batch size and \( B' \) is the number of samples used in the progression head after filtering. There is a trade-off between excluding all zero masks, which could bias training towards detecting only positive cases, and retaining all zero masks, which could lead to the model ignoring changes in progression. Our approach ensures a proportionate distribution, striking a balance between these extremes. The impact of different proportions is analyzed in an ablation study (Section~\ref{subsec:ial_ablation}).

\section{Experimental Setup}

\textbf{Datasets.} We evaluate \emph{SegMaST} on three datasets spanning two distinct brain pathologies: MS and diffuse glioma. The MS dataset was derived from a well-characterized subgroup of the cohort of our \emph{in-house} observational MS study, TUM-MS \cite{bayas2024prospective}. This dataset contains MRI scans at baseline, 6, 12, and 24 months (3--4 time points per patient). Each subject has a baseline whole-lesion mask and new or enlarging lesion masks for follow-ups. We use 3D FLAIR and T2 volumes of size $193 \times 229 \times 193$ voxels ($1 \times 1 \times 1~\mathrm{mm}^3$) for binary segmentation of baseline lesions and progression, with 224 successive time-point pairs (125 without progression and 99 with progression). The test set for the \emph{in-house} MS dataset includes 45 patients, of whom 25 exhibit no progression.

The second MS dataset is the public \emph{MSSEG-2} dataset~\cite{commowick2021msseg}, which provides only new-lesion annotations of follow-up scans. We use the official training split, which consists of two-time-point 3D FLAIR scans of size $193 \times 229 \times 193$ voxels ($1 \times 1 \times 1~\mathrm{mm}^3$), comprising 40 timepoint pairs (11 without progression and 29 with progression). Follow-up scans were acquired 1--3 years after the baseline scan. We use 32 subjects for training and 8 subjects for testing. The test set corresponds to the official challenge validation split, with subjects without new lesions excluded according to the challenge protocol.

The third dataset, \emph{UCSF-ALTPD} \cite{fields2024university}, consists of multimodal MRI scans from glioma patients with two consecutive follow-up time points. The preprocessing includes skull stripping \cite{isensee2019automated}, N4 bias correction \cite{tustison2010n4itk}, coregistration, and rigid SRI24 atlas registration \cite{rohlfing2010sri24}. Each case provides FLAIR, T1, contrast-enhanced T1 (T1-ce), and T2 volumes of size $240 \times 240 \times 155$ voxels ($1 \times 1 \times 1~\mathrm{mm}^3$), along with tumor masks delineating enhancing tissue (ET), surrounding nonenhancing FLAIR hyperintensity (SNFH), nonenhancing tumor core (NETC), and resection cavity (RC). In our setup, the baseline head performs binary whole-tumor segmentation, while the progression head predicts a binary difference mask between the two time points. We randomly sample 200 patients for training and evaluation (88 without progression and 112 with progression). The test set consists of 25 patients, of whom 16 exhibit no progression.

\paragraph{Training and Evaluation.}\emph{SegMaST} adopts a 2.5D slice-based cross-view strategy. During training, the network is exposed to slices from all three orthogonal anatomical planes (axial, coronal, and sagittal), following established cross-view training paradigms in MS \cite{aslani2019multi}. During inference, for each voxel, predictions are obtained from the three orthogonal slices intersecting that voxel. The probability outputs from the pixels in each view are averaged and thresholded to determine the final voxel-wise segmentation.

As baselines, we include both 2.5D and 3D spatiotemporal models. In the 2.5D setting, we adopt a \emph{SegFormer} and a convolutional \emph{DynUNet} (MONAI's \emph{nnUNet}) baseline, both of which follow the same cross-view inference strategy as \emph{SegMaST}, aggregating predictions from the three orthogonal planes at the voxel level. We further evaluate 3D models including \emph{nnUNet}~\cite{isensee2021nnu}, \emph{DynUNet}, \emph{SwinUNETR}, and \emph{LongiUNet-DW}, which extends a standard \emph{UNet} with Temporal Difference Weighting (TDW) blocks~\cite{rokuss2024longitudinal}. On the \emph{in-house} MS dataset, we additionally train two cross-sectional \emph{DynUNet} models (\emph{DynUNet (CS)}), with progression estimated via subtraction. For \emph{MSSEG-2}, we further compare against established longitudinal and cross-sectional baselines, including \emph{CoactSeg}, \emph{Neuropoly}, and \emph{SNAC}~\cite{wu2023coactseg}. All baseline models for the \emph{in-house} and \emph{UCSF-ALTPD} datasets, except \emph{nnUNet}, employ dual convolutional output heads to ensure architectural consistency with \emph{SegMaST}. Since \emph{nnUNet} does not natively support multiple ground-truth outputs, we adopt a multi-class formulation (background, baseline-only, follow-up-only, and overlap), where the overlap class denotes voxels labeled as lesions at both timepoints. At inference, baseline and progression masks are reconstructed from the predicted labels to enable fair comparison with dual-head models.

To ensure consistent and fair evaluation, we standardize training, inference, and augmentation settings across models. We employ the weighted patch-sampling strategy of~\cite{zhang2022qsmrim} with crop sizes tailored to each dataset (\emph{In-house}: 128; \emph{MSSEG-2}: 80; \emph{UCSF-ALTPD}: 160), which is applied during training. At inference, we perform patch-wise sliding-window prediction over the full volume using the same voxel resolution and patch sizes as in training, with a 50\% overlap. For 2.5D models, slices are processed at the original in-plane resolution, and predictions from the three orthogonal planes are aggregated voxel-wise to obtain the final 3D segmentation. Predicted connected components below minimum volume thresholds ($27~\mathrm{mm}^3$ for MS and $50~\mathrm{mm}^3$ for UCSF-ALTPD) are suppressed following~\cite{brats2023}. The 2.5D models are lightweight ($\sim$9M parameters for MS and $\sim$16M for diffuse glioma) and trained for 150--500 epochs with batch size 32 and early stopping, whereas 3D models are larger ($\sim$15M for MS and $\sim$25M for diffuse glioma) and trained for up to 1000 epochs with batch size 8. Standard geometric augmentations, including random flips and rotations, are applied. All models are optimized using AdamW~\cite{loshchilov2017decoupled} (weight decay 0.05, learning rate $10^{-4}$) with cosine learning rate decay~\cite{loshchilov2016sgdr} and a 40-epoch warm-up~\cite{goyal2017accurate}. The proposed \textit{imbalance-aware loss accumulation} is applied to all models throughout training. Our experiments are conducted on an NVIDIA RTX A6000 GPU. Note that \emph{nnUNet} uses its default training, inference, and augmentation pipeline; thus, the settings above do not apply to it.

In line with recent validation recommendations \cite{maier2022metrics}, we use the Dice score to assess voxel-wise segmentation quality and the lesion-wise F1 score to evaluate lesion-level detection.  We additionally report positive predictive value (PPV) for the MS datasets. To evaluate clinical utility, we compute the Disease Activity Assessment (DAA), a case-level clinical metric that assesses whether a follow-up scan of an MS patient is correctly classified as exhibiting disease progression or stability. A prediction is considered correct if progression is detected for true progression cases and no progression is predicted for stable cases, consistent with clinical monitoring guidelines for MS \cite{wattjes20212021}.
Following the methodology established by \cite{commowick2018objective}, we exclude tiny lesions (defined as less than $11~\mathrm{mm}^3$) from the F1 calculation to filter out noise and focus on clinically significant observations. We apply the Dice score to both baseline and progression regions, with progression performance further stratified into progression (P) and non-progression (NP) cases. For NP cases, we assign a strict Dice score of $1$ when the predicted progression mask is empty and $0$ otherwise, reflecting the clinical requirement of correctly identifying stable patients.
\section{Results}
\input{tables/proval_results}
\input{tables/msseg2_results}
\input{tables/ucsf_results}
For the \textit{in-house} MS dataset, Table~\ref{tab:ms_metrics} reports test-set performance for baseline and progression prediction. For the baseline head, 3D \emph{nnUNet} achieves the highest Dice and F1, likely due to its extensive data augmentation and pre/post-processing pipelines. While \emph{SegMaST} remains competitive at baseline, it is the best-performing model for progression, achieving the highest P Dice (0.513) and lesion-wise F1 (0.688), outperforming all 2.5D and 3D baselines. Although 2.5D \emph{DynUNet} attains strong NP performance, it underperforms \emph{SegMaST} on progression, highlighting the benefit of explicit spatiotemporal modeling. The subtractive \emph{DynUNet (CS)} performs poorly in both P and NP cases. \emph{SegMaST} achieves DAA comparable to top-performing baselines, matching \emph{DynUNet} and \emph{LongiUNet-DW} (64.4\%). Importantly, \emph{SegMaST} achieves superior longitudinal performance at the lowest training cost, converging 5.71$\times$ faster than \emph{nnUNet}. On \emph{MSSEG-2}, \emph{SegMaST} achieves the highest lesion-wise F1 (0.723), surpassing all prior methods, including the \emph{nnUNet} (0.691). It also attains a competitive Dice score of 0.620, on par with top 3D models such as \emph{CoactSeg}. Figure~\ref{example} qualitatively demonstrates improved progression delineation on the \emph{in-house} dataset (top) and \emph{MSSEG-2} dataset (bottom), with reduced false positives and missed lesions.
\begin{figure}[htbp]
\centering
\includegraphics[width=0.66\textwidth,
    trim={0 20 0 20}]{images/examples_ms.pdf}
\caption{ \small Exemplar results of \emph{SegMaST} and other models for MS lesion progression segmentation. 
\textbf{Top:} \textit{in-house} dataset. 
\textbf{Bottom:} \emph{MSSEG-2} dataset. 
Green arrows indicate false positives, and yellow arrows indicate false negatives.
} \label{example}
\end{figure}
The mean test-set metrics for baseline whole-tumor segmentation and P/NP prediction on the \emph{UCSF-ALTPD} dataset are shown in Table~\ref{tab:ucsf_results}. For the baseline task, 3D \emph{nnUNet} achieves the highest Dice score (0.861), while \emph{SegMaST} remains highly competitive (0.844) and outperforms several 3D baselines. For the progression task, \emph{SegMaST} provides the best overall performance, achieving the highest Dice for NP cases (0.750) while obtaining P Dice (0.511) comparable to \emph{DynUNet} (0.520). Notably, the 3D baselines struggle significantly with NP cases, underscoring the difficulty of accurately modeling stable follow-up scans in these architectures. Furthermore, \emph{SegMaST} maintains its efficiency advantage, training nearly 4$\times$ faster than \emph{nnUNet}. 

Overall, results across all experiments demonstrate that while \emph{SegMaST} achieves performance comparable to state-of-the-art models such as \emph{nnUNet} for baseline segmentation in two of three datasets, it consistently outperforms all models in follow-up assessment across all three datasets, at a substantially lower computational cost.

\subsection{Ablation Study - Contribution of No Progression Cases}
\label{subsec:ial_ablation}
\begin{figure}[htbp]
\floatconts
  {fig:ial_ablation}
  {\caption{%
  \small
  Comparison of three strategies for handling imbalance between progression (P) and no-progression (NP) cases. 
\emph{Whole} uses all NP cases, \emph{None} excludes them, and the proposed \emph{Partial} strategy uses a balanced subset. Deviations indicate differences between \emph{Partial} and \emph{None}.
}}
  {\includegraphics[width=0.85\linewidth, trim={0 160 0 60}]{images/ial_ablation.pdf}}
\end{figure}
 To counter the over-dominance of non-progression cases (which essentially are empty segmentation masks), we compare three distinct loss accumulation strategies: (1) \textbf{Whole:} all NP cases included, (2) \textbf{None:} all NP cases excluded, and (3) \textbf{Partial} (our proposed \textit{imbalance-aware loss accumulation}): a subset of NP cases is included, maintaining an equal number of NP and P cases per batch. As shown in Figure~\ref{fig:ial_ablation}, the Whole strategy yields the lowest P Dice (0.484) and P F1 (0.619) due to the dominance of NP samples. The None setting achieves the highest P Dice (0.523) and P F1 (0.698) but performs poorly on B and NP. Our Partial strategy offers the best overall balance, obtaining the highest B Dice (0.617), NP Dice (0.560), and B F1 (0.567), while maintaining strong P F1 (0.688). This demonstrates that \textit{imbalance-aware loss accumulation} effectively balances B, P, and NP performance, an important aspect for clinical use.

\subsection{Ablation Study - Lesion Size-Wise Analysis}
\input{tables/proval_sizewise_bl}
 To evaluate the robustness of \emph{SegMaST} across varying lesion volumes, we perform a size-wise analysis on the \emph{in-house} MS dataset, reporting results for baseline lesions and progression lesions (Table~\ref{tab:lesion_size_combined}). For the baseline segmentation (Left), \emph{SegMaST} demonstrates superior performance across medium and large lesions, achieving the highest Lesion-F1 and Dice scores in both the Medium (Lesion-F1: 0.746, Dice: 0.424) and Large (Lesion-F1: 0.915, Dice: 0.617) bins. In the challenging progression lesion analysis (Right), \emph{SegMaST} exhibits a dominant trend, achieving the highest Lesion-F1 scores across all three bins: Small (0.235), Medium (0.703), and Large (0.889). It also achieves the best Dice scores for both Medium (0.598) and Large (0.720) progression lesions. This strong performance, particularly in the difficult small-to-medium progression bins, validates the ability of our \emph{Mamba}-based spatio-temporal modeling to effectively capture subtle changes indicative of disease progression across different lesion scales.

\subsection{Ablation Study - Multi-Timepoint Spatio-Temporal Modeling}
\begin{figure}[htbp]
\centering
\captionsetup{font=footnotesize}

\caption{ Multi-timepoint analysis of \emph{SegMaST}. FU1 and FU2 denote the first and second follow-up scans. Follow-up scores are split into progression (P) and no-progression (NP). Values are mean (SD). Entropy maps show predictive uncertainty with ground-truth lesion contours in green. (a) Localized uncertainty at FU1 and diffused uncertainty at FU2. (b) Increased uncertainty around new lesion at FU2.}
\label{tab:multi_tp}

\includegraphics[width=0.9\linewidth]{images/entropy_maps.pdf}

\vspace{0.6em}

\scriptsize
\renewcommand{\arraystretch}{1}
\setlength{\tabcolsep}{5pt}

\begin{tabular}{l c cc cc}
\toprule
\multirow{2}{*}{\textbf{Metric}} 
& \multirow{2}{*}{\textbf{B}} 
& \multicolumn{2}{c}{\textbf{FU1}} 
& \multicolumn{2}{c}{\textbf{FU2}} \\
\cmidrule(lr){3-4} \cmidrule(lr){5-6}
& & \textbf{P} & \textbf{NP} & \textbf{P} & \textbf{NP} \\
\midrule

Dice $\uparrow$      
& 0.574 {\tiny(0.159)}
& 0.435 {\tiny(0.287)}
& 0.857 {\tiny(0.350)}
& 0.278 {\tiny(0.310)}
& 0.700 {\tiny(0.458)} \\

Lesion-F1 $\uparrow$ 
& 0.477 {\tiny(0.139)}
& 0.464 {\tiny(0.317)}
& --
& 0.237 {\tiny(0.346)}
& -- \\

\bottomrule
\end{tabular}

\end{figure}


We investigate the scalability of \emph{SegMaST} by extending the input from two to three consecutive timepoints using the \textit{in-house} MS dataset. In this configuration, the model simultaneously predicts the baseline lesion mask (B) and two follow-up progression masks corresponding to the first (FU1) and second (FU2) follow-up scans. As shown in the table in Figure~\ref{tab:multi_tp}, the model maintains only slightly diminished baseline performance (Dice: $0.574$) but exhibits a clear degradation in progression prediction as the temporal distance increases. Specifically, progression Dice decreases from $0.435$ at FU1 to $0.278$ at FU2, indicating increased uncertainty and temporal drift in long-term longitudinal modeling.

\textit{Entropy maps} provide a mechanistic explanation for this temporal decay. 
At FU1, uncertainty remains spatially localized around lesion boundaries, whereas at FU2 it expands into a diffuse volumetric region, indicating that the network captures the correct anatomical neighborhood but loses sharp voxel-wise delineation (see Figure~\ref{tab:multi_tp}). This shift aligns with the observed Dice degradation. Overall, while \emph{SegMaST} leverages longitudinal dependencies, spatial precision degrades over extended temporal windows. Several factors may contribute to this, such as (i) propagated registration inaccuracies, (ii) conflicting supervision signals, as the same image signal is a new lesion in FU1 and a stable lesion in FU2, or (iii) pronounced treatment changes and atrophy that accumulates over time.
\section{Discussion \& Conclusion}
Longitudinal segmentation plays a crucial role in monitoring disease progression, yet existing methods often struggle with imbalanced datasets or forgo the rich spatio-temporal information available across timepoints. To address these issues, we introduce \emph{SegMaST}, a novel approach that jointly segments baseline and follow-up images while incorporating an \textit{imbalance-aware loss accumulation} strategy to effectively manage the dominance of non-progression cases. Our results demonstrate that \emph{SegMaST} achieves superior performance for longitudinal progression modeling compared to established baseline methods, including both convolutional and attention-based models, by leveraging rich spatio-temporal information in medical imaging data. Notably, this improvement is consistent across two distinct and challenging clinical scenarios: multiple small, newly appearing lesions in MS and continuous growth patterns in gliomas. These findings underscore the versatility and robustness of \emph{SegMaST} for advancing longitudinal disease progression analysis.

An important limitation of our framework, and longitudinal image analysis in general, is the dependence on accurate image registration. While \emph{SegMaST} exploits spatio-temporal dependencies to maintain consistency, it relies on good pre-alignment of baseline and follow-up scans. In clinical practice, perfect alignment is non-trivial; rigid registration fails to capture non-linear anatomical shifts, while deformable registration can introduce warping artifacts that obscure true pathological changes or create false progression. Consequently, residual misalignment can disrupt the spatial correspondences our spatio-temporal module relies on, potentially introducing noise that negatively impacts detection sensitivity and specificity.
We explicitly designed \emph{SegMaST} to be a disease-agnostic framework for longitudinal image analysis. Looking ahead, a promising direction for improving longitudinal assessment is the integration of clinical covariates alongside image data. Disease progression is often contextualized by non-imaging factors such as patient age, genetic markers, and specifically, treatment status. Incorporating these variables, for example via conditioning mechanisms within the network bottleneck, could provide the model with a prior regarding the likelihood of progression versus stability (e.g., distinguishing pseudo-progression from true recurrence in glioma). By moving beyond image changes alone and fusing multimodal clinical data, future iterations could offer a more holistic and clinically accurate evaluation of therapeutic response. 
We make our code for \emph{SegMaST} and the \emph{imbalance-aware loss accumulation} publicly available at \url{https://github.com/Aswathi-Varma/SegMaST}.

\midlacknowledgments{Aswathi Varma, Mark M{\"{u}}hlau, Benedikt Wiestler, and Daniel Rueckert are supported by the DFG as part of the SPP \textit{Radiomics} (project number 428223038).}
\bibliography{midl26_117}



\section{Appendix}
\label{a:patch_ablation}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\textwidth]{images/patch_ablation.pdf}
    \caption{\small 
        Ablation on patch size under weighted cropping. Larger patches ($40^2$ to $128^2$) provide more global context and improve Dice and F1 performance on the \emph{in-house} dataset.
    }
    \label{fig:patch_ablation}
\end{figure}

\paragraph{Effect of Weighted Cropping Patch Size.} When applying weighted cropping on the in-house dataset, the results show that larger patch sizes consistently outperform smaller ones in terms of both baseline Dice and P F1 (Figure~\ref{fig:patch_ablation}). Weighted cropping makes small patches (e.g., $40^2$) highly local and biased toward lesion-centered regions, limiting global contextual cues and resulting in lower baseline Dice (0.525) and weaker NP F1 (0.523). As patch size increases, the model benefits from more anatomical context, which improves its ability to distinguish true progression from stable regions. This is reflected in the steady rise in baseline Dice, reaching 0.617 at $128^2$, and in the pronounced improvement in non-empty progression F1, which peaks at 0.688 at the same size. Progression Dice follows a similar upward trend, increasing from 0.439 at $40^2$ to above 0.51 at $128^2$. Overall, combining weighted cropping with larger patches ($128^2$) provides the optimal balance: weighted sampling focuses the model on lesion-relevant areas, while the larger patch size supplies the necessary global context, yielding the strongest Dice and F1 performance for progression detection.

\end{document}