\documentclass[runningheads]{llncs}
%
\usepackage[T1]{fontenc}
\usepackage{graphicx,verbatim}
%
% additional packages 
\usepackage{amsmath,amssymb}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{xcolor}
\usepackage{tikz}
\usetikzlibrary{positioning,arrows.meta,fit,backgrounds}
\usepackage[hidelinks]{hyperref}
\usepackage{marvosym}
%
% math shorthands used throughout
\newcommand{\z}{\mathbf{z}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\cova}{\mathbf{c}}            % covariate vector
\newcommand{\zt}{\z_t}
\newcommand{\Dt}{\Delta t}
\newcommand{\Eb}[1]{\mathbb{E}\!\left[#1\right]}

\begin{document}
%
\title{Recovering Progression Beyond the Identity Shortcut: A Schr\"odinger Bridge Framework for Longitudinal Brain-MRI}
\titlerunning{Latent Schr\"odinger Bridge for AD Progression}
%
\author{Leonida Lumburovska\inst{1,2}\textsuperscript{(\Letter)} \and
Pedro Moreira\inst{3} \and
Jasmina Bogojeska\inst{1}}
\index{Lumburovska, Leonida}
\index{Moreira, Pedro}
\index{Bogojeska, Jasmina}
%
\authorrunning{L. Lumburovska et al.}
%
\institute{Centre for Artificial Intelligence, ZHAW School of Engineering, Winterthur, Switzerland\\
\email{\{lumb,bogo\}@zhaw.ch} \and
University of Zurich, Zurich, Switzerland \and
Massachusetts Institute of Technology, Cambridge, MA, USA\\
\email{pedrojfm@mit.edu}}

\maketitle              
%
\begin{abstract}
Predicting how a patient's brain will change over time is a central goal
in Alzheimer's disease (AD), with applications from trial enrichment to
patient counselling. Conditional generative models are the natural tool,
but face a silent failure mode: consecutive scans of the same subject are
nearly identical, so the learning target collapses toward the identity
map and the model copies the baseline, undetected by standard image-quality
metrics (SSIM, PSNR, FID). We pose longitudinal AD-MRI prediction as
a stochastic transport problem between a baseline and a follow-up distribution and solve it with an Image-to-Image Schr\"odinger
Bridge (I\textsuperscript{2}SB) in the latent space of a
biomarker-preserving VAE, conditioned on demographics, \emph{APOE4},
cognitive scores, treatment, and the time horizon $\Dt$; this way, the bridge drift
stays non-trivial even when endpoints coincide. We add three
anti-identity supervision terms to the loss and an evaluation protocol (an identity floor
plus a conditional-response test) that measures collapse directly. On
held-out subjects the model beats the floor on all three biomarkers,
including the hippocampus (the hardest, smallest-signal case), and its
follow-ups respond in the clinically correct direction, monotonically in
$\Dt$. The model thus recovers progression signal that the identity
baseline cannot: it gets the direction and ordering of change right,
while predicting its magnitude conservatively.

\keywords{Longitudinal MRI \and Schr\"odinger bridge \and Latent
diffusion \and Alzheimer's disease \and Identity collapse \and
Conditional generation.}
\end{abstract}

\section{Introduction}
As it progresses over time, Alzheimer's disease (AD) produces gradual structural changes detectable on
T1-weighted MRI, including hippocampal and entorhinal atrophy, ventricular enlargement, and cortical thinning~\cite{adni,frisoni2010}. A
model that, given a baseline scan, a patient's covariates, and a horizon
$\Dt$, creates a plausible \emph{future} scan would aid trial
enrichment and counterfactual ``what-if-treated'' reasoning. Such predictions are
\emph{population-conditional}: they describe the expected progression of
a covariate-matched subgroup, not one person's exact future brain~\cite{brlp}.

 Conditional generative models are the natural tool for this. Recent work uses GANs for brain-aging (DaniNet~\cite{daninet}) and diffusion for longitudinal and counterfactual synthesis (SADM~\cite{sadm}, counterfactual editing~\cite{counterfactual}, and BrLP~\cite{brlp}). These report image-quality and volumetric metrics, but to
our knowledge none against an explicit identity baseline, leaving a key failure mode undetected. Because scans of the same subject years apart are nearly identical, a model can minimise its loss by copying the baseline, and commonly used image-quality metrics such as the Structural Similarity Index Measure (SSIM), Peak Signal-to-Noise Ratio (PSNR), and Fr\'echet Inception Distance (FID) all rate such a copy as excellent~\cite{brlp}. A collapsed model thus reports strong numbers while learning nothing about progression, as we observed in an earlier conditional Diffusion Transformer (DiT)~\cite{dit} trained with rectified flow~\cite{rectflow}. The task is particularly difficult: annual hippocampal atrophy is only
$1$--$3\%$~\cite{frisoni2010}, easily buried in acquisition noise
and registration drift, and cohorts are small with imbalanced covariates.

We therefore reframe the task as stochastic transport between
the baseline and follow-up \emph{distributions}, realised by an
Image-to-Image Schr\"odinger Bridge (I\textsuperscript{2}SB)~\cite{i2sb}
in the latent space of a biomarker-preserving 3D VAE~\cite{maisi}, whose drift
stays non-trivial even for near-identical endpoints, so identity is never optimal.
We condition it on clinical covariates and add an evaluation protocol that
exposes and mitigates identity collapse.

\noindent\textbf{Contributions.} We make three contributions: (i)~a conditional latent Image-to-Image Schr\"odinger Bridge for longitudinal AD-MRI prediction whose stochastic interpolant mitigates identity collapse by making the optimal velocity a population displacement rather than zero; (ii)~an evaluation protocol that makes identity collapse measurable, combining latent biomarker constraints, explicit identity baselines, and a $\Dt$-sensitivity test;
and (iii)~three anti-identity losses that discourage trivial reconstruction and improve sensitivity in low-signal regions such as the hippocampus.

\section{Method}
\label{sec:method}

\begin{figure}[t]
\centering
\resizebox{\textwidth}{!}{%
\begin{tikzpicture}[
  font=\small, >=Latex, node distance=6mm,
  box/.style   ={draw, rounded corners, align=center, minimum height=11mm,
                 inner sep=4pt},
  frozen/.style={box, fill=blue!6},
  model/.style ={box, fill=orange!14, thick},
  data/.style  ={box, fill=gray!8},
  io/.style    ={box, fill=gray!4, draw=gray!55},
  eval/.style  ={box, fill=green!8},
  cov/.style   ={box, fill=gray!6, align=center},
  sup/.style   ={draw, dashed, rounded corners, align=center, fill=red!5,
                 inner sep=4pt},
  tag/.style   ={font=\scriptsize\itshape, gray!55!black},
  lbl/.style   ={font=\scriptsize, midway, above, inner sep=1.5pt},
  infarr/.style  ={->, blue!65!black, thick},
  trainarr/.style={->, violet!65!black, thick, dashed},
]
% ---- main inference row ----
\node[io]                       (x0)  {Baseline\\$\x_0$ (ADNI)};
\node[io, below=6mm of x0, draw=violet!65!black, dashed] (x1in)
                                      {Follow-up\\$\x_1$ (train)};
\node[frozen, right=of x0]      (enc) {MAISI $E$\\frozen};
\node[model,  right=of enc]     (dit) {Conditional I\textsuperscript{2}SB\\bridge $v_\theta$ (DiT)};
\node[frozen, right=of dit]     (dec) {MAISI $D$\\fine-tuned};
\node[io,     right=of dec]     (xpred) {Predicted\\follow-up $\hat\x_1$};
\node[eval,   right=of xpred]   (ev)  {Biomarkers\\vs identity floor};

% Two flows share the frozen encoder E: solid blue = inference (x0->z0),
% dashed violet = training-only (follow-up x1 -> bridge target z1).
\draw[infarr]   (x0)   -- (enc);
\draw[trainarr] (x1in) -- (enc);
\draw[infarr]   ([yshift=2.5pt]enc.east)  -- node[lbl]{$\z_0$} ([yshift=2.5pt]dit.west);
\draw[trainarr] ([yshift=-2.5pt]enc.east) -- node[lbl, below]{$\z_1$} ([yshift=-2.5pt]dit.west);
\draw[->] (dit)   -- node[lbl]{$\hat\z_1$} (dec);
\draw[->] (dec)   -- (xpred);
\draw[->] (xpred) -- (ev);

% conditioning from below 
\node[cov, below=10mm of dit] (cov)
  {Covariates $\cova$: age, sex, weight, \emph{APOE4}, MMSE,\\
   CDR-SB, diagnosis, treatment, $\Dt$};
\draw[->] (cov) -- (dit);

% training-only region from above 
\node[sup, above=9mm of dit, align=center] (sup)
  {\textbf{Training only}\\[1pt]
   Step 3: pretrain on synthetic atrophy pairs $\rightarrow$
   fine-tune on real ADNI pairs\\
   Step 4: anti-identity losses --- biomarker $\cdot$ identity hinge $\cdot$
   $\Delta$-map};
\draw[->, dashed] (sup) -- (dit);

% step tags 
\node[tag, below=0.5mm of enc]  {Step 1};
\node[tag, below=0.5mm of dec]  {Step 1};
\node[tag, above right=0mm and 0mm of dit.north west, anchor=south west]
     {Step 2};
\node[tag, below=0.5mm of ev]   {\S\ref{sec:exp}};
\node[tag, right=1mm of cov]    {Step 2};
\end{tikzpicture}}
\caption{Overview of the method, annotated with the four steps described in Sec.~\ref{sec:method}. A baseline T1w volume $\x_0$ from ADNI is encoded
by the frozen MAISI encoder to a latent $\z_0$ (solid blue, inference); at
training the follow-up scan $\x_1$ passes through the \emph{same} frozen
encoder to the bridge target $\z_1$ (dashed violet). The decoder is
fine-tuned for biomarker fidelity and then frozen (\textbf{Step 1}). A
conditional Image-to-Image Schr\"odinger bridge $v_\theta$, conditioned on
the covariate vector $\cova$, transports $\z_0$ to a predicted follow-up
latent $\hat\z_1$ by deterministic Heun sampling (\textbf{Step 2}); the
decoder returns the predicted follow-up volume $\hat\x_1$, evaluated
against an identity floor with ROI biomarkers (\S\ref{sec:exp}). Dashed
components are training only: the bridge is pretrained on synthetic atrophy
pairs and fine-tuned on real ADNI pairs (\textbf{Step 3}), then refined
under three anti-identity losses (\textbf{Step 4}).}
\label{fig:pipeline}
\end{figure}

Given a baseline T1-weighted MRI scan $\x_0$, a covariate vector $\cova$, and a
target time horizon $\Dt$, our goal is to predict a plausible follow-up scan
$\hat\x_1$. We first state the problem and the \emph{identity-collapse}
failure it induces, then describe the method as four steps, illustrated
in Fig.~\ref{fig:pipeline}:
(\textbf{1})~we encode volumes into a biomarker-preserving latent space and
verify that the VAE faithfully represents the relevant atrophy signal;
(\textbf{2})~we define a conditional Image-to-Image Schr\"odinger bridge that
transports the baseline latent to a follow-up latent;
(\textbf{3})~we pretrain this bridge on synthetic atrophy pairs and fine-tune
it on real-world clinical data, namely image pairs from the Alzheimer’s Disease Neuroimaging Initiative (ADNI) dataset; and
(\textbf{4})~we add three anti-identity losses in a final fine-tuning stage.
Steps~1--2 define the model; Steps~3--4 train it.

\medskip\noindent\textbf{Problem formulation and identity collapse.}
We are given longitudinal pairs $(\x_0,\x_1,\cova)$ of a baseline and
follow-up T1-weighted scan of the same subject with a covariate vector
\begin{equation}
  \cova=\big(\text{age},\,\text{sex},\,\text{weight},\,\textit{APOE4},\,
  \mathrm{MMSE}_{\mathrm{bl}},\,\text{CDR-SB}_{\mathrm{bl}},\,
  \mathrm{dx},\,\text{treatment},\,\Dt\big).
\end{equation}
Table~\ref{tab:covariates} lists the nine covariates: five continuous (each
with a validity mask) and four one-hot categorical (each with an explicit
\emph{unknown} token for missing or unparseable records). Our goal is to
model $p(\x_1 \mid \x_0,\cova)$ and sample follow-ups for a requested time horizon
$\Dt$. The natural baseline is flow matching: with the linear interpolant
$\x_t=(1-t)\x_0+t\,\x_1$, a network $v_\theta$ is trained to predict the
velocity transporting $\x_0$ to $\x_1$, minimising
\begin{equation}
\label{eq:flow}
  \mathcal{L}_{\text{flow}}=
  \Eb{\big\lVert v_\theta(\x_t,t,\x_0,\cova)-(\x_1-\x_0)\big\rVert^2}.
\end{equation}
In longitudinal brain MRI this objective is problematic: annual atrophy is
small ($1$--$3\%$ per year), so $\x_1 \approx \x_0$ and the target
$\x_1-\x_0$ is near zero for most voxels. The minimiser is therefore close to
$v_\theta \equiv \mathbf{0}$, i.e., the identity predictor $\hat{\x}_1=\x_0$,
a collapse that the standard metrics SSIM, PSNR, and FID cannot detect, since they rate a copied
real scan as near-perfect. This motivates the Schr\"odinger-bridge transport
of Step~2 and an evaluation protocol with an explicit identity baseline, the
\emph{identity floor} (Sec.~\ref{subsec:data}).

\begin{table}[t]
\centering
\caption{Conditioning covariates $\cova$. Continuous fields are min--max
normalised with a validity mask; categorical fields are one-hot with an
\emph{unknown} token for missing or unparseable records.}
\label{tab:covariates}
\footnotesize
\setlength{\tabcolsep}{4pt}
\begin{tabular}{@{}lp{8.6cm}@{}}
\toprule
\textit{Continuous} (5)  & age, weight, $\mathrm{MMSE}_{\mathrm{bl}}$ (0--30), $\text{CDR-SB}_{\mathrm{bl}}$ (0--18), $\Dt$ (months) \\
\midrule
\textit{Categorical} (4) & sex (M/F), \emph{APOE4} ($\varepsilon4$ count 0/1/2), dx (baseline diagnosis), treatment (none/AChE-i/memantine/anti-amyloid/other) \\
\bottomrule
\end{tabular}
\end{table}

\medskip\noindent\textbf{Step 1: Biomarker-preserving latent space.}
A full-resolution T1w volume has $256^3$ voxels, far too large to
model directly in 3D, so we first compress each scan into a compact latent
with MAISI~\cite{maisi}, a 3D VAE pretrained on brain MRI.
Volumes are intensity-normalised to $[0,1]$ and resized to its $256^3$ input;
the frozen encoder $E$ yields a precomputed latent
$\z=E(\x)\in\mathbb{R}^{4\times 64^3}$ ($4\times$ downsampling), decoded by
$D$; each scan pair thus yields the latent pair
$(\z_0,\z_1)=(E(\x_0),E(\x_1))$ that the bridge later operates on.
Working in latent space could introduce a second failure mode: if the latent
fails to preserve the subtle atrophy signal, the decoded output might be similar to the
baseline regardless of bridge quality. We therefore enforce a
biomarker-preservation constraint before bridge training: we segment both the
original scan $\x$ and its reconstruction $D(E(\x))$ with
FastSurfer~\cite{fastsurfer}, compute ROI volumes for hippocampus, ventricles,
and whole brain, and measure how well the two agree with the intraclass
correlation coefficient (ICC$(2,1)$)~\cite{iccshrout}. We require $\mathrm{ICC}>0.95$, stricter than the standard $0.90$
threshold for excellent agreement~\cite{koo2016}, to preserve the subtle
atrophy signal. The pretrained VAE was only marginal on the medial temporal
lobe, so we fine-tune only the decoder, keeping the encoder frozen so the
latent space and the latents we precompute and cache for every scan
stay fixed for bridge training. We optimise an $\ell_1$ reconstruction loss with
increased weighting in a dilated hippocampal mask, plus a gradient regulariser
that discourages boundary blurring. After fine-tuning, all three ROIs exceed
the $0.95$ threshold (Sec.~\ref{subsec:icc}), justifying operating in
latent space.

\medskip\noindent\textbf{Step 2: Conditional latent Schr\"odinger bridge.}
With each scan pair now encoded as a latent pair $(\z_0,\z_1)$, we define a conditional bridge that transports the baseline latent $\z_0$ to the follow-up latent $\z_1$, realised by an I\textsuperscript{2}SB~\cite{i2sb}:
it learns a stochastic process directly between the two paired latent
distributions rather than between noise and data as in standard diffusion. For a pair $(\z_0,\z_1)$ and
$t\in(0,1)$ the bridge defines a stochastic interpolant
\begin{equation}
  \z_t \sim \mathcal{N}\!\big((1-t)\,\z_0+t\,\z_1,\ \sigma^2(t)\,\mathbf{I}\big),
  \qquad \sigma^2(t)=\sigma_{\max}^2\,t\,(1-t),
  \label{eq:posterior}
\end{equation}
with $\sigma_{\max}=1$, giving zero variance at the endpoints and maximal
uncertainty at $t=\tfrac{1}{2}$. This lets us sample $\z_t$ directly, without
simulating a stochastic differential equation (SDE), and gives a single-shot
training objective from the closed-form conditional interpolant~\cite{stochinterp}
instead of iterative bridge solvers~\cite{dsbm}. We train a conditional velocity field $v_\theta$ by minimising
\begin{equation}
\label{eq:bridge}
  \mathcal{L}_{\mathrm{bridge}}=
  \Eb{\big\lVert v_\theta(\z_t,t,\z_0,\cova)-(\z_1-\z_0)\big\rVert^2}.
\end{equation}
Unlike deterministic flow matching, the stochastic interpolant couples many
$(\z_0,\z_1)$ pairs to the same $\z_t$, so the optimal predictor is the
conditional expectation of displacements: the learned field stays informative
even when individual pairs barely differ, avoiding the identity collapse. A residual bias toward small displacements remains from
population averaging, which Step~4 addresses.

The bridge's velocity field $v_\theta$, its only learned component, is
realised by a 3D Diffusion Transformer (DiT)~\cite{dit} with AdaLN-zero. The
baseline latent $\z_0$ is concatenated with $\z_t$ at the
input, giving explicit voxel-level access so the network predicts a residual
deformation rather than an absolute image; the feature volume is split into
non-overlapping $4^3$ patches, each linearly projected to a token, giving
$16^3=4096$ tokens. Conditioning is factorised into two streams: covariates
are embedded by an MLP and combined with a sinusoidal time embedding, and the
resulting vector modulates each block through AdaLN shift, scale, and gate
parameters. For classifier-free guidance, $\cova$ is replaced by a learned
null token with probability $0.1$, distinct from the \emph{unknown}
treatment category, so missing data and absent conditioning are not
conflated. At inference we integrate the ordinary differential equation (ODE) limit
$\dot{\z}=v_\theta(\z,t,\z_0,\cova)$ from $\z(0)=\z_0$ to $t=1$ with a
$100$-step Heun solver and decode $\z(1)$ with $D$. A guidance scale $s$
controls conditioning strength via $v_\varnothing + s\,(v_\cova -
v_\varnothing)$, where $v_\cova$ and $v_\varnothing$ are the velocities with
and without (null-token) conditioning; we use $s=1$.

\medskip\noindent\textbf{Step 3: Synthetic pretraining and real-data
fine-tuning.}
Real ADNI pairs are scarce and mostly near-identity, so we first pretrain the
bridge on a large synthetic corpus with controlled, anatomically plausible
deformations, from which it learns meaningful change. We generate follow-ups by warping
a real baseline, $\x_1=\x_0 \circ \varphi^{-1}$, where $\varphi$ is a
diffeomorphic field modelling AD-related atrophies: hippocampal contraction,
ventricular expansion, cortical thinning in medial temporal and posterior
cingulate regions, a low-frequency global contraction, and small smooth
random perturbations for variability. ROI locations and per-year magnitudes
are fixed literature priors~\cite{frisoni2010}, scaled by a diagnosis-stage
multiplier (CN/MCI/AD); none are fit to ADNI subjects, so pretraining
introduces no leakage into the eval cohort. The stationary velocity field is
integrated by scaling-and-squaring to ensure invertibility~\cite{synthmorph},
and its magnitude is scaled by a synthetic time horizon $\Dt\in[6,60]$ months,
matched to the bridge conditioning so atrophy is time-consistent. Because
$\varphi$ is known, voxel-wise and regional volume changes are available as
ground truth (used for loss supervision in Step~4). We pretrain $v_\theta$ on these
pairs with Eq.~\eqref{eq:bridge}, so identity is never optimal during
pretraining, and then fine-tune on real ADNI pairs under the same objective.

\medskip\noindent\textbf{Step 4: Anti-identity supervision.}
The bridge and synthetic prior weaken but do not eliminate the residual bias
toward small displacements. We therefore add a final fine-tuning stage with three auxiliary losses
(ablated in Sec.~\ref{subsec:floor}):
\begin{enumerate}\setlength{\itemsep}{1pt}
\item \textbf{Biomarker consistency.} We decode $\hat\z_1$ and $\z_1$ and
  segment both with a frozen lightweight 3D network trained on FastSurfer
  labels, enforcing agreement in hippocampal and ventricular volumes via a
  relative error $|V(\hat{\z}_1)-V(\z_1)|/V(\z_1)$ and a Dice loss. Relative
  scaling avoids domination by large regions and focuses supervision on
  clinically relevant structures.
\item \textbf{Identity hinge.} We penalise under-predicted change via
  $\max(0,\, e(\Dt,\mathrm{dx}) - \lVert \hat{\z}_1 - \z_0 \rVert)^2$, where
  $e(\cdot)$ is the expected displacement magnitude conditioned on horizon and
  diagnosis. The term activates only when predicted change is insufficient.
\item \textbf{$\Delta$-map head.} A zero-initialised auxiliary head directly
  predicts $\hat{\z}_1 - \z_0$ against the ground-truth displacement with an
  $\ell_1$ loss, removing the trivial zero solution and giving a
  change-specific signal independent of the bridge velocity. It is used only
  during training.
\end{enumerate}
We apply these losses only in this final stage: the synthetic deformations of
Step~3 are already large, so on that data, such region-targeted supervision is
unnecessary and would mainly fit the generator's deformation prior rather than
real atrophy (besides being costly at pretraining scale); it is the
near-identity real pairs that make the identity collapse a genuine risk. The losses are ramped in over 5000 steps with the bridge objective dominant throughout. The biomarker term is computed only on a subset of iterations to keep training affordable: evaluating it requires decoding the latents and segmenting them.

\medskip\noindent\textbf{Implementation.}
The bridge velocity $v_\theta$ is a DiT backbone of $12$ blocks at width
$768$. After decoder fine-tuning (Step~1), training proceeds in the three
stages of Steps~3--4: synthetic pretraining (200k steps), real-data
fine-tuning (50k), and anti-identity fine-tuning (10k). We use AdamW with
cosine learning-rate scheduling, bf16 precision, and EMA at $0.9999$.
Classifier-free guidance is fixed at $s=1$: higher values increase apparent
anatomical change but reduce agreement with true progression.

\section{Experiments and Results}
\label{sec:exp}

\subsection{Data and Evaluation Protocol}
\label{subsec:data}
We evaluate on T1-weighted ADNI MRI~\cite{adni}, preprocessed with skull stripping to remove non-brain tissue (SynthStrip~\cite{synthstrip}), N4 bias correction for intensity inhomogeneity, and affine alignment to the MNI152 template for spatial standardisation using ANTs~\cite{ants}. We avoid nonlinear registration, as it would suppress longitudinal atrophy signals. All scans are resampled to $256^3$ at $1\,\mathrm{mm}$ resolution and rigidly aligned to each subject’s baseline (intra-subject, pose only). From $1{,}229$ subjects, we construct all ordered visit pairs with $\Delta t \ge 6$ months, yielding $8{,}499$ training and $1{,}372$ validation pairs, split $70/15/15$ by subject: every pair from a given subject falls in exactly one split.

We report all metrics against the \emph{identity floor} $\hat{\x}_1=\x_0$, evaluated through the same pipeline. For biomarker evaluation, we sample 100 held-out pairs, decode predictions and ground truth, and segment both using FastSurfer. We report per-ROI metrics, including volume MAE, win rate (fraction of samples beating the identity floor), Pearson correlation of predicted versus true volumes, and change-correlation relative to baseline.

We explicitly avoid SSIM, PSNR, and FID as primary metrics, as they cannot distinguish progression from identity. Model selection is based on change-correlation rather than validation reconstruction error, since low MSE correlates with identity collapse.

\subsection{Biomarker Preservation through the Encode–Decode Round-Trip}
\label{subsec:icc}
Our Step~1 biomarker-preservation requirement that ROI volumes are preserved through the VAE encode-decode round-trip ($\mathrm{ICC}>0.95$) is satisfied. On
$100$ held-out MCI scans ($88$ subjects), decoder fine-tuning improves the
original-versus-round-trip $\mathrm{ICC}(2,1)$ from $0.949$ to $0.997$
(hippocampus), $0.991$ to $1.000$ (ventricles), and $0.652$ to $0.971$
(whole brain), exceeding the $0.95$ gate on every ROI. The latent space thus preserves structural information relevant to biomarkers, reducing the risk that identity collapse is masked by the autoencoder.

\subsection{Beating the Identity Floor}
\label{subsec:floor}
Table~\ref{tab:floor} reports the main quantitative result and ablates the anti-identity losses. The bridge alone already beats the identity floor on ventricle and whole-brain volume by wide margins, predicting genuine, biologically coherent change rather than copying the baseline. The hippocampus is the most challenging region, where the bridge alone underperforms the identity floor (win rate $0.38$), reflecting its low signal-to-noise ratio. Adding anti-identity supervision reduces MAE on all three ROIs, with the strongest gains in hippocampal accuracy and win rate; the ventricle win rate is essentially unchanged ($0.85$ vs.\ $0.83$). The full model then consistently outperforms the identity floor on all biomarkers. Crucially, this gain is invisible to image-quality metrics: all three models are near-indistinguishable in whole-volume SSIM (within $0.001$; Table~\ref{tab:floor}), so SSIM alone would rate a baseline copy as highly as a genuinely informative prediction, exactly the silent failure our biomarker protocol exposes.

\begin{table}[t]
\centering
\caption{Biomarker volume MAE (mm$^3$) on $100$ held-out pairs at guidance
scale $1$, against the identity floor. ``I\textsuperscript{2}SB'' is the bridge
alone, ``$+$anti-identity'' adds the three anti-identity losses; \emph{win} is
the fraction of pairs beating the floor. Final row:
each model's whole-volume SSIM to the true follow-up.}
\label{tab:floor}
\small
\setlength{\tabcolsep}{4pt}
\begin{tabular}{l c cc cc}
\toprule
 & Identity & \multicolumn{2}{c}{I\textsuperscript{2}SB} &
   \multicolumn{2}{c}{$+$anti-identity} \\
\cmidrule(lr){3-4}\cmidrule(lr){5-6}
ROI & MAE$\downarrow$ & MAE$\downarrow$ & win$\uparrow$ & MAE$\downarrow$ & win$\uparrow$ \\
\midrule
Hippocampus        & 340   & 358   & 0.38 & \textbf{280}   & \textbf{0.73} \\
Ventricles         & 4718  & 2689  & \textbf{0.85} & \textbf{2436}  & 0.83 \\
Whole brain        & 17879 & 14550 & 0.59 & \textbf{12680} & \textbf{0.69} \\
\midrule
Whole-volume SSIM\,$\uparrow$ & $0.989$ & \multicolumn{2}{c}{$0.990$} & \multicolumn{2}{c}{$0.990$} \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Conditional Response to $\Dt$}
\label{subsec:response}
We test whether the model truly responds to the requested horizon: as $\Delta t$ grows, the predicted atrophy should advance in the clinically correct direction and grow in magnitude. We fix eight baselines spanning different diagnostic stages and vary only $\Delta t \in [6,60]$ months (Table~\ref{tab:response}). In every case the predicted change has the correct sign (hippocampus contracting, ventricles expanding) and grows with $\Delta t$: the rank correlation with $\Delta t$ is perfect for ventricles and whole brain ($\rho=1.00$) and strong for the hippocampus ($\rho=0.84$), whereas an identity predictor is $\Delta t$-invariant. Doubling $\Delta t$ roughly doubles ventricular change ($1.99\times$) but under-scales hippocampal ($1.63\times$) and whole-brain ($1.43\times$) change, a conservative bias in high-variability regions consistent with real-data fine-tuning dominating the synthetic prior rather than over-amplifying change.

\begin{table}[t]
\centering
\caption{Response to the conditioning horizon $\Delta t$.
Monotonicity is the rank correlation of ROI volume change with $\Delta t$;
scaling is the change ratio when $\Delta t$ is doubled.}
\label{tab:response}
\small
\setlength{\tabcolsep}{6pt}
\begin{tabular}{lccc}
\toprule
ROI & Direction acc. & Monotonicity ($\rho$) & $2\times\Delta t$ scaling \\
\midrule
Hippocampus & $100\%$ & $0.84$ & $1.63\times$ \\
Ventricles  & $100\%$ & $1.00$ & $1.99\times$ \\
Whole brain & $100\%$ & $1.00$ & $1.43\times$ \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Progression Correlation}
\label{subsec:corr}
Absolute correlation between predicted and true follow-up volumes is high (hippocampus $0.979$; CN/MCI/AD: $0.965/0.985/0.934$), but close to the identity floor ($0.975$), reflecting strong dependence on inter-subject anatomy and limited sensitivity to progression. We therefore report change-correlation (predicted vs.\ true change from baseline), zero for the identity predictor. The model achieves $0.48$ (hippocampus), $0.67$ (ventricles), and $0.33$ (whole brain), a moderate but consistent progression signal, strongest in ventricular expansion. Absolute correlation alone would thus overestimate performance, staying high even for identity; change-correlation is the discriminating measure relevant for the application.

\section{Discussion and Conclusion}
\label{sec:discussion}
Our conditional latent Schr\"odinger bridge, with three anti-identity
losses and a collapse-aware protocol (identity floor and $\Dt$-response
test), targets a rarely measured failure mode: on near-identical
longitudinal scans, regressors collapse to copying the baseline, undetected
by image-quality metrics. On held-out ADNI subjects it consistently beats the
identity floor on all biomarkers, with correct monotonic $\Dt$ responses. The
protocol is reusable: any longitudinal generative model can be tested against
an identity floor and a $\Dt$-response test.

\medskip\noindent\textbf{Limitations and future work.}
The model captures the direction and ordering of change but underestimates its magnitude: hippocampal change scales sub-linearly with $\Dt$ and change-correlation is only moderate. This reflects the small-target regime of the MSE bridge objective on near-identity pairs; increasing the bridge noise scale $\sigma_{\max}$ is a straightforward way to reduce this bias. The study is ADNI-only and internally validated. A leakage-free head-to-head with published models such as BrLP~\cite{brlp} would require retraining them on our split with matched preprocessing. Future work includes that comparison, external-cohort evaluation, iterative Schr\"odinger bridge refinement, and classifier-based transfer evaluation.

\begin{credits}
\subsubsection{\discintname}
The authors have no competing interests to declare that are relevant to the
content of this article.
\end{credits}

\bibliographystyle{splncs04}
\bibliography{refs}

\end{document}
