\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{multirow} % to get dummy images
\usepackage{booktabs}
\jmlrvolume{-- 086}
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\editors{Accepted for publication at MIDL 2026}

\usepackage{amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,arrows,positioning,calc, spy, shadows}
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\D}{\mathcal{D}}
\newcommand{\Source}{\mathcal{S}}
\newcommand{\New}{\text{New}}

\newcommand{\rev}[1]{#1}

\title[IRTTA]{Exploiting Intermediate Reconstructions in Optical Coherence Tomography for Test-Time Adaptation of Medical Image Segmentation }


\midlauthor{\Name{Thomas Pinetz
\nametag{$^{1}$}} \orcid{0000-0002-6100-2136} \Email{thomas.pinetz@meduniwien.ac.at}\\
\Name{Veit Hucke
\nametag{$^{1}$}}\orcid{0009-0002-1519-1252}\Email{veit.hucke@meduniwien.ac.at}\\
\Name{Hrvoje Bogunovi\'c
\nametag{$^{1}$}} \orcid{0000-0002-9168-0894} \Email{hrvoje.bogunovic@meduniwien.ac.at}\\
\addr $^{1}$ Institute of Artificial Intelligence, Center for Medical Data Science, Medical University of Vienna, Austria\\
}

\begin{document}

\maketitle

\begin{abstract}
Primary health care frequently relies on low-cost imaging devices, which are commonly used for screening purposes. 
To ensure accurate diagnosis, these systems depend on advanced reconstruction algorithms designed to approximate the performance of high-quality counterparts. 
Such algorithms typically employ iterative reconstruction methods that incorporate domain-specific prior knowledge.  
However, downstream task performance is generally assessed using only the final reconstructed image, thereby disregarding the informative intermediate representations generated throughout the reconstruction process.
In this work, we propose IRTTA to exploit these intermediate representations at test-time by adapting the normalization-layer parameters of a frozen downstream network via a modulator network that conditions on the current reconstruction time step.
The modulator network is learned during test-time using an averaged entropy loss across all individual timesteps.
Variation among the timestep-wise segmentations additionally provides uncertainty estimates at no extra cost.
This approach enhances segmentation performance and enables semantically meaningful uncertainty estimation, all without modifying either the reconstruction process or the downstream model.
Code is available here\footnote{\url{https://github.com/tpinetz/domain_adaption_by_iterative_reconstruction}}.
\end{abstract}

\begin{keywords}
Diffusion, Test-Time Adaptation, Uncertainty Estimation, Optical Coherence Tomography  
\end{keywords}

\section{Introduction}

Medical image segmentation is a cornerstone of diagnostic interpretation, with deep learning enabling significant improvements in the robust quantification of biomarkers~\cite{FaAr22,HeBu23}. 
Yet, these models are trained primarily on curated, high-fidelity datasets originating from university hospitals~\cite{TrGa23}. 
Consequently, their generalization capability often suffers during clinical translation to low-cost imaging hardware~\cite{VaCh22}.

Recent developments in medical image reconstruction offer a potential solution by enhancing image fidelity and accessibility~\cite{McRa23}. 
While reconstruction was traditionally formulated as an inverse problem solved via iterative optimization of hand-crafted priors~\cite{FeLu12,SiLa10}, the paradigm has shifted toward data-driven methods~\cite{HaKl16}. 
State-of-the-art approaches now leverage powerful generative frameworks, most notably diffusion~\cite{FaPi25,LiHi24}, energy-based~\cite{ZaKn23}, autoregressive~\cite{WaPa23}, and flow matching models~\cite{YaMe25}.

Although these generative models have markedly increased reconstruction quality, they still operate iteratively, refining predictions over multiple steps. 
Moreover, they follow a predetermined time schedule loosely corresponding to the distance from the high-quality domain.
Despite this, common evaluation protocols involving downstream tasks, such as biomarker segmentation, rely almost exclusively on the final reconstructed image~\cite{WaPa23,DoVa25,TiMc25}.
While the final reconstructed image is certainly valuable for clinicians, this standard practice neglects the rich information available across the iterative trajectory. Recently, \cite{JeCh25} incorporated the segmentation objectives during training of the reconstruction network; however, this requires jointly training both.
We hypothesize that standard segmentation networks, even those trained solely on high-quality data, can leverage the intermediate reconstructions of the anatomy to enhance performance without any labeled data.
\rev{While the visual structure evolves significantly throughout the reconstruction trajectory, we observe strong structural consistency across different subjects at any given timepoint.
Coupled with the inherent loss of fine detail during reconstruction, we expect that only minor adaptations to the segmentation network are necessary to handle these time-specific distributions.
By analyzing this trajectory of segmentation maps, we aim to provide meaningful semantic uncertainty.}

In this work, we propose \textbf{I}ntermediate \textbf{R}econstruction for \textbf{T}est-\textbf{T}ime \textbf{A}daptation (\textbf{IRTTA}), a method that modulates an existing segmentation network based on the reconstruction process's time schedule. \rev{To this end, }we introduce a modulation network that adjusts the downstream model's normalization parameters\rev{, given the current timepoint in the reconstruction, and} without altering its frozen weights. Addressing the challenges of adaptation without ground truth, we employ a zero-initialization strategy to preserve original performance at the onset and optimize the modulation via entropy minimization~\cite{WaSh21}.
Crucially, our approach yields multiple predictions per input, enabling uncertainty estimation without requiring additional training of the backbone model.

We further demonstrate the resulting uncertainty estimates and improved segmentation performance on retinal Optical Coherence Tomography (OCT) data~\cite{BoVe19} acquired from three different devices, one of which provides substantially higher signal-to-noise ratio (SNR) than the others.
Given that OCT segmentation models are known to be sensitive to intensity histogram shifts~\cite{LuMa25}, adapting normalization layers is expected to be particularly effective for this modality.
Our contributions can be summarized as follows:
\begin{itemize}
    \item \textbf{Novel Modulation Framework:} We propose a method to improve the downstream performance of reconstruction models by exploiting the full reconstruction trajectory.
    \item \textbf{Zero-Shot Uncertainty Estimation:} We provide a mechanism for semantically meaningful uncertainty estimation in pre-trained models without the need for retraining or architectural modification.
    \item \textbf{State-of-the-Art Adaptation:} We achieve superior performance in test-time adaptation for segmentation tasks compared to existing baselines.
\end{itemize}


\begin{figure}[ht]
\includegraphics[width=\textwidth]{imgs/teaser.pdf}
\caption{A schematic overview of our proposed method IRTTA. The reconstructions $x_0,\ldots,x_{S-1}$ differ in appearance along the trajectory. \rev{Initially, the weights and biases of the final layer in $g_\Psi$ are set to $0$, which is highlighted with bold arrows. Hence, the frozen backbone $f_\theta$ retains its performance at initialization and is adapted without labels.}}
\label{fig:teaser}
\end{figure}

\section{Method}


\subsection{Problem Formulation}
We consider an inverse problem setup where a measurement $m \in \R^M$ is transformed into an image estimate via an iterative reconstruction process. This process yields a sequence of $S$ discrete reconstructions $\mathbf{x} = (x_0, x_1, \ldots, x_{S-1})$, where each $x_i \in \R^d$ represents the estimated image at step $i$, approximating samples from our source domain $\D_\Source$.

We assume access to a pre-trained segmentation network $f_\theta: \mathbb{R}^d \to \mathbb{R}^{d \times C}$, parameterized by weights $\theta$, which maps an input image to dense probability maps over $C$ classes. This network is trained on data from $\D_\Source$. Our goal is to approximate the ground truth segmentation $y$ by taking advantage of the rich semantic information contained within the entire reconstruction trajectory $\mathbf{x}$, rather than relying solely on the final output $x_{S-1}$.

To achieve this, we introduce a temporal modulation network $g_\Psi(t)$, parameterized by $\Psi$. This network conditions the normalization statistics of $f_\theta$ on the reconstruction time-step $t$. We adopt a test-time adaptation strategy where $g_\Psi$ is optimized via unsupervised entropy minimization, ensuring the segmentation network adapts to the reconstruction dynamics without altering the frozen weights $\theta$. The overall framework is illustrated in Figure~\ref{fig:teaser}.


\subsection{Iterative Reconstruction via Diffusion}
While our framework is agnostic to the specific generative model, we formulate our method in the context of diffusion models, which represent the current state-of-the-art in medical image reconstruction~\cite{LiHi24, FaPi25}.

We utilize a diffusion model $\epsilon_\phi(x,t):\R^d\times\R^+\to\R^d$ defined by a time schedule $t \in [0, T]$, discretized into $S$ steps. The reconstruction process generates a trajectory of iterates $\{x_i\}_{i=0}^{S-1}$ corresponding to time points $t_i$. In this context, $t_i$ typically correlates with the distance from the data manifold.
Unlike standard generation, in the reconstruction setting, the trajectory is conditioned on the measurement $m$ using a data-consistency term $\mathcal{D}(x, m):\R^d\times\R^M\to\R$ to ensure fidelity to the measurement~\cite{PiKo21,ChKi23,LiHi24}.
Otherwise, it follows the diffusion specific update step denoted as $\text{Update}$, which depends on the noise schedule in the chosen model~\cite{KaAi22}:
\begin{align}
    \bar x = \epsilon_\phi(z_i, t_i) \notag\\
    x_i = \bar x - \tau \nabla\mathcal{D}(\bar x, m) \notag\\ 
    z_{i+1} \leftarrow \text{Update}(x_i, t_i).
\end{align}
The starting point $z_0$ is either pure noise~\cite{ChKi23}, the pseudoinverse of the measurement~\cite{LiHi24,FaPi25} or a mixture thereof~\cite{GaZh23}.
We extract the intermediate images $x_i$ after the data-consistency projection, treating them as inputs for the downstream segmentation task.

\subsection{Test-Time Modulation Network}
\rev{The general idea is that the appearance of the images changes during the reconstruction, and small adaptations are necessary for the segmentation network to perform well along this trajectory.
Furthermore, small details might be lost during the reconstruction, which in turn could be useful for the downstream task.
For this reason, we adapt the segmentation network $f_\theta$ across the trajectory by injecting time-dependent modulation into its normalization layers (e.g., BatchNorm or LayerNorm).}
The modulation network $g_\Psi$ accepts the current time-step $t_i$ encoded via sinusoidal embeddings, similar to standard diffusion architectures~\cite{SoMe21}. The embedding is processed by a Multi-Layer Perceptron (MLP) consisting of two layers with Swish activation. The network $g_\Psi(t_i)$ predicts a set of modulation parameters $(\gamma, \beta)$ for each normalization layer in the backbone $f_\theta$.
Let $\bar{x} \in \mathbb{R}^{B \times C \times H \times W}$ denote the output of a standard normalization layer in the frozen network $f_\theta$, defined as:
\begin{equation}
    \bar{x} = \bar{\gamma} \cdot \frac{x_{in} - \mu}{\sigma} + \bar{\beta},
\end{equation}
where $\mu, \sigma$ are the running statistics and $\bar{\gamma}, \bar{\beta}$ are the frozen affine parameters.
We apply the learned modulation as a residual affine transformation:
\begin{equation}
    \label{eq:modulation}
    \bar{z} = e^\gamma \odot \bar{x} + \beta.
\end{equation}

\rev{Inspired by the zero convolutions used in ControlNet~\cite{ZhRa23}, we also intend for the initialization to retain the performance of the original segmentation network. Therefore, the weight and bias of the final layer of $g_\Psi$ are initialized to $0$. However, with a plain multiplicative scaling $\gamma \odot \bar{x}$, this would set all features to $0$; therefore, we model the scaling factor in log-space ($e^\gamma$).
This has two distinct advantages:}

\begin{enumerate}
    \item \textbf{Sign Stability:} It ensures the scaling factor remains positive, preventing arbitrary sign flips of the features.
    \item \textbf{Identity Initialization:} By initializing the final projection layer of $g_\Psi$ to 0, we obtain $\bar{z} = 1\cdot \bar{x} + 0$, which reproduces the original pre-trained performance exactly.
\end{enumerate}


\subsection{Optimization and Uncertainty Estimation}
During inference, given a measurement $m$, we perform the reconstruction to obtain pairs $(x_i, t_i)$. We adapt $\Psi$ by minimizing the prediction entropy across the trajectory. Let $\hat{y}_i = \text{Softmax}(f_{\theta, \Psi}(x_i)) \in \mathbb{R}^{d \times C}$ be the soft prediction at step $i$. The unsupervised objective is:
\begin{equation}
    \mathcal{L}(\Psi) = -\sum_{i=0}^{S-1} \frac{1}{d} \sum_{p=1}^{d} \sum_{c=1}^C \hat{y}_{i, p, c} \log(\hat{y}_{i, p, c}),
\end{equation}
where the inner sums compute the spatial entropy of the prediction.

\textbf{Inference and Uncertainty:}
After adaptation, we compute the ensemble mean prediction $\hat{y}_\mu = \frac{1}{S} \sum_{i=0}^{S-1} \hat{y}_i$. The final semantic segmentation is obtained via $\text{argmax}(\hat{y}_\mu)$.
Furthermore, the pixel-wise entropy of the mean prediction, $H(\hat{y}_\mu)$, serves as a semantic uncertainty map. As shown in Figure~\ref{fig:teaser}, regions of high entropy (bright) correlate with ambiguous anatomical structures.


\section{Results}

In this section, we detail our experimental setup, baselines, and quantitative results. We further provide an ablation study on the modulation architecture and analyze the uncertainty estimation capabilities of our method IRTTA.

\subsection{Experimental Setup}

\subsubsection{Dataset and Preprocessing}
We utilize the RETOUCH benchmark~\cite{BoVe19}, which comprises OCT volumes from three different device manufacturers: Cirrus, Topcon, and Spectralis. The Spectralis device is characterized by a notably higher Signal-to-Noise Ratio (SNR) and serves as our reference target domain for high-quality imaging. The dataset includes pixel-wise annotations for fluid-related biomarkers (Intraretinal Fluid (IRF), Subretinal Fluid (SRF), and Pigment Epithelial Detachment (PED)), which are critical biomarkers for exudative macular diseases such as neovascular Age-related Macular Degeneration (AMD). Due to these properties, RETOUCH is a standard benchmark for domain adaptation in medical imaging~\cite{KoHo22,GoKi25}.
For standardization, all B-scans were resized to $512\times 512$ pixels.


\subsubsection{Implementation Details}
We employ a standard U-Net with a ResNet-18 encoder\footnote{\url{https://github.com/qubvel-org/segmentation_models.pytorch}} as our downstream segmentation backbone. The network is optimized using Adam with an initial learning rate of $10^{-3}$, a batch size of 8, for 400,000 iterations. We employ cosine annealing to decay the learning rate to $10^{-6}$. Data augmentation includes random 90-degree rotations, shifts, elastic transformations, and contrast adjustments via Albumentations~\cite{BuIg20}. For the proposed test-time modulation, we freeze the backbone and optimize only the modulation parameters $\Psi$ using Adam with a learning rate of $10^{-5}$. The GARD\footnote{\url{https://github.com/ABotond/GARD}} model was used as our diffusion reconstruction model for Spectralis \rev{ with number of reconstructions $S=10$}. Hyperparameters were selected via the ablation study detailed in Section~\ref{sec:ablation}. 

Following the RETOUCH protocol~\cite{BoVe19}, we report the Dice Similarity Coefficient (DSC) evaluated on the full 3D OCT volumes. To avoid biasing the metric towards background predictions (predicting all zeros), evaluation is restricted to cases where fluid is physically present, consistent with standard medical imaging practices.

\subsection{Baselines and Comparison Methods}

We compare our approach against three categories of methods:
\begin{enumerate}
    \item \textbf{General Denoising:} We evaluate SCUNet~\cite{ZhLi23}, assuming that domain shifts in OCT are primarily driven by noise characteristics.
    \item \textbf{Unsupervised Domain Adaptation (UDA):} We compare with SVDNA~\cite{KoHo22} and SegClr~\cite{GoKi25}. For these methods, we do 4-fold cross validation to evaluate on the same cases as the other methods. Note that these methods require access to the source domain during training.
    \item \textbf{Test-Time Adaptation (TTA):} We compare against TENT~\cite{WaSh21}, \rev{CoTTA~\cite{WaFi22}} (\rev{both} adapting normalization stats), Energy-based adaptation~\cite{ZaKn23} and two diffusion based approaches (DDA~\cite{GaZh23} and what we denote as EDM~\cite{ChKi23}). For both methods, we utilized a standard diffusion model implemented via EDM~\cite{KaAi22} trained on the Spectralis dataset, as GARD uses a diffusion process based on gamma distribution~\cite{NaRo21} which is non-trivial to adapt.
\end{enumerate}
Additionally, we report an \textbf{Oracle} (Supervised) model, where the segmentation network is trained directly on the target domain labels using the same cross validation scheme. We also include a supervised version of our method (IRTTA$_{sup}$) to quantify the theoretical limit of our architecture.

\subsection{Quantitative Analysis}
The results for the Cirrus $\to$ Spectralis adaptation task are presented in Table~\ref{tab:results_retouch_cirrus}. All adaptation methods yield improvements over the baseline. Among the TTA frameworks, our proposed method achieves the highest mean Dice score ($0.603$), outperforming our baseline GARD~\cite{FaPi25} ($0.553$) and the generic denoiser SCUNet ($0.551$).
GARD was specifically developed for OCT images, and hence already outperforms competing methods.

Notably, SCUNet's strong performance suggests that the domain gap is largely dominated by noise levels. However, our method outperforms SVDNA~\cite{KoHo22}, despite SVDNA having access to Cirrus data during training. 
While our unsupervised approach performs comparably to the best UDA methods, the gap between IRTTA and IRTTA$_{sup}$ (0.603 vs 0.645) indicates that while the normalization adaptation is effective, the unsupervised entropy loss does not fully recover the information available to a supervised signal.

Table~\ref{tab:results_retouch_topcon} presents the results for the Topcon $\to$ Spectralis task. Here, our method demonstrates strong generalizability, achieving the highest performance among TTA methods ($0.444$) using the same hyperparameters as the Cirrus experiments. A qualitative comparison is provided in Figure~\ref{fig:reconstruction_gallery}, which highlights the typical differences to the baseline GARD as observed in the dataset.
The first row shows an example, where the segmentation is very similar, which is the usual case.
In the next two rows, there are subtle changes, where small lesions are added or connected.
In the final row an artifact produced by the diffusion process is removed.



\begin{figure}[ht!]
\begin{tikzpicture}[spy using outlines={
        magnification=2, 
        size=0.8cm, 
        every spy on node/.append style={thick, red},    % Style of the target circle
        every spy in node/.append style={thick, blue}   % Style of the glass
    },
    imgnode/.style={
        minimum width=2cm,  % Define standard width for the images
        minimum height=2cm, % Define standard height for the images
        inner sep=0pt       % No padding around the included graphic
    }
]

% --- Define the list of methods to iterate over ---
\def\MethodList{gt/$m$, orig_seg/baseline, dda/DDA, segclr/SegClr, seg_9/GARD, mean_prediction/IRTTA, gt_seg/target $y$} 
\def\PatList{topcon_pat_15_35,topcon_pat_16_69, pat_16_60, pat_18_90}

\foreach \patient [count=\patpos starting from 0] in \PatList{
    % \colpos = Column Index (0, 1, 2...)
    \foreach \methodname/\visname [count=\colpos starting from 0] in \MethodList {
         
         \edef\filename{\methodname.png}
        % 1. Place the Node
        % WE NAME THE NODE USING INTEGERS: (Img-\colpos-\patpos)
        % This prevents errors with underscores in \methodname
        % \begin{scope}[spy using outlines={rectangle, draw=white, magnification=2, size=1cm}]
        \node[imgnode] (Img-\colpos-\patpos) at (\colpos*2.2, \patpos*2.2) {
            % Replaced with placeholder for demo. Uncomment your line below:
            \includegraphics[width=2cm, height=2cm, trim={0cm 4cm 0cm 2cm}, clip]{imgs/\patient/\filename}
        };
            % \spy on (2.5*\xi+.4,-0.9) in node at (2.5*\xi-.75,+1.05);
            % \end{scope}
        
        % 2. Add Label (Only for the last row, assuming 4 rows indexed 0-3)
        \ifnum\patpos=0 
            \node[below=3pt] at (Img-\colpos-\patpos.south) {\textbf{\visname}};
        \fi
        \ifnum\patpos=1
            \spy on ([shift={(0.,0.5)}]Img-0-1.center) 
                in node at (Img-0-1.south east);
            
            \spy on ([shift={(0.,0.5)}]Img-1-1.center) 
                in node at (Img-1-1.south east);
            
            \spy on ([shift={(0.,0.5)}]Img-2-1.center) 
                in node at (Img-2-1.south east);
            
            \spy on ([shift={(0.,0.5)}]Img-3-1.center) 
                in node at (Img-3-1.south east);
            
            \spy on ([shift={(0.,0.5)}]Img-4-1.center) 
                in node at (Img-4-1.south east);
            
            \spy on ([shift={(0.,0.5)}]Img-5-1.center) 
                in node at (Img-5-1.south east);
            
            \spy on ([shift={(0.,0.5)}]Img-6-1.center) 
                in node at (Img-6-1.south east);
        \fi
        \ifnum\patpos=2
            \spy on ([shift={(-0.1,0.1)}]Img-0-2.center) 
                in node at (Img-0-2.south east);
            
            \spy on ([shift={(-0.1,0.1)}]Img-1-2.center) 
                in node at (Img-1-2.south east);
            
            \spy on ([shift={(-0.1,0.1)}]Img-2-2.center) 
                in node at (Img-2-2.south east);
            
            \spy on ([shift={(-0.1,0.1)}]Img-3-2.center) 
                in node at (Img-3-2.south east);
            
            \spy on ([shift={(-0.1,0.1)}]Img-4-2.center) 
                in node at (Img-4-2.south east);
            
            \spy on ([shift={(-0.1,0.1)}]Img-5-2.center) 
                in node at (Img-5-2.south east);
            
            \spy on ([shift={(-0.1,0.1)}]Img-6-2.center) 
                in node at (Img-6-2.south east);
        \fi

        \ifnum\patpos=3
            \spy on ([shift={(-0.1,0.1)}]Img-0-3.center) 
                in node at (Img-0-3.south east);
            
            \spy on ([shift={(0.2,0.1)}]Img-1-3.center) 
                in node at (Img-1-3.south east);
            
            \spy on ([shift={(0.2,0.1)}]Img-2-3.center) 
                in node at (Img-2-3.south east);
            
            \spy on ([shift={(0.2,0.1)}]Img-3-3.center) 
                in node at (Img-3-3.south east);
            
            \spy on ([shift={(0.2,0.1)}]Img-4-3.center) 
                in node at (Img-4-3.south east);
            
            \spy on ([shift={(0.2,0.1)}]Img-5-3.center) 
                in node at (Img-5-3.south east);
            
            \spy on ([shift={(0.2,0.1)}]Img-6-3.center) 
                in node at (Img-6-3.south east);
        \fi
        
    }
% Indices 0 and 1 are Topcon (Bottom rows in Cartesian coords)
% Y-Midpoint for 0 and 1: (0 + 2.2)/2 = 1.1
\node[rotate=90, anchor=south, font=\large] at (-1., 1.1) {Topcon};

% Indices 2 and 3 are Cirrus (Top rows in Cartesian coords)
% Y-Midpoint for 2 and 3: (4.4 + 6.6)/2 = 5.5
\node[rotate=90, anchor=south, font=\large] at (-1., 5.5) {Cirrus};
}
\end{tikzpicture}
\caption{Visual comparison of downstream results using different methods. The top and bottom two rows show examples from the Cirrus and Topcon datasets respectively.}
\label{fig:reconstruction_gallery}
\end{figure}



\begin{table}[ht]
\centering
\caption{Comparison of downstream performance on RETOUCH Cirrus $\to$ Spectralis~\cite{BoVe19}.
SVDNA and SegClr use Cirrus data during the training phase of the downstream network without labels.}

\begin{tabular}{l l c c c c}
\toprule
\multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{Venue}} & \multicolumn{4}{c}{\textbf{DICE}} \\
\cmidrule(lr){3-6}
 & & \textbf{IRF} & \textbf{SRF} & \textbf{PED} & \textbf{Mean}  \\
\midrule
Baseline & - & $0.463\pm 0.29$ & $0.368\pm 0.24$ & $0.267\pm 0.27$ & $0.366$  \\
\hline
SCUNET~\cite{ZhLi23}    & MLR         & $0.563\pm 0.29$ & $\underline{0.617}\pm 0.17$ & $0.474\pm 0.30$ & $0.551$ \\
TENT~\cite{WaSh21}      & ICLR        & $0.548\pm 0.21$ & $0.492\pm 0.25$ & $0.234\pm 0.23$ & $0.425$ \\
\rev{CoTTA~\cite{WaFi22}} & \rev{CVPR} & $\textbf{0.585}\pm 0.18$ & $0.544\pm 0.23$ & $0.282 \pm 0.24$ & $0.470$ \\
Energy~\cite{ZaKn23}    & TMI         & $0.429\pm 0.29$ & $0.525\pm 0.20$ & $0.369\pm 0.28$ & $0.441$ \\
DDA~\cite{GaZh23}       & CVPR        & $0.429\pm 0.29$ & $0.525\pm 0.20$ & $0.368\pm 0.28$ & $0.441$ \\
EDM~\cite{ChKi23}       & ICLR        & $0.281\pm 0.23$ & $0.378\pm 0.18$ & $0.476\pm 0.18$ & $0.378$ \\
GARD~\cite{FaPi25}      & MICCAI      & $0.563\pm 0.23$ & $0.588\pm 0.21$ & $\underline{0.509}\pm 0.25$ & $\underline{0.553}$ \\
IRTTA                    & -           & $\underline{0.581}\pm 0.23$ & $\textbf{0.709}\pm 0.15$ & $\textbf{0.517}\pm 0.27$ & $\textbf{0.603}$ \\
\hline
SVDNA~\cite{KoHo22}     & MICCAI      & $0.513\pm 0.29$ & $0.478\pm 0.28$ & $0.308\pm 0.32$ & $0.433$ \\
SegClr~\cite{GoKi25}    & MIA         & $0.562\pm 0.26$ & $0.578\pm 0.24$ & $0.397\pm 0.29$ & $0.512$ \\
\hline
IRTTA$_{sup}$            & -           & $0.686\pm 0.21$ & $0.775\pm 0.09$ & $0.474\pm 0.30$ & $0.645$\\
Supervised              & -           & $0.686\pm 0.20$ & $0.667\pm 0.21$ & $0.517\pm 0.32$ & $0.623$ \\
\end{tabular}
\label{tab:results_retouch_cirrus}
\end{table}


\begin{table}[ht]
\centering
\caption{Comparison of downstream performance on RETOUCH Topcon $\to$ Spectralis ~\cite{BoVe19}.
SVDNA and SegClr use Topcon data during the training phase of the downstream network without labels.}

\begin{tabular}{l l c c c c}
\toprule
\multirow{2}{*}{\textbf{Method}} & \multirow{2}{*}{\textbf{Venue}} & \multicolumn{4}{c}{\textbf{DICE}} \\
\cmidrule(lr){3-6}
 & & \textbf{IRF} & \textbf{SRF} & \textbf{PED} & \textbf{Mean} \\
\midrule
Baseline & - & $0.475\pm 0.27$ & $0.282\pm 0.21$ & $0.398\pm 0.16$ & $0.385$ \\
\hline
SCUNET~\cite{ZhLi23}    & MLR         & $0.446\pm 0.26$ & $0.213\pm 0.18$ & $\textbf{0.518}\pm 0.14$ & $0.393$ \\
DDA~\cite{GaZh23}       & CVPR        & $0.461\pm 0.27$ & $0.362\pm 0.27$ & $0.384\pm 0.22$ & $0.402$ \\
EDM~\cite{ChKi23}       & ICLR        & $0.294\pm 0.24$ & $0.326\pm 0.19$ & $\underline{0.504}\pm 0.17$ & $0.375$ \\
Energy~\cite{ZaKn23}    & TMI         & $0.464 \pm 0.26$ & $0.241 \pm 0.16$ & $0.478 \pm 0.14$ & $0.394$ \\
TENT~\cite{WaSh21}      & ICLR        & $\underline{0.553} \pm 0.25$ & $0.362\pm 0.29$ & $0.289\pm 0.22$ & $0.401$ \\
\rev{CoTTA~\cite{WaFi22}} & \rev{CVPR} & $\textbf{0.567} \pm 0.25$ & $\textbf{0.400} \pm 0.30$ &  $0.302 \pm 0.22$ & $\underline{0.423}$ \\
GARD~\cite{FaPi25}      & MICCAI      & $0.502\pm 0.26$ & $0.271 \pm 0.16$ & $0.488 \pm 0.15$ &  $0.420$ \\
IRTTA                    & -           & $0.507\pm 0.27$ & $\underline{0.377}\pm 0.25$ & $0.447\pm 0.15$ & $\textbf{0.444}$ \\
\hline
SVDNA~\cite{KoHo22}     & MICCAI      & $0.488 \pm 0.25$ & $0.419\pm 0.26$ & $0.438\pm 0.20$ & $0.448$ \\
SegClr~\cite{GoKi25}    & MIA         & $0.582\pm 0.24$ & $0.487\pm 0.28$ & $0.483\pm 0.20$ & $0.517$ \\
\hline
IRTTA$_{sup}$            & -           & $0.526\pm 0.27$ & $0.353\pm 0.27$ & $0.534\pm 0.14$ & $0.471$ \\
Supervised & -                        & $0.625\pm 0.22$ & $0.481\pm 0.29$ & $0.521\pm 0.20$ & $0.542$ \\
\end{tabular}
\label{tab:results_retouch_topcon}
\end{table}


\subsection{Ablation Study}
\label{sec:ablation}

We investigate the impact of key architectural choices on the Cirrus$\to$Spectralis adaptation.

\textbf{Trajectory Adaptation:}
Table~\ref{tab:ablation_method} analyzes the benefit of temporal adaptation. ``Adapt only last'' restricts adaptation to the final reconstructed image ($x_{S-1}$), neglecting the trajectory. While this improves over the baseline, our full trajectory method yields superior results ($0.603$).
\rev{Similarly, ``Adapt without first'' excludes the first reconstruction from the adaptation, as the first step is the furthest from the target domain and introduces the most noise. The resulting Dice score is only slightly lower, so the first step is not crucial for the adaptation; nevertheless, including it still yields a small improvement and is therefore useful.}
We also compare adapting weights per individual volume (``Per Case'') versus sharing adaptation weights across the test set (``Per Dataset''). The negligible performance difference suggests that a single 3D volume contains sufficient statistical diversity to drive the adaptation process effectively.

\textbf{Hyperparameter Sensitivity:}
Table~\ref{tab:ablation_emb_size} suggests that the embedding size of the modulation network is not a critical hyperparameter, though performance degrades with excessively large embeddings ($\geq 64$), likely due to overfitting the limited modulation signal.
Conversely, increasing the number of adaptation steps initially yields minor performance gains (Table~\ref{tab:ablation_steps}). These improvements saturate at 100 steps, after which performance deteriorates.
\rev{Similarly, increasing the number of reconstructions improves the performance and saturates at $S=10$ and drops slightly with $S=20$ (see Table~\ref{tab:ablation_recon_steps}). Furthermore, we added run time statistics for our approach with increasing reconstruction steps. The training itself is not a large bottleneck.}


\begin{table}[ht]
\centering
\caption{Ablation: Trajectory vs. Single-step adaptation (Cirrus).}
\begin{tabular}{l c c c c}
\toprule
\multirow{2}{*}{\textbf{Method}} & \multicolumn{4}{c}{\textbf{DICE}} \\
\cmidrule(lr){2-5}
 & \textbf{IRF} & \textbf{SRF} & \textbf{PED} & \textbf{Mean} \\
\hline
Adapt only last & $0.560\pm 0.24$ & $0.678\pm 0.17$ & $0.507\pm 0.26$ & $0.581$ \\
\rev{Adapt without first} & $0.575\pm 0.23$ & $0.697 \pm 0.16$ & $\textbf{0.519} \pm 0.26$ & $0.597$\\
Adapt per case & $0.570 \pm 0.23$ & $\textbf{0.711}\pm 0.15$ & $0.513 \pm 0.28$ & $0.598$ \\
Adapt per dataset & $\textbf{0.581} \pm 0.23$ & $0.709 \pm 0.15$ & $0.517 \pm 0.27$ & $\textbf{0.603}$ \\
\end{tabular}
\label{tab:ablation_method}
\end{table}

\begin{table}[ht]
\centering
\caption{Ablation: Size of the embedding vector (Cirrus).}
\begin{tabular}{l c c c c}
\toprule
\multirow{2}{*}{\textbf{Emb size}} & \multicolumn{4}{c}{\textbf{DICE}} \\
\cmidrule(lr){2-5}
 & \textbf{IRF} & \textbf{SRF} & \textbf{PED} & \textbf{Mean} \\
\hline
4 & $0.579 \pm 0.23$ & $0.697\pm0.16$ & $\textbf{0.531}\pm 0.26$ & $0.602$ \\
8  & $0.578\pm 0.23$ & $0.704 \pm 0.16$ & $0.526\pm 0.26$ & $\textbf{0.603}$\\
16 & $\textbf{0.581} \pm 0.23$ & $\textbf{0.709} \pm 0.15$ & $0.517 \pm 0.27$ & $\textbf{0.603}$ \\
32 & $0.572\pm 0.24$ & $0.709\pm 0.15$ & $0.516\pm 0.27$  & $0.599$ \\
64 & $0.550 \pm 0.25$ & $0.705 \pm 0.13$ & $0.480\pm 0.29$ & $0.578$\\
128 & $0.496\pm 0.26$ & $0.679\pm 0.15$ & $0.434\pm 0.32$ & $0.536$ \\
\bottomrule
\end{tabular}
\label{tab:ablation_emb_size}
\end{table}


\begin{table}[ht]
\centering
\caption{Ablation: Number of steps used in the adaptation (Cirrus).}
\begin{tabular}{l c c c c}
\toprule
\multirow{2}{*}{\textbf{Steps}} & \multicolumn{4}{c}{\textbf{DICE}} \\
\cmidrule(lr){2-5}
 & \textbf{IRF} & \textbf{SRF} & \textbf{PED} & \textbf{Mean} \\
\hline
1 & $0.579\pm 0.23$ & $0.670\pm 0.19$ & $0.541\pm 0.26$ & $0.597$ \\
10 & $0.580\pm 0.23$ & $0.674\pm 0.18$ & $\textbf{0.541}\pm 0.26$ & $0.598$ \\
50 & $0.580\pm0.23$ & $0.691\pm 0.17$ & $0.536\pm 0.26$ & $0.602$\\
100 & $\textbf{0.581} \pm 0.23$ & $\textbf{0.709} \pm 0.15$ & $0.517 \pm 0.27$ & $\textbf{0.603}$ \\
500 & $0.465\pm 0.26$ & $0.613\pm 0.20$ & $0.386\pm 0.33$ & $0.488$\\ 
\bottomrule
\end{tabular}
\label{tab:ablation_steps}
\end{table}


\begin{table}[ht]
\centering
\caption{\rev{Ablation: Number of reconstructions used in the adaptation (Cirrus).}}
\begin{tabular}{l c c c c | c}
\toprule
\multirow{2}{*}{\textbf{S}} & \multicolumn{4}{c}{\textbf{DICE}} & \rev{Run Time} \\
\cmidrule(lr){2-5}
 & \textbf{IRF} & \textbf{SRF} & \textbf{PED} & \textbf{Mean} & \rev{in s}  \\
\hline
5 & $0.539\pm 0.23$ & $0.648 \pm 0.18$ & $0.497 \pm 0.26$ & $0.561$ & $16$ \\
10 & $0.580\pm 0.23$ & $0.674\pm 0.18$ & $\textbf{0.541}\pm 0.26$ & $0.598$ & $69$ \\
15 & $0.593 \pm 0.23$ & $\textbf{0.720} \pm 0.13$ & $0.500\pm 0.30$ & $\textbf{0.605}$ & $121$ \\
20 & $\textbf{0.604} \pm 0.20$ & $0.695 \pm 0.13$ & $0.474\pm 0.32$ & $0.591$ & $196$ \\
\bottomrule
\end{tabular}
\label{tab:ablation_recon_steps}
\end{table}

\begin{table}[ht]
\centering
\caption{Uncertainty Quantification (Expected Calibration Error\rev{ and Precision-Recall Area Under the Curve}).}
\begin{tabular}{l c c | c c }
\toprule
\multirow{2}{*}{\textbf{Method}} & \multicolumn{2}{c}{ECE  $\downarrow$ } & \multicolumn{2}{c}{\rev{PRAUC}  $\uparrow$ } \\
& \textbf{Cirrus} & \textbf{Topcon} & \rev{\textbf{Cirrus}} & \rev{\textbf{Topcon}} \\
\hline
GARD &  $.01332\pm .010$ & $.00455\pm .004$ & $0.532\pm 0.26$ & $0.447 \pm 0.24$ \\
IRTTA &  $.00697\pm.007$ & $.00342\pm .005$ & $0.672 \pm 0.22$ & $0.454 \pm 0.24$  \\
Supervised & $.00534\pm .005$ & $.00363\pm .004$ & $0.765\pm 0.21$ & $0.614 \pm 0.25$ \\
\bottomrule
\end{tabular}
\label{tab:uncertainty}
\end{table}



\begin{figure}[ht!]
\begin{tikzpicture}[spy using outlines={
        magnification=2, 
        size=0.8cm, 
        every spy on node/.append style={thick, red},    % Style of the target circle
        every spy in node/.append style={thick, blue}   % Style of the glass
    },
    imgnode/.style={
        minimum width=2cm,  % Define standard width for the images
        minimum height=2cm, % Define standard height for the images
        inner sep=0pt       % No padding around the included graphic
    }
]

% --- Define the list of methods to iterate over ---
\def\MethodList{gt/$m$, seg_9/GARD, uncertainty_gamma/$H(\text{GARD})$, mean_prediction/IRTTA, uncertainty/$H(\hat y_\mu)$, gt_seg/target $y$} 
\def\PatList{pat_18_75,pat_13_75}

\foreach \patient [count=\patpos starting from 0] in \PatList{
    % --- Loop variables: \methodname is the image file stem, \visname is the label shown below the image, \position is the zero-based column index ---
    \foreach \methodname/\visname [count=\position starting from 0] in \MethodList
    {
        % 1. Construct the filename: \methodname + .png
        \edef\filename{\methodname.png}
        
        % 2. Place the image using a TiKZ node
        % The node is placed at (\position * 2.7, \patpos * 2.7), i.e. on a grid with 2.7cm spacing
        \node[imgnode] (Img-\position) at (\position * 2.7, \patpos * 2.7)
        {
            % Include the image file
            \includegraphics[width=2.5cm, height=2.5cm]{imgs/\patient/\filename}
        };
    
        % 3. Add the label (caption) below the image
        \ifnum\patpos=0 
        \node[below=5pt of Img-\position.south] 
            {\textbf{\visname}};
        \fi

        \ifnum\patpos=0
            \spy on ([shift={(-0.4,0.2)}]Img-0.center) 
                in node at (Img-0.south east);
            
            \spy on ([shift={(-0.4,0.2)}]Img-1.center) 
                in node at (Img-1.south east);
            
            \spy on ([shift={(-0.4,0.2)}]Img-2.center) 
                in node at (Img-2.south east);
            
            \spy on ([shift={(-0.4,0.2)}]Img-3.center) 
                in node at (Img-3.south east);
            
            \spy on ([shift={(-0.4,0.2)}]Img-4.center) 
                in node at (Img-4.south east);
            
            \spy on ([shift={(-0.4,0.2)}]Img-5.center) 
                in node at (Img-5.south east);
        \fi
    }
}


\end{tikzpicture}
\caption{Visual comparison of the uncertainty estimates of our approach and the baseline, visualized by showing the entropy of the prediction. The uncertainty now shows semantically meaningful information instead of the boundary of the initial segmentation.}
\label{fig:uncertainty}
\end{figure}


\subsection{Uncertainty visualization}

We quantify the reliability of our uncertainty estimates using the Expected Calibration Error (ECE) \rev{and Precision-Recall Area Under the Curve (PRAUC)}, presented in Table~\ref{tab:uncertainty}.
\rev{The PRAUC is computed in a binary classification setting, where the probabilities of the fluid classes are summed up after softmax.}
Our method reduces the ECE from $\sim0.013$ to $\sim0.007$ on Cirrus and from $\sim0.005$ to $\sim0.003$ on Topcon compared to the baseline GARD, indicating that our approach aligns the confidence scores better with the true accuracy.
\rev{Similarly, the PRAUC values improve from $0.532$ to $0.672$ on Cirrus and from $0.447$ to $0.454$ on Topcon.
For the PRAUC, the difference to the supervised trained model is more pronounced.}
Qualitatively (Figure~\ref{fig:uncertainty}), our entropy maps do not merely highlight object boundaries but correctly identify ambiguous anatomical regions, such as potential lesions that exist in the ground truth but are degraded in the input, offering valuable interpretability for clinicians. \rev{This is especially striking in the first row, where a small lesion is lost after the reconstruction process. However, this lesion is found by the segmentation network at some point in the trajectory and hence visualized in the uncertainty map.}


% \section{Clinical Impact}

% Disparities in access to high-quality medical imaging remain a critical barrier to effective healthcare, particularly in rural and resource-constrained settings. While low-cost devices increase accessibility, the resulting images often suffer from reduced signal-to-noise ratios, which severely degrades the performance of diagnostic algorithms trained on high-quality data from high-resource institutions. Furthermore, retraining large-scale deep learning models for every specific device or site is often computationally prohibitive and impractical for on-site clinical hardware.
% Our proposed framework addresses this bottleneck by enabling high-performance diagnostic models to adapt to low-quality inputs at test-time. By leveraging the reconstruction trajectory, we improve downstream segmentation performance with minimal computational overhead and without requiring access to sensitive source-domain training data.

\section{Conclusion}

In this work, we demonstrated that the iterative nature of modern reconstruction algorithms offers a rich, yet underutilized, source of semantic information. By modulating a pre-trained segmentation network based on the reconstruction trajectory, we achieved significant performance gains on out-of-distribution data via ad-hoc test-time adaptation.

Crucially, our method achieves performance competitive with dedicated Unsupervised Domain Adaptation (UDA) frameworks, while having the distinct advantage of not requiring access to the target domain during training. While a performance gap remains compared to fully supervised upper bounds, our ablation studies suggest that future improvements may stem from more sophisticated fine-tuning strategies rather than loss function engineering alone.
\rev{However, the performance deterioration after more than one hundred adaptation steps suggests that an improved loss function might increase the stability and therefore improve the robustness to the choice of hyperparameters.}

Future research will focus on validating this approach across different imaging modalities, such as MRI/CT reconstruction. \rev{In that space, multiple iterative reconstruction models exist to be tested~\cite{SaEi26}, as well as datasets with either paired devices~\cite{IsZh25} or including downstream tasks~\cite{ZbKn18}. Additionally, we aim to} explore fusion mechanisms to replace naive ensemble averaging, thereby maximizing the utility of the intermediate representations.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
 \midlacknowledgments{Funded by the European Union, EIC-2023-PATHFINDEROPEN-01 (I-SCREEN, grant no. 101130093). Views and opinions expressed are however those of the author(s) only and do not necessarily reflect those of the European Union or European Innovation Council and SMEs Executive Agency (EISMEA). Neither the European Union nor the granting authority can be held responsible for them.}


\bibliography{midl26_86}


\appendix

\end{document}

