\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{lipsum}
%\usepackage[dvipsnames]{xcolor}
\usepackage{xcolor}
\usepackage{float}
%\usepackage{subfloat}
%\usepackage{braket}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{graphicx} % for \rotatebox
%\usepackage[pass, showframe]{geometry}   % TO BE REMOVED. JUST FOR SPACE HUNTING

%\usepackage{siunitx}
\usepackage{hyperref}
%\usepackage{cleveref}
\usepackage{empheq}
\usepackage{outlines}
\usepackage{mwe} % to get dummy images
\usepackage{tikz}
\usepackage{url}
\usepackage{svg}
%\usepackage[hyphens]{url}  % recommended
\usepackage[normalem]{ulem}
\usepackage{braket}


\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026 submission}
%\editors{Under Review for MIDL 2026}
%\jmlrvolume{-- Under Review}

\editors{Accepted for publication at MIDL 2026}
\jmlrvolume{-- nnn}




\newcommand{\modelname}{ChronoConNet{}} %ChronoRankNet?
\newcommand{\lossname}{ChronoCon{}}
\newcommand{\myloss}{L^{\mathrm{ChronoCon}} }
\newcommand{\lossnamewithlabels}{OrdiCon:Y}
\newcommand{\mylosswithlabels}{L^{\mathrm{OrdinalCon:Y}}}
\newcommand{\lossRecon}{L_2^{\mathrm{DAE}}}
\newcommand{\lossRnCy}{L^{\mathrm{RnC}}}

\newcommand{\valpm}[3]{%
	$#1\,{\tiny \overset{\raisebox{0.15ex}{#3}}{\pm #2}}$%
}





%% Custom comment commands
\definecolor{darkred}{RGB}{165,50,50}
\definecolor{darkgreen1}{RGB}{0,100,0}
\definecolor{lightgray}{RGB}{200,200,200}


% GREEN improvement
\newcommand{\valpmgreen}[3]{%
	\ensuremath{#1^{\textcolor{darkgreen1}{\;\mathbf{#2}}}_{\;\pm #3}}%
}

% RED deterioration
\newcommand{\valpmred}[3]{%
	\ensuremath{#1^{\textcolor{darkred}{\;\mathbf{#2}}}_{\;\pm #3}}%
}

%\newcommand{\cw}[1]{{\color{darkgreen1}{CW: #1}}}
\newcommand{\round}[1]{{\num[round-mode=places,round-precision=4]{#1}}}
%\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\norm}[2][]{\left\lVert #2 \right\rVert_{#1}}
\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}}

\newcommand{\ssim}{\operatorname{sim}}

%\newcommand{\code}[1]{\texttt{\small #1}}
\newcommand{\code}[1]{%
	\ifmmode
	\text{\texttt{\small #1}}%
	\else
	{\ttfamily\small #1}%
	\fi
}
\newcommand{\id}{\texttt{id}}
% \newcommand{\id}{g}

\usepackage[T1]{fontenc}
\usepackage{enumitem}

\newcommand{\me}{\mathrm{e}}
\newcommand{\md}{\mathrm{d}}
\newcommand{\mi}{\mathrm{i}}
\newcommand{\mIm}{\mathrm{Im}\,}
\newcommand{\mRe}{\mathrm{Re}\,}
\newcommand{\Tr}{\mathrm{Tr\,}}

\usepackage{tikz}
\newcommand\wye[1][]{%
	\tikz\draw[thick, line cap=round,x=1ex,y=1ex,#1]
	(0,0) -- ++(90:1)
	(0,0) -- ++(-30:1)
	(0,0) -- ++(-150:1);
}

%\newcommand{\cw}[1]{{\color{darkgreen1}[CW: #1]}}
%\newcommand{\je}[1]{{\color{cyan}[JE: #1]}}
%\newcommand{\ivana}[1]{{\color{green}[IJ: #1]}}
%\newcommand{\ps}[1]{{\color{blue}[P: #1]}}
%\newcommand{\gl}[1]{{\color{cyan}[GL: #1]}}

%\newcommand{\cw}[1]{}  % dont show comments. Will be used at the end!
%\newcommand{\je}[1]{}      % don't show JE comments
%\newcommand{\ivana}[1]{}   % don't show IJ comments
%\newcommand{\gl}[1]{}   % don't show GL comments
%\newcommand{\ps}[1]{} % don't show PH comments


% Highlight for drafting stage AND keep in in final version 
%\newcommand{\cwk}[1]{{\color{teal}#1}}
%
%% To be replaced with just text
%\newcommand{\cwkeep}[1]{{#1}}

%% What I used for submission:
%\newlength{\figaftervspace}
%\setlength{\figaftervspace}{-1.5em}
%\newlength{\figcapspace}
%\setlength{\figcapspace}{-1.6em}

\newlength{\figaftervspace}
\setlength{\figaftervspace}{-0em}
\newlength{\figcapspace}
\setlength{\figcapspace}{-0em}


%  \includegraphics[...]
%  \vspace{\figcapspace}
%  \caption{...}
%  \vspace{\figaftervspace}
%--------

%\makeatletter
%\def\codebreak{\begingroup\@makeother\_\@codetext}
%\def\@codetext#1{{\ttfamily\small #1}\endgroup}
%\makeatother




\title[Chronological Contrastive Learning]{Chronological Contrastive Learning: Few-Shot Progression Assessment in Irreversible Diseases}




\midlauthor{\Name{Clemens Watzenböck \midlotherjointauthor  \nametag{$^{1,2}$}} \orcid{0000-0002-9330-784X} \Email{clemens.watzenboeck@meduniwien.ac.at} \\
	%
	\Name{Daniel Aletaha\nametag{$^{3}$}} \orcid{0000-0003-2108-0030}, 
	\Name{Micha{\"e}l Deman\nametag{$^{4}$}},
	\Name{Thomas Deimel\nametag{$^{1,3}$}},	
	\Name{Jana Eder\nametag{$^{2,3}$}}    \orcid{0000-0003-0342-4952},
	\Name{Ivana Jan\'{\i}\v{c}kov\'{a}\nametag{$^{1,2}$}},  
	\Name{Robert Janiczek\nametag{$^{4}$}} \orcid{0000-0002-4952-9234},
	\Name{Peter Mandl\nametag{$^{3,6}$}}    \orcid{0000-0003-1526-4052},
	\Name{Philipp Seeböck\nametag{$^{1,2}$}} \orcid{0000-0001-5512-5810} ,
	\Name{Gabriela Supp\nametag{$^{3}$}},
	\Name{Paul Weiser\nametag{$^{1,2,5}$}} \orcid{0009-0004-2503-5696},
	%
	\Name{Georg Langs\midlotherjointauthor\nametag{$^{1,2}$}}\orcid{0000-0002-5536-6873}
	\Email{georg.langs@meduniwien.ac.at}
	\\
	%
	\addr $^{1}$ Computational Imaging Research Lab, Department of Biomedical Imaging and Image-guided Therapy, Medical University of Vienna, Vienna, Austria, 
	\addr $^{2}$ Comprehensive Center for Artificial Intelligence in Medicine, Medical University of Vienna, Vienna, Austria, 
	\addr $^{3}$ Division of Rheumatology, Department of Medicine III, Medical University of Vienna, Vienna, Austria, 
	\addr $^{4}$ Johnson \& Johnson, 
	\addr $^{5}$ Athinoula A. Martinos Center for Biomedical Imaging, Massachusetts General Hospital, Boston, Massachusetts, USA, 
	\addr $^{6}$ Ludwig Boltzmann Institute of Arthritis and Rehabilitation, Vienna, Austria
	%\addr $^{7}$ Christian Doppler Laboratory for Machine Learning Driven Precision Imaging, Department of Biomedical Imaging and Image-guided Therapy, Medical University of Vienna, Austria
}


\makeatletter
\def\@sauthor{C. Watzenb\"ock et al.}%
\let\@shortauthor\@sauthor
\def\@jmlr@authors{Watzenb\"ock et al}% used in \@titlefoot (CC-BY line)
\makeatother


\begin{document}

\maketitle

\vspace{-1em}
\begin{abstract}
Quantitative disease severity scoring in medical imaging is costly, time-consuming, and subject to inter-reader variability. At the same time, clinical archives contain far more longitudinal imaging data than expert-annotated severity scores. Existing self-supervised methods typically ignore this chronological structure.
We introduce ChronoCon, 
a contrastive learning approach that replaces label-based ranking losses with rankings derived solely from the visitation order of a patient’s longitudinal scans. Under the clinically plausible assumption of monotonic progression in irreversible diseases, the method learns disease-relevant representations without using any expert labels. This generalizes the idea of Rank-N-Contrast from label distances to temporal ordering.
Evaluated on rheumatoid arthritis radiographs for severity assessment, the learned representations substantially improve label efficiency. In low-label settings, ChronoCon significantly 
outperforms a fully supervised baseline initialized from ImageNet weights.
In a few-shot learning experiment, fine-tuning ChronoCon on expert scores from only five patients yields an intraclass correlation coefficient of 86\% for severity score prediction.
These results demonstrate the potential of chronological contrastive learning to exploit routinely available imaging metadata to reduce annotation requirements in the irreversible disease domain. Code is available at \url{https://github.com/cirmuw/ChronoCon}.
\end{abstract}





\begin{keywords}
Unsupervised Learning, Contrastive Learning, Few-Shot Learning,  Representation Learning, Longitudinal Medical Imaging,  Disease Progression, Rheumatoid Arthritis
\end{keywords}
% {\bf Primary Subject Area}: Unsupervised Learning and Representation Learning  {\bf Secondary Subject Area}: Learning with Noisy Labels and Limited Data 

\begin{figure}
    \floatconts
    {fig:ChronoConIllustrationFigure1}
    {\vspace{\figcapspace}
    	\caption{Chronological contrastive learning objective illustrated using a case of monotonically worsening joint-space narrowing (JSN) in a patient’s interphalangeal (IP) joint. % joint due to rheumatoid arthritis (RA). 
    		\textit{Bottom:} Anti-/chronological contrastive terms. The loss aligns disease trajectories in latent space, capturing severity automatically. \textit{Top right:} Training stages. In stage~1, no labels beyond timestamps and patient+ROI IDs are required. In stage~2, the model is fine-tuned for score prediction.
    	}
    }
    {%
        \centering
        \includegraphics[width=0.9\linewidth]{fig/fig1/f1p3.pdf}
    }
\vspace{\figaftervspace}
\end{figure}
\section{Introduction}
Time is of the essence in clinical settings. 
Time series -- repeated scans of the same patient over multiple visits -- capture essential information about disease evolution and treatment response.
Although this information is routinely available in clinical archives, it is rarely used for representation learning. Most deep-learning approaches rely on large annotated datasets, yet expert scoring is expensive, time-consuming, and subject to inter-reader variability. In addition, discrete ordinal scores introduced to make expert assessment feasible and comparable capture only a coarse approximation of continuous disease severity. Often, they introduce quantization errors. 


We introduce \textit{\lossname}, a chronological contrastive learning objective function 
that uses temporal examination order to train a model for mapping imaging data to quantitative severity scores. The idea is motivated by a simple example: consider a patient with an irreversible disease who is imaged at times $t_1 < t_2 < t_3$. In the latent-disease representation, the second scan should be at least as similar to the first scan as the third is to the first. Formally, for encoded features $\mathbf{v}_i$, we expect $\ssim(\mathbf{v}_1, \mathbf{v}_2) \geq \ssim(\mathbf{v}_1, \mathbf{v}_3)$. A corresponding relation holds when comparing later visits to earlier ones: $\ssim(\mathbf{v}_2, \mathbf{v}_3) \geq \ssim(\mathbf{v}_1, \mathbf{v}_3)$. These ordering constraints, illustrated in \figureref{fig:ChronoConIllustrationFigure1}, allow the model to learn a progression-aware feature space without using any severity labels.

%
\paragraph{Related work}

Recent work on image series and latent-space alignment incorporates temporal or pairwise information by jointly processing image pairs in both supervised~\cite{Kamran:MICCAI:2025:LessiOnTime} and unsupervised settings~\cite{bannur_learning_2023}. 
%%%% BEGINNEW ADDED PARTS: 
\cite{Kim:MIDL2023:learning} also assume monotonic progression, as we do, but operate on image pairs, using a learned classifier to predict temporal order. Likewise, \cite{Chakravarty:MICCAI2024:Forecasting_Disease_Progression_with_Parallel_Hyperplanes} enforce increasing risk scores via pairwise losses and parallel hyperplanes in latent space.
%
While effective at capturing pairwise differences, these approaches do not leverage the full temporal trajectory available in longitudinal patient data.

\cite{Holland:MIA2024:Metadata_enhanced_learning_OCT} define positives as visits from the same patient within a predefined time window and negatives across patients. This requires known progression timescales, frequent acquisitions, and balanced disease states—assumptions that may not hold in many longitudinal settings such as rheumatoid arthritis progression.

In contrast to all these approaches, ChronoCon does not operate on image pairs or fixed time windows, but leverages the complete visit sequence to impose ordering directly in latent space without additional learnable components.

\cite{Zeghlache:MICCAI2024:LaTiM} combine self-supervision with Neural ODEs to model continuous disease dynamics and naturally handle irregular sampling. This more general formulation assumes differentiable feature evolution and adds ODE training complexity, whereas ChronoCon makes no continuity assumptions and is suited for trajectories with abrupt changes.

%
%\cite{Holland:MIA2024:Metadata_enhanced_learning_OCT} introduced an SSL approach where the visits from the same patient within a time-scale $\delta_T$ smaller than the disease progression dynamics are taken as positives. Other randomly selected images which are assumed to be in a different disease state are taken as negatives for the contrastive loss. While we suspect that an application to highly unbalanced datasets (many patients in the same disease state) and where the time-scale of disease dynamics is unknown and subject dependent could pose challenges it does lead to significant performance improvements in patients with age-related macular degeneration. 
%
%
%%%% END NEW ADDED PARTS: 


In supervised contrastive learning, several methods define positive and negative pairs based on label ordering~\cite{gong2022ranksimrankingsimilarityregularization, zha2023rankncontrastlearningcontinuousrepresentations}. \cite{JanIva_Temporal_MICCAI2025} used a triplet loss with time-dependent margins as hyperparameters, which makes the approach difficult to apply to nonlinear progressions in irregularly sampled time series.
Conversely, while \cite{couronne2021longitudinal} handles irregular sampling, the soft-rank loss does not enforce discriminability across more distant visits: it preserves ordering without explicitly pushing farther-apart time points away in latent space, similar to label-distribution smoothing or feature-distribution smoothing~\cite{yang2021delving}.


The closest work to ours is \textit{Rank-N-Contrast} (RnC)~\cite{zha2023rankncontrastlearningcontinuousrepresentations}.
RnC defines the conditional probability that the positive ($p$) is the correct match for the anchor ($a$) among its negatives $n \in \mathcal{S}_{ap}^\bullet$ as
\begin{equation}
	P(\mathbf{v}_p \mid \mathbf{v}_a, \mathcal{S}^\bullet_{ap})
	= \frac{\exp\!\bigl[\ssim(\mathbf{v}_a, \mathbf{v}_p)\bigr]}
	{\exp\!\bigl[\ssim(\mathbf{v}_a, \mathbf{v}_p)\bigr]  +  \sum_{n \in \mathcal{S}^\bullet_{ap}  	\setminus \{p\} } \exp\!\bigl[\ssim(\mathbf{v}_a, \mathbf{v}_n)\bigr]}.%,  \quad  \ell_{ap}^{\bullet} =  -\log P(v_p | v_a, \mathcal{S}^\bullet_{ap})
	%\label{Eq:PosteriorProbability}
\end{equation}
The corresponding per-pair loss is $\ell^\bullet_{ap} = -\log P(\mathbf{v}_p \mid \mathbf{v}_a, \mathcal{S}^\bullet_{ap})$.
Negatives are selected based on distances in label space 
$\mathcal{S}^{\mathrm{RnC}}_{ap}
:= 
\left\{
n ~\big|~
n \neq a,~
|y_a - y_n| \geq |y_a - y_p|
\right\}
,$
 a strategy well suited for fully supervised regression problems. % TODO Maybe remove this variable all together.. 
%
%
RnC has since been applied to sentiment analysis~\cite{Weng:IEEE2025:Enhancing_Multimodal_Sentiment_Analysis:via_RnC}, visual-concept explanation~\cite{Obadic2024:CVPR:Contrastive_Pretraining_for_Visual_Concept_Explanations_of_Socioeconomic_Outcomes_using_RNC}, and extended to survival prediction~\cite{Sae_SurvRNC_MICCAI2024}.
However, neither RnC nor these generalizations\footnote{In \cite{Sae_SurvRNC_MICCAI2024}, time-to-event was also the prediction target, and repeated imaging for the same patient was not considered.} can be used \textit{without labels} and thus cannot be applied directly to timestamps.


For a patient with visits at $t_1\!=\!0$, $t_2\!=\!1$, and $t_3\!=\!3$~years, RnC would imply that features between first and second visit are more similar than those between the second and third. In irreversible diseases, however, progression is nonlinear: long periods of stability may be followed by abrupt worsening. Consequently, absolute time intervals are not meaningful distances.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%---------------------------------------------------------%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Methods}

For each image $\mathbf{x}$, the corresponding relative time point $t$ within a patient’s examination series is available. For some images, an additional ordinal expert-annotated score $y$ is provided. We use this information to encourage representations that capture disease progression. % in the latent space.
%
Let $\mathcal{D} = \{(\mathbf{x}_i, t_i, y_i, \id_i)\}_{i=1}^{N}$ denote a dataset of $N$ examples with imaging, time, and scoring information, where $\id_i$ is the group identifier determining which samples may be contrasted against each other.
%
We propose a two-stage learning procedure. In the \textit{first stage}, only imaging data and metadata are required. We learn a mapping $f: \mathbb{R}^{l \times w} \rightarrow \mathbb{R}^d$, $\mathbf{x} \mapsto \mathbf{v}$ from the image space to a latent representation space using \lossname.
%
The goal of the \textit{second stage} is to learn a scoring function, mapping latent representations to an estimate of the ordinal score, $h: \mathbb{R}^d \rightarrow \mathbb{R}$, $\mathbf{v} \mapsto \hat{y}$, trained using only an MSE loss.

\paragraph{Chronological contrastive learning. }
To apply contrastive learning to timestamps, we introduce the sets of chronological negatives $\mathcal{S}^<_{ap}$ and anti-chronological negatives $\mathcal{S}^>_{ap}$ as
\begin{equation}
	\begin{array}{rclcccl}
		\mathcal{S}^<_{ap} &=& \{n &|& n \neq a, & \id_a = \id_p = \id_n, & (t_a \leq t_p < t_n)\ \}, \\ %[0.25em]
		\mathcal{S}^>_{ap} &=& \{n &|& n \neq a, & \id_a = \id_p = \id_n, & (t_a \geq t_p > t_n)\ \}.
	\end{array}
\end{equation}	
Trivial pairs without valid negatives are excluded from normalization, and we define the \textit{\lossname} loss as the balanced\footnote{One might also attempt to use only forward chronological contributions, but then early images would be under-represented as positives and late images over-represented as negatives.} sum of the forward and backward chronological contributions:
\begin{equation}
	\begin{array}{rcl}
		\myloss
		&=&
		\frac{1}{|\mathcal{P}^<_+|}
		\sum_{(a,p) \in \mathcal{P}^<_+}  
		%\log P(v_p \mid v_a, \mathcal{S}^<_{ap})
		\ell^<_{ap} 
		\,\, + \,\,
		\frac{1}{|\mathcal{P}^>_+|}
		\sum_{(a,p) \in \mathcal{P}^>_+}  
		\ell^>_{ap}
		%\log P(v_p \mid v_a, \mathcal{S}^>_{ap})
		,\\
		%
		\mathcal{P}^{<}_+ &=& \{ (a,p) \ \mid \ \id_a = \id_p,\ (t_a \leq t_p),\ |\mathcal{S}_{ap}^{<}|>0 \}, \\
		\mathcal{P}^{>}_+ &=& \{ (a,p) \ \mid \ \id_a = \id_p,\ (t_a \geq t_p),\ |\mathcal{S}_{ap}^{>}|>0 \}. \\
	\end{array}
	\label{Eq:ChronoConLoss}
\end{equation}
The functional form of each per-pair term $\ell^{<}_{ap}$ and $\ell^{>}_{ap}$ follows the same probabilistic formulation as the Rank-N-Contrast (RnC) loss~\cite{zha2023rankncontrastlearningcontinuousrepresentations}. ChronoCon differs, however, in how contrastive pairs and negatives are constructed: instead of relying on distances in label space, negatives are defined through temporal ordering within the same subject. To account for the asymmetry introduced by timestamps, the loss is further split into forward and backward chronological contributions. Furthermore, a minor adjustment to the normalization is made, which is mainly relevant for short time series and small batch sizes.


Our loss provides a natural way of enforcing order via ranking with respect to $t$ for all samples sharing the same group identifier ($\id$). We explicitly avoid imposing a metric on $t$. Intuitively, such ordering should also support improved prediction of the target $y$ in downstream tasks, provided $y$ and $t$ exhibit a monotone relationship.

\textit{Ordinal contrastive learning}. -- This property also makes the loss well suited for ordinal regression. While not our primary focus, we evaluate the loss on ordinal disease-severity labels by adjusting the group identifiers accordingly. To distinguish this setting from our main objective—the unsupervised application to timestamps—we denote the loss used on labels $y$ as $\mylosswithlabels$, emphasizing the \textit{ordinality} in the pair selection process.%
\footnote{One might in this respect refer to $\myloss$ as $L^{\mathrm{OrdinalCon:t}}$, but we refrain from this to avoid confusion.}



{\textbf{Data augmentation.}} \hspace{1mm} 
Without augmentation, only image series of length \textit{three or more} would contribute to the loss, as at least one anchor, one positive, and one negative are required. 
To enable training on series with only two visits, we apply double-crop augmentation following \cite{zha2023rankncontrastlearningcontinuousrepresentations}.



\begin{figure}
	\floatconts
	{fig:ScoresExpl}%
{ \vspace{\figcapspace}
	\caption{
		\textit{Left:} Joint-level contributions to the total SvHS illustrated on a representative hand radiograph, highlighting erosions and joint spaces. 
		\textit{Right:} Regions of interest extracted during fully automatic preprocessing of hand radiographs.
}}
	{%
		\centering
		\includegraphics[
		width=\linewidth
		]{./fig/fig1/f2.png}%
	}
\vspace{\figaftervspace}
\end{figure}

\paragraph{Application of \lossname{} in rheumatoid arthritis (RA) radiographs}
We evaluate this approach on radiographs of patients with RA to demonstrate that chronological information in routine imaging can yield clinically meaningful representations even without expert annotations.
Disease severity in RA is commonly quantified using the Sharp–van der Heijde (SvH) score, which aggregates erosion (ERO) and joint-space narrowing (JSN)  subscores for multiple joints, resulting in a total score ranging from 0 to 448. These subscores are discrete and costly to obtain, making this domain a representative and challenging test case for label-efficient learning~\cite{vanderHeijde2000}.



First, we localize the joints with an automatic landmark-detection method \cite{PAYER2019207, JONKERS2025102165} and extract an image patch for each detected joint (illustrated in \figureref{fig:ScoresExpl}). For the first stage of training, contrastive pairs are constructed only from patches belonging to the \textit{same patient, side, and joint type}. Throughout, we use $\ssim(\mathbf{v}_i, \mathbf{v}_j) = -L_2(\mathbf{v}_i, \mathbf{v}_j)/\tau$ with temperature $\tau=1$ as the similarity metric. To stabilize training, we add a standard denoising autoencoder (DAE) reconstruction loss $\lossRecon$.

In the second stage, a multi-headed regressor replaces the decoder, with one head for each score type (59 in total). During fine-tuning, the encoder parameters are also updated, but with a reduced learning rate. The only loss used in this stage is mean-squared error (MSE) on the ERO/JSN scores.

All models are trained solely to predict \textit{cross-sectional} SvH scores. Individual JSN and ERO scores are then summed to obtain the total SvH score, and differences between visits ($\Delta$SvHS) are computed afterward.


\paragraph{Quality measures.}
Performance is evaluated for both single–time point predictions and the derived progression $\Delta\mathrm{SvHS}$. Agreement with ground truth is quantified using the intraclass correlation coefficient (ICC), root-mean squared error (RMSE), and Pearson’s correlation coefficient $\rho$. For significance tests between models we always used a two-sided paired \textit{t}-test on MSE (without the bootstrapping procedure).
Full metric definitions and elaboration on performed statistics are in Appendix~\ref{app:trainingDetails}.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%---------------------------------------------------------%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Experiments and Results}

\paragraph{Dataset}
The dataset consists of hand and foot radiographs from 778 patients with RA. It comprises 13\,742 radiographic images and a total of 407\,045 individual scores across 59 score types. %Some joint regions were marked as not scoreable (NS), typically due to surgical alterations such as spacers or fusions. 
As detailed in Table~\ref{tab:score_distribution} in the appendix, the score distribution is highly imbalanced: fewer than 1\% of erosion scores fall into the highest category, and fewer than 4\% of JSN scores do. The dataset also exhibits only short longitudinal series, with a median of 4 visits per patient (IQR [3,5]).

We use a patient-level split to avoid leakage of longitudinal information. Training, validation, and test sets contain 466/155/157 patients with 8\,157/2\,753/2\,832 images and 241\,701/81\,501/83\,843 scores, respectively. %The validation set is used for tracking metrics during development and for early stopping.

\begin{figure}
	\floatconts
	{fig:TrSetSize}
	{\vspace{\figcapspace}
		\caption{ICC between the standard of reference and the estimated SvHS as a function of training-set size: (\textit{left}) SvHS and (\textit{right}) change $\Delta$SvHS. Blue: single-stage baseline; green: pre-trained with reconstruction loss; orange: pre-trained with \lossname{} and reconstruction loss; black cross: pre-trained with the original Rank-N-Contrast loss \textit{on time}; \wye[rotate=90, color=purple]: pre-trained with SimCLR. Error bars indicate 95\% CI.
	}}
	{%
		\centering
		\begin{tabular}{cc}
			\includegraphics[
			width=0.47\linewidth
			]{./fig/TrSetSize_vs_performanceWithReCon_Lines/training_set_size_vs_performance_test_ICC_psych.pdf}
			&		
			\includegraphics[
			width=0.47\linewidth
			]{./fig/TrSetSize_vs_performanceWithReCon_Lines/training_set_size_vs_performance_test_delta_SvH_ICC_psych.pdf}			
		\end{tabular}
	}
	\vspace{\figaftervspace}
\end{figure}



\paragraph{Model and training details}
We used ResNet18 as the encoder for all models~\cite{he2015deepresiduallearningimage}. For DAE pretraining, the decoder mirrored the encoder using transposed convolutions. 
A hierarchically grouped dataloader was used to improve temporal consistency: patches from the same ROI and patient were typically placed in the same batch and oversampled based on intrapatient median ERO/JSN scores to mitigate score imbalance. Early stopping on validation mean absolute error (MAE) with a 10-epoch patience was applied for fine-tuning and for the supervised baseline, restoring the best-performing model.

All methods, except the single-stage baseline, follow a unified two-stage protocol introduced in the Methods~section.  In \textit{Stage 1 (pretraining)}, the encoder is trained without labels using either a contrastive loss or a reconstruction loss. For contrastive methods, we denote the general loss as
\(
L^{\mathrm{Con}} + 10^3 \, \lossRecon,
\)
where the prefactor scales the MSE-reconstruction loss to a similar order of magnitude as the contrastive loss.

The contrastive loss \(L^{\mathrm{Con}}\) is instantiated in three variants: \textit{ChronoCon}, which uses patient visit order to define positive and negative pairs; \textit{RnC:t}, which is similar but defines negatives solely based on temporal distance,
\(
\mathcal{S}^{\mathrm{RnC:t}}_{ap} := \{ n \mid n \neq a,~ |t_a - t_n| \ge |t_a - t_p| \};
\)
and \textit{SimCLR}~\cite{chen2020simpleframeworkcontrastivelearning}, which uses standard contrastive learning without temporal or label information. DAE pretraining uses only the reconstruction loss \(\lossRecon\). Double-crop augmentation is applied for all contrastive methods; for non-contrastive methods, the second crop is discarded. Experiments with an attached decoder (DAE variants) used half the batch size due to memory constraints.


In \textit{Stage 2 (fine-tuning)}, the decoder is replaced by a multi-headed regressor and the encoder is fine-tuned on available labels with MSE, using a learning rate reduced by a factor of 10. The single-stage baseline skips Stage~1 and trains encoder and regressor directly from ImageNet initialization. The code is publicly available at \url{https://github.com/cirmuw/ChronoCon} (further details in Appendix~\ref{app:trainingDetails}).


\subsection{Label efficiency}
A key advantage of our loss is that it enables learning meaningful feature representations without access to scores $y$. To evaluate label efficiency, we created progressively smaller training subsets by reducing the number of patients with labeled data. The full training set comprises images from 466 patients. %, all of whom have labels. 
All splits were performed at the patient level, allowing us to simulate how performance changes when labels are available for only a subset of patients. The validation and test sets remained fixed across all experiments (155 and 157 patients, respectively). Details on the splits are provided in \tableref{tab:score_reduction}. Superscripts in the table indicate improvements over the single-stage baseline (difference between the orange and blue lines in \figureref{fig:TrSetSize}).


\begin{table}\centering
	\ra{1.3}
	\begin{tabular}{@{}lrrrcccccc@{}}
		\toprule
		\multicolumn{4}{c}{Dataset size} 
		&& \multicolumn{2}{c}{Cross sect. (SvHS)}
		&& \multicolumn{2}{c}{Longit. ($\Delta$SvHS)} \\
		\cmidrule(lr){1-4}
		\cmidrule(lr){6-7}
		\cmidrule(lr){9-10}
		scores [\%] & scores [$N$] & images & patients 
		&& RMSE $\downarrow$ & ICC $\uparrow$
		&& RMSE $\downarrow$ & ICC $\uparrow$ \\
		\midrule
		
		1 & 2\,442 & 82 & 5 &&
		\valpmgreen{19.9}{-7.0}{2.5} & \valpmgreen{86}{+17}{2} &&
		\valpmgreen{9.5}{-1.9}{0.7} & \valpmgreen{64}{+30}{3} \\
		
		2 & 4\,475 & 152 & 10 &&
		\valpmgreen{19.3}{-4.5}{2.6} & \valpmgreen{86}{+10}{2} &&
		\valpmgreen{9.0}{-2.1}{0.6} & \valpmgreen{63}{+24}{3} \\
		
		4 & 9\,413 & 319 & 20 &&
		\valpmgreen{18.7}{-4.7}{2.7} & \valpmgreen{87}{+7}{2} &&
		\valpmgreen{8.1}{-2.3}{0.5} & \valpmgreen{67}{+19}{3} \\
		
		6 & 14\,719 & 499 & 31 &&
		\valpmgreen{17.2}{-2.1}{2.7} & \valpmgreen{90}{+3}{2} &&
		\valpmgreen{7.7}{-1.4}{0.6} & \valpmgreen{72}{+11}{2} \\
		
		10 & 23\,546 & 808 & 46 &&
		\valpmgreen{14.5}{-2.9}{2.1} & \valpmgreen{93}{+3}{1} &&
		\valpmgreen{7.8}{-1.1}{0.5} & \valpmgreen{72}{+10}{2} \\
		
		15 & 36\,243 & 1\,238 & 71 &&
		\valpmgreen{14.5}{-1.6}{2.1} & \valpmgreen{93}{+1}{1} &&
		\valpmgreen{7.7}{-1.0}{0.5} & \valpmgreen{72}{+8}{2} \\
		
		34 & 79\,799 & 2\,745 & 156 &&
		\valpmgreen{12.5}{-0.5}{1.6} & \valpmgreen{95}{+0}{1} &&
		\valpmgreen{8.2}{-0.5}{0.7} & \valpmgreen{69}{+4}{3} \\
		
		49 & 116\,448 & 3\,989 & 226 &&
		\valpmgreen{12.1}{-1.2}{1.6} & \valpmgreen{95}{+0}{1} &&
		\valpmgreen{8.0}{-0.2}{0.6} & \valpmgreen{70}{+2}{3} \\
		
		100 & 237\,733 & 8\,157 & 466 &&
		\valpmgreen{10.8}{-1.0}{1.3} & \valpmgreen{96}{+0}{1} &&
		\valpmgreen{8.4}{-0.6}{0.7} & \valpmgreen{67}{+4}{3} \\
		
		\bottomrule
		\vspace{\figcapspace}
	\end{tabular}
	
	\caption{
			\label{tab:score_reduction}
		Test-set performance of our model after pretraining with \lossname{} loss on 466 patients and fine-tuning on a fraction of labeled data.
		Superscripts in green/red show improvement over the single-stage baseline; subscripts give the half-width of the 95\% CI.
	}
	\vspace{\figaftervspace}
\end{table}







\paragraph{Observations and Interpretation.}
Our method using $\myloss+\lossRecon$ outperforms the baseline across all label sub-splits. The improvement is driven by $\myloss$, not by $\lossRecon$; in fact, DAE pretraining alone performs worse. %Likewise SimCLR pretraining worsened the performance. In fact, it unlearned some of the ImageNet initialization and performed worse that the fully single-stage baseline. 

Changing the \textit{ChronoCon} loss to \textit{RnC:t} worsened the performance significantly %\footnote{$p<10^{-6}$ cross-sectionally and longitudinally} 
in the low-label setting highlighting the importance of using a loss which allows for a non-linear relationship between time and disease-progression. However, overall the performance of \textit{RnC:t} was still good, likely because many of our image time-series consist of just a few images. 

Performance gains are most pronounced in the low-label setting. Even in a \textit{few-shot} scenario with labels from only 5 patients, our method achieves an ICC of 0.86 and RMSE of 19.9. For context, the SvHS has a standard deviation of 46 in the full dataset. Remarkably, \lossname{} fine-tuned on labels from just 5 patients performs on par with a recently published model (RMSE = 23.6) trained on 367 patients.\footnote{It also substantially outperforms \cite{Moradmand2025} (trained on 428 patients/visits), who reported RMSE = 44.28 on scores 0–270.}

\begin{figure}
	\floatconts
	{fig:combined_prog_and_svh_5_31_466patients} % label
	{\vspace{\figcapspace}
		\caption{
			Scatter plots comparing ground truth and model predictions for the single-stage baseline (blue) and our ChronoCon method ($\myloss$ + $\lossRecon$; orange) trained on labels from 5, 31, and 466 patients. $\lossRecon$ only results are shown in green. 
			\textit{Top:} Longitudinal evaluation in terms of score differences between visits. 
			\textit{Bottom:} Cross-sectional prediction performance for total SvH scores. 
			}
		}%
		{%
			\centering
			%\fbox{
				\includegraphics[width=0.95\linewidth,  % Used to be 0.6 ... Maybe move to appendix 
				trim={2.1mm 0mm 2mm 0mm},clip
				]{fig/TrSetSize_vs_performanceWithReCon_SCATTER/5-31-466.pdf}
			%}
		}
			\vspace{\figaftervspace}
	\end{figure}
	
	

Longitudinal evaluation ($\Delta$SvHS) shows even larger gains from \lossname{} in the low-label regime. Notably, its performance remains almost constant over a wide range of training-set sizes, unlike the baseline. This stems from explicitly learning patient-specific progression in an unsupervised manner. With only 4\% of labels, results match those obtained using 100\% of labels. Interestingly, longitudinal performance peaks at 6–15\% of labels rather than at full supervision, suggesting that strong (and noisy) labels may override the features learned during pretraining.



\textit{Scoring consistency—}The progression errors
are substantially smaller than expected from subtracting two noisy estimates. If $\hat y_i = y_i + \epsilon_i$ with noise variance $\sigma^2 = \mathbb{E}[\epsilon_i^2]$, then
$\operatorname{MSE}(\Delta\mathrm{SvHS}) = 2\sigma^2(1-c)$,
where $c = \mathbb{E}[\epsilon_i \epsilon_j]/\sigma^2$ is the error correlation ($i\neq j$). For uncorrelated errors ($c=0$), RMSE($\Delta$SvHS) should be $\sqrt{2}$ times RMSE(SvHS). However, our errors are highly correlated ($c=0.91$ for 4\% labels; $c=0.70$ for 100\%), indicating strong error cancellation—i.e., \textit{scoring consistency}—at least partly due to \lossname{} pretraining.






\subsection{Learned feature space}
To further investigate the feature space learned in the first stage with \lossname{}, the embedding is visualized in \figureref{fig:PCA}\,$a$ and $b$. The 512-dimensional features were reduced to 2 dimensions using principal component analysis (PCA). The training process is completely invariant to global time shifts for each patient because only repeatedly acquired patches of the same patient, side, and joint type are contrasted against each other. In \figureref{fig:PCA}\,$a$, the embedding is colored by relative time, with three example trajectories shown as white lines. In \figureref{fig:PCA}\,$b$ the same embedding is colored by JSN/ERO labels. Importantly, no labels were used during training. Points are displayed in order of increasing score to highlight the transition from low to high severity. All ERO and JSN patches are shown in the same plot, even though their respective maximum scores differ (5 and 4).


% FIG: latent_spaces
\begin{figure}
	\floatconts
	{fig:PCA}
	{\vspace{\figcapspace}
		\caption{
Feature space (PCA) of the unsupervised model pretrained with \lossname{}. 
\textit{Left~(a):} Colored by relative time $t_{\mathrm{rel}}$ (0 = first visit, 1 = last); white lines show example patient–joint trajectories. 
\textit{Middle (b):} Same embedding colored by ground-truth scores (no score information was used during training). 
\textit{Right~(c):} Feature similarity between chronologically ordered visits compared to the corresponding joint-space–narrowing (JSN) score differences for the MCPV joint.
		}
	}
	{%
		\centering
	%	 \fbox{
			\includegraphics[
	 		%	trim={3mm 50mm 0mm 0mm}, % left bottom right top  (example values)
			%	clip=true			
			width=\linewidth
			]{./fig/latent_spaces/combined.pdf}
	%}
	}
		\vspace{\figaftervspace}
\end{figure}



In \figureref{fig:PCA}\,$c$ we compare pairwise feature similarities against disease-label differences for the MCPV joint. Only features for the same patient and side are compared. Equivalent plots for all other joints are provided in the appendix (\figureref{fig:FeatureSIM_ALL}). The visit pairs are chronologically ordered, though not necessarily consecutive for patients with more than two visits. The lower panel displays a histogram of the disease-label (JSN-score) differences between the two visits. The histogram shows that while scores typically increase over time, differences as low as $-1$ also occur, likely due to quantization effects in the discrete scoring system.


The DAE baseline is shown in green, our main model—which combines the reconstruction loss with \lossname{}—is shown in orange, and the untrained/frozen ResNet18 (initialized with ImageNet weights) is shown in red. For the latter, the ImageNet classification layer was removed, leaving a total of 17 layers from the ResNet18. None of these three models had access to ERO/JSN scores. The only model which used scores during training is the single-stage baseline shown in blue.



\paragraph{Observations and interpretation.}
Among the unsupervised baselines, the DAE shows the weakest behavior: its feature similarities exhibit little correspondence with progression severity. %, indicating that reconstruction alone does not encode longitudinal structure. 
In contrast, the frozen ImageNet encoder can already distinguish progression to some extent—the red box-plots roughly follow the trend of the supervised baseline. This aligns with the observation that the supervised baseline, when trained on labels from only 5 patients, still achieves an ICC of 0.34 for $\Delta$SvHS.

Our full model provides the clearest separation of progression patterns. The embeddings are effectively ordered only along each \textit{individual} trajectory 
and disease-related features are learned automatically. Remarkably, coloring the embedding by visit time appears more disordered than when coloring by severity. This suggests that, in the second stage, the regression heads only need to learn which regions of feature space correspond to which scores—explaining the strong performance even with labels from just 5 patients. Consistently, the feature similarities of our model (orange) closely follow those of the supervised baseline (blue), which was trained on patch–score pairs from all 466 patients.

 


\subsection{Combination with other pre-training methods and ablation study}
Different pre-training strategies are summarized in \tableref{tab:loss_combinations_stage}. All models were trained on the full set of 466 patients and their corresponding scores. The table is organized into four groups: (i) the supervised baseline (top), (ii) unsupervised methods where no labels $y$ are used in the first stage, (iii) supervised first-stage training with the original RnC loss applied to labels, and (iv) supervised first-stage training with our loss applied to \textit{labels} ($\mylosswithlabels$).% To distinguish this latter setting from the timestamp-based application, the loss is written as $\mylosswithlabels$.%, emphasizing that it enforces oridinality with respect to $y$.



Whenever a contrastive loss is applied to labels, stratification via \code{id} uses the score type (e.g., \code{IP\_JSN}, \code{PIPII\_ERO}, ... ), so any–vs–any patient pairs are allowed as long as they correspond to the same ROI and score type. 


\begin{table}[t]
	\centering
	\includegraphics[width=\linewidth]{fig/tab2.pdf}
	\caption{\label{tab:loss_combinations_stage}
		Different pre-training strategies on the full training set of 466 patients.
		Values are shown with 95\% CI. Underlined metrics indicate the best methods that do not require labels during pretraining; bold values indicate the best overall performance. Except for the baseline, all models were trained in two stages.
		$\star$ / $\ast$ indicate cross-sectional / longitudinal results of a two-sided paired \textit{t}-test on MSE. { \emph{Note:} \textit{p}-values are reported for the individual paired comparisons described in the text; see Appendix for details and interpretation.}
	}
	\vspace{0.3em}
	\vspace{\figaftervspace}
\end{table}


\paragraph{Observations and interpretation.}
Table~\ref{tab:loss_combinations_stage} should be read by separating three regimes: (i) no labels in stage one, (ii) full-label pretraining, and (iii) time-order based pretraining.

In the no-labels pre-training  setting, neither as a standalone task nor in combination with other losses did the reconstruction/denoising objective $\lossRecon$ yield meaningful improvements. In contrast, \lossname{} pretraining improved performance—particularly ICC($\Delta$SvH)—over other methods that do not use label information. Regarding MSE, there is a statistically significant difference between \textit{\lossname{} + DAE} and the single-stage baseline ($p < 10^{-4}$ cross-sectionally and longitudinally).

When all labels are used during pretraining (via $\mylosswithlabels$ or $\lossRnCy$), these methods perform best. In this regime, there is no significant difference between \textit{RnC} and \textit{\lossname{}}. This is expected, as the supervision signal is already maximal. However, applying $\mylosswithlabels$ to the ordinal JSN/ERO scores yields the strongest overall results, improving cross-sectional MSE over RnC ($p=0.016$). %\footnote{It should be noted that the ($p=0.016$) is statistically significant only prior to the Bonferroni correction which would reduce the type 1 error but inflate the type 2 error. \cite{Perneger1998:What_is_wrong_with_Bonferroni}. See appendix.}
%Our overall conclusion -- ChronoCon leads to significant benefits in the \textit{low-label setting} -- remains valid.}. 
We attribute this to the fact that, when our loss is applied to labels ($\mylosswithlabels$), it respects the ordinal structure of the labels ($0\!\rightarrow\!1 \neq 1\!\rightarrow\!2$), whereas the original RnC loss does not. 

For time-order based pretraining, the \textit{RnC:t} baseline performs slightly worse than \textit{ChronoCon}, highlighting the advantage of separating positive- and negative-time directions. Similarly, the \textit{SimCLR + DAE} baseline shows numerical differences to \textit{ChronoCon + DAE} only in the longitudinal setting ($p=0.0016$), indicating that visitation-order aware pretraining primarily benefits longitudinal metrics.

Importantly, Table~\ref{tab:loss_combinations_stage} also shows that when labels are abundant, pretraining on visitation time does not add benefit over label-based pretraining. The main advantage of \textit{ChronoCon} therefore lies in \textit{low-label settings}, where time-order information substitutes for missing annotations.
%%%%%%%%%%%%%%%%%%%% Reformated and reodereded


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%---------------------------------------------------------%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Discussion}

\paragraph{Summary}
We introduced \lossname{}, a chronological contrastive loss that exploits visitation order in longitudinal imaging to learn disease-relevant representations without expert labels. In RA radiographs, the learned feature space captured both cross-sectional severity and longitudinal progression, and clearly improved performance in low-label scenarios compared with purely supervised or reconstruction-based pretraining. These findings highlight that chronological information routinely present in clinical archives can serve as a powerful and inexpensive supervisory signal.

Compared with typical representation-learning approaches that operate on individual images or unordered pairs, our method leverages temporal ordering as an additional inductive bias. While several self-supervised objectives have been explored in medical imaging, few explicitly account for longitudinal structure. %, and we are not aware of prior work using visitation order as a supervisory signal for predominantly irreversible diseases. 
The observed improvements in low-label settings suggest that temporal ordering can provide complementary information in an irreversible disease setting. 


\paragraph{Limitations and ethical concerns.}
The usefulness of \lossname{} depends on the presence of a valid ordering variable $t$ within subgroups of shared \code{id}. When $t$ denotes visit time, the loss assumes a predominantly monotonic progression. This is a reasonable approximation for erosive changes in RA \cite{vanderHeijde2000}, but may not hold in diseases with non-monotonic or treatment-reversible patterns.

Beyond this conceptual limitation, several practical aspects should be noted. First, our experiments are based on a single-center dataset, and broader multi-center validation will be necessary. Second, the method relies on sufficient longitudinal coverage; in datasets dominated by single visitations, the benefit of chronological contrastive learning is limited. 

Furthermore, the choice of subgroup identifiers also warrants careful consideration. In our RA study, \code{id} was defined at the joint level to avoid semantically implausible comparisons. More broadly, subgrouping can encode clinically meaningful structure, but our approach could also be used with demographic or biologically sensitive categories, which raises ethical concerns. Depending on the application, subgroup definitions may either improve representation quality or inadvertently entrench biases, making transparent justification essential.



\paragraph{Conclusions.}
Chronological contrastive learning provides a simple and effective way to leverage unlabeled longitudinal imaging data for representation learning. By using only visitation order, it generalizes label-based contrastive ranking to a setting where expert scores are not required, enabling strong performance even when labels are scarce. Our experiments on RA radiographs demonstrate that chronological signals embedded in routine clinical workflows contain exploitable structure for learning progression-aware feature spaces. The approach has potential relevance for other predominantly irreversible diseases and may help reduce annotation burden in domains where expert scoring is costly or inconsistent.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This project has been partially funded by: The Innovative Health Initiative Joint Undertaking  (IHI JU) and its members, and other contributing partners, under grant agreement No. 101194766, the Vienna Science and Technology Fund (WWTF, PREDICTOME) [10.47379/LS20065], and the Austrian Science Fund (FWF, P35189). 
Some authors (C.W.) were supported by the Clinical Research Group MOTION, Medical University of Vienna, Vienna, Austria -- a project funded by the Clinical Research Groups Program of the Ludwig Boltzmann Gesellschaft (Grant Nr: LBG\_KFG\_22\_32) with funds from the Fonds Zukunft Österreich.\\
Co-Funded by the European Union, the private members, those contributing partners of the IHI JU, and SERI. Views and opinions expressed are, however, those of the authors only and do not necessarily reflect those of the aforementioned parties. Neither of the aforementioned parties can be held responsible for them. \\		
C.W. thanks Marlene Steiner and Simon Schürer-Waldheim for many insightful discussions. %and for generating \figureref{fig:progressionOfScores}.
}

%%% AUTOPIX
%This research has been partially funded by the Innovative Health Initiative Joint Undertaking  (IHI JU) and its members, and other contributing partners, under grant agreement No. 101194766

%%% PREDICTOME
% Ivana Janickova has been funded by the Vienna Science and Technology Fund (WWTF, PREDICTOME) [10.47379/LS20065]

%%% ONSET
% This research has been partially funded by the Austrian Science Fund (FWF, P35189)

%%% MOTION
% Some authors (C.W.) were supported by the Clinical Research Group MOTION, Medical University of Vienna, Vienna, Austria—a project funded by the Clinical Research Groups Program of the Ludwig Boltzmann Gesellschaft (Grant Nr: LBG_KFG_22_32) with funds from the Fonds Zukunft Österreich.



\paragraph{Data Availability}
The data used in this study were obtained from an internal dataset of the Medical University of Vienna, collected within the AutoPIX consortium. Due to ethical, legal, and data protection constraints, the data are not publicly available. Access may be granted upon reasonable request and subject to institutional approval and appropriate data sharing agreements.


%\bibliography{bib/main-ra,bib/AutoPIX}
\bibliography{midl26_241.bib}

\appendix
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Vienna RA dataset details}

Data were collected between 1997 and February 2018 at the Division of Rheumatology, Department of Internal Medicine III, Medical University of Vienna.
The study protocol outlining the retrospective data analysis was approved by the local ethics committee of the Medical University of Vienna (vote number: 1206/2018).
\tableref{tab:score_distribution} shows the score distribution for erosion and joint-space narrowing, separated by hand and foot joints.
More information on the dataset can also be found in \cite{deimel2025autoscoRA}. 



 \begin{table}\centering
	\ra{1.3}
	\begin{tabular}{@{}lccccccc@{}}
		\toprule
		& \multicolumn{7}{c}{Score distribution (counts)} \\
		\cmidrule(lr){2-8}
		Type & 0 & 1 & 2 & 3 & 4 & 5 & NS \\
		\midrule
		ERO hands & 160\,000 & 11\,838 & 3\,657 & 1\,603 & 395  & 412 & 2\,613 \\
		ERO feet  & 66\,581  & 8\,733  & 2\,472 &   980 & 221  & 424 & 2\,177 \\
		JSN hands & 63\,165  & 24\,048 & 7\,043 & 5\,248 & 3\,093 & --   & 1\,548 \\
		JSN feet  & 24\,279  & 9\,221  & 2\,338 & 2\,225 & 1\,640 & --   & 1\,091 \\		
		\bottomrule
	\end{tabular}
	\caption{Erosion and joint-space narrowing scores on joint-(part) level.
		NS = Not scoreable (e.g.\ surgical spacers, fused joints, missing fingers, ...).}
	\label{tab:score_distribution}
	\vspace{\figaftervspace}
\end{table}

%%% Related work on RA SvH prediction:
\section{Related work for RA and SvH score estimation}
 

%% CW: Removed this due to lenght restrictions...
Most published work on SvHS prediction in RA has relied on fully supervised learning without unsupervised pretraining beyond ImageNet initialization \cite{sun_crowdsourcing_2022}. 
\cite{maziarz2022deeplearningrheumatoidarthritis} used a combined objective of ROI segmentation and smoothed-label classification to address the quantification error in JSN and ERO scores.   
%
\cite{bo_interpretable_2025} proposed an attention-based multiple-instance learning model to obtain an interpretable SvHS predictor.
% 
\cite{moradmand_multistage_2025} used a vision transformer to aggregate per-joint predictions into a total SvH score, achieving strong performance in common, less severe cases. 
The winning RA2-DREAM Challenge approach \cite{Misc:RA2DreamChallenge:Winner} used a pipeline that combined joint localization through segmentation with a subsequent model that integrates all joint scores per patient. %This is in contrast to our model, were the scores as simply summed across ROIs. Likely our predictions could be improved by a more complex patients vise aggregation method.   
%
More recent work explored self-supervision \cite{Ling2024_IEEE_SelfSupervisedLearningforRA} 
and unsupervised pretraining of a vision transformer in a cohort of patients with psoriatic arthritis \cite{govind_vision_2025}.  
To our knowledge, no existing model explicitly leverages the time-series structure of longitudinal RA imaging.
%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Training and evaluation details and additional results}
\label{app:trainingDetails}

Training settings and code for ChronoCon are available at \url{https://github.com/cirmuw/ChronoCon}. Our point-annotation tool and the landmark detection code for the pre-processing steps are in a separate repository: \url{https://github.com/cirmuw/autopix_landmarks_utils}.

\paragraph{Preprocessing}
Preprocessing followed the pipeline described in \cite{deimel2025autoscoRA}. 
All right-hand and right-foot radiographs were horizontally mirrored for consistency. 
Images originally encoded in the DICOM \texttt{MONOCHROME2} format (black foreground on white background) were converted to \texttt{MONOCHROME1}. 
Radiographs containing both hands or both feet were split at the midline.  

\paragraph{Joint detection}
Joint localization was performed using the Spatial Configuration Network (SCN) \cite{PAYER2019207}, implemented as described in \cite{JONKERS2025102165}. 
After training on 480 radiographs, landmark detection achieved a mean median point-to-point error of $<1.0$\,mm (mean over ROIs; median over samples) on a test set of 40 radiographs.
Detailed results are available online (\url{https://github.com/cirmuw/autopix_landmarks_utils/tree/main/nb/landmarks_evaluation}). 
Square patches of size $156 \times 156$ pixels were extracted around each region of interest (ROI).
%\cw{Consider moving detailed metrics to the appendix if needed.}

\paragraph{Data augmentation}
Data augmentation included random rotations (up to $10^\circ$) and translations (up to 17 pixels), followed by a center crop to $128 \times 128$ pixels to avoid padded boundaries. 
Photometric augmentations consisted of random intensity scaling, intensity shifting, contrast adjustment, and histogram shifting. 
Additional robustness augmentations included Gaussian smoothing and light noise perturbations. 
All image patches were finally normalized to the intensity range $[0,1]$.


\paragraph{Encoder}
All experiments used a ResNet18 encoder. 
Prior to training, the encoder was initialized with ImageNet-pretrained weights.

\paragraph{Decoder}
For reconstruction or denoising tasks, a decoder mirroring the ResNet18 architecture was employed, with deconvolution (transposed convolution) layers replacing convolutional downsampling layers. 
When reconstruction was used, Gaussian noise of magnitude $10^{-5}$ (with clip) was added to the input to implement a denoising autoencoder. Whenever the decoder was included, memory requirements doubled and the batch size was therefore halved. 

\paragraph{Regression heads}
Score prediction used a multi-headed regression module comprising 59 independent heads (one per score subtype). 
Each head was a multilayer perceptron with two hidden layers of dimension 128. 
Whenever supervised regression was used, the MSE loss was applied.

\paragraph{Score summation and extrapolation}
For erosion, the proximal and distal parts of the affected joints %\cw{TODO: add joint names} 
were scored separately. 
In the computation of the total SvH score, these two parts were summed, and the model predictions were combined in the same way. 
For foot joints, the total erosion score per joint must not exceed 10 (5 for each joint part). 
Accordingly, the outputs of the regression heads ($y \in \mathbb{R}$) were clipped to the range $[0,5]$ for each foot joint part. 
For the PIP and MCP joints of the hand, the sum of proximal and distal parts was clipped to the range $[0,5]$.

\paragraph{Metrics and analysis}
Erosion and joint-space-narrowing scores were summed per visit to obtain the total SvH score. 
When subscores were missing (e.g., not scoreable due to surgery), the total score was estimated via linear interpolation. 
Visits with more than 25\% missing subscores were excluded. 
To assess the model’s ability to capture progression, we evaluated the change in total SvH score between visits, 
$\Delta\mathrm{SvHS} = \mathrm{SvHS}(t_2) - \mathrm{SvHS}(t_1)$.

Intraclass correlation coefficients were computed using a two-way mixed-effects model with single measures and absolute agreement 
(ICC3-1 in the terminology of \cite{Shrout1979IntraclassCU}). 
All ICC values and confidence intervals were calculated with the $R$ package \code{psych} \cite{R_Language, R_Package_psych2025}.

In scatter plots of true vs.\ predicted SvH or $\Delta$SvH scores, Pearson’s correlation coefficient $\rho$ is reported. 
Error bars represent 95\% confidence intervals (CI). 
For both the root-mean-squared error (RMSE) and $\rho$, confidence intervals were obtained via bootstrapping. 
In tables, reported $\pm$ values correspond to half the width of the 95\% CI.


\paragraph{Training parameters}
Models were trained on NVIDIA A100 GPUs (40\,GB VRAM) using the AdamW optimizer. 
Batch sizes were 512 without a decoder and 256 with a decoder, with learning rates scaled proportionally for smaller batches. 
A \texttt{ReduceLROnPlateau} scheduler was used to reduce the learning rate when the validation loss plateaued.


\paragraph{Hyperparameter search}
Learning rates and the contrastive temperature $\tau$ were optimized using \code{optuna}'s TPE sampler \cite{NIPS2011_Bergstra_AlgorithmsForHyperParameterOptimization} on the PIPIII and MCPIII joints (six score types) from the 466 training patients. 
Search ranges were $\tau \in [0.1, 5]$, encoder LR $\in [10^{-7}, 10^{-2}]$, head LR $\in [10^{-5}, 10^{-2}]$, and weight decay $\in [10^{-8}, 10^{-1}]$. 
The search yielded $\tau = 1$ as temperature (prefactor to $L_2$ feature similarity), an encoder LR of $4 \cdot 10^{-4}$, a head LR of $4 \cdot 10^{-5}$, and a weight decay of $10^{-6}$ for a batch size of 512, with proportional LR scaling for smaller batches.

\paragraph{SimCLR parameters}
Parameters and settings for the SimCLR baseline, which was added during the rebuttal, were taken directly from the original publication~\cite{chen2020simpleframeworkcontrastivelearning} without further hyper-parameter search. It uses an MLP projector with a single hidden layer and an output dimension of 128, a temperature of $\tau=0.07$, and cosine feature similarity.


\paragraph{Statistical analysis}
All hypothesis tests were performed on paired, per-instance MSE differences on the fixed test set (\code{scipy.stats.ttest\_rel(..., alternative='two-sided')}). Bootstrapping was used only for confidence intervals of ICC and RMSE and was not involved in hypothesis testing.


In contrast to ICC/RMSE, where there is a nonlinear relationship between the statistic and the sample estimates, no bootstrapping is used for the statistical tests of MSE. 

Several paired hypothesis tests are reported in Table~\ref{tab:loss_combinations_stage}. These tests are intended to provide quantitative support for observed performance differences between specific model pairs, rather than to establish confirmatory claims across a family of hypotheses. The reported \textit{p}-values should therefore be interpreted in this descriptive context.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section{Additional results and abbreviations table}



\begin{table}[btph]
	\centering
	\ra{1.3}
	\begin{tabular}{@{}ll@{}}
		\toprule
		\textbf{Abbreviation} & \textbf{Meaning} \\
		\midrule
		RA   & Rheumatoid Arthritis \\
		SvH / SvHS & Sharp--van der Heijde Score \\
		$\Delta$SvHS & Change in SvH score between visits (in chronological order)\\		
		ERO  & Erosion \\
		JSN  & Joint Space Narrowing \\
		ROI  & Region of Interest \\
		NS   & Not Scoreable \\
		ICC  & Intraclass Correlation Coefficient \\
		RMSE & Root-Mean-Squared Error \\
		MAE  & Mean Absolute Error \\
		DAE  & Denoising Autoencoder \\
		RnC  & Rank-N-Contrast (loss) \\
		\lossname{} & Chronological Contrastive Loss (this work) \\
		SCN  & Spatial Configuration Network \\
		TPE  & Tree-Structured Parzen Estimator (Optuna) \\
		\bottomrule
	\end{tabular}
	\caption{List of abbreviations used throughout the manuscript and appendix.}
	\label{tab:abbrev}
	\vspace{\figaftervspace}
\end{table}




\begin{table}\centering
	\ra{1.2}
	\begin{tabular}{@{}ll@{}}
		\toprule
		\textbf{Abbreviation} & \textbf{Description} \\
		\midrule
		% --- HAND ---
		\multicolumn{2}{@{}l}{\textbf{Hand joints}} \\
		PIPII–PIPV          & Proximal interphalangeal joints II–V (JSN) \\
		PIPIIED/EP ... VED/EP & PIP joints II–V, distal/proximal erosion \\
		MCPI–MCPV           & Metacarpophalangeal joints I–V (JSN) \\
		MCPIED/EP ... VED/EP & MCP joints I–V, distal/proximal erosion \\
		IPIED / IPIEP        & Thumb IP joint distal/proximal erosion \\
		Rad\_Carp            & Radiocarpal joint (JSN) \\
		RadiusE, UlnaE       & Radius / Ulna erosion \\
		LunatE, ScaphE, TrapE & Carpal bone erosions (lunate, scaphoid, trapezium) \\
		Sca\_Cap, Tra\_Sca   & Carpal articulation JSN \\
		Base\_MCIE           & Base of metacarpal I (erosion) \\
		
		\midrule
		% --- FOOT ---
		\multicolumn{2}{@{}l}{\textbf{Foot joints}} \\
		MTPI–MTPV           & Metatarsophalangeal joints I–V (JSN) \\
		MTPIED/EP ... VED/EP & MTP joints I–V, distal/proximal erosion \\
		IP                   & Hallux interphalangeal joint (JSN) \\
		IPED / IPEP          & Hallux interphalangeal distal/proximal erosion \\
		
		\midrule
		\textbf{\code{<joint>\_ED}} & Distal joint part (erosion) \\
		\textbf{\code{<joint>\_EP}} & Proximal joint part (erosion) \\
		\bottomrule
	\end{tabular}
	\caption{
		Grouped joint and score abbreviations used in this work that contribute to the Sharp--van der Heijde score. Erosion applies separately to proximal (EP) and distal (ED) joint parts.}
	\label{tab:abbrev_joints}
\end{table}




\begin{figure}
	\floatconts
	{fig:FeatureSIM_ALL}
	{\vspace{\figcapspace}
	\caption{Feature similarity ($-L_2(\mathbf{v}_i, \mathbf{v}_j)$) between different chronologically ordered visits compared to the difference in score (ground truths). 
	}}
	{%
		\centering
		%\fbox{%
			\includegraphics[
			width=0.75\linewidth,
			trim={0mm 0mm 0mm 0mm}, % left bottom right top  (example values)
			clip=true
			]{./fig/fs3/all_ROIs_comparison_some_models10.pdf}%
			%}%
	}
	\vspace{\figaftervspace}
\end{figure}



%\let\clearpage\relax
%\let\newpage\relax
\end{document}
