\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution


\usepackage{bbm}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 193}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[Intelligent Lesion Selection]{Intelligent Lesion Selection: A Novel Method for Longitudinal Assessment of Breast Cancer Lung Metastases }



\midlauthor{\Name{Melika Qahqaie\nametag{$^{1,2}$}} \Email{melika.qahqaie@fau.de} \\
\Name{Veronika A. Zimmer\nametag{$^{2}$}} \Email{veronika.zimmer@siemens-healthineers.com} \\
\Name{Eduardo Casta\~{n}eda\nametag{$^{1,2}$}} \Email{eduardo.castaneda@fau.de} \\
\Name{Katariina Peltonen\nametag{$^{3}$}} \Email{katariina.h.peltonen@hus.fi} \\
\Name{Joonas Laaksolilja\nametag{$^{3}$}} \Email{joonas.laaksolilja@hus.fi} \\
\Name{Juho L\"{a}hteenmaa\nametag{$^{3}$}} \Email{juho.lahteenmaa@hus.fi} \\
\Name{Tobias Heimann\nametag{$^{2}$}} \Email{tobias.heimann@siemens-healthineers.com} \\
\Name{Andreas Maier\nametag{$^{1}$}} \Email{andreas.maier@fau.de} \\
\Name{Dominik Neumann\nametag{$^{2}$}} \Email{dominik.neumann@siemens-healthineers.com} \\
\\
\addr $^{1}$ Friedrich-Alexander-Universit\"{a}t Erlangen-N\"{u}rnberg, Pattern Recognition Lab, Erlangen, Germany \\
\addr $^{2}$ Digital Technology and Innovation, Siemens Healthineers, Erlangen, Germany \\
\addr $^{3}$ HUS Helsinki University Hospital, Comprehensive Cancer Center, Helsinki, Finland
}


\begin{document}
\maketitle
\begin{abstract}
Breast cancer, the second most common cancer globally, often metastasizes to the lungs, requiring frequent computed tomography (CT) scans to monitor disease progression. Manual analysis by radiologists is time-consuming and prone to variability, underscoring the need for automated systems to enhance accuracy and efficiency. The goal of such systems is to optimize processes like RECIST score calculation for tumor response assessment. This study presents a pipeline for the automated temporal analysis of breast cancer lung metastases. Existing lung nodule detection and segmentation models were adapted for detecting and segmenting breast cancer metastases. Registration-based lesion tracking was incorporated, and a novel Temporal Lesion Pair Classifier was developed to identify significant lesions and estimate tumor load evolution by summing their diameters, following an adaptation of the RECIST guidelines. Evaluated on a unique dataset of breast cancer patients, each with multiple annotated CT scans at different disease stages, the proposed pipeline demonstrated a 42\% reduction in median tumor size progression discrepancy for consecutive study pairs and improved tumor response classification accuracy by 22\% at the patient level.




\end{abstract}

\begin{keywords}
Longitudinal disease assessment, Lung metastasis in breast cancer, Computed Tomography (CT), Deep learning, Lesion detection and selection
\end{keywords}

\section{Introduction}
Breast cancer affects over 2.3 million individuals annually and is a leading cause of cancer-related death among women worldwide \citep{WHO}. It commonly metastasizes to specific organs, with the lungs being the second most frequent site after bones \citep{genes13091555, wang2019clinicopathological}. The prognosis for lung metastases is poor, with a 5-year survival rate of 16.8\% \citep{cancers13153725}, necessitating continuous CT scans to monitor disease progression and evaluate treatment effectiveness \citep{yang2020deep}.

In clinical trials, RECIST 1.1 guidelines are a framework widely used for measuring solid tumors and assessing changes in tumor size over time \citep{eisenhauer2009new}. Under RECIST 1.1, up to five target lesions are selected at baseline, with no more than two lesions per organ. These target lesions must be measurable, with non-nodal lesions requiring a minimum diameter of 10 mm as assessed by CT scans. The sum of diameters (SoD) of these target lesions is calculated at baseline and re-evaluated at each follow-up. Non-target lesions are monitored to determine stability, progression, or disappearance. The objective tumor response is categorized as complete response (CR), partial response (PR), stable disease (SD), or progressive disease (PD). PR indicates a $\geq 30\%$ reduction in SoD, PD an increase of $\geq 20\%$ or the appearance of new lesions, and SD reflects changes within these thresholds. CR is defined as the disappearance of all significant lesions. Automatic calculation of clinically relevant criteria, such as RECIST, requires robust methods for lesion detection, segmentation, tracking, and assessment across multiple time points.



Previous works on longitudinal analysis in cancer types and organs other than breast cancer have demonstrated the potential of automated methods. \citet{mukherjee2024image} focused on lesion matching across scans with varying annotations and scan parameters using image registration and the Hungarian algorithm, achieving accurate lesion correspondence. Other deep learning approaches have leveraged temporal information to improve malignancy predictions \citep{venkadesh2023prior, li2023time}. Only a few works concentrate on RECIST score estimation, which is essential for assessing treatment response in clinical settings. \citet{zhou2024deep} developed a pipeline for RECIST score estimation in liver cancer integrating lesion detection and image registration methods. This pipeline was trained on comprehensively annotated liver lesion data, with RECIST scores calculated by selecting target lesions from detected lesions. However, its performance was only evaluated on liver tumors, limiting its applicability to other cancer types.

Factors such as lesion size, morphology, and growth rate are critical for cancer prognosis; however, many studies do not consider temporal dynamics \citep{macmahon2017guidelines, liao2019evaluate}. Indeed, most current approaches for breast cancer metastasis focus on single time-point analyses, neglecting the need for longitudinal assessments critical for monitoring disease progression and assessing tumor burden over time \citep{moreau2021automatic, li2023time}. While the analysis of lung metastasis is crucial for evaluating disease progression in breast cancer, research in this area remains limited compared to studies on bone and lymph node metastases \citep{yang2020deep, liu2021axillary, moreau2020deep}.

Methods developed for lung cancer are often not directly applicable to breast cancer metastases in the lung. Pulmonary metastases from breast cancer frequently present as numerous well-defined nodules, whereas primary lung cancers typically appear as solitary, irregularly shaped nodules \citep{stana2025retrospective}. Existing datasets are often not designed to address the unique characteristics of breast cancer metastases in the lungs, posing additional challenges in developing robust, generalizable models.


This work presents an automated system for the temporal analysis of breast cancer metastases in the lungs using longitudinal 3D CT data. We leverage existing single-timepoint lung nodule detection and segmentation models trained on lung cancer images to detect and segment breast cancer metastases. Lesion tracking is performed using image registration. A novel Temporal Lesion Pair Classifier (TLPC) is introduced to identify temporally significant lesions for the automatic estimation of a RECIST-like score to assess disease progression. The ultimate goal is to provide a reliable, efficient, and precise tool for clinical decision support in the management of metastatic breast cancer. 



The key contributions of this study are two-fold. First, a complete pipeline for automated longitudinal analysis of metastatic lesions, integrating proven single-timepoint analysis modules, is proposed. Second, a novel Temporal Lesion Pair Classifier to identify significant lesions for estimating disease progression in alignment with an adaptation of the RECIST guidelines is proposed. The pipeline is evaluated on a unique dataset of breast cancer patients, with an average of 4 scans per patient. An expert radiologist identified and annotated up to 15 of the most significant lesions per patient, focusing on those showing notable growth or shrinkage.  

\section{Methods}
\subsection{Pipeline Description}


The pipeline for temporal analysis of disease progression (shown in Figure~\ref{fig:pair_cls_pipeline}) consists of five submodules: (i) lesion detection to identify potential candidates, (ii) lesion segmentation to determine lesion boundaries, (iii) lesion tracking, which aligns scans from consecutive studies and matches detected lesions to form lesion pairs for tracking changes over time, (iv) lesion pair identification using a novel method called the Temporal Lesion Pair Classifier (TLPC), which categorizes lesion pairs as either \textit{Significant} or \textit{Insignificant}, and (v) longitudinal analysis, where only pairs classified as \textit{Significant} are considered, ensuring that the system focuses on clinically relevant lesions.

\begin{figure}[ht]
    \centering
    \includegraphics[width=1\linewidth]{figures/experiments5.pdf}
    \caption{Overview of the proposed pipeline for automated longitudinal assessment of breast cancer lung metastases.}
    \label{fig:pair_cls_pipeline}
\end{figure}




\subsection{Lesion Detection and Segmentation}
\label{sec:detection}

The lung lesion detection process leverages a proprietary system developed by Siemens Healthineers, originally designed for lung cancer detection. This system is an adaptation of the baseline two-stage nodule detection framework presented by \citet{liu2020no}, with modifications to enhance lung lesion detection. It utilizes a RetinaNet-based detector \citep{ross2017focal} to identify candidate nodules, followed by candidate classification using an ensemble of DenseNet3D \citep{huang2017densely} and EfficientNet3D \citep{tan2019efficientnet} models, which generates nodule classification scores, helping to identify detected candidates that are more likely to be nodules. The models were trained on a variety of datasets, including NLST \citep{NLSTnational2011national}, LUNA \citep{lidc-idri}, and internal collections, ensuring robust performance across diverse nodule types. The detection outputs lesion bounding boxes, which are then fed into a DenseUNet-based segmentation model \citep{leotta2019urban}, originally developed for lung cancer nodule segmentation, to extract volumetric lesion masks.



\subsection{Lesion Tracking}
\label{sec:registration}
Lesion tracking ensures consistent monitoring of metastatic lesions across consecutive time points. Rigid registration was employed to align CT scans, as standardized acquisition protocols (such as breath-hold scanning) and minimized anatomical variability prevented large deformations between scans, making rigid registration sufficient for this purpose. The registration process employed anatomical landmarks identified using a deep reinforcement learning technique \citep{marschner2022deep}, which detected up to 80 landmarks per scan. These landmarks included key structures such as the spine, lung apex, clavicles, kidneys, and liver. The alignment was achieved using a least-squares rigid registration method based on singular value decomposition (SVD) of the cross-covariance matrix between two sets of 3D landmarks \citep{arun1987least}. Lesions across consecutive scans were matched using an Intersection over Union (IoU) threshold of 0.1. To ensure small lesions were not missed due to potential registration inaccuracies, a minimum lesion diameter of 20 mm was applied for matching purposes, temporarily assigning this value to lesions smaller than 20 mm. These thresholds were selected empirically on the validation set. Matched lesions were retained for temporal analysis, while unmatched lesions were excluded to filter out potential false positives. As a consequence, lesions that appeared or disappeared between scans were also excluded, since they cannot be tracked consistently over time. More details on lesion tracking are provided in Appendix~\ref{app:tracking_methods}.



\subsection{Lesion Pair Identification (Temporal Lesion Pair Classification)}
A major contribution of this study is the development of a Temporal Lesion Pair Classifier (TLPC) to address the challenge of identifying clinically significant lesions for RECIST assessment. Since radiologists typically annotate only the most relevant lesions—those exhibiting substantial shrinkage or growth over time—the TLPC ensures that the system focuses on these key lesions, distinguishing them from less relevant ones and enabling clinically meaningful longitudinal analysis.
The inputs to the TLPC are 3D lesion pairs extracted from consecutive CT images. During training, lesion pairs identified by the lesion tracking process described in Section~\ref{sec:registration} were used. These were labeled as \textit{Significant} if they were annotated by the radiologist, and \textit{Insignificant} if not. The \textit{Insignificant} category includes both false positives detected by the system and true lesions that were not annotated by the radiologist, as only the most clinically relevant lesions were selected for assessment. The TLPC incorporates DenseNet-based feature extractors pre-trained on lung cancer data, obtained from the system described in Section~\ref{sec:detection}, leveraging prior knowledge for lesion classification. For both training and inference, each lesion in the consecutive study pair was processed separately through parallel DenseNet instances, extracting features that were then concatenated and passed to a binary classifier. For the classification head of the model's architecture, a design replicating the original DenseNet framework was adopted, adjusted to process concatenated feature maps (lesion pair features). The classification architecture included an Adaptive Average Pooling layer followed by a fully connected layer for binary classification. This lightweight design, with only 4K parameters, was selected due to its efficiency and effective performance in lesion pair classification.

 

\subsection{Longitudinal Analysis}
To enable the automatic estimation of disease progression, this study incorporated temporal analysis based on an adaptation of the RECIST guidelines. The original RECIST classification includes the categories PD, PR, SD, and CR. In this study, the CR category was replaced with a ``No Lesions'' class, as no patients in the dataset exhibited complete response. This also enabled identification of cases with no significant lesions detected.



The longest diameter of the detected lesions was calculated by identifying boundary voxels within the segmentation mask of each axial slice and determining the maximum pairwise distance between all pairs of boundary points. For temporal analysis, the SoD at each time point was calculated by aggregating the diameters of the selected candidates identified in each image. These values were subsequently used to evaluate disease progression across consecutive study pairs and to calculate objective tumor response at the patient level.




\section{Data}
This study utilized a unique dataset provided by Helsinki University Hospital. It includes longitudinal CT scans from 94 breast cancer patients with lung metastases, with an average of 4 scans per patient. Radiologists annotated up to 15 significant lung lesions per patient, focusing on those exhibiting notable growth or shrinkage. The longest axial diameters of these lesions were recorded to enable temporal tracking of disease progression. As shown in \figureref{fig:annotaion overview}, the same lesions were annotated by the radiologist at each timepoint.


\begin{figure}[ht]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/annotations3.pdf}
    \caption{An overview of a patient's lesion data with temporal annotations.}
    \label{fig:annotaion overview}
\end{figure}

The CT studies were acquired using devices from Siemens Healthineers, GE Medical Systems, and Toshiba. The in-plane spatial resolution ranged from 0.47 mm $\times$ 0.47 mm to 0.98 mm $\times$ 0.98 mm. Slice thickness varied between 1.5 mm and 5 mm (3.3 $\pm$ 0.8 mm). The dataset was randomly divided into training, validation, and test sets at the patient level. Of the 394 studies, 80\% were used for training and validation (of which 80\% for training and 20\% for validation), and 20\% were reserved for testing. This resulted in 60 patients (252 studies) in the training set, 15 patients (63 studies) in the validation set, and 19 patients (79 studies) in the test set. Model and hyperparameter selection was conducted on the validation set.

\section{Experiments and Results}
We performed multiple experiments to assess the performance of individual submodules within the proposed pipeline and to evaluate its overall end-to-end performance for temporal analysis of disease progression. All experiments were conducted on a Tesla V100 GPU (NVIDIA Corporation) with 16 GB of dedicated memory.

\paragraph{\textbf{i) Lesion Detection:}}
The RetinaNet model, originally trained on lung cancer data, was fine-tuned to detect lung lesions from breast cancer metastases. The detection performance was compared to the original pretrained RetinaNet (from the proprietary system mentioned in Section~\ref{sec:detection}), and to a MONAI implementation of RetinaNet \citep{cardoso2022monai} trained on the LUNA dataset (available as `Lung Nodule CT Detection' in the MONAI model zoo).

The pretrained and fine-tuned detection models outperformed the MONAI model in terms of sensitivity with the pretrained model achieving the best performance at 0.81, compared to 0.79 for the fine-tuned model and 0.62 for the MONAI model. Due to the complexity of the RetinaNet model and limited data, fine-tuning led to increased detections but also a rise in false positives and false negatives, resulting in fewer true positives. Therefore, the pretrained model, with better overall performance, was selected for longitudinal analysis (further details in appendix~\ref{app:detection}). 




\paragraph{\textbf{ii) Lesion Tracking:}} For lesion tracking, we used the estimated rigid transformation to transform candidate lesions from the first scan into the coordinate system of the second scan. After a hyperparameter search (detailed in Appendix~\ref{app:tracking_experiment}), we selected an IoU threshold of 0.1 and a minimum diameter of 20 mm for lesion matching, where lesions smaller than 20 mm were temporarily assigned this value to prevent small lesions from being missed due to registration inaccuracies. This configuration achieved 84\% correct matches when applied to the ground truth data on the validation set.




\paragraph{\textbf{iii) Lesion Pair Identification:}}
The TLPC model was trained for 100 epochs using the original DenseNet classifier adapted for binary input with the feature extractor frozen. The preprocessing included clipping image intensities to the range $[-1024, 300]$\,HU, linear normalization to $[0,1]$, and resampling to a 0.5 mm isotropic resolution. Data augmentation included random rotations, flipping, zooming, and intensity adjustments to enhance model generalization. Class weights were applied in the weighted cross-entropy loss function to address the imbalance between 829 \textit{significant} and 9145 \textit{insignificant} lesion pairs in the training set. The Adam optimizer \citep{diederik2014adam} was used for training with a learning rate of \(1 \times 10^{-3}\).


The TLPC model achieved an accuracy of 87\% and a weighted F1-score of 0.89. The confusion matrix and ROC curve showing the evaluation results of the TLPC model on the validation set in Figure~\ref{fig:ex4_conf} indicate 413 false positives and 64 false negatives, with an AUC of 0.90. These results demonstrate the model's effectiveness in distinguishing significant from insignificant lesion pairs.
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/conf_TLPC3.pdf}
    \caption{Confusion matrix and ROC curve illustrating the TLPC model's classification performance on the validation set, classifying tracked lesion pairs as significant or insignificant.}
\label{fig:ex4_conf}
\end{figure}
\paragraph{\textbf{iv) Longitudinal Analysis:}}
We evaluated the whole pipeline (consisting of lesion detection, segmentation, tracking and identification) for temporal analysis on the test set. We considered both the estimation of disease progression for two consecutive time points and the estimation of the objective tumor response at the patient level. We compared our pipeline with a baseline that only includes lesion detection and segmentation at each time point and calculates the SoD of all detected lesions at each time point.

For the consecutive study pairs, to quantify tumor load dynamics and how well they are aligned with the ground truth (GT), the relative change in SoD between timepoint 1 and 2 was computed as
\begin{equation}
    \Delta_{\text{SoD}} = \frac{\text{SoD}_2 - \text{SoD}_1}{\text{SoD}_1},
\end{equation}
where $\text{SoD}_1$ and $\text{SoD}_2$ are the SoDs of timepoint 1 and 2. To assess the similarity between the predicted and GT trends, the absolute difference between their relative changes was calculated. This metric will be referred to as the Relative Change Discrepancy in Sum of Diameters:
\begin{equation}
    \text{RCD-SoD} = \left| \Delta_{\text{SoD}_\text{GT}} - \Delta_{\text{SoD}_\text{Prediction}} \right|.
\end{equation}


Patient-level tumor response was assessed across multiple time points by combining consecutive time-point analyses. The SoD was calculated at each time point based on lesions detected, matched, and classified as \textit{significant}. When inconsistencies in SoD arose between overlapping study pairs due to differences in lesion selection or classification, the average SoD was used. Finally, predictions and ground truth were evaluated using an adaptation of the RECIST criteria as a multi-class classification problem, categorizing responses into PD, SD, PR, and No Lesions. 



\textit{a) Disease progression for consecutive timepoints via SoD: }
Figure~\ref{fig:boxplot_relative_change}, left, compares the proposed pipeline to the baseline in terms of the RCD-SoD. The proposed approach achieved a lower median RCD-SoD of 10.73 compared to the baseline result of 18.71, representing a 42\% reduction ($p=0.001$ using a Wilcoxon signed-rank test).



\begin{figure}[ht]
    \centering
    \includegraphics[width=0.9\linewidth]{figures/temporal_merged3.pdf}
    \caption{Comparison of RCD-SoD values for the baseline and proposed pipeline for consecutive study pairs (left), and confusion matrices for both pipelines in patient-level evaluation of tumor response classification based on RECIST classes (right).}
    \label{fig:boxplot_relative_change}
\end{figure}
\textit{b) Disease progression at patient level via RECIST adaptation:}
The baseline and the proposed pipeline were assessed at the patient level, focusing on the full available patient history rather than only two consecutive scans as in previous experiments. This was done by aggregating the SoD values from the pairwise analysis for each time point. Figure~\ref{fig:boxplot_relative_change}, right, presents the confusion matrices, while Table~\ref{tab:cls_report_summary_simplified} summarizes the classification metrics. The proposed pipeline improved all metrics with respect to the baseline results. Most notably, it achieved a higher accuracy of 83\% compared with the baseline method (67\%).


\begin{table}[ht]
    \centering
    \caption{Classification performance of the Baseline and Proposed Pipelines for adapted RECIST scores. The RECIST classes include PR, SD, PD, and No Lesions.}
    \label{tab:cls_report_summary_simplified}

    \begin{tabular}{l|c|c|c|c}
        \hline
        \textbf{Method} & 
        \textbf{Accuracy} & 
        \textbf{Precision} & 
        \textbf{Recall} & 
        \textbf{F1-Score} \\ 
        
        &  & \textbf{(Wtd. Avg.)} & 
        \textbf{(Wtd. Avg.)} & 
        \textbf{(Wtd. Avg.)} \\ 
        
        \hline
        \textbf{Baseline} & 
        \textbf{0.67} & 
        \textbf{0.62} & 
        \textbf{0.67} & 
        \textbf{0.64} \\
        
        \textbf{Proposed Pipeline} &
        \textbf{0.83} & 
        \textbf{0.90} & 
        \textbf{0.83} & 
        \textbf{0.86} \\
        
        \hline
    \end{tabular}
\end{table}




\section{Discussion and Perspectives}

This work presents a deep learning-based pipeline for the longitudinal analysis of breast cancer lung metastases. By integrating lesion detection, segmentation, tracking, and identification techniques, the system estimates disease progression and objective tumor response in accordance with an adaptation of the RECIST guidelines. Evaluated on a unique dataset of 94 patients, the pipeline demonstrated significant improvements in tracking accuracy and reduction of false positives.

The evaluation of the TLPC model on the validation set showed a reduction in the number of false positives from 6337 detected lesions by the baseline detection method to 413 false positive lesion pairs. The TLPC model reduced the median RCD-SoD by 42\% for consecutive study pairs compared to the baseline and improved tumor response classification accuracy from 67\% to 83\% at the patient level. These results underscore the system's capability to enhance lesion tracking and provide clinically relevant insights, such as RECIST-based response evaluation.

Key limitations include the small dataset size, which constrained the fine-tuning of the detection model, and the reliance on rigid registration, which is less effective for long-term lesion tracking. Additionally, patient-level response estimation relied on aggregated study-pair results, limiting its precision. Future work should address these by improving lesion tracking across multiple time points, incorporating deformable registration, and handling cases with the emergence of new lesions or the complete disappearance of others.

Overall, this study introduces a fully automated end-to-end pipeline for longitudinal tumor load assessment in breast cancer lung metastases. A key contribution is the TLPC, which distinguishes clinically significant and insignificant lesions. By mimicking RECIST-based decision-making, TLPC enables a clinically meaningful, automated tumor response assessment. This framework offers a structured and clinically relevant solution for longitudinal tumor analysis, addressing a gap not extensively covered by existing approaches.

In conclusion, this study highlights deep learning's potential to automate disease progression estimation and RECIST score calculation, improving tracking consistency and reducing false positives to enhance clinical workflows.


\midlacknowledgments{This project has received funding from the European Union’s Horizon Europe Research and Innovation Programme under grant agreement No.~101095245.}

\bibliography{midl25_193}


\appendix
\section{Lesion Detection}
\label{app:detection}
\subsection{Experiment and Results}


For fine-tuning of the RetinaNet, CT scans in the training set were resampled to 1 mm isotropic resolution and data augmentation included cropping, flipping, zooming, rotations, and intensity adjustments to enhance generalization. Fine-tuning was performed using SGD with momentum \citep{sutskever2013importance}, focal loss, and a maximum of 1000 epochs, selecting the model with the lowest validation loss for evaluation. The optimizer was configured with a learning rate of \(1 \times 10^{-2}\), momentum of 0.9, weight decay of \(3 \times 10^{-5}\), and Nesterov acceleration. 

We compared the lesion detection results of the fine-tuned model with the original model trained on lung cancer data as described in Section~\ref{sec:detection}. Additionally, we compared against MONAI's RetinaNet implementation \citep{cardoso2022monai} with publicly available weights trained on the LUNA dataset for lung nodule detection.

The performance of the detection methods was evaluated by measuring sensitivity, along with the counts of true positives (TPs), false positives (FPs), and false negatives (FNs) per scan. To determine TPs, the center coordinates and radii of annotated lesions, calculated as half of their longest axial diameters, were used. Detected candidates’ center coordinates were compared to these annotations by calculating the Euclidean distance between their centers. A candidate was classified as a TP if it lay within the spherical region defined by the radius of the annotated lesion. Candidates that did not match any annotated lesions were classified as FPs. Sensitivity was calculated as the proportion of annotated lesions correctly detected, providing a comprehensive assessment of the detection method's performance.



The performance of the lesion detection models is presented in Table~\ref{tab:detection_eval}. Both the pretrained and fine-tuned detection models outperform the MONAI model in terms of maximum sensitivity, defined as the sensitivity achieved when considering all detected lesions without confidence threshold filtering, as well as the number of true positives (TPs). However, the MONAI model exhibits fewer false positives (FPs). The pretrained RetinaNet model outperformed the fine-tuned version in terms of sensitivity and had fewer FPs. While the fine-tuned model identified more candidates, it resulted in more FPs without significant improvement in sensitivity. Overall, the pretrained model demonstrated more reliable detection performance on the dataset, likely due to the limited size of the dataset available for fine-tuning. However, it produced a high number of false positives, which may include both insignificant lesions that were not annotated by the radiologist and non-lesions mistakenly detected by the model.
\begin{table}[ht]
    \centering
    \begin{tabular}{l|c|c|c|c}
    \hline
    Metric           & MaxS  & TP  & FP   & FN  \\ \hline
    MONAI Model      & 0.62  & 357 & 1527 & 214 \\
    Pretrained Model & 0.81  & 464 & 6337 & 107 \\
    Fine-tuned Model & 0.79  & 449 & 7645 & 122 \\  \hline
    \end{tabular}
    \caption{Performance comparison for all models measured by the maximum sensitivity (MaxS), true positives (TP), false positives (FP) and false negatives (FN).}
    \label{tab:detection_eval}
\end{table}



\section{Lesion Tracking}
\label{app:tracking}
\subsection{Methodology}
\label{app:tracking_methods}

The alignment between two consecutive CT scans was achieved using a least-squares rigid registration method based on singular value decomposition (SVD) of the cross-covariance matrix between two sets of 3D landmarks \citep{arun1987least}. Given two sets of $n$ corresponding landmarks, \(\mathbf{P},\mathbf{Q}\in\mathbbm{R}^{3\times n}\) with landmarks $\mathbf{p}_i, \mathbf{q}_i\in\mathbbm{R}^3, i=1,\dots,n$, the objective is to find a rigid transformation consisting of a rotation matrix \(\mathbf{R}\in\mathbbm{R}^{3\times 3}\) and a translation vector \(\mathbf{t}\in\mathbbm{R}^3\) that minimizes the sum of squared distances between corresponding points:
\begin{equation}
    \min_{\mathbf{R}, \mathbf{t}} \sum_{i=1}^{n} \left\| \mathbf{q}_i - (\mathbf{R} \mathbf{p}_i + \mathbf{t}) \right\|^2.
\end{equation}


The landmarks are first centered by subtracting their respective centroids $\mathbf{c}_P, \mathbf{c}_Q$. The cross-covariance matrix \(\mathbf{K}\) is then computed as
\begin{equation}
    \mathbf{K} = \sum_{i=1}^{n} \mathbf{p}_i' \mathbf{q}_i'^T, 
\end{equation}

where \(\mathbf{p}_i'\) and \(\mathbf{q}_i'\) represent the centered landmarks. Applying SVD to \(\mathbf{K}\): \(
\mathbf{K} = \mathbf{U} \mathbf{\Sigma} \mathbf{V}^T,
\) the optimal rotation matrix is calculated as \(\mathbf{R} = \mathbf{V} \mathbf{U}^T\). The translation vector is obtained as \(\mathbf{t} = \mathbf{c}_Q - \mathbf{R} \mathbf{c}_P\). The resulting rigid transformation \(\mathbf{T}\), comprising \(\mathbf{R}\) and \(\mathbf{t}\), ensures alignment of corresponding landmarks across consecutive images.



\subsection{Experiment and Results}
\label{app:tracking_experiment}
To evaluate the landmark-based rigid registration, the method was assessed by calculating the Target Registration Error (TRE) using available anatomical landmarks (mentioned in Section~\ref{sec:registration}), including the right and left primary bronchi, right and left lung tops, and carina bifurcation. The mean TRE of this approach was compared to a naïve translation-based registration using image centroids. The results showed a 98.2\% improvement, with the mean TRE reduced from 430 mm to 7.2 mm.

To determine the most effective criteria for lesion matching, various IoU thresholds and minimum diameters were tested. For each combination, the matched lesions were compared against the ground truth annotations to verify if they corresponded to the same lesion as identified by the radiologist. The criteria yielding the highest number of correct matches in the validation set were selected.
After evaluating various criteria, an IoU threshold of 0.1 and a minimum diameter of 20 mm were selected as the optimal parameters, achieving 84\% correct matches when applied to the ground truth data on the validation set. These settings consistently identified corresponding annotated lesions across consecutive time points. While an alternative criterion with a minimum diameter of 30 mm (using the same IoU threshold) achieved a slightly higher correct match rate of 85\%, it was ultimately not adopted because it also produced more incorrect matches (33 wrong matches) than the 20 mm threshold (4 wrong matches). Table~\ref{tab:tracking_results} presents results from the evaluation of candidate matching criteria for the various configurations.
\label{app:tracking_evaluation}

\begin{table}[ht]
\centering
\caption{Evaluation Results for various candidate matching criteria.}
\label{tab:tracking_results}
\begin{tabular}{l|c|c}
\hline
\textbf{Min. Diameter [mm]}        & \textbf{IoU}              &\textbf{Percentage of Correct Matches} \\ \hline
10                                 & 0.1                       & 60\%                      \\
10                                 & 0.3                       & 29\%                    \\ 
10                                 & 0.5                       & 13\%                      \\
\textbf{20}                        & \textbf{0.1  }            & \textbf{84\%}                   \\ 
20                                 & 0.3                      & 79\%                     \\ 
20                                 & 0.5                      & 49\%                     \\
\textbf{30}                     & \textbf{0.1}               & \textbf{85\% }                    \\ 
30                                 & 0.3                      & 79\%                     \\ 
30                                 & 0.5                      & 49\%                     \\ \hline
\end{tabular}
\end{table}



Finally, we compared our approach against an advanced lesion tracking method proposed by \citet{vizitiu2023multi}, which utilizes a multi-scale self-supervised learning framework for lesion tracking. Using the original criteria from that study, the method achieved 38\% correct matches. After adapting it to the minimum lesion matching diameter criteria, where all lesions below 20 mm were adjusted accordingly, performance improved to 70\% correct matches. However, it remained lower than the accuracy achieved with the proposed approach.


\end{document}
