\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{caption}
\usepackage[compact]{titlesec}
\usepackage{booktabs}
\usepackage{colortbl}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{mwe} % to get dummy images
\usepackage{xcolor}
\newcommand{\graymidrule}{\arrayrulecolor{gray}\midrule\arrayrulecolor{black}}

\makeatletter
\let\JMLR@Email\Email   % save jmlr/midl's \Email definition
\let\Email\relax        % make \Email look "undefined" to \newcommand
\usepackage{marvosym}   % now marvosym's \newcommand\Email won't error
\let\Email\JMLR@Email   % restore the original \Email used in the author block
\makeatother

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 59}
\editors{Accepted for publication at MIDL 2026}

\title[TAP-CT]{TAP-CT: 3D Task-Agnostic Pretraining of Computed Tomography Foundation
Models}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{
\Name{Tim Veenboer\nametag{$^{1}$}}
% \Email{n.moriakov@nki.nl}
\Name{George Yiasemis\nametag{$^{1,2}$}}
% \orcid{0000-0002-1348-8987}
% \Email{n.moriakov, g.yiasemis@nki.nl} \\
\Name{Eric Marcus\nametag{$^{1,3,*}$}}
% \Email{j.sonke@nki.nl}
\Name{Vivien {van Veldhuizen}\nametag{$^{1}$}}
\Name{Cees G. M. Snoek\nametag{$^{2}$}}
\Name{Jonas Teuwen\nametag{$^{1}$}}
\Name{Kevin B. W. {Groot Lipman}\nametag{$^{1}$}}\\
\Email{\{t.veenboer, k.groot.lipman\}@nki.nl}\\
\addr $^{1}$Netherlands Cancer Institute,
\addr $^{2}$University of Amsterdam,
\addr $^{3}$Kaiko,
\addr $^*$Contributed while at NKI
}
\begin{document}

\maketitle

\begin{abstract}
Existing foundation models (FMs) in the medical domain often require extensive fine-tuning or rely on training resource-intensive decoders, while many existing encoders are pretrained with objectives biased toward specific tasks. This illustrates a need for a strong, task-agnostic foundation model that requires minimal fine-tuning beyond feature extraction. In this work, we introduce a suite of task-agnostic pretraining of CT foundation models (TAP-CT): a simple yet effective adaptation of Vision Transformers (ViTs) and DINOv2 for volumetric data, enabling scalable self-supervised pretraining directly on 3D CT volumes.  Our approach incorporates targeted modifications to patch embeddings, positional encodings, and volumetric augmentations, making the architecture depth-aware while preserving the simplicity of the underlying architectures. We show that large-scale 3D pretraining on an extensive in-house CT dataset (105K volumes) yields stable, robust frozen representations that generalize strongly across downstream tasks. To promote transparency and reproducibility, and to establish a powerful, low-resource baseline for future research in medical imaging, we will release all pretrained models, experimental configurations, and downstream benchmark code at \url{https://huggingface.co/fomofo/tap-ct-b-3d}.

%Using simple decoders (e.g., a linear convolution) on these features, our models achieve state-of-the-art segmentation performance compared to features from existing public approaches across five downstream tasks (avg. +16 $\Delta$DSC $\uparrow$). We show that the models remain competitive with or surpass existing approaches on five classification tasks, while revealing a significant challenge in full-volume classification.
\end{abstract}

\begin{keywords}
CT, Foundation Models, Self-Supervised Learning, 3D DINOv2
\end{keywords}

\begin{figure*}[!ht]
    \centering
    \includegraphics[width=0.75\linewidth]{images/dinoct_img.pdf}
    \caption{\textbf{Pretraining and Evaluation of TAP-CT Foundation Models.} Models are pretrained using a novel 3D adaptation of the DINOv2 framework and subsequently evaluated based solely on the representational quality of their learned features.\vspace{-2.5em}}
    \label{fig:teaser}
\end{figure*}

\section{Introduction}

\label{sec:intro}

Computer vision in medical imaging faces significant challenges hindering widespread adoption of AI in healthcare, particularly the lack of annotated data \cite{zhou2021reviewdlmi}. At the same time, hospitals typically maintain substantial repositories of imaging data. Computed Tomography (CT) is among the most widely used imaging modalities, especially in cancer care. CT volumes are generally highly homogeneous: each scan consistently provides a partial or full 3D view of a patient's body. Variability across CT scans is relatively limited, as most anatomical structures appear consistently present across individuals. In principle, an AI model should be able to learn latent representations of these structures while simultaneously capturing subtle patient-specific abnormalities within the same representational space. These characteristics make CT an ideal candidate for self-supervised learning (SSL) and the pretraining of foundation models (FMs).
\\
\indent Yet, current training strategies of FMs in CT imaging are largely limited to either (1) approaches that require fine-tuning of the encoder or an extensive decoder to extract useful information from the learned representations \cite{pai2025ctfm, wu2024vocosimpleyeteffectivevolumecontrastive}, or (2) methods that use the encoder only for specific tasks \cite{Pai2024fmcib, he2024vista3d, li2024well}, as the pretraining objective biases the model toward those tasks. These models are referred to as FMs in the literature but can only be applied in specific settings, whereas task-agnostic SSL pretraining is expected to yield more general-purpose foundation models. Moreover, existing approaches are often pretrained on the same datasets used for evaluation, due to limited data availability. The objective of this work is to establish a CT-native foundation model whose representations remain robust across downstream tasks, providing a reliable baseline that avoids task-specific pretraining biases and eliminates the need for heavy fine-tuning.
\\
\indent There exist distinct families of SSL methods that can be employed to learn these transferable representations. Broadly, these methods can be categorized into two groups: generative and contrastive approaches. Generative methods focus on reconstructing images, commonly through masked image modeling (MIM) \cite{xie2022simmim, fang2023eva} or auto-regressive modeling \cite{el2024scalable}. In contrast, contrastive methods aim to learn representations by enforcing view invariance across different augmentations \cite{chen2020simple, he2020momentum, caron2021emerging}. Over time, generative approaches have been shown to produce strong local representations, whereas contrastive methods tend to emphasize global features. Several studies \cite{van2025joint, vanyan2023analyzing, assran2022masked, assran2023self} have demonstrated that pixel-level reconstruction objectives, such as those employed by Masked Autoencoders (MAE) \cite{he2022masked}, often require fine-tuning of the encoder to achieve strong downstream performance, including in recent 3D medical imaging applications \cite{wald2025revisiting}. This observation has motivated the development of latent-space reconstruction methods, in which the encoder predicts masked latent representations rather than raw image values \cite{zhou2021ibot, darcet2025cluster}.
\\
\indent DINOv2 \cite{oquab2023dinov2} combines elements of both generative and contrastive modeling by aligning global representations using the DINO objective \cite{caron2021emerging}, which encourages consistency between local and global views, while simultaneously learning local representations through masked reconstruction in representation space, inspired by iBOT \cite{zhou2021ibot}. We translate DINOv2’s local and global crops to volumetric data by introducing a GPU-based 3D random resized crop. We also redesign the masking strategy to be compatible with 3D inputs. In addition, we adapt an existing Vision Transformer (ViT) \cite{dosovitskiy2020image} implementation to handle volumetric data by modifying the patch embedding layer and the positional encoding grid. Using this implementation, we train a family of 2D and 3D ViTs with DINOv2-style pretraining on a large-scale, in-house dataset of 105K CT volumes to obtain high-quality feature extractors. Across a comprehensive set of public segmentation and classification benchmarks, we show that our models achieve state-of-the-art performance in frozen-feature segmentation using a linear decoder, while classification results highlight limitations in full-volume representation learning.  The key contributions of this work are summarized below:

% The pretrained models are thoroughly benchmarked against existing publicly available models on a wide range of classification and segmentation tasks.

\begin{itemize}[leftmargin=0pt, topsep=-1pt, itemsep=-3.5pt]
    \item We introduce \textit{TAP-CT}, a family of 6 task-agnostic foundation models for CT pretrained on a large in-house dataset of 105K CT volumes. We publish the pretrained weights to provide a strong, low-resource baseline for the medical imaging research community.
    \item We adapt the DINOv2 SSL framework and ViT architecture to handle volumetric inputs, facilitating comprehensive 3D pretraining on CT data.
    \item We release the code and configurations for benchmarking our FMs and other existing approaches to promote simple and standardized evaluation of vision foundation models in medical imaging.
\end{itemize}

\section{Related Work}
\label{sec:related_work}
\textbf{Transfer learning:} Prior work in transfer learning has demonstrated the effectiveness of transferable representations in CT imaging for both segmentation \cite{gao2024hermes, KARIMI2021transfersegmentation} and classification \cite{Kim2022transferclass}. SuPReM \cite{li2024well} further shows that transfer learning can substantially improve data efficiency and model performance in low-label regimes for CT segmentation. However, the models are pretrained with a supervised segmentation objective, resulting in a feature space that is heavily biased toward this task.
\\
\noindent\textbf{Biomarkers:} FMCiB \cite{Pai2024fmcib} extracts 50 mm$^3$ patches centered on a large set of lesions and applies SimCLR \cite{chen2020simple} to contrast lesion patches against non-lesion patches. This approach effectively models imaging biomarkers and achieves strong performance on downstream tasks such as nodule malignancy classification. Despite this, FMCiB focuses on small patches and is therefore unlikely to capture generalizable global representations. 
\\
\noindent\textbf{Promptable Image Segmentation:} Numerous adaptations of SAM \cite{kirillov2023segment} have been proposed for CT imaging \cite{Ma2024medsam, zhang2023sammed, cheng2023sammed2d, cheng2024hsam}. VISTA3D \cite{he2024vista3d} aggregates 2D SAM embeddings from multiple CT views into a 3D supervoxel representation, on top of which an encoder is subsequently trained. However, similar to SuPReM, its representations are still shaped by segmentation tasks.
\\
\noindent\textbf{General CT foundation models:} VoCo \cite{wu2024vocosimpleyeteffectivevolumecontrastive} leverages geometric patterns inherent in CTs within a contrastive learning paradigm to derive latent representations via a SwinUNETR encoder \cite{hatamizadeh2022swinunetrswintransformers}. The authors establish that VoCo surpasses current supervised approaches through pretraining on large-scale CT datasets. However, this performance is achieved by fine-tuning both the encoder and a task-specific SwinUNETR decoder. CT-FM \cite{pai2025ctfm} uses a SegResNet \cite{myronenko20183dmribraintumor} encoder pretrained with SimCLR on large public datasets. While the model demonstrates strong overall performance, it relies on end-to-end fine-tuning with a SegResNet decoder for segmentation.
\\
\noindent\textbf{DINOv2 in medical imaging:} Several studies have investigated the capabilities of the original DINOv2 ViTs on medical imaging tasks \cite{Hussien2025explainable, baharoon2024evaluatinggeneralpurposevision}. The DINOv2 framework has also been applied to X-ray imaging \cite{PérezGarcía2025raddino}, which is inherently 2D and thus requires no adaptations of the original framework. More recently, Curia \cite{dancette2025curiamultimodalfoundationmodel} employed regular DINOv2 training on a large-scale dataset of CT and MRI slices. In contrast, our work translates DINOv2 to the 3D domain and demonstrates the benefits of 3D over 2D pretraining, as reported in prior non-FM studies \cite{avesta2023comparison3d, ozgun2016unet3d}.
\\
\indent Collectively, prior work highlights both the potential and the fragmentation of CT foundation models: many approaches require extensive end-to-end fine-tuning, are task-specific, or restrict pretraining to 2D slices. On the other hand, our goal is to develop a large-scale, task-agnostic, 3D-pretrained CT foundation model that produces off-the-shelf, readily usable features.

\section{Methodology}
\label{sec:methodology}

We first describe our adaptations to DINOv2 and ViTs from 2D to 3D, followed by the dataset and preprocessing. Finally, we present an overview of the ten downstream tasks.

\subsection{Pretraining setup}

\textbf{DINOv2:} DINOv2 is a joint-embedding architecture in which a student model learns to construct informative latent representations by comparing its outputs with the teacher model, which is a momentum-updated replica of the student. The framework uses two objectives: the DINO \cite{caron2021emerging} loss aligns representations of local and global crops for robust image-level representation, while the iBOT \cite{zhou2021ibot} loss leverages a subset of \texttt{[MASK]} tokens in the student’s global crops. Here, the teacher processes all tokens, and the student has to match the teacher’s representations for the masked tokens. We refer to \cite{oquab2023dinov2} for comprehensive details. 
\\
\noindent\textbf{Local and global crops:} The adaptation of the random resized crop transform from the 2D domain to the 3D domain brings along certain considerations. In 2D, the transformation stochastically selects a crop based on area, samples a random aspect ratio, and resizes the crop to a fixed target size. In 3D, implementing random resized cropping requires a volumetric region, which faces two constraints: computational limitations caused by interpolation of the crop, and the varying number of slices per CT. The axial dimension is often $(512, 512)$, but $z$ is inconsistent relative to height and width; thus, sampling a height--depth ratio becomes non-trivial.
\\
\indent A more practical and effective solution, adopted in this work, selects an area and its aspect ratio in the axial plane and extends the crop by a fixed number of slices along the depth axis. This approach also acts as an implicit augmentation, since the physical slice spacing in world-coordinate $z$-space varies across CT volumes. To mitigate the substantial overhead of CPU-bound interpolation, we implement the 3D random resized crop on the GPU.
\\
\noindent\textbf{Masking strategy:} We also adapt the masking strategy of DINOv2 to volumetric data. For each global crop, we randomly sample multiple masked regions until the desired number of masked patches is reached. In 2D, an area and subsequently a height–width aspect ratio are sampled for each region. In contrast to our crop sampling, we extend masking to 3D by sampling a height–depth aspect ratio, since depth does not vary across crops. Consequently, the masks tend to be more cube-like, with height–width and height–depth aspect ratios that are relatively similar.
\\
\noindent\textbf{Pretraining hyperparameters:} The training regimen mostly follows \cite{oquab2023dinov2}. We train for 125,000 iterations with batch size 2048 on 8 H100 GPUs. We observed that increasing the learning rate warmup phase from 12,500 to 25,000 iterations was critical to prevent early representational collapse. The full set of hyperparameters is listed in Appendix \ref{app:pretraining}. 
\\
\noindent\textbf{ViT adjustments:} We adapt the ViT by extending the patch embedding layer from a 2D to a 3D convolution. In addition, the learned positional encoding is interpolated onto a 3D grid to account for positional variations along the $z$-axis.
\\
\noindent\textbf{TAP-CT:} As a baseline, we train 2D models with a global crop size of (224, 224), a local crop size of (96, 96), and a patch size of (16, 16), following \cite{oquab2023dinov2}. For 3D volume experiments, we consider two distinct configurations. The first configuration employs a global crop size of (6, 224, 224), a local crop size of (6, 96, 96), and a patch size of (1, 16, 16), termed 2.5D, given its 3D volume and 2D patch size. The second configuration utilizes a global crop size of (12, 224, 224), a local crop size of (12, 96, 96), and a patch size of (4, 8, 8), termed 3D. The decision to keep $z$ consistent between global and local crops was derived empirically; see Appendix \ref{app:ablation_local_crop}. For each configuration, we train both ViT-S and ViT-B models. For brevity, we denote these models as TAP-S/B-2D, TAP-S/B-2.5D, and TAP-S/B-3D, respectively.

\subsection{Dataset}

The pretraining dataset comprises 104,405 CT volumes from 19,995 oncological patients, with a mean age of 63 years. The scans contain an average of 316 slices, totaling 32,973,620 slices across the dataset. The median voxel spacing is 0.79\,mm $\times$ 0.79\,mm $\times$ 1.5\,mm along the $x$, $y$, and $z$ axes, respectively. Distributions of patient age, sex, and scanner manufacturer are provided in Appendix \ref{app:dataset_specs}.
\\
\noindent\textbf{Preprocessing:} We extract the mean, standard deviation, 0.5th and 99.5th percentiles of the foreground voxels in the dataset, following \texttt{nnUNetv2} \cite{isensee2021nnu}. We use these to clip and normalize the volumes. Exact values are listed in Appendix \ref{app:dataset_specs}.
\\
\noindent\textbf{SSL Augmentations:} Since CTs are single-channel, the color-based augmentations used in DINO are replaced with a random gamma adjustment, while the random Gaussian blur transformation is retained. Prior research showed that for shorter training regimens, such as ours, data augmentations can play a significant role in model performance \cite{moutakanni2024dontneeddomainspecificdata}, which is why the augmentations are replaced. All augmentations are performed on the GPU to speed up training.

\subsection{Downstream tasks}

\begin{table}[]
    \caption{Downstream task details for each publicly available model and ours. For Curia, volumes are resized to $(z, 512, 512)$. For TAP-CT we evaluate both on $(z, 224, 224)$ and $(z, 512, 512)$. An asterisk (*) indicates that only the encoder is used.}
    \label{table:model_sizes}
    \centering
    \setlength{\tabcolsep}{0.2em}
    \resizebox{\textwidth}{!}{\begin{tabular}{|l|lllll|}
    \hline
    \textbf{Model}   & \textbf{Architecture} & \textbf{Params} & \textbf{Spacing} &
    \textbf{Window Size} & \textbf{Data Size} \\ \hline
    \textbf{CT-FM}   & SegResNet*            & 77.8M           & (3.0, 1.0, 1.0) & (24, 128, 128) & 148K \\
    \textbf{Curia}   & ViT Base (2D)         & 86.0M           & -               & (1, 512, 512)  & 150K \\
    \textbf{FMCiB}   & ResNet                & 184.5M          & (1.0, 1.0, 1.0) & (64, 64, 64)   & 11.5K \\
    \textbf{SuPReM}  & UNet*                 & 19.1M           & (1.5, 1.5, 1.5) & (96, 96, 96)   & 2.1K \\
    \textbf{VISTA3D} & SegResNet*            & 175.0M          & (1.5, 1.5, 1.5) & (128, 128, 128) & 11.5K \\
    \textbf{VoCo}    & SwinUNETR Base*       & 53.2M           & (1.5, 1.5, 1.5) & (96, 96, 96)   & 160K \\
    \textbf{TAP-B-3D (ours)} & ViT Base (3D) & 86.0M & - & (12, 224, 224) & 105K \\\hline
    \end{tabular}}
\end{table}

To assess the performance of pretrained FMs (Table \ref{table:model_sizes}), we conduct a series of downstream segmentation and classification experiments. We compare against publicly available pretrained vision-only encoders. We consider models that incorporate language supervision during pretraining outside the scope of this evaluation. ViT-B models from both DINOv2 and DINOv3 \cite{simeoni2025dinov3, liu2025does} are included to serve as natural-image baselines.

\noindent\textbf{Window-based inference:} For each task, embeddings are first extracted using a sliding window approach \cite{cardoso2022monai}, after which a single layer is trained to exclusively evaluate the representational strength of the encoder. We set the window size to the native input size of each encoder, and use an overlap of 0.75$\times$ the window size per dimension, equivalent to a stride of 0.25$\times$ the window size. Overlapping embeddings are merged via Gaussian averaging.

\noindent\textbf{Frozen-encoder evaluation:} Throughout this work, we explicitly focus on evaluating the frozen features of all encoders. This enables a direct comparison with existing foundation models in terms of representational quality and is not intended as a comparison to the current state-of-the-art in segmentation and classification approaches. The purpose is to show the value of task-agnostic SSL pretraining for improving foundation model representations.

\indent All evaluations are implemented in EVA \cite{kaiko.ai2024eva}. The evaluation procedure is illustrated in Figure \ref{fig:teaser}, while further details of downstream task specifics, along with a summary overview of all datasets, are provided in Appendix \ref{app:downstreams}.

\subsubsection{Segmentation}
\label{sec:downstream-tasks-segmentation}
We leverage a single linear convolution layer that maps the frozen encoder embeddings to segmentation logits. Performance is measured using the macro-averaged Dice Similarity Coefficient (DSC) over all non-background classes. Following \cite{isensee2024nnu}, we provide a fully supervised nnU-Net 3D ResEnc (M) baseline and we employ datasets found to be most effective for benchmarking 3D medical image segmentation.
\\
\indent\textbf{AMOS22} \cite{ji2022amos} has abdominal annotations for 15 different organs. \textbf{KiTS23} \cite{heller2023kits21} is focused on kidney, kidney tumor and kidney cyst segmentation. Furthermore, we add three more datasets for broader comparison: \textbf{TotalSegmentator v2} \cite{wasserthal2023totalsegmentator}, a full-body segmentation dataset annotated with 117 distinct anatomical structures. To simplify evaluation and reduce redundancy, we merge certain classes, resulting in 49 distinct classes (Appendix \ref{app:downstreams}).  \textbf{LiTS17} \cite{Bilic2022lits} contains CTs for liver and liver tumor segmentation. \textbf{MSD Pancreas Tumor} \cite{simpson2019largeannotatedmedicalimage} contains labels for the pancreas and any associated lesions.

\subsubsection{Classification}
\label{sec:downstream-tasks-classification-method}
We evaluate classification tasks using an Attention-Based Multiple Instance Learning (ABMIL) head \cite{ilse2018attentionbaseddeepmultipleinstance}. The ABMIL head is applied to the \texttt{[CLS]} embeddings when available, and to patch embeddings otherwise. The classification datasets are:
\\
\indent \textbf{LUNA16} \cite{SETI2017LUNA16, armato2015lidc}, a lung nodule malignancy task based on radiologists' verdict. For this task, each encoder extracts features from a 50mm$^3$ crop centered around the lesion. The metric reported for this downstream task is area under the ROC curve (AUC). Given that a commonly used split \cite{Pai2024fmcib} is on nodule level, and thus some patients/scans are both in training and test set, we use a new patient-level split. \textbf{LUNA25} \cite{peeters2025luna25_imaging, peeters2025luna25_annotation} is another lung nodule malignancy task but with pathologically confirmed labels. A 50mm$^3$ crop is extracted around each nodule. The reported metric is AUC. \textbf{RSNA2023} \cite{Hermans2025rsna} is a dataset focused on multi-label scan-level abdominal trauma classification. We consider only injuries to the kidney, spleen, and liver, and omit cases of extravasation and injuries to the bowel. The evaluation metric for this task is the micro-averaged multi-label AUC. \textbf{RSNA2022} \cite{Lin2023RSNAcervical} is a multi-label scan-level classification task to identify cervical spine fractures across vertebrae. Due to class imbalance, performance is evaluated using the micro-averaged multi-label Average Precision (AP). \textbf{FDG-PET-CT} \cite{gatidis2022fdgpetlesion} is a full-body dataset of patients diagnosed with three distinct cancer subtypes. For this downstream task, we used the diagnostic CTs and formulated a scan-level binary task, classifying tumor presence. The evaluation metric for this task is AUC.

\section{Results \& Analysis}
\label{sec:results}

\noindent The following section outlines the results obtained across downstream tasks, along with an analysis and ablation studies conducted in this work.

\subsection{Segmentation}

The segmentation results of TAP-B-3D compared with publicly available pretrained models are presented in Table \ref{tab:segmentation-results}, along with a supervised baseline. TAP-B-3D shows strong downstream capabilities, achieving improvements of 5 to 23 percentage points in DSC over the next-best FM, Curia. Table \ref{table:own_models_segmentation} summarizes the results for all TAP-CT models. TAP-B-3D, the largest model with the largest input context, performs best on the $(z, 224, 224)$ evaluation, while TAP-B-2.5D leads on the $(z, 512, 512)$ evaluation. Overall, segmentation quality largely scales with model dimensionality, model size, and input resolution. Frozen encoder embeddings with a linear probe still fall short of the performance achieved by fully supervised encoder–decoder setups such as nnU-Net.

\subsection{Analysis}
\label{subsec:seg_analysis}

\textbf{Public foundation models}: Among the publicly available pretrained models, Curia achieves the strongest overall performance, which is expected given its use of task-agnostic pretraining through DINOv2. SuPReM attains the second-best performance, likely benefiting from its supervised pretraining with a segmentation objective on Abdomen Atlas 1.1 \cite{li2024abdomenatlas}, which encompasses all downstream segmentation datasets considered in this work. The weaker performance of VoCo may stem from its pretraining design, which employs a large U-shaped decoder that relies on multilevel feature representations; as a result, its pretraining objective does not necessarily encourage the final-layer features to encode the bulk of the semantic information. FMCiB, which is pretrained on limited regions centered on lesions to extract specific biomarkers, is expected to underperform on full-volume segmentation tasks.

\begin{table}[t]
    \centering
    \caption{Segmentation performance after fine-tuning a single linear convolution layer on frozen encoder embeddings, except for the nnU-Net 3D ResEnc (M) supervised baseline. We report the mean Dice Similarity Coefficient (DSC) $\pm$ standard deviation averaged over three runs. \textbf{Best} results are bolded, while \underline{second-best} results are underlined. OOM = Layer ran out of memory on an H100 80GB GPU. (\Cross) = Model’s pretraining data included these public datasets. (*) = Evaluated on ($z$, 224, 224): H100 80GB went OOM on ($z$, 512, 512).\vspace{-0.5em}}
    \label{tab:segmentation-results}
    \setlength{\tabcolsep}{0.1em}
    \resizebox{\textwidth}{!}{\begin{tabular}{lcccccc}
    \toprule
    \textbf{Model}  & \textbf{AMOS22} (DSC) & \textbf{LiTS17} (DSC) & \textbf{KiTS23} (DSC) & \textbf{TotalSeg.} (DSC) & \textbf{MSD Pancreas} (DSC) & \textbf{Average} \\ 
    \midrule
    \textcolor{gray}{nnU-Net} & \textcolor{gray}{0.904} & \textcolor{gray}{0.781} & \textcolor{gray}{0.749} & \textcolor{gray}{0.879} & \textcolor{gray}{0.682} & \textcolor{gray}{0.799} \\
    \graymidrule
    Curia           & \underline{0.669} ($\pm$ .006) & \underline{0.571} ($\pm$ .003) & \underline{0.429} ($\pm$ .006) & \underline{0.425}* ($\pm$ .002) & \underline{0.350} ($\pm$ .003) & \underline{0.489} \\ 
    SuPReM\Cross          & 0.450 ($\pm$ .004) & 0.440 ($\pm$ .002) & 0.363 ($\pm$ .002) & 0.353 ($\pm$ .002) & 0.301 ($\pm$ .000) & 0.381 \\
    CT-FM\Cross         & 0.417 ($\pm$ .008) & 0.416 ($\pm$ .013) & 0.265 ($\pm$ .006) & 0.317 ($\pm$ .000) & 0.213 ($\pm$ .009) & 0.326 \\
    VISTA3D\Cross         & 0.364 ($\pm$ .001) & 0.377 ($\pm$ .014) & 0.243 ($\pm$ .010) & 0.226 ($\pm$ .000) & 0.161 ($\pm$ .002) & 0.274 \\
    VoCo\Cross            & 0.120 ($\pm$ .004) & 0.345 ($\pm$ .014) & 0.176 ($\pm$ .004) & 0.072 ($\pm$ .000) & 0.120 ($\pm$ .001) & 0.167\\
    FMCiB           & 0.061 ($\pm$ .008) & 0.362 ($\pm$ .002) & 0.110 ($\pm$ .018) & OOM               & 0.051 ($\pm$ .002) & 0.146\\ 
    \graymidrule
    \textcolor{gray}{DINOv2-B} 
        & \textcolor{gray}{0.376 ($\pm .003$)} 
        & \textcolor{gray}{0.442 ($\pm .003$)} 
        & \textcolor{gray}{0.321 ($\pm .006$)} 
        & \textcolor{gray}{0.345 ($\pm .004$)} 
        & \textcolor{gray}{0.204 ($\pm .003$)} 
        & \textcolor{gray}{0.338} \\
    \textcolor{gray}{DINOv3-B} 
        & \textcolor{gray}{0.331 ($\pm .005$)} 
        & \textcolor{gray}{0.423 ($\pm .003$)} 
        & \textcolor{gray}{0.325 ($\pm .003$)} 
        & \textcolor{gray}{0.333 ($\pm .005$)} 
        & \textcolor{gray}{0.192 ($\pm .003$)} 
        & \textcolor{gray}{0.321} \\
    \midrule
    TAP-B-3D & \textbf{0.724} ($\pm$ .001) & \textbf{0.626} ($\pm$ .004) & \textbf{0.480} ($\pm$ .005) & \textbf{0.651}* ($\pm$ .001) & \textbf{0.429} ($\pm$ .003) & \textbf{0.582} \\
    \bottomrule
    \end{tabular}}
    \vspace{-1.5em}
\end{table}

\begin{table*}[t]
\centering
\caption{Segmentation performance after fine-tuning a single linear convolution layer on frozen encoder embeddings, averaged over three runs. For each task, we report the mean Dice Similarity Coefficient (DSC) $\pm$ standard deviation. \textbf{Best} results are bolded, while \underline{second-best} results are underlined.}
\label{table:own_models_segmentation}
\setlength{\tabcolsep}{0.1em}
\resizebox{\textwidth}{!}{%
\begin{tabular}{c l c c c c c c c}
\toprule
% leftmost header intentionally blank for the rotated group labels
& \textbf{Model}  & \textbf{Patch Size} & \textbf{Image Size} & \textbf{AMOS22} & \textbf{LiTS17} & \textbf{KiTS23} & \textbf{TotalSeg.} & \textbf{MSD Pancreas} \\ 
\multicolumn{4}{c}{} & (DSC) & (DSC) & (DSC) & (DSC) & (DSC) \\
\midrule
% ---------- Eval size = 224 block (6 rows) ----------
\multirow{6}{*}{\rotatebox{90}{\parbox{3.0cm}{\centering \textbf{($\mathbf{z}$, 224, 224)}}}}
 & TAP-S-2D    & (16, 16)      & (224, 224) & 0.545 ($\pm$ .002) & 0.513 ($\pm$ .002) & 0.387 ($\pm$ .002) & 0.482 ($\pm$ .001) & 0.301 ($\pm$ .004) \\
 & TAP-B-2D    & (16, 16)      & (224, 224) & 0.562 ($\pm$ .002) & 0.537 ($\pm$ .001) & 0.406 ($\pm$ .002) & 0.520 ($\pm$ .001) & 0.308 ($\pm$ .003) \\
 & TAP-S-2.5D  & (1, 16, 16)   & (6, 224, 224) & 0.553 ($\pm$ .001) & 0.537 ($\pm$ .001) & 0.435 ($\pm$ .004) & 0.508 ($\pm$ .003) & 0.315 ($\pm$ .001) \\
 & TAP-B-2.5D  & (1, 16, 16)   & (6, 224, 224) & 0.577 ($\pm$ .004) & 0.554 ($\pm$ .006) & \textbf{0.457} ($\pm$ .004) & 0.540 ($\pm$ .005) & 0.341 ($\pm$ .001) \\
 & TAP-S-3D    & (4, 8, 8)     & (12, 224, 224) & \underline{0.633} ($\pm$ .003) & \underline{0.572} ($\pm$ .001) & 0.445 ($\pm$ .003) & \underline{0.612} ($\pm$ .001) & \underline{0.373} ($\pm$ .003) \\
 & TAP-B-3D    & (4, 8, 8)     & (12, 224, 224) & \textbf{0.648} ($\pm$ .001) & \textbf{0.583} ($\pm$ .006) & \underline{0.453} ($\pm$ .005) & \textbf{0.651} ($\pm$ .001) & \textbf{0.395} ($\pm$ .002) \\
\arrayrulecolor{gray}\specialrule{0.3pt}{0.5\jot}{0.1pc}
% ---------- Eval size = 512 block (6 rows) ----------
\multirow{6}{*}{\rotatebox{90}{\parbox{3.0cm}{\centering \textbf{($\mathbf{z}$, 512, 512)}}}}
 & TAP-S-2D    & (16, 16)      & (224, 224) & 0.700 ($\pm$ .001) & 0.551 ($\pm$ .002) & 0.447 ($\pm$ .002) & - & 0.390 ($\pm$ .003) \\
 & TAP-B-2D    & (16, 16)      & (224, 224) & 0.722 ($\pm$ .000) & 0.583 ($\pm$ .003) & 0.479 ($\pm$ .004) & - & 0.420 ($\pm$ .002) \\
 & TAP-S-2.5D  & (1, 16, 16)      & (6, 224, 224) & 0.699 ($\pm$ .002) & 0.574 ($\pm$ .004) & \underline{0.496} ($\pm$ .006) & - & 0.412 ($\pm$ .003) \\
 & TAP-B-2.5D  & (1, 16, 16)      & (6, 224, 224) & \textbf{0.736} ($\pm$ .002) & \underline{0.597} ($\pm$ .002) & \textbf{0.536} ($\pm$ .004) & - & \textbf{0.449} ($\pm$ .005) \\
 & TAP-S-3D    & (4, 8, 8)      & (12, 224, 224) & 0.711 ($\pm$ .001) & 0.584 ($\pm$ .008) & 0.458 ($\pm$ .005) & - & 0.422 ($\pm$ .003) \\
 & TAP-B-3D    & (4, 8, 8)      & (12, 224, 224) & \underline{0.724} ($\pm$ .001) & \textbf{0.626} ($\pm$ .004) & 0.480 ($\pm$ .005) & - & \underline{0.429} ($\pm$ .003) \\
\bottomrule
\end{tabular}%
    }
\vspace{-1.5em}
\end{table*}
\noindent\textbf{The scaling of TAP-CT}: The substantial performance gap between TAP-B-3D, Curia, and other publicly available models highlights the need for general, high-capacity vision encoders in CT imaging that produce robust, standalone feature representations. The comparison in Table \ref{table:own_models_segmentation} of TAP-CT variants on volumes resized to $(z, 224, 224)$ -- their native axial input resolution -- demonstrates that foundation models for CT seem to adhere to conventional scaling laws: larger models consistently outperform smaller ones, and increased input context leads to improved results. The gains observed when moving from 2D to 3D further emphasize the importance of volumetric encoders over purely slice-based approaches. 
\\
\indent This trend is less apparent when evaluating frozen features on volumes resized to ($z$, 512, 512), closer to native CT resolution. Base models still outperform smaller ones, but the 2D–3D gap narrows, likely because sliding-window inference remains limited by each model’s native input size. Pretraining 3D models at $512 \times 512$ resolution would likely yield gains similar to those observed at 224, but at substantially higher computational cost. A possible explanation for the stronger performance of the 2D and 2.5D models relative to the 3D models at 512-resolution is their larger patch extent in the $x$–$y$ plane rather than along $z$. Although all models have equal numbers of voxels per patch, resizing volumes to 512 solely affects the $x$–$y$ dimensions. As a result, the sliding-window evaluation may favor models whose patch structure allocates more capacity to these axes. We hypothesize that adopting a post-training strategy on high resolution as displayed in DINOv3 could yield further performance gains.

%Evaluation at 512 already increases runtime considerably; for example, a single TAP-B-3D run on AMOS22 takes about three hours at 512 compared with 30 minutes at 224 on a single H100. 

\begin{table}[t]
    \centering
    \caption{Classification results after fine-tuning an Attention-Based Multiple Instance Learning (ABMIL) model on frozen \texttt{[CLS]} embeddings, when available, or on frozen patch embeddings otherwise. Each metric (AP, AUC) is reported as mean $\pm$ standard deviation averaged over three runs. \textbf{Best} results are bolded, while \underline{second-best} results are underlined.}
    \label{table:classification-results}
    \setlength{\tabcolsep}{0.1em}
    \resizebox{\textwidth}{!}{\begin{tabular}{lccccc}
    \toprule
    \textbf{Model}  & \textbf{LUNA16} (AUC) & \textbf{LUNA25} (AUC) & \textbf{RSNA2022} (AP) & \textbf{RSNA2023} (AUC) & \textbf{FDGPETCT} (AUC) \\
    \midrule
    Curia           & 0.860 ($\pm$ .005) & \underline{0.856} ($\pm .002$) & \underline{0.408} ($\pm$ .022) & \textbf{0.748} ($\pm$ .013) & \textbf{0.877} ($\pm$ .011) \\ 
    SuPReM          & 0.777 ($\pm$ .030) & 0.788 ($\pm$ .025) & 0.341 ($\pm$ .002) & 0.598 ($\pm$ .004) & 0.598 ($\pm$ .009) \\
    CT-FM           & \textbf{0.876} ($\pm$ .011) & 0.847 ($\pm$ .005) & 0.337 ($\pm$ .015) & 0.589 ($\pm$ .005) & 0.694 ($\pm$ .009) \\
    VISTA3D         & 0.868 ($\pm$ .021) & \textbf{0.866} ($\pm$ .008) & 0.352 ($\pm$ .005) & 0.603 ($\pm$ .006) & 0.637 ($\pm$ .011) \\
    VoCo            & 0.620 ($\pm$ .004) & 0.632 ($\pm$ .006) & 0.359 ($\pm$ .004) & 0.610 ($\pm$ .004) & 0.599 ($\pm$ .050) \\
    FMCiB           & 0.776 ($\pm$ .012) & 0.702 ($\pm$ .054) & 0.347 ($\pm$ .001) & 0.605 ($\pm$ .015) & 0.587 ($\pm$ .033) \\
    \graymidrule
    \textcolor{gray}{DINOv2-B} 
        & \textcolor{gray}{0.818 ($\pm .020$)} 
        & \textcolor{gray}{0.864 ($\pm .002$)} 
        & \textcolor{gray}{0.380 ($\pm .012$)} 
        & \textcolor{gray}{0.634 ($\pm .003$)} 
        & \textcolor{gray}{0.763 ($\pm .019$)} \\
    \textcolor{gray}{DINOv3-B} 
        & \textcolor{gray}{0.855 ($\pm .005$)} 
        & \textcolor{gray}{0.869 ($\pm .004$)} 
        & \textcolor{gray}{0.376 ($\pm .013$)} 
        & \textcolor{gray}{0.605 ($\pm .008$)} 
        & \textcolor{gray}{0.780 ($\pm .020$)} \\
    \midrule
    TAP-B-3D & \textbf{0.876} ($\pm$ .006) & 0.855 ($\pm$ .007) & \textbf{0.420} ($\pm$ .019) & \underline{0.658} ($\pm$ .022) & \underline{0.798} ($\pm$ .027) \\
    \bottomrule
    \end{tabular}}    
\end{table}

\begin{table}[]
\centering
\caption{Classification results of the TAP-CT ViT models: fine-tuning an Attention-Based Multiple Instance Learning (ABMIL) model on frozen \texttt{[CLS]} embeddings, when available, or on frozen patch embeddings otherwise. Each metric (AP, AUC) is averaged over three runs, with the corresponding standard deviation. \textbf{Best} results are bolded, while \underline{second-best} results are underlined.}
\label{table:own_models_classification}
\setlength{\tabcolsep}{0.1em}
\resizebox{\textwidth}{!}{\begin{tabular}{lccccccc}
\toprule
\textbf{Model}  & \textbf{Patch Size} & \textbf{Image Size} & \textbf{LUNA16} & \textbf{LUNA25} & \textbf{RSNA2022} & \textbf{RSNA2023} & \textbf{FDGPETCT} \\ 
\multicolumn{3}{c}{} & (AUC) & (AUC) & (AP) & (AUC) & (AUC) \\
\midrule
TAP-S-2D & (16, 16) & (224, 224) & 0.830 ($\pm$ .003) & 0.833 ($\pm$ .005) & 0.390 ($\pm$ .017) & 0.663 ($\pm$ .015) & 0.789 ($\pm$ .011) \\
TAP-B-2D & (16, 16) & (224, 224) & 0.854 ($\pm$ .002) & \textbf{0.857} ($\pm$ .003) & 0.385 ($\pm$ .010) & 0.667 ($\pm$ .006) & 0.757 ($\pm$ .041) \\
TAP-S-2.5D & (1, 16, 16) & (6, 224, 224) & \textbf{0.886} ($\pm$ .010) & 0.809 ($\pm$ .003) & \underline{0.439} ($\pm$ .020) & \textbf{0.748} ($\pm$ .007) & \textbf{0.889} ($\pm$ .008) \\
TAP-B-2.5D & (1, 16, 16) & (6, 224, 224) & 0.815 ($\pm$ .010) & 0.837 ($\pm$ .003) & 0.429 ($\pm$ .010) & \underline{0.739} ($\pm$ .010) & \underline{0.827} ($\pm$ .009) \\
TAP-S-3D & (4, 8, 8) & (12, 224, 224) & 0.868 ($\pm$ .002) & 0.842 ($\pm$ .005) & \textbf{0.440} ($\pm$ .021) & 0.672 ($\pm$ .016) & 0.805 ($\pm$ .019) \\
TAP-B-3D & (4, 8, 8) & (12, 224, 224) & \underline{0.876} ($\pm$ .006) & \underline{0.855} ($\pm$ .007) & 0.420 ($\pm$ .019) & 0.658 ($\pm$ .022) & 0.798 ($\pm$ .027) \\
\midrule
\multicolumn{8}{c}{\textbf{Ablation}} \\
\shortstack[l]{TAP-B-3D \\ (Patch Feat.)} & \raisebox{0.5\height}{(4, 8, 8)} & \raisebox{0.5\height}{(12, 224, 224)} & \raisebox{0.5\height}{0.805 ($\pm .029$)} & \raisebox{0.5\height}{0.855 ($\pm .015$)} & \raisebox{0.5\height}{0.345 ($\pm .003$)} & \raisebox{0.5\height}{0.592 ($\pm .005$)} & \raisebox{0.5\height}{0.714 ($\pm .033$)}\\
\bottomrule
\end{tabular}}
\end{table}

\subsection{Classification}

Table \ref{table:classification-results} summarizes the classification performance of TAP-B-3D compared to other pretrained CT foundation models. TAP-B-3D achieves the best results on two tasks (with one tie) and ranks second or third on the remaining three. Table \ref{table:own_models_classification} reports the classification outcomes across the TAP-CT models, where TAP-S-2.5D attains the top performance on three tasks and demonstrates the overall strongest results, public models included. In contrast to the segmentation experiments, performance does not scale consistently with model size or dimensionality.

\subsubsection{Analysis}

\noindent\textbf{Model comparison:} From Table \ref{table:classification-results}, we observe that SuPReM’s classification performance deteriorates substantially relative to its segmentation performance and compared to the other models. This can be attributed to its pretraining bias toward segmentation tasks, resulting in feature representations that are less suitable for classification. As with the segmentation tasks, VoCo's weaker performance on frozen feature classification can be attributed to the fact that the model is primarily designed to be heavily fine-tuned together with a task-specific decoder. Although VISTA3D remains competitive on LUNA16 and LUNA25, its encoder, pretrained for pointwise segmentation, shows limited effectiveness on scan-level classification tasks (RSNA2022, RSNA2023, and FDGPETCT). In contrast, Curia and TAP-S-2.5D (Table \ref{table:own_models_classification}) are able to extract meaningful signals from frozen features on these tasks. Given that Curia and TAP are the only models trained on private data and ViT/DINOv2 native, either factor could explain the performance difference, and further research is needed to draw conclusions. Moreover, the results in both Table \ref{table:classification-results} and Table \ref{table:own_models_classification} reveal a more fundamental challenge associated with volume-level classification in medical imaging.
\\
\noindent\textbf{Issues with volume-level classification in medical imaging:} Medical image classification often resembles finding a needle in a haystack: subtle, localized perturbations can decisively determine the outcome. For instance, diagnosing the presence of a lung nodule may depend on only a few voxels, and assessing whether the nodule is malignant poses an even greater challenge. Benchmarks such as LUNA16 tend to saturate quickly, as the classification task is typically confined to a predefined crop around the nodule when assessing malignancy. This is further supported by the strong performance of DINOv2 and DINOv3 on these tasks, as their natural-image pretrained features are well suited to modeling them effectively. Furthermore, the clinical relevance of these benchmarks is limited, since they depend on prior nodule detection (and labeling) by a radiologist. 
\\
\indent These challenges are clearly reflected in the results. On RSNA2022, most models struggle to surpass the expected average AP ($\approx 0.33$). For RSNA2023 and FDGPETCT, the ABMIL likewise fails to extract a strong signal from the majority of models. All models exhibit higher variance across runs in classification compared to segmentation tasks. Although TAP-S-2.5D and Curia generally perform well on volume-level tasks, the results among TAP-CT models show no consistent pattern. Increasing model size does not necessarily translate to improved outcomes, as the smaller variants frequently outperform their base counterparts. Furthermore, the difference between TAP-S/B-2D and TAP-S/B-3D remains marginal, suggesting that scaling dimensionality has limited influence in this context. Additionally, Curia’s features appear capable of conveying global information, despite the model having access only to slice-level inputs. A potential limitation of the TAP-S/B-3D models may stem from compression constraints: while their large context benefits segmentation, it becomes challenging to encode global information into a single $\texttt{[CLS]}$ token. This line of reasoning is reinforced by the fact that TAP-S/B-2.5D outperforms its counterparts with both shorter and longer context lengths. This suggests that longer sequences allow more information to be captured within the $\texttt{[CLS]}$ token, but that there is a limit to how much global context can be stored through the DINO objective before the representation quality deteriorates. These observations indicate that future research in CT foundation models should aim to extract global 3D representations while preserving robust and informative local features.

%Another factor at play could be compression limits of the $\texttt{[CLS]}$ tokens as the context size grows. All of this opens up an avenue of research to optimize for both local and global information separately or leveraging the local information to build a global representations.
%In contrast to segmentation, where a patch size of (4, 8, 8) proved more advantageous, classification performance benefits instead from the (1, 16, 16) configuration, suggesting an inverse relationship between the patch size for the two tasks.

\subsection{Ablation: Few Shot Classification}

% \begin{table}[t]
% \caption{Few-Shot Learning results for organ classification on AMOS22. \textbf{Best} results are bolded, while \underline{second-best} results are underlined.}
% \label{table:few_shot}
% \begin{tabular}{|l|ccc|}
% \hline
%  & \multicolumn{3}{c|}{\textbf{AMOS22 Few-Shot}} \\
%  & \multicolumn{3}{c|}{\textbf{F1-scores}} \\ \cline{2-4}
%  & \textbf{k = 1} & \textbf{k = 5} & \textbf{k = 10} \\ \hline
% \textbf{TAP-B-2D} & \underline{0.695} & \textbf{0.881} & \textbf{0.910} \\
% \textbf{Curia}    & \textbf{0.696} & \underline{0.873} & \underline{0.900} \\
% \textbf{CT-FM}    & 0.628 & 0.744 & 0.771 \\
% \textbf{SuPReM}   & 0.617 & 0.837 & 0.871 \\
% \textbf{VISTA3D}  & 0.525 & 0.750 & 0.796 \\
% \textbf{DINOv3-b} & 0.477 & 0.694 & 0.747 \\
% \textbf{DINOv2-b} & 0.387 & 0.584 & 0.632 \\
% \textbf{VoCo}     & 0.338 & 0.475 & 0.512 \\
% \textbf{FMCiB}    & 0.252 & 0.355 & 0.359 \\ \hline
% \end{tabular}
% \end{table}

\begin{figure}[t]
\centering

\begin{minipage}[t]{0.45\linewidth}
    \centering
    \captionof{figure}{Cosine similarity matching between lesion embeddings in an initial and a follow-up scan of the same patient. The first set of slices are ground-truth lesion segmentations across ten slices ($z$); the second set shows the top-$k$ voxel matches between averaged lesion embeddings from the initial scan and all embeddings in the next.}
    \includegraphics[width=\linewidth]{images/qualitative-cvpr-top-down.pdf}
    \label{fig:similarity_matching}
\end{minipage}
\hfill
\begin{minipage}[t]{0.54\linewidth}
    \centering
    \captionof{table}{Few-Shot Learning results for organ classification on AMOS22. \textbf{Best} results are bolded, while \underline{second-best} results are underlined.}
    \label{table:few_shot}
    \begin{tabular}{|l|ccc|}
    \hline
     & \multicolumn{3}{c|}{\textbf{AMOS22 Few-Shot}} \\
     & \multicolumn{3}{c|}{\textbf{F1-scores}} \\ \cline{2-4}
     & \textbf{k = 1} & \textbf{k = 5} & \textbf{k = 10} \\ \hline
    \textbf{TAP-B-2.5D} & \underline{0.695} & \textbf{0.881} & \textbf{0.910} \\
    \textbf{Curia}    & \textbf{0.696} & \underline{0.873} & \underline{0.900} \\
    \textbf{CT-FM}    & 0.628 & 0.744 & 0.771 \\
    \textbf{SuPReM}   & 0.617 & 0.837 & 0.871 \\
    \textbf{VISTA3D}  & 0.525 & 0.750 & 0.796 \\
    \textbf{DINOv3-b} & 0.477 & 0.694 & 0.747 \\
    \textbf{DINOv2-b} & 0.387 & 0.584 & 0.632 \\
    \textbf{VoCo}     & 0.338 & 0.475 & 0.512 \\
    \textbf{FMCiB}    & 0.252 & 0.355 & 0.359 \\ \hline
    \end{tabular}
\end{minipage}

\vspace{-2em}
\end{figure}

We take our leading model on 512-by-512 segmentation and evaluate its patches through few-shot learning (FSL) for organ classification on AMOS22. For each encoder, embeddings are extracted at the natural-coordinate centroids of the segmentation masks across all scans. We first extract all embeddings applying the same sliding window routine described in Section \ref{sec:downstream-tasks-segmentation}. For each centroid, its real-world coordinates are mapped sequentially to voxel space and to the corresponding feature space prior to embedding retrieval. Since centroid locations may fall between discrete embedding positions, the embedding at each centroid is obtained via trilinear interpolation. Evaluation is performed over 1{,}000 episodes. For each episode $E$, we construct a $15$-way $k$-shot classification task by sampling $k$ support embeddings for each class $c \in C$, where $|C| = 15$. Each episode additionally contains 15 query embeddings, randomly sampled from the validation query set and evaluated against the episode-specific support set. Support embeddings are sampled from the training split, while query embeddings are sampled from the validation split. Splits are identical to those used in the AMOS22 segmentation experiments.
\\
\indent Table \ref{table:few_shot} demonstrates the results of the few-shot experiment. TAP-B-2.5D and Curia outperform all other pretrained foundation models by a significant margin. It shows the potential of these FMs to handle tasks with only a few examples, and facilitate embedding matching and potentially longitudinal tracking, an ability that current supervised (segmentation) models struggle with \cite{rocholl2025unstable}.


\subsection{Qualitative: Patch Retrieval}

% \begin{figure}
%     \centering
%     \includegraphics[width=1\linewidth]{images/qualitative-cvpr-horizontal.pdf}
%     \caption{Cosine similarity matching between lesion embeddings in an initial and a follow-up scan of the same patient. The first set of slices are ground-truth lesion segmentations across ten slices ($z$); the second set shows the top-$k$ voxel matches between averaged lesion embeddings from the initial scan and all embeddings in the next.\vspace{-1em}}
%     \label{fig:similarity_matching}
% \end{figure}

Tracking disease progression, such as changes in tumor volume over time, represents a potentially critical clinical application of pretrained models. This approach involves extracting a model’s feature representation of a lesion from an initial scan and identifying the most closely related embeddings in subsequent scans. Ideally, features most relevant to the disease are consistently matched across follow-up scans. Figure \ref{fig:similarity_matching} demonstrates that TAP-B-3D’s features could be sufficiently descriptive to locate a lesion from an initial scan in a follow-up scan of the same patient. It indicates that fine-grained information is indeed present in the embeddings; however, a quantitative analysis would be needed to draw conclusions.

\subsection{Why not use patch features for classification?}
The frozen patch features of the TAP-CT models clearly encode rich semantic information, enabling effective segmentation with a simple decoder. In principle, the same information could be leveraged for classification using an attention-based mechanism such as ABMIL. However, it reflects the “needle-in-a-haystack” challenge: a single 300-slice CT scan processed by TAP-B-3D yields over 58,000 patches, making it difficult to isolate the few that are diagnostically relevant. Table \ref{table:own_models_classification} indeed demonstrates that scan-level classification deteriorates drastically when the ABMIL is applied to the frozen patch features, which indicates that global information retrieval is even more challenging from individual patches.

\section{Conclusion}
\label{sec:conclusion_discussion}
We introduced TAP-CT, a suite of 6 foundation models for CT imaging pretrained on 105K volumes through a novel 3D adaptation of the DINOv2 framework. This adaptation introduces a GPU-accelerated volumetric random resized crop and a 3D random masking strategy for DINO pretraining, alongside modifications of the patch embedding and positional encoding of a ViT. The models of TAP-CT achieve state-of-the-art performance on segmentation and competitive results on classification tasks. Therefore, self-supervised learning in CT should continue to focus on task-agnostic pretraining to develop well-rounded 3D vision encoders. 

\subsection{Future work}

Further progress is needed to capture global information from volumetric medical data more effectively. Moreover, the requirement of multiple crops per sample in DINOv2 results in slow training and significant resource demands when scaling to volumetric data. This results in an overall high pretraining cost; exact GPU hours are reported in Appendix~\ref{app:gpu_hours}. Preferably, the cost of pretraining a foundation model is a one-time amortized investment. However, the fact that linear probing of frozen TAP-CT encoders still lags significantly behind supervised approaches for medical image segmentation \cite{isensee2021nnu, isensee2024nnu} indicates that substantial progress remains to be made in foundation models for CT imaging. While the training cost of individual end-to-end segmentation models is relatively low, this paradigm is limited by three factors: annotation scarcity in the medical domain, the large number of distinct pathological conditions for which separate models would need to be trained, and the regulatory assessment required for each individual model. Foundation models offer a potential solution to all three challenges, but significant improvements are still necessary.
\\
\indent These observations suggest an interesting direction for future work: developing less compute-intensive pretraining strategies that emphasize learning purely robust local features, followed by approaches to derive global representations from them effectively.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{We thank the Research High Performance Computing (RHPC) group at the Netherlands Cancer Institute, specifically Ameer Alkhier and Daniel Vis, for maintaining the GPU servers. Moreover, we would like to thank the datadesk radiology, Artem Khmelinskii and Joost van Griethuysen, for downloading all CT scans from PACS.}


\bibliography{midl26_59}

\newpage
\appendix

\section{Pretraining Specifics}
\label{app:pretraining}

\subsection{Hyperparameters}

A list of the hyperparameters used during the pretraining of TAP-B-3D is provided in Table \ref{tab:dinov-hyperparam}. The global and local crop scales refer to the relative scale of the sampled height–width area, and a fixed number of slices along the $z$-axis is extracted by sampling around the corners of this area in the depth dimension. The pretraining hyperparameters for the ViT-S and ViT-B variants follow the same design, except that ViT-S uses a lower drop-path rate (0.1) to account for its smaller model size.

\subsection{GPU Hours}
\label{app:gpu_hours}
Table \ref{tab:consumption} reports the GPU hours and estimated energy consumption for pretraining the TAP-CT models. As expected, models with larger context sizes require substantially longer training times; for example, TAP-B-3D takes approximately 26 times longer to train than its 2D counterpart. Scaling in medical imaging remains a significant challenge. While training on entire volumes would be ideal, this is unlikely to be feasible in the near future. Therefore, developing methods that reduce computational requirements while maintaining high performance is essential.

\begin{table}[]
\centering
\begin{tabular}{|l|ll|}
\hline
\textbf{Model}      & \textbf{GPU hours} & \textbf{Consumption (MWh)} \\ \hline
\textbf{TAP-S-2D}   & 96                 & 0.067                            \\
\textbf{TAP-B-2D}   & 136                & 0.095                            \\
\textbf{TAP-S-2.5D} & 384                & 0.269                            \\
\textbf{TAP-B-2.5D} & 864                & 0.605                            \\
\textbf{TAP-S-3D}   & 1,152              & 0.806                            \\
\textbf{TAP-B-3D}   & 3,648              & 2.554                            \\ \hline
\textbf{Total}      & 6,280              & 4.396                            \\ \hline
\end{tabular}
\caption{Overview of the total GPU hours required to train the TAP-CT models. One GPU hour corresponds to one hour of computation on an H100 SXM GPU with an approximate power draw of 700\,W.}
\label{tab:consumption}
\end{table}

\begin{table*}
\centering
\caption{The hyperparameter configuration used for 3D DINOv2 pretraining of the model TAP-B-3D.}
\label{tab:dinov-hyperparam}
\resizebox{\textwidth}{!}{\begin{tabular}{|llll|}
\hline
\multicolumn{4}{|c|}{\textbf{TAP-B-3D DINOv2 Hyperparameters}}\\
\hline
\textbf{Iterations}                       & \multicolumn{1}{l|}{125,000}      & \textbf{Scale Global Crops (Min, Max)} & (0.32, 1.0)       \\
\textbf{DINO Loss Weight}                 & \multicolumn{1}{l|}{1.0}          & \textbf{Scale Local Crops (Min, Max)}  & (0.05, 0.32)      \\
\textbf{iBOT Loss Weight}                 & \multicolumn{1}{l|}{1.0}          & \textbf{Tied Head Weights}             & No                \\
\textbf{KoLeo Loss Weight}                & \multicolumn{1}{l|}{0.1}          & \textbf{Head Prototypes}               & 65536             \\
\textbf{Batch Size (Total)}               & \multicolumn{1}{l|}{2048}         & \textbf{Head Hidden Dim}               & 2048              \\
\textbf{Drop Path Rate}                   & \multicolumn{1}{l|}{0.2}          & \textbf{Head Layers}                   & 3                 \\
\textbf{Layerscale}                       & \multicolumn{1}{l|}{1e-5}         & \textbf{Head Bottleneck Dim}           & 256               \\
\textbf{Base Learning Rate}               & \multicolumn{1}{l|}{0.0035}       & \textbf{Mask Probability}              & 0.5               \\
\textbf{Weight Decay (Start, End)}        & \multicolumn{1}{l|}{(0.04, 0.4)}  & \textbf{Mask Ratio (Min, Max)}         & (0.1, 0.5)        \\
\textbf{Teacher Momentum (Start, End)}    & \multicolumn{1}{l|}{(0.992, 1.0)} & \textbf{Centering}                     & Centering (No SK) \\
\textbf{Teacher Temperature (Start, End)} & \multicolumn{1}{l|}{(0.04, 0.07)} & \textbf{ViT FeedForward Layer}         & MLP               \\
\textbf{Temperature Warmup Iterations}    & \multicolumn{1}{l|}{37,500}       & \textbf{ViT Register Tokens}           & 4                 \\
\textbf{Gradient Clipping}                & \multicolumn{1}{l|}{3.0}          & \textbf{Layerwise Decay}               & 0.9               \\ \hline
\end{tabular}}
\end{table*}

\section{Dataset Specifics}
\label{app:dataset_specs}

\begin{figure}
    \centering
    \includegraphics[width=1.0\linewidth]{images/dataset_image.pdf}
    \caption{Distribution of pretraining data based on manufacturer, gender and age.}
    \label{fig:dataset_img}
\end{figure}

In Figure \ref{fig:dataset_img}, we visualize the distributions of scanner manufacturers, patient age groups, and patient sex. Because the dataset consists exclusively of individuals undergoing CT imaging for oncological assessment and tracking, the age distribution is naturally shifted toward older populations.
\\
\indent In Section \ref{sec:methodology}, we note that foreground voxels are clipped and normalized using dataset-wide statistics. Specifically, we use: $\mu = -86.8086$, $\sigma = 322.6347$, $\text{clip}_{\text{min}} = -1008.0$ and $\text{clip}_{\text{max}} = 822.0$.

\section{Downstream Task Specifics}
\label{app:downstreams}

\begin{table}[t]
\centering
\renewcommand{\arraystretch}{1.1}
\caption{Overview of datasets used for segmentation and classification tasks.}
\label{tab:dataset_summary}
\begin{tabular}{llcccll}
\toprule
 & \textbf{Dataset} & \textbf{Train} & \textbf{Test} & \textbf{Classes} & \textbf{Body Part} & \textbf{Label Type} \\
\midrule
\multirow{5}{*}{\rotatebox{90}{\textbf{Segmentation}}}
 & AMOS22                  & 200  & 100 & 16 & Abdomen & Semantic Seg. \\
 & LiTS17                  & 104  & 27  & 3 & Liver & Semantic Seg.\\
 & KiTS23                  & 390  & 97  & 4 & Kidneys & Semantic Seg.\\
 & TotalSegmentator        & 892  & 200 & 49 & Full body & Semantic Seg. \\
 & MSD-Pancreas   & 224  & 57  & 3 & Pancreas & Semantic Seg.\\
\midrule
\multirow{5}{*}{\rotatebox{90}{\textbf{Classification}}}
 & LUNA16       & 539  & 138  & 2 & Lungs (Crop) & Binary\\
 & LUNA25       & 4929 & 1234 & 2 & Lungs (Crop) & Binary\\
 & FDG-PET-CT   & 804  & 210  & 2 & Full body & Binary\\
 & RSNA2022     & 767  & 192  & 7 & Thorax & Multilabel Binary\\
 & RSNA2023     & 705  & 177  & 3 & Abdomen & Multilabel Binary\\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[h!]
    \centering
    \begin{tabular}{|c|c|}
        \hline
        \multicolumn{2}{|c|}{\textbf{TotalSegmentator Classes}}\\
        \hline
        Background & Iliac Artery \\
        \hline
        Lungs & Iliac Vein \\
        \hline
        Kidneys & Humerus \\
        \hline
        Ribs & Scapula \\
        \hline
        Vertebrae & Clavicula \\
        \hline
        Spleen & Femur \\
        \hline
        Gallbladder & Hips \\
        \hline
        Liver & Spinal Cord \\
        \hline
        Stomach & Gluteus Maximus \\
        \hline
        Pancreas & Gluteus Medius \\
        \hline
        Adrenal Glands & Gluteus Minimus \\
        \hline
        Esophagus & Autochthon \\
        \hline
        Trachea & Iliopsoas \\
        \hline
        Thyroid Glands & Brain \\
        \hline
        Small Bowel & Skull \\
        \hline
        Duodenum & Sternum \\
        \hline
        Colon & Costal Cartilages \\
        \hline
        Urinary Bladder & Heart \\
        \hline
        Prostate & Aorta \\
        \hline
        Kidney Cysts & Pulmonary Vein \\
        \hline
        Sacrum & Brachiocephalic Trunk \\
        \hline
        Superior Vena Cava & Subclavian Artery \\
        \hline
        Inferior Vena Cava & Common Carotid Artery \\
        \hline
        Portal Vein and Splenic Vein & Brachiocephalic Vein \\
        \hline
        Atrial Appendage & \\
        \hline
    \end{tabular}
    \caption{This table displays the individual classes of \textsc{TotalSegmentator} used in the downstream task of this work. Originally there were 117 separate classes; these were merged into 49 classes, e.g., by combining the different lobes of the lungs into a single lungs class.}
    \label{tab:totalsegclasses}
\end{table}

\subsection{Batch size}
    Due to the variability in CT volume sizes, downstream tasks are trained with a batch size of 1 unless stated otherwise. To stabilize optimization, we employ 4 gradient accumulation steps for segmentation tasks and 16 steps for classification tasks. For LUNA16 and LUNA25, the batch size is instead set to 64, because the input crops correspond to a fixed physical extent in millimeters around each lesion and therefore share the same size.

\subsection{Resampling}
Each CT volume is resampled according to the preferred spacing of each publicly available model which can be found in Table \ref{table:model_sizes}. For our models, volumes are either resized to ($z$, 224, 224) or ($z$, 512, 512) in image space since we do not resample in world coordinates during pretraining. 

\subsection{Merging TotalSegmentator}

The TotalSegmentator v2 dataset comprises 117 anatomically distinct structures. Many of these labels correspond to fine-grained subdivisions of larger anatomical entities, such as individual bones that collectively form a unified structure. While such granularity is valuable for detailed modeling, the distinction between specific vertebrae or ribs may be unnecessary for certain downstream tasks. In these cases, anatomically related subclasses can be merged into a single category without compromising the overall structural fidelity; these classes are found in Table \ref{tab:totalsegclasses}.

\subsection{Omitting Classes RSNA2023}

RSNA2023 includes bowel injury and extravasation as target abnormalities. However, the challenge organizers note that reliably identifying these findings is extremely difficult for radiologists without access to longitudinal follow-up imaging. Although the dataset provides coordinate annotations for a subset of scans, we omit these classes in our evaluation, as accurate volume-level classification is already highly challenging.

\subsection{Dataset Summary and Dataset Splits}

An overview of the different datasets can be found in Table~\ref{tab:dataset_summary}. For LUNA16 and LUNA25, 50mm$^3$ crops were taken around the annotated nodules as outlined in Section~\ref{sec:downstream-tasks-classification-method}. The dataset splits will be made available alongside the code upon acceptance.

\subsection{Qualitative visualization of segmentation performance}

We display several segmentation maps from the AMOS and TotalSegmentator downstream tasks. The first image (Figure~\ref{fig:segmentations_models_abdomen}) contains the segmentation masks of the pretrained encoders discussed in this work for three abdominal slices from the same CT scan. The second image (Figure~\ref{fig:segmentations_models_thorax}) and the third image (Figure~\ref{fig:segmentations_models_head}) show segmentations for the thorax and head, respectively.

\section{Smaller local crops}
\label{app:ablation_local_crop}

% \midrule
% \multicolumn{8}{c}{\textbf{Ablations}} \\
% \shortstack[l]{ViT-S-3D \\ (Local Crops)} & \raisebox{0.5\height}{(1, 16, 16)} & \raisebox{0.5\height}{(12, 224, 224)} & \raisebox{0.5\height}{0.482 ($\pm$ .001)} & \raisebox{0.5\height}{0.504 ($\pm .001$)} & \raisebox{0.5\height}{0.381 ($\pm$ .006)} & \raisebox{0.5\height}{0.427 ($\pm$ .003)} & \raisebox{0.5\height}{0.278 ($\pm$ 0.002)} \\

% \shortstack[l]{ViT-S-3D \\ (Local Crops)} & \raisebox{0.5\height}{(1, 16, 16)} & \raisebox{0.5\height}{(12, 224, 224)} & \raisebox{0.5\height}{0.817 ($\pm$ .018)} & \raisebox{0.5\height}{0.809 ($\pm .003$)} & \raisebox{0.5\height}{0.398 ($\pm$ .009)} & \raisebox{0.5\height}{0.704 ($\pm$ .011)} & \raisebox{0.5\height}{0.820 ($\pm$ .006)} \\

\begin{table}[ht]
\centering
\caption{ViT-S-3D (Local Crops) performance across segmentation and classification tasks}
\label{tab:vits_crops}
\resizebox{\textwidth}{!}{\begin{tabular}{l c c c c c c c}
\toprule
\textbf{Model} & \textbf{Patch Size} & \textbf{Image Size} &
\textbf{AMOS22} & \textbf{LiTS17} & \textbf{KiTS23} & \textbf{TotalSeg.} & \textbf{MSD Pancreas} \\
\multicolumn{3}{c}{} & (DSC) & (DSC) & (DSC) & (DSC) & (DSC) \\
\midrule
\multirow{4}{*}{ViT-S-3D (Local Crops)} &
\multirow{4}{*}{(1, 16, 16)} &
\multirow{4}{*}{(12, 224, 224)} &
0.482 ($\pm$ .001) & 0.504 ($\pm$ .001) & 0.381 ($\pm$ .006) & 0.427 ($\pm$ .003) & 0.278 ($\pm$ .002) \\
\cmidrule(lr){4-8}
& & &
\textbf{LUNA16} & \textbf{LUNA25} & \textbf{RSNA2022} & \textbf{RSNA2023} & \textbf{FDGPETCT} \\
\multicolumn{3}{c}{} & (AUC) & (AUC) & (AP) & (AUC) & (AUC) \\
& & &
0.817 ($\pm$ .018) & 0.809 ($\pm$ .003) & 0.398 ($\pm$ .009) & 0.704 ($\pm$ .011) & 0.820 ($\pm$ .006) \\
\bottomrule
\end{tabular}}
\end{table}

\noindent One of the primary factors influencing local-to-global correspondence in the DINOv2 framework is the relative dimensionality of local and global crops. In this work, the depths of the local and global crops are identical, which may bias the model toward the iBOT objective, as this configuration simplifies optimization of the DINO objective. Restricting local crops to a smaller number of slices therefore provides a straightforward way to encourage optimization toward the DINO objective. For this ablation, we train a 3D ViT-S with a patch size of (1, 16, 16), local crops of (6, 96, 96), and global crops of (12, 224, 224). The results for this model are reported in Table~\ref{tab:vits_crops}. Two observations can be made: (1) classification performance does not improve substantially relative to the other models, and (2) segmentation quality decreases significantly. The ablation model is outperformed by its 2D counterpart, despite having access to twelve times the context size. This reinforces the view that extracting 3D global representations from CT scans remains an open challenge and warrants further investigation in future research.


\begin{figure*}
        \centering
        \includegraphics[width=0.45\linewidth]{images/segmentations.pdf}
        \caption{Segmentations across three abdominal slices from the AMOS22 validation sample (amos\_286) for TAP-B-3D and other publicly available pretrained FMs. Each segmentation is produced using a linear convolutional layer fine-tuned on top of the frozen features of the respective pretrained encoder.}
        \label{fig:segmentations_models_abdomen}
\end{figure*}

\begin{figure*}
        \centering
        \includegraphics[width=0.45\linewidth]{images/totalseg_segmentations_thorax (1).pdf}
        \caption{Segmentations across three thorax slices from the TotalSegmentator validation sample (s0223) for TAP-B-3D and other publicly available pretrained FMs. Each segmentation is produced using a linear convolutional layer fine-tuned on top of the frozen features of the respective pretrained encoder.}
        \label{fig:segmentations_models_thorax}
\end{figure*}

\begin{figure*}
        \centering
        \includegraphics[width=0.45\linewidth]{images/totalseg_segmentations_head (2).pdf}
        \caption{Segmentations across three head slices from the TotalSegmentator validation sample (s0224) for TAP-B-3D and other publicly available pretrained FMs. Each segmentation is produced using a linear convolutional layer fine-tuned on top of the frozen features of the respective pretrained encoder.}
        \label{fig:segmentations_models_head}
\end{figure*}

\end{document}
