\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{multicol}
\usepackage{multirow}
\usepackage{graphicx}
\usepackage{xcolor,colortbl,xspace}
\usepackage{comment}
%\usepackage{mwe} % to get dummy images
\usepackage{chngcntr}





\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2024}


\newcommand{\bumpup}{\vspace*{-2.0ex}}
\title[Short Title]{Self-supervised pretraining in the wild imparts image acquisition robustness to medical image transformers: an application to lung cancer segmentation}


 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Jue Jiang\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{jiangj1@mskcc.org}\\
	%\addr $^{1}$ Address 1 \\
	%\addr $^{2}$ Address 2 \AND
	\Name{Harini Veeraraghavan\midlotherjointauthor\nametag{$^{1}$}} \Email{veerarah@mskcc.org}\\
	%\addr $^{1}$ Address 1 \\
	%\Name{Author Name3\nametag{$^{2}$}} \Email{alphabeta@example.edu}\\
	%\Name{Author Name4\midljointauthortext{Contributed equally}\nametag{$^{3}$}} \Email{uvw@foo.ac.uk}\\
	%\addr $^{3}$ Address 3 \AND
	%\Name{Author Name5\midlotherjointauthor\nametag{$^{4}$}} \Email{fgh@bar.com}\\
	%\addr $^{4}$ Address 4
}

%\usepackage{hyperref}


\usepackage{booktabs}

\usepackage[labelsep=period]{caption}
\captionsetup{font=small}
\captionsetup[table]{aboveskip=0pt}
\captionsetup[table]{belowskip=0pt}
\captionsetup[figure]{aboveskip=2pt}
\captionsetup[figure]{belowskip=0pt}
\usepackage{bm}

\begin{document}
	
	\maketitle
	
	
	\begin{abstract}
		Self-supervised learning (SSL) is an approach to pretrain models with unlabeled datasets and extract useful feature representations such that these models can be easily fine-tuned for various downstream tasks. Self-pretraining applies SSL on curated task-specific datasets without using task-specific labels. Increasing availability of public data repositories has now made it possible to utilize diverse and large, task unrelated datasets to pretrain models in the ``wild'' using SSL. However, the benefit of such wild-pretraining over self-pretraining has not been studied in the context of medical image analysis. Hence, we analyzed transformers (Swin and ViT) and a convolutional neural network created using wild- and self-pretraining trained to segment lung tumors from 3D-computed tomography (CT) scans in terms of: (a) accuracy, (b) fine-tuning epoch efficiency, and (c) robustness to image acquisition differences (contrast versus non-contrast, slice thickness, and image reconstruction kernels). We also studied feature reuse using centered kernel alignment (CKA) with the Swin networks. Our analysis with two independent testing (public N = 139; internal N = 196) datasets showed that wild-pretrained Swin models significantly outperformed self-pretrained Swin for the various imaging acquisitions. Fine-tuning epoch efficiency was higher for both wild-pretrained Swin and ViT models compared to their self-pretrained counterparts. Feature reuse close to the final encoder layers was lower than in the early layers for wild-pretrained models irrespective of the pretext tasks used in SSL. Models and code are available at \url{https://github.com/The-Veeraraghavan-Lab/CTRobust_Transformers.git}.
		
	\end{abstract}
	
	\begin{keywords}
		Lung tumor segmentation, self-supervised learning, wild and self-pretraining, robustness to imaging differences.
	\end{keywords}
	

\section{Introduction}


Self-supervised learning (SSL) is an approach to extract useful feature representations from unlabeled images by minimizing a supervised objective through pretext tasks such as jigsaw puzzles\cite{zhu2020rubik}, contrastive losses\cite{taleb20203d}, image reconstruction\cite{zhou2023unified_PRCLV2}, and masked image prediction\cite{jiang2022self_SMIT}. Hence, SSL pretraining followed by fine-tuning on modest sized labeled datasets has demonstrated capability to achieve highly accurate segmentation in both medical and natural image analysis tasks\cite{NguyenAAAI2023,jiang2022self_SMIT,tang2022self,YanWACV2023,zhou2021models}. Prior works have shown that sequential SSL pretraining on natural images followed by pretraining on curated medical datasets improved 2D medical image analysis accuracy\cite{Taher2021} and that transformers benefit more from SSL than convolutional neural networks (CNN) for image classification tasks \cite{Taher2021}. 
\\
More frequently, medical image applications use self-pretraining, an approach wherein SSL pretraining is applied on the same curated task dataset that is subsequently used for supervised fine-tuning. In contrast, SSL pretraining in the ``wild'' with large and diverse datasets that are uncurated and unrelated to the task has been shown to be an effective approach for natural image analysis\cite{Matsoukas2022}. The rationale for wild-pretraining is to leverage the imaging variations inherent in large and diverse sets of images to extract universally applicable feature representations for the downstream tasks. However, supervised pretraining using ImageNet has shown only variable feature reuse depending on the distance of the medical image domain from natural images\cite{RaghuNIPS2019}.
\\
In this context, the benefits of wild-pretraining performed on medical images over self-pretraining with curated, task-specific datasets have not been studied. Hence, we studied the impact of wild-pretraining on a relatively large and uncurated 3D medical dataset ($>$ 10,000 CTs) vs. self-pretraining with a curated 3D CT dataset for segmenting lung tumors along with evaluation of robustness to imaging variations. We chose volumetric lung tumor segmentation because SSL pretext tasks focus on learning universal feature representations, which are likely to capture common elements like organs but not tumors. Hence, our chosen application of tumor segmentation allows us to study whether wild-pretraining from images encompassing wider variations benefits over self-pretraining on the task dataset.
\\
Our contributions include: (a) Comparative analysis of SSL-based wild-pretraining and self-pretraining applied to three common architectures, a vision transformer (ViT), hierarchical shifted window transformer (Swin), and a Unet-based convolutional network for lung tumor segmentation, (b) analysis of robustness to CT acquisition differences due to the two SSL pretraining for the individual architectures, (c) evaluation of pretext tasks on the feature reuse with SSL. Understanding the relative merits of the SSL approach for commonly used networks could inform the development of pretrained models.



\bumpup
\section{Datasets}
Analyzed datasets with imaging acquisitions, and disease details are in Table~\ref{tab:dataset}. 
\\
\textbf{Wild-pretraining: \/}\rm A total of 10,412 3D CT scans covering head to pelvis sourced from datasets provided publicly for a variety of tasks including lesion detection~\cite{xiao2023lesion}, classification~\cite{harmon2020artificial}, and multi-organ and abdominal tumor segmentations were used without additional curation for pretraining. Retrospectively collected and anonymized institutional datasets were used as is from patients with lung, esophageal (Internal 1) and head and neck (Internal 2) cancers treated with radiotherapy (RT). 
\\
\textbf{Task dataset for self-pretraining and fine-tuning: \/}\rm A publicly available dataset of patients with locally advanced non-small cell lung cancer (LA-NSCLC) scanned with contrast and non-contrast CTs, smooth reconstruction kernels ($\leq$ B30), 3 mm slices, and provided with tumor contours was analyzed~\cite{aerts2015data}. Tumor sizes ranged with a median of 33.68 cc and interquartile range (IQR) of 8.29 cc to 90.31 cc. A random set of 316 CTs were used in self-pretraining without tumor labels.
\\
\textbf{Testing: \/}\rm Two independent datasets totalling 335 CTs, consisting of a public dataset of patients with early stage (stage I-II) NSCLC~\cite{bakr2018radiogenomic} (median 7.91 cc, IQR of 3.60 cc to 28.23 cc) and institutional dataset of patients with stage (II-IV) NSCLC (median 19.54 cc, IQR of 6.64 cc to 66.87 cc) were evaluated. The LRad dataset used contrast and non-contrast CTs, a range of slice thicknesses, and a wide variety of image reconstruction kernels, which were categorized as smooth, medium, and sharp kernels for meaningful analysis of robustness. The institutional LC dataset was homogeneous in terms of CT acquisition (convolutional kernel used GE lung reconstruction and a slice thickness of 5mm). In addition, a subset of 20 patients reconstructed with both sharp (GE lung) and smooth (GE standard) kernel as well as with 2.5mm and 5mm slices were used for paired comparison of accuracy differences that controlled for tumor and patient anatomy differences. 


\begin{table}[ht]
	\centering
	\caption{Datasets summary. Smooth kernels: GE ``standard'' and ``bone'', Siemens $<$ B40; Medium: GE ``Bone Plus'', Siemens $\geq$ B40 and $<$ B50; Sharp: GE ``Lung'', Siemens $\geq$ B50. NA: not available, indicated when not provided for a dataset.}
	\label{tab:dataset}
	\resizebox{1.0\textwidth}{!}{
		\large
		\begin{tabular}{*{7}{c}}
			\toprule
			Data & Location & Number &  Manufacturer & Thickness & Kernel & Contrast \\
			\midrule
			\textbf{Pretraining} & & & & & & \\
			MELA 2022\cite{xiao2023lesion} & Chest & 880 & NA & 1 mm & NA & contrast, non-contrast\\
			AMOS 2022\cite{ji2022amos} & Chest-Abd-pelvis & 360  & NA & 5mm to 7.5 mm & NA & contrast, non-contrast \\
			COVID-19\cite{harmon2020artificial} & Chest & 609 & NA & 5 mm & NA & contrast, non-contrast \\ 
			KITS (\cite{C4KC-KiTS}) & Abdomen-pelvis & 411 & Siemens, Toshiba & 3 mm & smooth & arterial, late, non-contrast\\ 
			Pancreas CT\cite{roth2015deeporgan} & Chest-Abdomen & 80 & NA & 1mm to 5mm & NA & contrast, non-contrast  \\ 
			Internal 1 Radiotherapy & Chest & 5,124 & GE & 3 mm to 5 mm & smooth, sharp & contrast, non-contrast \\
			Internal 2 Radiotherapy & Head and neck & 2,632 & GE & 2.5 mm to 3mm & smooth & contrast, non-contrast\\
			\midrule
			\textbf{Fine-tuning/Self-pretraining} & & & & & & \\
			TCIA NSCLC\cite{aerts2015data} & Chest-abdomen & 350  & Siemens, CMS & 3 mm & Smooth & contrast, non-contrast \\ 
			\midrule
			\textbf{Testing} & & & & & & \\
			LRad\cite{bakr2018radiogenomic} & Chest & 139 & Siemens, Toshiba, GE & 0.9 mm to 5mm & smooth, medium, sharp & contrast, non-contrast \\
			LC & Chest-abdomen & 196& GE & 1.25 mm to 5mm & smooth, sharp & contrast, non-contrast \\ 
			
			\bottomrule
		\end{tabular}
	}
	\vspace{-1em}
\end{table}


\bumpup
\section{Problem formulation and methodology}
\bumpup
The aim of this work is to  understand the benefits of using SSL-based wild-pretraining with large, diverse and uncurated medical images compared to SSL-based self-pretraining with curated, in-domain task-datasets for segmenting lung tumors from CT scans. 
Specifically, we analyzed under what conditions wild-pretraining improves over self-pretraining by studying three common networks with varying inductive bias and CT imaging differences. 
ViT~\cite{dosovitskiy2021}, which removes image specific inductive biases including locality and translational equivariance has the least inductive bias. Swin~\cite{liu2021swin} adds back hierarchical scale and windowed attention to improve inductive bias. A CNN model such as non-skip Unet (nsUnet)~\cite{zhou2023unified_PRCLV2} has the highest inductive bias.
\\
Feature reuse has been attributed to success in transfer learning applied to supervised pretraining with ImageNet\cite{Matsoukas2022,RaghuNIPS2019}. We studied how feature reuse is impacted by the SSL strategy as well as the pretext task used for SSL. 

\bumpup

%\bumpup
\subsection{Network architectures and SSL}
Transformer encoders connected to a 3D Unet decoder, shown to be effective for multi-organ\cite{hatamizadeh2022unetr,tang2022self,jiang2021unpaired} and brain tumor\cite{hatamizadehSwinUNETR2022} segmentation, were studied. The 3D Swin encoder used a Swin-small backbone, which used a depth of [2,2,8,2] and [4,4,8,16] multi-head for each transformer depth, and a feature embedding size of 384. This setup also included a window size of 4 $\times$ 4 $\times$ 4 and patch size of 2 $\times$ 2 $\times$ 2. The ViT encoder comprised 12 transformer blocks, 768 embedding features, 8 multi-head self attention blocks, and a patch size of 8. A non-skip Unet (nsUnet) was used as the CNN network due to its higher accuracy over Unet for medical image segmentation\cite{zhou2023unified_PRCLV2}. The nsUnet had 4 downsampling and corresponding number of upsampling layers. All three networks used an input image of 128 $\times$ 128 $\times$ 128 voxels. The total number of parameters for ViT and Swin were 46,405,874 and 64,698,114, respectively. nsUnet had 17,111,499 parameters. 


\section{SSL pretext tasks}
Wild- and self-pretraining was performed with an identical set of 5 pretext tasks that consisted of  contrastive pretraining~\cite{chen2020simple}, masked image prediction (MIP), two self-distillation learning tasks called masked patch token distillation (MPD) and image token distillation (ITD), as well as a combination of MIP, MPD, and ITD tasks as used in the SMIT method~\cite{jiang2022self_SMIT}. We also studied the impact of sequential SSL by using wild-pretraining followed by self-pretraining using SMIT.
\\
The contrastive pretext task minimized the cosine distance between feature embeddings from positive pairs (augmented 3D views from same patient) while maximizing the same distance between negative pairs (augmented 3D image views from different patients). ITD and MPD tasks are implemented through self-distillation performed between a student and an exponential moving average teacher network\cite{jiang2022self_SMIT}, whereby the two networks were presented with different 3D augmented views of input images. The view provided to the student was randomly masked using a default masking ratio of 0.75~\cite{jiang2022self_SMIT}. MIP forces the student to correctly predict the image regions underlying the masked image patches by utilizing the visual context from the visible image patches. 
%The teacher network was discarded following SSL. In order to perform self-distillation, the student and teacher networks are presented with different 3D augmented views of the input images. The view presented to the student network was randomly masked to force it to utilize the visual context from the visible image patches and extract appropriate feature representations.
MIP was implemented by minimizing the L1-norm between predicted and unmasked image. MPD and ITD were implemented using cross-entropy losses with temperature scaling to match the patch feature token and image token distributions, respectively.



\bumpup
\subsection{Implementation details}
All images were intensity clipped [-500 HU to 500 HU], normalized to [0,1] and resampled to a uniform voxel size of 2mm $\times$ 2mm $\times$ 2mm and then randomly cropped to 128 $\times$ 128 $\times$ 128 voxels to generate the 3D views. Images were resampled to a uniform voxel size of 1.5 mm $\times$ 1.5 mm $\times$ 2.0 mm voxels for fine-tuning and testing. The networks were optimized using AdamW~\cite{Loshchilov2017DecoupledWD} with cosine learning rate scheduling~\cite{Loshchilov2016SGDRSG}, and trained for 500 epochs with an initial learning rate of $8\times10^{-4}$ and warmup for 50 epochs. Self-pretraining mitigated the issue of learning from fewer examples compared to wild-pretraining by using online data augmentation and training for 2,000 epochs with a warmup for 200 epochs. A path drop rate of 0.1 was applied to the student model, and all SSL tasks were conducted on 4 NVIDIA A100 GPUs ($4 \times$ 80GB memory) using a batch size of 32 for Swin, 32 for nsUnet, and 8 for ViT. 
\\
Fine-tuning was performed on 4 NVIDIA A100 GPUs. All analyzed networks were trained with a learning rate of $2\times10^{-4}$ for 1,000 epochs. Swin and nsUnet models were fine-tuned with a batch size of 24 and the ViT models used a batch size of 4 due to memory limitations. Early stopping was used to select the model with the highest accuracy on the validation set. 

\bumpup
\subsection{Experiments and evaluation metrics}
Tumor segmentations were compared against manual delineations using the Dice similarity coefficient (DSC). Fine-tuning epoch efficiency was measured as the relative difference in the number of epochs at which fine-tuning was stopped with respect to the number of epochs required for training the model from scratch and expressed as a percentage.
Statistical comparisons measured tumor segmentation accuracy differences between fine-tuned models produced with wild-pretrained and self-pretrained models for individual networks (e.g. ViT wild-pretrained vs. ViT self-pretrained; Swin wild-pretrained vs. Swin self-pretrained) using paired, two-sided, Wilcoxon signed rank tests at the 95\% confidence level. Feature reuse from wild-pretrained to fine-tuned as well as self-pretrained to fine-tuned models was measured using centered kernel alignment (CKA) as detailed in Supplementary section~\ref{sup_sec:technical_details_CKA}.
%\section{Training}
%\bumpup
\bumpup

\section{Results}
%\bumpup
\subsection{Segmentation accuracy}
As shown in Table.~\ref{tab:tumor_seg_acc}, wild-pretrained ViT and Swin models were more accurate than their self-pretrained counterparts on both datasets. Wild- and self-pretrained CNN models were similarly accurate. Further analysis showed that wild-pretraining reduced dependency of tumor segmentation accuracy to volume compared to other training strategies for all three networks. Transformer methods showed smaller dependency of accuracy to tumor volume (Swin $R^{2}$ ranged from 0.11 for wild-pretrained to 0.14 for scratch trained; ViT $R^{2}$ ranged from 0.11 for wild-pretrained to 0.18 for scratch trained) when compared to CNN ($R^{2}$ of 0.37 for wild-pretraining to 0.41 for scratch training) as shown in Supplementary Figure.~\ref{fig:acc_vs_vol}. Example segmentations produced by the wild- and self-pretrained Swin following fine-tuning are shown in Figure.~\ref{fig:seg_show}, which clearly show better performance of the wild-pretrained model. 
\\
Individual pretext tasks did not lead to large differences in accuracy. However, a combination of pretext tasks as done in SMIT showed a larger accuracy improvement. Two-stage pretraining did not improve accuracy compared to wild-pretraining.

\begin{table}[ht]
	\centering
	\caption{Tumor segmentation accuracy with pretraining methods and transformer architectures.}
	\label{tab:tumor_seg_acc}
	\resizebox{0.70\textwidth}{!}{
		\large
		\begin{tabular}{*{5}{c}}
			\toprule
			Model & Training & Pretext Task & LRad & LC  \\
			\midrule
			\textcolor{black}{CNN}  & \textcolor{black}{Scratch} & \textcolor{black}{N/A} &\textcolor{black}{0.42$\pm$0.34}  & \textcolor{black}{0.54$\pm$0.24}  \\
			\textcolor{black}{CNN}  & \textcolor{black}{Self-pretraining} & \textcolor{black}{PRCLv2\cite{zhou2023unified_PRCLV2}} & \textcolor{black}{0.45$\pm$0.33} &  \textcolor{black}{0.56$\pm$0.26} \\
			\textcolor{black}{CNN}  & \textcolor{black}{Wild-pretraining} & \textcolor{black}{PRCLv2} & \textcolor{black}{0.46$\pm$0.33} &  \textcolor{black}{0.57$\pm$0.20}\\
			\hline
			ViT  & Scratch & N/A & 0.55$\pm$0.31 & 0.64$\pm$0.26 \\
			ViT  & Self-pretraining & SMIT & 0.64$\pm$0.25  & 0.67$\pm$0.22 \\
			ViT  & Wild-pretraining & SMIT&\textbf{0.66$\pm$0.22} & \textbf{0.70$\pm$0.23}\\
			\hline
			Swin  & Scratch & N/A  & 0.54$\pm$0.31 & 0.68 $\pm$ 0.24 \\
			Swin  & Self-pretraining & SMIT& 0.63$\pm$0.23  & 0.71 $\pm$ 0.21 \\
			Swin  & Wild-pretraining  &SMIT&   \textbf{0.69$\pm$0.18} & \textbf{0.72 $\pm$ 0.20} \\
			\textcolor{black}{Swin}  & \textcolor{black}{Wild and Self-pretraining}   &\textcolor{black}{SMIT}&  \textcolor{black}{0.65$\pm$0.21}  & \textcolor{black}{0.71$\pm$0.21}  \\
			\hline
			Swin  & Wild-pretraining   &MIP& 0.64$\pm$0.25   & 0.69$\pm$0.24\\
			Swin  & Wild-pretraining  & ITD&  0.64$\pm$0.24  &  0.70$\pm$0.21\\
			Swin  & Wild-pretraining   &ITD \& MPD& 0.66$\pm0.21$ & 0.71$\pm$0.21  \\
			\textcolor{black}{Swin}  & \textcolor{black}{Wild-pretraining}   &\textcolor{black}{Contrastive}& \textcolor{black}{0.64$\pm$0.24} & \textcolor{black}{0.69$\pm$0.24}  \\
			
			%\hline
			\bottomrule
		\end{tabular}
	}
	\vspace{-1em}
\end{table}

\begin{figure}[ht]
	\begin{center}
		\includegraphics[width=0.85\columnwidth,scale=0.7]{figures/Segmentation_show.pdf}\vspace{-0.05cm}\setlength{\belowcaptionskip}{-0.8cm}\setlength{\abovecaptionskip}{0.08cm}\caption{\small Segmentation (yellow contour) produced by Swin model applied to CTs reconstructed with sharp (A, C) and smooth (B, D) kernels for two different patients with 2.5 mm slice thickness. } 
		\label{fig:seg_show}
	\end{center}
\end{figure}


\subsection{Robustness to CT imaging variations}
Analysis of accuracy robustness to CT contrast differences showed that all three network architectures resulted in higher accuracy for contrast-enhanced CTs when using wild-pretraining compared to self-pretraining (Table.~\ref{tab:tumor_seg_contrast}). Wild-pretrained Swin was significantly more accurate than self-pretrained Swin for both contrast and non-contrast CTs. There was no significant difference in accuracy for ViT or CNN models using the two SSL strategies.
\\
Figure.~\ref{fig:boxplotKernels} shows the impact of CT reconstruction kernels on tumor segmentation accuracy using the \textcolor{black}{public LRad dataset (n=139 CTs)}. The wild-pretrained Swin was significantly more accurate than self-pretrained Swin (p $<$ 0.001) with smooth kernel and scratch trained Swin with smooth (p $<$ 0.001) and sharp kernels (p $=$ 0.006). 

%On the other hand, accuracies were similar for wild-pretrained, self-pretrained, and scratch trained ViT and CNN models for smooth and medium kernels. Both wild-pretrained ViT (p $=$ 0.008) and self-pretrained ViT were more accurate than scratch trained ViT (p $=$ 0.02) for sharp kernels. 
%\\
%Subset analysis of an additional set of 20 patients with paired reconstructions showed that self- and wild-pretrained Swin models differed significantly with respect to convolutional kernels when using 5 mm slices (p $=$ 0.036) but not with 2.5 mm slices (Supplementary Table \ref{tab:tumor_seg_kernel}). On the other hand, wild-pretrained and self-pretrained ViT models produced significantly different accuracy for the same convolutional kernels when using 2.5 mm scans (p $=$ 0.019) but not the 5 mm scans.

\begin{table}[ht]
	\centering
	\caption{Tumor segmentation accuracy differences due to CT contrast. Significance comparisons of models were performed with respect to pretrained models on the public LRad dataset.}
	\label{tab:tumor_seg_contrast}
	\resizebox{0.7\textwidth}{!}{
		\large
		\begin{tabular}{*{6}{c}}
			\toprule
			Model & training & Contrast (N=85) & p-value & NonContrast (N=54) & p-value \\
			\midrule
			\textcolor{black}{CNN}  & \textcolor{black}{Scratch} & \textcolor{black}{0.47$\pm$0.33} & \textcolor{black}{0.014} & \textcolor{black}{0.35$\pm$0.33} & \textcolor{black}{0.28} \\
			\textcolor{black}{CNN}  & \textcolor{black}{Self-pretraining}  & \textcolor{black}{ 0.51$\pm$0.32}  & \textcolor{black}{0.72}& \textcolor{black}{0.36$\pm$0.32}  & \textcolor{black}{0.43}\\
			\textcolor{black}{CNN}  & \textcolor{black}{Wild-pretraining} & \textcolor{black}{\textbf{0.52$\pm$0.32}} & - & \textcolor{black}{\textbf{0.38$\pm$0.33}}  & - \\
			\hline
			\hline 
			ViT  & Scratch & 0.60$\pm$0.30 & 0.00004 & 0.47$\pm$0.31 & 9.13e-6 \\
			ViT  & Self-pretraining  &  0.65$\pm$0.27  & 0.064& \textbf{0.65$\pm$0.19}  & 0.61\\
			ViT  & Wild-pretraining & \textbf{0.67$\pm$0.23} & - & 0.64$\pm$0.20  & - \\
			\hline
			\hline
			Swin  & Scratch  & 0.58$\pm$0.30 & 3.26e-6& 0.48$\pm$0.31 & 5.21e-7\\
			Swin  & Self-pretraining  & 0.66$\pm$0.23  & 0.0041& 0.60$\pm$0.23 & 6e-4\\
			Swin  & Wild-pretraining  & \textbf{0.70$\pm$0.19} & - & \textbf{0.68$\pm$0.16} & - \\
			
			\bottomrule
		\end{tabular}
	}
	\vspace{-1em}
\end{table}

\begin{figure}[ht]
	\begin{center}
		\includegraphics[width=0.90\columnwidth,scale=0.8]{figures/boxplotAccuraciesKernelVitSWinCNN.pdf}
		\vspace{-0.05cm}\setlength{\belowcaptionskip}{-0.8cm}\setlength{\abovecaptionskip}{0.08cm}\caption{\small Influence of CT reconstruction kernel on segmentation accuracy with (A) Swin-backbone, (B) ViT-backbone, and (C) CNN backbone. Analysis was performed on all public LRad test cases. Abbreviations: `Self' refers to self-pretraining and `Pre' refers to wild-pretraining.} \label{fig:boxplotKernels}
	\end{center}
\end{figure}


\bumpup

\subsection{Feature reuse analysis}
There was a considerable variation in the reuse of the features for the same network architecture (Swin) when wild-pretrained with different pretext tasks as shown in Figure.~\ref{fig:cka_diff_method} (A) to (E). Concretely, contrastive task resulted in the highest feature reuse even across different feature layers (off-diagonal entries in the CKA matrix). ITD, a global image feature matching loss, resulted in the lowest feature reuse, followed by MPD, MIP, and SMIT (Figure.~\ref{fig:cka_diff_method}). SMIT, which uses a combination of ITD, MIP and MPD, the latter two are spatial locality context losses, resulted in higher feature reuse in the lower (1 to 4) and middle level features (5 to 9) compared to higher level (10 to 14) layer features. In addition, the features across different layers (off-diagonal entries of the CKA matrix) were different between wild-pretrained and fine-tuned features for SMIT, MPD, and ITD tasks but not for the contrastive learning task. Self-pretraining (Figure.~\ref{fig:cka_diff_method} F) with SMIT resulted in more differentiation of off-diagonal features (layers 5 to 14 compared to lower level features 1 to 4) but such differentiation was to a lesser degree than wild-pretrained SMIT. In general, wild-pretraining resulted in feature changes especially close to the later stage encoder layers (13 and 14) for all the pretext tasks when compared to self-pretraining. 
\\
The trend of higher feature reuse for self-pretraining compared to wild-pretrained models was also observed for contrast and non-contrast CT scans (Figure.~\ref{fig:cka_contrast}). In particular, wild-pretraining resulted in larger deviations of features close to the later encoder layer (13 and 14) for contrast compared to non-contrast CTs (Figure.~\ref{fig:cka_contrast} A and B). Analysis of smooth and sharp reconstruction kernels showed higher feature reuse with wild-pretrained models for the lower levels compared to later layer (13 and 14) as shown in Figure.~\ref{fig:cka_contrast} C and D.  %Results indicate why 

\begin{figure}[ht]
	\begin{center}
		\includegraphics[width=1.00\columnwidth,scale=0.7]{figures/cka_analysis_w_contrastive.pdf}
		\vspace{-0.05cm}\setlength{\belowcaptionskip}{-0.8cm}\setlength{\abovecaptionskip}{0.08cm}\caption{\small CKA analysis performed on the Swin network using wild-pretraining with SMIT (E) and self-pretraining with SMIT (F) as well as wild-pretraining with different pretext tasks including (A) contrastive, (B) ITD, (C) MIP, and (D) ITD and MPD.} 
		\label{fig:cka_diff_method}
	\end{center}
\end{figure}


\begin{figure}[ht]
	\begin{center}
		\includegraphics[width=0.82\columnwidth,scale=0.5]{figures/cka_contrast_kernel.pdf}
		\vspace{-0.05cm}\setlength{\belowcaptionskip}{-0.8cm}\setlength{\abovecaptionskip}{0.08cm}\caption{\small CKA analysis to measure the similarity features for (A) contrast and (B) non-contrast CT as well as CT images reconstructed using (C) smooth and (D) sharp kernel with 2.5 mm slices. } 
		\label{fig:cka_contrast}
	\end{center}
\end{figure}

\subsection{Fine-tuning epoch efficiency}{Wild-pretrained models required fewer GPU hours for fine-tuning the models compared to self-pretrained models for Swin, ViT, and CNN networks (Table \ref{tab:finetuning_efficiency}). Wild-pretrained Swin models were most efficient in terms of the number of epochs required for fine-tuning. Validation curves for the various architectures show faster convergence of wild-pretrained models compared to self-pretrained counterparts (Supplementary Figure \ref{fig:finet_efficiency}). 
	

\begin{table}[!t]
	\centering
	\caption{Fine-tuning epoch efficiency of self-pretrained and wild-pretrained models with validation DSC. }
	\label{tab:finetuning_efficiency}
	\resizebox{0.50\textwidth}{!}{
		\large
		\begin{tabular}{*{5}{c}}
			\toprule
			Model & Training & DSC & \textcolor{black}{Epoch efficiency \%} & \textcolor{black}{GPU hours}  \\
			\midrule
			\textcolor{black}{CNN}  & \textcolor{black}{Scratch} & \textcolor{black}{0.69} & - & \textcolor{black}{79} \\
			\textcolor{black}{CNN}  & \textcolor{black}{Self-pretraining} & \textcolor{black}{0.70} & \textcolor{black}{30 \%} & \textcolor{black}{55}\\
			\textcolor{black}{CNN}  & \textcolor{black}{Wild-pretraining} & \textcolor{black}{0.72} & \textcolor{black}{50 \%} & \textcolor{black}{39} \\
			\hline
			ViT  & Scratch & 0.67 & - &  \textcolor{black}{153}\\
			ViT  & Self-pretraining & 0.72 & 15 \% & \textcolor{black}{130} \\
			ViT  & Wild-pretraining & \textbf{0.74} & 47 \% & \textcolor{black}{81} \\
			\hline
			Swin  & Scratch & 0.68 & - &  \textcolor{black}{130} \\
			Swin  & Self-pretraining & 0.73 & 46 \% & \textcolor{black}{70}  \\
			Swin  & Wild-pretraining & \textbf{0.78} & 80 \% & \textcolor{black}{26}\\
			\bottomrule
		\end{tabular}
	}
	\vspace{-1em}
\end{table}
\bumpup

\subsection{Why does wild-pretraining work better than self-pretraining?} 
Analysis of feature reuse between the wild-pretrained and fine-tuned features as well as self-pretrained and fine-tuned features showed that lower level features (1 to 4) were similar for both SSL approaches (Figure.~\ref{fig:cka_diff_method} (E) and (F)). Features in the middle layers (5 to 7) were also similar for the wild-pretrained model, indicating greater feature reuse for low and some mid-level layers. Also, larger differentiation of the features at the deeper layers (13 and 14) occurred for the wild-pretrained model compared to the self-pretrained approach, indicating greater adaption of the network's features to the segmentation task. Feature analysis (Figure.~\ref{fig:cka_pretrain_versus_self_pretrain} (C)) shows that wild-pretraining and self-pretraining produce different pretrained features. Feature self-similarity analysis (Figure.~\ref{fig:cka_pretrain_versus_self_pretrain} (A) and (B)) shows high self-similarity of same-layer features but differentiation of different-layer features with wild-pretraining, indicating an ability to extract a wider range of pretrained features. 

\begin{figure}[htp]
\begin{center}
	
	\includegraphics[width=0.65\columnwidth,scale=0.7]{figures/CKA_pretrain_versus_pretrain.pdf}
	%\vspace{-0.05cm}
	%\setlength{\belowcaptionskip}{-0.8cm}
	%\setlength{\abovecaptionskip}{0.08cm}
	\caption{\small CKA analysis measuring feature self-similarity for self- and wild-pretrained Swin.} 
	\label{fig:cka_pretrain_versus_self_pretrain}
\end{center}
\end{figure}
\bumpup
\bumpup
\bumpup
\section{Discussion and conclusions}
We performed a comprehensive analysis of SSL wild-pretraining and self-pretraining applied to two transformer models and one CNN model in terms of accuracy, robustness to imaging differences, and feature reuse. Our results are consistent with findings from natural images that demonstrated improvements in accuracy and fine-tuning epoch efficiency with wild-pretraining~\cite{Goyal2021}. Our analysis also showed that wild-pretrained Swin models were significantly more robust to differences in CT contrast and acquisition than their self-pretrained counterparts. However, the wild-pretraining approach was less beneficial for the ViT and CNN models. Prior work with natural images~\cite{Matsoukas2022} and medical images~\cite{Taher2021} showed that SSL pretraining is less beneficial for CNN networks. Our analysis with multiple pretext tasks showed higher feature reuse in the lower stages and feature differentiation, especially in the later stages, for wild-pretrained models. This trend of feature reuse at the lower levels but differentiation at the higher levels was also observed for CT imaging variations with wild-pretrained models. Further analysis of feature self-similarity showed larger differentiation of features across different layers for wild-pretrained models compared to self-pretrained models; this allows the former to extract a wider variety of features, which may contribute to higher accuracy and robustness to imaging variations.





\midlacknowledgments{This research was partially supported by the NCI R01CA258821 and the Memorial Sloan Kettering Cancer Center Support Grant/Core Grant NCI P30CA008748.}


\bibliography{midl24_109}

\newpage
\appendix

\section{Additional analysis and results}
\label{sup_sec:technical_details_CKA}
\subsection{Centered kernel alignment (CKA)}
Feature similarities between the pretrained/self-pretrained and fine-tuned models were measured using centered kernel alignment (CKA), which computes a normalized similarity of two feature representations $\bm{X}$ and $\bm{Y}$ in terms of the Hilbert-Schmidt Independence Criterion (HSIC):
\begin{equation}
	\setlength{\abovedisplayskip}{1pt}
	\setlength{\belowdisplayskip}{1pt} 
	\mathrm{CKA}(\bm{K},\bm{L}) = \frac{\mathrm{HSIC_0}(\bm{K},\bm{L})}{\sqrt{\mathrm{HSIC_0}(\bm{K},\bm{K})\mathrm{HSIC_0}(\bm{L},\bm{L})}}
	\label{eqn:CKA_supp}
\end{equation}  
where $\bm{K}=\bm{X}\bm{X}^{\mathrm{T}}$ and $\bm{L}=\bm{Y}\bm{Y}^{\mathrm{T}}$ are the Gram matrices of the features $\bm{X}$ and $\bm{Y}$. CKA computation typically requires the feature activations of the entire dataset to be stored in memory, which is difficult to implement for transformers that have a large number of parameters. Hence, we implemented the minibatch CKA~\cite{nguyen2020wide} by averaging HSIC scores over $k$ minibatches as:
\begin{equation}
	\setlength{\abovedisplayskip}{1pt}
	\setlength{\belowdisplayskip}{1pt} 
	\mathrm{CKA}_{\mathrm{minibatch}}(\bm{K},\bm{L}) = \frac{\frac{1}{k}\sum_{i=1}^{k} \mathrm{HSIC}_1(\bm{X}_i\bm{X}_i^{\mathrm{T}},\bm{Y}_i\bm{Y}_i^{\mathrm{T}})}{\sqrt{\frac{1}{k}\sum_{i=1}^{k} \mathrm{HSIC}_1(\bm{X}_i\bm{X}_i^{\mathrm{T}},\bm{X}_i\bm{X}_i^{\mathrm{T}})}\,\sqrt{\frac{1}{k}\sum_{i=1}^{k} \mathrm{HSIC}_1(\bm{Y}_i\bm{Y}_i^{\mathrm{T}},\bm{Y}_i\bm{Y}_i^{\mathrm{T}})}}
	\label{eqn:CKA_minibatch_supp}
\end{equation}  
An unbiased estimator of HSIC~\cite{song2012feature} was computed to reduce the dependency of CKA values on the batch size:
\begin{equation}
	\setlength{\abovedisplayskip}{1pt}
	\setlength{\belowdisplayskip}{1pt} 
	\mathrm{HSIC_1}(\bm{K},\bm{L}) = \frac{1}{n(n-3)} \left( \mathrm{tr}(\tilde{\bm{K}}\tilde{\bm{L}}) + \frac{\bm{1}^{\mathrm{T}}\tilde{\bm{K}}\bm{1}\bm{1}^{\mathrm{T}}\tilde{\bm{L}}\bm{1}}{(n-1)(n-2)} - \frac{2}{n-2}\bm{1}^{\mathrm{T}}\tilde{\bm{K}}\tilde{\bm{L}}\bm{1} \right)
	\label{eqn:CKA_minibatch_bias_est}
\end{equation} 
where $n$ is the number of samples in a minibatch, and $\tilde{\bm{K}}$ and $\tilde{\bm{L}}$ are obtained by setting the diagonal entries of $\bm{K}$ and $\bm{L}$ to zero.




\subsection{Additional results}
\setcounter{figure}{0}  
\setcounter{table}{0}  
\counterwithin{table}{section}
\counterwithin{figure}{section}
\begin{figure}[htp]
	\begin{center}
		\includegraphics[width=1.0\columnwidth,scale=1.0]{figures/plot_acc_vs_vol_add_prcl.pdf}
		\vspace{-0.05cm}\setlength{\belowcaptionskip}{-0.8cm}\setlength{\abovecaptionskip}{0.08cm}\caption{\small The scatter plot of DSC versus tumor volume (cc) to assess dependency of accuracy on the tumor volume for the analyzed networks.} 
		\label{fig:acc_vs_vol}
	\end{center}
\end{figure}


\begin{figure}[ht]
	\begin{center}
		\includegraphics[width=0.85\columnwidth,scale=0.7]{figures/val_curve_w_cnn.pdf}
		\vspace{-0.05cm}\setlength{\belowcaptionskip}{-0.8cm}\setlength{\abovecaptionskip}{0.08cm}\caption{\small Fine-tuning efficiency measured for self-pretrained and wild-pretrained models using CNN, ViT, and Swin backbones.} \label{fig:finet_efficiency}
	\end{center}
\end{figure}




\begin{table}[ht]
	\centering
	\caption{Robustness of tumor segmentation to different scan reconstructions. Significance tests compared wild-pretrained to self-pretrained and scratch-trained models using the same network architecture.}
	\label{tab:tumor_seg_kernel}
	\resizebox{0.85\textwidth}{!}{
		\large
		\begin{tabular}{*{8}{|c|}}
			\hline
			\multirow{2}{*}{Model} &\multirow{2}{*}{Training} & \multicolumn{3}{c|}{Slice 2.5mm} & \multicolumn{3}{c|}{Slice 5mm} \\
			\cline{3-8} 
			& & Sharp & Smooth & p-value & Sharp & Smooth & p-value \\ 
			\hline
			CNN  & Scratch  & 0.20$\pm$0.20 & 0.22$\pm$0.21 & 0.61 & 0.27$\pm$0.24 & 0.28$\pm$0.26&0.26 \\
			CNN  & Self-pretraining   & 0.21$\pm$0.20 & 0.22$\pm$0.20  &0.57 & 0.27$\pm$0.19 &  0.31$\pm$0.27 & 0.16\\
			CNN  & Wild-pretraining  &0.23$\pm$0.21 & 0.24$\pm$0.23 & 0.68& 0.30 $\pm$  0.25  & 0.34$\pm$0.31 &0.13 \\
			\hline 
			%Model & Training &  Sharp 2.5 mm & Sharp 5 mm & Smooth 2.5 mm & Smooth 5 mm \\
			%\midrule
			ViT  & Scratch  & 0.47$\pm$0.34 & 0.54$\pm$0.32 & 0.08 & 0.49$\pm$0.34 &  0.50$\pm$0.30 & 0.64 \\
			ViT  & Self-pretraining   & 0.64$\pm$0.18 & 0.56$\pm$0.25  &0.019 & 0.58$\pm$0.26 &  0.51$\pm$0.23 & 0.14\\
			ViT  & Wild-pretraining  &0.67$\pm$0.16 & 0.58$\pm$0.26 & 0.077& 0.62$\pm$0.24 & 0.56$\pm$0.22 &0.11 \\
			\hline
			Swin  & Scratch  & 0.52$\pm$0.32 &0.36$\pm$0.36 & 0.37&0.57$\pm$0.48  &0.47$\pm$0.34 & 0.12 \\
			Swin  & Self-pretraining  & 0.58$\pm$0.27 & 0.54$\pm$0.31 & 0.13& 0.52$\pm$0.28  & 0.49$\pm$0.30 & 0.058 \\
			Swin  & Wild-pretraining   & 0.70$\pm$0.18 & 0.66$\pm$0.21 &0.058 &0.62$\pm$0.28  & 0.58$\pm$0.26 & 0.036 \\
			\hline
			%\bottomrule
		\end{tabular}
	}
	\vspace{-1em}
\end{table}


\end{document}
