\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{caption}
\usepackage{float}
\usepackage{xcolor}
\usepackage{enumitem}
\setlist{nosep, leftmargin=14pt}

\usepackage{booktabs}

\usepackage{multirow}

\usepackage{makecell}
\newcommand{\cross}{\text{\sffamily X}}

\usepackage{adjustbox}
\usepackage{siunitx}
% Create a handy centered+middle-aligned column type "C"
\newcolumntype{C}[1]{>{\centering\arraybackslash}m{#1}}

\jmlrvolume{-- nnn}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

\title[Enhancing Contrastive Learning for Retinal Imaging]{Enhancing Contrastive Learning for Retinal Imaging via Adjusted Augmentation Scales}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Cheng, Zijie\nametag{$^{1}$}} \Email{zijie.cheng.23@ucl.ac.uk}\\
\addr $^{1}$ UCL Department of Medical Physics \& Biomedical Engineering, United Kingdom \AND
\Name{Li, Boxuan\nametag{$^{1}$}} \Email{b.li.22@ucl.ac.uk}\\
\Name{Altmann, Andre\nametag{$^{1,2}$}} \Email{a.altmann@ucl.ac.uk}\\
\addr $^{2}$ UCL Hawkes Institute, United Kingdom\AND
\Name{Keane, Pearse\nametag{$^{3}$}} \Email{p.keane@ucl.ac.uk}\\
\addr $^{3}$ UCL Institute of Ophthalmology, United Kingdom\AND
\Name{Zhou, Yukun\nametag{$^{2,3}$}} \Email{yukun.zhou.19@ucl.ac.uk}\\
}

\begin{document}

\maketitle

\begin{abstract}
Contrastive learning, a typical self-supervised learning strategy, operates by bringing similar data together while pushing dissimilar data apart in latent space. This approach extracts robust and discriminative representations, thus being widely used in natural-image computer vision tasks, such as object classification. However, unlike natural images, medical images (e.g., retinal images) tend to share substantial similarities in imaging area and anatomical tissues, leading to a denser distribution in latent space. As a result, the default use of strong augmentations in contrastive learning potentially exacerbates this dense distribution in retinal images, making it difficult to distinguish between genuinely similar and dissimilar data, and therefore hindering model pre-training convergence. In this paper, we hypothesise that weaker augmentations are better suited to contrastive learning for medical image applications, and we investigate model performance under various augmentation strategies. Our study includes six publicly available retinal datasets covering multiple clinically relevant tasks. We assess the models' performance and generalizability via extensive experiments. The model pre-trained with weak augmentation outperforms the one pre-trained with strong augmentation, achieving approximately a 6\% increase in AUPR ($P<0.001$) and a 12.5\% increase in sensitivity ($P<0.001$) on MESSIDOR-2. Similar improvements are observed across other datasets. Our findings suggest that optimizing the scale of augmentation is critical for enhancing the efficacy of contrastive learning in medical imaging. The model weights and relevant code are available at: \href{https://github.com/ziijiecheng/Enhance-contrastive-SSL-for-Retinal-Imaging}{https://github.com/ziijiecheng/Enhance-contrastive-SSL-for-Retinal-Imaging}.
\end{abstract}

\begin{keywords}
contrastive learning, augmentation scales, data distribution, retinal imaging
\end{keywords}

\section{Introduction}

Contrastive learning is a machine learning paradigm that pulls similar data points (e.g., images rotated from the same image) closer and pushes dissimilar ones (e.g., images rotated from different images) farther apart in the latent space without relying on explicit labels \cite{LeKhac2020zn,Jaiswal2020ve}. Such an approach trains the model to learn generalizable features. Although pre-trained only on unlabeled data, the models have demonstrated comparable or even better performance compared to supervised learning-based methods \cite{Misra2019-kz,Hendrycks2019-yf}. In the natural image domain, contrastive learning has achieved promising results in diverse tasks such as object detection \cite{Xie2021-aj}, image classification \cite{Zeng2020-ey}, and video analysis \cite{Singh_2021_CVPR}. Compared to generative learning, contrastive learning has shown better effectiveness in various applications \cite{oquab2024dinov2learningrobustvisual,Caron2021-th,Liu2020-dy}. However, whether this observation extends to medical images remains underexplored.

Recent research has started comparing contrastive learning and generative learning in medical artificial intelligence (AI). For instance, RETFound \cite{Zhou2023-rr}, a foundation model for retinal images, employed a generative learning strategy named the Masked Autoencoder \cite{He2021-cc} for model development and demonstrated superior performance compared to contrastive learning methods in retinal disease classification. Understanding the reasons behind this inconsistency and developing a simple yet efficient solution to improve contrastive learning for medical imaging is crucial.

\begin{figure}[t]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:project_overview}
  {\caption{Figures (a) and (b) illustrate the distribution of distances between positive pairs and negative pairs in both natural and medical image domains. Figure (c) presents the project pipeline: unlabeled data is used to pre-train contrastive learning models while investigating various augmentation strategies. The blue dots and yellow dots indicate augmented images from different original images. The goal of this approach is to enhance feature clustering and improve the accuracy of retinal disease classification.}}
  {\includegraphics[width=0.76\linewidth]{overview}}
\end{figure}
The suboptimal performance of contrastive learning in medical imaging is likely due to inherent differences between the distributions of natural and medical images \cite{WEN2021103145}. Natural images are colorful with varying pixel intensities, while medical images are usually grayscale and structurally similar, especially within the same organ or tissue type \cite{legras2018distribution,Durston2001-bd}. This characteristic results in a denser distribution of medical images within the latent space compared to natural images \cite{zhou2021review}. We hypothesize that such a dense distribution degrades performance when applying contrastive learning methods to medical images. As shown in Figures \ref{fig:project_overview}(a) and \ref{fig:project_overview}(b), natural images under the default strong augmentations in contrastive learning are sparsely distributed in latent space, while different medical images tend to overlap heavily. The pretext task of contrastive learning aims to distinguish between positive pairs (augmented views of the same image) and negative pairs (augmented views of different images). In the context of medical imaging, the significant overlap of augmented images in the latent space renders this pretext task highly challenging, thereby hindering model convergence in contrastive learning. Due to the unique characteristics of medical images, previous studies have explored specific augmentation methods for different medical modalities \cite{Goceri2023-zh, Van_der_Sluijs2023-xc, Kang2022-li}.

In this work, we propose a simple yet effective solution to enhance contrastive learning performance by reducing augmentation scales. The project pipeline is illustrated in Figure \ref{fig:project_overview}(c). We use Distillation with No Labels (DINO) \cite{Caron2021-th} as a study example of contrastive learning strategies, and validate our solution on clinical applications, including glaucoma and diabetic retinopathy diagnosis, using both internal and external evaluations. Our approach not only enhances feature clustering but also demonstrates improved diagnostic performance compared to models using default strong augmentations.

\section{Methods}
\subsection{Problem Definition}
For contrastive learning, given a set of unlabeled retinal images $\mathcal{D} = \left\{ x_i \right\}_{i=1}^{N}$, we create a positive pair $\mathcal{P}^+ = (x_i^{1}, x_i^{2})$ by randomly selecting an image $x_i \in \mathcal{D}$ and applying the augmentation $\Phi_{t, s}$ twice to obtain the augmented views $x_i^{1}$ and $x_i^{2}$, where $t$ indicates the augmentation type and $s$ the scale range. For negative pairs, we sample two images $x_i$ and $x_j \in \mathcal{D}$ (with $i \neq j$) and apply the augmentation to each image, forming the negative pair $\mathcal{P}^- = (x_i^{1}, x_j^{2})$. We then use the feature encoder $En(\cdot)$ of model $f$ to project these images into latent space, e.g., $En(x_i^{1})$. The distance within a positive pair and within a negative pair in latent space is measured by $\operatorname{Dis}(\cdot)$:

\begin{equation}
\operatorname{Dis}(\mathcal{P}^+)
= \bigl\| En(x_i^1) - En(x_i^2)\bigr\|_2
= \sqrt{
\sum\limits_{k=1}^{d} 
\Bigl( En(x_i^1)_k \;-\; En(x_i^2)_k \Bigr)^2
},
\label{eq:positive_euclidean_distance}
\end{equation}

\begin{equation}
\operatorname{Dis}(\mathcal{P}^-)
= \bigl\| En(x_i^1) - En(x_j^2)\bigr\|_2
= \sqrt{
\sum\limits_{k=1}^{d} 
\Bigl( En(x_i^1)_k \;-\; En(x_j^2)_k \Bigr)^2
},
\label{eq:negative_euclidean_distance}
\end{equation}

\noindent where \(En(\cdot)\) maps an image into a latent space (i.e., an embedding) with \(d\) as the dimension of the latent representation. The index \(k\) ranges from \(1\) to \(d\), with \(En(x_i^1)_k\) denoting the \(k\)-th component of the embedding vector \(En(x_i^1)\).

The general training objective of contrastive learning is to train the model $f$ to maximize the distance between  negative pairs and to minimize that for positive pairs, 

\begin{equation}
f^{*} = \operatorname*{arg\,max}_{f} \left( \operatorname{Dis}(\mathcal{P}^-) - \operatorname{Dis}(\mathcal{P}^+) \right).
\label{eq:model optimization}
\end{equation}

When $\text{Dis}(\mathcal{P}^+)$ approximates $\text{Dis}(\mathcal{P}^-)$, it is challenging to train the model $f$ to converge well. This issue is prominent in medical imaging due to less variation compared to natural images. For example, retinal images depict the anatomical tissue of the retina, often showing similar structure and orientation \cite{PATTON200699}, as shown in Figure \ref{fig:project_overview}(b). With strong augmentations $\Phi_{strong}$ (e.g., cropping the images into small patches) following the default augmentation settings in DINO, $\text{Dis}(\mathcal{P}^-)$ decreases while $\text{Dis}(\mathcal{P}^+)$ increases, which makes the objective in Equation~\eqref{eq:model optimization} harder to achieve and may result in suboptimal model pre-training with contrastive learning, i.e., poor performance in distinguishing positive from negative pairs.

Such suboptimal model performance extends to downstream applications, where models are fine-tuned with labeled data \( \mathcal{D}_l = \left\{ (x_i, y_i) \right\}_{i=1}^{L} \) for diverse tasks like disease diagnosis; here \( x_i \) represents the data and \( y_i \) indicates the label. To improve the model's capability in clinically meaningful applications, our strategy involves enhancing the contrastive learning performance in classifying $\mathcal{P}^+$ and $\mathcal{P}^-$ by specifically decreasing \( \text{Dis}(\mathcal{P}^+) \) while increasing \( \text{Dis}(\mathcal{P}^-) \).

\subsection{Scattering Data Distribution with Weak Augmentations}

To achieve such a goal for retinal images, a straightforward strategy is to scale down the augmentation. An extreme case is to remove the augmentation so that $\text{Dis}(\mathcal{P}^+)$ reaches 0 and $\text{Dis}(\mathcal{P}^-)$ remains high. However, pre-training without any augmentation hardly trains the model to learn generalizable and diverse features. Hence, we propose to scale down the augmentation, termed $\Phi_{weak}$, to ease the challenge of training the model $f$ to converge while also avoiding it being too weak for the model to learn generalizable features. Additionally, we investigate the effects of several augmentations that mimic the retinal image artefacts, including random bias field, Gaussian blur, and Gaussian noise. We combine them with $\Phi_{weak}$ to form $\Phi_{weak+med}$.

\section{Experiments}
\subsection{Data}

The pre-training data are from Moorfields Eye Hospital \cite{wagner2022alzeye,Zhou2023-rr} with 1.4 million color fundus images, a retinal image modality. These images were collected from a retrospective cohort study linking ophthalmic data of 353,157 patients, who attended the hospital between 2008 and 2018. All images are preprocessed and resized to 224 $\times$ 224 by an automated retinal image analysis tool AutoMorph \cite{zhou2022automorph}.


We evaluate the efficacy of different augmentation strategies using clinically meaningful tasks, including diabetic retinopathy (DR) diagnosis, glaucoma detection, and multi-class retinal disease classification. For DR diagnosis, we include MESSIDOR-2 \cite{Decenciere2014-xf}, IDRiD \cite{Porwal2018-pd}, and APTOS2019 \cite{aptos2019_blindness_detection}. The labels for DR are based on the International Clinical DR Severity Scale, covering five stages from no DR to proliferative DR. For glaucoma diagnosis, we use the PAPILA dataset \cite{kovalyk_papila_2022}, which has three categorical labels: non-glaucoma, early glaucoma (suspected glaucoma), and advanced glaucoma. For multi-class disease classification tasks, we use two datasets, JSIEC \cite{cen_automatic_2021} containing 1,000 images with 39 categories of common retinal diseases and conditions, and Retina dataset \cite{jr2ngb_cataractdataset} with labels for normal, glaucoma, cataract, and retinal disease. Data splitting details are shown in Table \ref{tab:datasets}. 

\begin{table}[t]
    \scriptsize % Use small font to save space
    \centering
    \caption{Data summary for the datasets used for disease diagnosis. Each dataset is split into training, validation, and testing sets.}
    \label{tab:datasets}
    \begin{tabular}{>{\centering\arraybackslash}p{2cm} >{\centering\arraybackslash}p{1.4cm} >{\centering\arraybackslash}p{1.4cm} >{\centering\arraybackslash}p{1.4cm} >{\centering\arraybackslash}p{1.4cm} >{\centering\arraybackslash}p{1.4cm}}
        \toprule
        \textbf{Dataset} & \textbf{Country} & \textbf{Categories} & \textbf{Training} & \textbf{Validation} & \textbf{Testing} \\
        \midrule
        \multicolumn{6}{c}{\textbf{Diabetic retinopathy}} \\
        \midrule
        MESSIDOR-2 & France & 5 & 972 & 246 & 526 \\
        IDRiD & India & 5 & 329 & 84 & 103 \\
        APTOS2019 & India & 5 & 2048 & 514 & 1100 \\
        \midrule
        \multicolumn{6}{c}{\textbf{Glaucoma}} \\
        \midrule
        PAPILA & Spain & 3 & 312 & 79 & 98 \\
        \midrule
        \multicolumn{6}{c}{\textbf{Multi-class disease}} \\
        \midrule
        JSIEC & China & 39 & 534 & 150 & 316 \\
        Retina & NR & 4 & 336 & 84 & 181 \\
        \bottomrule
    \end{tabular}
\end{table}


\subsection{Pre-training details} 

DINO \cite{Caron2021-th}, a representative and commonly used contrastive learning strategy, was used in the experiment. We first initialized the model with ImageNet weights and then pre-trained it using 1.4 million color fundus images from Moorfields Eye Hospital. The data preprocessing, data quality control, model architecture, and hyperparameters (except for those related to augmentations) were standardized to ensure a fair comparison. The model was pre-trained using an NVIDIA A100 (80G). The details of $\Phi_{strong}$, $\Phi_{weak}$, and $\Phi_{weak+med}$ are listed in Table \ref{table:augmentation_}. $\Phi_{strong}$ follows the default augmentation settings in DINO, which were well-tuned on natural images. Local crop is a small, zoomed-in region of an image. Global crop is a large region of an image. Color jittering involves random adjustments to image brightness and contrast. Gaussian blur is a smoothing effect created by applying a Gaussian filter to reduce detail and noise. Gaussian noise consists of random intensity variations that follow a Gaussian distribution. Random bias field is a smooth, spatially varying intensity variation across an image. The $\Phi_{med}$ introduced into $\Phi_{weak}$ is implemented with TorchIO, a Python library for medical image processing \cite{Perez-Garcia2021-am}.




\begin{table}[t]
\centering
\caption{Various settings of augmentation types and scales. 
Augmentations not listed are consistent with the default strong augmentations, well-tuned on natural images. For local and global crops, the range (e.g., (0.05, 0.4)) represents the cropping scales relative to the original image. The symbol \(p\) denotes the probability of applying a particular transformation, which is defaulted as 1 unless specified. \(\times\) indicates that the transformation is not applied.}
\label{table:augmentation_}

% Set font size inside the table
\fontsize{10pt}{12pt}\selectfont

\begin{adjustbox}{width=0.8\textwidth}
\begin{tabular}{
  @{}l     % left-aligned for the first column (Method names)
  C{2cm}   % Local crop
  C{2.2cm} % Global crop
  C{2.5cm} % Color jitter
  C{1.2cm} % Blur
  C{1.2cm} % Noise
  C{1.5cm} % Bias field
  @{}}
\toprule
& Local crop & Global crop & Color jitter & Blur & Noise & Bias field \\
\midrule

$\Phi_{\text{strong}}$ 
  & (0.05, 0.4)
  & (0.4, 1.0)
  & \makecell{bright:0.4\\ contrast:0.4}
  & {\hspace{3pt}\(\times\)}      % Nudged right a bit
  & {\hspace{3pt}\(\times\)}
  & {\hspace{3pt}\(\times\)} \\
\midrule

$\Phi_{\text{weak}}$ 
  & (0.2, 0.5) 
  & (0.5, 1.0)
  & \makecell{bright:0.2\\ contrast:0.2}
  & {\hspace{3pt}\(\times\)}
  & {\hspace{3pt}\(\times\)}
  & {\hspace{3pt}\(\times\)} \\
\midrule

$\Phi_{\substack{\text{weak}\\ + \text{med}}}$
  & (0.2, 0.5)
  & (0.5, 1.0)
  & \makecell{bright:0.2\\ contrast:0.2}
  %-----------------------
  % Nudge the "std:..., p:..." lines to the right
  & \makecell{\hspace{3pt}std:0.1\\ \hspace{3pt}\(p\):0.5}
  & \makecell{\hspace{3pt}std:0.1\\ \hspace{3pt}\(p\):0.5}
  & \makecell{\hspace{3pt}scale:0.1\\ \hspace{3pt}\(p\):0.5} \\
\bottomrule
\end{tabular}
\end{adjustbox}

\end{table}
We then compared these models by adapting them to downstream tasks of disease diagnosis. We evaluated the model performance with the Area Under the Receiver Operating Characteristic curve (AUROC), the Area Under the Precision-Recall curve (AUPR), and sensitivity. Each experiment is run five times with random seeds to obtain performance statistics.


\begin{figure}[t]
 % Caption and label go in the first argument and the figure contents
 % go in the second argument
\floatconts
  {fig:dino_t-sne_and_data_distribution}
  {\caption{We extract features using the DINO teacher model (encoder), pre-trained respectively with $\Phi_{strong}$, $\Phi_{weak}$ and $\Phi_{weak+med}$. First, we calculate the Euclidean distances between positive and negative pairs and compare their distance distributions in Figure (a). We also use a t-SNE map to visualize feature clustering in latent space in Figure (b), where different colors represent augmented views from different images. In figure (b), vits represents small ViT.}}
  {\includegraphics[width=\linewidth]{MIDL_interpreterbility}}
\end{figure}

\subsection{Experiment Result}

\begin{table}[t]
\centering
\caption{Model comparison on disease diagnosis with internal evaluation. The middle three columns show model performance under varied data augmentation strategies, with the highest value in each row highlighted in bold. For each task, the model was fine-tuned using five random seeds (affecting training data shuffling) and evaluated on the test set, yielding five replicas. Statistical significance was determined via a repeated-measures analysis of variance (ANOVA), with random seed treated as a within-subjects factor. The resulting $P$ values quantify the significance of performance differences between $\Phi_{weak}$ and $\Phi_{strong}$.}
\label{table:internal_eval}
\begin{adjustbox}{width=0.85\textwidth}
\begin{tabular}{lcccc}
\hline
\phantom{Measurement}         & $\Phi_{strong}$                 & $\Phi_{weak}$                   & $\Phi_{weak + med}$             & $P$ value \\
\hline
\multicolumn{5}{l}{\textbf{MESSIDOR-2}} \\
\hline
AUROC               & .838 \,(.835, .840) & \textbf{.848 \,(.846, .851)} & .823 \,(.817, .829) & $<.001$ \\
AUPR                & .523 \,(.516, .530) & \textbf{.582 \,(.575, .589)} & .523 \,(.498, .547) & $<.001$ \\
Sensitivity                & .154 \,(.146, .162) & \textbf{.279 \,(.255, .303)} & .247 \,(.220, .274) & $<.001$ \\
\hline
\multicolumn{5}{l}{\textbf{APTOS2019}} \\
\hline
AUROC               & .933 \,(.932, .933) & \textbf{.933 \,(.933, .934)} & .924 \,(.924, .925) & $.004$ \\
AUPR                & \textbf{.667 \,(.665, .670)} & .665 \,(.661, .668) & .637 \,(.635, .639) & $.236$ \\
Sensitivity                & .469 \,(.466, .472) & \textbf{.528 \,(.509, .547)} & .482 \,(.474, .491) & $.003$ \\
\hline
\multicolumn{5}{l}{\textbf{IDRiD}} \\
\hline
AUROC               & .747 \,(.736, .758) & \textbf{.790 \,(.782, .798)} & .726 \,(.720, .732) & $<.001$ \\
AUPR                & .461 \,(.445, .476) & \textbf{.498 \,(.486, .509)} & .432 \,(.419, .446) & $<.001$ \\
Sensitivity                & \textbf{.366 \,(.343, .390)} & .355 \,(.340, .369) & .291 \,(.266, .316) & $.369$ \\
\hline
\multicolumn{5}{l}{\textbf{PAPILA}} \\
\hline
AUROC               & .791 \,(.782, .800) & \textbf{.816 \,(.804, .829)} & .792 \,(.785, .800) & $.003$ \\
AUPR                & .637 \,(.630, .643) & \textbf{.671 \,(.653, .688)} & .628 \,(.619, .638) & $.018$ \\
Sensitivity                & .238 \,(.205, .271) & .295 \,(.264, .327) & \textbf{.312 \,(.282, .341)} & $.096$ \\
\hline
\multicolumn{5}{l}{\textbf{JSIEC}} \\
\hline
AUROC               & .960 \,(.958, .962) & \textbf{.977 \,(.975, .979)} & .968 \,(.967, .969) & $<.001$ \\
AUPR                & .651 \,(.637, .664) & \textbf{.760 \,(.750, .769)} & .707 \,(.695, .720) & $<.001$ \\
Sensitivity                & .331 \,(.316, .345) & \textbf{.568 \,(.557, .578)} & .443 \,(.416, .470) & $<.001$ \\
\hline
\multicolumn{5}{l}{\textbf{Retina}} \\
\hline
AUROC               & .781 \,(.776, .787) & .807 \,(.801, .813) & \textbf{.814 \,(.807, .820)} & $<.001$ \\
AUPR                & .594 \,(.585, .604) & \textbf{.632 \,(.620, .643)} & .626 \,(.616, .635) & $.002$ \\
Sensitivity                & .326 \,(.321, .332) & \textbf{.416 \,(.396, .436)} & .375 \,(.352, .399) & $<.001$ \\
\hline
\end{tabular}
\end{adjustbox}
\end{table}

\begin{table}[t]
\centering
\caption{This table presents the external evaluation results on diabetic retinopathy datasets based on AUROC. For each dataset pair, the highest mean value among the different augmentation strategies is highlighted in bold. For each task in internal evaluation, we generate five replicas using different random seeds. For each replica, the model weights corresponding to the best performance on the validation set are saved for subsequent external performance assessment. Statistical significance was determined via a repeated-measures analysis of variance (ANOVA), with random seed treated as a within-subjects factor. The resulting $P$ values quantify the significance of performance differences between $\Phi_{weak}$ and $\Phi_{strong}$.}
\label{table:external_eval}
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{llcccccc}
\hline
\textbf{Fine-tune data} & & \multicolumn{2}{c}{APTOS2019} & \multicolumn{2}{c}{IDRiD} & \multicolumn{2}{c}{MESSIDOR-2} \\
\hline
\textbf{Test data} & & IDRiD & MESSIDOR-2 & APTOS2019 & MESSIDOR-2 & APTOS2019 & IDRiD \\
\hline
$\Phi_{strong}$ & & .785 $\pm$ .006 & \textbf{.767 $\pm$ .001} & .740 $\pm$ .011 & .744 $\pm$ .008 & \textbf{.804 $\pm$ .007} & .743 $\pm$ .010 \\
$\Phi_{weak}$ & & \textbf{.790 $\pm$ .005} & .761 $\pm$ .002 & \textbf{.744 $\pm$ .012} & \textbf{.761 $\pm$ .007} & .798 $\pm$ .009 & \textbf{.746 $\pm$ .007} \\
$\Phi_{weak + med}$ & & .752 $\pm$ .004 & .692 $\pm$ .003 & .732 $\pm$ .011 & .723 $\pm$ .008 & .708 $\pm$ .018 & .738 $\pm$ .009 \\
$P$ value & & .125 & $<.001$ & .179 & $<.001$ & .211 & .316 \\
\hline
\end{tabular}
\end{adjustbox}
\end{table}

We first plotted the distribution of distances between positive pairs and distances between negative pairs in different augmentation strategies. The model pre-trained with $\Phi_{weak}$ better distinguished these pairs, as shown in Figure \ref{fig:dino_t-sne_and_data_distribution}(a). We also observed the clustering performance of the models, that is, how positive and negative pairs were distributed, across different augmentation strategies through the t-SNE map \cite{Maaten2008-vj}. We repeatedly augmented each image to create image groups, where positive pairs consisted of images within the same group, and negative pairs were images from different groups. Then, we projected these images into latent space and found that the features of negative pairs have a distinct distribution under the weak augmentation shown in Figure \ref{fig:dino_t-sne_and_data_distribution}(b). We also used the Silhouette score \cite{Shahapure2020-sd} to quantify the clustering quality of DINO pre-trained under different augmentation strategies. DINO pre-trained with \(\Phi_{weak}\) achieved the highest score of 0.201, while those pre-trained with  \(\Phi_{strong}\) and \(\Phi_{weak+med}\) achieved scores of 0.117 and 0.130, respectively.

In the internal evaluation presented in Table \ref{table:internal_eval}, DINO with \(\Phi_{weak}\) outperformed the other augmentation strategies on most retinal disease classification tasks. Specifically, on MESSIDOR-2, PAPILA, JSIEC, and Retina, the model employing \(\Phi_{weak}\) consistently demonstrated higher AUROC, AUPR, and sensitivity compared with \(\Phi_{strong}\). Notably, on JSIEC, the model pre-trained with \(\Phi_{weak}\) achieved a 10.9\% increase in AUPR and a 23.7\% increase in sensitivity compared to \(\Phi_{strong}\) ($P<0.001$). However, on IDRiD, although the model achieved higher AUROC and AUPR under \(\Phi_{weak}\) than under \(\Phi_{strong}\), \(\Phi_{strong}\) conferred a slight advantage of approximately 1.1\% in sensitivity. Introducing the medical augmentation \(\Phi_{med}\) generally diminished the model's performance. For example, on IDRiD and JSIEC, combining \(\Phi_{weak}\) with \(\Phi_{med}\) (\(\Phi_{weak+med}\)) reduced performance, particularly sensitivity, by 6.4\% and 12.5\% compared to \(\Phi_{weak}\), respectively. These tasks often show low sensitivity, as seen in diabetic retinopathy classification with five classes, a common challenge in this application \cite{Islam2022-ri,Long2024-aq}.

As shown in Table \ref{table:external_eval}, the external evaluation indicated that $\Phi_{weak}$ performed better than $\Phi_{strong}$ and $\Phi_{weak+med}$ in most tasks. For instance, when the model fine-tuned on IDRiD was externally evaluated on APTOS2019 and MESSIDOR-2, the model pre-trained with $\Phi_{weak}$ outperformed $\Phi_{strong}$ by 0.4\% and 1.7\%, respectively.

\section{Discussion and Conclusion}

In this study, we aimed to improve the contrastive learning performance in the medical image domain. We proposed a hypothesis that the dense distribution of medical images might cause the suboptimal performance of contrastive learning, and validated it in our experiments. Our findings suggest that simply reducing augmentation scales to an appropriate level can improve the clustering performance and therefore enhance model performance in downstream tasks. Additionally, when incorporating medical-specific augmentation $\Phi_{med}$ into $\Phi_{weak}$, the collective augmentation could again decrease $\text{Dis}(\mathcal{P}^-)$ while increasing $\text{Dis}(\mathcal{P}^+)$ (Figure \ref{fig:dino_t-sne_and_data_distribution}), generating adverse effects on model performance. These findings offer key guidance for model pre-training with contrastive learning for medical images.

Although this work brings insights, we acknowledge several limitations that should be studied in future work. First, the performance under $\Phi_{weak}$ sometimes only has a slight advantage compared to that under $\Phi_{strong}$ in both internal and external evaluation. This is likely caused by nearly saturated performance after pretraining on large-scale natural images. Some techniques, such as methods for automatically adjusting augmentation scales, will be studied to achieve optimised performance. Specifically, for positive pairs that are too far apart in latent space and negative pairs that are too close, the loss function will assign greater weights to them during model pre-training. Second, we only validated our hypothesis and solution on DINO; more contrastive learning strategies, such as DINOv2 \cite{oquab2024dinov2learningrobustvisual}, could be investigated. Third, some quantitative metrics describing the clustering performance have not been investigated, which will be proposed in future work to guide the augmentation scaling. This work pioneered the optimization of contrastive learning in the medical domain and encouraged tailored model training settings for medical images.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version


\bibliography{midl25_139}

\end{document}
