\documentclass{midl} % Include author names
% \documentclass[anon]{midl} % Anonymized submission

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

% \usepackage{mwe} % to get dummy images

%========================================
% Our commonly used packages
\input{math_commands.tex}
\usepackage{mathtools}
\usepackage{graphicx}
% \usepackage{subfigure}
\usepackage{float}
\usepackage{color}
\usepackage{bm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
% \usepackage{amsthm}
\usepackage{multirow}
\usepackage{adjustbox}
\usepackage{booktabs}
\usepackage{caption}
% \usepackage{subcaption}
\usepackage{makecell}
\usepackage{mathtools}
\usepackage{enumitem}
\usepackage{array}
\usepackage{wrapfig}
\usepackage{hyperref}
\usepackage{nicefrac}% compact symbols for 1/2, etc.
% \newtheorem{theorem}{Theorem}
% \newtheorem{corollary}{Corollary}[theorem]
% \newtheorem{remark}{Remark}[theorem]
% \newtheorem{lemma}{Lemma}
% Norm delimiter: \norm{x} typesets \lVert x \rVert.
\newcommand{\norm}[1]{\lVert #1 \rVert}
% Bold lower-case vector shorthands built on \bm.
% NOTE(review): \def is used deliberately. \aa, \tt, \H and \L already exist
% as LaTeX text commands, so \newcommand would fail with "already defined";
% their original text-mode meanings are overridden from here on.
% math_commands.tex may also define some of these names (e.g. \R, \E) -- confirm.
\def \aa {{\bm{a}}}
\def \ee {{\bm{e}}}
\def \rr {{\bm{r}}}
\def \tt {{\bm{t}}}
\def \xx {{\bm{x}}}
\def \yy {{\bm{y}}}
\def \pp {{\bm{p}}}
\def \qq {{\bm{q}}}
\def \uu {{\bm{u}}}
\def \ww {{\bm{w}}}
\def \zz {{\bm{z}}}
% Bold e with subscript j / i (the subscript is inside \bm, so it is bold too).
\def \ej {{\bm{e_j}}}
\def \ei {{\bm{e_i}}}
\def \btheta {{\bm{\theta}}}
% Calligraphic and blackboard-bold single-letter shorthands.
\def \X  {\mathcal{X}}
\def \Y  {\mathcal{Y}}
\def \F  {\mathcal{F}}
\def \H  {\mathcal{H}}
\def \L  {\mathcal{L}}
\def \R  {\mathbb{R}}
\def \U  {\mathcal{U}}
\def \D  {\mathcal{D}}
\def \C  {\mathcal{C}}
\def \E  {\mathbb{E}}
% Bold upper-case W.
\def \W {{\bm{W}}}
%========================================


% Numbered "Definition" environment.
\newtheorem{mydefinition}{Definition}
%\newtheorem{theorem}{Theorem}
%\newtheorem{lemma}{Lemma}
%\newtheorem{observation}{Observation}
%\newtheorem{corollary}{Corollary}
%\newtheorem{property}{Property}
%\newenvironment{theorem}{\begin{Theorem}}{\end{Theorem}}
%\newenvironment{lemma}{\begin{Lemma}}{\end{Lemma}}
% Lower-case wrappers that forward to capitalised environments.
% NOTE(review): Observation and Property are not defined anywhere in this file;
% presumably they must come from the class or another input -- confirm before
% using these environments, otherwise \begin{observation} will fail.
\newenvironment{observation}{\begin{Observation}}{\end{Observation}}
%\newenvironment{corollary}{\begin{Corollary}}{\end{Corollary}}
\newenvironment{property}{\begin{Property}}{\end{Property}}
%\newenvironment{qedproof}{\noindent {\bf Proof:\,\ }}{\hfill\mbox{\ $\Box$}\medskip}
% qedproof is a thin alias for the proof environment.
\newenvironment{qedproof}{\begin{proof}}{\end{proof}}
%\newcolumntype{L}{>{\centering\arraybackslash}m{3cm}}
%\usepackage{tikz}
%\usetikzlibrary{decorations.pathreplacing}
%\newcommand{\vect}[2]{\boldsymbol{#1}}
% Shorter bold for vectors: \b{x} typesets \mathbf{x}.
% \renewcommand is required because \b already exists (the bar-under accent),
% which is no longer available after this redefinition.
\renewcommand{\b}[1]{\mathbf{#1}}
% \vec becomes an alias of \mathbf (bold upright instead of an arrow accent).
\let\vec\mathbf

% Calligraphic L.
\newcommand{\LLL}{\mathcal{L}}

% Local defs

\usepackage{color}

% Inline author annotations, typeset in colour inside a local group so the
% colour change does not leak into the surrounding text.
%   \TODO{text} -> red  "TODO: text"
%   \NOTE{text} -> blue "NOTE: text"
% \color is the colour-switching declaration provided by the color package.
\newcommand{\TODO}[1]{{\color{red}TODO: #1}}
\newcommand{\NOTE}[1]{{\color{blue}NOTE: #1}}

% \Summe{#1}{#2}{#3}: a math operator (\mathop) built from two stacked rows
% via \oalign:
%   row 1: \sum with #2 attached as an upper limit (\limits), followed by #3;
%   row 2: #1 set in a left-aligned subarray underneath.
% Both rows are pushed to the left by the trailing \hfill.
% Must be used in math mode, since \mathop is a math-mode primitive.
\def\Summe#1#2#3{
  \mathop{\oalign{\ensuremath{\sum \limits^{#2}{#3}}\hfill\cr
    \ensuremath{\begin{subarray}{l}{#1}\end{subarray}}\hfill\cr}}
}

% Upright operator name "xlnx".
\newcommand{\xlnx}{\operatorname{xlnx}}
% Local intrinsic dimensionality: \IDop and \ID print "LID";
% \IDstar adds a superscript star.
\newcommand{\IDop}{\operatorname{LID}}
\newcommand{\ID}{\IDop}
\newcommand{\IDstar}{\IDop^{*}}

% Upright labels for losses, methods and the similarity function.
\newcommand{\NTXent}{\mathrm{NTXent}}
\newcommand{\BYOL}{\mathrm{BYOL}}
\newcommand{\simil}{\mathrm{sim}}
\newcommand{\SSL}{\mathrm{SSL}}

% Upright abbreviations that are usable in both text and math (\ensuremath).
\newcommand{\FR}{\ensuremath{\mathrm{FR}}}
\newcommand{\AFR}{\ensuremath{\mathrm{AFR}}}
\newcommand{\KLD}{\ensuremath{\mathrm{KL}}}
\newcommand{\AKL}{\ensuremath{\mathrm{AKL}}}
% Calligraphic F. \mathcal{F} takes an argument, so the font change is
% confined to the letter; the old-style \cal declaration is not delimited
% when \ensuremath is expanded inside an existing formula.
\newcommand{\Fm}{\ensuremath{\mathcal{F}}}
%\newcommand{\Fm}{\ensuremath{\mathrm{Fm}}}

% --- Coefficient operator (lambda), plain and starred -----------------------
\newcommand*{\Coeffop}{\operatorname{\lambda}}
\newcommand*{\Coeff}{\Coeffop}
\newcommand*{\Coeffstar}{\Coeffop^{*}}

% --- Entropy-family labels (upright) ----------------------------------------
\newcommand*{\Ent}{\mathrm{H}}
\newcommand*{\VarEnt}{\mathrm{VarH}}
\newcommand*{\NEnt}{\mathrm{nH}}
\newcommand*{\cEnt}{\mathrm{cH}}
\newcommand*{\ncEnt}{\mathrm{ncH}}
\newcommand*{\EntP}{\mathrm{HP}}
\newcommand*{\nEntP}{\mathrm{nHP}}
\newcommand*{\CEnt}{\mathrm{XH}}
\newcommand*{\CEntP}{\mathrm{XHP}}

% --- Divergence / distance labels (upright) ---------------------------------
%\newcommand{\KL}{\mathrm{KL}}
\newcommand*{\WD}{\mathrm{WD}}
\newcommand*{\JS}{\mathrm{JS}}
\newcommand*{\TV}{\mathrm{TV}}
\newcommand*{\HD}{\mathrm{HD}}
%\newcommand{\Ltwo}{\mathrm{L}_2\mathrm{D}}
\newcommand*{\Ltwo}{\mathrm{L2D}}
\newcommand*{\ChiSq}{\chi^{2}\mathrm{D}}
\newcommand*{\AlphD}{\alpha\mathrm{D}}

% --- Auxiliary symbol A with a subscript argument ---------------------------
\newcommand*{\expauxOp}{A}
\newcommand{\expaux}[1]{\ensuremath{{\expauxOp}_{#1}}}

% --- Probability, expectation, variance (sans-serif) ------------------------
\newcommand*{\pr}{\mathsf{Pr}}
\newcommand*{\e}{\mathsf{E}}
\newcommand*{\var}{\mathsf{Var}}

% --- Intrinsic dimension / indiscriminability, as labels and as operators ---
\newcommand*{\IntrDim}{\mathrm{IntrDim}}
\newcommand*{\InDiscr}{\mathrm{InDiscr}}
\newcommand*{\InDiscrop}{\operatorname{InDiscr}}
\newcommand*{\IntrDimop}{\operatorname{IntrDim}}

% --- Sets and spaces ---------------------------------------------------------
% \newcommand{\set}[2]{\left.\left\{{#1}~\right|~{#2}\right\}}
\newcommand*{\domain}{\ensuremath{\mathcal{S}}}
\newcommand*{\real}{\mathbb{R}}
\newcommand*{\distrib}{\ensuremath{\mathcal{D}}}
\newcommand*{\ball}{\ensuremath{\mathcal{B}}}
\newcommand*{\ballx}{\ensuremath{\ball_{\mathbf{x}}}}
\newcommand*{\sphere}{\ensuremath{\ball'}}

% --- Alignment helper: skips two columns, then inserts \hfill ---------------
\newcommand*{\righteqn}{ & & \hfill}

% --- Name of the proposed method as printed in the paper --------------------
\newcommand*{\myMethod}{DDA}

% End of our packages
%========================================

% Header for extended abstracts
% Proceedings name shown by the class (short name, long name).
\jmlrproceedings{MIDL}{Medical Imaging with Deep Learning}
\jmlryear{2024}

% Settings for submissions under review (uncomment while under review):
% \jmlrvolume{-- Under Review}
% \jmlryear{2024}
% \jmlrworkshop{Full Paper -- MIDL 2024 submission}
% \editors{Under Review for MIDL 2024}

% Settings for the accepted (camera-ready) version.
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024}
\jmlrvolume{-- 17}
\editors{Accepted for publication at MIDL 2024}
% Paper title: the optional argument is the short title, the mandatory one the full title.
% Author block: each \Name carries a \nametag superscript that matches the
% numbered \addr affiliation lines below.
\title[Dimensionality Driven Augmentation]{DDA: Dimensionality Driven Augmentation Search for Contrastive Learning in Laparoscopic Surgery}


\midlauthor{\Name{{Yuning} Zhou\nametag{$^{1}$}}
\Name{Henry Badgery\nametag{$^{2}$}} 
\Name{{Matthew} Read\nametag{$^{3}$}} 
\Name{{James} Bailey\nametag{$^{4}$}}\\ 
\Name{{Catherine E.} Davey\nametag{$^{1}$}} \\
\addr $^{1}$ Department of Biomedical Engineering, the University of Melbourne, Australia \\
\addr $^{2}$ Department of HPB/UGI Surgery, St Vincent’s Hospital Melbourne, Australia \\
\addr $^{3}$ Department of Surgery, St Vincent’s Hospital Melbourne, Australia \\
\addr $^{4}$ School of Computing and Information Systems, the University of Melbourne, Australia \\
}


\begin{document}

\maketitle

\begin{abstract}
Self-supervised learning (SSL) has potential for effective representation learning in medical imaging, but the choice of data augmentation is critical and domain-specific. It remains uncertain if general augmentation policies suit surgical applications. 
In this work, we automate the search for suitable augmentation policies through a new method called Dimensionality Driven Augmentation Search (\myMethod{}). 
\myMethod{} leverages the local dimensionality of deep representations as a proxy target, and differentiably searches for suitable data augmentation policies in contrastive learning.
We demonstrate the effectiveness and efficiency of \myMethod{} in navigating a large search space and successfully identifying an appropriate data augmentation policy for laparoscopic surgery.
We systematically evaluate \myMethod{}  across three laparoscopic image classification and segmentation tasks, where it significantly improves over existing baselines.
Furthermore, \myMethod{}'s optimised set of augmentations provides insight into domain-specific dependencies when applying contrastive learning in medical applications. For example, while hue is an effective augmentation for natural images, it is not advantageous for laparoscopic images. 
\end{abstract}

\begin{keywords}
differentiable augmentation search, contrastive learning, laparoscopic imaging
\end{keywords}




\section{Introduction}
Self-supervised learning (SSL) has recently shown its potential for generating representations from large-scale datasets without human supervision \cite{chen2020simple, grill2020bootstrap, bardes2022vicreg, he2022masked}. In this approach, a model typically conducts representation learning using a data-generated objective on unlabeled datasets. The learned representations can subsequently be transferred to downstream tasks with limited annotations.
In medical domains such as endoscopic or laparoscopic surgery, data can be readily generated from surgical recordings.  This contrasts with obtaining high-quality annotations, which can be prohibitively expensive in terms of human expert time, particularly for applications like segmentation \cite{ward2021challenges}. In such cases, SSL is highly attractive as it allows effective utilisation of unlabeled data, thereby reducing the demand for annotations.

Contrastive learning is a type of SSL, where the model minimizes the distance between feature representations of augmented views from the same image and maximizes the distance to different images.
This can be achieved either explicitly \cite{chen2020simple} in the loss function or implicitly \cite{grill2020bootstrap,bardes2022vicreg}.
The augmentation policy that generates augmented views significantly influences the representation learning quality, and consequently the transferred performance on downstream tasks \cite{wagner2022importance,huang2023towards}. 
Existing augmentation search methods that automatically select the ``optimal'' policy are usually conducted with supervised learning and require access to annotations \cite{cubuk2018autoaugment,lim2019fast,ho2019population,hataya2020faster}.
\citet{reed2021selfaugment} proposed SelfAugment for augmentation policy search in contrastive learning. It requires training additional projectors with proxy SSL tasks for each candidate and uses Bayesian optimisation for selection. It is very time-consuming if the search space is large. 


Given that the most effective augmentation policy is domain dependent \cite{xiao2020should,bendidi2023no}, the transferability of policies that have been developed for natural images \cite{deng2009imagenet, chen2020simple,grill2020bootstrap,garrido2023on} to medical datasets remains uncertain.
\citet{van2023exploring} explored contrastive learning augmentation policies on Chest X-ray datasets by conducting a grid search on a predefined space of 8 augmentations with varying strengths.
The $8^2$ search space poses challenges for replication due to the exponential growth in computational resources required for SSL \cite{chen2020big}. This challenge is further pronounced when extending findings to other medical applications.


To address these limitations, we propose \emph{Dimensionality Driven Augmentation (\myMethod)}, as illustrated in Figure~\ref{fig:SimAug framework}, to streamline augmentation policy selection in contrastive learning.
Based on the differentiable augmentation search framework \cite{hataya2020faster}, \myMethod{} incorporates the augmentation pool, the number of augmentations in a policy, their probabilities and strengths of application as differentiable parameters. 
Instead of optimizing performance on downstream tasks, which is often infeasible, we optimise a proxy objective function. 
\myMethod{} considers the optimisation of geometric characteristics of the deep representation in contrastive learning (the intrinsic dimension). 
As a result, \myMethod{} does not require access to an annotated dataset or additional training.
We demonstrate that \myMethod{} can explore a large search space, encompassing up to $10^8$ different choices, with an optimizable augmentation strength.

\begin{figure}[t]
    \centering
    \includegraphics[width=400pt]{figures/DDAug_framework.pdf}
    \caption{Illustration of differentiable augmentation policy design and an example application of the contrast operation (left), and the comparison of grid search and our \myMethod{} framework for contrastive learning (right).  
    % Dash lines in the right hand boxes indicate the gradient-guided augmentation selection, and solid lines indicate feed-forward contrastive learning .
    }
    \label{fig:SimAug framework}
    \vspace{-0.2in}
\end{figure}

To summarize, the contributions of this paper are:
\begin{itemize} 
    
    \item We propose a novel approach, \myMethod{}, to search for an optimal augmentation policy in contrastive learning without any additional supervised evaluation (finetuning) on annotated datasets. \myMethod{} effectively identifies a suitable augmentation policy for laparoscopic surgery across various tasks.
    
    \item Notably, \myMethod{} significantly reduces the time required for augmentation search to constant time complexity. For example, navigating a $10^5$ search space with \myMethod{} only takes 48 hours, which is $10^5$ times faster compared to a grid search.

    \item We show that commonly used augmentations for natural images are sub-optimal when used for laparoscopic images. 
    Our selected augmentations provide valuable insights into which techniques are most effective for medical applications. 

\end{itemize}

\section{Method}
In this section, we first provide background regarding contrastive learning and its augmentation policy in Section \ref{sec:contrastive_learning}.
Then, we describe the learnable augmentation policy design and an overview of the \myMethod{} framework in Section \ref{sec:search_framework}, followed by a detailed discussion on our proxy objective function in Section \ref{sec:objective function}.

\subsection{Problem Definition}
\label{sec:contrastive_learning}

\noindent\textbf{Contrastive Learning.} We consider a typical two-stage semi-supervised setting, a popular SSL application. 
We are given a dataset, $\mathcal{D}$, comprised of an unlabelled subset $\mathcal{D}_u$ for contrastive pretraining, and a labelled subset $\mathcal{D}_l$ for finetuning, where $\mathcal{D} = \mathcal{D}_u \cup \mathcal{D}_l$.

In the first stage, for each $\bar \xx \in \mathcal{X}$ in a batch of $M$ images from $\mathcal{D}_u$, we generate two positive samples $\xx,\xx^+$ from the augmentation policy $\mathcal{T}(\bar \xx)$, and $2(M-1)$ independent negative samples $\{\xx_m^-\}_{m=1}^{2(M-1)} = \{\mathcal{T}(\xx) | \xx \neq \bar \xx  \}$ from the other $M-1$ images.
An encoder $f(\cdot)$ maps input images to the representation space 
$\mathcal{X}\to\mathcal{R}^{d}$ with the representation $\mathbf{z}=f(\xx)$, and projector $g(\cdot)$ obtains embedding $\mathbf{e}=g(\mathbf{z}) \in \mathcal{R}^{e}$. 
A classical optimisation objective for contrastive learning \cite{chen2020simple} is the following, 
\begin{equation}
\label{ntxent_loss}
{\mathcal{L}}_\text{NTXent} = \E_{\substack{\xx,\xx^+, \{\xx_m^-\}}} - \ln \frac{\exp(\simil(\bm{e}, \bm{e}^+) / \eta)}{\exp(\simil(\bm{e}, \bm{e}^+) / \eta) + \sum_{m=1}^{2(M-1)} \exp(\simil(\bm{e}, \bm{e}_m^-) / \eta)},
\end{equation}
% where the $\xx_1$ and $\xx_2$ are two views augmented by policy $\mathcal{T}$ from the same image $\xx$, also known as positive pairs, $\xx_{i=3}^{2N}$ are augmented views from other N-1 images in the batch as negative pairs, 
where $\simil(\cdot)$ is the cosine similarity, and $\eta>0$ is the temperature that controls the smoothness of distance distribution.
% , and final loss is computed across all positive pairs.

In the second stage, after the encoder is trained, an additional classifier or segmentation model $h(\cdot)$ can be attached to the encoder as $h \circ f$; for simplicity, we denote this composite model as $f^{'}$. The model $f^{'}$ learns to map the input image to the label space $\mathcal{X}\to\mathcal{Y}$ using labelled pairs $(\xx, \yy) \in \mathcal{D}_{l}$, following a typical supervised learning setup, by minimizing the following objective,
\begin{equation}
    \label{finetune_objective}
    % \argmin_{\theta} 
    {\mathcal{L}}_\text{Supervised} = \mathbb{E}_{\xx \sim \mathcal{D}_l} \mathcal{L} ( f'(\xx), \yy),
\end{equation}
where $\mathcal{L}$ is the objective function for supervised downstream tasks (e.g., cross-entropy).

\noindent\textbf{Augmentation Policy.} The optimal policy for contrastive learning is defined as, 
\begin{equation}
    \label{main_search_objective_function}
    \mathcal{T}^{*} = \argmin_{\mathcal{T}} \mathbb{E}_{\xx \sim \mathcal{D}_{Val}} \mathcal{L} (f^{'}(\xx), \yy),
\end{equation}
where $\mathcal{D}_{Val}$ is the unseen validation data. 
Note that the above objective is defined with respect to a specific downstream task. However, in contrastive learning, we are really interested in obtaining the optimal $\mathcal{T}^{*}$ that is suitable for a wide range of tasks. 

\subsection{\myMethod{} Search Framework}
\label{sec:search_framework}
Each round of a two-stage learning framework mentioned in the previous section is time consuming.
This challenge makes it impossible to apply a thorough grid search, especially for a large search space with a diverse number of operations and choices in each augmentation policy.
To tackle these challenges, we propose a streamlined search framework, \myMethod{}.

Specifically, we set up the augmentation search in a differentiable fashion following existing works \cite{liu2018darts, hataya2020faster}.
Let $\mathbb{O}$ be a set of $K$ image augmentation operations $\mathcal{O}_k\in\mathbb{O}:\mathcal{X}\to\mathcal{X}$, and $\mathcal{X}$ is input space. 
Each operation $\mathcal{O}_k(\hspace{0.15cm}\cdot\hspace{0.15cm};p_k,\lambda_k)$ has two parameters: the probability $p_k$, and the augmentation magnitude $\lambda_k$, for applying the operation. 
For an input image, $\xx\in\mathcal{X}$, the output of applying an augmentation from the set of possible operations $\mathbb{O}$, also known as a sub-policy, depends on weighted sampling. This is defined as, $\bar{\mathcal{O}}(\xx; \bm{p},\bm{\lambda}) \in \{ \mathcal{O}_k(\xx; p_k, \lambda_k ) | k=1,\dots K\}.$

Let $\tau=\{\bar{\mathcal{O}_{1}},\dots,\bar{\mathcal{O}_{N_{\tau}}}\}$ be the set of augmentation sub-policies, which contains $N_{\tau}$ consecutive sub-policies.
The augmented image is obtained by $\xx' = \tau(\xx) =(\bar{\mathcal{O}_{N_{\tau}}}\circ\dots\circ\bar{\mathcal{O}_{1}})(\xx;\bm{p_n},\bm{\lambda_n})$, $n=1,\dots N_{\tau}$.
During searching, the output of the $n^{th}$ sub-policy $\bar{\mathcal{O}_n}(\xx,\bm{p_n},\bm{\lambda_n})$ is the weighted sum of all possible operation choices,
\begin{equation}
\label{weighted_sum_choice}
  \bar{\mathcal{O}_n}(\xx, \bm{p_n},\bm{\lambda_n}) = \sum_{k=1}^{K} \mathcal{O}_{k}(\xx; \sigma_\eta(w_{nk}), \lambda_{nk}),
\end{equation}
where the probability $p_{nk}$ is converted from a learnable real-valued parameter, $w_{nk}$, using the softmax function as $p_{nk} = \sigma_\eta(w_{nk}) = \frac{\exp(w_{nk} / \eta)}{\sum_{k'=1}^{K} \exp(w_{nk'} / \eta)}$, with temperature $\eta>0$. A low temperature pushes the distribution of $p_{nk}$ over $k$ towards a one-hot vector.
 
The components of the augmentation policy thus become learnable parameters and can be updated by backpropagation.
In this case, our augmentation policy is designed as a block of plug-and-play operation layers, denoted $\mathcal{T}$, with parameters $\vw, \bm{\lambda}$.
To learn the optimal augmentation policy $\mathcal{T}^{*}$, we further propose a suitable proxy objective in Section \ref{sec:objective function} for the augmentation search, which optimises the contrastive representation on a fixed encoder.

Here we provide an overview of the three-step \myMethod{} search framework as illustrated in Figure \ref{fig:SimAug framework}.
Firstly, we obtain a contrastive learning encoder $f$ that is trained with the most basic augmentation (e.g. image cropping) with a contrastive projector $g$.
Then, we apply our differentiable augmentation search with the proxy objective explained in Section \ref{sec:objective function} to optimise the parameters of the policy only on the fixed $f$.
Lastly, the optimal policy $\mathcal{T^{*}}$ will be applied to re-conduct contrastive learning to obtain $g^{*}\circ f^{*}$.
We provide a sketch of the pseudocode of the \myMethod{} framework in Algorithm~\ref{alg:LDA_framework} in Appendix~\ref{appendix:DAA_algorithm}.


\subsection{\myMethod{} Search with Representation Dimensionality}
\label{sec:objective function} 
\equationref{main_search_objective_function} cannot be directly optimised due to the lack of validation data, and the two-stage training is hard to optimise in a differentiable manner.
We propose a novel objective as a proxy so that the differentiable search only involves contrastive pretraining. 

In contrastive learning, the deep representation $\mathbf{z} \in \mathcal{R}^{d}$ extracted from a surgical image needs to be distinguishable from the representations of other images. 
Our focus lies in understanding the local distribution of deep representations in the vicinity of a query image.
We define a local neighbourhood-of-interest as a $d$-dimensional sphere with a small radius $r$ centered at the query.
Any data within the neighbourhood has a smaller distance than $r$ from the query, and likely has similar visual content as the query.
An illustration of the deep representation for a local neighbourhood is shown in Figure \ref{fig:lid_r}, in Appendix \ref{appendix:DAA_algorithm}.

To evaluate the properties of a query image's neighborhood, one can estimate the growth rate in the number of nearby data points encountered as the distance from the query increases.
This growth rate provides an estimation of the local intrinsic dimensionality (LID) of the query data located in the high-dimensional subspace \cite{houle2017local1}.
Intuitively, the LID assesses the effective number of dimensions (the intrinsic dimension) needed to characterize the local neighbourhood of a query point.   If the LID equals 2, the local neighbourhood surrounding the query point behaves like it has two dimensions.  LID is thus like a complexity measure and assesses the space-filling properties around a query point.  In our scenario, we will assess the LID of each point in the deep representation produced by SSL.

Let $\mathbf{z}$ be the query point, and let $\mathbf{s}$ denote any nearby representation of a different image within its vicinity.
The distance between $\mathbf{z}$ and $\mathbf{s}$ is denoted as $r_s=l(\mathbf{z},\mathbf{s})$, where $l$ represents a distance measurement. 
We define the cumulative distribution function (CDF), $F(r)$, of the number of data points concerning the local distance distribution with respect to $\mathbf{z}$ as the probability of the sample distance lying within a threshold $r$, $F(r) \triangleq \pr[r_s\leq r]$.

\label{D:IntrDim}
\begin{theorem}[\citet{houle2017local1}]
\label{T:fundamental}
If $F$ is continuously differentiable at 
$r$, then
\[
\ID_F(r)\triangleq\frac{r\cdot F'(r)}{F(r)}
\, .
\]
\end{theorem}
The local intrinsic dimension (LID) at $\mathbf{z}$ is defined as the limit of $\ID_F(r)$ as the radius $r$ tends to zero,
\[
\IDstar_F\triangleq\lim_{r\to 0^{+}}\ID_F(r)
\, .
\]
In practice, $\IDstar_{F}$ needs to be estimated, and it need not be an integer. For simplicity, for the rest of the paper, we use `LID' to refer to the quantity $\IDstar_{F}$.

In our scenario, we wish to optimise the LID of each data sample, encouraging it to be large.  Work by \citet{huang2023ldreg} has shown that it is theoretically desirable to optimise the log transform of the LID in deep learning, rather than the LID.  Intuitively, if a query point has a large (log) LID, then its neighbourhood requires more dimensions to characterise. This is a desirable property for SSL, where a key objective is to avoid what is known as dimensional collapse~\cite{jing2022understanding}.
 
This makes LID a suitable proxy objective for the augmentation search, since 1) it does not require finetuning on downstream tasks, and 2) optimising the LID distribution of samples in the deep space can easily be integrated into a gradient descent framework.
Following \citet{huang2023ldreg}, to obtain a representation with higher LIDs, we can optimise the Fisher-Rao distance between the LID of the representations $\bm{z}$ and a uniform distance distribution (with $\mathrm{LID}$ of 1) using the following loss function,
\begin{equation}
\label{eq:abs_fr_distance}
{\mathcal{L}}_\text{DDA} = \min -\frac{1}{M}\sum_{i=1}^{M} \ln{\IDstar_{F_i}}
\, ,
\end{equation}
where $M$ is the number of samples in the batch, and $\IDstar_{F_i}$ can be estimated using any popular estimator for LID in the encoder's output representation $\bm{z}$.

Different from existing work \cite{huang2023ldreg} that investigated optimisation of model parameters to produce a better contrastive representation, {\em in our work we optimise the augmentation policy with a fixed encoder $f$}.  We find this simple but important shift in perspective is highly significant.
More concretely, we apply differentiable augmentation search to optimise the parameters of the policy with \equationref{eq:abs_fr_distance} as the objective, as shown in Algorithm~\ref{alg:LDAug} in Appendix~\ref{appendix:DAA_algorithm}.
Although it is also possible to apply other proxy functions instead of using LID for \myMethod{}, we empirically find that using LID is highly suitable for the augmentation search.  {\em To our knowledge, ours is the first work to consider the use of LID as a measure for optimising augmentations.}

\section{Experiments}

We evaluate the performance of the augmentation policy found by \myMethod{} in terms of representation quality following standard evaluation protocols such as linear evaluations and finetuning. 
We use ResNet-50 \cite{he2016deep} and SimCLR \cite{chen2020simple} as the contrastive learning framework. 
All hyperparameters closely follow the original papers. 
We perform the pretraining on two datasets: our private dataset SVHM and the public dataset Cholec80 \cite{twinanda2016endonet}. 
For downstream evaluations, we use the Cholec80 Tool \cite{twinanda2016endonet}, CholecSeg8K \cite{DBLP:journals/corr/abs-2012-12453} and our annotated private dataset (denoted as SVHM Seg). 
For the augmentation search, we set $N=5$ by default. We search across the following augmentation operations: \textit{Identical, Brightness, Contrast, Hue, Saturation, Solarize, GaussianBlur, Posterize, Gray, Sharpness}, with further details in Appendix \ref{appendix:search_space}. We set the temperature $\eta$ in \equationref{weighted_sum_choice} to $0.1$.
For LID estimation, we use the method of moments estimator \cite{amsaleg2018extreme}, with the neighbourhood size 16. We compare with the original augmentation policy used by SimCLR, a \textit{Base} policy (only with crop and horizontal flip), a randomly generated one using our search space (\textit{Random}), a manually selected one based on domain experts (\textit{Manual}), and SelfAugment \cite{reed2021selfaugment} with differentiable adaptations. For SelfAugment, we use its min-max strategy as it showed the best performance. We use categorical sampling of the final policy as default. Additional results for the Argmax sampling can be found in Appendix \ref{appendix_additional_results}.
A description of our private dataset, technical details, computing infrastructure and sample source code is provided in Appendix \ref{appendix_exp}. 
The augmentation search using \myMethod{} takes 8 hours for our private dataset and 3 hours for Cholec80. 

\subsection{Evaluations}

\begin{table}[!ht]
\centering
\caption{All results are based on using ResNet-50 as encoder and SimCLR as contrastive pretraining. Results of the linear probing are reported using mean Average Precision (mAP) (\%), and finetuning on downstream segmentation tasks is reported using mIoU (\%). The best results are in \textbf{boldface}. }
\vspace{-0.1in}
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{c|c|c|c|cc}
\midrule
\multirow{2}{*}{Pretraining Dataset} & \multirow{2}{*}{Augmentation} & \multirow{2}{*}{Sampling} & \begin{tabular}[c]{@{}c@{}}Linear Prob\\ (Classification)\end{tabular} & \multicolumn{2}{c}{\begin{tabular}[c]{@{}c@{}}Finetune\\ (Segmentation)\end{tabular}} \\ \cline{4-6} 
 & & & Cholec80 Tool & SVHM Seg & CholecSeg8K \\ \midrule
 None & Supervised & - & - & 55.77 & 57.79 \\ \midrule
\multirow{6}{*}{SVHM} & SimCLR & N/A & 60.00 & 57.28 & 56.18 \\
 & Base & N/A & 59.01 & 57.15 & \textbf{58.71} \\ 
 & Manual & N/A & 47.97 & 54.09 & 57.11 \\ \cline{2-6} 
 & Random & Categorical & Complete Collapse & N/A & N/A \\ 
 & SelfAugment & Categorical & Complete Collapse & N/A & N/A \\ 
 & \myMethod{}& Categorical & \textbf{65.95} & \textbf{58.29} & 57.86 \\ \midrule
\multirow{6}{*}{Cholec80} & SimCLR & N/A & 67.59 & 58.29 & 56.02 \\
 & Base & N/A & 67.78 & 58.36 & 57.04 \\ 
 & Manual & N/A & 59.00 & 55.12 & 59.01 \\ \cline{2-6} 
 & Random & Categorical & Complete Collapse & N/A & N/A \\ 
 & SelfAugment & Categorical & 60.24 & 58.41 & 55.86 \\ 
 & \myMethod{}& Categorical & \textbf{73.59} & \textbf{59.31} & \textbf{59.40} \\ \midrule
\end{tabular}
\end{adjustbox}
\label{main_results}
\vspace{-0.2in}
\end{table}

As shown in Table \ref{main_results}, compared against the original SimCLR augmentation, a large 6\% improvement can be observed for our \myMethod{} on the linear evaluations. The original SimCLR policy performs similarly to the Base policy (crop and resize) on laparoscopic cholecystectomy datasets. For finetuning on segmentation tasks, augmentations found by our method can outperform the original SimCLR policy by 1-3\%. This can be considered as a significant improvement in the context of SSL evaluation \cite{he2020momentum,bardes2022vicreg}. 
Without the augmentation search, using randomly selected operations with our search space results in a complete collapse of the representations, where the model outputs a constant vector \cite{jing2022understanding}. It has been shown in existing work \cite{jing2022understanding} that excessive augmentations could cause dimension or complete collapse. It is worth noting that the augmentation policy found by \myMethod{} avoids selecting an excessive number of operations by selecting \textit{Identical}, as shown in Figure \ref{prob_dist_svhm} and \ref{prob_dist_cholec80}.
Compared with SelfAugment, our method also demonstrated consistently superior performance. On our private SVHM dataset, the augmentation policy found by SelfAugment caused a complete collapse.
For pretraining with Cholec80, \myMethod{} outperforms SelfAugment by 13\% in the linear evaluations.

\subsection{Analysis of the Augmentation Policy Found by \myMethod}
\label{analysis_found_policy}

\begin{figure}[!hbt]
    \centering
    \subfigure{
        \includegraphics[width=95pt]{figures/linear_prob_over_N_SVHM_catonly.pdf}
        \label{linear_prob_svhm}
    }
    \hspace{-0.2in}
    \subfigure{
        \includegraphics[width=95pt]{figures/linear_prob_over_N_Cholec80_catonly.pdf}
        \label{linear_prob_cholec80}
    }
    \hspace{-0.2in}
    \subfigure{
        \includegraphics[width=115pt]{figures/SVHM_search_over_N_dist.pdf}
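        % \title{Pretraining on SVHM} % NOTE(review): \title typesets nothing here and resets the document title after \maketitle; the caption already identifies this panel.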
        \label{prob_dist_svhm}
    }
    \hspace{-0.25in}
    \subfigure{
        \includegraphics[width=115pt]{figures/Cholec80_search_over_N_dist.pdf}
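        % \title{Pretraining on Cholec80} % NOTE(review): \title typesets nothing here and resets the document title after \maketitle; the caption already identifies this panel.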
        \label{prob_dist_cholec80}
    }
    \vspace{-0.1in}
    \caption{
        (a-b) Linear probing accuracy on the Cholec80 Tool dataset with different numbers of augmentation operations ($N$). Each data point is an individual run of the experiment, from augmentation search to pretraining and evaluations.  
        (c-d) Distributions of different augmentation operations found by our method. In subfigures (a) and (c), results are obtained by pretraining on our private SVHM dataset. In subfigures (b) and (d), results are obtained by pretraining on the public dataset Cholec80. 
    }
    \label{fig1}
    \vspace{-0.1in}
\end{figure}

In this subsection, we investigate the effect of the different numbers of operations in the policy for the representation quality. We also examine the policy found by \myMethod. 
As shown in Figure \ref{linear_prob_svhm} and \ref{linear_prob_cholec80}, the representation quality evaluated by the linear probing shows a consistent improvement over the augmentation used by SimCLR and the Base policy.
Similar results for finetuning can be found in Appendix \ref{appendix_additional_results}. This consistent improvement indicates that the commonly used augmentations on natural images appear suboptimal for a medical dataset such as laparoscopic cholecystectomy (LC). 

\begin{figure}[t]
    \centering
        \includegraphics[width=350pt]{figures/representation_space.pdf}
        \vspace{-0.1in}
        \caption{Illustration of DDA and SimCLR augmented images on SVHM dataset.}
        \label{fig:representation_space}
        \vspace{-0.2in}

\end{figure}

To further investigate the difference between the optimal policy for natural images and policies found by \myMethod, which are suitable for the medical datasets, we plotted the distributions of selected operations in Figure \ref{prob_dist_svhm} and \ref{prob_dist_cholec80}. 
It can be observed that despite the different pretraining datasets (our private dataset SVHM and Cholec80), the resulting policy is similar. Detailed results for each policy are in Appendix \ref{appendix_augmentation_result}. The policy found by \myMethod{} prefers selecting \textit{Gaussian Blur}, \textit{Saturation}, \textit{Posterize}, and \textit{Sharpness}. 
One common characteristic of these operations is that they do not change the colour profile of the image. The existing literature has shown that augmentations producing overlapping views of two different but semantically similar images are beneficial for contrastive learning \cite{cai2020all, wang2022chaos, huang2023towards, joshi2023data}. 
In the context of LC images, randomly changing the colour profile to other random colours is not ideal for creating such an overlapping view, since the majority of the contents in the images are red. 
On the other hand, \textit{Gaussian Blur} and \textit{Posterize} can blur the image or decrease its quality, which could easily create an overlapping view. This is because, in LC surgery, camera motion and the fog generated during anatomy dissection by the diathermy hook can degrade image quality. 
Similarly, the effect of \textit{Sharpness} and \textit{Saturation} could also inherently appear in the dataset. 
In Figure \ref{fig:representation_space}, we provide a visualisation of augmented images using \myMethod{} and SimCLR augmentation, where \myMethod{} can create semantically similar views while SimCLR augmentation produces different and unrealistic views. 
Additional augmented images can be found in Appendix \ref{visualisation_of_augmented_images}. 
In summary, the augmentation policy found by \myMethod{} is more effective (verified by linear probing and finetuning) and well suited to the LC dataset. 



\section{Conclusion}
In this study we introduced \myMethod, an automatic augmentation search method explicitly tailored for contrastive learning which considers dimensionality characteristics of the deep representation. \myMethod{} showcases both efficiency and effectiveness in identifying inherently optimised augmentation policies for laparoscopic images. 
Beyond its application to laparoscopic images, \myMethod{} has the potential to be used in other contexts, particularly in representation learning for medical images.

\midlacknowledgments{This research was supported by The University of Melbourne’s Research Computing Services and the Petascale Campus Initiative. All data are provided with ethics approval through St Vincent’s Hospital (ref HREC/67934/SVHM-2020-235987). }

\bibliography{midl24_17}


\clearpage
\appendix

\section{DDA Algorithm}
\label{appendix:DAA_algorithm}


\begin{figure}[!hbt]
    \centering
        \includegraphics[width=350pt]{figures/lid_r.pdf}
        \caption{The figure shown on the left is a representation learned by training a SimCLR model with DDA. These representations are projected onto a 2D space using t-SNE. In this visualization, the radius, $r$, indicates the maximum distance from the query to the relevant neighbourhood, while $r_1$ and $r_2$ represent the distances from the query to the first (NN-1) and second (NN-2) nearest data points, respectively. The third nearest neighbour (NN-3) lies on the sphere at the same distance from the query as $r$.}
        \label{fig:lid_r}
\end{figure}


\SetKwComment{Comment}{/* }{ */}
\begin{algorithm2e}[!hbt]
\small
\caption{Using \myMethod{} to obtain a pre-trained model (higher level pipeline)}
\SetKwInput{KwData}{Input}
\SetKwInput{KwResult}{Output}
\KwData{Encoder $f$, projector $g$, unlabeled dataset $\mathcal{D}_u$, basic augmentation $\mathcal{T}_{basic}$, augmentation policy $\mathcal{T}_{\vw,\lambda}$ }
\KwResult{Optimal policy $\mathcal{T^{*}}_{\vw^{*},\lambda^{*}}$, final encoder pretrained with optimal policy $f^{*}$ }\
\BlankLine
Step 1: Conduct contrastive learning for $g\circ f$ on $\mathcal{D}_u$ using $\mathcal{T}_{basic}$ \\
Step 2: Optimise $\mathcal{T}_{\vw,\lambda}$ on fixed pre-trained $f$ to find $\mathcal{T^{*}}_{\vw^{*},\lambda^{*}}$ through DDA search (see Algorithm \ref{alg:LDAug}) \\
Step 3: Redo contrastive learning for randomly initialised model $g\circ f$ on $\mathcal{D}_u$ using $\mathcal{T^{*}}_{\vw^{*},\lambda^{*}}$ to obtain final $f^*$  \\
\label{alg:LDA_framework}
\end{algorithm2e}


\SetKwComment{Comment}{/* }{ */}
\begin{algorithm2e}[!hbt]
\small
\caption{\myMethod{} Search}
\SetKwInput{KwData}{Input}
\SetKwInput{KwResult}{Output}
\KwData{Encoder $f(\cdot)$, Dataset $\mathcal{D}_u$, policy $\mathcal{T}_{\vw,\lambda}$, learning rate $\alpha$, neighbourhood size $k$, number of epochs $E$, basic augmentation $\mathcal{T}_{basic}$}
\KwResult{Optimal policy $\mathcal{T^{*}}_{\vw^{*},\lambda^{*}}$}
\
\BlankLine
Conduct contrastive learning for $f(\cdot)$ on $\mathcal{D}_u$ with $\mathcal{T}_{basic}$ \\
\For{$e\leftarrow 1$ \KwTo $E$}{
  \For{$i\leftarrow1$ \KwTo Number of Batches}{
  $\xx$ = Sample($\mathcal{D}_u$) \Comment*[r]{Random sample batch of images}
  $\xx'$ = $\mathcal{T}_{\vw,\lambda}$($\xx$) \Comment*[r]{Augment images}
  $\mathbf{z} = f(\xx')$ \Comment*[r]{Obtain representations}
  LID$_{\xx'}$ = estimations($\mathbf{z}$, $k$) \Comment*[r]{LID estimations}
  % $\mathcal{L}$ = - LIDep$_{\xx'}$ 
  $\mathcal{L}$ = - log(LID$_{\xx'}$) \Comment*[r]{Follow \equationref{eq:abs_fr_distance}}
  ${(\vw,\lambda)}^{(i+1)}$ = ${(\vw,\lambda)}^{(i)}$ - $\alpha \nabla \mathcal{L}({(\vw,\lambda)}^{(i)})$ \Comment*[r]{Gradient descent on $\mathcal{T}$ parameters}
  }
}
\label{alg:LDAug}
\end{algorithm2e}

In Figure \ref{fig:lid_r}, we provide an intuitive example of LID, which describes the relative rate at which the cumulative distribution function (CDF) of distances, $F(r)$, increases as the distance $r$ increases from 0. We use the representation learned by SimCLR and project it into a 2D space using t-SNE as an example. Considering a radius $r$ for the query point (the red dot), the LID measures the rate of growth in the number of data objects (blue dots) encountered as the radius $r$ increases. The LID can be estimated from the distances to the query's $k$ nearest neighbours (orange dots), using estimators such as the Maximum Likelihood Estimator (MLE) \cite{levina2004maximum} and the Method of Moments (MoM) \cite{amsaleg2018extreme}. 

In Algorithm \ref{alg:LDAug}, we present the pseudo-code for applying DDA. On a high level, we use basic augmentation to train a contrastive learning encoder and then use this encoder to obtain representations of the data. We optimize a differentiable augmentation policy $\mathcal{T^{*}}_{\vw^{*},\lambda^{*}}$ with randomly initialized parameters that can maximize the LID of the representations. The optimized policy can then be used to train the final encoder with contrastive learning. 




\section{Experiments}
\label{appendix_exp}
We conducted our experiments on Nvidia A100 GPUs with PyTorch implementation, with each experiment distributed across 4 GPUs. We used automatic mixed precision due to its memory efficiency. The implementation of the differentiable search follows existing work \cite{hataya2020faster} and their official code base, and uses the Kornia library\footnote{\url{https://github.com/kornia/kornia}}. 
Our implementation is available in this code repository:  \href{https://github.com/JoJoNing25/DDAug}{https://github.com/JoJoNing25/DDAug}.


\subsection{Experiment Settings}
\label{appendix:experiment_settings}

In this section, we provide details regarding our experiment settings. 

\noindent\textbf{Pretraining.}
For contrastive pretraining, we use SimCLR \cite{chen2020simple} and ResNet-50 \cite{he2016deep}, pretraining for 50 epochs, with a base learning rate of 0.075 with the square root scaling rule. We use LARS \cite{you2017large} as the optimiser and weight decay of $1 \times 10^{-6}$. We use batch size of 192 for all experiments. 

We constructed a large-scale in-house dataset (SVHM) by prospectively recording 70 laparoscopic cholecystectomy (LC) videos, each for an operative case with diverse disease severity and anatomical variability \cite{madni2018parkland}, across three major teaching hospitals in Australia: Epworth HealthCare and the public and private sectors of St Vincent's Hospital Melbourne. 
Procedures followed the same supine approach \cite{carroll1995laparoscopic}, and we only considered data from the initial grasping of the gallbladder up to the cutting of the cystic duct in $1{,}960 \times 1{,}080$ pixels.

We perform the pretraining with the public dataset Cholec80 \cite{twinanda2016endonet} and our private SVHM dataset. Cholec80 has 400,000 frames from 80 videos at 2 fps. We use 70 videos for the pretraining and reserve the remaining 10 videos for downstream evaluations. 
For our private SVHM dataset, we use 300,000 images extracted at 4 frames per second (fps) from 50 videos. We use an image size of $480 \times 854$ pixels for pretraining with Cholec80 and $432 \times 784$ for our private dataset.



\noindent\textbf{Linear Probing and Finetuning.}
For downstream evaluations, we use the Cholec80 tool classification dataset \cite{twinanda2016endonet} for linear probing, and CholecSeg8k \cite{DBLP:journals/corr/abs-2012-12453} and our private labelled dataset (SVHM Seg) for finetuning. 


The Cholec80 tool dataset \cite{twinanda2016endonet} is a multi-class classification task to determine whether a surgical tool is present in the image. We use 70 videos for training and the remaining 10 videos for evaluations. There is no overlap between the videos for evaluations and training/pretraining. 

CholecSeg8k \cite{DBLP:journals/corr/abs-2012-12453} is a labeled subset of Cholec80 where 8,080 frames of $480 \times 854$ at 25 fps are extracted from 17 videos in Cholec80. We removed frames from the training set that are present in the test set to ensure no leakage.


Our SVHM Seg dataset is collected from 20 videos and yielded 1,975 frames with a training dataset of 1,583 frames from 16 videos and a test set of 392 frames from 4 videos unseen in the training set. The dataset was annotated with 20 classes and validated by our surgeons; details are in Table \ref{tab:private_Seg definition}.  
There is no overlap between the SVHM Seg test set, the SVHM Seg training set, or the pretraining SVHM dataset since they are split based on videos (distinctive operating case). 

\begin{table}[!hbt]
\centering
\caption{Class description of our SVHM Seg dataset. }
\label{tab:private_Seg definition}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{@{}cc@{}}
\toprule
Class name             & Description                                                                                           \\ \midrule
Abdominal wall         & abdominal wall                                                                                         \\
Background             & black background beyond circular visual field                                                         \\
Cholangiogram catheter & instrument to apply dye-enhanced imaging for bile ducts visualization (includes shaft, tip and catheter) \\
Clip applicator        & instrument to apply clips to close cystic artery and duct (includes shaft, tip and catheter)        \\
Common bile duct       & bile duct drain from hepatic ducts to duodenum                                                       \\
Cystic artery          & blood supply to the gallbladder                                                                      \\
Cystic duct            & duct draining bile from gallbladder to common bile duct                                              \\
Diathermy hook shaft   & diathermy hook instrument - shaft                                                                    \\
Diathermy hook tip     & diathermy hook instrument - tip                                                                      \\
Duodenum               & section of gastrointestinal tract where common bile duct drains,   distal to stomach                   \\
Gallbladder            & gallbladder                                                                                             \\
Grasper shaft          & grasping instrument of any kind - shaft                                                                \\
Grasper tip            & grasping instrument of any kind - tip                                                                     \\
Liver                  & all other liver segments                                                                                    \\
Omentum                & intra-abdominal fat, includes small bowel                                                                \\
Rouviere's sulcus      & cleft on the right side of the liver; important landmark                                                \\
Scissors shaft         & instrument to cut tissues and structures                                                                 \\
Scissors tip           & instrument to cut tissues and structures                                                                 \\
Segment iv             & segment of liver to the patient left side of gallbladder                                                   \\
Sucker irrigator       & cylindrical instrument for suction and   irrigation                                                        \\ \bottomrule
\end{tabular}
}
\end{table}


For linear probing, we follow the standard protocol that adds a linear classification layer on top of the frozen encoder. Following \citet{ramesh2023dissecting}, for the Cholec80 tool dataset, we use weighted binary cross entropy loss. We use SGD as an optimiser with a learning rate of 0.1, weight decay of $1.0 \times 10^{-4}$, and batch size of 256 and 80 epochs. 

For finetuning on the segmentation task, DeepLabV3+ \cite{chen2017rethinking} is used for the segmentation head. For all datasets, we use AdamW \cite{loshchilov2018decoupled} as the optimiser with a learning rate of 0.005, weight decay of 0.05, and batch size of 32 and 100 epochs. We use the same image size as used in pretraining. 



\subsection{Search Space of the Augmentation}
\label{appendix:search_space}
We summarized all selected operations for the search space in Table \ref{search_space_table}. The augmentation policy used by SimCLR \cite{chen2020simple} on ImageNet \cite{deng2009imagenet} is summarized in Table \ref{simclr_aug_policy}. For easy comparison, we converted the strength used by SimCLR into the same scale as our search space. 
We perform the search for 10 epochs with a learning rate of 0.01, and Adam \cite{kingma2014adam} as the optimiser. This takes around 8 hours for our private dataset and 3 hours for Cholec80. 

The \textit{Manual} baseline is an augmentation policy selected manually based on domain expertise.
In supervised learning, existing works \cite{tokuyasu2021development, silva2022analysis, scheikl2020deep, owen2022automated} have shown that rotation by 30 degrees, contrast, Gaussian noise, and Gaussian blur are commonly used for supervised segmentation tasks. Based on this domain knowledge, we constructed an additional manual selection policy using these popular augmentations from the literature. We set the probability of applying each augmentation to 0.8. The strength for rotation is 30 degrees, the strength for contrast and Gaussian noise is randomly sampled, and the sigma is set to 0.1 to 2.0 for Gaussian blur based on the settings from the above-mentioned literature. 


For the comparison with the baseline method SelfAugment \cite{reed2021selfaugment}, we use our differentiable framework instead of the original Bayesian optimisation, for both fairness and efficiency. This is more efficient because only one additional proxy linear layer is required. We adopt the original proxy SSL task (rotation) for the proxy linear layer and the Min-Max loss objective (minimize $\mathcal{L}_{SS}$ and maximize $\mathcal{L}_{NTXent}$), defined as follows:
\begin{equation}
    \label{selfaugment_objective}
    \argmin_{\mathcal{T}}\mathcal{L}_{SS} - \mathcal{L}_{NTXent},
\end{equation}
where $\mathcal{L}_{SS}$ is the rotation objective function and $\mathcal{L}_{NTXent}$ follows \equationref{ntxent_loss}.
More simply, instead of applying \equationref{eq:abs_fr_distance} of the \myMethod, we apply \equationref{selfaugment_objective} for the SelfAugment to compare within our experiment. All other hyperparameters are kept the same. 

We summarize the augmentation policy found by SelfAugment in Tables \ref{selfaug_policy_SVHM} and \ref{selfaug_policy_cholec80}. 

\begin{table}[!hbt]
\centering
\caption{List of all image augmentations that the policy can choose from during the search. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{ccc}
\hline
Operation Name & Description & Range of magnitudes \\ \hline
Identical & No augmentation & N/A \\
Brightness & \makecell{Adjust the brightness of the image. A magnitude of 0 does not modify the input image, \\ whereas magnitude of 1 gives the white image.} & {[}0.0, 1.0{]} \\
Contrast & \makecell{Control the contrast of the image. A magnitude of 0 generates a completely black image, \\ 1 does not modify the input image, while any other non-negative number modifies \\ the brightness by this factor.} & {[}0.0, 1.0{]} \\
Hue & \makecell{The image hue is adjusted by converting the image to HSV and cyclically shifting the \\ intensities in the hue channel (H). A magnitude of $\pi$ and $-\pi$ give complete reversal of hue \\ channel in HSV space in positive and negative directions, respectively. 0 means no shift.} & {[}$-\pi$, $\pi${]} \\
Saturation & \makecell{Adjust the saturation of the image. A magnitude of 0 will give a black-and-white image, \\ 1 will give the original image, and 2 will enhance the saturation by a factor of 2.} & {[}0.0, 2.0{]} \\
Solarize & \makecell{Invert all pixels above a threshold value of magnitude.} & {[}0.0, 1.0{]} \\
Gaussian Blur & \makecell{Blurs image with randomly chosen Gaussian blur. The kernel size is kept fixed at (23, 23),\\ and the magnitude controls the standard deviation to be used for creating a kernel to perform \\ blurring.} & {[}0.0, $\infty$ {]} \\
Posterize & \makecell{Reduce the number of bits for each pixel to magnitude bits.} & {[}0, 8{]} \\
Gray & Convert the image to a grey scale. No magnitude parameters.  & N/A \\
Sharpness & \makecell{Adjust the sharpness of the image. A magnitude of 0 gives the original image, \\ whereas a magnitude of 1 gives the sharpened image. } & {[}0.0, 1.0{]} \\ \hline
\end{tabular}
\end{adjustbox}
\label{search_space_table}
\end{table}

\begin{figure}[!hbt]
    \centering
        \includegraphics[width=400pt]{figures/10_aug_images.pdf}
        \caption{Illustration of 10 augmentation operations (1)-(10) in our search space.}
        \label{fig:10_aug_images}
\end{figure}

\begin{table}[!hbt]
\centering
\caption{The augmentation policy used by SimCLR }
\begin{tabular}{ccc}
\hline
 & Augmentations & Strengths \\ \midrule
Operation No.1 & Brightness (80\%), Identical (20\%) & 0.8 \\
Operation No.2 & Contrast (80\%), Identical (20\%) & 0.8 \\
Operation No.3 & Saturation (80\%), Identical (20\%) & 0.8 \\
Operation No.4 & Hue (80\%), Identical (20\%) & 1.26 \\
Operation No.5 & Gray (20\%), Identical (80\%) & N/A \\ 
Operation No.6 & GaussianBlur (50\%), Identical (50\%) & {[}0.0, 2.0{]} \\ \midrule
\end{tabular}
\label{simclr_aug_policy}
\end{table}

\begin{table}[!hbt]
\centering
\caption{The augmentation policy found by SelfAugment (with $N=5$) using our private SVHM dataset as the pretraining dataset. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{ccc}
\hline
 & Augmentations & Strengths \\ \midrule
Operation No.1 & Hue (2\%) & 1.11 \\
Operation No.2 & Contrast (97\%), Hue (2\%), Saturation (1\%) & 0.01, 1.06, 1.98 \\
Operation No.3 & Brightness (95\%), Hue (5\%) & 0.32, 0.73 \\
Operation No.4 & Hue (100\%) & 1.66 \\
Operation No.5 & Saturation (91\%), Hue (7\%), Brightness (1\%), Contrast (1\%) & 2.00, 1.11, 0.71, 0.25 \\ \midrule
\end{tabular}
\end{adjustbox}
\label{selfaug_policy_SVHM}
\end{table}


\begin{table}[!hbt]
\centering
\caption{The augmentation policy found by SelfAugment (with $N=5$) using Cholec80 as the pretraining dataset. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{ccc}
\hline
 & Augmentations & Strengths \\ \midrule
Operation No.1 & Saturation(100\%) & 2.00 \\
Operation No.2 & Saturation(100\%) & 2.00 \\
Operation No.3 & Hue (100\%) & 1.54 \\
Operation No.4 & Saturation (89\%), Solarize (11\%) & 2.00, 0.03 \\
Operation No.5 & Saturation (54\%), Hue (25\%), Solarize (14\%), Contrast (7\%) & 2.00, 2.57, 0.79, 0.09 \\ \midrule
\end{tabular}
\end{adjustbox}
\label{selfaug_policy_cholec80}
\end{table}


\clearpage
\subsection{Augmentation Policy found by \myMethod}
\label{appendix_augmentation_result}

In this section, we summarize the augmentation policy found by our \myMethod. For our default choice $N=5$, the found augmentation policy is summarized in Tables \ref{ldaug_policy_SVHM} and \ref{ldaug_policy_cholec80}. 
The augmentation policies corresponding to Figure \ref{fig1} are summarized in Figures \ref{ldaug_policy_cholec80_2} to \ref{ldaug_policy_cholec80_8}. 

\begin{table}[!hbt]
\centering
\caption{The augmentation policy found by \myMethod{} (with $N=5$) using our private SVHM dataset as the pretraining dataset. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{ccc}
\hline
 & Augmentations & Strengths \\ \midrule
Operation No.1 & Identical (89\%), Posterize (8\%), GaussianBlur (2\%) & N/A,  0.96, {[}0.22, 0.28{]} \\
Operation No.2 & Saturation(66\%), Sharpness (20\%), Posterize (10\%) & 1.07,  0.06,  0.99 \\
Operation No.3 & Identical (93\%), Posterize (7\%) & N/A, 1.00 \\
Operation No.4 & Identical (99\%) & N/A \\
Operation No.5 & GaussianBlur (100\%) & {[}0.17, 0.98{]} \\ \midrule
\end{tabular}
\end{adjustbox}
\label{ldaug_policy_SVHM}
\end{table}


\begin{table}[!hbt]
\centering
\caption{The augmentation policy found by \myMethod{} (with $N=5$) using Cholec80 as the pretraining dataset. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{ccc}
\hline
 & Augmentations & Strengths \\ \midrule
Operation No.1 & Identical (54\%), GaussianBlur (34\%), Posterize (8\%) & N/A,  {[}0.16, 0.53{]}, 1.00 \\
Operation No.2 & Saturation(90\%), GaussianBlur (5\%), Hue (4\%) & 1.12,  {[}0.14, 0.17{]},  -1.32 \\
Operation No.3 & Identical (100\%) & N/A \\
Operation No.4 & Identical (100\%) & N/A \\
Operation No.5 & GaussianBlur (100\%) & {[}0.17, 0.79{]} \\ \midrule
\end{tabular}
\end{adjustbox}
\label{ldaug_policy_cholec80}
\end{table}





\begin{figure}[!hbt]
    \centering
    \includegraphics[width=300pt]{figures/Cholec80_tau_0.1_layers_2_dist.pdf}
    \caption{Augmentation policy found by \myMethod{} with $N=2$ on the Cholec80 dataset. The operation number is indicated on y-axis, and augmentation choices on x-axis. The number on each cell indicates the probability of such augmentation being selected in corresponding operation.}
    \label{ldaug_policy_cholec80_2}
\end{figure}


\begin{figure}[!hbt]
    \centering
    \includegraphics[width=300pt]{figures/Cholec80_tau_0.1_layers_4_dist.pdf}
    \caption{Augmentation policy found by \myMethod{} with $N=4$ on the Cholec80 dataset. The operation number is indicated on y-axis, and augmentation choices on x-axis. The number on each cell indicates the probability of such augmentation being selected in corresponding operation.}
\end{figure}


\begin{figure}[!hbt]
    \centering
    \includegraphics[width=300pt]{figures/Cholec80_tau_0.1_layers_5_dist.pdf}
    \caption{Augmentation policy found by \myMethod{} with $N=5$ on the Cholec80 dataset. The operation number is indicated on y-axis, and augmentation choices on x-axis. The number on each cell indicates the probability of such augmentation being selected in corresponding operation.}
\end{figure}


\begin{figure}[!hbt]
    \centering
    \includegraphics[width=300pt]{figures/Cholec80_tau_0.1_layers_6_dist.pdf}
    \caption{Augmentation policy found by \myMethod{} with $N=6$ on the Cholec80 dataset. The operation number is indicated on y-axis, and augmentation choices on x-axis. The number on each cell indicates the probability of such augmentation being selected in corresponding operation.}
\end{figure}

\begin{figure}[!hbt]
    \centering
    \includegraphics[width=300pt]{figures/Cholec80_tau_0.1_layers_8_dist.pdf}
    \caption{Augmentation policy found by \myMethod{} with $N=8$ on the Cholec80 dataset. The operation number is indicated on y-axis, and augmentation choices on x-axis. The number on each cell indicates the probability of such augmentation being selected in corresponding operation.}
    \label{ldaug_policy_cholec80_8}
\end{figure}


\clearpage
\subsection{Additional Results}
\label{appendix_additional_results}

\begin{table}[!ht]
\centering
\caption{Extended table of Table \ref{main_results} with Argmax sampling. All results are based on using ResNet-50 as encoder and SimCLR as contrastive pretraining. Results of the linear probing are reported using mean Average Precision (mAP) (\%), and finetuning on downstream segmentation tasks is reported using mIoU (\%). The best results are in \textbf{boldface}. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{c|c|c|c|cc}
\midrule
\multirow{2}{*}{Pretraining Dataset} & \multirow{2}{*}{Augmentation} & \multirow{2}{*}{Sampling} & \begin{tabular}[c]{@{}c@{}}Linear Prob\\ (Classification)\end{tabular} & \multicolumn{2}{c}{\begin{tabular}[c]{@{}c@{}}Finetune\\ (Segmentation)\end{tabular}} \\ \cline{4-6} 
 & & & Cholec80 Tool & SVHM Seg & CholecSeg8K \\ \midrule
\multirow{8}{*}{SVHM} & SimCLR & N/A & 60.00 & 57.28 & 56.18 \\
 & Base & N/A & 59.01 & 57.15 & \textbf{58.71} \\ \cline{2-6} 
 & Random & Argmax & Complete Collapse & N/A & N/A \\
 & Random & Categorical & Complete Collapse & N/A & N/A \\
 & SelfAugment & Argmax & Complete Collapse & N/A & N/A \\ 
 & SelfAugment & Categorical & Complete Collapse & N/A & N/A \\ 
 & \myMethod{}& Argmax & 60.13 & 57.64 & 56.19 \\
 & \myMethod{}& Categorical & \textbf{65.95} & \textbf{58.29} & 57.86 \\ \midrule
\multirow{8}{*}{Cholec80} & SimCLR & N/A & 67.59 & 58.29 & 56.02 \\
 & Base & N/A & 67.78 & 58.36 & 57.04 \\ \cline{2-6} 
 & Random & Argmax & Complete Collapse & N/A & N/A \\
 & Random & Categorical & Complete Collapse & N/A & N/A \\
 & SelfAugment & Argmax & 62.28 & 58.41 & 58.07 \\ 
 & SelfAugment & Categorical & 60.24 & 58.41 & 55.86 \\ 
 & \myMethod{}& Argmax & 72.02 & 58.86 & 55.09 \\
 & \myMethod{}& Categorical & \textbf{73.59} & \textbf{59.31} & \textbf{59.40} \\ \midrule
\end{tabular}
\end{adjustbox}
\label{main_results_extend}
\end{table}



\begin{figure}[!hbt]
    \centering
    \subfigure{
        \includegraphics[width=105pt]{figures/finetune_SVHM_over_N_SVHM.pdf}
    }
    \hspace{-0.2in}
    \subfigure{
        \includegraphics[width=105pt]{figures/finetune_Cholec8k_over_N_SVHM.pdf}
        \label{finetune_Cholec8k_over_N_SVHM}
    }
    \hspace{-0.2in}
    \subfigure{
        \includegraphics[width=105pt]{figures/finetune_SVHM_over_N_Cholec80.pdf}
    }
    \hspace{-0.2in}
    \subfigure{
        \includegraphics[width=105pt]{figures/finetune_Cholec8k_over_N_Cholec80.pdf}
    }
    \caption{
        (a-b) Pretraining on our private dataset. Each data point is an individual run of the experiment, from augmentation search to pretraining and evaluations. 
        (c-d) Pretraining on the public dataset Cholec80.
        In subfigures (a) and (c), results show finetuning on our private SVHM Seg dataset. In subfigures (b) and (d), results show finetuning on the public dataset CholecSeg8k. 
    }
    \label{fig2}
\end{figure}


\begin{table}[!ht]
\centering
\caption{Extended table of Table \ref{main_results} with MoCo contrastive pretraining. All results are based on using ResNet-50 as encoder. Results of the linear probing are reported using mean Average Precision (mAP) (\%), and finetuning on downstream segmentation tasks is reported using mIoU (\%). The best results are in \textbf{boldface}. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{@{}c|c|c|c|cc@{}}
\toprule
\multicolumn{1}{l|}{\multirow{2}{*}{Pretraining Dataset}} & \multicolumn{1}{l|}{\multirow{2}{*}{Loss Objective}} & \multicolumn{1}{l|}{\multirow{2}{*}{Augmentation}} & \multicolumn{1}{l|}{\begin{tabular}[c]{@{}l@{}}Linear Prob \\ (Classification)\end{tabular}} & \multicolumn{2}{c}{\begin{tabular}[c]{@{}c@{}}Finetune\\ (Segmentation)\end{tabular}} \\ \cmidrule(l){4-6} 
\multicolumn{1}{l|}{}                                     & \multicolumn{1}{l|}{}                                & \multicolumn{1}{l|}{}                              & \multicolumn{1}{l|}{Cholec80 Tool}                                                              & \multicolumn{1}{l}{SVHM Seg}              & \multicolumn{1}{l}{CholecSeg8K}             \\ \midrule
\multirow{4}{*}{SVHM}                                     & SimCLR                                               & SimCLR                                             & 60.00                                                                                           & 57.28                                     & 56.18                                       \\
                                                          & SimCLR                                               & DDA                                                & \textbf{65.95}                                                                                  & \textbf{58.29}                            & \textbf{57.86}                              \\
                                                          & MoCo                                                 & MoCo                                               & 53.94                                                                                           & 57.03                                     & 58.60                                       \\
                                                          & MoCo                                                 & DDA                                                & \textbf{57.35}                                                                                  & \textbf{57.81}                            & \textbf{58.91}                              \\ \midrule
\multirow{4}{*}{Cholec80}                                 & SimCLR                                               & SimCLR                                             & 67.59                                                                                           & 58.29                                     & 56.02                                       \\
                                                          & SimCLR                                               & DDA                                                & \textbf{73.59}                                                                                  & \textbf{59.31}                            & \textbf{59.40}                              \\
                                                          & MoCo                                                 & MoCo                                               &  61.75                                                                                               & 57.89                                          &   55.89                                          \\
                                                          & MoCo                                                 & DDA                                                &    \textbf{61.83}                                                                                             &  \textbf{58.29}                                         &  \textbf{57.20}                                           \\ \bottomrule
\end{tabular}

\end{adjustbox}
\label{tab:MOCO}
\end{table}


In Table \ref{main_results_extend}, we present extended results for using the Argmax sampling for the final policy. It can be observed that sampling from categorical distributions results in better performance. As a result, we use sampling from categorical distributions as default for our \myMethod{}. Note that the SimCLR augmentation policy also uses sampling from categorical distributions (see Table \ref{simclr_aug_policy}).  


In Figure \ref{fig2}, we plot additional results for finetuning on CholecSeg8k and our Private Seg dataset. All details are the same as in Figure \ref{fig1}. It can be observed that \myMethod{} either outperforms or is on par with the SimCLR policy. We believe that the result in Figure \ref{finetune_Cholec8k_over_N_SVHM}, which slightly underperforms the Base policy, is due to the difference in data distribution between our SVHM dataset and Cholec80. It is worth noting that slightly underperforming in a few finetuning tasks is common in SSL evaluations, and that the main evaluation metric is linear evaluation \cite{bardes2022vicreg,huang2023ldreg}. The goal of SSL is to learn general representations; on average, \myMethod{} demonstrates solid improvement over existing methods. 

\subsection{Ablation Study on the Basic Augmentation for the Initial Encoder}

For the initial encoder, we used image cropping due to its importance and effectiveness in contrastive learning. This has been studied in existing works on both natural images \cite{chen2020simple} and X-ray images \cite{van2023exploring}. Other augmentations are also plausible. We performed an experiment with rotation as the initial augmentation choice. The experiment is conducted with the public dataset Cholec80, and the downstream evaluations are the same as the main paper. Results are in Table \ref{tab:ablation_intial}. 
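It can be observed that using rotation also outperforms the baseline methods on linear probing and on CholecSeg8K finetuning, although it is slightly below them on SVHM Seg (58.02 vs.\ 58.29 and 58.36). However, image cropping as the base for DDA is indeed more effective than rotation.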


% Please add the following required packages to your document preamble:
% \usepackage{multirow}
\begin{table}[!hbt]
\centering
\caption{All results are based on using ResNet-50 as encoder and SimCLR as contrastive pretraining. Results of the linear probing are reported using mean Average Precision (mAP) (\%), and finetuning on downstream segmentation tasks is reported using mIoU (\%). The best results are in \textbf{boldface}. }
\begin{adjustbox}{width=0.95\linewidth}
\begin{tabular}{c|c|c|cc}
\toprule
\multirow{2}{*}{Initial Augmentation} & \multirow{2}{*}{Augmentation} & Linear Prob (Classification) & \multicolumn{2}{c}{Finetune} \\ \cline{3-5} 
 &  & Cholec80 Tool & SVHM Seg & CholecSeg8K \\ \midrule
N/A & SimCLR & 67.59 & 58.29 & 56.02 \\
N/A & Base & 67.78 & 58.36 & 57.04 \\ \midrule
Image Cropping & DDA & \textbf{73.59} & \textbf{59.31} & \textbf{59.40} \\
Rotation & DDA & 72.38 & 58.02 & 59.26 \\ \bottomrule
\end{tabular}
\end{adjustbox}
\label{tab:ablation_intial}
\end{table}


\subsection{Application on Other Datasets}

Although this paper mainly focuses on laparoscopic images, DDA can also be applied to other domains. In this subsection, we perform experiments with an X-ray image dataset, CheXpert \cite{irvin2019chexpert}, and a natural image dataset, CIFAR10 \cite{krizhevsky2009learning}. 
All experimental settings are the same as in our main paper. For CheXpert, we removed the grayscale-conversion operation from the search space since X-ray images are already grayscale. Results are reported as the macro area under the ROC curve (AUROC) with linear probing.
For CIFAR10, the search space is the same as in our main paper, and results are reported as classification accuracy. Results are in Table \ref{tab:other_datasets}.

\begin{table}[!hbt]
\centering
\caption{For CIFAR10, the results are reported with classification accuracy. For CheXpert, the results are reported as macro AUROC. }
\begin{adjustbox}{width=0.4\linewidth}
\begin{tabular}{c|c|c}
\toprule
Dataset & Augmentation & Linear Prob \\ \midrule
\multirow{2}{*}{CheXpert} & SimCLR & 72.4 \\
 & DDA & 71.5 \\ \hline
\multirow{2}{*}{CIFAR10} & SimCLR & 92.2 \\
 & DDA & 91.6 \\ \bottomrule
\end{tabular}
\end{adjustbox}
\label{tab:other_datasets}
\end{table}

It can be observed that DDA performs similarly to SimCLR on X-ray images. This does not indicate that DDA is ineffective; rather, the SimCLR policy is already performing in the optimal range. This dataset is also used by \citet{van2023exploring}, who conducted a grid search over different combinations of augmentation operations to find the best augmentation policy on X-ray images. They reported that the macro AUROC with the optimal augmentation operations on this dataset is in the range of 68.8 to 73.6. This indicates that the SimCLR augmentation policy behaves differently on X-ray images compared to laparoscopic images. This result also indicates that DDA can find suitable augmentations for X-ray images. In the SimCLR paper \cite{chen2020simple}, the authors also conducted a grid search on CIFAR10 to find the optimal policy. It can be observed that DDA can also find suitable augmentations for natural images. 


In existing works \cite{chen2020simple,van2023exploring} that conduct a grid search, one needs to perform the pretraining for every possible combination of augmentations. In contrast, DDA needs the pretraining only once, regardless of the number of combinations. Although the DDA result might not be the best one, it is very close to the best-performing one. The efficiency of DDA makes it very suitable for other medical images, which often have different characteristics. 





\section{Visualisations of the Augmented Images}
\label{visualisation_of_augmented_images}

In this section, we show the augmented images. For a batch of randomly selected images with no augmentation applied in Figure \ref{visual_original_images}, Figure \ref{visual_ldaug_images} shows the augmented images using our \myMethod, Figure \ref{visual_simclr_images} shows the augmented images using SimCLR policy, Figure \ref{visual_selfaug_images} shows the augmented images using SelfAugment augmentation, and Figure \ref{visual_random_images} shows the augmented images using random augmentation. 
To demonstrate the overlapping view we discussed in Section \ref{analysis_found_policy}, for every image shown in Figures \ref{visual_ldaug_images} and \ref{visual_simclr_images}, consider how easy it is to find another image that is visually similar to it. Since the contents of the laparoscopic cholecystectomy (LC) dataset are predominantly red in colour, a green gallbladder, as in the first row, second and third columns from the top left corner of Figure \ref{visual_simclr_images}, is very unlikely to match other images. As a result, augmentation operations like \textit{Hue} that change colour profiles are unsuitable for LC. 
Augmentation policies found by \myMethod{} (summarized in Section \ref{appendix_augmentation_result}) are inherently more suitable for the dataset they are searched on. 

\begin{figure}[!hbt]
    \centering
    \includegraphics[width=320pt]{figures/simclr_original_images.pdf}
    \caption{Visualisations of the images in our private dataset with no augmentation applied. }
    \label{visual_original_images}
\end{figure}

\begin{figure}[!hbt]
    \centering
    \includegraphics[width=320pt]{figures/SVHM_tau_0.1_layers_5_aug_images.pdf}
    \caption{Visualisations of the images in our private dataset with the augmentation policy found by \myMethod{} ($N=5$). }
    \label{visual_ldaug_images}
\end{figure}


\begin{figure}[!hbt]
    \centering
    \includegraphics[width=320pt]{figures/simclr_aug_images.pdf}
    \caption{Visualisations of the images in our private dataset with the SimCLR augmentation policy. }
    \label{visual_simclr_images}
\end{figure}

\begin{figure}[!hbt]
    \centering
    \includegraphics[width=320pt]{figures/SVHM_tau_0.1_layers_5_selfaug_images.pdf}
    \caption{Visualisations of the images in our private dataset with the SelfAugment augmentation applied. }
    \label{visual_selfaug_images}
\end{figure}

\begin{figure}[!hbt]
    \centering
    \includegraphics[width=320pt]{figures/SVHM_tau_0.1_layers_5_random_aug_images.pdf}
    \caption{Visualisations of the images in our private dataset with random augmentation applied. }
    \label{visual_random_images}
\end{figure}

\end{document}
