\documentclass{midl} % Include author names
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{pythonhighlight}
\usepackage{pifont}
\newcommand{\xmark}{\ding{55}}
\usepackage[subtle]{savetrees}
\newcommand{\keypoint}[1]{\noindent\textbf{#1}.\quad}
\usepackage{diagbox}
\usepackage{multirow}
\usepackage{listings}
\usepackage{adjustbox,lipsum}
\usepackage[utf8]{inputenc}
\usepackage{caption}
% \usepackage{graphicx}
% \usepackage{subcaption}

\usepackage{listings}

\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

\lstdefinestyle{mystyle}{
    backgroundcolor=\color{backcolour},   
    commentstyle=\color{codegreen},
    keywordstyle=\color{magenta},
    numberstyle=\tiny\color{codegray},
    stringstyle=\color{codepurple},
    basicstyle=\ttfamily\footnotesize,
    breakatwhitespace=false,         
    breaklines=true,                 
    captionpos=b,                    
    keepspaces=true,                 
    numbers=left,                    
    numbersep=5pt,                  
    showspaces=false,                
    showstringspaces=false,
    showtabs=false,                  
    tabsize=2
}

\lstset{style=mystyle}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{mwe} % to get dummy images
% \jmlrvolume{-- Under Review}
\jmlrvolume{}
\jmlryear{2024}
\jmlrworkshop{Full Paper -- MIDL 2024 submission}
% \editors{Under Review for MIDL 2024}
\editors{Accepted at MIDL 2024}

\title[PEFT in Medical Image Analysis]{Parameter-Efficient Fine-Tuning for Medical Image Analysis: The Missed Opportunity}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Raman Dutt} \Email{raman.dutt@ed.ac.uk}\\
\addr School of Informatics, The University of Edinburgh, UK \AND
% \addr $^{1}$ Address 1 \\
% \addr $^{2}$ Address 2 \AND
\Name{Linus Ericsson} \Email{linus.ericsson@ed.ac.uk}\\
\addr School of Informatics, The University of Edinburgh, UK \AND
\Name{Pedro Sanchez} \Email{pedro.sanchez@ed.ac.uk} \\
\addr School of Engineering, The University of Edinburgh, UK \AND
\Name{Sotirios A. Tsaftaris} \Email{s.tsaftaris@ed.ac.uk}\\
\addr School of Engineering, The University of Edinburgh, UK \AND
\Name{Timothy Hospedales} \Email{t.hospedales@ed.ac.uk}\\
\addr School of Informatics, The University of Edinburgh, UK
}

\begin{document}

\maketitle

% RE-STRUCTURED BY RAMAN
\begin{abstract}

    % SHORTER VERSION
    Foundation models have significantly advanced medical image analysis through the \emph{pre-train fine-tune} paradigm. Among various fine-tuning algorithms, Parameter-Efficient Fine-Tuning (PEFT) is increasingly utilized for knowledge transfer across diverse tasks, including vision-language and text-to-image generation. However, its application in medical image analysis is relatively unexplored due to the lack of a structured benchmark for evaluating PEFT methods. This study fills this gap by evaluating 17 distinct PEFT algorithms across convolutional and transformer-based networks on image classification and text-to-image generation tasks using six medical datasets of varying size, modality, and complexity. Through a battery of over 700 controlled experiments, our findings demonstrate PEFT's effectiveness, particularly in low-data regimes common in medical imaging, with performance gains of up to 22\% in discriminative and generative tasks. The resulting recommendations can assist the community in incorporating PEFT into their workflows and facilitate fair comparisons of future PEFT methods, ensuring alignment with advancements in other areas of machine learning and AI.
\end{abstract}

\begin{keywords}
Parameter-Efficient Fine-Tuning, Transfer Learning, Image Classification, Text-to-Image Generation
\end{keywords}

\section{Introduction}
\label{sec:intro}
Medical image analysis has benefited from the deep learning revolution, despite the data-hungry nature of recent foundation models \cite{dosovitskiy2021an}. The challenge of curating large training datasets in medical image analysis is exacerbated due to privacy restrictions, the long-tailed nature of medical conditions of interest, and high annotation cost \cite{willemink2020preparing}. However, the ability to transfer knowledge from one domain into another (\emph{transfer learning}) has been a key ingredient behind the development of some of the most performant models \cite{li2023llavamed, azizi2021bigMedicalSSRL, azizi2022selfSupMedical, huang2023selfMedical, dutt2022automatic, singh2020dmenet}. Under this paradigm, the pre-training is conducted on either out-of-domain non-medical images or unlabeled medical images followed by fine-tuning on in-domain medical images for the specific task. The emergence of `foundation models' \cite{bommasani2021foundation} has further widened the adoption of this approach.

Significant efforts have bolstered the progress in foundation models by scaling them to billions of parameters; hence, the remaining challenge lies in the fine-tuning process, which requires striking a delicate balance in adapting the pre-trained model to specialize it for a downstream medical task while avoiding overfitting. This balance has been explored through various fine-tuning algorithms, such as regularized fine-tuning \cite{xuhong2018explicit, gouk2020distance}. More recently, Parameter-Efficient Fine-Tuning (PEFT) has gained traction \cite{xie2023difffit, rebuffi2018efficient, hu2022lora, he2022towards}. The concept involves freezing the original backbone and fine-tuning either a (very small) existing subset or a small new set of parameters.
While the NLP and vision communities have greatly benefitted from structured benchmarks for evaluating PEFT algorithms, a similar direction is lacking in medical image analysis. Furthermore, their efficacy in this domain largely remains underexplored.



% REPHRASE OF THE PARA ABOVE
In this work, we present the first structured benchmark for evaluating state-of-the-art PEFT algorithms on diverse medical imaging datasets and tasks. Our evaluation compares 17 different techniques across six medical datasets encompassing both CNN and transformer architectures, discriminative diagnosis tasks, and a novel, first-of-a-kind demonstration of PEFT's effectiveness in a generative medical image synthesis task. We experiment with architectures that match the size of recent foundation models introduced for computer vision and medical image analysis \cite{kirillov2023segment, chambon2022roentgen}. Furthermore, we investigate aspects such as the trade-off between PEFT effectiveness and data volume for the task at hand. We establish the first comprehensive comparison benchmark for PEFT in medical vision and offer the community valuable insights into the currently best-suited PEFT methods for different types of tasks.

Our contributions can be summarised by the following questions and their answers:

\noindent\textbf{Q1:} \textit{How effective is PEFT for low data scenarios?} \textbf{A1:} Given a large pre-trained model, benefits from PEFT increase as data volume decreases and model size increases (Sec.~\ref{sec:parameter_efficiency}).

\noindent\textbf{Q2:} \textit{Can PEFT improve transfer to discriminative medical tasks?} \textbf{A2:} Yes, three methods achieve consistent gains compared to full fine-tuning, two of which also significantly reduce the computational cost of tuning (Sec.~\ref{sec:discriminative}).

\noindent\textbf{Q3:} \textit{Can PEFT improve costly text-to-image generation?} \textbf{A3:} Yes, PEFT can provide significant performance gains in image generation quality at a much lower computational cost (Sec.~\ref{sec:text_to_image}).

\begin{table*}[]
\resizebox{\textwidth}{!}{
\begin{tabular}{@{}clllll@{}}
\toprule
\multicolumn{1}{l}{\textbf{PEFT Method}}                               & \textbf{Paper} & \textbf{Summary}                                                                                                                                              & \textbf{CNNs}                                & \textbf{ViTs}                                & \textbf{PEFT Type} \\ \midrule
\begin{tabular}[c]{@{}c@{}}Task-Specific Adapters\\ (TSA)\end{tabular} &      Li \etal \cite{li2022cross}          & Cross-domain few-shot learning by inserting learnable modules.                                                                                                & \checkmark                                          & \xmark                                           & Additive           \\
BatchNorm Tuning                                                       &     Frankle \etal \cite{training_bn_only}           & \begin{tabular}[c]{@{}l@{}}Training only BatchNorm layers (even with random initialization) \\ leads to high performance in CNNs.\end{tabular}                & \checkmark                                          & \xmark                                           & Selective          \\
Bias Tuning                                                            &     Cai \etal \cite{tinyTL}           & \begin{tabular}[c]{@{}l@{}}Propose TinyTL framework that learns only bias modules\\  for parameter-efficient on-device learning.\end{tabular}                 & \checkmark                                          & \xmark                                           & Selective          \\
\begin{tabular}[c]{@{}c@{}}Scale-Shift Features\\ (SSF)\end{tabular}   &     Lian \etal \cite{lian2022scaling}           & \begin{tabular}[c]{@{}l@{}}Adapt a pre-trained model to downstream datasets by \\ introducing parameters that modulate the extracted features.\end{tabular}    & \checkmark                                          & \checkmark                                          & Additive           \\
Attention Tuning                                                       &   Touvron \etal \cite{touvron2022three}             & \begin{tabular}[c]{@{}l@{}}Fine-tuning attention layers is sufficient to adapt ViTs to \\ different classification tasks.\end{tabular}                         & \xmark                                           & \checkmark                                          & Selective          \\
LayerNorm Tuning                                                       &   Basu \etal \cite{basu2023strong}             & \begin{tabular}[c]{@{}l@{}}Fine-tuning LayerNorm parameters is a strong baseline for \\ few-shot adaptation.\end{tabular}                                      & \xmark                                           & \checkmark                                          & Selective          \\
BitFit                                                                 &     Zaken \etal \cite{ben-zaken-etal-2022-bitfit}           & \begin{tabular}[c]{@{}l@{}}Fine-tuning the bias terms in a transformer is competitive\\  or better than full-fine-tuning.\end{tabular}                          & \xmark                                           & \checkmark                                          & Selective          \\
LoRA                                                                   &   Hu \etal \cite {hu2022lora}             & \begin{tabular}[c]{@{}l@{}}Training injected rank decomposition matrices in transformers\\  is on-par or better than full-fine-tuning.\end{tabular}            & \xmark                                           & \checkmark                                          & Additive           \\
AdaptFormer                                                            &    Chen \etal \cite{chen2022adaptformer}            & \begin{tabular}[c]{@{}l@{}}Adding lightweight modules increases a ViT's transferability for\\  different image and video tasks.\end{tabular}                  & \xmark                                           & \checkmark                                          & Additive           \\
SV-Diff                                                                &   Han \etal \cite{han2023svdiff}             & \begin{tabular}[c]{@{}l@{}}Fine-tuning singular values of weight matrices is a parameter-efficient\\ adapter for text-to-image generation models.\end{tabular} & \multicolumn{2}{l}{\begin{tabular}[c]{@{}l@{}}U-Net and Text-\\ Encoder in SD\end{tabular}} & Additive    \\      DiffFit                                                                &   Xie \etal \cite{xie2023difffit}             & \begin{tabular}[c]{@{}l@{}}Fine-tune only the bias terms and newly-added scaling factors in specific layers.\end{tabular} & \multicolumn{2}{l}{\begin{tabular}[c]{@{}l@{}}U-Net and Text-\\ Encoder in SD\end{tabular}} & Additive   

\\ \bottomrule
\end{tabular}
}
\caption{Summary of the Parameter-Efficient Fine-Tuning (PEFT) methods included in this evaluation, highlighting the specific model type they are designed for and their respective categories.}
\label{tab:peftSummary}
\end{table*}


%-------------------------------------------------------------------------

\section{Related Work}
%We now briefly review PEFT methods first and then discuss how PEFT has been used in text-to-image generation and medical image analysis settings.

\keypoint{Finetuning for Medical Image Analysis}
Due to the limited availability of data in medical domains, a common paradigm is to start with a deep neural network pre-trained on large natural-image datasets and adapt its weights by fine-tuning \cite{tajbakhsh2016convolutional}, e.g.~via ensembling \cite{kumar2017ensemble}, active learning \cite{zhou2017fine_tuning} or with the aid of expert interactions \cite{wang2018interactive}. However, tuning recent large \emph{foundation} models on small datasets -- e.g.\ billions of parameters but only thousands of data points -- can cause stability issues and overfitting. Thus, focus has shifted towards what is known as \emph{parameter-efficient fine-tuning} (PEFT), i.e.\ updating only a small number of parameters while keeping the rest fixed.

\keypoint{PEFT for Medical Image Analysis}
PEFT techniques can be categorised into three families: additive methods \cite{hu2022lora,rebuffi2018efficient,li2022cross,lian2022scaling}, selective methods \cite{ben-zaken-etal-2022-bitfit,training_bn_only,touvron2022three}, and prompt tuning \cite{lester2021power,VPT_jia,li2021prefix}. A summary of different PEFT methods along with their categorization is given in Table~\ref{tab:peftSummary}. There has been limited adoption of PEFT techniques within medical image analysis.
%\citet{fischer2022prompt} modify a vanilla U-Net \cite{UNet} with class-dependent learnable prompt tokens for semantic segmentation. Their approach almost matches the performance of a fully finetuned model on two medical datasets. \citet{silva2023transductive} study adapters designed for dense prediction tasks to improve medical image segmentation.
In image segmentation, successes have come from learning prompt tokens in a U-Net \cite{UNet}, or adapters designed specifically for dense prediction tasks \cite{silva2023transductive}. On the recently proposed Segment Anything Model (SAM) \cite{kirillov2023segment} --- previously unsuccessful in the medical domain --- researchers have used PEFT to outperform state-of-the-art methods \cite{ma2024segment, zhang2023customized}. Finally, PEFT has also been shown to improve fairness in downstream medical tasks \cite{dutt2024fairtune}.
%would benefit more from. Their proposed method combines a novel spatial adapter with constrained transductive inference for few-shot medical image segmentation. 
%Experiments with the recently proposed Segment Anything Model (SAM) \cite{kirillov2023segment} have revealed its inefficacy in the medical image analysis domain \cite{deng2023segment}. To overcome this, \citet{wu2023medical} combine the SAM model with adapters \cite{houlsby2019parameter} and outperform existing state-of-the-art methods on several downstream medical segmentation tasks. Similarly, \citet{zhang2023customized} combined SAM with LoRA.

\keypoint{PEFT for Text-to-Image Generation}
Diffusion models \cite{ho2020denoising} have led to state-of-the-art results in a variety of tasks such as text-to-image generation \cite{rombach2022high, balaji2022ediffi, saharia2022photorealistic}, image synthesis \cite{dhariwal2021diffusion}, density estimation \cite{kingma2021variational} and many others.
%Recently, PEFT methods for text-to-image diffusion models have been proposed.
As in other areas, PEFT methods have been proposed to tune these large models.
%\citet{xie2023difffit} proposed \textit{DiffFit}, for diffusion models based on transformers \cite{Peebles2022DiT}. They found that fine-tuning the bias terms is a strong baseline for downstream fine-tuning and further introduced learnable scaling factors at specific positions of the model for efficient adaptation. %This approach also enabled adapting a low-resolution generative model to a high-resolution one. 
%\citet{moon2022fine} identified fine-tuning the attention modules as another effective strategy and introduced an adapter that improves image-generation quality. \citet{xiang2023closer} presented the procedure for parameter-efficient fine-tuning in diffusion models by studying the design space of adapters in terms of position and function form. \citet{han2023svdiff} propose a new method, termed ``\textit{SV-Diff}'', for efficient personalization of text-to-image diffusion models. They designed a compact parameter space by fine-tuning singular values of weight matrices. This method is more parameter-efficient than existing approaches such as LoRA \cite{hu2022lora}.
Key approaches include solely tuning bias terms and learnable scaling factors \cite{xie2023difffit}, attention modules \cite{moon2022fine}, adapters \cite{xiang2023closer, moon2022fine} or the singular values of weight matrices \cite{han2023svdiff}.

As of yet, these PEFT methods have not been systematically compared in a medical image analysis setting. We perform the first wide benchmarking study that applies PEFT techniques to diverse tasks in the medical image analysis domain, using state-of-the-art architectures.

%\keypoint{Additive Methods}
%Adapters are new modules added among the layers of a pre-trained network to reconfigure a given pre-trained model for a downstream task. The idea was originally introduced for learning representations that could aid in the analysis of multiple, diverse domains. \cite{rebuffi2017learning, rebuffi2018efficient}. \citet{li2022cross} introduced Task-Specific Adapters (\textit{TSA}) for learning multiple domains but with limited examples (cross-domain few-shot learning). \citet{houlsby2019parameter} extended \cite{rebuffi2017learning} for natural language processing by applying adapters to transformers. 
%\citet{hu2022lora} build upon \cite{aghajanyan2020intrinsic} and introduce low-rank matrices into the self-attention layer of transformer-based models. This approach, termed \textit{LoRA}, introduces no additional inference latency, as seen in \cite{houlsby2019parameter}, and has demonstrated superior performance over traditional model fine-tuning across diverse datasets and tasks. \citet{lian2022scaling} propose a different parameter-efficient fine-tuning approach through Scaling and Shifting the Features (\textit{SSF}) extracted by a pre-trained model. The difference in upstream and downstream data distribution poses a challenge to adapting pre-trained models to downstream datasets. SSF introduces parameters, after each network operation, that modulate (scale and shift) the extracted features of the downstream dataset such that they fall in a discriminative space. \citet{chen2022adaptformer} introduced a novel PEFT approach designed for vision transformers for image and video recognition (\textit{AdaptFormer}) and demonstrated that adding only 2\% extra parameters to a vision transformer can outperform fully-finetuned networks on multiple benchmarks. 

%\keypoint{Selective Methods}
%This line of approach does not insert any new modules but finetunes a subset of the original pre-trained model. BitFit \cite{ben-zaken-etal-2022-bitfit} proposes finetuning only the bias terms in a transformer-based model. In addition to being parameter-efficient, BitFit also alleviates catastrophic forgetting and matches full fine-tuning performance. \citet{tinyTL} have drawn a similar parallel by fine-tuning only the bias terms in CNNs for on-device learning. Similarly, \citet{training_bn_only} reveal the expressivity of parameters in the batch normalization layers, particularly, in shifting and rescaling random features. The SSF approach \cite{lian2022scaling} described earlier, was developed on this understanding. Similar properties have also been revealed in Vision transformers \cite{dosovitskiy2021an}. \citet{touvron2022three} show that fine-tuning only attention layers in vanilla ViTs is an effective strategy. \citet{basu2023strong} demonstrated that fine-tuning the parameters of normalization layers is very effective for few-shot classification. 

%\keypoint{Prompt Tuning}
%The concept of appending prompts to large language models has enabled powerful zero-shot and few-shot performance \cite{brown2020language, schick2020s}. Prompt tuning \cite{lester2021power,,} introduced the idea of appending the input in language models with a trainable tensor, termed \textit{soft prompt}. \citet{VPT_jia} extended this concept for vision transformers by proposing two schemes, \textit{VPT-Deep} and \textit{VPT-Shallow}. %In the former scheme, prompts are added prior to each layer within the network, whereas they are introduced solely before the initial layer in \textit{VPT-Shallow}. 
%Similar to \textit{VPT-Deep}, \citet{li2021prefix} advocated the idea of appending trainable parameters in all hidden layers.  \citet{Qin2021ExploringUI} proposed adopting an autoencoder for decompressing the soft prompts to improve the convergence in \cite{lester2021power}.

% -------------------------------------------------------------------------------------------------------

\section{Background}
\subsection{Problem Definition}
%The general problem definition for fine-tuning can be formalized as follows. 
Let $f$ be a pre-trained model parameterized by $\theta$, $\ell$ be a loss function we wish to minimize and $\mathcal{D} = \{(x_i, y_i)\}_{i=1}^N$ be the downstream dataset of interest, consisting of inputs $x_i$ and their targets $y_i$. Starting from the initialization $\theta = \theta_0$, where $\theta_0$ are the weights from pre-training, our objective is then to optimize by gradient descent the total loss
$
    L = \frac{1}{N} \sum_{i=1}^N \ell(f(x_i;\; \theta),\; y_i).
$ Due to resource constraints, such full fine-tuning is not always possible. It can also be suboptimal to tune the entirety of network weights, as many layers may have learned generally applicable features. Parameter-Efficient Fine-Tuning provides options in these cases, which fall into two broad families. \textbf{Selective} methods rely on optimising only a subset of model parameters, $\phi \subset \theta$. This could be a subset of the layers or a specific type of parameter like batch norm. \textbf{Additive} methods instead introduce new parameters such that the full set becomes $\theta' = \{\theta, \phi\}$ where $\phi$ can be as simple as a new classifier layer or carefully designed adapters. For both method families, the update rule is $\phi \leftarrow \phi - \eta \nabla_\phi L$,
where $\eta$ is the learning rate.

\subsection{PEFT Methods For Comparison} \label{sec:methods}
We now formally define the different fine-tuning protocols used in the analysis. We begin with a downstream dataset $\mathcal{D}$ and a feature extractor $f_{\theta}$ (a pre-trained CNN (ResNet50) or ViT (Base/Large/Huge)) expected to produce generalizable representations for diverse tasks. First, we freeze all the weights of this feature extractor and then make trainable either an existing subset of parameters or a newly added parameter set, according to the fine-tuning protocol.

In selective tuning methods, we permit specific parameters to be trainable based on the selected algorithm. For instance, for protocols like BatchNorm and Bias Tuning, the parameters of the `BatchNorm2d' layers or the `bias' terms are respectively made trainable. More details, including the pseudocode, are provided in the Appendix (Section~\ref{sec:training_protocols}).

% Selective methods such as BatchNorm and Bias Tuning protocols in CNNs enable training of only \textit{BatchNorm2d} layers or \textit{bias} terms respectively. Similarly, \textit{BitFit}, \textit{Attention Tuning}, and \textit{LayerNorm Tuning} can be implemented using algorithms \ref{alg:bitfit}, \ref{alg:attention}, and \ref{alg:layernorm} respectively.

In \textbf{TSA}, our objective is to learn task-specific weights $\phi$ to obtain the task-adapted classifier $f_{(\theta, \phi)}$. Next, we minimize the cross-entropy loss $L$ over the samples in the downstream dataset $\mathcal{D}$ w.r.t.\ the task-specific weights $\phi$. \citet{li2022cross} recommend the parallel adapter configuration. %The output of the \textit{l}-th layer of the feature extractor $f_{\theta}$ can be combined with the task-specific adapters $r_{\phi}$ for an input tensor $h \in \mathbb{R}^{W \times H \times C}$ in a parallel configuration using,
%
% \begin{equation} \label{eqn:tsa}
%     f_{(\theta_l, \phi)}(h) = r_{\phi}(h) + f_{\theta_l}(h).
% \end{equation}

In the \textbf{SSF} method, feature modulation is achieved by introducing scale ($\gamma$) and shift ($\beta$) parameters following each operation in the model. The previous operation's output is multiplied element-wise by the scale parameter and then added to the shift parameter. Therefore, for a given input $x$, the output $y$ is calculated using the formula $y = \gamma \odot x + \beta$, where $\odot$ denotes the element-wise product.

An \textbf{AdaptFormer} module (\textit{AdaptMLP}) consists of two branches wherein the first branch is identical to the MLP block of a vanilla transformer while the second branch consists of a down-projection ($W_{\mathrm{down}}$), a ReLU layer, an up-projection ($W_{\mathrm{up}}$), and a scaling factor ($s$). The adapted features are combined with the original features entering the \textit{AdaptMLP} block through a residual connection.

% The adapted features ($x_{adap}$), obtained using Eq. \ref{eqn:adapted_features}, are combined with the original features entering the \textit{AdaptMLP} block ($x_{orig}$) through a residual connection, as defined in Eq. \ref{eqn:combined}.
% %
% \begin{gather} 
%     x_{adap} = ReLU(LN(x_{orig}) \cdot W_{down}) \cdot W_{up} \label{eqn:adapted_features} \\ 
%     x_{final} = MLP(LN(x_{orig})) + s.x_{adap} + x_{orig} \label{eqn:combined}
% \end{gather}

\textbf{LoRA} is based on the concept that, during adaptation, weight updates exhibit low intrinsic rank. Consequently, when a pre-trained weight matrix $W_0$ is updated, the change ($\Delta W$) is characterized by a low-rank decomposition operation with rank $r$, as shown in Eq.~\ref{eqn:lora} where $B \in \mathbb{R}^{d \times r}$ and $A \in \mathbb{R}^{r \times k}$,
\begin{equation} \label{eqn:lora}
    W_0 + \Delta W = W_0 + BA.
\end{equation}

\textbf{SV-Diff} performs Singular Value Decomposition (SVD) of the weight matrices of a pre-trained diffusion model and optimizes the spectral shift ($\delta$), defined as the difference between the singular values of the updated and original weight matrices.

% \textbf{SV-Diff} performs Singular Value Decomposition (SVD) of the weight matrices of a pre-trained diffusion model (Eq. \ref{eqn:svd}) and optimizes the spectral shift ($\delta$), defined as the difference between singular values and of the updated and original weight matrix. 

% The update rule is defined in Eq.~\ref{eqn:svdiff_update},
% %\vspace{-0.9em}
% \begin{gather} 
%     W = U \Sigma V^\intercal \quad \text{with} \quad \Sigma = \text{diag}(\sigma) \label{eqn:svd}, \\
%     W_{\delta} = U \Sigma_{\delta} V^\intercal \quad \text{with} \quad \Sigma_{\delta} = \text{diag}(\text{ReLU}(\sigma + \delta)). \label{eqn:svdiff_update}
% \end{gather}

% -------------------------------------------------------- %

\begin{figure}[t]
  \centering
  \begin{minipage}[b]{0.45\textwidth}
    \includegraphics[width=\textwidth]{HAM10000_performance_plot_ViT_Base.png}
    %\caption{Caption for Figure 1}
    \label{fig:size_exp_vit_base}
  \end{minipage}
  \hspace{0.01\textwidth}
  %\hfill
  \begin{minipage}[b]{0.45\textwidth}
    \includegraphics[width=\textwidth]{HAM10000_performance_plot_VIT_Large.png}
    %\caption{Caption for Figure 2}
    \label{fig:size_exp_vit_large}
  \end{minipage}
    \vspace{-2.4em}
  \caption{Plots showing the performance comparison for Full Fine-tuning, BitFit and LoRA with varying downstream dataset size for ViT-Base and ViT-Large models.}
  \label{fig:Dataset_size_exp}
\end{figure}

\begin{figure*}
\centering
\includegraphics[width=1\textwidth]{SD_Figure.png}
\caption{Figure showing text-to-image generation examples with the ground truth in the ascending average rank order (best five) for two data regimes. The input prompt for the generated samples is: ``\textit{No acute cardiopulmonary process.}''}
\label{fig:sd_generation_examples}
\end{figure*}

\section{Experiments}

\subsection{How Effective is PEFT For Low Data Scenarios?} \label{sec:parameter_efficiency}
\keypoint{Setup} We utilized the HAM10000 dataset \cite{HAM10000} and employed three distinct fine-tuning methods, namely Full Fine-tuning, BitFit, and LoRA, in combination with two different encoders, ViT Base and ViT Large. F1-Score was measured at various dataset sizes, commencing with the entire sample size of 7,511 images (100\%) and progressively reducing it to a minimum of 75 images (1\%). To account for potential variability in the results, we report the average performance across three random seeds.

\keypoint{Results} The results are shown in Figure \ref{fig:Dataset_size_exp}. For ViT Base (left), we find that when using 100\% of available downstream data, full fine-tuning is optimal, closely followed by LoRA. As the availability decreases, however, the benefits from PEFT approaches increase. The crossover is at 50\%, when all approaches are approximately equal. For smaller data sizes, both PEFT approaches consistently outperform full FT, with LoRA providing gains of up to 6\% over the baseline. For ViT Large, the trend is similar, but the crossover now differs between the PEFT approaches. LoRA overtakes the baseline as early as 80\% while BitFit is only better at data volumes below 20\%. The take-home message here is that when data are scarce and the upstream model is large, it becomes especially important to consider parameter-efficient tuning.

\subsection{Can PEFT Improve Transfer to Discriminative Medical Tasks?} \label{sec:discriminative}
\keypoint{Setup} In our discriminative experiments, we use five diverse datasets widely recognized in the medical image analysis community for image classification tasks, BreastUS \cite{al2020dataset}, HAM10000 \cite{HAM10000}, Fitzpatrick17K \cite{groh2021evaluating, groh2022towards}, Standardized Multi-Channel Dataset for Glaucoma (SMDG) \cite{riley_kiefer_2023, kiefer2022survey}, and RSNA Pneumonia Detection Dataset \cite{rsna2018}. The experiments employ ResNet50 \cite{he2016deep} and ViT (Base/Large/Huge) \cite{dosovitskiy2021an} as encoders. All CNN experiments employed ResNet50 pre-trained on ImageNet \cite{imagenet} while all ViT variants were pre-trained on ImageNet-21k \cite{Imagenet21k}.
% Hyper-Parameter Optimization (HPO) was utilized to determine the optimal set of hyperparameters for each experiment. To ensure the timely completion of our search, the ASHA hyperparameter optimization algorithm \cite{li2018massively} was adopted. %ASHA leverages parallelism in training and early stopping to discard unpromising candidate combinations. The best set of parameters found was subsequently employed to train the models.
% For each dataset, various fine-tuning methods were applied using the hyperparameters obtained from the search, and performance was compared using a separate test set. During training, early stopping was implemented, halting the training process if the performance on the validation set did not improve for a predefined number of epochs.

\begin{table*}[t]
    \centering
    \resizebox{0.95\textwidth}{!}{
    \begin{tabular}{@{}lllllll@{}}
\toprule
\diagbox{\textbf{Dataset}}{\textbf{Method}}    & \textbf{Full FT} & \textbf{\begin{tabular}[c]{@{}l@{}}Linear\\ Probing\end{tabular}} & \textbf{TSA} & \textbf{\begin{tabular}[c]{@{}l@{}}BN\\ Tuning\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}Bias\\ Tuning\end{tabular}} & \textbf{SSF} \\
 & \textbf{(23.5M)}            & \textbf{(3.8-7.2K)}                                                              & \textbf{(10.6M)}        & \textbf{(59.1K)}                                                        & \textbf{(32.7K)}                                                          & \textbf{(60.6K)}        \\
 \midrule
%\multicolumn{7}{l}{}                                                                                                                                                                                                                                                     \\
BreastUS (584)          & $0.72 \pm 1.1$            & $0.61 \pm 1.3$                                                             & $0.90 \pm 0.8$        & $0.92 \pm 0.9$                                                       & $0.89 \pm 1.2$                                                         & $\textbf{0.94} \pm \textbf{0.7}$         \\

FitzPatrick (5809)        & $\textbf{0.71} \pm \textbf{0.4}$             & $0.66 \pm 0.8$                                                              & $0.69 \pm 1.4$         & $0.67 \pm 1.1$                                                         & $0.64 \pm 1.3$                                                           & $\textbf{0.71} \pm \textbf{0.7}$        \\

HAM10000 (7511)           & $0.87 \pm 1.2$            & $0.82 \pm 0.6$                                                             & $0.86 \pm 1.0$         & $0.84 \pm 0.6$                                                        & $0.70 \pm 1.0$                                                        & $\textbf{0.89} \pm \textbf{0.9}$        \\

SMDG (9852)               &     $0.75 \pm 0.9$             &         $0.69 \pm 1.0$                                                         & $\textbf{0.85} \pm \textbf{0.7}$         & $0.83 \pm 1.4$                                                       & $0.73 \pm 0.6$                                                          & $0.84  \pm 0.9$       \\ 
Pneumonia (20412)               &      $0.86\pm 1.4$            &                                            $0.80 \pm 0.4$                     &    $0.86 \pm 1.1$      &           $0.84 \pm 1.5$                                             &    $0.85 \pm 1.9$                                                   &     $\textbf{0.87} \pm \textbf{1.2}$   \\ 
\midrule
Average F1 Score        &       0.77           &         0.72                                                          &    0.83          &           0.82                                                   &    0.76                                                            &       \textbf{0.85}       \\
Average Rank        &       2.8           &         5.2                                                          &    2.2          &         3.2                                                     &     4.6                                                           &     \textbf{1.2}         \\ \bottomrule
\end{tabular}}
    \caption{Comparing different fine-tuning methods for ImageNet pre-trained ResNet50. Dataset size and parameter count are indicated in brackets. The best result for each dataset is highlighted, and the average rank for each fine-tuning method is shown at the bottom.}
    \label{tab:ResNet}
\end{table*}


%  \begin{figure*}
%     \begin{subfigure}{0.45\textwidth}    \includegraphics[width=0.5\textwidth]{perforamance_param_resnet.png}
%         \label{fig:sub1}
%     \end{subfigure}
%     \begin{subfigure}{0.45\linewidth}
% \includegraphics[width=0.5\textwidth]{MIDLLatexTemplate/perforamance_param_vit_edited1.png}
%         \label{fig:sub2}
%     \end{subfigure}
%     \vspace{-35pt}
%     \caption{Performance vs. Parameter Count for ResNet50 and ViT-Base Encoders. The marker size indicates the tunable parameter count for each method.}
%     \label{fig:main}
% \end{figure*}


\begin{figure}[t]
  \centering
  \begin{minipage}[b]{0.45\textwidth}
    \includegraphics[width=\textwidth]{perforamance_param_resnet.png}
    %\caption{Caption for Figure 1}
    \label{fig:size_exp_vit_base}
  \end{minipage}
  % \hspace{0.01\textwidth}
  %\hfill
  \begin{minipage}[b]{0.45\textwidth}
    \includegraphics[width=\textwidth]{MIDLLatexTemplate/perforamance_param_vit_edited2.png}
    %\caption{Caption for Figure 2}
    \label{fig:size_exp_vit_large}
  \end{minipage}
  \vspace{-2.4em}
  \caption{Performance vs.\ Parameter Count for ResNet50 and ViT-Base Encoders. The marker size indicates the tunable parameter count for each method.}\label{fig:ppc}\label{fig:performance_param_count}
\end{figure}

\keypoint{Results} We present the results for ResNet-50 in Table~\ref{tab:ResNet}. Given its convolutional architecture, ResNet-50 is compatible with certain PEFT methods but not others. Overall, full fine-tuning tends to outperform basic linear probing. Observations from the BreastUS and SMDG datasets indicate that most PEFT methods enhance performance beyond the full FT baseline. The SSF method, despite only tuning 60K parameters (0.25\%), improves performance by up to 22\%. While gains on HAM10000, FitzPatrick and Pneumonia are more modest, the previous section has discussed how these results could potentially vary with changes in data volume and model size. Overall, SSF emerges as the top-performing method based on average F1 score and ranking. Full fine-tuning and TSA present a close tie with the latter emerging on top. BatchNorm and bias tuning perform better than linear probing, which turns out to be the worst strategy. Overall, the greatest gains are observed on the smallest dataset (BreastUS); however, the performance gap between full fine-tuning and PEFT methods narrows as the dataset size increases.
For \textbf{Transformer} models in Tab.~\ref{tab:vit}, the situation is similar. The biggest gains over full FT are on BreastUS and SMDG, while linear probing underperforms here as well. The best PEFT method is \textbf{LoRA}, for both average F1 score and rank, across all five datasets. AdaptFormer does not perform well and even falls behind linear probing for ViT Large. This can be attributed to the fact that this method was mainly designed for video recognition tasks. We also see that the benefits of PEFT increase slightly as the model size increases, with a 4\% improvement for ViT Base going to 6\% for ViT Huge. This is an interesting finding, and agrees with Sec.~\ref{sec:parameter_efficiency}, as the proportion of parameters tuned actually decreases for the larger models.

Figure~\ref{fig:ppc} illustrates the trade-off between each method's performance and parameter count. This comparison is crucial as different applications may prioritize either superior performance or computational efficiency. For the results produced by the ResNet50 (shown on the left), each PEFT method lies on the Pareto frontier, indicating that a specific method could be selected based on the prioritization of either performance or cost.
Remarkably, the SSF method stands out by delivering high performance at a significantly reduced cost. In the case of the ViT-B model, LoRA emerges as the prominent choice, outpacing SSF while maintaining a similar computational expense.

To answer our question, \emph{Can PEFT Improve Transfer to Discriminative Medical Tasks?} Yes: \textbf{TSA, SSF and LoRA} provide consistent improvements over full fine-tuning while requiring as little as 0.25\% of parameters.

\begin{table*}[]
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{llllllllll}
\hline
\textbf{Encoder}                   & \diagbox{\textbf{Dataset}}{\textbf{Method}} & \textbf{Full FT} & \textbf{\begin{tabular}[c]{@{}l@{}}Linear \\ Probing\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}Attention\\ Tuning\end{tabular}} & \textbf{BitFit} & \textbf{LoRA} & \textbf{SSF} & \textbf{AdaptFormer} & \textbf{\begin{tabular}[c]{@{}l@{}}LayerNorm\\ Tuning\end{tabular}} \\ 
% & & \textbf{(87.2/ 303/ 630M)} & \textbf{(3.8-7.2K)} & \textbf{(28.5/100/210M)} & \textbf{(0.1/0.2/0.4M)} & \textbf{(0.6/1.5/2.6M)} & \textbf{(0.2/0.5/0.9M)} & \textbf{(0.1/0.3/0.5M)} & \textbf{(0.04/0.1/0.2M)} & \\
\hline
\multirow{6}{*}{\textbf{ViT Base}} & BreastUS (584)        & $0.82 \pm 1.2$             & $0.79 \pm 0.7$                                                               & $0.93 \pm 1.4$                                                                & $\textbf{0.97} \pm \textbf{1.3}$            & $0.94 \pm 0.6$          & $0.95 \pm 0.9$         & $0.95 \pm 0.7$                 & $0.88 \pm 1.1$                             \\
                                    & FitzPatrick (5,809)     & $0.80 \pm 1.3$             & $0.74 \pm 0.6$                                                               & $0.76 \pm 1.3$                                                                & $0.71 \pm 1.6$            & $\textbf{0.82} \pm \textbf{1.4}$          & $0.77 \pm 0.7$         & $0.72 \pm 1.1$                 & $0.73 \pm 1.2$                                                                \\
                                    
                                   & HAM10000  (7,511)       & $\textbf{0.91} \pm \textbf{1.4}$             & $0.72 \pm 0.5$                                                               & $0.86 \pm 1.2$                                                                & $0.87 \pm 1.8$            & $\textbf{0.91} \pm \textbf{1.3}$          & $0.88 \pm 0.8$         & $0.76 \pm 1.2$                 & $0.85 \pm 1.3$                                                                \\
                                  
                                   & SMDG (9,852)            & $0.80 \pm 1.6$             & $0.60 \pm 0.6$                                                               & $0.84 \pm 1.8$                                                                & $0.66 \pm 1.4$            & $\textbf{0.86} \pm \textbf{1.5}$          & $0.85 \pm 0.9$         & $0.60 \pm 1.3$                 & $0.80 \pm 1.4$                                                                \\
                                   & Pneumonia (20,412)       & $0.87 \pm 1.7$             & $0.86 \pm 0.4$                                                               & $0.85 \pm 1.1$                                                                & $0.87 \pm 1.2$            &    $0.86 \pm 0.8$           & $\textbf{0.88} \pm \textbf{1.0}$         & $0.83 \pm 0.9$                 & $0.87 \pm 1.7$ 
                                   \\
                                   \cline{2-10}
                                   & \textbf{Average F1 Score}       &   0.84            &      0.74                                      &      0.85                                                           &    0.82        &    \textbf{0.88}           &   0.87       &    0.77             &  0.83 \\
\midrule%\multicolumn{10}{l}{}                                                                                                                                                                                                                                                                                                                                             \\
\multirow{6}{*}{\textbf{ViT Large}}                 & BreastUS (584)        & $0.84 \pm 1.8$             & $0.73 \pm 0.7$                                                               & $0.86 \pm 1.3$                                                                & $\textbf{0.95} \pm \textbf{1.4}$            & $0.93 \pm 1.3$          & $0.92 \pm 1.8$         & $\textbf{0.95} \pm \textbf{1.1}$                 & $0.88 \pm 1.4$                                                                \\

                                    & FitzPatrick (5,809)     & $\textbf{0.82} \pm \textbf{1.4}$             & $0.74 \pm 0.5$                                                               & $0.77 \pm 1.2$                                                                & $0.74 \pm 1.5$            & $\textbf{0.82} \pm \textbf{1.9}$          & $0.80 \pm 1.3$         & $0.72 \pm 1.2$                 & $0.78 \pm 1.3$                                                         \\
                                    
                                   & HAM10000 (7,511)         & $\textbf{0.90} \pm \textbf{1.6}$             & $0.82 \pm 0.8$                                                               & $0.88 \pm 1.4$                                                                & $0.86 \pm 1.1$            & $0.89 \pm 1.5$          & $0.88 \pm 1.7$         & $0.74 \pm 1.0$                 & $0.87 \pm 1.7$                                                                \\
                                   
                                   & SMDG (9,852)            & $0.81 \pm 1.5$             & $0.77 \pm 0.6$                                                               & $0.84 \pm 1.5$                                                                & $0.83 \pm 1.9$            & $0.83 \pm 1.2$          &   $\textbf{0.87} \pm \textbf{1.2}$           & $0.63 \pm 1.3$                 & $0.85 \pm 1.5$                                                                \\
                                   & Pneumonia (20,412)       & $0.80 \pm 1.8$             & $0.78 \pm 0.9$                                                               & $0.81 \pm 1.5$                                                                & $0.80 \pm 1.4$            &     $\textbf{0.82} \pm \textbf{1.1}$          & $0.80 \pm 1.0$         & $0.78 \pm 1.4$                 & $0.80 \pm 1.6$                                                                \\
                                   \cline{2-10}
                                   & \textbf{Average F1 Score}       &     0.83          &      0.77                                      & 0.83                                                                &    0.84        &     \textbf{0.86}          &    0.85      &     0.76            &  0.84 \\
\midrule  %                                                                                                                                                                                                                                                                                                                                         \\
\multirow{6}{*}{\textbf{ViT Huge}}                  & BreastUS (584)        & $0.92 \pm 1.8$             & $0.67 \pm 0.9$                                                               & $0.89 \pm 1.5$                                                                & $\textbf{0.96} \pm \textbf{1.2}$            & $0.86 \pm 1.8$          & $\textbf{0.96} \pm \textbf{1.1}$         & $0.93 \pm 1.0$                 & $0.92 \pm 1.4$                                                                \\
                                    & FitzPatrick (5,809)      & $0.69 \pm 1.3$             & $0.72 \pm 0.6$                                                               & $0.70 \pm 1.3$                                                                & $0.72 \pm 1.2$            & $\textbf{0.78} \pm \textbf{1.5}$          & $0.73 \pm 1.1$         & $0.72 \pm 1.4$                 & $0.72 \pm 0.8$                                                                \\
                                   & HAM10000 (7,511)        & $0.74 \pm 1.7$             & $0.74 \pm 0.7$                                                               & $0.77 \pm 1.5$                                                                & $0.71 \pm 1.4$            & $\textbf{0.87} \pm \textbf{1.1}$          &    $0.70 \pm 0.7$          & $0.73 \pm 1.0$                 & $0.72 \pm 1.7$                                                                \\
                                   
                                   & SMDG (9,852)            & $0.73 \pm 1.5$             & $0.64 \pm 1.1$                                                               & $0.72 \pm 1.4$                                                                & $0.64 \pm 0.9$            & $\textbf{0.83} \pm \textbf{1.7}$          & $0.67 \pm 1.1$         & $0.64 \pm 1.2$                 & $0.67 \pm 1.3$                                                                \\
                                   & Pneumonia (20,412)       & $0.78 \pm 1.6$             & $0.76 \pm 1.3$                                                               &     $0.78 \pm 0.9$                                                                & $0.79 \pm 1.5$            &    $\textbf{0.81} \pm \textbf{1.7}$           &       $0.79 \pm 1.1$       &      $0.78 \pm 1.1$                &   $0.78 \pm 1.2$                                                                  \\
                                                                      \cline{2-10}
                                   & \textbf{Average F1 Score}       &    0.77           &     0.71                                       & 0.77                                                                &     0.76       &     \textbf{0.83}         &    0.77      &      0.76           &  0.76 \\
                                   
                                   \hline
\textbf{Combined Average Rank}      &        &        4.1          &   6.7               &     4.5                                                               &  4.5                                                                   &    \textbf{2.4}             &     3.1          &     6.0         &      4.7                                                                                  \\ \hline
\end{tabular}
    }
\caption{Results with different ViT encoders (base/ large/ huge). Dataset size is indicated in brackets. The best result for each dataset is highlighted, and the average rank for each fine-tuning method is shown at the end. The parameter count for each PEFT method and encoder is presented in Appendix Sec.~\ref{sec:param_count}.}
\label{tab:vit}
\end{table*}


\subsection{Can PEFT Improve Costly Text-to-Image Generation?} \label{sec:text_to_image}

\keypoint{Setup} We use the MIMIC-CXR dataset (v.\ 2.0.0) \cite{johnson2019mimic}. Following the recommendations of \citet{chambon2022adapting}, we fine-tune only the U-Net component (keeping the text encoder and VAE frozen) of the stable diffusion pipeline for different sizes of the downstream dataset (110K, 55K, and 11K, representing 100\%, 50\% and 10\% of the entire dataset). For analysis, we compare full fine-tuning of the U-Net with 7 different PEFT methods and report the FID score over 1000 test images averaged across four random seeds. Stable Diffusion pipelines and PEFT methods were implemented using the \textit{diffusers} \cite{diffusers} and \textit{peft} \cite{peft} packages.

\keypoint{Results} Refer to Table~\ref{tab:sd_results} for quantitative results and Figure~\ref{fig:sd_generation_examples} for example images generated using different fine-tuning methods for two scenarios (110K and 11K samples). 
\textbf{Note} that certain PEFT strategies (bias tuning, norm tuning, etc.) have not been published in the literature in the context of text-to-image generation but are included here in experiments.

For all data volumes, several PEFT methods outperformed full fine-tuning with significant differences in FID scores. A particularly interesting observation is that simple strategies such as fine-tuning just the bias or normalization layers are amongst the best performers, assuming first and third ranks respectively. Other PEFT methods designed exclusively for text-to-image generation tasks (SV-Diff and DiffFit) follow closely and also outperform full fine-tuning. Interestingly, LoRA, the best-performing method for classification tasks, fails to provide any benefits in image generation.
Overall, PEFT shows strong promise in improving the medical image generation quality across different data volumes.



% \begin{table*}[t]
% \resizebox{\textwidth}{!}{
% \begin{tabular}{@{}ccccccccc@{}}
% \toprule
% \diagbox{\textbf{\textbf{FID}}}{\textbf{PEFT}} & \textbf{\begin{tabular}[c]{@{}l@{}}Full FT\\ (85.9M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}Attention\\ (26.7M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}Bias\\ (343K)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}Norm\\ (200K)\end{tabular}} & \multicolumn{1}{c}{\textbf{\begin{tabular}[c]{@{}c@{}}Bias+Norm+Attention\\ (26.7M)\end{tabular}}} & \textbf{\begin{tabular}[c]{@{}l@{}}LoRA\\ (797K)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}SV-Diff\\ (222K)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}DiffFit\\ (581K)\end{tabular}} \\ \midrule
% FID @ 110K    & 59.77                                                              & 47.26                                                                & 23.18                                                          & 30.01                                                          & 34.88                                                                                              &                                                                & 28.60                                                              & 18.99                                                              \\
% FID @ 55K     & 90.49                                                              & 40.77                                                                & 33.89                                                          & 29.12                                                          & 63.66 (TBD)                                                                                        &                                                                & 27.62                                                              & 47.80                                                              \\
% FID @ 11K     & 49.34                                                              & 29.59                                                                  & 32.12                                                          & 24.52                                                          & TBD                                                                                              &                                                                & 26.17                                                              & 31.90                                                              \\ \midrule
% Average FID   &                                                                    &                                                                      &                                                                &                                                                &                                                                                                    &                                                                &                                                                    &                                                                    \\
% Average Rank  &                                                                    &                                                                      &                                                                &                                                                &                                                                                                    &                                                                &                                                                    &                                                                    \\ \bottomrule
% \end{tabular}
% }
% \caption{Comparing different strategies for fine-tuning the U-Net subcomponent of the stable diffusion pipeline text-to-image generation of chest x-rays. }
% \label{tab:sd_results}
% \end{table*}

% Please add the following required packages to your document preamble:
% \usepackage{booktabs}
\begin{table}[]
\resizebox{\textwidth}{!}{
\begin{tabular}{@{}ccccccccc@{}}
\toprule
\diagbox{\textbf{FID} ($\downarrow$)}{\textbf{PEFT}} & \textbf{\begin{tabular}[c]{@{}c@{}}Full FT\\ (85.9M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}Attention\\ (26.7M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}Bias\\ (0.34M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}Norm\\ (0.2M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}Bias+Norm+Attention\\ (26.7M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}LoRA\\ (0.8M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}SV-Diff\\ (0.22M)\end{tabular}} & \textbf{\begin{tabular}[c]{@{}c@{}}DiffFit\\ (0.58M)\end{tabular}} \\ 
\midrule
FID @ 110K    & 58.74                                                              & 52.41                                                                & \textbf{20.81}                                                          & 29.84                                                          & 35.93                                                                          & 439.65                                                         & 23.59                                                              & 42.50                                                             \\
FID @ \phantom{0}55K     & 98.48                                                              & 39.76                                                                & 28.67                                                          & 29.24                                                          & 62.34                                                                          & 392.45                                                         & \textbf{22.06}                                                              & 51.24                                                             \\
FID @ \phantom{0}11K     & 74.70                                                              & 61.01                                                                & 17.87                                                          & 37.30                                                          & 43.46                                                                          & 399.28                                                         & 27.02                                                              & \textbf{17.49}                                                             \\ \midrule
Average FID ($\downarrow$)   & 77.30                                                              & 51.06                                                                & \textbf{22.45}                                                          & 32.12                                                          & 47.24                                                                          & 410.46                                                         & 24.22                                                              & 37.07                                                             \\
Average Rank  & 7                                                                  & 5.33                                                                    & \textbf{1.67}                                                              & 3.33                                                              & 5                                                                              & 8                                                              & 2                                                                  & 3.67                                                                 \\ \bottomrule

\end{tabular}
}
\caption{Table presenting the FID scores for different strategies of fine-tuning the U-Net sub-component on different ratios of the MIMIC dataset. Full Fine-tuning is outperformed by almost every other method by a significant margin.}
\label{tab:sd_results}
\end{table}

% -----------------------------------------------------------------------------------------%

\section{Conclusion}
We performed the first, thorough evaluation of parameter-efficient fine-tuning for the medical image analysis domain covering a wide range of algorithms, architectures, datasets, and tasks.  
\textbf{For discriminative tasks,} the benefits of PEFT increase with decreasing data volume and increasing model size. Furthermore, the benefits of PEFT are especially prominent for low to medium-scale datasets, which are particularly common in the medical domain. SSF and LoRA emerged as the best-performing methods for CNNs and ViTs respectively in our analysis. \textbf{For generative tasks,} simple strategies such as Bias Tuning and tailored methods such as SV-Diff provide significant performance gains over conventional strategies. 
With rapid progress in studying efficient fine-tuning algorithms, this benchmark will allow easy integration and evaluation of new PEFT methods on diverse medical tasks in the future.


% The benefits of PEFT increase with decreasing data volume and increasing model size (ii) for discriminative tasks, the benefits from PEFT are especially prominent for low to medium-scale datasets, which are particularly common in the medical domain. Specific methods such as \textbf{SSF} and \textbf{LoRA} are particularly effective in this scenario. Moreover, PEFT greatly reduces the computational and memory requirements, thereby making the application and adoption of large models much more practical in clinical settings; (iii) Generative tasks can also greatly benefit from PEFT with simple strategies such as \textbf{Bias Tuning} and tailored methods such as \textbf{SV-Diff} providing significant performance gains. Overall, PEFT methods should be an integral part of a medical vision AI practitioner's toolbox. %However, generation quality can still be improved with a third of the parameters of full fine-tuning. As a recommendation, SSF and LoRA could be employed while adapting large convolution and transformer-based networks to medical tasks, respectively.

% \vspace{-10pt}
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{Raman Dutt is supported by the United Kingdom Research and Innovation (grant
EP/S02431X/1), UKRI Centre for Doctoral Training in Biomedical AI at the University of Edinburgh, School of Informatics.}


\bibliography{bibliography,extra_refs}

\clearpage

\appendix

\section{Results on Self-Supervised Encoders}

We extended our evaluation to include ViT encoders (ViT Base) pre-trained using different self-supervised objectives. More specifically, we adopted the highly effective Masked Autoencoder (MAE) \cite{he2022masked} and Contrastive Language-Image Pretraining (CLIP) \cite{radford2021learning} strategies. 

Our results align with our previous observations outlined in Section~\ref{sec:discriminative}. \textbf{LoRA} continues to outperform other PEFT methods across both pre-training objectives. In the case of MAE ViT, Attention Tuning performs slightly better than SSF. Overall, the average ranks are very similar to the ones originally reported in the paper.

\begin{table}[h!]
\centering
\begin{adjustbox}{max width=\columnwidth}
\resizebox{\textwidth}{!}{
\begin{tabular}{@{}llrrrrrrrr@{}}
\toprule
\textbf{Encoder}                        & \textbf{Dataset}      & \multicolumn{1}{l}{\textbf{Full FT}} & \multicolumn{1}{l}{\textbf{\begin{tabular}[c]{@{}l@{}}Linear \\ Readout\end{tabular}}} & \multicolumn{1}{l}{\textbf{\begin{tabular}[c]{@{}l@{}}Attention\\ Tuning\end{tabular}}} & \multicolumn{1}{l}{\textbf{BitFiT}} & \multicolumn{1}{l}{\textbf{LoRA}} & \multicolumn{1}{l}{\textbf{SSF}} & \multicolumn{1}{l}{\textbf{Adaptformer}} & \multicolumn{1}{l}{\textbf{\begin{tabular}[c]{@{}l@{}}LayerNorm\\ Tuning\end{tabular}}} \\ \midrule
\multirow{5}{*}{\textbf{ViT Base MAE}}  & BreastUS              & 0.89                                 & 0.80                                                                                   & 0.94                                                                                    & 0.84                                & \textbf{0.97}                     & 0.95                             & 0.84                                     & 0.92                                                                                    \\
                                        & FitzPatrick           & 0.77                                 & 0.68                                                                                   & \textbf{0.80}                                                                           & 0.72                                & 0.78                              & 0.79                             & 0.72                                     & 0.73                                                                                    \\
                                        & HAM10000              & 0.83                                 & 0.68                                                                                   & \textbf{0.90}                                                                           & 0.80                                & 0.87                              & 0.81                             & 0.70                                     & 0.85                                                                                    \\
                                        & SMDG                  & \textbf{0.87}                        & 0.72                                                                                   & 0.86                                                                                    & 0.78                                & \textbf{0.87}                     & 0.85                             & 0.75                                     & 0.82                                                                                    \\
                                        & Pneumonia             & \textbf{0.87}                        & 0.83                                                                                   & 0.86                                                                                    & \textbf{0.87}                       & \textbf{0.87}                     & 0.83                             & 0.86                                     & 0.85                                                                                    \\ \midrule
                                        & \textbf{Average Rank} & 3.0                                  & 7.8                                                                                    & 2.4                                                                                     & 5.2                                 & \textbf{2.2}                               & 4.2                              & 6.6                                      & 4.6                                                                                     \\
                                        &                       & \multicolumn{1}{l}{}                 & \multicolumn{1}{l}{}                                                                   & \multicolumn{1}{l}{}                                                                    & \multicolumn{1}{l}{}                & \multicolumn{1}{l}{}              & \multicolumn{1}{l}{}             & \multicolumn{1}{l}{}                     & \multicolumn{1}{l}{}                                                                    \\
\multirow{5}{*}{\textbf{ViT Base CLIP}} & BreastUS              & 0.91                                 & 0.83                                                                                   & 0.94                                                                                    & 0.91                                & 0.94                              & \textbf{0.97}                    & 0.91                                     & 0.95                                                                                    \\
                                        & FitzPatrick           & \textbf{0.82}                        & 0.69                                                                                   & 0.80                                                                                    & 0.72                                & 0.81                              & 0.78                             & 0.72                                     & 0.78                                                                                    \\
                                        & HAM10000              & 0.84                                 & 0.77                                                                                   & 0.85                                                                                    & 0.81                                & \textbf{0.89}                     & 0.83                             & 0.81                                     & 0.87                                                                                    \\
                                        & SMDG                  & 0.83                                 & 0.69                                                                                   & 0.84                                                                                    & 0.82                                & \textbf{0.88}                     & 0.87                             & 0.76                                     & 0.85                                                                                    \\
                                        & Pneumonia             & 0.86                                 & 0.80                                                                                   & 0.86                                                                                    & 0.85                                & \textbf{0.87}                     & \textbf{0.87}                    & 0.84                                     & 0.85                                                                                    \\ \midrule
\textbf{}                               & \textbf{Average Rank} & 3.6                                  & 8.0                                                                                    & 3.4                                                                                     & 5.8                                 & \textbf{1.8}                               & 2.8                              & 7.0                                      & 3.6                                                                                     \\ \bottomrule
\end{tabular}
}
\end{adjustbox}
\vspace{-0.3cm}
\caption{Results for the ViT Base model pre-trained using different self-supervised objectives (MAE and CLIP).}
\end{table}


\section{Training Details}
\textbf{Details on Batch Size and Optimizer: }For each experiment, we used a batch size of \textbf{512} and the \textbf{AdamW} optimizer~\cite{loshchilov2017decoupled}. Our initial experiments concluded that the choice of optimizer does not have any major impact on the downstream performance and hence, we proceeded with AdamW as it is one of the most commonly adopted optimizers for both discriminative and generative tasks.

\textbf{Details on Learning Rate Selection: }We observed that PEFT methods show a preference for larger learning rates (about an order of magnitude higher than for full fine-tuning). However, since each fine-tuning strategy, model architecture, and dataset might benefit from a different learning rate, we relied on a common HPO procedure, implemented using the \textit{Optuna} package~\cite{akiba2019optuna}, to obtain the optimal learning rate for each competitor, in order to perform a fair comparison.
The goal of the HPO was to find the best learning rate by maximizing the performance on the validation set. We ran the HPO procedure to find the optimal learning rate for each fine-tuning strategy, model architecture and dataset. Finally, we used the HPO-recommended learning rates and reported the performance on the test set.


\clearpage

\section{Update Rules for PEFT Algorithms}

\subsection{Task-Specific Adapters (TSA)}

In \textbf{TSA}, our objective is to learn task-specific weights $\phi$ to obtain the task-adapted classifier $f_{(\theta, \phi)}$. Next, we minimize the cross-entropy loss $L$ over the samples in the downstream dataset $D$ w.r.t.\ the task-specific weights $\phi$. \citet{li2022cross} recommend the parallel adapter configuration. The output of the \textit{l}-th layer of the feature extractor $f_{\theta}$ can be combined with the task-specific adapters $r_{\phi}$ for an input tensor $h \in \mathbb{R}^{W \times H \times C}$ in a parallel configuration using,

\begin{equation} \label{eqn:tsa}
    f_{(\theta_l, \phi)}(h) = r_{\phi}(h) + f_{\theta_l}(h).
\end{equation}

\subsection{Adaptformer}

In \textit{Adaptformer} (Section~\ref{sec:methods}), the adapted features are obtained using Eq.~\ref{eqn:adapted_features}. These features are then combined with the original features entering the \textit{AdaptMLP} block through a residual connection, as described in Eq.~\ref{eqn:combined}. Here, \textit{ReLU} and \textit{LN} denote the Rectified Linear Unit and Layer Normalization, respectively, and $s$ is a scaling factor applied to the adapted features.


\begin{gather}
    x_{adap} = \text{ReLU}(\text{LN}(x_{orig}) \cdot W_{down}) \cdot W_{up}, \label{eqn:adapted_features} \\
    x_{final} = \text{MLP}(\text{LN}(x_{orig})) + s \cdot x_{adap} + x_{orig}. \label{eqn:combined}
\end{gather}

\subsection{SV-Diff}

\textbf{SV-Diff} performs Singular Value Decomposition (SVD) of the weight matrices of a pre-trained diffusion model (Eq.~\ref{eqn:svd}) and optimizes the spectral shift ($\delta$), defined as the difference between the singular values of the updated and original weight matrices.

The update rule is defined in Eq.~\ref{eqn:svdiff_update},
%\vspace{-0.9em}
\begin{gather} 
    W = U \Sigma V^\intercal \quad \text{with} \quad \Sigma = \text{diag}(\sigma) \label{eqn:svd}, \\
    W_{\delta} = U \Sigma_{\delta} V^\intercal \quad \text{with} \quad \Sigma_{\delta} = \text{diag}(\text{ReLU}(\sigma + \delta)). \label{eqn:svdiff_update}
\end{gather}

\subsection{DiffFit}

DiffFit builds on the BitFit approach~\cite{ben-zaken-etal-2022-bitfit} and fine-tunes only the bias, normalization terms and the class-condition module. Further, learnable scaling factors $\gamma$ are introduced. A minimal implementation protocol of DiffFit is provided in Section~\ref{sec:difffit_protocol}.

\clearpage

\section{Trainable Parameter Count for PEFT Methods} \label{sec:param_count}

The trainable parameter count for each PEFT method and ViT variant is presented in Table~\ref{tab:param_count}. For \textit{Linear probing}, the parameter count depends on the number of classes in the downstream dataset. Certain methods such as \textit{Attention Tuning}, despite falling under the PEFT category, show a high parameter count. For other PEFT methods, the number of trainable parameters does not grow as rapidly as the total parameter count of the respective ViT variant.

% Please add the following required packages to your document preamble:
% \usepackage{booktabs}
\begin{table}[h]
\resizebox{\textwidth}{!}{
\begin{tabular}{@{}lllllllll@{}}
\toprule
\textbf{Encoder} & \textbf{Full FT} & \textbf{\begin{tabular}[c]{@{}l@{}}Linear\\ Probing\end{tabular}} & \textbf{\begin{tabular}[c]{@{}l@{}}Attention\\ Tuning\end{tabular}} & \textbf{BitFit} & \textbf{LoRA} & \textbf{SSF} & \textbf{Adaptformer} & \textbf{\begin{tabular}[c]{@{}l@{}}LayerNorm\\ Tuning\end{tabular}} \\ \midrule
ViT Base         & 87.2 M           & 3.8 - 7.2 K                                                        & 28.5 M                                                              & 0.1 M           & 0.6 M         & 0.2 M        & 0.1 M                & 0.04 M                                                              \\
ViT Large        & 303 M            & 3.8 - 7.2 K                                                        & 100 M                                                               & 0.2 M           & 1.5 M         & 0.5 M        & 0.3 M                & 0.1 M                                                               \\
ViT Huge         & 630 M            & 3.8 - 7.2 K                                                        & 210 M                                                               & 0.4 M           & 2.6 M         & 0.9 M        & 0.5 M                & 0.2 M                                                               \\ \bottomrule
\end{tabular}
}
\caption{Trainable parameter count for each PEFT method and ViT variant (Base/Large/Huge).}
\label{tab:param_count}
\end{table}

\clearpage

\section{Training Protocols of Selective PEFT Methods} \label{sec:training_protocols}

\subsection{Discriminative Tasks}

\subsubsection{Normalization Tuning (CNNs)}

\begin{lstlisting}[language=Python, caption={Fine-Tuning only the normalization parameters (BatchNorm) in CNNs}, label={lst:python}]
def set_module_grad_status(module, flag=False):
    """Set ``requires_grad`` to ``flag`` on every parameter of ``module``.

    ``module`` is either a single object exposing ``parameters()`` or a
    (possibly nested) ``list`` of such objects. Returns ``None``.
    """
    if not isinstance(module, list):
        # Leaf case: toggle every parameter of this one module.
        for weight in module.parameters():
            weight.requires_grad = flag
        return
    # List case: recurse so that nested lists are handled as well.
    for entry in module:
        set_module_grad_status(entry, flag)


# Function to enable batchnorm parameters
def enable_bn_update(model):
    """Re-enable gradients for the normalization layers of ``model``.

    Only modules whose exact type is ``nn.BatchNorm2d`` or ``nn.GroupNorm``
    and whose ``weight`` is not ``None`` are switched to trainable.
    """
    norm_types = (nn.BatchNorm2d, nn.GroupNorm)
    for layer in model.modules():
        if type(layer) not in norm_types:
            continue
        if layer.weight is None:
            # Skip layers that carry no weight to train.
            continue
        set_module_grad_status(layer, True)
\end{lstlisting}


\subsubsection{Bias Tuning (CNNs)}

\begin{lstlisting}[language=Python, caption={Fine-Tuning only the bias parameters in CNNs}, label={lst:bias_tuning_cnn}]
def enable_bias_update(model):
    """Mark every parameter registered as ``bias`` on a module as trainable.

    Only ``requires_grad`` of the bias parameters is changed; all other
    parameters keep their current state. Returns ``None``.
    """
    for m in model.modules():
        # recurse=False: a name equal to "bias" can only be a direct parameter
        # of ``m``, so descending into children again would only re-scan
        # parameters that are visited anyway when the child itself is reached.
        for name, param in m.named_parameters(recurse=False):
            if name == "bias":
                param.requires_grad = True

\end{lstlisting}


\subsubsection{Attention Tuning (ViTs)}

\begin{lstlisting}[language=Python, caption={Fine-Tuning only the attention parameters in ViTs}, label={lst:attention_tuning_vit}]
def tune_attention_layers(model, model_type):
    """Freeze ``model`` except attention blocks, head and position embedding.

    Args:
        model: Module exposing ``named_parameters()`` and a ``head`` layer
            with ``weight`` and ``bias``. ``pos_embed`` and ``patch_embed``
            are optional; a message is printed when one is missing.
        model_type: Unused; kept so that existing call sites keep working.
    """
    # Attention parameters become trainable, everything else starts frozen.
    for name_p, p in model.named_parameters():
        p.requires_grad = '.attn.' in name_p or 'attention' in name_p

    # The steps below run once, after the sweep, instead of once per parameter.
    model.head.weight.requires_grad = True
    model.head.bias.requires_grad = True

    # POSITION EMBEDDING
    try:
        model.pos_embed.requires_grad = True
    except (AttributeError, RuntimeError):
        # Missing/None attribute, or a tensor that cannot require gradients.
        print('no pos embedding')

    # PATCH EMBEDDING
    try:
        for patch_p in model.patch_embed.parameters():
            patch_p.requires_grad = False
    except AttributeError:
        print('no patch embed')
                
\end{lstlisting}

\subsubsection{Task-Specific Adapters (TSA)}

\begin{lstlisting}[language=Python, caption={Attaching TSA layers to a pre-trained ResNet}, label={lst:tsa_resnet}]
# orig_resnet = pretrained ResNet

# Replace every nn.Conv2d that is a direct child of a block with its
# conv_tsa wrapper, visiting layer1 through layer4 in order.
resnet_stages = (
    orig_resnet.layer1,
    orig_resnet.layer2,
    orig_resnet.layer3,
    orig_resnet.layer4,
)
for stage in resnet_stages:
    for block in stage:
        for child_name, child in block.named_children():
            if not isinstance(child, nn.Conv2d):
                continue
            setattr(block, child_name, conv_tsa(child, self.ad_type))

\end{lstlisting}


\subsection{Generative Tasks}

\subsubsection{Norm Tuning}

\begin{lstlisting}[language=Python, caption={Fine-Tuning only the normalization parameters in Stable Diffusion (U-Net)}, label={lst:norm_tuning_sd}]
def enable_norm_update(model):
    """Mark every parameter whose name contains ``"norm"`` as trainable.

    Names are taken from ``named_parameters()`` of each sub-module of
    ``model``. Other parameters are left untouched. Returns ``None``.
    """
    print("Enabling Normalization layers")
    for submodule in model.modules():
        norm_params = (
            tensor
            for param_name, tensor in submodule.named_parameters()
            if "norm" in param_name
        )
        for tensor in norm_params:
            tensor.requires_grad = True

\end{lstlisting}

\subsubsection{Bias Tuning}

\begin{lstlisting}[language=Python, caption={Fine-Tuning only the bias parameters in Stable Diffusion (U-Net)}, label={lst:bias_tuning_sd}]
def enable_bias_update(model):
    """Mark every parameter registered as ``bias`` on a module as trainable.

    Prints a status line first. Only ``requires_grad`` of the bias
    parameters is changed; other parameters keep their current state.
    Returns ``None``.
    """
    print("Enabling Bias layers")
    for m in model.modules():
        # recurse=False: a name equal to "bias" can only be a direct parameter
        # of ``m``, so descending into children again would only re-scan
        # parameters that are visited anyway when the child itself is reached.
        for name, param in m.named_parameters(recurse=False):
            if name == "bias":
                param.requires_grad = True

\end{lstlisting}

\subsubsection{Attention Tuning}

\begin{lstlisting}[language=Python, caption={Fine-Tuning only the attention parameters in Stable Diffusion (U-Net)}, label={lst:attention_tuning_sd}]
def enable_attention_update(model):
    """Mark every parameter whose name contains ``"attentions"`` as trainable.

    Names are taken from ``named_parameters()`` of each sub-module of
    ``model``. Other parameters are left untouched. Returns ``None``.
    """
    print("Enabling Attention layers")
    for submodule in model.modules():
        for param_name, tensor in submodule.named_parameters():
            if "attentions" not in param_name:
                continue
            tensor.requires_grad = True
                
\end{lstlisting}

\subsubsection{DiffFit} \label{sec:difffit_protocol}

\begin{lstlisting}[language=Python, caption={Fine-Tuning protocol for DiffFit}, label={lst:difffit}]
def enable_difffit_update(model: nn.Module):
    """Apply the DiffFit trainability mask to ``model`` and return it.

    A parameter is trainable exactly when its name contains one of
    ``"bias"``, ``"norm"``, ``"gamma"`` or ``"y_embed"``; every other
    parameter is frozen.
    """
    difffit_keywords = ("bias", "norm", "gamma", "y_embed")

    for par_name, par_tensor in model.named_parameters():
        is_trainable = False
        for keyword in difffit_keywords:
            if keyword in par_name:
                is_trainable = True
                break
        par_tensor.requires_grad = is_trainable

    return model
                
\end{lstlisting}


\end{document}
