\documentclass{midl} % Include author names
% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\definecolor{LightSalmon}{RGB}{255,184,152}
\definecolor{LightSkyBlue}{RGB}{120,206,255}
\definecolor{Darkgreen}{RGB}{30,190,30}
\usepackage{mwe} % to get dummy images
% \usepackage{graphicx}
% \usepackage{amsmath}
% \usepackage{amssymb}
\usepackage{soul}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{makecell}
\usepackage{tabu}
\usepackage{multirow}
\usepackage{colortbl}
\usepackage{url}
\usepackage{hyperref}
\hypersetup{
	citecolor={gray},
}
\usepackage{wrapfig}
\jmlryear{2024}\jmlrworkshop{Full Paper -- MIDL 2024}\jmlrvolume{--43}\editors{Accepted for publication at MIDL 2024}

\title[A New PET Framework for Medical Volumetric Segmentation]{Med-Tuning: A New Parameter-Efficient Tuning Framework for Medical Volumetric Segmentation}
\midlauthor{\Name{Jiachen Shen\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \Email{m202110559@xs.ustb.edu.cn}\\
\Name{Wenxuan Wang\midlotherjointauthor\nametag{$^{1}$}}\Email{s20200579@xs.ustb.edu.cn}\\
\addr $^{1}$ School of Automation and Electrical Engineering, University of Science and Technology Beijing\\
\Name{Chen Chen\nametag{$^{2}$}} \Email{chen.chen@crcv.ucf.edu}\\
\addr $^{2}$ Center for Research in Computer Vision, University of Central Florida\\
\Name{Jianbo Jiao\nametag{$^{3}$}} \Email{j.jiao@bham.ac.uk} \\
\addr $^{3}$ School of Computer Science, University of Birmingham\\
\Name{Jing Liu\nametag{$^{4}$}} \Email{jliu@nlpr.ia.ac.cn}\\
\addr $^{4}$ Institute of Automation, Chinese Academy of Sciences\\
% \addr $^{5}$ School of Artificial Intelligence, University of Chinese Academy of Sciences\\
\Name{Yan Zhang\nametag{$^{1}$}} 
% \Email{m202110578@xs.ustb.edu.cn}
\\
\Name{Shanshan Song\nametag{$^{1}$}} 
% \Email{fgh@bar.com}
\\
\Name{Jiangyun Li\midljointauthortext{Corresponding author}\nametag{$^{1}$}} \Email{leejy@ustb.edu.cn}
}

\begin{document}
\maketitle
\vspace{-6mm}
\begin{abstract}
The ``pre-training then fine-tuning (FT)'' paradigm is widely adopted to boost the model performance of deep learning-based methods for medical volumetric segmentation. However, conventional full FT incurs high computational and memory costs. Thus, it is of increasing importance to fine-tune pre-trained models for medical volumetric segmentation tasks in an effective and parameter-efficient manner. In this paper, we introduce a new framework named Med-Tuning to realize parameter-efficient tuning (PET) for medical volumetric segmentation tasks and an efficient plug-and-play module named Med-Adapter for task-specific feature extraction. With a small number of tuned parameters, our framework enhances the segmentation precision of 2D baselines that are pre-trained on natural images. Extensive experiments on three benchmark datasets (CT and MRI modalities) show that our method achieves better results than previous PET methods on volumetric segmentation tasks. 
Compared to full FT, Med-Tuning reduces the fine-tuned model parameters by up to 4$\times$, with even better segmentation performance. Our project webpage is at \url{https://rubics-xuan.github.io/Med-Tuning/}. 
\end{abstract}

\begin{keywords}
Parameter-Efficient Tuning, Medical Volumetric Segmentation, Transformer.
\end{keywords}

\section{Introduction}
\vspace{-1mm}
\label{sec:intro}
{Medical volumetric segmentation (MVS) task is to identify tumors and organ sub-regions in biomedical images, aiding accurate clinical diagnoses and treatment planning.
It is crucial in medical research, due to the widespread use of 3D imaging like computed tomography (CT) and magnetic resonance imaging (MRI).}
In the last decades, a large number of deep neural network architectures have been proposed, including convolutional neural networks (CNNs) (e.g., \cite{vnet,3dunet,isensee2021nnu}) and Transformer-based networks (e.g., \cite{cao2022swin,hatamizadeh2022unetr,Hatamizadeh2022SwinUS,zhou2023nnformer,peiris2022robust}). 
{Recently, the ``pre-training then fine-tuning'' paradigm~\cite{yosinski2014transferable} has gained much popularity to enhance model performance in downstream tasks.
As in \cite{cao2022swin}, the conventional full fine-tuning scheme updates all parameters of the pre-trained models.}
Yet, as models continuously improve in performance, particularly Transformer-based ones like \cite{cao2022swin,hatamizadeh2022unetr,Hatamizadeh2022SwinUS}, their tuned parameter count escalates significantly. 
Thus, full fine-tuning involves a lot of tuned parameters and entails great training costs.
To reduce tuned parameters, head-tuning (Head) was proposed~\cite{he2022masked},  focusing solely on optimizing the task-specific decoder, albeit resulting in decreased model performance. 
Meanwhile, recent studies~\cite{jia2022visual,chen2022adaptformer,pan2022st,sung2022lst,yu2022towards,zhang2023dept,xu2023side,wu2023medical,fischer2024prompt} focus on parameter-efficient tuning (PET) to balance model performance and tuned parameters.
\begin{wrapfigure}{r}{0.6\textwidth}
    \vspace{-2mm}
    \centering
    \includegraphics[width=0.6\textwidth]{Figures/intro.pdf}
    \vspace{-10mm}
    \caption{
    The two-fold gaps between upstream pre-training and our downstream fine-tuning.
    }
    \vspace{-8mm}
    \label{fig1}
\end{wrapfigure}
In this paper, we aim to investigate how to adapt strong visual foundation models pre-trained on natural images to MVS tasks via PET.
{We initiate our analysis with some examples of widely available models that use image-level pre-training
(e.g., classification task ~\cite{deng2009imagenet}, CLIP~\cite{Radford2021LearningTV}, MOCO v3~\cite{Chen2021AnES}) in natural image domain.
\figureref{fig1} presents the two-fold gaps between upstream pre-training and downstream fine-tuning: 
\textbf{(1) Domain gap} between natural images and medical volumes; \textbf{(2) Task gap} between image-level pre-training and pixel-level segmentation.
To narrow these gaps, we propose \textbf{Med-Tuning}, a new PET framework for MVS, and \textbf{Med-Adapter}, an efficient plug-and-play module for task-specific feature extractions.
Med-Tuning processes 3D volumes through a frozen pre-trained Transformer model with inserted Med-Adapters.
Med-Adapter greatly narrows both gaps by capturing spatial multi-scale features and volumetric correlations between slices with few additional parameters.}
Our main contributions are summarized as follows:
\setlist{nolistsep}
\begin{itemize}[noitemsep,leftmargin=*]
	\item We propose a new PET framework \textbf{Med-Tuning}, which greatly boosts the performance of the pre-trained models on MVS tasks and reduces training costs.
	\item We propose a plug-and-play module \textbf{Med-Adapter}, to consider both spatial relationship modeling (coarse/fine-grained) and volumetric correlations between slices. 
	\item Extensive experiments on three benchmark datasets with both CT and MRI modalities demonstrate the effectiveness of \textbf{Med-Tuning} over full fine-tuning and other PET methods.
    \item \textbf{Med-Tuning} adapts well to the rapidly evolving Transformer-based visual foundation models (e.g., SAM), showcasing strong generalization and flexibility. 
\end{itemize}

\vspace{-2mm}
\section{Related Work}
\label{sec:relatedwork}
\vspace{-1mm}
\subsection{Medical Volumetric Segmentation} 
\vspace{-1mm}
Achieving promising performance on MVS requires the incorporation of both spatial multi-scale representations and volumetric correlations, as demonstrated by prior research~\cite{Hatamizadeh2022SwinUS}.
Several U-Net inspired CNN-based models~\cite{unet,3dunet,unet++,isensee2021nnu} concatenate multi-scale features from the encoder and up-sampled features, complementing the loss of spatial information caused by down-samplings. 
Cao~et~al.~\cite{cao2022swin} use skip connections to effectively fuse low-level and high-level features in Transformers. 
Besides, various MVS methods capture volumetric correlations by 3D convolutions~\cite{3dunet,vnet,isensee2021nnu} or self-attention mechanism among 3D volumes~\cite{wang2021transbts}. 
\vspace{-4mm}
\subsection{Visual Parameter-Efficient Tuning} 
\vspace{-2mm}
Recently, novel vision PET methods have emerged to balance accuracy and tuned parameter efficiency during fine-tuning, which can be categorized into three groups:
\textbf{(1)} Prompt-based methods~\cite{zhang2023dept,fischer2024prompt}. 
For instance, VPT~\cite{jia2022visual} adds learnable prompt tokens to patch embeddings for downstream visual tasks. 
Pro-tuning~\cite{nie2023pro} inserts multiple stage-wise prompt blocks into different stages of the backbone. 
\textbf{(2)} Adapter-based methods~\cite{Houlsby2019ParameterEfficientTL,chen2022adaptformer,yang2023aim,wu2023medical}. An Adapter is a lightweight module inserted between the feed-forward layer and layer normalization in the Transformer, which is tuned during fine-tuning while the other layers stay frozen.
ST-Adapter~\cite{pan2022st} introduces 3D depth-wise convolution (DWConv)~\cite{ye20193d} in Adapter modules to capture spatial-temporal features. 
\textbf{(3)} Other PET techniques. 
SAN~\cite{xu2023side} is a small and separate network that is trained via shortcut connections from backbone to reduce memory cost during fine-tuning. 
Recent studies ~\cite{wu2023medical, chai2023ladder} mainly focus on exploring the potential of the Segment Anything Model (SAM) for medical image analysis.

\vspace{-4mm}
\subsection{Utilization of Fourier Transform in Computer Vision} 
\vspace{-2mm}
Image analysis in Fourier domain is extensively used in various vision tasks~\cite{ding2017circnn,lee2018single,li2020falcon,chi2020fast,yang2020fda,rao2021global}. Fast Fourier Transform (FFT) and Inverse Fast Fourier Transform (IFFT) leverage frequency information for global connectivity through parameter-free domain mapping on original images, resulting in an intrinsic global vision characteristic.
According to the conclusions of~\cite{oppenheim1979phase,liu2023devil}, the phase-only image or feature retains many of the semantic features of the original image.

\vspace{-4mm}
\section{Methodology}
\label{sec:methodology}
\vspace{-1mm}
\subsection{Preliminaries}
\label{subsec:preliminaries}
\paragraph{Vanilla Adapter.}
\vspace{-1mm}
{Given an input feature $X \in \mathbb{R}^{N \times d}$, the vanilla Adapter can be represented as \equationref{eq_adapter}~\cite{Houlsby2019ParameterEfficientTL},
where $W_{down}$ and $W_{up}$ indicate the down-projection and up-projection layers, $\sigma(\cdot)$ is an activation function, and $+$ denotes a skip connection.}
\vspace{-2mm}
\begin{equation}
  \text{Adapter}(X) = X + \sigma(XW_{down})W_{up},
  \label{eq_adapter}
\vspace{-2mm}
\end{equation}

\vspace{-2mm}
\paragraph{Discrete Fourier Transform.}
\vspace{-1mm}
Discrete Fourier Transform (DFT) serves as a classical technique for computer vision applications~\cite{rao2021global}.
Given a 3D input (volumetric data or feature) $x[D,H,W]$, DFT is defined as: 
\vspace{-2mm}
\begin{equation}
    \small
    X(u,v,k)=\mathcal{F}(x)(u,v,k)=\sum_{w=0}^{W-1} \sum_{h=0}^{H-1} \sum_{d=0}^{D-1} x(d,h,w) e^{-j 2 \pi\left(\frac{ud}{D}+\frac{vh}{H}+\frac{kw}{W}\right)}= \mathcal{R}+\mathcal{I}j,
    \label{eq:fft}
\vspace{-3mm}
\end{equation}
where $X$ is a complex matrix indexed by the frequency coordinates $(u,v,k)$, and $\mathcal{R}$ and $\mathcal{I}$ denote its real and imaginary parts. In implementation, we use the accelerated versions of DFT and Inverse DFT (i.e., FFT and IFFT). 

\begin{figure}[!tp]
    \centering \includegraphics[width=1.0\textwidth]{Figures/methodology.pdf}
    \vspace{-10mm}
    \caption{
    % Med-Tuning is our proposed PET framework, which consists of 2D Transformer baselines for MVS with our proposed Med-Adapter modules gradually inserted in each stage. 
    The overall architecture of Med-Tuning. Med-Tuning consists of a 2D Transformer baseline with proposed Med-Adapters inserted at each encoder stage. 
    % Note that we need to reshape the 3D input medical volumes from $\left[B, D, H, W \right]$ to $\left[BD, H, W\right]$ before embedding layer, where B = Batch.
    % Before embedding, reshape the input along the slice dimension. 
    Only Med-Adapters and decoder are \textcolor{LightSalmon}{\textbf{tuned}} while all the other layers stay \textcolor{LightSkyBlue}{\textbf{frozen}}.
    % during fine-tuning.
    }
    \label{fig:method}
    \vspace{-9mm}
\end{figure}
\vspace{-2mm}
\subsection{Med-Tuning: Parameter-Efficient Tuning for MVS}
\vspace{-1mm}
The overall architecture of our framework is depicted in \figureref{fig:method}. Med-Tuning consists of a 2D Transformer backbone $G$ pre-trained on natural images, a segmentation decoder and inserted Med-Adapter. 
Given a batch of input medical volumes $I[B,C,D,H,W]$ (where $B$, $C$, $D$, $H$ and $W$ denote the batch size, number of channels, number of slices, height and width, respectively), we first reshape it to $I[BD,C,H,W]$ before the embedding layer. 
According to two main considerations in the following, we decide to exclusively introduce Med-Adapter into encoder without modifying decoder, enabling high decoder scalability to meet different requirements.
First, encoder plays a pivotal role in baseline. 
Inadequate feature extraction will hinder performance even with the same robust decoder, as evidenced by the decline in results for Head compared to Full, detailed in \tableref{tab:KiTS 2019_sota}, \tableref{tab:vit_sota} and \tableref{tab:swinunet_sota}.
Besides, not all segmentation decoders come with pre-trained weights, necessitating the fine-tuning of the entire decoder.
Secondly, sole insertion in encoder part improves the flexibility of the whole framework. 
Our inserting strategy broadens the adaptability of Med-Tuning on visual foundation models while reducing tuned parameters.

\vspace{-2mm}
\subsection{Med-Adapter: Adapter for MVS}
\vspace{-1mm}
We propose a task-oriented and simple yet effective module for medical volumetric data, namely \textbf{Med-Adapter}.
The motivation of Med-Adapter is to empower a 2D Transformer model pre-trained on natural images to gain the capability of volumetric feature modeling in a parameter-efficient manner. 
Here we consider three criteria when designing Med-Adapter: 
(1) \textit{MVS oriented}: It's necessary to narrow the mentioned gaps in \figureref{fig1}. (2) \textit{Light-weight}: Structure with a low amount of parameters is crucial. (3) \textit{Plug-and-play}: An easy-to-implement module is friendly to practical deployment.
While retaining the bottleneck structure of the vanilla Adapter (\equationref{eq_adapter}), we introduce a few tailored designs into Med-Adapter based on above criteria, shown in \figureref{fig:method} (right). 

\vspace{-2mm}
\subsubsection{Intra-stage Feature Enhancement (Intra-FE).}  
\vspace{-1mm}
We introduce multiple branches tailored to capture fine-grained feature representations, coarse-grained global semantics and volumetric correlations among slices, which are vital for realizing accurate medical volumetric segmentation task.
\vspace{-1mm}
\paragraph{Multi-scale Local Branch (MLB)} 
We employ parallel 3D convolutions with kernels of sizes $k=3$ and $k=5$ to extract spatial multi-scale features and 3D volumetric correlations between slices.
The conventional 3D convolution operations are replaced with 3D DWConv in a parameter-efficient manner.
Moreover, we use two cascaded $1 \times k \times k$ and $k \times 1 \times 1$ convolutions to replace the $k \times k \times k$ convolution to pursue an extremely lightweight structure. 

\paragraph{FFT Global Branch (FGB)}
\vspace{-1mm}
To achieve coarse-grained global semantic extraction in a parameter-efficient way, we substitute traditional large convolutional kernels and attention mechanisms, known for their memory and computation demands, with 3D FFT, IFFT and learnable complex matrices.
These filter-like complex matrices are designed to model frequency features that contain global semantics in the whole frequency domain.
Compared to vanilla 3D self-attention operation with $\mathcal{O}(n^{2})$ complexity ($n$ is the number of tokens), our FGB is a
lightweight module. In detail, the computational complexity of FGB is $\mathcal{O}(n\log n)$, where FFT and IFFT are both with $\mathcal{O}(n\log n)$ complexity while that of Hadamard product or matrix addition is $\mathcal{O}(n)$.
Then we merge the aforementioned branches by efficient $1 \times 1 \times 1$ convolution.
Therefore, given the intermediate embedded feature representations $X\in[BD,C,HW]$, the Intra-FE module can be expressed as \equationref{intrafe}-(\ref{downsap}). 
In this way, the Intra-FE module is theoretically capable of modeling volumetric correlations among slices and incorporating abundant spatial multi-scale features for the downstream dense prediction task (i.e., MVS).
\vspace{-2mm}
\begin{equation}
    \small
    \text{Intra-FE} =M= Conv_{1 \times 1 \times 1}(\text{MLB} + \text{FGB}),
    \label{intrafe}
\end{equation}
\begin{equation}
    \small
    \text{MLB}=DWConv_3(X') + DWConv_5(X'),\text{ 
 } \text{FGB} = \mathcal{F}^{-1}(W_F \odot \mathcal{F}(X') + b_F)
    \label{mlb}
\end{equation}
\begin{equation}
    \small %修改字体
    X'= \text{R}_{volume}(\sigma(XW_{down})), \text{ 
 } X'\in[B,C',D,H,W], \text{ 
 } C'=C/\alpha
    \label{downsap}
\end{equation}
where $M$ denotes the intra-stage enhanced feature representations. $W_F$ and $b_F$ are the learnable complex matrices, $\mathcal{F}$ is 3D FFT in \equationref{eq:fft} and $\mathcal{F}^{-1}$ is 3D IFFT. $\odot$ is the Hadamard product, while $\text{R}_{volume}$ is a reshape operation to obtain cube-shape feature representations.

\vspace{-2mm}
\subsubsection{Inter-stage Feature Interaction (Inter-FI).}
\vspace{-1mm}
To fully exploit the representations collected by Intra-FE modules in Med-Adapter at each stage, we further consider the feature interaction between different stages.
The intra-stage enhanced feature representations $M$ in this stage will be fused with $M_{LastStage}$ (i.e., the output of the Intra-FE module from the previous stage).
Note that Inter-FI is only introduced at specific Med-Adapters at the end of each stage. 
Thus, Inter-FI is expressed as \equationref{eq:inter2}.
In this way, feature representations extracted by Intra-FE modules of Med-Adapters in shallow layers are gradually fed to adjacent higher layers, realizing inter-stage feature interaction by explicit enhancement and thus boosting model performance.
\vspace{-2mm}
\begin{equation}
\small
\text{Inter-FI}=Cat(\mathcal{A}(M, M_{LastStage})), \text{  }
M=
\begin{cases}
\text{Inter-FI}, & \text{if $Flag_{last}$} \\
M, & \text{if not $Flag_{last}$}
\end{cases}
\label{eq:inter2}
\vspace{-2mm}
\end{equation}
where $\mathcal{A}$ denotes convolution operations to align $M$ and $M_{LastStage}$ in terms of spatial resolution and channel dimension, and $Cat$ refers to concatenation. 
$Flag_{last}$ is a Boolean flag, and $Flag_{last}=True$ when the current Med-Adapter is the last one inserted at this stage.

In summary, our Med-Adapter can be formulated as \equationref{eq:med_adapter_inter}. $\text{R}_{flatten}$ denotes a symmetric operation that reshapes the feature back to the original shape of $X$.

\begin{equation}
    \small %修改字体
    % \text{Med-Adapter}(X) = X + \sigma(\text{R}_{flatten}(H+X') W_{up}).
    \text{Med-Adapter}(X) = X + \text{R}_{flatten}(M+\text{R}_{volume}(\sigma(XW_{down}))) W_{up}. \label{eq:med_adapter_inter}
\end{equation} 

\vspace{-8mm}
\section{Experiments and Results}
\label{sec:experiments}
\vspace{-1mm}
\subsection{Experimental Setup}
\label{experimentalsetup}
\vspace{-1mm}
\paragraph{Datasets and Evaluation Metrics.} 
Our proposed Med-Tuning is evaluated on three benchmark datasets: \textbf{(1)} Kidney Tumor Segmentation 2019 (KiTS 2019)~\cite{heller2019kits19}, \textbf{(2)} Brain Tumor Segmentation 2019 (BraTS 2019)~\cite{menze2014multimodal,bakas2017advancing,bakas2018identifying}, \textbf{(3)} Brain Tumor Segmentation 2020 (BraTS 2020)~\cite{menze2014multimodal,bakas2017advancing,bakas2018identifying}, detailed in \textcolor{blue}{Appendix} \ref{app:benchmarkdata}.
KiTS 2019 dataset comprises multi-phase 3D CTs depicting the kidneys and tumors. 
The ground truth contains 3 classes: background (label 0), kidney (label 1), and kidney tumor (label 2). The segmentation accuracy of KiTS 2019 is measured by kidney Dice (labels 1 and 2), tumor Dice (label 2), and composite Dice (the average of kidney and tumor Dice). 
BraTS 2019 and BraTS 2020 datasets consist of 3D brain MRI scans with four modalities. The ground truth contains 4 classes: background (label 0), necrotic and non-enhancing tumor (label 1), peritumoral edema (label 2), and GD-enhancing tumor (label 4). The segmentation accuracy is measured by Dice score and the Hausdorff distance (95\%) metrics for enhancing tumor region (ET, label 4), regions of tumor core (TC, labels 1 and 4), and whole tumor region (WT, labels 1,2 and 4).

\vspace{-2mm}
\paragraph{Implementation Details.}
Experiments utilizing PyTorch~\cite{paszke2019pytorch} for implementation are conducted on NVIDIA GeForce RTX 3090 GPUs.
The pre-trained vanilla ViT~\cite{dosovitskiy2020vit} with UPerNet~\cite{xiao2018unified} decoder (ViT+UPerNet) and Swin-UNet~\cite{cao2022swin} based on pre-trained Swin Transformer(tiny) are our chosen baselines. 
More implementation details are available in \textcolor{blue}{Appendix} \ref{app:benchmarkexperiment}.

\vspace{-3mm}
\subsection{Results and Analysis}
\vspace{-1mm}
\label{section_results}
We conduct experiments on three benchmark validation sets and compare our method with scratch (i.e., training with random initialization, without pre-training), full fine-tuning, head tuning and previous state-of-the-art PET approaches (i.e., VPT~\cite{jia2022visual}, Adapter~\cite{Houlsby2019ParameterEfficientTL}, AdaptFormer~\cite{chen2022adaptformer}, Pro-tuning~\cite{nie2023pro}, ST-Adapter~\cite{pan2022st}). Qualitative results are shown in \textcolor{blue}{Appendix}~\ref{appl:visual}.
\vspace{-2mm}
\paragraph{KiTS 2019.} 
The performance comparisons with ViT+UPerNet and Swin-UNet as baseline are shown in \tableref{tab:KiTS 2019_sota}. 
Our proposed method boosts the performance of full fine-tuning considerably and achieves much higher Dice scores than previous PET methods, with much fewer tuned model parameters. 
In comparison with recently proposed PET methods (e.g., VPT, Pro-tuning and ST-Adapter), our Med-Tuning achieves better performance-efficiency trade-off on two baselines. 
Specifically, Med-Tuning improves model performance by a large margin (i.e., $\uparrow\textbf{4.20\%}$ Kidney Dice, $\uparrow\textbf{17.13\%}$ Tumor Dice, $\uparrow\textbf{10.67\%}$ Composite Dice on ViT+UPerNet and $\uparrow\textbf{1.01\%}$ Kidney Dice, $\uparrow\textbf{8.01\%}$ Tumor Dice, $\uparrow\textbf{4.52\%}$ Composite Dice on Swin-UNet) with only $\textbf{17.70\%}$ and $\textbf{27.58\%}$ of tuned parameters respectively in comparison with full fine-tuning.

\vspace{-2mm}
\paragraph{BraTS 2019.} 
Performance comparisons on BraTS 2019 on two baselines are shown in \tableref{tab:vit_sota} (left) and \tableref{tab:swinunet_sota} (left). Results show that our method attains the best trade-off between performance and efficiency, achieving comparable or even better results than previous methods.
Compared to full fine-tuning, Med-Tuning achieves maximum improvements of \textbf{4.23\%} (ViT+UPerNet) and \textbf{1.28\%} (Swin-UNet) in Dice scores.
Our Med-Tuning also achieves high parameter efficiency, tuning only \textbf{17.70\%} parameters of ViT and \textbf{27.58\%} of Swin-UNet, with inserted parameters being only \textbf{2.82\%} of ViT and \textbf{2.72\%} of Swin-UNet. 
\vspace{-2mm}
\paragraph{BraTS 2020.} 
Performance comparisons on BraTS 2020 are shown in \tableref{tab:vit_sota} (right) and \tableref{tab:swinunet_sota} (right).
Compared to full fine-tuning, Med-Tuning achieves maximum improvements of \textbf{4.07\%} (ViT+UPerNet) and \textbf{1.64\%} (Swin-UNet) in Dice scores with very few tuned parameters, surpassing most of PET methods. 
Compared to ST-Adapter, our tuned parameters are fewer yet yield a more substantial overall performance improvement.
Moreover, Med-Tuning took about 1.34 (ViT+UPerNet) and 1.68 (Swin-UNet) hours for fine-tuning, 0.76 (ViT+UPerNet) and 0.46 (Swin-UNet) minutes per sample for inference.
\begin{table}
  \centering
  \caption{
  Performance comparison on KiTS 2019 with Swin-UNet and ViT+UPerNet. 
  \textcolor{blue}{Blue} and \textcolor{Darkgreen}{Green} text denote the percentage of tuned parameters and the performance improvement compared to full fine-tuning (with grey background).
  }
  \vspace{-4mm}
  \scriptsize
  \setlength{\tabcolsep}{0.2mm}
  \begin{tabular}{l|c|c|ccc!{\vrule width0.8pt}c|c|ccc}
    \toprule[1.1pt]
    \multirow{3}{*}{KiTS 2019} & 
    \multicolumn{5}{c|}{ViT+UPerNet} &
    \multicolumn{5}{c}{Swin-UNet} 
    \\
    \cline{2-11}
    &
    \multirow{2}{*}{\makecell[c]{Tuned\\Params(M)}} &
    \multirow{2}{*}{\makecell[c]{Inserted\\Params(M)}} &
    \multicolumn{3}{c|}{Dice (\%) $\uparrow$}  &
    \multirow{2}{*}{\makecell[c]{Tuned\\Params(M)}} &
    \multirow{2}{*}{\makecell[c]{Inserted\\Params(M)}} &
    \multicolumn{3}{c}{Dice (\%) $\uparrow$}
    \\
    \cline{4-6}
    \cline{9-11}
    &
    &  & Kidney & Tumor & Composite
    & & & Kidney & Tumor & Composite\\
    \hline
    Scratch
    & 100.849 & -
    & 88.01 & 46.53 & 67.27     		
    &
    27.154 & -
    & 94.33 & 61.10 & 77.71  	 	 
    \\%swinu
    \rowcolor{gray!30}Full
    & 100.849 & -
    & 87.32 & 47.34 & 67.33 	 	 
    &
    27.154 & -
    & 94.68 & 62.13 & 78.40 	 	 
    \\%swinu
    Head
    & 15.007 & -
    & 87.35 & 42.85 & 65.10 	 	 
    &
    6.752 & -
    & 91.95 & 53.93 & 72.94 	 	 
    \\%swinu
    VPT-Shallow
    & 15.015 & 0.008
    & 86.91 & 41.67 & 64.29  	 
    &
    6.753 & 0.001
    & 91.72 & 54.86 & 73.29
    \\%swinu
    VPT-Deep
    & 15.100 & 0.092 
    & 88.01 & 46.45 & 67.23 	 	 
    &
    6.780 & 0.029
    & 91.53 & 53.41 & 72.47 	 	 
    \\%swinu
    Adapter
    & 18.567 & 3.560		
    & 89.75 & 49.03 & 69.39  	 	
    &
    7.541 & 0.790		
    & 93.02 & 57.15 & 75.08 	 	 
    \\%swinu
    AdaptFormer
    & 16.197 & 1.190
    & 87.62 & 44.46 & 66.04 	 	 
    &
    7.124 & 0.372
    & 93.74 & 59.79 & 76.77
    \\%swinu
    Pro-tuning
    & 19.812 & 4.805
    & 89.44 & 48.32 & 68.88  	 	 
    &
    8.359 & 1.607
    & 90.34 & 51.19 & 70.77  	 	 
    \\%swinu
    ST-Adapter
    & 22.118  & 7.110
    & 90.33 & 61.29 & 75.81
    & 	 	 
    8.328  & 1.577
    & 92.97 & 57.33 & 75.15 	 	 
    \\%swinu
    \hline
    \textbf{Ours}
    & \makecell[c]{17.853\\\textcolor{blue}{17.70\%}}&
    \makecell[c]{2.846\\\textcolor{blue}{2.82\%}} &
    \makecell[c]{\textbf{91.52}\\\textcolor{Darkgreen}{(+4.20)}} &
    \makecell[c]{\textbf{64.47}\\\textcolor{Darkgreen}{(+17.13)}} &
    \makecell[c]{\textbf{78.00}\\\textcolor{Darkgreen}{(+10.67)}}
    &
    \makecell[c]{7.489\\\textcolor{blue}{27.58\%}}&
    \makecell[c]{0.738\\\textcolor{blue}{2.72\%}} &
    \makecell[c]{\textbf{95.69}\\\textcolor{Darkgreen}{(+1.01)}} &
    \makecell[c]{\textbf{70.14}\\\textcolor{Darkgreen}{(+8.01)}} &
    \makecell[c]{\textbf{82.92}\\\textcolor{Darkgreen}{(+4.52)}}
    \\
    \bottomrule[1.1pt]
  \end{tabular}
  \label{tab:KiTS 2019_sota}
  \vspace{-4mm}
  \end{table}

\begin{table}
  % \vspace{-2mm}
  \centering
  \caption{Performance comparison on BraTS 2019 and BraTS 2020 with ViT+UPerNet. }
  \scriptsize
  \setlength{\tabcolsep}{0.01mm}
  \begin{tabular}{@{}l|c|c|ccc|ccc|ccc|ccc@{}}
    \toprule[1.1pt]
    \multirow{3}{*}{\makecell[l]{ViT+\\UPerNet}} & 
    \multirow{3}{*}{\makecell[c]{Tuned\\Params\\(M)}} &
    \multirow{3}{*}{\makecell[c]{Inserted\\Params\\(M)}} &
    \multicolumn{6}{c|}{BraTS 2019} &
    \multicolumn{6}{c}{BraTS 2020} 
    \\
    \cline{4-15}
    &  &  & \multicolumn{3}{c|}{Dice (\%) $\uparrow$} &
    \multicolumn{3}{c|}{Hausdorff (mm) $\downarrow$} &
    \multicolumn{3}{c|}{Dice (\%) $\uparrow$}  &
    \multicolumn{3}{c}{Hausdorff (mm) $\downarrow$} 
    \\
    \cline{4-15}
    &  &  & ET & WT & TC & ET & WT & TC &  ET & WT & TC & ET & WT & TC 
    \\
    \hline
    Scratch
    & 100.849 & - 
    & 64.96 & 83.03 & 71.34    & 7.64 & 10.60 & 10.94
    & 65.80 & 83.72 & 72.01    & 32.48 & 10.06 & 21.47    
    \\%vit
    \rowcolor{gray!30} Full
    & 100.849 & -
    & 68.49 & 85.56 & 75.12    & 6.67 & 7.88 & 10.53   
    & 69.12 & 85.90 & 75.29    & 34.43 & 7.32 & 17.09    
    \\%vit
    Head
    & 15.007 & -
    & 65.71 & 84.19 & 74.77    & 6.13 & 7.51 & 7.86
    & 66.03 & 84.50 & 74.47    & 37.81 & 7.47 & 14.15
    \\%vit
    VPT-Shallow
    & 15.015 & 0.008
    & 66.02 & 84.72 & 75.84    & 6.11 & 7.51 & 8.47
    & 66.52 & 84.82 & 75.46    & 37.77 & 7.47 & 13.53    
    \\%vit
    VPT-Deep
    & 15.100 & 0.092 		
    & 67.01 & 85.14 & 76.80    & 6.06  & 7.72 & 7.65
    & 67.69 & 85.28 & 76.59    & 31.77 & 7.74 & \textbf{10.62}    
    \\%vit
    Adapter
    & 18.567 & 3.560		
    & 68.30 & 85.37 & 77.05    & \textbf{5.50} & 7.64 & 7.99    		
    & 68.58 & 85.77 & 77.00    & 32.63 & 8.17 & 16.18    
    \\%vit
    AdaptFormer
    & 16.197 & 1.190		
    & 65.88 & 84.34 & 74.77    & 6.65 & 8.20 & 8.43
    & 65.52 & 84.14 & 74.28    & 41.03 & 8.39 & 14.78    
    \\%vit
    Pro-tuning
    & 19.812 & 4.805
    & 67.18 & 85.32 & 76.51    & 5.81 & 7.07 & 7.56    
    & 67.28 & 85.57 & 76.58    & 40.43 & 7.00 & 12.87    
    \\%vit
    ST-Adapter
    & 22.118  & 7.110
    & 69.18 & 86.27 & 79.18    & 6.08 & 6.94 & \textbf{6.78}
    & 68.60 & 86.55 & \textbf{79.52}    & 34.06 & 6.79 & 12.77    
    \\%vit
    \hline
    \textbf{Ours}
    &\makecell[c]{17.853 \\\textcolor{blue}{17.70\%}}&
    \makecell[c]{2.846 \\\textcolor{blue}{2.82\%}} &
    \makecell[c]{\textbf{70.53} \\\textcolor{Darkgreen}{(+2.04)}} &
    \makecell[c]{\textbf{86.58} \\\textcolor{Darkgreen}{(+1.02)}} &
    \makecell[c]{\textbf{79.35} \\\textcolor{Darkgreen}{(+4.23)}} & 
    \makecell[c]{5.86 \\\textcolor{Darkgreen}{(-0.81)}} & 
    \makecell[c]{\textbf{6.22} \\\textcolor{Darkgreen}{(-1.66)}} & 
    \makecell[c]{6.95 \\\textcolor{Darkgreen}{(-3.58)}}&  
    
    \makecell[c]{\textbf{70.69} \\\textcolor{Darkgreen}{(+1.57)}} &
    \makecell[c]{\textbf{86.69} \\\textcolor{Darkgreen}{(+0.79)}} &
    \makecell[c]{79.36 \\\textcolor{Darkgreen}{(+4.07)}} & 
    \makecell[c]{\textbf{28.64} \\\textcolor{Darkgreen}{(-5.79)}} & 
    \makecell[c]{\textbf{6.20} \\\textcolor{Darkgreen}{(-1.12)}} & 
    \makecell[c]{15.05 \\\textcolor{Darkgreen}{(-2.04)}}
    \\%vit
    \bottomrule[1.1pt]
  \end{tabular}
  \vspace{-4mm}  
  \label{tab:vit_sota}
\end{table}

\begin{table}
  \vspace{-8mm}
  \centering
  \caption{Performance comparison on BraTS 2019 and BraTS 2020 with Swin-UNet.}
  \scriptsize
  \setlength{\tabcolsep}{0.01mm}
  \begin{tabular}{@{}l|c|c|ccc|ccc|ccc|ccc@{}}
    \toprule[1.1pt]
    \multirow{3}{*}{Swin-UNet} & 
    \multirow{3}{*}{\makecell[c]{Tuned\\Params\\(M)}} &
    \multirow{3}{*}{\makecell[c]{Inserted\\Params\\(M)}} &
    \multicolumn{6}{c|}{BraTS 2019} &
    \multicolumn{6}{c}{BraTS 2020} 
    \\
    \cline{4-15}
    &  &  & \multicolumn{3}{c|}{Dice (\%) $\uparrow$} &
    \multicolumn{3}{c|}{Hausdorff (mm) $\downarrow$} &
    \multicolumn{3}{c|}{Dice (\%) $\uparrow$}  &
    \multicolumn{3}{c}{Hausdorff (mm) $\downarrow$}     
    \\
    \cline{4-15}
    &  &  & ET & WT & TC & ET & WT & TC &  ET & WT & TC & ET & WT & TC 
    \\
    \hline
    Scratch
    & 27.154 & -
    & 78.38 & 88.59 & 76.46    & 6.06 & 10.65 & 9.18
    & 78.72 & 89.12 & 77.07    & 7.62 & 6.98 & 19.08    
    \\
    \rowcolor{gray!30}Full
    & 27.154 & -
    & 78.26 & 89.56 & 79.16    & 4.33 & 6.15 & 6.70    
    & 79.09 & 89.87 & 79.15    & 9.67 & 6.03 & 15.31    
    \\
    Head
    & 6.752 & -
    & 78.07 & 88.68 & 77.26    & 5.02 & 6.70 & 7.09    
    & 78.77 & 88.66 & 76.90    & \textbf{4.89} & 8.49 & 16.06    
    \\
    VPT-Shallow
    & 6.753 & 0.001
    & 77.16 & 88.30 & 76.77    & 5.42 & 6.15 & 7.35
    & 77.43 & 88.23 & 76.13    & 7.53 & 6.07 & 16.07    		
    \\
    VPT-Deep
    & 6.780 & 0.029
    & 77.02 & 88.65 & 76.91    & 5.30 & 7.09 & 7.94 
    & 78.63 & 88.80 & 77.17    & 8.27 & 6.23 & 13.25    
    \\
    Adapter
    & 7.541 & 0.790		
    & 77.98 & 89.22 & 78.02 & 5.30 & 6.62 & 8.49
    & 78.51 & 89.16 & 77.71    & 7.05 & 6.25 & 19.09    
    \\
    AdaptFormer
    & 7.124 & 0.372
    & 77.69 & 88.61 & 76.83    & 4.91 & 6.29 & 7.89    
    & 78.22 & 88.92 & 76.40    & 10.35 & 6.48 & 16.90
    \\
    Pro-tuning
    & 8.359 & 1.607
    & \textbf{78.58} & 89.33 & 78.79    & 5.27 & 6.41 & 8.24
    & 78.77 & 89.46 & 78.20    & 7.31 & 6.50 & \textbf{10.54}    
    \\
    ST-Adapter
    & 8.328  & 1.577
    & 78.40 & 89.54 & 77.44    & 4.75 & 6.01 & 7.41 
    & 78.96 & 89.54 & 77.85    & 7.67 & 5.48 & 15.53    
    \\
    \hline
    \textbf{Ours}
    & \makecell[c]{7.489 \\ \textcolor{blue}{27.58\%}}&
    \makecell[c]{0.738 \\ \textcolor{blue}{2.72\%}} &
    \makecell[c]{78.51\\\textcolor{Darkgreen}{(+0.25)}} &
    \makecell[c]{\textbf{89.68}\\\textcolor{Darkgreen}{(+0.12)}} &
    \makecell[c]{\textbf{80.44}\\\textcolor{Darkgreen}{(+1.28)}} & 
    \makecell[c]{\textbf{4.00}\\\textcolor{Darkgreen}{(-0.33)}} & 
    \makecell[c]{\textbf{5.52}\\\textcolor{Darkgreen}{(-0.63)}} & 
    \makecell[c]{\textbf{5.76}\\\textcolor{Darkgreen}{(-0.94)}} & 
    \makecell[c]{\textbf{79.25}\\\textcolor{Darkgreen}{(+0.16)}} &
    \makecell[c]{\textbf{90.06}\\\textcolor{Darkgreen}{(+0.19)}} &
    \makecell[c]{\textbf{80.79}\\\textcolor{Darkgreen}{(+1.64)}} & 
    \makecell[c]{12.40 \\ {(+2.73)}} & 
    \makecell[c]{\textbf{4.41} \\ \textcolor{Darkgreen}{(-1.62)}} & 
    \makecell[c]{11.59 \\ \textcolor{Darkgreen}{(-3.72)}} \\%swinu
    \bottomrule[1.1pt]
  \end{tabular}
  \vspace{-8mm}
  \label{tab:swinunet_sota}
\end{table}

\vspace{-2mm}
\subsection{Ablation Studies}
\vspace{-1mm}
\begin{wraptable}{r}{6.9cm}
\vspace{-10mm}
 \caption{Ablation study on the position of inserted Med-Adapter.}
 \centering
  \scriptsize
  \setlength{\tabcolsep}{0.1mm}
    \vspace{-2mm}
   \begin{tabular}{@{}cccc|ccc|ccc@{}}
        \toprule[1.1pt]
        \multicolumn{4}{c|}{Encoder}&
        \multicolumn{3}{c|}{Dice (\%) $\uparrow$}& 
        \multicolumn{3}{c}{HF (mm)$\downarrow$}\\
        \cline{5-10}
        $n$=0 & $n$=1 & $n$=2 & $n$=3 & ET & WT & TC & ET & WT & TC\\
        \hline
        -&-&-&-&78.07&88.68&77.26&  5.02&6.70&7.10 \\
        \checkmark&-&-&-&-  &  -  &  -  &    - & -  & -\\
        \checkmark&\checkmark&-&-& 74.83&87.09&72.94&  7.26& 13.12& 10.17     \\
        \checkmark&\checkmark&\checkmark&-& 75.60&86.79&73.41&  8.44& 12.32& 11.24     \\
        \rowcolor{gray!30}\checkmark&\checkmark&\checkmark&\checkmark& \textbf{78.51}&\textbf{89.68}&\textbf{80.44}&  \textbf{4.00}&\textbf{5.52}&\textbf{5.76}     \\
        \bottomrule[1.1pt]
      \end{tabular}
      \label{tab:position}
\end{wraptable}

Extensive ablation experiments are conducted based on five-fold cross-validation. For more ablation experiments please refer to \textcolor{blue}{Appendix}~\ref{appl:ablation}.
\vspace{-6mm}
\paragraph{Inserted Position of Med-Adapter.}
We conduct experiments on the BraTS 2019 training set to assess the segmentation performance when inserting Med-Adapter at various stages of the Swin-UNet encoder, which has four consecutive stages ($n=0,1,2,3$).
According to \tableref{tab:position}, inserting Med-Adapter only in the initial stages results in degraded performance, with none of these variants surpassing our default setting (gray background). This may be attributed to the greater contribution of features learned in later encoder stages when transferring pre-trained weights to the MVS task.

\vspace{-2mm}
\paragraph{Generalization Capability on Other Pre-trained Weights.}
To explore the potential of our Med-Tuning, we investigate the effect of diverse encoder pre-trained weights (e.g., multi-modal based (CLIP~\cite{Radford2021LearningTV}), self-supervised based (MAE~\cite{he2022masked}, MoCo v3~\cite{Chen2021AnES}) and SAM~\cite{kirillov2023segment}) on BraTS 2019 training set with ViT-B/16.
As presented in \tableref{tab:ckpt}, given different pre-trained weights, our easy-to-integrate framework boosts the performance consistently with much fewer tuned parameters, suggesting the effectiveness and robustness of our Med-Tuning framework.

\vspace{-2mm}
\paragraph{Generalization Capability on 3D Baseline and Medical Pre-trained Weight.}
To demonstrate the generalization capability of our approach, we select Swin UNETR~\cite{tang2022self} pre-trained on medical datasets as a supplementary 3D baseline and experiment on part of the Medical Segmentation Decathlon (MSD)~\cite{antonelli2022medical} dataset. For implementation details please refer to \textcolor{blue}{Appendix}~\ref{app:msd}. Experimental results in \tableref{tab:msd} show that our method still outperforms full fine-tuning in Memory, Time and Dice score.

\vspace{-2mm}
\begin{minipage}{0.9\textwidth}
    \begin{minipage}[h]{0.4\textwidth}
      \makeatletter\def\@captype{table}
      \centering 	 	 	 
	 \makeatletter\def\@captype{table}
	 \makeatother \caption{Ablations on other pre-trained weights.}
      \vspace{-2mm}
      \label{tab:ckpt}
      \scriptsize
      \setlength{\tabcolsep}{0.8mm}
      \begin{tabular}{@{}l|c|ccc@{}}
        \toprule[1.1pt]
        \multirow{2}{*}{\makecell[c]{Pre-trained\\Weights}}&
        \multirow{2}{*}{Method}&
        \multicolumn{3}{c}{Dice (\%) $\uparrow$}\\
        \cline{3-5}
        &  & ET & WT & TC  \\
        \hline
        \multirow{2}{*}{CLIP}
        & Full & 64.58  & 84.69 &73.31 \\
        & \cellcolor{gray!30}\textbf{Ours} & \cellcolor{gray!30}\textbf{68.05}& \cellcolor{gray!30}\textbf{86.29}& \cellcolor{gray!30}\textbf{77.34}
        \\
        \hline
        \multirow{2}{*}{MAE}
        & Full & 64.86 & 84.71 & 73.95  \\
        & \cellcolor{gray!30}\textbf{Ours}  & \cellcolor{gray!30}\textbf{66.32}& \cellcolor{gray!30}\textbf{85.50}& \cellcolor{gray!30}\textbf{78.05}
        \\
        \hline
        \multirow{2}{*}{MoCo v3}
        & Full & 65.06 & 84.30 & 73.51 \\
        & \cellcolor{gray!30}\textbf{Ours}  & \cellcolor{gray!30}\textbf{67.09}& \cellcolor{gray!30}\textbf{85.45}& \cellcolor{gray!30}\textbf{77.41}
        \\
        \hline
        \multirow{2}{*}{SAM}
        & Full & 65.89 & 85.32 & 74.05\\
        & \cellcolor{gray!30}\textbf{Ours} & \cellcolor{gray!30}\textbf{67.64}& \cellcolor{gray!30}\textbf{86.10}& \cellcolor{gray!30}\textbf{78.33}
        \\
        \bottomrule[1.1pt]
      \end{tabular}
    \end{minipage}
    \hspace{1cm}
    \begin{minipage}[h]{0.5\textwidth}
      \makeatletter\def\@captype{table}
          \centering 	 	 	 
    	 \makeatletter\def\@captype{table}
    	 \makeatother 
        \caption{Ablations on MSD dataset with pre-trained Swin UNETR.}
        \vspace{-4mm}
          \scriptsize
          \setlength{\tabcolsep}{0.6mm}
          \begin{tabular}{@{}c|c|ccc@{}}
            \toprule[1.1pt]
            Organ & Method & Memory(GB)$\downarrow$ & Time(h)$\downarrow$ & Dice\_AVG(\%)$\uparrow$ 
            \\
            \hline
            \multirow{3}{*}{\makecell[c]{Task02\\Heart\\(MRI)}}
            & Scratch & 19.73 & 1.05 & 91.95 \\
            & Full & 19.73 & 1.06 & 93.73 \\
            & \cellcolor{gray!30}\textbf{Ours} & \cellcolor{gray!30}\textbf{13.44} & \cellcolor{gray!30}\textbf{0.86} & \cellcolor{gray!30}\textbf{95.84} \\
            \hline
            \multirow{3}{*}{\makecell[c]{Task06\\Lung\\(CT)}}
            & Scratch & 23.51 & 8.39 & 65.82 \\
            & Full & 23.51 & 8.39 & 67.69 \\
            & \cellcolor{gray!30}\textbf{Ours} & \cellcolor{gray!30}\textbf{20.30} & \cellcolor{gray!30}\textbf{8.03} & \cellcolor{gray!30}\textbf{78.09} \\
            \hline
            \multirow{3}{*}{\makecell[c]{Task09\\Spleen\\(CT)}}
            & Scratch & 20.32 & 3.21 & 95.76 \\
            & Full & 20.32 & 3.21 & 96.52\\
            & \cellcolor{gray!30}\textbf{Ours} & \cellcolor{gray!30}\textbf{19.71} & \cellcolor{gray!30}\textbf{2.22} & \cellcolor{gray!30}\textbf{97.06} \\
            \bottomrule[1.1pt]
          \end{tabular}
      \setlength{\tabcolsep}{0.6mm}
        \label{tab:msd}
        \vspace{-3mm}
    \end{minipage}
\end{minipage}
\vspace{-3mm}

\section{Conclusion}
\vspace{-2mm}
In this work, we present a new PET framework named Med-Tuning with strong generalization capabilities for the practical application of MVS.
Taking advantage of both spatial relationship modeling (coarse/fine-grained) and volumetric correlations, our framework achieves better volumetric segmentation accuracy on 2D baselines pre-trained on relatively easily acquired natural images.
To some extent, Med-Tuning could consistently and sustainably boost the segmentation performance of pre-trained models on MVS tasks, keeping pace with the rapid development of foundation models in the computer vision field.


\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version

\midlacknowledgments{Jianbo Jiao is supported by the Royal Society grants IES\textbackslash R3\textbackslash 223050 and SIF\textbackslash R1\textbackslash 231009.}


\bibliography{midl24_43}


\appendix


\section{Implementation Details.}
\label{appl:details}

\subsection{Details about Benchmark Datasets}
\label{app:benchmarkdata}
Details about BraTS 2019, BraTS 2020 and KiTS 2019 datasets are shown in \tableref{tab:dataset}.

\subsection{Details about Benchmark Validation Experiments.}
\label{app:benchmarkexperiment}
We employ pre-trained weights from two exemplary Transformer-based backbones, Swin Transformer tiny~\cite{liu2021swin} pre-trained on ImageNet-1k and Vision Transformer base version (ViT-B/16)~\cite{dosovitskiy2020vit} pre-trained on ImageNet-21k~\cite{deng2009imagenet}.
Swin-UNet~\cite{cao2022swin} and ViT~\cite{dosovitskiy2020vit} with UPerNet~\cite{xiao2018unified} decoder (ViT+UPerNet) are chosen as two robust baselines to ensure equitable comparison.
As shown in \tableref{table1}, the specific implementation details on BraTS 2019, BraTS 2020, and KiTS 2019 datasets for two baselines are comprehensively illustrated.
On all three benchmark datasets, models are fine-tuned with a batch size of 16 and the Adam optimizer.

During training, the following data augmentation techniques are applied to the BraTS 2019 and BraTS 2020 datasets: (1) random cropping from $240\times240\times155$ to $128\times128\times128$ voxels; (2) random mirror flipping across the axial, coronal and sagittal planes with a probability of 0.5; (3) random intensity shift within $[-0.1, 0.1]$ and scaling within $[0.9, 1.1]$. 
$L2$ Norm is also applied for regularization with a weight decay rate of $10^{-5}$. 
As for the KiTS 2019 dataset, the employed data augmentations follow the prior work~\cite{isensee2021nnu}.

\begin{table}
    \centering
    \caption{Details about BraTS 2019, BraTS 2020 and KiTS 2019 datasets.}
    \begin{tabular}{c|c|c|c|c}
    \toprule[1.1pt]
    Dataset & Modality & \makecell[c]{Number of \\Training Cases} & \makecell[c]{Number of \\Test Cases} & Spatial Resolution\\
    \hline
    KiTS 2019 & CT & 210 & 90 & $512\times 512$\\
    BraTS 2019 & MRI & 335 & 125 & $240 \times 240 \times 155$\\
    BraTS 2020 & MRI & 369 & 125 & $240 \times 240 \times 155$\\
    \bottomrule[1.1pt]
    \end{tabular}
    \label{tab:dataset}
\end{table}
\begin{table}[htbp]
  \centering
  \caption{Implementation details on BraTS 2019, BraTS 2020 and KiTS 2019 datasets for two baselines (i.e., Swin-UNet, ViT+UPerNet).}
  \setlength{\tabcolsep}{1mm}
  \begin{tabular}{@{}c|c|c|c|c|c|c@{}}
    \toprule[1.1pt]
    Dataset
    & Baseline 
    & Backbone 
    & \makecell[c]{Pre-trained \\Weight} 
    & \makecell[c]{Learning \\rate} 
    & \makecell[c]{Training\\ epochs}
    & \makecell[c]{Warm-up \\epochs}
    \\
    \hline
    \multirow{2}{*}{\makecell[c]{BraTS 2019 \&\\ BraTS 2020}} & Swin-UNet & Swin-T & ImageNet-1k & 0.002 & 250 & 60 \\
    \cline{2-7}
     & ViT+UPerNet & ViT-B/16 & ImageNet-21k & 0.002  & 250 & 25 \\
    \hline
    \multirow{2}{*}{KiTS 2019} & Swin-UNet & Swin-T & ImageNet-1k & 0.002 & 500 & 20 \\
    \cline{2-7}
      & ViT+UPerNet & ViT-B/16 & ImageNet-21k & 0.004  & 500 & 20 \\
    \bottomrule[1.1pt]
    \end{tabular}
\label{table1}
\end{table}

\subsection{Details about Generalization Capability Experiments.}
\label{app:msd}
We conduct ablation experiments to investigate the generalization capability of our Med-Tuning on the 3D baseline and pre-trained weight on the medical dataset.
The Medical Segmentation Decathlon (MSD)~\cite{antonelli2022medical} dataset includes 10 segmentation tasks covering various organs and image modalities. These tasks are intentionally diverse, presenting challenges like limited training data, class imbalances, multi-modality data, and small objects. 
In the main text, we have validated our approach on two MRI and one CT benchmark dataset. For ablation experiments, we selected one MRI and two CT datasets (i.e., Task02 Heart (MRI), Task06 Lung (CT), and Task09 Spleen (CT)) from the MSD dataset. Dataset pre-processing followed the protocol outlined in Swin UNETR~\cite{tang2022self}.
In \tableref{tab:msd}, Memory(GB) represents the memory usage during fine-tuning, Time(h) denotes the fine-tuning time, and Dice\_AVG signifies the average of the multi-class Dice scores for the corresponding segmentation task.


\subsection{The position of the inserted parameters.}
Regarding the insertion position for the parameters, for SwinUnet-Tiny and SwinUNETR, we have incorporated the Med-Adapter exclusively within each transformer layer of their encoder. This results in a total of 8 Med-Adapters, calculated from $4\times2$ (number of stages $\times$ number of layers in each stage). Within each stage, the second Med-Adapter is designated for Inter-FI. In the case of ViT+UPerNet, a Med-Adapter is inserted following every layer, amounting to a total of 12 (number of layers in ViT-B/16) Med-Adapters. Specifically, the Med-Adapters placed after the 2nd, 5th, 8th, and 11th layers are used for Inter-FI, maintaining a division of the ViT encoder into 4 stages, similar to the Swin Transformer setup. The relative positioning between Med-Adapters and Transformer blocks can be referenced in Figure 2 of our manuscript. Through our experiments, we have determined that the insertion position illustrated in Figure 2 of the manuscript represents the optimal configuration, as currently established.
\subsection{The scale of bottleneck features of Med-Adapter.}
As indicated in \tableref{tab:ratio}, the default reduction ratio is set to 6. Consequently, for all baselines, the scale of the bottleneck features of the adapter is $L/6$, where $L$ is the base scale of features in each stage (i.e., the scale of the input features of Med-Adapter). Specifically, the scales of the bottleneck features of the 8 Med-Adapters in SwinUnet-Tiny or SwinUNETR are $[16, 16, 32, 32, 64, 64, 128, 128]$. The scales of the bottleneck features of the 12 Med-Adapters in ViT+UPerNet are all $128$.

\section{Visualization Comparisons}
\label{appl:visual}
\subsection{Visualization Comparisons with other PET methods}
Comparison with full fine-tuning, head tuning and previous PET methods in terms of the trade-off between the number of tuned parameters and segmentation accuracy is shown in \figureref{fig2}. The experiments were conducted using ViT+UPerNet as the baseline on the BraTS 2019 dataset. The horizontal axis represents the parameters involved in model training during the fine-tuning stage, while the vertical axis denotes the mean Dice scores for ET, WT, and TC. Our method achieves much better segmentation performance than full fine-tuning and previous state-of-the-art PET methods with far fewer tuned parameters.
\begin{figure}
    \centering
    \includegraphics[width=0.6\textwidth]{Figures/compare.pdf}
    \caption{Comparison with previous PET methods in terms of the number of tuned parameters and Dice scores. }
    \label{fig2}
\end{figure}


\subsection{Visualization of BraTS 2019}
Qualitative results on the BraTS 2019 dataset are shown in \figureref{fig3}, in comparison with full fine-tuning, ST-Adapter and VPT. 
As the labels for the validation set are not available, five-fold cross-validation is conducted on the training set for visualization.
Our method recognizes brain tumors in enhancing and non-enhancing regions more accurately and reduces missed or false identification of the peritumoral edema in general.

\begin{figure}
    \centering
    \includegraphics[width=0.8\textwidth]{./Figures/visual_comparison.pdf}
    \begin{tabu} to 1.0\linewidth{X[1.0c] X[1.0c] X[1.0c] X[1.0c] X[1.0c]} 
        \scriptsize{Full} &  \scriptsize{ST-Adapter} &  \scriptsize{VPT} &  \scriptsize{\textbf{Ours}} &  \scriptsize{GT} \\
    \end{tabu}
    \vspace{-8mm}
    \caption{
    The visual comparison of segmentation results on BraTS 2019. The \textcolor{blue}{blue}, \textcolor{red}{red} and \textcolor{green}{green} regions denote the enhancing tumors, non-enhancing tumors, and peritumoral edema, respectively. GT=Ground Truth.
    }
    \label{fig3}
\end{figure}


\subsection{Visualization of KiTS 2019}
As the labels for the validation set are not available, five-fold cross-validation is conducted on the training set for visualization.
The qualitative results on the KiTS 2019 dataset, depicted in \figureref{supplementary_visualonKiTS 2019}, highlight the superior performance of our method in organ and tumor segmentation compared to full fine-tuning, ST-Adapter, and VPT. Our approach demonstrates enhanced accuracy in segmenting organs and tumor types, producing finer-grained segmentation masks for corresponding tumors.

\begin{figure}
    \centering
    \includegraphics[width=0.8\textwidth]{./Figures/visual_comparison_supplementary_kits19.pdf}
    \begin{tabu} to 0.92\linewidth{X[1.0c] X[1.0c] X[1.0c] X[1.0c] X[1.0c]} 
        \scriptsize{Full} &  \scriptsize{ST-Adapter} &  \scriptsize{VPT} &  \scriptsize{\textbf{Ours}} &  \scriptsize{GT} \\
    \end{tabu}
    \caption{
    The visual comparison of segmentation results on KiTS 2019. The \textcolor{red}{red} and \textcolor{green}{green} regions denote the kidneys and kidney tumors. GT=Ground Truth.
    }
    \label{supplementary_visualonKiTS 2019}
\end{figure}



\section{More Ablation Studies and Analysis}
\label{appl:ablation}
\subsection{Reduction Ratio in Bottleneck Design.}
\begin{table}[htbp]
  \centering
  \setlength{\tabcolsep}{2mm}
  \begin{tabular}{@{}l|c|c|cccc@{}}
    \toprule[1.1pt]
    \multirow{2}{*}{Method} & \multirow{2}{*}{\makecell[c]{Tuned\\Params(M)}} &
    \multirow{2}{*}{\makecell[c]{Inserted\\Params(M)}} & \multicolumn{4}{c}{Dice (\%) $\uparrow$}\\
    \cline{4-7}
    &  &  & ET & WT & TC & Avg. \\
    \hline 	 	 	 
    $\alpha$=2 & 10.064 & 3.313   & 76.89 & 90.14 & 81.92 & 82.99   \\
    $\alpha$=4 & 7.994  & 1.243   & \textbf{77.22} & 90.09 & 81.59 & 82.97   \\
    \rowcolor{gray!30} $\alpha$=6 & 7.489  & 0.738   & 77.06 & \textbf{90.28} & \textbf{82.71} & \textbf{83.35}   \\
    $\alpha$=8 & 7.271  & 0.520   & 76.94 & 89.62 & 80.74 & 82.44   \\
    \bottomrule[1.1pt]
  \end{tabular}
   \caption{Ablation study on reduction ratio $\alpha$. Swin-UNet with Swin-T pre-trained on supervised ImageNet-1k.}
   \label{tab:ratio}
\end{table}

We analyze the effect of different reduction ratios of the bottleneck structure in our Med-Adapter. Note that the reduction ratio $\alpha$ here is a key factor that influences the tuned parameters introduced by our Med-Adapter. 
Four diverse settings of $\alpha$ are selected. 
As shown in \tableref{tab:ratio}, Med-Tuning achieves a promising trade-off between segmentation accuracy and the tuned parameter costs with $\alpha=6$. 
On this basis, higher $\alpha$ would cause inferior model performance because of the deteriorated representation capability with limited tuned parameters, while lower $\alpha$ would lead to a certain degree of information redundancy and a sharp increase of tuned parameters, resulting in both decreased segmentation accuracy and high training costs.


\subsection{Design for Global Dependency Modeling.}
\begin{table}[htbp]
  \caption{Ablation study on different designs for global dependency modeling. The baseline is Swin-UNet with Swin-T pre-trained on supervised ImageNet-1k. DWConvK denotes depth-wise convolution with a kernel size of K $\times$ K.}
  \centering
  \setlength{\tabcolsep}{2mm}
  \begin{tabular}{@{}l|c|c|cccc@{}}
    \toprule[1.1pt]
    \multirow{2}{*}{Method} & \multirow{2}{*}{\makecell[c]{Tuned\\Params(M)}} &
    \multirow{2}{*}{\makecell[c]{Inserted\\Params(M)}} & \multicolumn{4}{c}{Dice (\%) $\uparrow$}\\
    \cline{4-7}
    &  &  & ET & WT & TC & Avg. \\
    \hline
    DWConv9  & 7.837 & 1.086 & 76.48  & \textbf{90.58}  & 81.10  & 82.72\\
    DWConv11  & 8.126 & 1.375 & 76.82 & 89.40  & 80.05  & 82.09   \\
    \rowcolor{gray!30}FFT & 7.994 & 1.243  & \textbf{77.22} & 90.09 & \textbf{81.59} & \textbf{82.97} \\
    \bottomrule[1.1pt]
  \end{tabular} 
  
\label{tab:global}
\end{table}

In order to pursue the most effective and parameter-efficient architecture of our proposed Med-Adapter, we also investigate different designs for the global branch in our Med-Adapter block to achieve global dependency modeling. 
Since convolutional blocks with a large kernel size or self-attention are usually adopted by previous works for global contextual modeling and the baseline Swin-UNet itself consists of plenty of self-attention operation in each local window, we take the depth-wise convolution with a kernel size of 9 and 11 separately to replace our originally employed Fast Fourier Transform (i.e., FFT) branch for a comprehensive comparison.  
The comparison of the segmentation performance and tuned model parameters is shown in \tableref{tab:global}. 
It can be noticed that by taking advantage of the parameter-efficient FFT branch for effective long-range context modeling, the architecture with the FFT branch achieves the optimal trade-off between model performance and tuned parameters, reaching the best segmentation accuracy with only $1.243$M introduced model parameters. 
In contrast, too large kernel size of the employed convolutions (i.e., DWConv11) will result in a burdensome model structure and a large amount of tuned parameter cost.

\begin{table}
  \centering
  \caption{Ablation study on Intra-FE. The first row is the result of the Vanilla Adapter.
  }
      \label{tab:multiscale}
      \setlength{\tabcolsep}{2mm}
      \begin{tabular}{@{}ccc|c|c|ccc@{}}
        \toprule[1.1pt]
        \multirow{2}{*}{MLB} &
        \multirow{2}{*}{FGB} &
        \multirow{2}{*}{Conv$_{1 \times 1 \times 1}$} &
        \multirow{2}{*}{\makecell[c]{Tuned\\Params(M)}} &
        \multirow{2}{*}{\makecell[c]{Inserted\\Params(M)}} & 
        \multicolumn{3}{c}{Dice (\%) $\uparrow$}\\
        \cline{6-8}
        &  &   &  &  & ET & WT & TC \\
        \hline 	 	 	 	 	 
        - & - & -        
        & 7.541   & 0.790   & 75.13  &  87.50 & 75.29 \\
        \checkmark & - & -             
        & 7.574   & 0.823   & 75.19  &  89.44 & 80.89    \\ 	 	 	  	  	 	
        \checkmark & \checkmark & -
        & 7.577   & 0.825  &  75.30  &  89.93  & \textbf{81.93}   \\
        \rowcolor{gray!30} \checkmark & \checkmark & \checkmark
        & 7.675 & 0.924  & \textbf{77.10} & \textbf{90.05} & 81.02 \\
        \bottomrule[1.1pt]
      \end{tabular}
\end{table}      

\begin{table}
    \centering
      \caption{Ablation study on Inter-FI.}
      \label{tab:interstage}
      \setlength{\tabcolsep}{2mm}
      \begin{tabular}{@{}l|c|c|ccc@{}}
        \toprule[1.1pt]
        \multirow{2}{*}{Method} & \multirow{2}{*}{\makecell[c]{Tuned\\Params(M)}} &
        \multirow{2}{*}{\makecell[c]{Inserted\\Params(M)}} & \multicolumn{3}{c}{Dice (\%) $\uparrow$}\\
        \cline{4-6}
        &  &  & ET & WT & TC \\
        \hline
        Add & 7.896  & 1.144  & 75.79 & 88.99 & 79.00 \\ %add
        Max & 7.896  & 1.144  & 75.22 & 89.72 & 81.41  \\%add
        \rowcolor{gray!30} Concat & 7.994 & 1.243  & \textbf{77.22} & \textbf{90.09} & \textbf{81.59} \\  %add 
        \bottomrule[1.1pt]
      \end{tabular}
\end{table} 

\subsection{Intra-FE Module Design.}
We first probe into the rationale of the proposed Intra-FE Module without Inter-FI on Swin-UNet baseline. 
Swin-UNet with Swin-T pre-trained on supervised ImageNet-1k is taken as a baseline.
As presented in \tableref{tab:multiscale}, the introduction of MLB, FGB, or channel mixing consistently leads to a considerable performance increase. 
Specifically, with only 0.002M additional tuned parameters, the FGB branch greatly improves the segmentation accuracy, 
showing the effectiveness and parameter efficiency of our employed FGB branch. 
Additionally, channel mixing further boosts the performance by a large margin, especially on ET ($\uparrow 1.80\%$). 

\subsection{Inter-FI Module Design.}
After investigating the effect of the intra-stage feature enhancement, we further verify the effectiveness of the inter-stage feature interaction, as shown in \tableref{tab:interstage}. Compared with the intra-only structure (i.e., without the feature connectivity between adjacent Med-Adapters), the model with inter-stage achieves a considerable performance gain with only 0.319M extra parameters for feature alignment among adjacent stages, showing the effectiveness of our inter-stage interaction.
Unlike concatenation which maintains the feature representations of different stages as much as possible, direct addition or taking the maximum value (at each pixel) of neighboring feature maps with diverse semantic levels would unintentionally degrade the original feature representation, resulting in a sharp decrease in segmentation performance.



\subsection{Decoder Design.}

\begin{table}[htbp]
  \centering 	 	 	 
  \setlength{\tabcolsep}{2mm}
  \caption{Ablation study on decoder design. ViT-B/16 is pre-trained on supervised ImageNet-1k.}
  \begin{tabular}{@{}l|c|c|cccc@{}}
    \toprule[1.1pt]
    \multirow{2}{*}{Method} & \multirow{2}{*}{\makecell[c]{Tuned\\Params(M)}} &
    \multirow{2}{*}{\makecell[c]{Decoder\\Params(M)}} & \multicolumn{4}{c}{Dice (\%) $\uparrow$}\\
    \cline{4-7}
    &  &  & ET & WT & TC & Avg. \\
    \hline
    \rowcolor{gray!30} UPerNet (Default)    & 19.562 & 15.095    & 68.27 & 87.22 & 81.63 & 79.04 \\
    U-Net      & 9.269  & 4.712     & 67.68 & \textbf{88.08} & 81.72 & 79.16   \\
     SETR-MLA   & 8.347  & 3.790     & 68.12  & 87.91 & \textbf{81.98}  & \textbf{79.34}   \\
    SETR-Naive & 5.004  & 0.447     & \textbf{69.11}  & 86.93  & 81.71  & 79.25   \\
    SETR-PUP   & 5.200  & 0.643     & 68.55  & 86.51  & 80.42  & 78.49   \\
        \bottomrule[1.1pt]
  \end{tabular}
  \label{tab:decoder}
  
\end{table}


Here we explore the effect of different decoder designs in our architecture.
Although the backbone is frozen and only the inserted Med-Adapters as well as the decoder are updated during fine-tuning, the tuned model parameters introduced by the segmentation decoder cannot be regarded as negligible. 
In other words, to pursue an extremely parameter-efficient tuning framework, the design of the employed decoder should be sufficiently lightweight with strictly controlled model parameters.
Thus, various segmentation decoders with greatly varied model complexity are introduced respectively for a thorough analysis. 
As shown in \tableref{tab:decoder}, ViT-B/16 with the SETR-MLA decoder reaches the best trade-off between segmentation accuracy and tuned parameter costs, benefiting from the effective multi-scale feature aggregation. 
Besides, taking the simplest SETR-Naive, which is composed of a convolution and an interpolation operation for upsampling, as the decoder leads to the lowest number of tuned parameters (5.004M) while achieving promising segmentation performance with an average Dice score of 79.25\%. 
It can be seen from \tableref{tab:decoder} that although the decoder size dominantly decides the overall tuned parameters, it does not show a direct impact on model performance.


\subsection{Data Efficiency.}
\begin{table}[htbp]
  \centering
  
  \caption{Ablation study on data efficiency property with pre-trained ViT-B/16.}
  \label{tab:data_effi}
  \setlength{\tabcolsep}{2mm}
  \begin{tabular}{@{}c|c|c|c|ccc|ccc@{}}
    \toprule[1.1pt]
    \multirow{2}{*}{\makecell[c]{Dataset \\Ratio}} & \multirow{2}{*}{Method} & \multirow{2}{*}{\makecell[c]{Memory\\Cost (GB)$\downarrow$}} & \multirow{2}{*}{\makecell[c]{Training\\Time (h)$\downarrow$}} & \multicolumn{3}{c}{Dice (\%) $\uparrow$} & \multicolumn{3}{c}{HF (mm)$\downarrow$}\\
    \cline{5-10}
    &  &  &  & ET & WT & TC & ET & WT & TC\\
    \hline
    100\% & Full & 16.55 & 1.34 & 68.04 & 85.74 & 76.58 & 6.94 & 7.28 & 7.99 \\
    \rowcolor{gray!30} 100\% & Ours & 13.53 & 1.20 & \textbf{75.46} & \textbf{86.80} & \textbf{86.24} & \textbf{3.78} & 6.94 & \textbf{4.34}\\
    75\% & Ours & 13.53 & 1.05 & 69.12 & 86.69 & 78.06 & 6.33 & \textbf{6.01} & 6.63 \\
    50\% & Ours & 13.53 & 0.72 & 69.19 & 86.26 & 77.26 & 6.28 & 7.03 & 7.12 \\
    25\% & Ours & 13.53 & 0.39 & 67.43 & 85.64 & 74.57 & 6.32 & 7.71 & 8.14  \\
    5\% & Ours & 13.53 & 0.17 & 59.61 & 80.44 & 64.01 & 15.07 & 16.64 & 16.36 \\
    \bottomrule[1.1pt]
  \end{tabular}
\end{table}
At last, we also explore the data efficiency property of our method by examining performance across various training data ratios, particularly in low-data settings.
\tableref{tab:data_effi} shows the quantitative comparison with different numbers of training samples.
Our Med-Tuning can already achieve comparable performance to full fine-tuning using only \textbf{25\%} training data. 
As the scale of training data increases, our method consistently improves the segmentation accuracy, with reduced training time and memory cost compared with full fine-tuning.

\subsection{Other Weight Pre-trained on Medical Image Datasets.}
Med-Tuning is not solely focused on pushing SOTA. Instead, it allows us to capitalize on the extensive progress made in natural image processing. This perspective underscores our belief in the potential and value of integrating advancements from one domain to enhance the capabilities and applications in another.

Indeed, as highlighted in recent literature \cite{liu2023clip,silva2023towards,ulrich2023multitalent}, there have been significant advancements in the field of medical image pre-trained models. Nevertheless, due to the considerable constraints of time, monetary resources, and clinical applicability faced by many researchers working on medical image pre-training, the pace of updates and the scale of medical image pre-training efforts still trail behind those in the natural image domain. Additionally, the use of many open-source codes in the medical imaging field presents a high threshold. Therefore, the vast array of convenient and accessible large-scale pre-trained weights from the natural image domain have become our primary choice.

Based on the above choices, we hypothesize that: If Med-Tuning can tackle the more challenging task of a large domain shift from features pre-trained on natural 2D images to CT/MRI volumes, then it is also capable of addressing the comparatively easier task of domain shift from features pre-trained on medical images to CT/MRI volumes. The experimental results in our manuscript have demonstrated the feasibility of a broader transfer process, thereby validating the effectiveness of our proposed approach in achieving the former scenario.

Regarding the latter scenario, we conducted experiments using the same baselines and pre-trained weights as those in \cite{liu2023clip,silva2023towards}, following our default training setting. The comprehensive results of all experiments are depicted in \tableref{tab:universal}.

\begin{table}[htbp]
  \centering
  \caption{The comparison between the original SwinUNETR, the Universal Model, and our proposed Med-Tuning. The performance is evaluated by average Dice scores. ``W1'' signifies the use of the model and pre-training weights from \cite{tang2022self}, while ``W2'' references the model and pre-training weights from \cite{liu2023clip}. ``SCR'' denotes that the model is trained from scratch and ``FULL'' denotes the full fine-tuning method. The first two columns of scores were directly copied from \cite{liu2023clip}. The last five columns of scores were obtained through training using our framework.}
  \label{tab:universal}
  \setlength{\tabcolsep}{0.1mm}
  \begin{tabular}{@{}c|ccccccc@{}}
    \toprule[1.1pt]
    Dataset& \makecell[c]{SwinUNETR\\(SCR)} & \makecell[c]{Universal\\Model\\(FULL)} & \makecell[c]{Ours\\(SCR)} & \makecell[c]{Ours\\(FULL, W1)} & \makecell[c]{Ours\\(Med-Tuning,\\W1)} & \makecell[c]{Ours\\(FULL, W2)} & \makecell[c]{Ours\\(Med-Tuning,\\W2)}\\
    \hline
    \makecell[c]{Task06\\Lung} & 68.90 & 67.15 & 65.82 & 67.69 & 78.09 & 68.37 & \textbf{78.53}\\
    \makecell[c]{Task09\\Spleen} & 95.80 & 96.71 & 95.76 & 96.52 & 97.06 & 96.35 & \textbf{97.60}\\
    \bottomrule[1.1pt]
  \end{tabular}
\end{table}

From our results, Med-Tuning proves capable of consistently improving the precision of medical volumetric segmentation when using medical pre-trained weights, while requiring only a small number of trainable parameters for this enhancement. Besides, the trends observed in our experimental results suggest that our proposed approach can keep pace with the development of visual models pre-trained in the medical domain, aligning with the conclusions drawn at the end of our manuscript.

Finally, we would like to add that, as demonstrated by our results in \tableref{tab:universal}, using the W2 weights improved the Dice score by 0.6 over the W1 weights. Hence, we also look forward to the widespread development of large-scale pre-trained models like \cite{liu2023clip} in the medical domain and are excited about the potential to further enhance their performance using our Med-Tuning.

\subsection{Training Time.}
Under the default training settings, the training times of all methods are listed in \tableref{tab:train_time}. The results indicate that the introduction of a few new trainable parameters inevitably results in a slight increase in training time. However, our method achieves a commendable balance between training time cost and the tuning of parameters.
\begin{table}[]
    \centering
    \caption{Comparison of training time (hours) on BraTS2019 with the SwinUnet and ViT+UPerNet backbones.}
    \begin{tabular}{c|cc}
    \toprule[1.1pt]
        Method&ViT+UPerNet&SwinUnet\\
        \hline
        Scratch&1.74h&1.26h\\
        Full&1.73h&1.26h\\
        Head&1.28h&1.02h\\
        VPT-Shallow&1.09h&0.98h\\
        VPT-Deep&1.18h&1.01h\\
        Adapter&1.77h&1.30h\\
        AdaptFormer&1.44h&1.18h\\
        Pro-tuning&1.84h&1.47h\\
        ST-Adapter&1.79h&1.55h\\
        Ours&1.88h&1.51h\\
    \bottomrule[1.1pt]
    \end{tabular}
    \label{tab:train_time}
\end{table}

\end{document}