\documentclass{midl} % Do NOT use the 'anon' option
% Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{array}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{makecell}
\usepackage[table]{xcolor}
\usepackage{adjustbox}
\usepackage{pifont}
\usepackage{float}
\usepackage{placeins}

\newcommand{\cmark}{\ding{51}} 
\newcommand{\xmark}{\ding{55}} 

\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- 211}
\editors{Accepted for publication at MIDL 2026}

\title[DINOv3-FD]{Incentivizing DINOv3 Adaptation for Medical Vision Tasks via Feature Disentanglement}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{
\Name{Zhicheng He\nametag{$^{1,}$}}\midljointauthortext{Contributed equally} \orcid{0009-0000-6233-440X} \Email{e1583419@u.nus.edu}\\
\Name{Yibing Fu\nametag{$^{1,}$}}\midlotherjointauthor 
\Email{yibingfu@u.nus.edu}\\
\Name{Yueming Jin\nametag{$^{1,}$}}\midljointauthortext{Corresponding author}\Email{ymjin@nus.edu.sg}\\
\addr $^{1}$ National University of Singapore
}

\begin{document}

\maketitle

\begin{abstract}
The emerging general vision foundation models such as DINOv3 have demonstrated remarkable representation learning capability in natural image domains. However, transferring these representations to medical imaging is challenging due to substantial domain discrepancies. To bridge this gap, parameter-efficient fine-tuning (PEFT) has emerged as a promising strategy to adapt these vision foundation models to medical vision tasks by updating only a small subset of parameters while preserving pretrained knowledge. Despite the efficiency, existing PEFT strategies overlook that pretrained features inherently interleave task-relevant semantics with task-irrelevant patterns and noise, potentially limiting effective adaptation in medical scenarios. To address this challenge, we propose DINOv3-FD, a task-oriented feature disentanglement framework that adapts DINOv3 to medical vision tasks. DINOv3-FD introduces a dual-stream adapter that separates features into task-relevant and task-irrelevant subspaces, reinforced by an orthogonality loss to encourage their mutual independence. Additionally, a distributional regularization loss drives the task-irrelevant branch toward task-agnostic predictions, discouraging it from encoding task-specific semantics. Consequently, the task-relevant stream is encouraged to retain more discriminative representations that facilitate downstream medical tasks. Experimental results show that DINOv3-FD outperforms other PEFT strategies over three medical classification tasks, demonstrating the effectiveness of feature disentanglement. Our code is available at \url{https://github.com/hezhicheng2002/DINOv3-FD}.
\end{abstract}

\begin{keywords}
Feature Disentanglement, Representation Learning, Medical Image Classification
\end{keywords}

\section{Introduction}

Recent advancements of general vision foundation models (GVFMs) which are pretrained on web-scale natural images have been revolutionizing the computer vision domain. Models such as MAE \cite{he2022masked}, DINO \cite{caron2021emerging}, and iBOT \cite{zhou2022image} provide strong visual priors for downstream tasks, significantly enhancing their performance across a broad spectrum of visual tasks, including classification, segmentation, detection, and so on. More recently, DINOv3 \cite{simeoni2025dinov3} has further unleashed the potential of self-distillation strategies by tailoring the pretraining strategy over web-scale images, demonstrating remarkable performance and scalability. Although GVFMs provide powerful representations in natural image domains, their performances commonly diminish on medical imaging tasks because of significant domain shifts.
 
To narrow this gap, the medical AI community has increasingly focused on adapting GVFMs pretrained on natural images to medical vision tasks through parameter-efficient fine-tuning (PEFT) strategies. Rather than updating all model parameters, PEFT strategies such as Low-Rank Adaptation (LoRA) \cite{hu2021lora} and Adapters \cite{houlsby2019parameter} selectively optimize only a small subset of parameters while keeping the majority frozen. These approaches effectively preserve the general visual priors encoded in the pretrained models while enabling efficient adaptation to medical downstream tasks. Moreover, they substantially reduce computational overhead and mitigate overfitting under low-data regimes, which are common in the medical imaging domain \cite{fu2025unleashing}. These advantages make PEFT a highly appealing paradigm, which naturally raises a question: \textbf{\textit{How can the latest DINOv3 be effectively adapted to medical vision tasks}?}

Previous studies have demonstrated the effectiveness of adapting GVFMs to medical imaging tasks via PEFT strategies. For instance, Veasey and Amini leverage LoRA to adapt GVFMs for lung nodule malignancy classification, achieving improved performance with substantially fewer trainable parameters~\cite{veasey2025low}. Wu \textit{et al.} propose Medical SAM Adapter, which incorporates lightweight adapters to adapt SAM for 2D and 3D medical image segmentation tasks~\cite{wu2025medical}. Despite their efficiency, such existing PEFT methods do not explicitly consider how the features extracted by GVFMs contribute to the medical tasks, and instead apply uniform updates across the entire embedding or projection layers. However, features from GVFMs may contain a mixture of task-relevant, task-irrelevant, and even noisy information. Therefore, how to disentangle these components during adaptation in order to strengthen task-relevant features while suppressing irrelevant counterparts remains an important yet unexplored problem.

In this work, we propose DINOv3-FD, which adapts DINOv3 to medical vision tasks from the perspective of task-oriented feature disentanglement. Specifically, we develop a \textbf{dual-stream adapter framework} that decomposes the feature representations into task-relevant and task-irrelevant subspaces. To explicitly enforce this separation, we incorporate an \textbf{orthogonality loss} that promotes mutual independence between the two branches. Additionally, we design a \textbf{distributional regularization loss} on the task-irrelevant adapter, encouraging its predictions to approach a label-agnostic random distribution. This incentivizes the adapter to encode task-irrelevant features rather than task-discriminative semantics. By jointly leveraging these three components, DINOv3-FD enables DINOv3 to better preserve task-relevant representations while effectively isolating noisy or task-irrelevant features. We comprehensively evaluate our method on three medical image classification tasks, demonstrating the superior performance of our proposed method. In summary, our contributions are threefold:

\begin{itemize}
    \item We propose DINOv3-FD, which introduces a dual-stream adapter to adapt DINOv3 to medical vision tasks in a task-oriented feature disentanglement manner.
    \item We propose an orthogonality loss across the two subspaces, which promotes mutual independence between the feature representations.
    \item We design a distributional regularization loss for the task-irrelevant branch, pushing its predictions toward a label-agnostic distribution to filter task-irrelevant features.
\end{itemize}

\section{Related Works}

\subsection{Parameter-efficient fine-tuning}
Parameter-efficient fine-tuning (PEFT) has emerged as an appealing strategy for adapting GVFMs to data-constrained and heterogeneous medical imaging scenarios. Instead of updating all model parameters, PEFT selectively introduces lightweight modules or low-rank projection layers, achieving strong adaptation performance while retaining most pretrained prior knowledge. Early approaches began with adapter modules~\cite{houlsby2019parameter}, which integrated lightweight adaptation layers to enable task-specific adaptations. Subsequently, LoRA~\cite{hu2021lora} introduced low-rank updates to attention projections, which emerged as a milestone for fine-tuning foundation models. Following its success, numerous variants have been proposed to enhance LoRA's performance. For example, IA$^{3}$~\cite{liu2022few} reduced trainable parameters through learned multiplicative vectors applied to key components of transformer blocks. LyCORIS~\cite{yeh2024navigating} expanded LoRA through hybrid pathways that make modulation more flexible. Building on this direction, VeRA~\cite{kopiczko2024vera} reparameterized adaptation with frozen random low-rank matrices shared across layers and small learned scaling vectors for further compression. PaCA~\cite{woo2025paca} fine-tuned randomly selected partial connections within the pretrained weights instead of attaching separate adapter layers. However, these approaches mainly focus on how to design the adapter or projection layers while neglecting how to highlight task-relevant representations.

\subsection{Feature Disentanglement}

Feature disentanglement aims to reorganize latent feature representations such that task-specific information is distilled into a dedicated subspace, whereas task-irrelevant features or noises are isolated to another subspace since they may compromise the overall performance.
This motivation has led to substantial advances in techniques aimed at reducing statistical dependencies within learned feature representations. Cogswell \textit{et al.} introduced decorrelation penalties, exemplified by DeCov~\cite{cogswell2015reducing}, which encouraged separation by directly suppressing correlated activations across feature dimensions. Building on this principle, kernel-based dependence measures such as HSIC~\cite{gretton2005measuring,ma2020hsic} expanded the concept of independence by capturing nonlinear interactions through kernel embeddings. Variational approaches like MINE~\cite{belghazi2018mutual} further generalized these ideas by estimating mutual information, supporting the modeling of richer and more diverse statistical dependencies. Building on these principles, recent studies developed architectures that explicitly divide representational roles across different model components. Invariant-learning frameworks such as IRM~\cite{arjovsky2019invariant} and IIV~\cite{ahuja2021invariance} promoted consistency across environments, pushing environment-dependent variation into distinct representational directions. In medical imaging, related ideas have been adapted to domain-specific sources of variation. For example, Wang \textit{et al.} disentangled disease-related features from obscuring tissues via explicit factor separation~\cite{wang2022disentangling}. MIMM-X~\cite{fay2025mimm} reduced mutual dependence between causal and auxiliary components. Recent segmentation frameworks further disentangle lesion-relevant features from background to reduce annotation noise~\cite{xiong2024semi}.

\section{Method}

\subsection{Framework}

As illustrated in Fig.~\ref{fig:arch_overview}, our DINOv3-FD framework adopts a frozen DINOv3 \cite{simeoni2025dinov3} model as the vision encoder, while incorporating lightweight LoRA \cite{hu2021lora} layers to facilitate efficient feature adaptation. Given an input image, the DINOv3 backbone yields a global representation, which serves as the input to our feature disentanglement module. Specifically, this representation is routed into two parallel adapters, i.e., a task-relevant adapter (TRA) and a task-irrelevant adapter (TIA). During training, we introduce an orthogonality loss and a distributional regularization loss to disentangle and strengthen task-relevant features within the TRA, while isolating task-irrelevant features via the TIA. During inference, only the TRA is retained for task-oriented adaptation.

\begin{figure}[t]
\centering
\includegraphics[width=.95\linewidth]{framework.jpg}
\caption{
Overview of the DINOv3-FD framework. We leverage the DINOv3 integrated with LoRA layers as the vision encoder. The obtained global representation is routed to the task-relevant and irrelevant adapters for feature disentanglement. An orthogonality loss and a distributional regularization loss jointly boost feature disentanglement by promoting information independence and pushing the irrelevant branch toward label-agnostic predictions.}
\label{fig:arch_overview}
\end{figure}

\subsection{Dual-stream Adapter Framework}

In our DINOv3-FD framework, we leverage the latest vision foundation model DINOv3 and incorporate LoRA layers to build the vision encoder. Given a medical image, we extract the global representation $\boldsymbol{f}_{\mathrm{cls}} \in \mathbb{R}^d$ from the encoder, where $d$ denotes the embedding dimension. Subsequently, we route $\boldsymbol{f}_{\mathrm{cls}}$ to TRA and TIA to generate the task-relevant feature $\boldsymbol{f}_{\mathrm{T}} \in \mathbb{R}^d$ and task-irrelevant feature $\boldsymbol{f}_{\mathrm{I}} \in \mathbb{R}^d$. It is worth noting that both the TRA and TIA comprise two linear layers coupled with nonlinear activation functions, resulting in only minimal additional parameter overhead.

Subsequently, the task-relevant feature $\boldsymbol{f}_{\mathrm{T}}$ and task-irrelevant feature $\boldsymbol{f}_{\mathrm{I}}$ are processed by two linear classification heads to get the class decision scores as follows:
\begin{equation}
    \boldsymbol{\hat{y}}_{\mathrm{T}} = g_\phi(\boldsymbol{f}_{\mathrm{T}}),
    \qquad
    \boldsymbol{\hat{y}}_{\mathrm{I}} = g_\psi(\boldsymbol{f}_{\mathrm{I}}),
\end{equation}
where $g_\phi$ and $g_\psi$ denote the task-relevant and task-irrelevant prediction heads, respectively. Both $\boldsymbol{\hat{y}}_{\mathrm{T}}$ and $\boldsymbol{\hat{y}}_{\mathrm{I}}$ reside in
$\mathbb{R}^c$, where $c$ denotes the number of target classes. This dual-stream adapter framework empowers the model to explicitly separate interleaved features into task-relevant and irrelevant subspaces.

\subsection{Feature Disentanglement}

To further promote mutual independence between $\boldsymbol{f}_{\mathrm{T}}$ and $\boldsymbol{f}_{\mathrm{I}}$, we introduce an orthogonality loss for feature disentanglement. Specifically, inspired by \cite{belghazi2018mutual}, we first introduce a statistical dependency estimator $E_\omega$, which is implemented as a small multilayer perceptron that takes the concatenated feature
$[\boldsymbol{f}_{\mathrm{T}},\boldsymbol{f}_{\mathrm{I}}]$ as input. Following the Donsker--Varadhan lower bound on mutual information \cite{belghazi2018mutual}, the dependence between the two feature pathways is estimated as:
\begin{equation}
\mathcal{L}_{\mathrm{ortho}}
=
\mathbb{E}_{(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{pair}}\sim\mathcal{D}_{T,I}}
\!\left[
E_\omega([\boldsymbol{f}_{\mathrm{T}},\boldsymbol{f}_{\mathrm{I}}])
\right]
-
\log
\mathbb{E}_{(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{shuffle}}\sim \mathcal{D}_{T}\mathcal{D}_{I}}
\!\left[
\exp\!\left(E_\omega([\boldsymbol{f}_{\mathrm{T}},\boldsymbol{f}_{\mathrm{I}}])\right)
\right].
\end{equation}

In the above equation, $(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{pair}}$ refers to paired features, which are formed by combining $\boldsymbol{f}_{\mathrm{T}}$ and $\boldsymbol{f}_{\mathrm{I}}$ from the same input image. Conversely, $(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{shuffle}}$ denotes the shuffled features, which are formed by keeping $\boldsymbol{f}_{\mathrm{T}}$ fixed while randomly permuting $\boldsymbol{f}_{\mathrm{I}}$ across different images within a minibatch.

Mathematically, $(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{pair}}$ approximates samples from the joint distribution $\mathcal{D}_{T, I}(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})$ of the two subspaces, whereas $(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{shuffle}}$ approximates samples from the product of their marginals $\mathcal{D}_{T}(\boldsymbol{f}_{\mathrm{T}})\,\mathcal{D}_{I}(\boldsymbol{f}_{\mathrm{I}})$. By minimizing $\mathcal{L}_{\mathrm{ortho}}$, the predicted scores of the paired tuples and shuffled tuples become increasingly similar, which implies that the joint distribution $\mathcal{D}_{T, I}(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})$ becomes progressively harder to distinguish from the product distribution $\mathcal{D}_{T}(\boldsymbol{f}_{\mathrm{T}})\, \mathcal{D}_{I}(\boldsymbol{f}_{\mathrm{I}})$, i.e., the mutual information between $\boldsymbol{f}_{\mathrm{T}}$ and $\boldsymbol{f}_{\mathrm{I}}$ is low. As a result, this optimization procedure promotes statistically independent and disentangled feature pathways.

\subsection{Distributional Regularization}

Furthermore, to ensure that the TIA subspace captures task-irrelevant representations, we introduce a distributional regularization loss. Specifically, the output $\boldsymbol{\hat{y}}_{\mathrm{I}}$ is encouraged to approach a uniform distribution $\mathcal{U}$ over the $c$ classes. This is achieved by minimizing the Kullback--Leibler divergence between $\boldsymbol{\hat{y}}_{\mathrm{I}}$ and $\mathcal{U}$:
\begin{equation}
    \mathcal{L}_{\mathrm{reg}}
    =
    \sum_{i=1}^{c}
    \boldsymbol{\hat{y}}_{\mathrm{I},i}
    \log
    \frac{\boldsymbol{\hat{y}}_{\mathrm{I},i}}{\boldsymbol{u}_i},
\end{equation}
where $\boldsymbol{u}_i = 1/c$ and $\boldsymbol{\hat{y}}_{\mathrm{I},i}$ denotes the predicted probability of the $i$-th class.

By explicitly encouraging $\boldsymbol{\hat{y}}_{\mathrm{I}}$ toward a uniform distribution, the TIA space is discouraged from retaining informative features with respect to class labels. Synergistically, by combining the orthogonality loss $\mathcal{L}_{\mathrm{ortho}}$, the TRA subspace is motivated to concentrate on discriminative representations that are maximally aligned with the task objective. Consequently, the dual-stream representation achieves clearer task-oriented feature disentanglement and retains the task-relevant features in the TRA space.

\subsection{Overall Objective}

The full training objective integrates the task prediction loss with the two aforementioned components, which can be formulated as:
\begin{equation}
    \mathcal{L}
    =
    \mathcal{L}_{\mathrm{cls}}
    + \lambda_{\mathrm{ortho}}\,\mathcal{L}_{\mathrm{ortho}}
    + \lambda_{\mathrm{reg}}\,\mathcal{L}_{\mathrm{reg}}.
\end{equation}

In the above equation, $\mathcal{L}_{\mathrm{cls}}$ is the standard cross-entropy loss function for optimizing classification tasks. $\lambda_{\mathrm{ortho}}$ and $\lambda_{\mathrm{reg}}$ are hyperparameters that balance the contribution of the proposed orthogonality
loss and distributional regularization loss. The overall training strategy is summarized in Algorithm \ref{alg:dinov3-fd}.

\begin{algorithm2e}[htbp]
\caption{Training and inference with DINOv3\text{-}FD}
\label{alg:dinov3-fd}
\KwIn{Training set $\mathcal{D}_{\text{train}}$, test set $\mathcal{D}_{\text{test}}$}
\KwIn{Pretrained encoder $f_\theta$ with LoRA; adapters TRA and TIA; classification heads $g_\phi$ and $g_\psi$; dependency estimator $E_\omega$}
\KwIn{Hyperparameters $\lambda_{\mathrm{ortho}},\,\lambda_{\mathrm{reg}}$}

\BlankLine
\textbf{Training phase:}

\For{minibatch $(x, y) \subset \mathcal{D}_{\text{train}}$}{
    $\boldsymbol{f}_{\mathrm{cls}} \leftarrow f_\theta(x)$\;
    % $\boldsymbol{f}_{\mathrm{T}} \leftarrow \mathrm{TRA}(\boldsymbol{f}_{\mathrm{cls}})$,\quad
    
    $\boldsymbol{\hat{y}}_{\mathrm{T}} \leftarrow g_\phi(\mathrm{TRA}(\boldsymbol{f}_{\mathrm{cls}}))$,\quad   $\boldsymbol{\hat{y}}_{\mathrm{I}} \leftarrow g_\psi(\mathrm{TIA}(\boldsymbol{f}_{\mathrm{cls}}))$\;
    $\mathcal{L}_{\mathrm{cls}} \leftarrow \mathrm{CrossEntropy}(\boldsymbol{\hat{y}}_{\mathrm{T}}, y)$\;
    
    $\mathcal{L}_{\mathrm{reg}} \leftarrow
        \sum_{i=1}^{c}
        \boldsymbol{\hat{y}}_{\mathrm{I},i}
        \log
        \frac{\boldsymbol{\hat{y}}_{\mathrm{I},i}}{1/c}$\;
    $(\boldsymbol{f}_{\mathrm{T}},\boldsymbol{f}_{\mathrm{I}})_{\text{pair}} \leftarrow \text{same-image tuples}$,\quad$(\boldsymbol{f}_{\mathrm{T}},\boldsymbol{f}_{\mathrm{I}})_{\text{shuffle}} \leftarrow \text{cross-image tuples}$\;
    $\mathcal{L}_{\mathrm{ortho}} \leftarrow
    \mathbb{E}_{(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{pair}}\sim\mathcal{D}_{T,I}}
    \!\left[
    E_\omega([\boldsymbol{f}_{\mathrm{T}},\boldsymbol{f}_{\mathrm{I}}])
    \right]
    -
    \log
    \mathbb{E}_{(\boldsymbol{f}_{\mathrm{T}}, \boldsymbol{f}_{\mathrm{I}})_{\text{shuffle}}\sim \mathcal{D}_{T}\mathcal{D}_{I}}
    \!\left[
    \exp\!\left(E_\omega([\boldsymbol{f}_{\mathrm{T}},\boldsymbol{f}_{\mathrm{I}}])\right)
    \right]
    $\;
    
    $\mathcal{L} \leftarrow
    \mathcal{L}_{\mathrm{cls}}
    + \lambda_{\mathrm{ortho}}\,\mathcal{L}_{\mathrm{ortho}}
    + \lambda_{\mathrm{reg}}\,\mathcal{L}_{\mathrm{reg}}$\;
    
    Update TRA, TIA, $g_\phi$, $g_\psi$, $E_\omega$, and unfrozen PEFT parameters\;
}

\BlankLine
\textbf{Inference phase:}

\For{each image $x \in \mathcal{D}_{\text{test}}$}{
    $\boldsymbol{f}_{\mathrm{cls}} \leftarrow f_\theta(x)$\;
    $\boldsymbol{\hat{y}}_{\mathrm{T}} \leftarrow g_\phi(\mathrm{TRA}(\boldsymbol{f}_{\mathrm{cls}}))$\;
     
    \Return $\boldsymbol{\hat{y}}_{\mathrm{T}}$\;
}
\end{algorithm2e}

\section{Experiments}

\subsection{Datasets and Preprocessing}

We evaluate DINOv3-FD on three medical image classification tasks.
The RSNA Pneumonia dataset~\cite{rsna-pneumonia-detection-challenge} provides unique frontal-view chest radiographs labeled for pneumonia presence and follows a split of 21{,}346, 2{,}668, and 2{,}670 images for training, validation, and testing, respectively.
ISIC 2018~\cite{codella2019skin,tschandl2018ham10000} consists of dermoscopic images across seven diagnostic categories and adopts the official competition partition, with 10{,}015 cases for training, 193 cases for validation, and 1{,}512 cases for testing.
ODIR-5K~\cite{ODIR2019} contains 5{,}000 color fundus photographs labeled for eight ocular conditions; the imagefolder format expands its multi-label structure into class-wise directories, resulting in 6{,}392 images due to multi-label duplication, of which 5{,}110 are used for training, 635 for validation, and 647 for testing. Training images are augmented with scale-jittering and horizontal flipping, whereas validation and test images are resized to $256 \times 256$ and subsequently center-cropped to $224 \times 224$. All images are normalized using ImageNet statistics to ensure consistency with the original DINOv3 pretraining.

\subsection{Implementation Details}

\paragraph{Training details}
All experiments are conducted on one NVIDIA RTX A6000 GPU. Models are trained using the ViT-L/16 DINOv3 encoder as the backbone, with its \texttt{[CLS]} representation serving as the global feature. Training is performed with the AdamW optimizer using an initial learning rate of $5\times10^{-5}$, a batch size of 16, and a weight decay of $5\times10^{-2}$. The hyperparameters $\lambda_{\mathrm{ortho}}$ and $\lambda_{\mathrm{reg}}$ are set to 0.005 and 0.2, respectively.

\paragraph{Metrics}

We report classification accuracy (ACC) and area under the ROC curve (AUC).  
For binary tasks such as RSNA, AUC is computed from the sigmoid probability of the positive class, and ACC uses a threshold of 0.5.  
For multi-class datasets (ISIC and ODIR), we employ a single softmax classifier and compute macro One-vs-Rest AUC along with top-1 ACC. We additionally report the macro $F_{\mathrm{1}}$ score, which provides a balanced measure of per-class performance, especially for datasets with label imbalance.

\section{Results}

\subsection{Comparison with State-of-the-Art Methods}

In this section, we compare our DINOv3-FD with seven state-of-the-art (SOTA) PEFT approaches, including Linear Probe~\cite{simeoni2025dinov3}, Adapter-LN~\cite{houlsby2019parameter}, LoRA~\cite{hu2021lora}, IA$^{3}$~\cite{liu2022few}, LyCORIS~\cite{yeh2024navigating}, VeRA~\cite{kopiczko2024vera}, and PaCA~\cite{woo2025paca}. The quantitative results are presented in Table~\ref{tab:main_results}. As shown in the table, DINOv3-FD consistently achieves the best overall performance. For instance, it reaches 87.45\%, 85.91\%, and 73.11\% accuracy on the three datasets, outperforming the second-best model by 0.56, 3.11, and 6.96 percentage points, respectively. Similar observations can be found in other evaluation metrics as well. Notably, our method exhibits the largest improvement on ODIR, which provides the smallest number of training samples, further demonstrating the effectiveness of our feature disentanglement mechanism in low-data regimes. Furthermore, while the compared PEFT methods exhibit fluctuating performance across datasets, DINOv3-FD remains consistently superior, demonstrating its robustness across diverse medical tasks. Meanwhile, we include non-GVFM baselines trained from scratch with standard ResNet50 and ViT-B architectures. Their inferior performance in Table~\ref{tab:main_results} indicates that heavy vision models cannot be effectively trained from scratch on small-scale medical datasets.

\begin{table*}[t]
\centering
\setlength{\tabcolsep}{5pt}
\caption{Quantitative performance in percentage (\%) across RSNA, ISIC, and ODIR datasets. The best and second-best performances are marked in \textbf{bold} and \underline{underline}.}
\label{tab:main_results}
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{Method} 
& \multicolumn{3}{c|}{RSNA (\%)} 
& \multicolumn{3}{c|}{ISIC (\%)} 
& \multicolumn{3}{c}{ODIR (\%)} \\
& ACC & AUC & $F_{\mathrm{1}}$ 
& ACC & AUC & $F_{\mathrm{1}}$ 
& ACC & AUC & $F_{\mathrm{1}}$ \\
\midrule
ResNet50 & 84.61 & 87.85 & 63.72 & 64.62 & 85.33 & 32.73 & 48.22 & 72.84 & 24.55 \\
ViT-B & 83.33 & 83.66 & 53.60 & 63.76 & 81.96 & 23.39 & 44.51 & 52.50 & 7.71 \\
Linear Probe  & 80.97 & 81.27 & 36.34 & 57.41 & 87.04 & 40.65 & 31.38 & 72.82 & 35.27 \\
Adapter-LN    & \underline{86.89} & 90.66 & 67.29 & 78.57 & 96.45 & 70.10 & 56.57 & 88.84 & 55.75 \\
LoRA          & 86.33 & 91.08 & 66.91 & 82.28 & \underline{96.70} & 72.43 & 65.84 & \underline{91.40} & \underline{65.45} \\
IA$^3$        & 86.63 & 90.22 & 67.04 & \underline{82.80} & 96.38 & 70.19 & \underline{66.15} & 88.80 & 56.03 \\
LyCORIS       & 84.12 & 87.18 & 60.00 & 79.70 & 94.17 & 65.29 & 62.60 & 87.15 & 50.82 \\
VeRA          & 85.09 & 88.16 & 62.02 & 79.03 & 94.68 & 63.37 & 60.90 & 86.58 & 49.87 \\
PaCA       & 86.70 & \underline{91.09} & \underline{68.83} & 81.61 & 96.48 & \underline{72.47} & 58.57 & 90.23 & 58.33 \\
\midrule
\textbf{Ours} 
              & \textbf{87.45} & \textbf{91.39} & \textbf{70.06} 
              & \textbf{85.91} & \textbf{96.99} & \textbf{75.17} 
              & \textbf{73.11} & \textbf{92.00} & \textbf{65.59} \\
\bottomrule
\end{tabular}
\end{table*}

\subsection{Ablation Studies}

\paragraph{Effect of key components.}

We evaluate the contribution of key components in our DINOv3-FD through a set of ablation studies, where each component is progressively introduced to assess its impact on the overall performance. The configurations include: (1) removing both $\mathcal{L}_{\mathrm{ortho}}$ and $\mathcal{L}_{\mathrm{reg}}$, which degenerates to LoRA-based fine-tuning; (2) adding only $\mathcal{L}_{\mathrm{ortho}}$; and (3) adding both $\mathcal{L}_{\mathrm{ortho}}$ and $\mathcal{L}_{\mathrm{reg}}$, which is equivalent to our DINOv3-FD with all components. The results are presented in
Table~\ref{tab:two_ablation}. As shown, incorporating the orthogonality loss improves accuracy on all three datasets, although the $F_{\mathrm{1}}$ score on RSNA and the AUC on ODIR decrease slightly. Moreover, further integrating the distributional regularization loss yields the highest accuracy on every dataset and leads to the strongest overall results, with only marginal decreases in AUC and $F_{\mathrm{1}}$ on ISIC.

\begin{table}[htbp]
\centering
\caption{Ablation study with orthogonality and distributional regularization loss.}
\label{tab:two_ablation}
\begin{tabular}{c|c|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{$\mathcal{L}_{\mathrm{ortho}}$} & \multirow{2}{*}{$\mathcal{L}_{\mathrm{reg}}$} 
& \multicolumn{3}{c|}{RSNA (\%)} 
& \multicolumn{3}{c|}{ISIC (\%)} 
& \multicolumn{3}{c}{ODIR (\%)} \\
\cmidrule(lr){3-5} \cmidrule(lr){6-8} \cmidrule(lr){9-11}
& & ACC & AUC & $F_{\mathrm{1}}$ & ACC & AUC & $F_{\mathrm{1}}$ & ACC & AUC & $F_{\mathrm{1}}$ \\
\midrule

\xmark & \xmark & 86.33 & 91.08 & 66.91 & 82.28 & 96.70 & 72.43 & 65.84 & 91.40 & 65.45 \\
\cmark & \xmark & 86.44 & 91.34 & 65.78 & 83.53 & \textbf{97.07} & \textbf{76.02} & 66.31 & 90.92 & 65.45 \\
\cmark & \cmark & \textbf{87.45} & \textbf{91.39} & \textbf{70.06} & \textbf{85.91} & 96.99 & 75.17 & \textbf{73.11} & \textbf{92.00} & \textbf{65.59} \\
\bottomrule
\end{tabular}
\end{table}

\paragraph{Comparison with alternative regularization objectives.}

In this section, we compare our method with alternative modules in terms of the distributional regularization loss. Our method adopts a Kullback--Leibler divergence (KLD) for promoting the output of TIA toward a uniform distribution. We compare KLD with two other alternative modules, GRL and ER. Specifically,
gradient reversal (GRL)~\cite{ganin2016dann} suppresses class-dependent signals through an adversarial objective and performs well in settings where the feature structure aligns with such inverted gradients. Entropy-based regularization (ER)~\cite{grandvalet2004semi} provides a softer constraint by encouraging high-entropy predictions. The quantitative results are shown in Table~\ref{tab:alternative_ablation}. Our method achieves the best accuracy and $F_{\mathrm{1}}$ score on all datasets and the most stable performance overall, while GRL and ER are marginally ahead only in AUC on RSNA and ISIC, respectively. By enforcing a uniform predictive distribution within TIA, it provides more explicit targets over the irrelevant pathway and complements the orthogonality loss in constructing a disentangled representation.

\begin{table}[htbp]
\centering
\caption{Ablation study with alternative distributional regularization objectives.}
\label{tab:alternative_ablation}
\begin{tabular}{c|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{Methods} 
& \multicolumn{3}{c|}{RSNA (\%)} 
& \multicolumn{3}{c|}{ISIC (\%)} 
& \multicolumn{3}{c}{ODIR (\%)} \\
\cmidrule(lr){2-4} \cmidrule(lr){5-7} \cmidrule(lr){8-10}
& ACC & AUC & $F_{\mathrm{1}}$ & ACC & AUC & $F_{\mathrm{1}}$ & ACC & AUC & $F_{\mathrm{1}}$ \\
\midrule

GRL & 87.37 & \textbf{91.68} & 69.50 & 78.04 & 95.81 & 72.66 & 55.49 & 87.37 & 51.83 \\
ER & 87.42 & 91.33 & 68.72 & 83.86 & \textbf{97.06} & 68.44 & 67.85 & 87.73 & 53.25 \\
KLD (Ours) & \textbf{87.45} & 91.39 & \textbf{70.06} & \textbf{85.91} & 96.99 & \textbf{75.17} & \textbf{73.11} & \textbf{92.00} & \textbf{65.59} \\
\bottomrule
\end{tabular}
\end{table}

\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.85\linewidth]{visual.pdf}
    \caption{(a) 3D bar plot of average ACC, AUC and $F_{\mathrm{1}}$ scores
    across three datasets under different values of $\lambda_{\mathrm{ortho}}$ and $\lambda_{\mathrm{reg}}$. Warmer (more yellow) colors indicate higher performance. (b) t-SNE visualizations of the feature embeddings from task-relevant and task-irrelevant adapters on RSNA. (c) Grad-CAM visualization maps of task-relevant and task-irrelevant pathways of two randomly selected samples on ISIC and RSNA datasets.
    }
    \label{fig:visualization}
\end{figure}

\paragraph{Effect of loss weights.}

To evaluate the sensitivity of our framework to the hyperparameters $\lambda_{\mathrm{ortho}}$ and $\lambda_{\mathrm{reg}}$, we assess the average performance across three datasets under different configurations. As illustrated in Figure~\ref{fig:visualization}-(a), compared with fully removing the feature disentanglement mechanism ($\lambda_{\mathrm{ortho}}=0$, $\lambda_{\mathrm{reg}}=0$), the performance consistently improves across various values of $\lambda_{\mathrm{ortho}}$ and $\lambda_{\mathrm{reg}}$ in terms of three metrics averaged on three datasets. The best trade-off is achieved at $\lambda_{\mathrm{ortho}}=0.005$ and $\lambda_{\mathrm{reg}}=0.2$. We adopt this configuration as the default setting for our method.


\subsection{Interpretable Visualization}

\paragraph{Feature embedding visualization.}

We utilize the t-SNE
(t-distributed Stochastic Neighbor Embedding) method to visualize the feature embeddings from the two branches of DINOv3-FD. The result on the RSNA dataset is depicted in Figure~\ref{fig:visualization}-(b). As shown, the left and right regions correspond to the TRA and TIA feature spaces, respectively, with different colors denoting different disease categories. In the TRA space, normal and pneumonia samples form two clearly separated clusters, indicating strong task relevance. In contrast, the TIA space exhibits a highly random distribution with no visible class separation. These visualization results demonstrate the effectiveness of our feature disentanglement mechanism and validate that TRA captures task-relevant features. More visualization results can be found in the Appendix.


\paragraph{Feature localization visualization.}

To further locate the regions of interest of the two branches during classification, we use Grad-CAM~\cite{selvaraju2017grad} to generate class-specific activation maps for visualization. Figure~\ref{fig:visualization}-(c) illustrates the activation patterns of two randomly selected samples from RSNA and ISIC. As shown, the TRA branch focuses on clinically meaningful regions that support the decision-making process. For example, TRA highlights lesion areas in dermatology images and lung regions in chest X-ray images. In contrast, the TIA branch predominantly attends to regions with limited diagnostic relevance, including background or nonspecific structures. These qualitative observations further validate the complementary nature of the two branches and demonstrate the success of our feature disentanglement strategy. More visualization results can be found in the Appendix.

\section{Conclusion}

In this work, we proposed DINOv3-FD, a feature disentanglement framework that enables parameter-efficient adaptation of DINOv3 for medical vision tasks. By leveraging a dual adapter framework, we disentangle the task-relevant and task-irrelevant representations into two subspaces. Furthermore, the proposed orthogonality and distributional regularization objectives promote the disentanglement procedure in a task-oriented manner. Extensive experiments across multiple medical datasets demonstrate that DINOv3-FD outperforms other existing PEFT approaches.
In the future, we plan to explore this framework in three directions. First, we will incorporate textual information such as radiology reports and clinical metadata to enable text-guided adaptation, allowing the model to exploit cross-modal interaction during adaptation. Second, we will explore how to adapt DINOv3, which is trained on 2D images, to 3D medical imaging modalities, like CT and MRI volumes. Third, we will further evaluate the framework on a wider range of vision tasks, such as segmentation and lesion localization.

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This work was supported by Tier 1 grant, NUS, Singapore (24-1250-P0001) and Ministry of Education Tier 2 grant, Singapore (T2EP20224-0028).}

\bibliography{midl26_211}

\appendix

\section{Additional Embedding and Localization Visualizations}

We further explore the representational capabilities of DINOv3-FD across all three datasets.

\subsection{t-SNE Analyses on ISIC and ODIR}

To further examine the representational behavior of DINOv3-FD beyond the RSNA dataset, we provide additional t-SNE embeddings on ISIC and ODIR. As illustrated in Figure~\ref{fig:appendix}-(a), the task-relevant adapter (TRA) retains well-structured and semantically aligned clusters across both dermatology (ISIC) and ophthalmology (ODIR) domains. Disease categories with distinct visual signatures, such as basal cell carcinoma and glaucoma, form clearly separated regions, highlighting the adapter's ability to consolidate diagnostic cues.

In contrast, the task-irrelevant adapter (TIA) produces highly intermixed distributions with no meaningful class separation, reinforcing its role as a repository for non-discriminative 
or nuisance factors. These complementary behaviors are consistent with our design goal and match the observations 
reported in the main text.

\begin{figure}[htbp]
    \centering
    \includegraphics[width=\linewidth]{appendix.pdf}
    \caption{
    \textbf{(a) t-SNE plots on ISIC and ODIR.}
    The TRA space (left of each pair) forms compact and semantically aligned clusters, whereas the TIA space (right) exhibits diffuse and unstructured patterns. 
    Colors correspond to clinical categories as defined in the datasets.
    \textbf{(b) Grad-CAM visualizations on ISIC, RSNA, and ODIR.}
    The TRA branch (middle) consistently concentrates on disease-relevant regions, while the TIA branch (right) focuses on non-diagnostic structures. 
    These results validate the complementary representational roles introduced by our 
    disentanglement framework.}
    \label{fig:appendix}
\end{figure}

\subsection{Grad-CAM Localization Across Three Datasets}

We additionally report Grad-CAM visualizations over ISIC, RSNA, and ODIR to further probe the spatial focus of the two adapters. As shown in Figure~\ref{fig:appendix}-(b), the TRA pathway consistently highlights clinically informative regions, including dermoscopic lesions, pulmonary opacities, 
and disease-related fundus structures. The TIA pathway, by contrast, attends to diffuse or anatomically non-salient areas, often favoring background textures or peripheral structures. This complementary spatial behavior reflects the intended disentanglement between task-discriminative and task-agnostic information, offering an interpretable view of how DINOv3-FD separates diagnostic cues from incidental image content.

\section{Stability Analysis}

\subsection{Stability Across Different Seeds and Batch Sizes}

We further analyze training stability by repeating experiments with different random seeds. As summarized in Table~\ref{tab:stable}, DINOv3-FD exhibits low variance across runs, indicating that the proposed dependence-minimization objective can be optimized reliably. We also compare against a representative baseline, LoRA, observing consistently better performance.

\begin{table*}[t]
\centering
\caption{Stability analysis across different random seeds (0, 1, 42). We report mean $\pm$ standard deviation for all metrics.}
\label{tab:stable}
\begin{tabular}{llccc}
\toprule
Method & Dataset & ACC & AUC & F1 \\
\midrule
\multirow{3}{*}{LoRA}
& RSNA & 86.00$\pm$0.5 & 90.67$\pm$0.6 & 66.57$\pm$0.9 \\
& ISIC & 80.34$\pm$1.9 & 96.68$\pm$0.3 & 73.33$\pm$0.8 \\
& ODIR & 65.43$\pm$0.9 & 90.84$\pm$0.5 & 64.59$\pm$0.8 \\
\midrule
\multirow{3}{*}{Ours}
& RSNA & 86.27$\pm$1.0 & 90.72$\pm$0.7 & 66.87$\pm$2.8 \\
& ISIC & 84.41$\pm$1.5 & 96.90$\pm$0.3 & 73.35$\pm$2.3 \\
& ODIR & 72.33$\pm$0.8 & 91.57$\pm$0.7 & 65.71$\pm$2.2 \\
\bottomrule
\end{tabular}
\end{table*}

Besides multiple seed runs, we also conduct experiments with batch sizes of 16, 32, and 64. As Table~\ref{tab:bs} shows, although performance fluctuates slightly with batch size, a batch size of 16 achieves the best average performance across the three datasets. Notably, all three settings outperform the other PEFT baselines, demonstrating the effectiveness of feature disentanglement.

\begin{table}[htbp]
\centering
\caption{Effect of batch size on performance across RSNA, ISIC, and ODIR datasets. The best and second-best performances are marked in \textbf{bold} and \underline{underline}.}
\label{tab:bs}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{Batch Size}
& \multicolumn{3}{c|}{RSNA (\%)}
& \multicolumn{3}{c|}{ISIC (\%)}
& \multicolumn{3}{c}{ODIR (\%)} \\
& ACC & AUC & F1
& ACC & AUC & F1
& ACC & AUC & F1 \\
\midrule
64
& \underline{86.67} & \underline{91.34} & 66.85
& 83.00 & 96.52 & 70.78
& 71.25 & 91.56 & 63.87 \\

32
& 86.40 & 91.24 & \textbf{70.27}
& \underline{84.39} & \textbf{97.19} & \textbf{75.24}
& \underline{72.33} & \underline{91.86} & \textbf{66.26} \\

16 (Ours)
& \textbf{87.45} & \textbf{91.39} & \underline{70.06}
& \textbf{85.91} & \underline{96.99} & \underline{75.17}
& \textbf{73.11} & \textbf{92.00} & \underline{65.59} \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Comparison with Alternative Orthogonality Objectives}

We compare the proposed dependence-minimization objective with simpler decorrelation losses like cross-covariance~\cite{bardes2021vicreg}, cosine decorrelation~\cite{zbontar2021barlow}, and Gram~\cite{cogswell2015reducing}, and even more complex losses like HSIC~\cite{ma2020hsic}. The results are shown in Table~\ref{tab:loss}. While these alternatives show distinct performance across different datasets, our choice has the highest average outcome among them. 

\begin{table}[htbp]
\centering
\caption{Comparison of different decorrelation objectives across RSNA, ISIC, and ODIR datasets. The best and second-best performances are marked in \textbf{bold} and \underline{underline}.}
\label{tab:loss}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{Method}
& \multicolumn{3}{c|}{RSNA (\%)}
& \multicolumn{3}{c|}{ISIC (\%)}
& \multicolumn{3}{c}{ODIR (\%)} \\
& ACC & AUC & F1
& ACC & AUC & F1
& ACC & AUC & F1 \\
\midrule
HSIC
& 86.97 & 91.34 & \underline{70.00}
& 85.58 & 96.89 & \underline{76.42}
& 71.25 & 91.70 & \underline{66.05} \\

Gram
& 86.63 & 91.33 & 68.98
& 85.65 & 96.91 & 75.98
& 72.33 & \textbf{92.42} & \textbf{67.22} \\

Cross Covariance
& 86.74 & 91.08 & 66.60
& \textbf{88.60} & \textbf{98.89} & \textbf{77.38}
& \underline{72.64} & 91.51 & 65.52 \\

Cosine
& \underline{87.23} & \textbf{91.45} & 68.22
& 85.58 & 96.79 & 74.95
& 71.87 & 91.51 & 65.17 \\

\midrule
\textbf{MINE (Ours)}
& \textbf{87.45} & \underline{91.39} & \textbf{70.06}
& \underline{85.91} & \underline{96.99} & 75.17
& \textbf{73.11} & \underline{92.00} & 65.59 \\
\bottomrule
\end{tabular}
\end{table}

\section{Strategies for Handling Class Imbalance}

\subsection{Effect of Imbalance-handling Strategies}

Because RSNA, ISIC, and ODIR all have imbalanced class distributions, we evaluate two imbalance-handling strategies, a balanced data sampler and reweighting, to investigate their effectiveness. As Table~\ref{tab:imbalance} shows, our original setup achieves better average performance, demonstrating the effectiveness of our method.

\begin{table}[htbp]
\centering
\caption{Effect of imbalance-handling strategies on RSNA, ISIC, and ODIR datasets. The best and second-best performances are marked in \textbf{bold} and \underline{underline}.}
\label{tab:imbalance}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{Method}
& \multicolumn{3}{c|}{RSNA (\%)}
& \multicolumn{3}{c|}{ISIC (\%)}
& \multicolumn{3}{c}{ODIR (\%)} \\
& ACC & AUC & F1
& ACC & AUC & F1
& ACC & AUC & F1 \\
\midrule
Balanced Sampling
& 79.63 & \textbf{91.49} & 66.13
& 84.72 & \textbf{97.35} & \textbf{77.27}
& 59.66 & 90.47 & 58.33 \\

Reweighting
& \underline{86.97} & \underline{91.42} & \underline{68.01}
& \underline{85.38} & 96.87 & \underline{75.43}
& \underline{71.87} & \underline{90.82} & \textbf{66.70} \\

\midrule
\textbf{Ours}
& \textbf{87.45} & 91.39 & \textbf{70.06}
& \textbf{85.91} & \underline{96.99} & 75.17
& \textbf{73.11} & \textbf{92.00} & \underline{65.59} \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Comparison with Prior Matching}

While we regularize the task-irrelevant adapter toward a uniform distribution by default, we also evaluate a prior-matching variant~\cite{bhat2025prior2posterior} that aligns predictions with the empirical class distribution to check whether it is better suited to imbalanced settings. The results shown in Table~\ref{tab:prior} indicate that our method mostly performs better than the prior-matching approach. This implies that enforcing a uniform target can effectively remove task-irrelevant information from the task-relevant features.

\begin{table}[htbp]
\centering
\caption{Comparison between prior-matching regularization and uniform regularization. The better performances are marked in \textbf{bold}.}
\label{tab:prior}
\setlength{\tabcolsep}{4pt}
\begin{tabular}{l|ccc|ccc|ccc}
\toprule
\multirow{2}{*}{Method}
& \multicolumn{3}{c|}{RSNA (\%)}
& \multicolumn{3}{c|}{ISIC (\%)}
& \multicolumn{3}{c}{ODIR (\%)} \\
& ACC & AUC & F1
& ACC & AUC & F1
& ACC & AUC & F1 \\
\midrule
Prior Matching
& 86.67 & 91.21 & 69.26
& 85.58 & 96.88 & \textbf{75.92}
& 71.72 & 90.80 & 64.96 \\

\midrule
\textbf{Uniform (Ours)}
& \textbf{87.45} & \textbf{91.39} & \textbf{70.06}
& \textbf{85.91} & \textbf{96.99} & 75.17
& \textbf{73.11} & \textbf{92.00} & \textbf{65.59} \\
\bottomrule
\end{tabular}
\end{table}

\section{Transferability to Segmentation Tasks}

To evaluate whether the task-relevant adapter (TRA) captures clinically meaningful and transferable representations beyond classification, we further assess DINOv3-FD on a dense prediction task. Specifically, we transfer the learned TRA features to the ISIC 2018 Lesion Boundary Segmentation task.

As shown in Table~\ref{tab:isic_seg}, DINOv3-FD consistently outperforms representative PEFT baselines. In particular, our method achieves a Dice score of \textbf{91.26\%} and an IoU of \textbf{83.93\%}, surpassing all competing methods. These results indicate that the proposed method can generalize to dense visual prediction tasks. 

\begin{table*}[htbp]
\centering
\caption{Quantitative performance in percentage (\%) on the ISIC 2018 Lesion Boundary Segmentation task. The best and second-best performances are marked in \textbf{bold} and \underline{underline}.}
\label{tab:isic_seg}
\begin{tabular}{lcccc}
\toprule
Method & ACC (\%) & Dice (\%) & IoU (\%) & mAP (\%) \\
\midrule
Linear Probe  & 38.57 & 46.53 & 30.32 & 35.86 \\
Adapter-LN    & 91.65 & 90.67 & 82.94 & 98.71 \\
LoRA          & \underline{92.12} & \underline{91.25} & \underline{83.91} & \textbf{98.91} \\
IA$^3$        & 91.71 & 90.64 & 82.88 & 98.66 \\
LyCORIS       & 91.27 & 90.07 & 81.94 & 98.25 \\
VeRA          & 91.40 & 90.22 & 82.19 & 98.32 \\
PaCA          & 91.93 & 90.85 & 83.23 & \underline{98.72} \\
\midrule
\textbf{Ours} & \textbf{92.14} & \textbf{91.26} & \textbf{83.93} & \textbf{98.91} \\
\bottomrule
\end{tabular}
\end{table*}

\section{Efficiency Analysis}

We also report the computational overhead in Table~\ref{tab:eff}. Our method has approximately 4.7M trainable parameters, which is comparable to LoRA. Its computational cost is 123.11 GFLOPs, which is comparable to other PEFT baselines.

\begin{table*}[htbp]
\centering
\caption{Efficiency metrics of all baselines and our method. Param.\ stands for learnable parameters.}
\label{tab:eff}
\begin{tabular}{l|cc}
\toprule
\multirow{2}{*}{Method} 
& \multicolumn{2}{c}{Efficiency} \\
& Param. & GFLOPs \\
\midrule
ResNet50 & 26.4M & 8.18 \\
ViT-B & 30.3M & 33.70 \\
Linear Probe  & 0.3M & 121.76 \\
Adapter-LN    & 1.7M & 121.76 \\
LoRA          & 4.7M & 123.11 \\
IA$^3$        & 1.8M & 121.80 \\
LyCORIS       & 1.6M & 121.76 \\
VeRA          & 1.8M & 123.07 \\
PaCA       & 3.3M & 121.76 \\
\midrule
\textbf{Ours} 
              & 4.7M & 123.11 \\
\bottomrule
\end{tabular}
\end{table*}

\end{document}
