\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{subcaption}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amsthm}
\usepackage{flexisym}
\usepackage{wrapfig}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
% \theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\newcommand{\cuongjr}[1]{{\textcolor{blue}{#1}}}
\newcommand{\vcuong}[1]{{\textcolor{red}{#1}}}
\newcommand{\minisection}[1]{\noindent{\textbf{#1}}}
\usepackage{xr}

\title{Simple Transferability Estimation for Regression Tasks}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{\href{mailto:<cnguy049@cs.fiu.edu>?Subject=Your UAI 2023 paper}{Cuong N.~Nguyen}$^1$ \qquad Phong Tran$^{2,3}$ \qquad Lam Si Tung Ho$^4$ \qquad Vu Dinh$^5$ {\vskip 0.1cm} Anh T.~Tran$^2$ \qquad Tal Hassner$^6$ \qquad Cuong V.~Nguyen$^1$}
\affil{$^1$Florida International University, USA \qquad $^2$VinAI Research, Vietnam \qquad $^3$MBZUAI, UAE {\vskip 0.1cm} $^4$Dalhousie University, Canada \qquad
$^5$University of Delaware, USA \qquad $^6$Meta AI, USA
}

\iffalse
\author[1]{\href{mailto:<cnguy049@cs.fiu.edu>?Subject=Your UAI 2023 paper}{Cuong N.~Nguyen}{}}
\author[2,3]{Phong Tran}
\author[4]{Lam Si Tung Ho}
\author[5]{\\Vu Dinh}
\author[2]{Anh T.~Tran}
\author[6]{Tal Hassner}
\author[1]{Cuong V.~Nguyen}
% Add affiliations after the authors
\affil[1]{%
    Florida International University\\
    USA
}
\affil[2]{%
    VinAI Research\\
    Vietnam
}
\affil[3]{%
    MBZUAI\\
    UAE
}
\affil[4]{%
    Dalhousie University\\
    Canada
}
\affil[5]{%
    University of Delaware\\
    USA
}
\affil[6]{%
    Meta AI\\
    USA
}
\fi
  
\begin{document}
\maketitle

\begin{abstract}
We consider transferability estimation, the problem of estimating how well deep learning models transfer from a source to a target task. We focus on regression tasks, which received little previous attention, and propose two simple and computationally efficient approaches that estimate transferability based on the negative regularized mean squared error of a linear regression model. We prove novel theoretical results connecting our approaches to the actual transferability of the optimal target models obtained from the transfer learning process. Despite their simplicity, our approaches significantly outperform existing state-of-the-art regression transferability estimators in both accuracy and efficiency. On two large-scale keypoint regression benchmarks, our approaches yield 12\% to 36\% better results on average while being at least 27\% faster than previous state-of-the-art methods.
\end{abstract}


%%%%%%%%% BODY TEXT
\section{Introduction}

Transferability estimation~\citep{bao2019information, tran2019transferability, nguyen2020leep} aims to develop computationally efficient metrics to predict the effectiveness of transferring a deep learning model from a source to a target task. This problem has recently gained attention as a means for model and task selection~\citep{bao2019information, tran2019transferability, nguyen2020leep, bolya2021scalable, you2021logme} that can potentially improve the performance and reduce the cost of transfer learning, especially for expensive deep learning models. In recent years, new transferability estimators were also developed and used in applications such as checkpoint ranking~\citep{huang2021exploiting, li2021ranking} and few-shot learning~\citep{tong2021mathematical}.

Nearly all existing methods consider only the transferability between classification tasks~\citep{bao2019information, tran2019transferability, nguyen2020leep, deshpande2021linearized, li2021ranking, tan2021otce, huang2022frustratingly}, with very few designed for regression~\citep{you2021logme, huang2022frustratingly}, despite the importance of regression problems in a wide range of applications such as landmark detection~\citep{fard2021asmnet, poster2021visible}, object detection and localization~\citep{cai2020deep, bu2021gaia}, pose estimation~\citep{schwarz2015rgb, doersch2019sim2real}, or image generation~\citep{ramesh2021zero, razavi2019generating}. Moreover, those few methods are often a byproduct of a classification transferability estimator and were never tested against regression transferability estimation baselines.

In this paper, we explicitly consider transferability estimation for regression tasks and formulate a novel definition for this problem. Our formulation is based on the practical usage of transferability estimation: to compare the actual transferability between different tasks~\citep{bao2019information, tran2019transferability, nguyen2020leep, you2021logme}. We then propose two \emph{simple, efficient, and theoretically grounded} approaches for this problem that estimate transferability using the negative regularized mean squared error (MSE) of a linear regression model computed from the source and target training sets. The first approach, \emph{Linear MSE}, uses the linear regression model between features extracted from the source model (a model trained on the source task) and true labels of the target training set. The second approach, \emph{Label MSE}, estimates transferability by regressing between the \emph{dummy} labels, obtained from the source model, and true labels of the target data. In special cases where the source and target data share the inputs, the Label MSE estimators can be computed even more efficiently from the true labels without a source model.

In addition to their simplicity, we show our transferability estimators to have theoretical properties relating them to the actual transferability of the transferred target model. In particular, we prove that the transferability of the target model obtained from transfer learning is lower bounded by the Label MSE minus a complexity term, which depends on the target dataset size and the model architecture. Similar theoretical results can also be proven for the case where the source and target tasks share the inputs.

We conduct extensive experiments on two real-world keypoint detection datasets, CUB-200-2011~\citep{wah2011caltech} and OpenMonkey~\citep{yao2021openmonkeychallenge}, as well as the dSprites shape regression dataset~\citep{matthey2017dsprites} to show the advantages of our approaches. The results clearly demonstrate that despite their simplicity, our approaches outperform recently published, state-of-the-art (SotA) regression transferability estimators, such as LogME~\citep{you2021logme} and TransRate~\citep{huang2022frustratingly}, in both effectiveness and efficiency. In particular, our approaches can improve SotA results by 12\% to 36\% on average, while being at least 27\% faster.

\minisection{Summary of contributions.} (1) We formulate a new definition for the transferability estimation problem that can be used for comparing the actual transferability (\S\ref{settings}). (2) We propose Linear MSE and Label MSE, two simple yet effective  transferability estimators for regression tasks (\S\ref{sec:estimators}). (3) We prove novel theoretical results for these estimators to connect them with the actual task transferability (\S\ref{sec:theory}). (4) We rigorously test our approaches in various settings and challenging benchmarks, showing their advantages compared to SotA regression transferability methods (\S\ref{sec:experiment}).\footnote{Implementations of our methods are available at: \url{https://github.com/CuongNN218/regression_transferability}.}



\section{Related work}

Our paper is one of the recent attempts to develop efficient and effective transferability estimators for deep transfer learning~\citep{bao2019information, tran2019transferability, nguyen2020leep, deshpande2021linearized, li2021ranking, tan2021otce, you2021logme, huang2022frustratingly, nguyen2022generalization}, which is closely related to the generalization estimation problem~\citep{chuang2020estimating, deng2021labels}. Most of the existing work for transferability estimation focuses on classification~\citep{bao2019information, tran2019transferability, nguyen2020leep, deshpande2021linearized, li2021ranking, tan2021otce, nguyen2022generalization}, while we are only aware of two methods developed for regression~\citep{you2021logme, huang2022frustratingly}.

One regression transferability method, called LogME~\citep{you2021logme}, takes a Bayesian approach and uses the maximum log evidence of the target data as the transferability estimator. While this method can be sped up using matrix decomposition, its scalability is still limited since the required memory is large. In contrast, our proposed approaches are simpler, faster, and more effective. We also provide novel theoretical properties for our methods that were not available for LogME. Another approach for transferability estimation between regression tasks, called TransRate~\citep{huang2022frustratingly}, is to divide the real-valued outputs into different bins and apply a classification transferability estimator. In our experiments, we will show that this approach is less accurate than both LogME and our approaches.

Transferability can also be inferred from a task taxonomy~\citep{zamir2018taskonomy, dwivedi2019representation, dwivedi2020duality} or a task space representation~\citep{achille2019task2vec}, which embeds tasks as vectors in a vector space. A popular task taxonomy, Taskonomy~\citep{zamir2018taskonomy}, exploits the underlying structure of visual tasks by computing a task affinity matrix that can be used for estimating transferability. Constructing the Taskonomy requires training a small classification head, which resembles the training of the regularized linear regression models in our approaches. However, Taskonomy investigates the global taxonomy of classification tasks, while our paper studies regression tasks with a focus on estimating their transferability efficiently.

Our paper is also related to transfer learning with kernel methods~\citep{radhakrishnan2022transfer} and with deep models~\citep{tan2018survey}, which has been successful in real-world regression problems such as object detection and localization~\citep{cai2020deep, bu2021gaia}, landmark detection~\citep{fard2021asmnet, poster2021visible}, or pose estimation~\citep{schwarz2015rgb, doersch2019sim2real}. Several previous works have investigated theoretical bounds for transfer learning~\citep{ben2003exploiting, blitzer2007learning, mansour2009domain, azizzadenesheli2018regularized, wang2019transfer, tripuraneni2020theory}; however, these bounds are hard to compute in practice and thus unsuitable for transferability estimation. Some previous transferability estimators have theoretical bounds on the empirical loss of the transferred model~\citep{tran2019transferability, nguyen2020leep}, but these bounds were for classification and did not relate directly to transferability. Our bounds, on the other hand, focus on regression and connect our approaches directly to the notion of transferability.



\section{Transferability between regression tasks}
\label{settings}

In this section, we describe the transfer learning setting that will be used in our subsequent analysis. We then propose a definition of transferability for regression tasks and a new formulation for the transferability estimation problem.


\subsection{Transfer learning for regression}
\label{sec:transfer_learning}

Consider a source training set $\mathcal{D}_s = \{ (x^s_i, y^s_i) \}_{i=1}^{n_s}$ and a target training set $\mathcal{D}_t = \{ (x^t_i, y^t_i) \}_{i=1}^{n_t}$ consisting of $n_s$ and $n_t$ examples respectively, where $x^s_i, x^t_i \in \mathbb{R}^d$ are $d$-dimensional input vectors, $y^s_i \in \mathbb{R}^{d_s}$ is a $d_s$-dimensional source label vector, and $y^t_i \in \mathbb{R}^{d_t}$ is a $d_t$-dimensional target label vector. Here we allow multi-output regression tasks (with $d_s, d_t \ge 1$) where the source and target labels may have different dimensions ($d_s \neq d_t$). In the simplest case, the source and target tasks are both single-output regression tasks where $d_s = d_t = 1$.

In this paper, we will refer to a model (such as $w$, $w^*$, $h$, $h^*$, $k$, or $k^*$) and its parameters interchangeably. Using the source dataset $\mathcal{D}_s$, we train a deep learning model $(w^*, h^*)$ consisting of an optimal feature extractor $w^*$ and an optimal regression head $h^*$ that minimizes the empirical MSE loss:\footnote{Here we assume $(w^*, h^*)$ is a global minimum of Eq.~\eqref{eq:source}. However, practical optimization algorithms often only return a local minimum for this problem. The same is also true for Eq.~\eqref{eq:target}.}
%
\begin{equation}
\textstyle w^*, h^* = \argmin_{w, h} \mathcal{L} (w, h; \mathcal{D}_s),
\label{eq:source}
\end{equation}
%
where $w : \mathbb{R}^d \rightarrow \mathbb{R}^{d_r}$ is a feature extractor network that transforms a $d$-dimensional input vector into a $d_r$-dimensional feature vector, $h : \mathbb{R}^{d_r} \rightarrow \mathbb{R}^{d_s}$ is a source regression head network that transforms a $d_r$-dimensional feature vector into a $d_s$-dimensional output vector, and $\mathcal{L} (w, h; \mathcal{D}_s)$ is the empirical MSE loss of the whole model $(w, h)$ on the dataset $\mathcal{D}_s$:
%
\begin{equation}
\mathcal{L} (w, h; \mathcal{D}_s) = \frac{1}{n_s} \sum_{i=1}^{n_s} \| y^s_i - h(w(x^s_i)) \|^2,
\end{equation}
%
with $\| \cdot \|$ being the $\ell_2$ norm. In practice, we usually consider a source model (e.g.,~a ResNet~\citep{he2016deep}) as a whole and use its first $l$ layers from the input (for some chosen number $l$) as the feature extractor $w$. The regression head $h$ is the remaining part of the model from the $(l+1)$-th layer to the output layer, and the prediction for any input $x$ is $h(w(x))$.

After training the optimal source model $(w^*, h^*)$, we perform transfer learning to the target task by freezing the optimal feature extractor $w^*$ and re-training a new regression head $k^*$ using the target dataset $\mathcal{D}_t$, also by minimizing the empirical MSE loss:
%
\begin{align}
    k^* &= \textstyle \argmin_k \mathcal{L} (w^*, k; \mathcal{D}_t) \notag\\
    &= {\textstyle \argmin_k} \Big\{ \frac{1}{n_t} \sum_{i=1}^{n_t} \| y^t_i - k(w^*(x^t_i)) \|^2 \Big\}
    \label{eq:target},
\end{align}
%
where $k : \mathbb{R}^{d_r} \rightarrow \mathbb{R}^{d_t}$ is a target regression head network that may have a different architecture than that of $h$. In general, the regression heads $h$ and $k$ may contain multiple layers and are not necessarily linear. 

This transfer learning algorithm, usually called \emph{head re-training}, has been widely used for deep learning models~\citep{donahue2014decaf, oquab2014learning, sharif2014cnn, whatmough2019fixynn} and will be used for our theoretical analysis. In practice and in our experiments, we also consider another transfer learning algorithm, widely known as \emph{fine-tuning}, where we fine-tune the trained feature extractor $w^*$ on the target set, and then train a new target regression head $k^*$ with this fine-tuned feature extractor~\citep{agrawal2014analyzing, girshick2014rich, chatfield2014return, dhillon2019baseline}.


\subsection{Transferability estimation}

As our first contribution, we propose a definition of transferability for regression tasks and a new formulation for the transferability estimation problem. For this purpose, we make the standard assumption that the target data $\mathcal{D}_t$ are drawn iid from the true but unknown distribution $\mathbb{P}_t := \mathbb{P}(X^t, Y^t)$; that is, $(x^t_i, y^t_i) \stackrel{\mathrm{iid}}{\sim} \mathbb{P}_t$. We do not make any assumption on the distribution of the source data $\mathcal{D}_s$, but we assume a source model $(w^*, h^*)$ is pre-trained on $\mathcal{D}_s$ and then transferred to a target model $(w^*, k^*)$ using the procedure in Section~\ref{sec:transfer_learning}. 

We now define the transferability between the source dataset $\mathcal{D}_s$ and the target task represented by $\mathbb{P}_t$. In our Definition~\ref{def:transferability} below, the transferability is the expected negative squared $\ell_2$ loss of the target model $(w^*, k^*)$ on a random example drawn from $\mathbb{P}_t$. From this definition, the lower the loss of $(w^*, k^*)$, the higher the transferability.

\begin{definition}
The \textbf{transferability} between a source dataset $\mathcal{D}_s$ and a target task $\mathbb{P}_t$ is defined as: $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) := \mathbb{E}_{(x^t, y^t) \sim \mathbb{P}_t} \left \{ - \|y^t - k^*(w^*(x^t)) \|^2 \right \}$.
\label{def:transferability}
\end{definition}

In the above definition, transferability is also equivalent to the negative expected (true) risk of $(w^*, k^*)$. Next, we formulate the transferability estimation problem. Previous work~\citep{tran2019transferability, huang2022frustratingly} defined this problem as estimating $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t)$ from the training sets $(\mathcal{D}_s, \mathcal{D}_t)$, i.e., to derive a real-valued metric $\mathcal{T}(\mathcal{D}_s, \mathcal{D}_t) \in \mathbb{R}$ such that ${ \mathcal{T}(\mathcal{D}_s, \mathcal{D}_t) \approx \mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) }$. However, in most applications of transferability estimation such as task selection~\citep{tran2019transferability, huang2022frustratingly, you2021logme} or model ranking~\citep{huang2021exploiting, li2021ranking}, an accurate approximation of $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t)$ is usually not required since $\mathcal{T}(\mathcal{D}_s, \mathcal{D}_t)$ is only used for comparing tasks or models. Thus, we propose below an \emph{alternative definition} for this problem that better aligns with its practical usage.

\begin{definition}
\textbf{Transferability estimation} aims to find a computationally efficient real-valued metric ${ \mathcal{T}(\mathcal{D}_s, \mathcal{D}_t) \in \mathbb{R} }$ for any pair of training datasets $(\mathcal{D}_s, \mathcal{D}_t)$ such that:
$\mathcal{T}(\mathcal{D}_s, \mathcal{D}_t) \le \mathcal{T}(\mathcal{D}'_s, \mathcal{D}'_t)$ if and only if $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) \leq \mathrm{Tr}(\mathcal{D}'_s, \mathbb{P}'_t)$, where $\mathbb{P}_t$ and $\mathbb{P}'_t$ are the tasks corresponding with the datasets $\mathcal{D}_t$ and $\mathcal{D}'_t$ respectively.
\label{def:trans_est}
\end{definition}

In our new definition, a transferability estimator $\mathcal{T}(\mathcal{D}_s, \mathcal{D}_t)$ is a function of $(\mathcal{D}_s, \mathcal{D}_t)$ that can be used for comparing or ranking transferability. It does \emph{not} need to be an approximation of $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t)$. This is a generalization of previous definitions~\citep{nguyen2020leep, huang2022frustratingly} and can be used for source task selection (when $\mathbb{P}_t = \mathbb{P}'_t$ and ${\mathcal{D}_t = \mathcal{D}'_t}$) as well as target task selection (when $\mathcal{D}_s = \mathcal{D}'_s$). It is consistent with the usage of transferability estimators and the way they are evaluated in the literature by correlation analysis~\citep{tran2019transferability, nguyen2020leep, you2021logme, huang2022frustratingly}.



\section{Simple transferability estimators for regression}
\label{sec:estimators}

In theory, we can use $-\mathcal{L} (w^*, k^*; \mathcal{D}_t)$, the negative MSE of the transferred target model $(w^*, k^*)$, as a transferability estimator, since it is an empirical estimation of $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t)$ using the dataset $\mathcal{D}_t$. However, this method requires us to run the actual transfer learning process, which could be expensive if the network architecture of the target regression heads (e.g., $k$ and $k^*$) is deep and complex. This violates a crucial requirement for a transferability estimator in Definition~\ref{def:trans_est}: the estimator must be \emph{computationally efficient} since it will be computed several times for task comparison. In this section, we propose two simple regression transferability estimators to address this problem.


\subsection{Linear MSE estimator}

To reduce the cost of computing $\mathcal{L} (w^*, k^*; \mathcal{D}_t)$, a simple idea is to approximate it with an $\ell_2$-regularized linear regression (Ridge regression) head. This leads to our first simple transferability estimator, Linear MSE, which is defined as the negative regularized MSE of this Ridge regression head. In this definition, $\| \cdot \|_F$ is the Frobenius norm.

\begin{definition}
The \textbf{Linear MSE} transferability estimator with a regularization parameter $\lambda \ge 0$ is: $\mathcal{T}^{\mathrm{lin}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) := - \min_{A, b} \big\{ \frac{1}{n_t} \sum_{i=1}^{n_t} {\| y^t_i - A w^*(x^t_i) - b \|^2} + \lambda \|A \|_F^2 \big\}$, where $A \in \mathbb{R}^{d_t \times d_r}$ is a $d_t \times d_r$ real-valued matrix and $b \in \mathbb{R}^{d_t}$ is a $d_t$-dimensional real-valued vector.
\label{def:linmse}
\end{definition}

Here we add a regularizer to avoid overfitting when the target dataset $\mathcal{D}_t$ is small. Previous work such as LogME~\citep{you2021logme} proposed to prevent overfitting by taking a Bayesian approach, which is more complicated and expensive. We will show empirically in our experiments (Section~\ref{sec:exp_small}) that our simple regularization approach can tackle the issue more effectively and efficiently.

Given a pre-trained feature extractor $w^*$ and a target set $\mathcal{D}_t$, we can compute $\mathcal{T}^{\mathrm{lin}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)$ efficiently using the closed form solution for Ridge regression or using second-order optimization~\citep{bishop2006pattern}. If the target regression head $k^*$ is a linear regression model, $\mathcal{T}^{\mathrm{lin}}_0(\mathcal{D}_s, \mathcal{D}_t)$ with $\lambda = 0$ is the negative MSE of the transferred target model $(w^*, k^*)$ on $\mathcal{D}_t$. If $k^*$ has more than one layer with a non-linear activation, $\mathcal{T}^{\mathrm{lin}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)$ can be regarded as using a regularized linear model to approximate this non-linear head.


\subsection{Label MSE estimator}
\label{sec:labelmse}

Although the Linear MSE transferability score above can be computed efficiently, this computation may still be relatively expensive if the feature vectors $w^*(x^t_i)$ are high-dimensional. To further reduce the costs, we propose another transferability estimator, Label MSE, which replaces $w^*(x^t_i)$ by the ``dummy'' source label $z_i = h^*(w^*(x^t_i))$. Using dummy labels from the pre-trained source model $(w^*, h^*)$ is a technique previously used to compute the LEEP transferability score for classification~\citep{nguyen2020leep}. We define our Label MSE estimator below.

\begin{definition}
The \textbf{Label MSE} transferability estimator with a regularization parameter $\lambda \ge 0$ is: $\mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) := - \min_{A, b} \big\{ \frac{1}{n_t} \sum_{i=1}^{n_t} \| y^t_i - A z_i - b \|^2 + \lambda \|A \|_F^2 \big\}$, where ${ A \in \mathbb{R}^{d_t \times d_s} }$ is a $d_t \times d_s$ real-valued matrix, $b \in \mathbb{R}^{d_t}$ is a $d_t$-dimensional real-valued vector, and $z_i = h^*(w^*(x^t_i))$.
\label{def:labmse}
\end{definition}

In practice, since the size of $z_i$ is usually much smaller than that of $w^*(x^t_i)$ (i.e., $d_s \ll d_r$), computing the Label MSE is usually faster than computing the Linear MSE.

\minisection{$\bullet$ Special case with shared inputs.}
When the source and target datasets have the same inputs, i.e., ${ \mathcal{D}_s = \{ (x_i, y^s_i) \}_{i=1}^n }$ and $\mathcal{D}_t = \{ (x_i, y^t_i) \}_{i=1}^n$, we can compute the Label MSE even faster using only the true labels. Particularly, we can consider the following version of the Label MSE.

\begin{definition}
The \textbf{Shared Inputs Label MSE} transferability estimator with a regularization parameter $\lambda \ge 0$ is: $\widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) := - \min_{A, b} \Big\{ { \frac{1}{n} \sum_{i=1}^{n} \| y^t_i - A y^s_i - b \|^2 } +  \lambda \|A \|_F^2 \Big\}$, where $A \in \mathbb{R}^{d_t \times d_s}$ and $b \in \mathbb{R}^{d_t}$.
\label{def:shared_labmse}
\end{definition}

In this definition, the Shared Inputs Label MSE is computed by training a Ridge regression model directly from the true label pairs $(y^s_i, y^t_i)$, which is \emph{less expensive} than the original Label MSE since we do not need to train the source model $(w^*, h^*)$ or compute the dummy labels.

Intuitively, our estimators use a weaker version of the actual target model that helps trade off the estimators’ accuracy for computational speed. Our estimators can also be viewed as instances of the kernel Ridge regression approach~\citep{smale2007learning, hastie2009elements}. While the Linear MSE can be interpreted as a linear approximation to $- \mathcal{L} (w^*, k^*; \mathcal{D}_t)$, properties of the Label MSE and Shared Inputs Label MSE are not well understood. In the next section, we shall prove novel theoretical properties for these estimators.



\section{Theoretical properties}
\label{sec:theory}

We now prove some theoretical properties for the Label MSE with ReLU feed-forward neural networks. These properties are in the form of generalization bounds relating $\mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)$ with the transferability $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t)$. Throughout this section, we assume the space of all target regression heads $k$, which may have more than one layer, is a superset of all the linear regression models. This assumption is generally true for ReLU networks~\citep{arora2018understanding}.

First, we show in Lemma~\ref{lemma:empirical} below a relationship between the negative MSE loss $- \mathcal{L} (w^*, k^*; \mathcal{D}_t)$ of $(w^*, k^*)$ and the Label MSE. This lemma states that the negative MSE loss $- \mathcal{L} (w^*, k^*; \mathcal{D}_t)$ upper bounds the Label MSE. The proof for this lemma is in the supplementary.

\begin{lemma}
For any $\lambda \ge 0$, we have:
$\mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) \le - \mathcal{L} (w^*, k^*; \mathcal{D}_t)$.
\label{lemma:empirical}
\end{lemma}

Using this lemma, we can prove our main theoretical result in Theorem~\ref{thm:generalization} below. In this theorem, $L$ is the number of layers of the ReLU feed-forward neural network $(w^*, k^*)$, and we assume the number of hidden nodes and parameters in each layer are upper bounded by $H$ and $M \ge 1$ respectively. Without loss of generality, we also assume all input and output data are upper bounded by $1$ in $\ell_\infty$-norm. This assumption can easily be satisfied by a pre-processing step that scales them to $[0, 1]$ in $\ell_\infty$-norm.

\begin{theorem}
For any source dataset $\mathcal{D}_s$, $\lambda \ge 0$ and $\delta > 0$, with probability at least $1 - \delta$ over the randomness of $\mathcal{D}_t$, we have:
$\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) \ge \mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) - C(d, d_t, M, H, L, \delta)/\sqrt{n_t}$, where
$C(d, d_t, M, H, L, \delta) = 16 M^{2L+2} H^{2L} [ d_t^2 d \sqrt{L+1+ \ln d} + d_t d^2 \sqrt{2 \ln(4/\delta)} ]$.
\label{thm:generalization}
\end{theorem}

The proof for this theorem is in the supplementary. The theorem shows that the transferability $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t)$ is lower bounded by the Label MSE $\mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)$ minus a complexity term $C(d, d_t, M, H, L, \delta)/\sqrt{n_t}$ that depends on the target dataset (specifically, the input and output dimensions, as well as the dataset size) and the architecture of the target network. When this complexity term is small (e.g., when $n_t$ is large enough), the bound in Theorem~\ref{thm:generalization} will be tighter. In this case, a higher Label MSE score will likely lead to better transferability.

\minisection{$\bullet$ Shared inputs case.}
We can also derive similar bounds for the Shared Inputs Label MSE $\widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)$.
Denote
${ A^*_\lambda, b^*_\lambda := \argmin_{A, b} \big\{ \frac{1}{n} \sum_i \| y^t_i - A y^s_i - b \|^2 + \lambda \|A \|_F^2 \big\} }$.
We first show the following lemma relating $\widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)$ and the losses of the source and target models.

\begin{lemma}
For any $\lambda \geq 0$, we have:
$
\widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) \leq - \mathcal{L} (w^*, k^*; \mathcal{D}_t)/2 + \|A^*_\lambda\|_F^2 \mathcal{L} (w^*, h^*; \mathcal{D}_s).
$
\label{lemma:empirical_same_input}
\end{lemma}

Using this lemma, we can prove the following theorem for this shared inputs setting. The proofs for these results are in the supplementary.

\begin{theorem}
For any source dataset $\mathcal{D}_s$, $\lambda \geq 0$ and $\delta > 0$, with probability at least $1 - \delta$ over the randomness of $\mathcal{D}_t$, we have:
$
\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) \geq 2 \widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) - 2 \|A^*_\lambda\|_F^2 \mathcal{L} (w^*, h^*; \mathcal{D}_s) - C(d, d_t, M, H, L, \delta)/\sqrt{n}  
$.
\label{thm:generalization_same_input}
\end{theorem}

From the theorem, $\widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)$ can indirectly tell us information about the transferability $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t)$ without actually training $w^*$, $h^*$, and $k^*$. This bound becomes tighter when $n$ is large or $\mathcal{L} (w^*, h^*; \mathcal{D}_s)$ is small (e.g., when the source model is expressive enough to fit the source data).
An experiment to investigate the usefulness of our theoretical bounds in this section is available in the supplementary.

\section{Experiments}
\label{sec:experiment}

In this section, we conduct experiments to evaluate our approaches on the keypoint (or landmark) regression tasks using the following two large-scale public datasets:

$\bullet$ \textbf{CUB-200-2011}~\citep{wah2011caltech}. This dataset contains 11,788 bird images with 15 labeled keypoints indicating 15 different parts of a bird body. We use 9,788 images for training and 2,000 images for testing. Since the annotations for occluded keypoints are highly inaccurate, we remove all occluded keypoints during the training for both source and target tasks.

$\bullet$ \textbf{OpenMonkey}~\citep{yao2021openmonkeychallenge}. This is a benchmark for the non-human pose tracking problem. It offers over 100,000 monkey images in natural contexts, annotated with 17 body landmarks. We use the original train-test split, which contains 66,917 training images and 22,306 testing images.

In our experiments, we use ResNet34~\citep{he2016deep} as the backbone since it provides good performance as a source model. Following previous work~\citep{tran2019transferability, nguyen2020leep, huang2022frustratingly, nguyen2022generalization}, we investigate how well our transferability estimators correlate (using Pearson correlation) with the \emph{negative test MSE} of the target model obtained from actual transfer learning. This correlation analysis is a good method to measure how well transferability estimators satisfy our Definition~\ref{def:trans_est}. In the supplementary, we provide additional results for other correlation measures that capture non-linear (monotonic) relationships, including Kendall's $\tau$ and Spearman correlations. The conclusions in our paper remain the same when comparing these correlations.

We consider three standard transfer learning algorithms: (1) \textbf{head re-training}~\citep{donahue2014decaf, sharif2014cnn}: We fix all layers of the source model up until the penultimate layer and re-train the last fully-connected (FC) layer using the target training set; (2) \textbf{half fine-tuning}~\citep{donahue2014decaf, sharif2014cnn}: We fine-tune the last convolutional block and all the FC layers of the source model, while keeping all other layers fixed; and (3) \textbf{full fine-tuning}~\citep{agrawal2014analyzing, girshick2014rich}: We fine-tune the whole source model using the target training set. Among these settings, head re-training resembles the transfer scenario in Section~\ref{sec:transfer_learning}, while half and full fine-tuning are more commonly used in practice. For half fine-tuning, around half of the parameters in the network will be fine-tuned ($\sim$13M parameters). More details of our experiment settings are in the supplementary.

We compare our transferability estimators, Linear MSE and Label MSE, with two recent SotA baselines for regression: LogME~\citep{you2021logme} and TransRate~\citep{huang2022frustratingly}. For our methods, we consider $\lambda = 0$ (named \textbf{LinMSE0} and \textbf{LabMSE0}) for the estimators without regularization, and $\lambda = 1$ (named \textbf{LinMSE1} and \textbf{LabMSE1}) for the estimators with the default $\lambda$ value. The effects of $\lambda$ on our algorithms are investigated in Section~\ref{sec:lambda_exp}. 

For the baselines, besides the usual versions (\textbf{LogME} and \textbf{TransRate}) that are computed from the extracted features and the target labels, we also consider the versions where they are computed from the dummy labels and the target labels (named \textbf{LabLogME} and \textbf{LabTransRate}). As in previous work~\citep{huang2022frustratingly}, we divide the target label values into equal-sized bins (five bins in our case) to compute TransRate and LabTransRate.

\subsection{General transfer between two different domains}
\label{exp:different_input}

\begin{table*}[t]
\caption{{\bf Correlation coefficients when transferring from OpenMonkey to CUB-200-2011}. Bold numbers indicate best results in each row. Asterisks (*) indicate best results among the corresponding label-based or feature-based methods. Detailed correlation plots are in the supplementary. Our estimators improve up to 25.9\% in comparison with SotA (LogME) while being 12.9\% better on average.}
{\vskip -0.2cm}
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{ccccccccc}
\toprule
\multirow{2}{*}{Transfer setting} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
& LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0 & LinMSE1 \\
\midrule
Head re-training & 0.824 & 0.165 & 0.991 & \textbf{0.995}* & 0.969 & 0.121 & 0.982 & \textbf{0.995*}\\
Half fine-tuning & 0.706 & 0.392 & 0.881 &  \textbf{0.885}* & 0.870 & 0.304 & 0.866 & \textbf{0.885*}\\
Full fine-tuning & 0.691 & 0.410 & ~~{\bf 0.870}* & 0.869~~ & 0.861 & 0.311 & 0.855 & 0.869* \\
\bottomrule
\end{tabular}
}
\label{tab:different_input}
\end{table*}

This experiment considers the general case where source models are trained on one dataset (OpenMonkey) and then transferred to another (CUB-200-2011). Specifically, we train a source model for each of the 17 keypoints of the OpenMonkey dataset and transfer them to each of the 15 keypoints of the CUB-200-2011 dataset, resulting in a total of 255 final models. Since each keypoint consists of x and y positions, all source and target tasks in this experiment have two-dimensional labels. The actual MSEs of these models are computed on the respective test sets and then used to calculate the Pearson correlation coefficients with the transferability estimators. In this experiment, LabMSE0, LabMSE1, LabLogME, and LabTransRate are computed from the dummy source labels and the actual target labels.

Results for this experiment are in Table~\ref{tab:different_input}. In this setting, TransRate and LabTransRate perform poorly, while our methods are equal to or better than LogME and LabLogME in most cases, especially when using $\lambda=1$ (LinMSE1) or dummy labels (LabMSE0 and LabMSE1). The results show our approaches improve up to 25.9\% in comparison with SotA (LogME) while being 12.9\% better on average.

It is interesting to observe that LabMSE0 and LabMSE1 provide competitive or even better correlations than LinMSE0 and LinMSE1 in this experiment. This shows that the dummy labels (i.e., body parts of monkeys) can provide as much information about the target labels (i.e., body parts of birds) as the extracted features.

In the supplementary, we also report additional results where both source and target tasks have 10-dimensional labels (i.e., each task predicts 5 keypoints simultaneously). We also achieve better correlations than the baselines in this case.



\subsection{Transfer with shared-inputs tasks}
\label{sec:exp_shared_inputs}

\begin{table*}[t]
\caption{{\bf Correlation coefficients when transferring between tasks with shared inputs}. Bold numbers indicate best results in each row. Asterisks (*) indicate best results among the corresponding label-based or feature-based methods. Detailed correlation plots are in the supplementary. Our estimators improve up to 113\% in comparison with SotA (LogME) while being 36.6\% better on average.}
{\vskip -0.2cm}
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{cccccccccc}
\toprule
\multirow{3}{*}{Dataset} & \multirow{3}{*}{Transfer setting} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){3-6} \cmidrule(lr){7-10}
& & LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0 & LinMSE1 \\
\midrule
\multirow{3}{*}{\parbox{0.9cm}{CUB-200-2011}} & Head re-training & 0.547 & 0.019 & 0.916 & 0.946* & 0.890 & 0.029 & 0.921 & ~~{\bf 0.960}*\\
& Half fine-tuning & 0.401 & 0.006 & 0.536 & 0.565* & 0.560 & 0.064 & ~~{\bf 0.628}* & 0.619\\
& Full fine-tuning & ~~{\bf 0.128}* & 0.041 & 0.056 & 0.057~~ & 0.100 & ~~0.109* & 0.097 & 0.082\\
\midrule
\multirow{3}{*}{\parbox{0.9cm}{Open Monkey}} & Head re-training & 0.890 & 0.666 & ~~0.973* & 0.773~~ & 0.695 & 0.711 & 0.946 & ~~{\bf 0.975}*\\
& Half fine-tuning & 0.615 & 0.340 & 0.754 & 0.890* & 0.446 & 0.488 & ~~{\bf 0.899}* & 0.801\\
& Full fine-tuning & 0.569 & 0.269 & 0.705 & \textbf{0.882*}  & 0.403 & 0.439 & ~~0.859* & 0.761\\
\bottomrule
\end{tabular}
}
\label{tab:shared_input}
\end{table*}

In this experiment, we consider the setting where the source and target tasks have the same inputs (the special setting in Section~\ref{sec:labelmse}). Since images in our datasets contain multiple labels (15 keypoints for CUB-200-2011 and 17 keypoints for OpenMonkey), we can use any two different keypoints on the same dataset as source and target tasks. In total, we construct 210 source-target pairs for CUB-200-2011 and 272 pairs for OpenMonkey that all have the same source and target inputs but different labels. The labels for all tasks are also two-dimensional real values.

We repeat the experiment in Section~\ref{exp:different_input} with these source-target pairs for CUB-200-2011 and OpenMonkey separately. The main difference in this experiment is that we use the \emph{true} source labels (instead of dummy labels) when computing LabLogME, LabTransRate, LabMSE0, and LabMSE1. Under this setting, the LabMSE estimators here are the Shared Inputs Label MSE estimators in Definition~\ref{def:shared_labmse}. These estimators can be computed without any source models, and thus incur very low computational costs in this setting.

Results for these experiments are in Table~\ref{tab:shared_input}. In the results, both versions of TransRate perform poorly on CUB-200-2011, while TransRate is slightly better than LogME on OpenMonkey. In most settings, LabMSE0 and LabMSE1 both outperform LabLogME and LabTransRate, while LinMSE0 and LinMSE1 both outperform LogME and TransRate. In the setting where we transfer by full fine-tuning on the CUB-200-2011 dataset, all methods perform poorly. From these results, our approaches improve up to 113\% in comparison with SotA (LogME) while being 36.6\% better on average.

We also report in the supplementary additional results for each individual source task. The results show that our methods are consistently better than LogME, LabLogME, TransRate, and LabTransRate for most source tasks on both datasets. Furthermore, our methods are also better than these baselines when transferring to higher dimensional target tasks (tasks that predict 5 keypoints simultaneously and have 10-dimensional labels). These additional results further confirm the effectiveness of our approaches.

\subsection{Evaluations on small target sets}
\label{sec:exp_small}

\begin{figure*}[t]
\centering
    \begin{subfigure}[b]{0.27\textwidth}
    \includegraphics[width=\textwidth]{figures/main/diff_n_cub.png}
    \end{subfigure}
    {\hskip 0.7cm}
    \begin{subfigure}[b]{0.27\textwidth}
    \includegraphics[width=\textwidth]{figures/main/diff_n_openmonkey.png}
    \end{subfigure}
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients with small target training sets} on CUB-200-2011 (left) and OpenMonkey (right). LinMSE1 and LogME are designed to avoid overfitting, but LinMSE1 is better than LogME in both datasets.}
    \label{fig:small_data}

    {\vskip 0.5cm}

    \begin{subfigure}[b]{0.27\textwidth}
    \includegraphics[width=\textwidth]{figures/main/timeplot_uai.pdf}
    \end{subfigure}
    {\hskip 0.7cm}
    \begin{subfigure}[b]{0.27\textwidth}
    \includegraphics[width=\textwidth]{figures/main/training_size_time.png}
    \end{subfigure}
    {\vskip -0.2cm}
    \caption{\textbf{Average running time} (in milliseconds) for the experiments in Sections~\ref{sec:exp_shared_inputs} (left) and~\ref{sec:exp_small} (right).}
    \label{fig:processing_time}

    {\vskip 0.5cm}
    
    \begin{subfigure}[b]{0.21\textwidth}
    \includegraphics[width=\textwidth]{figures/main/dsprite_logme.pdf}
    \caption{LogME}
    \end{subfigure}
    %
    \begin{subfigure}[b]{0.21\textwidth}
    \includegraphics[width=\textwidth]{figures/main/dsprite_transrate.pdf}
    \caption{TransRate}
    \end{subfigure}
    %
    \begin{subfigure}[b]{0.22\textwidth}
    \includegraphics[width=\textwidth]{figures/main/dsprite_linmse_0.0.pdf}
    \caption{LinMSE0}
    \end{subfigure}
    %
    \begin{subfigure}[b]{0.31\textwidth}
    \includegraphics[width=\textwidth]{figures/main/dsprite_linmse_1.0.png}
    \caption{LinMSE1 \qquad \qquad \qquad}
    \end{subfigure}
    {\vskip -0.1cm}
    \caption{\textbf{Test MSEs vs.~transferability scores} when transferring from pre-trained classification models to a target regression task. The $x$-axis represents the transferability scores. A linear regression model (dashed line) is fitted to the points in each plot. Our methods give better fits than the baselines.}
    \label{fig:from_pretrained}
\end{figure*}



In many real-world transfer learning scenarios, the target set is usually small. This experiment will evaluate the effectiveness of the feature-based transferability estimators (LogME, TransRate, LinMSE0, and LinMSE1) in this small data regime where the number of samples is smaller than the feature dimension. For this experiment, we fix a source task (\emph{Belly} for CUB-200-2011 and \emph{Right eye} for OpenMonkey) and transfer to all other tasks in the corresponding dataset using head re-training. These source tasks are chosen since they have fewer missing labels and thus can be used to train reasonably good source models for transfer learning. For each target task, instead of using the full data, we randomly select a small subset of 100 to 400 images to perform transfer learning and to compute the transferability scores. The actual MSEs of the transferred models are still computed using the full target test sets.

Figure~\ref{fig:small_data} compares the correlations of the 4 methods on different target set sizes between 100 and 400. The results are averaged over 10 runs with 10 different random seeds. From the figure, LogME and LinMSE1 are better than TransRate and LinMSE0. This is expected since LogME and LinMSE1 are designed to avoid overfitting on small data. Both LogME and LinMSE1 are also more stable, but LinMSE1 is slightly better than LogME on all dataset sizes.



\subsection{Efficiency of our estimators}
\label{sec:efficiency}

One of the main strengths of our methods is their efficiency due to the simplicity of training the Ridge regression head. In this experiment, we first use the settings in Section~\ref{sec:exp_shared_inputs} to compare the running time of our methods with that of the baselines on the CUB-200-2011 dataset. Figure~\ref{fig:processing_time} (left) reports the results (averaged over 5 runs with different random seeds) for this experiment. From these results, our methods, LabMSE0, LabMSE1, LinMSE0, and LinMSE1, are all faster than the corresponding label-based or feature-based baselines. The figure also shows that LabMSE1 and LinMSE1 achieve the best running time among the label-based and feature-based methods respectively.

In Figure~\ref{fig:processing_time} (right), we also compare the average running time of the 4 transferability estimators using the CUB-200-2011 experiment in Section~\ref{sec:exp_small}. This figure clearly shows that our methods, LinMSE0 and LinMSE1, are more computationally efficient than LogME and TransRate. Both results in Figure~\ref{fig:processing_time} show that LinMSE1 and LabMSE1 are significantly faster than other corresponding feature-based and label-based methods. In these experiments, LinMSE1 and LabMSE1 converge faster than LinMSE0 and LabMSE0 respectively, and thus are more efficient.



\subsection{Source task selection}
\label{sec:source_selection}

Source task selection is important for applying transfer learning since the right source task can improve transfer learning performance~\citep{nguyen2020leep}. In this experiment, we examine the application of our transferability estimation methods for selecting source tasks on the CUB-200-2011 dataset. We use the head re-training setting similar to Section~\ref{sec:exp_shared_inputs}, but fix one of the tasks as the target and choose the best source task from the rest of the task pool. We repeat this process for all 15 target tasks and measure the top-$k$ matching rate of each transferability estimator. 

The top-$k$ matching rate is defined as $m_{\text{match}}/m_{\text{target}}$, where $m_{\text{target}}$ is the total number of target tasks (15 in our case), and $m_{\text{match}}$ is the number of times the selected source task gives a target model within the best $k$ models. Here the best $k$ models are determined by the actual test MSE on the target task.

Results for this experiment are in Table~\ref{tab:source_selection}. From the results, our methods are better than the baselines in terms of top-$3$ and top-$5$ matching rates. When comparing top-$1$ matching rates, our methods are competitive with LogME and LabLogME for the feature-based and label-based approaches respectively. This experiment shows that our transferability estimators are useful for source task selection.

\begin{table*}[t]
\caption{{\bf Top-$k$ matching rates for source task selection} on CUB-200-2011. Bold numbers indicate best results in each column. Asterisks (*) indicate best results among the corresponding label-based or feature-based methods.}
\centering
\small
\begin{tabular}{ccccccccc}
\toprule
\multirow{2}{*}{$k$} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
& LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0 & LinMSE1\\
\midrule
1 & 6/15* & 4/15  & 6/15* & 2/15 & \textbf{11/15}* & 2/15 & 9/15 & 10/15 \\
3 & 9/15  & 9/15  & 10/15* & 9/15 & 12/15 & 6/15 & 12/15 & \textbf{13/15}* \\
5 &	10/15 & 12/15 & \textbf{14/15}* & \textbf{14/15}* & 12/15 & 6/15 & 12/15 & 13/15* \\
\bottomrule
\end{tabular}
\label{tab:source_selection}
\end{table*}



{\vskip -0.2cm}
\subsection{Effects of $\lambda$}
\label{sec:lambda_exp}
{\vskip -0.2cm}

\begin{table}[t]
\setlength{\tabcolsep}{3pt}
\caption{{\bf Correlation coefficients for different values of $\lambda$} on CUB-200-2011. Bold numbers indicate best results in each column. Results of the baselines are given in the last 2 rows for comparison. When there are meaningful correlations (head re-training and half fine-tuning), our methods are better than the corresponding baselines for all $\lambda$ values.}
\resizebox{\linewidth}{!}{%
\centering
\begin{tabular}{@{\hskip -1pt}ccccccc@{\hskip -1pt}}
\toprule
\multirow{2}{*}{$\lambda$} & \multicolumn{2}{c}{Head re-training} & \multicolumn{2}{c}{Half fine-tuning} & \multicolumn{2}{c}{Full fine-tuning}\\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
& LabMSE & LinMSE & LabMSE & LinMSE & LabMSE & LinMSE\\
\midrule
0 &	0.916 & 0.921 & 0.536 & 0.628 & 0.056 & \textbf{0.097} \\
0.001 & 0.921 & 0.933 & 0.562 & \textbf{0.645} & 0.051 & 0.091 \\ 
0.01 & 0.922 & 0.943 & 0.560 & 0.643 & 0.048 & 0.089 \\
0.1	& 0.935	& 0.954	& 0.552	& 0.639	& 0.043	& 0.089 \\ 
0.5	& 0.945	& \textbf{0.960} & 0.562 & 0.629	& 0.053	& 0.085 \\ 
1 & \textbf{0.946} & \textbf{0.960} & 0.565 & 0.619 & 0.057	& 0.082 \\ 
2 & 0.945 & 0.958 & 0.567 & 0.607 & 0.059 & 0.077 \\ 
5 & 0.945 & 0.954 & \textbf{0.568} & 0.594 & \textbf{0.061} & 0.072 \\ 
10 & 0.945 & 0.951 & \textbf{0.568} & 0.586 & \textbf{0.061} & 0.069 \\
15 & 0.945 & 0.950 & \textbf{0.568} & 0.582 & \textbf{0.061} & 0.067 \\ 
20 & 0.945 & 0.949 & \textbf{0.568} & 0.580 & \textbf{0.061} & 0.066 \\
\midrule
(Lab)LogME & 0.547 & 0.889 & 0.400 & 0.560  & 0.120 & 0.099 \\
(Lab)TransRate & 0.008 & 0.029 & 0.006 & 0.006 & 0.001 & 0.100 \\
\bottomrule
\end{tabular}
}
\label{tab:all_lambdas}
\end{table}

In this experiment, we investigate the effects of $\lambda$ on our proposed transferability estimators. We use the setting in Section~\ref{sec:exp_shared_inputs} with the CUB-200-2011 dataset and vary the value of $\lambda$ in [0, 20] for both LabMSE and LinMSE. Table~\ref{tab:all_lambdas} reports the results for all three transfer learning settings.

For head re-training, we observe that the best correlations are achieved at $\lambda = 1$ for both LabMSE and LinMSE. For half fine-tuning, $\lambda \ge 5$ gives the best result for LabMSE, while $\lambda = 0.001$ gives the best result for LinMSE. For full fine-tuning, we do not observe significant correlations for either transferability estimator.

Notably, from the results in Table~\ref{tab:all_lambdas} for the head re-training and half fine-tuning settings (where we have significant correlations for at least one transferability estimator), LabMSE with any tested $\lambda$ value in [0, 20] is better than LabLogME and LabTransRate, while LinMSE with any tested $\lambda$ value in this range is better than LogME and TransRate. These results show that our methods are better than the baselines for a wide range of $\lambda$ values.


\subsection{Beyond regression}
\label{sec:beyond_regression}

Although our paper mainly focuses on regression tasks, the main idea of using the negative regularized MSE of a Ridge regression model for transferability estimation goes beyond regression. In principle, this idea can be applied for transferring between classification tasks (in this case, we should train a linear classifier and use its regularized log-likelihood as the transferability estimator) or between a classification and a regression task.

In this section, we demonstrate that our idea can be applied for transferability estimation between a classification and a regression task. Particularly, we use 8 source models pre-trained on ImageNet~\citep{deng2009imagenet} and transfer to a target regression task on the \emph{dSprite} dataset~\citep{matthey2017dsprites} using full fine-tuning. This setting is similar to~\citet{you2021logme} where the target is a regression task with 4-dimensional labels: x and y positions, scale, and orientation. We compute the transferability scores from the extracted features and the labels of the target training set. More details about this experiment are in the supplementary.

From the results in Figure~\ref{fig:from_pretrained}, the trends for LogME, LinMSE0, and LinMSE1 are correct (i.e., transferability scores have negative correlations with actual MSEs), while that of TransRate is incorrect. Note that there is a discrepancy between the ranges of the transferability and the transferred MSE for two reasons: (1) The transferability estimators are computed from the target training set, while the transferred MSEs are computed from the target test set, and (2) there is a mismatch between the source task (ImageNet classification) and the target task (dSprite shape regression).

To compare the transferability estimation methods, we fit a linear regression to the points in each plot and compute its RMSE to these points, where we obtain: $6.12 \times 10^{-3}$ (LogME), $6.16 \times 10^{-3}$ (TransRate), $6.10 \times 10^{-3}$ (LinMSE0), and $\textbf{5.46} \times 10^{-3}$ (LinMSE1). These results show that LinMSE0 and LinMSE1 are better than LogME and TransRate.

\section{Conclusion}

We formulated transferability estimation for regression tasks and proposed the Linear MSE and Label MSE estimators, two simple but effective approaches for this problem. We proved novel theoretical results for these estimators, showing their relationship with the actual task transferability. Our extensive experiments demonstrated that the proposed approaches are superior to recent, relevant SotA methods in terms of efficiency and effectiveness. Our proposed ideas can also be extended to mixed cases where one of the tasks is a classification problem.

\begin{acknowledgements}
LSTH was supported by the Canada Research Chairs program, the NSERC Discovery Grant RGPIN-2018-05447, and the NSERC Discovery Launch Supplement DGECR-2018-00181.
VD was supported by the University of Delaware Research Foundation (UDRF) Strategic Initiatives Grant, and the National Science Foundation Grant DMS-1951474.
\end{acknowledgements}


\bibliography{nguyen_247}
\end{document}
