% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{subcaption}
\usepackage{multirow}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amsthm}
\usepackage{flexisym}
\usepackage{wrapfig}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
% \usepackage{xr-hyper}
% \usepackage[pagebackref,breaklinks,colorlinks]{hyperref}

\makeatletter
% \addFileDependency{<file name with extension>}
%   Announces <file> on the terminal/log in the "(file)" form, appends it to
%   the kernel file list via \@addtofilelist, and prints "No file <file>."
%   when the file cannot be found.
%   NOTE(review): presumably this exists so that log-scanning build tools
%   (e.g. latexmk) treat <file> as a dependency -- confirm against the build setup.
%   Every line inside the body ends with % so that the macro expands to no
%   stray space tokens, whatever mode it is called in.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Calls \externaldocument{<basename>} so that labels of the external document
%   can be referenced here, then registers both <basename>.tex and
%   <basename>.aux through \addFileDependency.
%   Every line inside the body ends with % so that the macro expands to no
%   stray space tokens.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

% Make the labels of the external document "nguyen_247" available to \ref in
% this file (presumably the main paper's base file name -- confirm); its
% .tex and .aux files are registered as dependencies.
\myexternaldocument{nguyen_247}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b>, then <sep> (default "-"), then <a>.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \cuongjr{<text>}: typesets <text> in blue using \textcolor.
% NOTE(review): \textcolor is presumably supplied by xcolor, loaded through
% tikz -- confirm; the name suggests an author annotation macro.
\newcommand{\cuongjr}[1]{{\textcolor{blue}{#1}}}

% Upright math operators printed as "argmax" and "argmin". The starred
% declaration makes sub/superscripts behave as limits (set below/above the
% operator in display style), as used in the argmin definitions of the proofs.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
% \theoremstyle{plain}
% Theorem-like environments.
% "theorem" owns the counter and is numbered within each section.
\newtheorem{theorem}{Theorem}[section]
% All environments below pass [theorem] and therefore share the theorem
% counter, giving one running number sequence per section.
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% \minisection{<title>}: starts an unindented paragraph with <title> in bold,
% acting as a run-in heading. No trailing space or punctuation is added; the
% callers in this file include the final period in <title>.
\newcommand{\minisection}[1]{\noindent{\textbf{#1}}}

\title{Simple Transferability Estimation for Regression Tasks\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author{\href{mailto:<cnguy049@cs.fiu.edu>?Subject=Your UAI 2023 paper}{Cuong N.~Nguyen}$^1$ \qquad Phong Tran$^{2,3}$ \qquad Lam Si Tung Ho$^4$ \qquad Vu Dinh$^5$\\ {\vskip 0.1cm} Anh T.~Tran$^2$ \qquad Tal Hassner$^6$ \qquad Cuong V.~Nguyen}
\affil{$^1$Florida International University, USA \qquad $^2$VinAI Research, Vietnam \qquad $^3$MBZUAI, UAE {\vskip -0.1cm} $^4$Dalhousie University, Canada \qquad $^5$University of Delaware, USA \qquad $^6$Meta AI, USA
}


\begin{document}

\onecolumn %% Remove this line if a two-column layout is desired for the supplement
\maketitle

\appendix
\counterwithin{table}{section}
\counterwithin{figure}{section}


\vspace{0.5cm}

The contents of this supplementary material include:
%
\begin{enumerate}
    \item \textbf{Appendix~\ref{proof:lemma:empirical}}: Proof of Lemma~\ref{lemma:empirical} in the main paper.
    \item \textbf{Appendix~\ref{proof:thm:generalization}}:  Proof of Theorem~\ref{thm:generalization} in the main paper.
    \item \textbf{Appendix~\ref{proof:lemma:empirical_same_input}}: Proof of Lemma~\ref{lemma:empirical_same_input} in the main paper.
    \item \textbf{Appendix~\ref{proof:thm:generalization_same_input}}: Proof of Theorem~\ref{thm:generalization_same_input} in the main paper.
    \item \textbf{Appendix~\ref{appendix:experiment_settings_1}}: More details for the experiment settings in Sections~\ref{exp:different_input}--\ref{sec:lambda_exp} of the main paper.
    \item \textbf{Appendix~\ref{appendix:experiment_settings_2}}: More details for the experiment setting in Section~\ref{sec:beyond_regression} of the main paper.
    \item \textbf{Appendix~\ref{appendix:tightness_bounds}}: An additional experiment to show the usefulness of our theoretical bounds.
    \item \textbf{Appendix~\ref{appendix:high_dim_exp_1}}: Additional experiment results for Section~\ref{exp:different_input} of the main paper.
    \item \textbf{Appendix~\ref{appendix:high_dim_exp_2}}: Additional experiment results for Section~\ref{sec:exp_shared_inputs} of the main paper.
\end{enumerate}



\section{Mathematical proofs}

\subsection{Proof of Lemma~\ref{lemma:empirical}}
\label{proof:lemma:empirical}

Denote 
$\displaystyle A^*, b^* = \argmin_{A, b} \left\{ \frac{1}{n_t} \sum_{i=1}^{n_t} {\| y^t_i - A z_i - b \|^2} + \lambda \|A \|_F^2 \right\}.$

For all $k$, we have:
%
\begin{align*}
  \sqrt{\mathcal{L} (w^*, k^*; \mathcal{D}_t)} &\le \sqrt{\mathcal{L} (w^*, k; \mathcal{D}_t)} \tag{definition of $k^*$} \\
  &= \left[ \frac{1}{n_t} \sum_{i=1}^{n_t} \| y^t_i - k(w^*(x^t_i))\|^2 \right]^{1/2} \tag{definition of $\mathcal{L}$} \\
  &\le \left[ \frac{1}{n_t} \sum_{i=1}^{n_t} \| y^t_i - A^* z_i - b^*\|^2 \right]^{1/2} + \left[ \frac{1}{n_t} \sum_{i=1}^{n_t} \| A^* z_i + b^* - k(w^*(x^t_i))\|^2 \right]^{1/2} \tag{triangle inequality} \\
  &\le \sqrt{- \mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)} + \left[ \frac{1}{n_t} \sum_{i=1}^{n_t} \| A^* z_i + b^* - k(w^*(x^t_i)) \|^2 \right]^{1/2} \\
  &= \sqrt{- \mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)} + \left[ \frac{1}{n_t}\sum_{i=1}^{n_t} \| A^* h^*(w^*(x^t_i)) + b^* - k(w^*(x^t_i))\|^2 \right]^{1/2}. \tag{definition of $z_i$}
\end{align*}

By choosing $k (\cdot) = A^*h^*(\cdot) + b^*$, the second term in the above inequality becomes 0. This implies $\sqrt{\mathcal{L} (w^*, k^*; \mathcal{D}_t)} \le \sqrt{- \mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)}$ and thus the lemma.



\subsection{Proof of Theorem~\ref{thm:generalization}}
\label{proof:thm:generalization}

First, we need to define the notion of expected (true) risk. Given any model $(w, k)$ for the target task, the expected risk of $(w, k)$ is defined as:
%
\begin{equation}
  \mathcal{R}(w, k) := \mathbb{E}_{(x^t, y^t) \sim \mathbb{P}_t} \left \{ \|y^t - k(w(x^t))\|^2 \right \}.
\end{equation}

Note that $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) = - \mathcal{R}(w^*, k^*)$. We prove the uniform bound in Lemma~\ref{lemma:uniform} below that can help us prove Theorem~\ref{thm:generalization}.

\begin{lemma}
For any $\delta >0$, with probability at least ${ 1-\delta }$, for all ReLU feed-forward neural networks $(w, k)$ of the target task, we have:
\[ |\mathcal{R} (w, k) - \mathcal{L} (w, k; \mathcal{D}_t)| ~\le~ C(d, d_t, M, H, L, \delta)/\sqrt{n_t}. \]
\label{lemma:uniform}
\end{lemma}

\begin{proof}
We recall the definition of Rademacher complexity. Given a real-valued function class $\mathcal{G}$ and a set of data points $\mathcal{D} = \{ u_i \}_{i=1}^n$, the (empirical) Rademacher complexity $\widehat R_{\mathcal{D}}(\mathcal{G})$ is defined as:
\[
\widehat R_{\mathcal{D}}(\mathcal{G}) = \mathbb{E}_{\epsilon} \left[ \sup_{g \in \mathcal{G}} \frac{1}{n} \sum_{i=1}^n{\epsilon_i g(u_i)} \right],
\]
where $\epsilon = (\epsilon_1, \epsilon_2, \ldots, \epsilon_n)$ is a vector uniformly distributed in $\{ - 1, +1\}^n$ .

In our setting, the hypothesis space $\Phi$ is the class of $L$-layer ReLU feed-forward neural networks whose number of hidden nodes and parameters in each layer are bounded from above by $H$ and $M \ge 1$ respectively.
For all $(w, k) \in \Phi$ and $x$ such that $\|x \|_\infty \leq 1$,  we have:
\[ 
\| k(w(x)) \|_{\infty} \le d M^{L+1} H^L. 
\]
Define $f_{w, k}(x, y) = y - k(w(x))$ and note that $f_{w, k}(x, y) \in \mathbb{R}^{d_t}$. For any $j = 1, 2, \ldots, d_t$, let $[\cdot]_j$ be the projection map to the $j$-th coordinate. We consider the following real-valued function classes:
%
\begin{align*}
\mathcal{F} &= \{\|f_{w, k}\|^2: (w, k) \in \Phi\}, \\   
\mathcal{F}_j &= \{[f_{w, k}]_j^2: (w, k) \in \Phi\}, \\
\Phi_j &= \{[k(w(\cdot))]_j: (w, k) \in \Phi\},
\end{align*}
%
where each element of $\mathcal{F}$ or $\mathcal{F}_j$ is a function with variables $(x, y)$, and each element of $\Phi_j$ is a function with variable $x$.
Let $\mathcal{D}^x_t = \{ x^t_i \}_{i=1}^{n_t}$ be the set of target inputs.
By Theorem 2 of~\cite{golowich2018size}, for all $j = 1, 2, \ldots, d_t$, we have:
\[
\widehat R_{\mathcal{D}^x_t}(\Phi_j) \leq 2 d_t M^{L+1} H^L \sqrt{\frac{L+1+ \ln d}{n_t}}.
\]
We note that for any $i = 1, 2, \ldots, n_t$, the function $r_i(a) = (a - y_i^t)^2$ mapping from ${ a \in [- d M^{L+1} H^L, d M^{L+1} H^L]}$ to $\mathbb{R}$ is Lipschitz with constant $4 d M^{L+1} H^L$. Thus, applying the Contraction Lemma (Lemma 26.9 in~\cite{shalev2014understanding}), we obtain:
\[
\widehat R_{\mathcal{D}_t}(\mathcal{F}_j) \le 4 d M^{L+1} H^L \widehat R_{\mathcal{D}^x_t}(\Phi_j) \le 8 d d_t M^{2L+2} H^{2L} \sqrt{\frac{L+1+ \ln d}{n_t}}.
\]

Therefore,
\[
\widehat R_{\mathcal{D}_t}(\mathcal{F}) \le \sum_{j=1}^{d_t} \widehat R_{\mathcal{D}_t}(\mathcal{F}_j) \le 8 d d_t^2 M^{2L+2} H^{2L} \sqrt{\frac{L+1+ \ln d}{n_t}}.
\]

Using this inequality, the result of Lemma~\ref{lemma:uniform} follows from Theorem 26.5 in~\cite{shalev2014understanding}.
\end{proof}

To prove Theorem~\ref{thm:generalization}, we apply Lemma~\ref{lemma:empirical} in the main paper and Lemma~\ref{lemma:uniform} above for the transferred target model $(w^*, k^*)$. Thus, for any $\lambda \ge 0$ and $\delta > 0$, with probability at least $1 - \delta$, we have:
\begin{align*}
\mathcal{T}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) &\le - \mathcal{L} (w^*, k^*; \mathcal{D}_t)\\
&\le - \mathcal{R}(w^*, k^*) + C(d, d_t, M, H, L, \delta)/\sqrt{n_t}\\
&= \mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) + C(d, d_t, M, H, L, \delta)/\sqrt{n_t}.
\end{align*}

Therefore, Theorem~\ref{thm:generalization} holds.



\subsection{Proof of Lemma~\ref{lemma:empirical_same_input}}
\label{proof:lemma:empirical_same_input}

Note that
$\displaystyle A^*_\lambda, b^*_\lambda = \argmin_{A, b} \left\{ \frac{1}{n} \sum_{i=1}^n \| y^t_i - A y^s_i - b \|^2 + \lambda \|A \|_F^2 \right\}.$

For all $k$, we have:
%
\begin{align*}
  \sqrt{\mathcal{L} (w^*, k^*; \mathcal{D}_t)} 
  &\le \sqrt{\mathcal{L} (w^*, k; \mathcal{D}_t)} \tag{definition of $k^*$} \\
  &= \left[ \frac{1}{n} \sum_{i=1}^n \| y^t_i - k(w^*(x_i)) \|^2 \right]^{1/2} \tag{definition of $\mathcal{L}$} \\
  &\le \left[ \frac{1}{n} \sum_{i=1}^n \| y^t_i - A^*_\lambda y^s_i - b^*_\lambda \|^2 \right]^{1/2} + \left[ \frac{1}{n}\sum_{i=1}^n \| A^*_\lambda y^s_i + b^*_\lambda - k(w^*(x_i))\|^2 \right ]^{1/2} \tag{triangle inequality} \\
  &\le \sqrt{- \widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)} + \left[ \frac{1}{n} \sum_{i=1}^n \| A^*_\lambda y^s_i + b^*_\lambda - k(w^*(x_i)) \|^2 \right ]^{1/2}. \tag{definition of $\widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}$}
\end{align*}

Picking $k(\cdot) = A^*_\lambda h^*(\cdot) + b^*_\lambda$, this inequality becomes:
%
\begin{align*}
  \sqrt{\mathcal{L} (w^*, k^*; \mathcal{D}_t)} 
  &\le \sqrt{- \widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)} + \left[ \frac{1}{n} \sum_{i=1}^n \| A^*_\lambda [y^s_i - h^*(w^*(x_i))]\|^2\right]^{1/2} \\
  &\le \sqrt{- \widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)} + \| A^*_\lambda\|_F \left[ \frac{1}{n}\sum_{i=1}^n \|y^s_i - h^*(w^*(x_i))\|^2\right]^{1/2} \\
  &= \sqrt{- \widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t)} + \|A^*_\lambda\|_F \sqrt{\mathcal{L} (w^*, h^*; \mathcal{D}_s)}.
\end{align*}

Note that if $0 \le a \le b + c$, then $a^2 \le 2b^2 + 2c^2$. Applying this fact to the above inequality, we have:
%
\[ \mathcal{L} (w^*, k^*; \mathcal{D}_t) \le - 2 \widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) + 2 \|A^*_\lambda\|^2_F \mathcal{L} (w^*, h^*; \mathcal{D}_s). \]

Thus, Lemma~\ref{lemma:empirical_same_input} holds.



\subsection{Proof of Theorem~\ref{thm:generalization_same_input}}
\label{proof:thm:generalization_same_input}

For any $\lambda \geq 0$ and $\delta > 0$, applying Lemma~\ref{lemma:uniform} for $(w^*, k^*)$ and Lemma~\ref{lemma:empirical_same_input}, with probability at least $1 - \delta$:
%
\begin{align*}
  \mathcal{R} (w^*, k^*) &\le \mathcal{L} (w^*, k^*; \mathcal{D}_t) + C(d, d_t, M, H, L, \delta)/\sqrt{n} \\
  &\le - 2 \widehat{\mathcal{T}}^{\mathrm{lab}}_{\lambda}(\mathcal{D}_s, \mathcal{D}_t) + 2 \|A^*_\lambda\|^2_F ~ \mathcal{L} (w^*, h^*; \mathcal{D}_s) + C(d, d_t, M, H, L, \delta)/\sqrt{n}.
\end{align*}

Since $\mathrm{Tr}(\mathcal{D}_s, \mathbb{P}_t) = -\mathcal{R} (w^*, k^*)$, Theorem~\ref{thm:generalization_same_input} holds.



\section{More details for experiment settings}

\subsection{More details for Sections~\ref{exp:different_input}--\ref{sec:lambda_exp}}
\label{appendix:experiment_settings_1}

For these experiments, we train our source models from scratch using the MSE loss with the AdamW optimizer~\citep{loshchilov2018decoupled}, which we run for 40 epochs with batch size of 64 and the cosine learning rate scheduler. To obtain good source models, we resize all input images to 256$\times$256 and apply basic image augmentations without horizontal flipping (i.e., affine transformation, Gaussian blur, and color jitter). We also scale all labels into $[0, 1]$ using the width and height of the input images.

For the transfer learning setting with head re-training, we freeze the trained feature extractor and re-train the regression head on the target dataset using the same setting above, except that we run 15 epochs on the CUB-200-2011 dataset and 30 epochs on the OpenMonkey dataset. For half fine-tuning, we unfreeze the last convolution layer and the head classifier since the number of trainable parameters is around half of the total number of parameters. For full fine-tuning, we unfreeze the whole network. In these two fine-tuning settings, we fine-tune for 15 epochs on both datasets. We use PyTorch~\citep{paszke2019pytorch} for implementation.


\subsection{More details for Section~\ref{sec:beyond_regression}}
\label{appendix:experiment_settings_2}

For this experiment, we use the following 8 ImageNet pre-trained models as the source models: ResNet50, ResNet101, ResNet152~\citep{he2016deep}, DenseNet121, DenseNet169, DenseNet201~\citep{huang2017densely}, GoogleNet~\citep{szegedy2015going}, and Inceptionv3~\citep{szegedy2016rethinking}. These models are taken from the PyTorch Model Zoo. 

We use the dSprites dataset~\citep{matthey2017dsprites} for the target task. This dataset contains 737,280 images with 4 outputs for regression: x and y positions, scale, and orientation. The train-test split is similar to the settings in~\cite{you2021logme}: 60\% for training, 20\% for validation, and 20\% for testing. The transferred MSE is computed on the test set. We train our models with 10 epochs using the AdamW optimizer. The initial learning rate is $10^{-3}$, which is divided by 10 every 3 epochs.



\section{Additional experiment results}

\subsection{Usefulness of theoretical bounds}
\label{appendix:tightness_bounds}

Although the theoretical bounds in Section~\ref{sec:theory} show the relationships between the transferability of the optimal transferred model and our transferability estimators, these bounds could be loose in practice unless the number of samples is large. This is in fact a limitation of this type of generalization bounds. To show the usefulness of our bounds in practice, we conduct an experiment to investigate the generalization gap using the head re-training setting in Section~\ref{exp:different_input}. 

The generalization gap is defined as the \emph{difference between our transferability score and the negative MSE (the transferability) of the transferred model}. According to our theorems, this generalization gap is bounded above by the complexity term. We will compare the generalization gap with the absolute value of our transferability score and also inspect whether it has any significant correlation with the actual transferred MSE.

From this experiment, the ratios between the absolute value of transferability score and the generalization gap for our transferability estimators are: 1.6 (LinMSE0), 2.0 (LinMSE1), 2.3 (LabMSE0), and 2.3 (LabMSE1). These results show that the transferability scores dominate the generalization gap in practice. More importantly, there is \emph{no significant correlation} between the generalization gap and the actual transferred MSE. These findings indicate that the complexity term in our bounds may have little effect on transferability estimation, as opposed to the transferability score term that has a strong effect (shown by the high correlations in our main experiments).




\begin{table*}[t]
\caption{{\bf Kendall's-$\tau$ correlation coefficients when transferring from OpenMonkey to CUB-200-2011}. Bold numbers indicate best results in each row. Asterisks (*) indicate best results among the corresponding label-based or feature-based methods. Our estimators improve up to 28.4\% in comparison with SotA (LogME) while being 13\% better on average.}
\centering
\small
\begin{tabular}{ccccccccc}
\toprule
\multirow{2}{*}{Transfer setting} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
& LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0 & LinMSE1 \\
\midrule
Head re-training & 0.728 & 0.028 & ~~\textbf{0.935}* & 0.924 & 0.906 & 0.104 & 0.896 & 0.922*\\
Half fine-tuning & 0.525 & 0.392 & 0.644 & ~~0.646* & 0.651 & 0.291 & ~~\textbf{0.667}* & 0.646\\
Full fine-tuning & 0.497 & 0.289 & ~~0.606* & 0.594 & 0.611 & 0.328 & ~~{\bf 0.616}* & 0.594 \\
\bottomrule
\end{tabular}
\label{tab:kendall}
\end{table*}


\begin{table*}[t]
\caption{{\bf Spearman correlation coefficients when transferring from OpenMonkey to CUB-200-2011}. Bold numbers indicate best results in each row. Asterisks (*) indicate best results among the corresponding label-based or feature-based methods. Our estimators improve up to 19.9\% in comparison with SotA (LogME) while being 9.7\% better on average.}
\centering
\small
\begin{tabular}{ccccccccc}
\toprule
\multirow{2}{*}{Transfer setting} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
& LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0 & LinMSE1 \\
\midrule
Head re-training & 0.857 & 0.102 & ~~\textbf{0.994}* & 0.991 & 0.988 & 0.215 & 0.984 & ~~0.990*\\
Half fine-tuning & 0.726 & 0.409 & 0.857 & ~~0.858* & 0.857 & 0.437 & ~~\textbf{0.865}* & 0.858\\
Full fine-tuning & 0.689 & 0.433 & ~~0.826* & 0.823 & ~~\textbf{0.827}* & 0.474 & ~~\textbf{0.827}* & 0.823 \\
\bottomrule
\end{tabular}
\label{tab:spearman}
\end{table*}

\begin{table*}[t]
\caption{{\bf Correlation coefficients when transferring between 10d-output tasks from OpenMonkey to CUB-200-2011}. Bold numbers indicate best results in each row. Asterisks (*) indicate best results among the corresponding label-based or feature-based methods. All correlations are statistically significant with $p<0.001$. Our estimators with both $\lambda$ values are better than SotA (LogME).}
\centering
\small
\begin{tabular}{ccccccccc}
\toprule
\multirow{2}{*}{Transfer setting} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
& LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0  & LinMSE1 \\
\midrule
Head re-training & 0.970 & 0.719 & 0.991* & 0.989 & 0.968 & 0.656 & 0.990 & ~~\textbf{0.995}*\\
Half fine-tuning & 0.944 & 0.742 & 0.963* & 0.943 & 0.954 & 0.684 & ~~\textbf{0.980}* & 0.958\\
Full fine-tuning & 0.878 & 0.736 & 0.892* & 0.863 & 0.892 & 0.669 & ~~\textbf{0.916}* & 0.881\\
\bottomrule
\end{tabular}
\label{tab:different_input_high_dim}
\end{table*}


\subsection{Additional results for Section~\ref{exp:different_input}}
\label{appendix:high_dim_exp_1}

\minisection{Detailed correlation plots for Table~\ref{tab:different_input}.}
In Figures~\ref{fig:different_input_head_rt},~\ref{fig:different_input_half_ft}, and~\ref{fig:different_input_full_ft}, we show the detailed correlation plots and $p$-values for our experiment results reported in Table~\ref{tab:different_input} of the main paper. From these plots, all correlations are statistically significant with $p < 0.001$, except for TransRate and LabTransRate with head re-training.

\minisection{Additional results with non-linear correlation metrics.}
In Tables~\ref{tab:kendall} and~\ref{tab:spearman}, we report the Kendall's-$\tau$ and Spearman correlation coefficients to complement the results in Table~\ref{tab:different_input} of the main paper. These coefficients, as described in~\cite{bolya2021scalable}, are used to assess the ranking associations or the monotonic relationships between the transferability measures and the model performance. Based on the findings presented in these tables, our proposed scores are generally on par with or outperform the current state-of-the-art (SotA) approach, LogME~\citep{you2021logme}, with an average correlation improvement of 9.7\% and 13\% for Spearman and Kendall's-$\tau$ coefficients, respectively. This serves as strong evidence illustrating the effectiveness of our proposed measures, not only in the linear relationship assessment, but also in the non-linear one.

\minisection{Additional result with high-dimensional labels.}
Using the setting in Section~\ref{exp:different_input}, we also conducted an additional experiment where both source and target tasks have 10-dimensional labels. In particular, we train a source model to predict five OpenMonkey keypoints: \emph{right eye}, \emph{left eye}, \emph{nose}, \emph{head}, and \emph{neck} simultaneously (i.e., this source model returns a 10-dimensional output). The source model is then transferred to a target task that predicts a combination of five CUB-200-2011 keypoints. We consider each combination of 5 keypoints among 10 CUB-200-2011 keypoints as a target task, resulting in 252 target tasks that all have 10-dimensional labels.

We also run 3 transfer learning algorithms: head re-training, half fine-tuning, and full fine-tuning, using the same training settings as in Section~\ref{exp:different_input}. For TransRate and LabTransRate, we use 2 bins per dimension instead of 5 bins to reduce the computational costs. The results for this experiment are reported in Table~\ref{tab:different_input_high_dim}. From these results, our approaches are better than the baselines for both $\lambda$ values.



\subsection{Additional results for Section~\ref{sec:exp_shared_inputs}}
\label{appendix:high_dim_exp_2}


\begin{table*}[t]
\caption{{\bf Correlation coefficients when transferring from 2d-output tasks to 10d-output tasks on CUB-200-2011}. Bold numbers indicate best results in each row. Asterisks (*) indicate best results among the corresponding label-based or feature-based methods. Except for TransRate with half and full fine-tuning, all correlations are statistically significant with $p<0.001$. Our estimators are better than SotA (LogME) in most cases.}
\centering
\small
\begin{tabular}{ccccccccc}
\toprule
\multirow{2}{*}{Transfer setting} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-9}
& LabLogME & LabTransRate & LabMSE0  & LabMSE1  & LogME & TransRate & LinMSE0  & LinMSE1 \\
\midrule
Head re-training & 0.602 & 0.632 & ~~0.868* & 0.816 & 0.885 & 0.549 & 0.901 & ~~{\bf 0.973}*\\
Half fine-tuning & 0.491 & 0.645 & 0.771 & ~~0.881* & 0.804 & 0.072 & ~~{\bf 0.913}* & 0.818\\
Full fine-tuning & 0.397 & 0.632 & 0.727 & ~~{\bf 0.888*} & 0.756 & 0.050 & ~~0.884* & 0.833\\
\bottomrule
\end{tabular}
\label{tab:shared_input_high_dim}
\end{table*}


\minisection{Detailed correlation plots for Table~\ref{tab:shared_input}.}
In Figures~\ref{fig:shared_input_cub_head_rt}--\ref{fig:shared_input_openmonkey_full_ft}, we show the detailed correlation plots and $p$-values for our experiment results reported in Table~\ref{tab:shared_input} of the main paper. From these plots, all correlations are statistically significant with $p < 0.001$, except for TransRate and LabTransRate as well as the full fine-tuning setting on the CUB-200-2011 dataset.

\minisection{Additional result for each individual source task.}
We report in Tables~\ref{tab:full_all_sources_cub} and~\ref{tab:full_all_sources_openmonkey} more comprehensive results for all source tasks on CUB-200-2011 and OpenMonkey respectively. Each row of the tables corresponds to one source task and shows the correlation coefficients when transferring to all other tasks in the respective dataset. From the tables, our transferability estimators are consistently better than LogME, LabLogME, TransRate, and LabTransRate for most source tasks on both datasets. These results confirm the effectiveness of our proposed methods.

\minisection{Additional result with high-dimensional labels.}
In this additional experiment, we further show the effectiveness of our proposed methods when the target tasks have higher dimensional labels. In particular, we transfer from 4 source tasks on CUB-200-2011 (\emph{back}, \emph{beak}, \emph{belly}, and \emph{breast}) to all the combinations of 5 attributes among the remaining tasks (except for \emph{right eye}, \emph{right leg}, and \emph{right wing}, which may not always be available in the data). In total, we have 224 source-target pairs, where the source tasks have 2-dimensional labels and the target tasks have 10-dimensional labels. We use the same training settings as in Section~\ref{sec:exp_shared_inputs} of the main paper, except that we also use 2 bins per dimension when calculating TransRate and LabTransRate to reduce computational costs. Table~\ref{tab:shared_input_high_dim} reports the results for this experiment. These results clearly show that our methods, LinMSE0 and LinMSE1, are better than the LogME and TransRate baselines in most cases.


\begin{figure*}[h]
\captionsetup[subfigure]{justification=centering}
    % For head re-training
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/transrate.pdf}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/linmse_0.0.png}
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/labtransrate.pdf}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c1_new/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with head re-training from OpenMonkey to CUB-200-2011.}
    \label{fig:different_input_head_rt}
    
    {\vskip 0.4cm}
    
    % For half fine-tuning
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c2_new/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c2_new/transrate.pdf}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c2_new/linmse_0.0.png}
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c2_new/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
\includegraphics[width=\textwidth]{figures/supp/c2_new/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c2_new/labtransrate.pdf}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c2_new/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c2_new/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with half fine-tuning from OpenMonkey to CUB-200-2011.}
    \label{fig:different_input_half_ft}
    
    {\vskip 0.4cm}
    
    % For full fine-tuning
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c3_new/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c3_new/transrate.pdf}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c3_new/linmse_0.0.png}
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c3_new/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c3_new/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c3_new/labtransrate.pdf}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
\includegraphics[width=\textwidth]{figures/supp/c3_new/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c3_new/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with full fine-tuning from OpenMonkey to CUB-200-2011.}
    \label{fig:different_input_full_ft}
\end{figure*}


\begin{figure*}[htbp]
\captionsetup[subfigure]{justification=centering}
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/transrate.png}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/linmse_0.0.png}
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/labtransrate.png}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c4/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with head re-training between any two different keypoints (with shared inputs) on CUB-200-2011.}
    \label{fig:shared_input_cub_head_rt}
    
    {\vskip 0.4cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c5/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c5/transrate.png}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c5/linmse_0.0.png}    
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c5/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c5/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c5/labtransrate.png}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
\includegraphics[width=\textwidth]{figures/supp/c5/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c5/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with half fine-tuning between any two different keypoints (with shared inputs) on CUB-200-2011.}
    \label{fig:shared_input_cub_half_ft}

    {\vskip 0.4cm}

    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c6/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c6/transrate.png}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c6/linmse_0.0.png}    
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c6/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c6/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c6/labtransrate.png}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
\includegraphics[width=\textwidth]{figures/supp/c6/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c6/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with full fine-tuning between any two different keypoints (with shared inputs) on CUB-200-2011.}
    \label{fig:shared_input_cub_full_ft}
\end{figure*}


\begin{figure*}[htbp]
\captionsetup[subfigure]{justification=centering}
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c7/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c7/transrate.pdf}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c7/linmse_0.0.png}    
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c7/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c7/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c7/labtransrate.pdf}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
\includegraphics[width=\textwidth]{figures/supp/c7/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c7/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with head re-training between any two different keypoints (with shared inputs) on OpenMonkey.}
    \label{fig:shared_input_openmonkey_head_rt}
    
    {\vskip 0.4cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/transrate.pdf}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/linmse_0.0.png}    
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/labtransrate.pdf}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c8/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with half fine-tuning between any two different keypoints (with shared inputs) on OpenMonkey.}
    \label{fig:shared_input_openmonkey_half_ft}
    
    {\vskip 0.4cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c9/logme.png}
    \caption{LogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c9/transrate.pdf}
    \caption{TransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c9/linmse_0.0.png}    
    \caption{LinMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c9/linmse_1.0.png}
    \caption{LinMSE1}
    \end{subfigure}
    
    {\vskip 0.2cm}
    
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c9/lablogme.png}
    \caption{LabLogME}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c9/labtransrate.pdf}
    \caption{LabTransRate}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
\includegraphics[width=\textwidth]{figures/supp/c9/labmse_0.0.png}
    \caption{LabMSE0}
    \end{subfigure}
    {\hskip 4pt}
    %
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/supp/c9/labmse_1.0.png}
    \caption{LabMSE1}
    \end{subfigure}
    %
    {\vskip -0.2cm}
    \caption{\textbf{Correlation coefficients and $p$-values between transferability estimators and negative test MSEs} when transferring with full fine-tuning between any two different keypoints (with shared inputs) on OpenMonkey.}
    \label{fig:shared_input_openmonkey_full_ft}
\end{figure*}


\begin{table*}[ht]
\caption{\textbf{Correlation coefficients for all source tasks} on CUB-200-2011. Bold numbers indicate the best result in each row. Asterisks (*) indicate the best result within each method group (label-based or feature-based).}
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{cccccccccc}
\toprule
\multirow{3}{*}{\parbox{1.4cm}{Transfer setting}} & \multirow{3}{*}{Source task} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){3-6} \cmidrule(lr){7-10}
& & LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0  & LinMSE1 \\
\midrule
\multirow{15}{*}{\parbox{1.4cm}{Head re-training}} & Back & 0.743 & 0.116 & 0.956 & \textbf{0.966*} & 0.920 & 0.273 & 0.931 & 0.964*\\ 
& Beak & 0.863 & 0.229 & 0.922* & 0.915 & 0.878 & 0.158 & 0.906 & \textbf{0.945*}\\ 
& Belly & 0.892 & 0.097 & 0.970 & \textbf{0.982*} & 0.933 & 0.188 & 0.932 & \textbf{0.982*}\\ 
& Breast & 0.915 & 0.120 & 0.935 & 0.945* & 0.903 & 0.279 & 0.922 & \textbf{0.961*}\\ 
& Crown & 0.917 & 0.041 & 0.962 & 0.966* & 0.913 & 0.251 &0.945 & \textbf{0.979*}\\ 
& Forehead & 0.888 & 0.076 & 0.941* & 0.939 & 0.885 & 0.221 & 0.924 & \textbf{0.966*}\\ 
& Left eye & 0.035 & 0.076 & 0.913 & 0.964* & 0.924 & 0.289 & 0.945 & \textbf{0.969*}\\ 
& Left leg & 0.261 & 0.221 & 0.935 & \textbf{0.975*} & 0.935 & 0.223 & 0.953 & \textbf{0.975*}\\ 
& Left wing & 0.260 & 0.170 & 0.964 & \textbf{0.994*} & 0.980 & 0.173 & \textbf{0.994*} & \textbf{0.994*}\\ 
& Nape & 0.889 & 0.085 & 0.922 & 0.942* & 0.900 & 0.300 & 0.929 & \textbf{0.953*}\\
& Right eye & 0.625 & 0.242 & 0.904 & 0.974* & 0.921 & 0.244 & 0.948 & \textbf{0.975*}\\ 
& Right leg & 0.508 & 0.047 & 0.958 & 0.989* & 0.942 & 0.217 & 0.954 & \textbf{0.990*}\\ 
& Right wing & 0.521 & 0.167 & 0.907 & 0.979* & 0.935 & 0.270 & 0.946 & \textbf{0.980*}\\ 
& Tail & 0.591 & 0.392 & 0.900 & \textbf{0.927*} & 0.872 & 0.544 & 0.880 & 0.890*\\ 
& Throat & 0.896 & 0.124 & 0.938 & 0.941* & 0.890 & 0.291 & 0.924 & \textbf{0.956*}\\ 
\midrule
\multirow{15}{*}{\parbox{1.4cm}{Half fine-tuning}} 
& Back & 0.714 & 0.076 & 0.791 & 0.814* & 0.835 & 0.168 & \textbf{0.911*} & 0.873\\ 
& Beak & 0.663 & 0.160 & 0.831* & 0.772 & 0.765 & 0.076 & 0.883 & \textbf{0.899*}\\ 
& Belly & 0.528 & 0.233 & 0.655 & 0.752* &0.758 & 0.309 & \textbf{0.849*} &0.764\\ 
& Breast & 0.730 & 0.100 & 0.802* & 0.779 & 0.762 & 0.152 & \textbf{0.867*} & 0.850\\
& Crown & 0.644 & 0.068 & 0.752 & 0.776* & 0.714 & 0.165 & \textbf{0.832*} & 0.816\\
& Forehead & 0.654 & 0.032 & 0.804* & 0.786 & 0.727 & 0.120 & 0.859 & \textbf{0.873*} \\ 
& Left eye & 0.420 & 0.046 & \textbf{0.913*} & 0.853 & 0.812 & 0.227 & 0.892* &0.865\\ 
& Left leg & 0.121 & 0.095 & 0.721 & 0.819* & 0.845 & 0.150 & \textbf{0.893*} & 0.832\\
& Left wing & 0.352 & 0.150 & \textbf{0.949*} & 0.918 & 0.859 & 0.189 & 0.919* & 0.918\\
& Nape& 0.660 & 0.055 & 0.705 & 0.770* & 0.751& 0.181 & \textbf{0.863*} & 0.802\\ 
& Right eye & 0.561 & 0.221 & \textbf{0.911*} & 0.873 & 0.786 & 0.180 & 0.871 & 0.890*\\ 
& Right leg & 0.268 & 0.125 & 0.690 & 0.804* & 0.810 & 0.069 & \textbf{0.861*} & 0.820\\ 
& Right wing & 0.407 & 0.133 & 0.495 & 0.613* & 0.516 & 0.338 & 0.521 & \textbf{0.617*}\\ 
& Tail & 0.801 & 0.117 & 0.930* & 0.812 & 0.848 & 0.285 & 0.924 & \textbf{0.968*}\\ 
& Throat & 0.767 & 0.013 & 0.870* & 0.810 & 0.811 & 0.253 & \textbf{0.900*} & 0.873\\ 
\midrule
\multirow{15}{*}{\parbox{1.4cm}{Full fine-tuning}} & Back & 0.710 & 0.085 & 0.785 & 0.808* & 0.829 & 0.178 & \textbf{0.906*} & 0.868\\ 
& Beak & 0.659 & 0.161 & 0.826* & 0.780 & 0.758 & 0.073 & 0.877 & \textbf{0.899*}\\ 
& Belly & 0.645 & 0.273 & 0.782 & 0.847* & 0.862 & 0.365 & \textbf{0.926*} & 0.856 \\ 
& Breast & 0.740 & 0.104 & 0.811* & 0.791 & 0.768 & 0.152 & \textbf{0.871*} & 0.859\\ 
& Crown & 0.647 & 0.073 & 0.756 & 0.784* & 0.717 & 0.157 & \textbf{0.834*} & 0.821 \\ 
& Forehead & 0.648 & 0.037 & 0.799* & 0.783 & 0.723 & 0.111 & 0.855 & \textbf{0.869*} \\ 
& Left eye & 0.224 & \textbf{0.456*} & 0.297 & 0.347 & 0.333* & 0.246 & 0.282 & 0.326\\
& Left leg & 0.057 & 0.067 & 0.659 & 0.769* & 0.796 & 0.146 & \textbf{0.850*} & 0.783\\
& Left wing & 0.342 & 0.159 & \textbf{0.954*} & 0.915 & 0.860 & 0.195 & 0.920* & 0.914\\
& Nape & 0.667 & 0.041 & 0.713 & 0.779* & 0.752 & 0.177 & \textbf{0.864*} & 0.810 \\ 
& Right eye & 0.549 & 0.213 & \textbf{0.915*} & 0.876 & 0.794 & 0.199 & 0.877 & 0.893* \\ 
& Right leg & 0.237 & 0.377 & 0.673 & 0.692* & 0.755 & 0.431 & \textbf{0.766*} & 0.693\\
& Right wing & \textbf{0.254*} & 0.046 & 0.237 & 0.223 & 0.225 & 0.093 & 0.227* & 0.220\\
& Tail & 0.803 & 0.122 & 0.930* & 0.818 & 0.846 & 0.288 & 0.923 & \textbf{0.969*}\\
& Throat & 0.665 & 0.027 & 0.801* & 0.779 & 0.744 & 0.256 & \textbf{0.850*} & 0.834\\ 
\bottomrule
\end{tabular}
}
\label{tab:full_all_sources_cub}
\end{table*}


\begin{table*}[ht]
\caption{\textbf{Correlation coefficients for all source tasks} on OpenMonkey. Bold numbers indicate the best result in each row. Asterisks (*) indicate the best result within each method group (label-based or feature-based).}
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{cccccccccc}
\toprule
\multirow{3}{*}{\parbox{1.4cm}{Transfer setting}} & \multirow{3}{*}{Source task} & \multicolumn{4}{c}{Label-based method} & \multicolumn{4}{c}{Feature-based method} \\
\cmidrule(lr){3-6} \cmidrule(lr){7-10}
& & LabLogME & LabTransRate & LabMSE0 & LabMSE1 & LogME & TransRate & LinMSE0 & LinMSE1 \\
\midrule
\multirow{17}{*}{\parbox{1.4cm}{Head re-training}} 
& Right eye & 0.894 & 0.859 & \textbf{0.986*} & 0.835 & 0.918 & 0.846 & 0.978 & \textbf{0.986*}\\ 
& Left eye & 0.895 & 0.854 & \textbf{0.987*} & 0.838 & 0.868 & 0.858 & 0.981 & \textbf{0.987*}\\
& Nose & 0.908 & 0.849 & 0.988* & 0.849 & 0.818 & 0.837 & 0.978 & \textbf{0.989*}\\ 
& Head & 0.941 & 0.881 & \textbf{0.992*} & 0.821 & 0.897 & 0.884 & 0.983* & 0.978\\ 
& Neck & 0.972 & 0.862 & \textbf{0.998*} & 0.887 & 0.932 & 0.839 & 0.982 & 0.987*\\
& Right shoulder & 0.977 & 0.837 & \textbf{0.994*} & 0.891 & 0.842 & 0.811 & 0.982* & 0.980\\ 
& Right elbow & 0.963 & 0.529 & \textbf{0.994*} & 0.940 & 0.469 & 0.564 & 0.969 & 0.990*\\ 
& Right wrist & 0.970 & 0.753 & \textbf{0.993*} & 0.939 & 0.615 & 0.446 & 0.963 & 0.990*\\
& Left shoulder & 0.972 & 0.800 & \textbf{0.997*} & 0.915 & 0.823 & 0.808 & 0.988* & 0.988*\\
& Left elbow & 0.960 & 0.546 & \textbf{0.994*} & 0.948 & 0.711 & 0.572 & 0.969 & 0.989*\\
& Left wrist & 0.975 & 0.597 & \textbf{0.993*} & 0.951 & 0.964 & 0.544 & 0.963 & \textbf{0.993*}\\ 
& Hip & 0.922 & 0.540 & 0.989* & 0.325 & 0.874 & 0.557 & 0.800 & \textbf{0.991*}\\ 
& Right knee & 0.925 & 0.080 & 0.975* & 0.850 & 0.766 & 0.331 & 0.945 & \textbf{0.993*}\\ 
&Right ankle & 0.931 & 0.411 & 0.989* & 0.770 & 0.737 & 0.371 & 0.930 & \textbf{0.997*}\\ 
&Left knee & 0.923 & 0.160 & 0.978* & 0.848 & 0.692 & 0.209 & 0.936 & \textbf{0.994*}\\ 
&Left ankle & 0.916 & 0.416 & 0.986* & 0.775 & 0.852 & 0.329 & 0.925 & \textbf{0.998*}\\ 
&Tail & 0.936 & 0.712 & \textbf{0.993*} & 0.312 & 0.821 & 0.662 & 0.897 & 0.990*\\ 
\midrule
\multirow{17}{*}{\parbox{1.4cm}{Half fine-tuning}} 
& Right eye & 0.795 & 0.734 & 0.906* & 0.883 & 0.835 & 0.709 & \textbf{0.963*} & 0.923 \\ 
&Left eye & 0.797 & 0.731 & 0.905* & 0.879 & 0.771 & 0.719 & \textbf{0.960*} & 0.918 \\ 
& Nose & 0.829 & 0.736 &0.914* &0.872 & 0.649 & 0.721 & \textbf{0.968*} & 0.916 \\ 
& Head & 0.835 & 0.759 & 0.921* & 0.882 & 0.804 & 0.751 & \textbf{0.964*} & 0.928\\ 
& Neck & 0.902 & 0.793 & 0.929* & 0.871 & 0.745 & 0.765 & \textbf{0.969*} &0.915\\ 
& Right shoulder & 0.887 & 0.725 & 0.924* & 0.890 & 0.751 & 0.758 & \textbf{0.972*} & 0.924\\ 
& Right elbow & 0.764 & 0.250 & 0.806 & 0.914* & 0.048 & 0.602 & \textbf{0.931*} & 0.821\\
& Right wrist & 0.806 & 0.501 & 0.823 & 0.903* & 0.172 & 0.643 & \textbf{0.929*} & 0.819\\
& Left shoulder & 0.893 & 0.718 & 0.927* &0.899 & 0.702 & 0.774 & \textbf{0.972*} & 0.930\\ 
&Left elbow& 0.782 & 0.369 & 0.824 & 0.919* & 0.366 & 0.594 & \textbf{0.946*} & 0.839\\ 
&Left wrist & 0.822 & 0.523 & 0.828 & 0.902* & 0.765 & 0.663 & \textbf{0.932*} & 0.824\\ 
&Hip& 0.030 & 0.487 & 0.233 & \textbf{0.910*} & 0.006 & 0.359 & 0.800* & 0.305\\ 
&Right knee& 0.481 & 0.429 & 0.598 & \textbf{0.906*} & 0.186 & 0.067&  0.831* & 0.687\\ 
&Right ankle & 0.357 & 0.275 & 0.534 & \textbf{0.910*} & 0.286 & 0.226 & 0.806* & 0.632\\ 
& Left knee & 0.467 & 0.355 & 0.601 & \textbf{0.899*} & 0.172 & 0.215 & 0.855* & 0.692\\
&Left ankle& 0.331 & 0.242 & 0.530 & \textbf{0.904*} & 0.197 & 0.303 & 0.822* & 0.632\\ 
&Tail & 0.231 & 0.196 & 0.434 & \textbf{0.829*} & 0.160 & 0.121 & 0.729* & 0.494\\ 
\midrule
\multirow{17}{*}{\parbox{1.4cm}{Full fine-tuning}}
&Right eye & 0.796 & 0.711 & 0.905* & 0.894 & 0.821&0.694&\textbf{0.959*}&0.927\\ 
&Left eye& 0.790 & 0.734 & 0.904* & 0.882 & 0.763 & 0.714 & \textbf{0.957*} & 0.921\\ 
& Nose & 0.810 & 0.731 & 0.912* & 0.892 & 0.642 & 0.709 & \textbf{0.960*} &0.932\\ 
&Head& 0.801 & 0.737 & 0.900* & 0.892&0.772&0.718&\textbf{0.947*}&0.920\\ 
&Neck&0.893&0.782&0.930*&0.886&0.755&0.743&\textbf{0.962*}&0.926\\ 
&Right shoulder&0.896&0.722&0.936*&0.908&0.759&0.750&\textbf{0.975*}&0.940\\ 
&Right elbow&0.689&0.168&0.736&0.878*&0.047&0.562&\textbf{0.888*}&0.761\\ 
&Right wrist&0.796&0.505&0.805&0.876*&0.199&0.644&\textbf{0.910*}&0.803\\ 
&Left shoulder&0.872&0.690&0.901*&0.882&0.670&0.762&\textbf{0.955*}&0.903\\ 
&Left elbow&0.726&0.282&0.774&0.904*&0.326&0.538&\textbf{0.914*}&0.797\\ 
&Left wrist&0.787&0.488&0.787&0.868*&0.725&0.672&\textbf{0.903*}&0.785\\ 
&Hip&0.016&0.518&0.173&\textbf{0.894*}&0.038&0.382&0.757*&0.238\\ 
&Right knee&0.391&0.518&0.516&\textbf{0.891*}&0.096&0.141&0.763*&0.614\\ 
&Right ankle&0.246&0.396&0.437&\textbf{0.889*}&0.185&0.340&0.726*&0.546\\ 
&Left knee&0.381&0.448&0.521&\textbf{0.891*}&0.149&0.303&0.789*&0.618\\ 
&Left ankle&0.244&0.297&0.444&\textbf{0.871*}&0.098&0.357&0.751*&0.551\\ 
&Tail&0.105&0.299&0.309&\textbf{0.824*}&0.047&0.212&0.628*&0.372\\ 
\bottomrule
\end{tabular}
}
\label{tab:full_all_sources_openmonkey}
\end{table*}


{\small
\setlength{\bibsep}{3pt}
\bibliography{nguyen_247}
}

\end{document}
