% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{multirow}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
% \externaldocument{uai2023}
\externaldocument{fan_236}



\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography

\usepackage{graphicx}
\usepackage{amssymb,amsmath,amsthm}
\usepackage{epstopdf}
\usepackage{algorithm,algcompatible}
\usepackage{mathtools}
\usepackage{wrapfig}
\usepackage{soul}
\usepackage{tikz}
\usepackage{xspace}
\usepackage{subcaption}
%\usepackage{enumitem}
\usepackage[inline]{enumitem}

\usetikzlibrary{fit,positioning,arrows,automata}

\mathtoolsset{showonlyrefs}

 
% Theorem-like environments. None of the declarations passes a shared-counter
% argument, so each environment is numbered independently (Theorem 1, Lemma 1, ...).
\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
% NOTE(review): `defn` prints the same heading ("Definition") as `definition`
% above but has its own counter; using both in one document would produce two
% different "Definition 1" -- confirm only one of them is actually used.
\newtheorem{defn}{Definition}
\newtheorem{assum}{Assumption}

% Bold shortcuts for vectors and matrices: \b<letter> typesets <letter> in \mathbf.
% They are declared with \def, so an earlier definition of the same name (if any)
% is silently replaced. The extra brace pair keeps each symbol a single math atom,
% so the macros are safe to use directly as sub-/superscripts.
\def\bx{{\mathbf{x}}}
\def\bz{{\mathbf{z}}}
\def\bG{{\mathbf{G}}}
\def\bA{{\mathbf{A}}}
% \bB and \bC previously used the obsolete {\bf ...} switch; they now use
% \mathbf like every other macro in this group.
\def\bB{{\mathbf{B}}}
\def\bC{{\mathbf{C}}}
\def\bH{{\mathbf{H}}}
\def\bR{{\mathbf{R}}}
\def\bP{{\mathbf{P}}}
\def\bS{{\mathbf{S}}}
\def\bX{{\mathbf{X}}}
\def\bJ{{\mathbf{J}}}
\def\bQ{{\mathbf{Q}}}
\def\bK{{\mathbf{K}}}
\def\bU{{\mathbf{U}}}
\def\bV{{\mathbf{V}}}
\def\bF{{\mathbf{F}}}

% Blackboard-bold shortcuts: \m<letter> typesets <letter> in \mathbb.
\newcommand{\mC}{{\mathbb C}}
\newcommand{\mD}{{\mathbb D}}
% Exception to the naming scheme: \mV is calligraphic, i.e. identical to \cV below.
\newcommand{\mV}{{\mathcal{V}}}
\newcommand{\mE}{{\mathbb E}}
\newcommand{\mP}{{\mathbb P}}
\newcommand{\mR}{{\mathbb R}}
\newcommand{\mN}{{\mathbb N}}
\newcommand{\mS}{{\mathbb S}}
% Calligraphic shortcuts: \c<letter> typesets <letter> in \mathcal.
\newcommand{\cB}{{\mathcal B}}
\newcommand{\cC}{{\mathcal C}}
\newcommand{\cD}{{\mathcal D}}
\newcommand{\cE}{{\mathcal E}}
\newcommand{\cF}{{\mathcal F}}
\newcommand{\cG}{{\mathcal G}}
\newcommand{\cH}{{\mathcal H}}
\newcommand{\cI}{{\mathcal I}}
\newcommand{\cJ}{{\mathcal J}}
\newcommand{\cL}{{\mathcal L}}
\newcommand{\cN}{{\mathcal N}}
\newcommand{\cO}{{\mathcal O}}
\newcommand{\cP}{{\mathcal P}}
\newcommand{\cR}{{\mathcal R}}
\newcommand{\cS}{{\mathcal S}}
\newcommand{\cT}{{\mathcal T}}
\newcommand{\cU}{{\mathcal U}}
\newcommand{\cV}{{\mathcal V}}
\newcommand{\cW}{{\mathcal W}}
\newcommand{\cX}{{\mathcal X}}
\newcommand{\cY}{{\mathcal Y}}
\newcommand{\cZ}{{\mathcal Z}}
% Bold digit one.
\newcommand{\one}{{\mathbf 1}}
% Upright "T". \textup (instead of \text) keeps the letter upright even when the
% surrounding text is italic, e.g. inside a theorem body; this matches how the
% upright abbreviations (\OT, \KL, ...) are defined in this preamble.
\newcommand{\tT}{{\textup{T}}}

% Upright abbreviations usable in math mode. \textup keeps them upright
% regardless of the shape of the surrounding text (e.g. italic theorem bodies).
\def\OT{{\textup{OT}}}
\def\KL{{\textup{KL}}}
\def\TV{{\textup{TV}}}
\def\LSI{{\textup{LSI}}}
\def\TI{{\textup{TI}}}
% Angle brackets, e.g. \<x, y\>.
% NOTE(review): \> already has a meaning in LaTeX (tabbing / math spacing);
% this \def replaces it -- confirm nothing in the paper relies on the original.
\def\<{{\langle}}
\def\>{{\rangle}}
% Real coordinate space and the real line.
\def\Rn{{\mathbb{R}^n}}
\def\R{{\mathbb{R}}}
% Upright differential "d" for integrals. Uses \textup rather than \text so the
% letter does not turn italic inside italic environments, consistent with the
% abbreviations above.
% NOTE(review): this replaces LaTeX's \d (dot-under accent); a bibliography entry
% containing \d{...} would no longer typeset correctly -- confirm none does.
\def\d{{\textup{d}}}
% \def\l{{\left}}
% \def\r{{\right}}
% Covariance and variance labels, upright for the same reason as \d.
\newcommand{\Cov}{{\textup{Cov}}}
\newcommand{\Var}{{\textup{Var}}}


% Colored inline notes, one macro per author: the argument is typeset in violet
% (\fan) or orange (\dam). The outer brace group confines the color change.
% NOTE(review): \color is presumably provided by xcolor via the tikz package
% loaded above -- confirm if tikz is ever removed.
\newcommand{\fan}[1]{{\color{violet}{#1}}}
\newcommand{\dam}[1]{{\color{orange}{#1}}}
% Bold digit one; same output as \one defined earlier in this preamble.
\newcommand{\ones}{\mathbf{1}} 

% Paired delimiters: \ceil{x} and \floor{x} wrap their argument in
% \lceil..\rceil and \lfloor..\rfloor respectively.
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\DeclarePairedDelimiter\floor{\lfloor}{\rfloor}

% Starred form: subscripts are set underneath the operator in display math.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Space-saving hack: ask TeX to try to set every paragraph one line shorter.
% NOTE(review): LaTeX reassigns \everypar internally (e.g. after headings and
% list items), so this setting may not survive for the whole document -- confirm
% it has the intended effect.
\everypar{\looseness=-1}


\title{Generating Synthetic Datasets by \\ Interpolating along Generalized Geodesics \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jiaojiaofan@gatech.edu>?Subject=Your UAI 2023 paper}{Jiaojiao Fan}{}}
\author[2]{David Alvarez-Melis}
\affil[1]{
    Georgia Tech\\
    Atlanta, Georgia, USA
}
\affil[2]{%
    Microsoft Research \& Harvard University\\
    Cambridge, Massachusetts, USA
}

\begin{document}

\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle


\appendix
\section{Proofs}\label{sec:proof}
\begin{proof}[Proof of Lemma \ref{lem:convex_w2}]
  By \citet[\S4.4]{santambrogio2017euclidean}, the result holds when $m=2$. Then Proposition 7.5 in \citet{agueh2011barycenters} extends the result to the case of $m>2$.
\end{proof}

\begin{proof}[Proof of Proposition \ref{prop:eq}]
  Since convex combinations preserve cyclical monotonicity, $\sum_{i=1}^m a_i T_i^*(x)$ is the optimal map from $\nu$ to $\rho_a^G$~\citep{mccann1995existence}.
  Then according to the definition of $W_{2, \nu }(\cdot , \cdot ) $, we can write
  \begin{align}\label{eq:pf}
    W_{2, \nu }^2(\rho^G_a, \nu )
    =
    \int \left \|x - \sum_{i=1}^m a_i T_i^*(x) \right\|^2 \d \nu (x).
    % \sum_{i=1}^m a_i W_{2,\nu }^2(\mu_i, \nu ) - \frac{1}{2} \sum_{i \neq j} a_i a_j W_{2,\nu }^2(\mu_i, \mu_j )    
  \end{align}
  For scalars $p , q_1,\ldots, q_m$ and weights satisfying $\sum_{i=1}^m a_i = 1$, it holds that
  \begin{align}
    \left(p - \sum_{i=1}^m a_i q_i \right)^2 & = p^2 + \sum_{i=1}^m a_i^2 q_i^2 - 2  \sum_{i=1}^m a_i p q_i  + \sum_{i\ne j} a_i a_j q_i q_j                           \\
                                             & = p^2 + \sum_{i=1}^m (a_i - a_i \sum_{j \ne i} a_j ) q_i^2 - 2  \sum_{i=1}^m a_i p q_i  + \sum_{i\ne j} a_i a_j q_i q_j \\
                                             & =  \sum_{i=1}^m a_i (p-q_i)^2 - \frac{1}{2} \sum_{i \ne j} a_i a_j (q_i - q_j)^2.
  \end{align}
  Plugging this equality into \eqref{eq:pf} gives
  \begin{align}
    W_{2, \nu }^2(\rho^G_a, \nu )
     & =
    \int \left( \sum_{i=1}^m a_i   \|x -  T_i^*(x)\|^2 - \frac{1}{2} \sum_{i \ne j} a_i a_j \| T_i^*(x) -  T_j^*(x)\|^2 \right) \d \nu (x)         \\
     & =   \sum_{i=1}^m a_i \int  \|x -  T_i^*(x)\|^2 \d \nu (x) - \frac{1}{2} \sum_{i \ne j} a_i a_j \int \| T_i^*(x) -  T_j^*(x)\|^2  \d \nu (x) \\
     & = \sum_{i=1}^m a_i W_{2,\nu }^2(\mu_i, \nu ) - \frac{1}{2} \sum_{i \neq j} a_i a_j W_{2,\nu }^2(\mu_i, \mu_j ).
  \end{align}
\end{proof}
\begin{proof}[Proof of Proposition \ref{prop:metric}]
  Firstly, $\cW_{2,Q}$ is symmetric and nonnegative by definition. It is non-degenerate since $\cW_{2,Q} (P_i,P_j) \ge d_\OT (P_i, P_j) $ and $d_\OT$ is a metric. Finally, we show it satisfies the triangle inequality. Indeed,
  \begin{align}
     & ~~~~~~\cW_{2,Q}(P_1, P_3)                                                                      \\
     & =\left( \int  \| x_1 -  x_3 \|^2 +
    W_2^2(\alpha_{y_1}, \alpha_{y_3})
    % \bar  y_i^\top M \bar y_j
    \d Q(z) \right)^{1/2}                                                                             \\
     & \le \left( \int  (\| x_1 -  x_2 \| + \| x_2 -  x_3 \| )^2 +
    (W_2(\alpha_{y_1}, \alpha_{y_2})
    + W_2(\alpha_{y_2}, \alpha_{y_3})
    )^2
    \d Q(z) \right)^{1/2}                                                                             \\
     & \le \left( \int  \| x_1 -  x_2 \|^2 + W^2_2(\alpha_{y_1}, \alpha_{y_2}) \d Q(z)  \right)^{1/2}
    + \left( \int \| x_2 -  x_3 \|^2 + W^2_2(\alpha_{y_2}, \alpha_{y_3})
    \d Q(z) \right)^{1/2}                                                                             \\
     & = \cW_{2,Q}(P_1, P_2) +  \cW_{2,Q}(P_2, P_3),
  \end{align}
  where the first inequality follows from the triangle inequality (for the Euclidean norm and for $W_2$) and the second inequality is the Minkowski inequality.
\end{proof}
\section{Implementation details of OTDD map}\label{sec:neural_map}

% \subsection{OTDD barycentric projection}
\paragraph{OTDD barycentric projection}
We use the implementation at \url{https://github.com/microsoft/otdd} to solve for the OTDD coupling. The remaining steps are straightforward.
\paragraph{OTDD neural map}
% \fan{extend OTDD  to have soft labels, soft label need to have a weight}
To solve the problem \eqref{eq:max-min}, we parameterize $f, G, \ell$ as three neural networks. In the *NIST experiments, we parameterize $f$ as the ResNet\footnote{\url{https://github.com/harryliew/WGAN-QC}} from WGAN-QC~\citep{liu2019wasserstein}, and take the feature map $G$ to be a UNet\footnote{\url{https://github.com/milesial/Pytorch-UNet}}~\citep{ronneberger2015u}. We generate the labels $\bar y$ with a pre-trained classifier $\ell(\cdot)$, and use a LeNet or a VGG-5 with Spinal layers\footnote{\url{https://github.com/dipuk0506/SpinalNet}}~\citep{kabir2022spinalnet} to parameterize $\ell(\cdot)$. In the 2D Gaussian mixture experiments, we use residual MLPs to represent all of them.

We remove the discriminator's condition on label to simplify the loss function as
\begin{align}
  \sup_f \inf_G  \int \bigl( \underbrace{\| x- G( z)\|_2^2}_\text{feature loss}   + \underbrace{W_2^2(\alpha_{y}, \alpha_{\bar y} )}_\text{label loss} \bigr) \d Q(z) \underbrace{- \int f(\bar x ) \d Q(z)
    + \int f(x') \d P(z') }_\text{discriminator loss}.
\end{align}
% Also the details of experiments.
In this formula, we assume both $y$ and $\bar y$ are hard labels, but in practice, the output of $\ell(\cdot )$ is a soft label. Simply taking the \texttt{argmax} to get a hard label can
break the computational graph,
so we replace the label loss $W_2^2(\alpha_{y}, \alpha_{\bar y} )$ by $y^\top M \bar{y}$, where $y$ is the one-hot label from dataset $Q$.
And $M \in \mR_{\ge 0}^{C_{Q} \times C_{P}}$ is the label-to-label matrix where $M(i,j) := W_2^2(\alpha_{y_i}, \alpha_{y_j}).$
% $M$ is the label-to-label matrix in Definition \ref{def:Q_ds_distance}, 
 The matrix $M$ is precomputed before the training, and is frozen during the training.

We pre-train the feature map $G$ to be an identity map before the main adversarial training. We use the Exponential Moving Average\footnote{\url{https://github.com/fadel/pytorch_ema}} of the trained feature maps as the final feature map.

\paragraph{Data processing} For all the *NIST datasets, we rescale the images to size $32\times 32$, and repeat their channel 3 times and obtain 3-channel images. We use the default train-test split from \texttt{torchvision}.
For the VTAB datasets, we use a masked auto-encoder based on ViT-Large with
196 patches and an embedding dimension of 1024. Including the class token, the final embedding dimension is therefore $197 \times 1024 = 201728$.  We also use the default train-test split from \texttt{torchvision}.

\paragraph{Hyperparameters} For the experimental results in \S\ref{sec:nist}, we use OTDD neural maps and train them using the Adam optimizer with learning rate $10^{-3}$ and batch size 64. We train a LeNet for 2000 iterations, and fine-tune it for 100 epochs. Regarding the comparison with other baselines in \S\ref{sec:nist}, for transfer learning methods, we train a SpinalNet for $10^4$ iterations, and fine-tune it for $2000$ iterations on the test dataset. Training from scratch on the test dataset also takes 2000 iterations. For the results in \S\ref{sec:vtab}, we pre-train the ResNet-18 model for 5 epochs, then fine-tune the model on the few-shot dataset for 10 epochs. During fine-tuning, we keep the whole network trainable. The batch size is 128, and the learning rate is $10^{-3}$.

\section{Discussion of the complexity-accuracy trade-off}

We agree that our method is more computationally demanding than Mixup in general.
Specifically, we consider Mixup and our methods to occupy different points of a compute-accuracy trade-off characterized by the expressivity of the geodesics between datasets they define. That being said, the trade-off is nevertheless not a prohibitive one, as shown by the fact that we can scale our method to VTAB-sized datasets with a very standard GPU setup.

`Vanilla' Mixup with uniform dataset weights is indeed quite cheap (but, as shown in Table~\ref{tab:vtab}, considerably worse than alternatives). On the other hand, the version of Mixup that uses the `optimal' mixture weights (labeled ``Mixup - optimal'' in Table~\ref{tab:vtab}, and the only Mixup version in Table~1) requires solving Eq.~\eqref{eq:ds_eq}, which involves non-trivial computation to obtain OTDD maps. In the context of the trade-off spectrum described above, Mixup with optimal weights is strictly in between vanilla Mixup and OTDD interpolation.

\section{Additional results}\label{sec:nist_maps}


\begin{figure}[h]
  \centering
  %   \vspace{-0.4cm}
  \begin{subfigure}{0.35\textwidth}
    \includegraphics[width=1\linewidth]{images/permute.png}
    % \vspace{-0.5cm}
  \end{subfigure}
  \caption{The numbers above images are the labels. In the first labelling method, all 0 MNIST digits are assigned as class ``0'', and they are labelled as class ``7'' in the bottom labelling.}
  \label{fig:labelling}
\end{figure}
\subsection{OTDD neural map visualization}
% % We present a visualization of the pushforward NIST images of OTDD neural map in Figure \ref{fig:EMNIST}.
% In Figure \ref{fig:EMNIST}, we in addition provide qualitative results of OTDD map from EMNIST (letter)~\citep{cohen2017emnist} dataset to all other *NIST dataset and USPS dataset. At this point, we can confirm three traits of OTDD map, which are mentioned at the end of \S\ref{sec:map}.
% \begin{wrapfigure}[11]{r}{8.6cm}
%   % \begin{figure}[h]
%   \centering
% %   \vspace{-0.4cm}
%   \begin{subfigure}{0.5\textwidth}
%     \includegraphics[width=1\linewidth]{images/permute.png}
%     % \vspace{-0.5cm}
%   \end{subfigure}
%   \caption{The numbers above images are the labels. In the first labelling method, all 0 MNIST digits are assigned as class "0", and they are labelled as class "7" in the bottom labelling.}
%   \label{fig:labelling}
% \end{wrapfigure}
% 1) We don't assume a known source label to target label correspondence. So we can map between two irrelevent datasets such as EMNIST and FashinMNIST.
% % As a result, two datasets can have different number of labels; 
% 2) The map is invariant to the permutation of label assignment. For example, we show two different labelling in Figure \ref{fig:labelling}, and the final OTDD map will be the same.
% 3) It doesn't enforce the label to label mapping but would follow the feature similarity. From Figure \ref{fig:EMNIST} in the appendix, we notice many cross-class mapping behaviors. For example, when the target domain is USPS~\citep{hull1994database} dataset, the lower-case letter "l" is always mapped to digit 1, and the capital letter "L" is mapped to other digits such as 6 or 0 because the map follows the feature similarity.

% \begin{figure}[h!]
%   \centering
%   \begin{subfigure}{0.9\textwidth}
%     \centering
%     \includegraphics[width=1\linewidth]{images/EMNIST.png}
%   \end{subfigure}
%   \caption{The dataset $Q$ is EMNIST (letters). We show all the datasets pushforwarded towards Fashion-MNIST, MNIST, USPS, KMNIST by OTDD map. The OTDD map is solved by neural OT method.}
%   \label{fig:EMNIST}
% \end{figure}

We show the OTDD neural map between 2D Gaussian mixture models with 16 components in Figure~\ref{fig:chess}. This example is special in that the OTDD map has a closed-form solution. The feature map is an identity map and the pushforward label is equal to the corresponding class that has the same conditional distribution $p(x|y)$ as the source label. For example, a sample from the top left corner cluster is still mapped to the top left corner cluster, and the label is changed from blue to orange. This map achieves zero transport cost. Since the transport cost is always non-negative, this map is the optimal OTDD map.
However, \citet{asadulaev2022neural,bunne2022supervised} enforce the mapping to preserve the labels, so with their methods, the blue cluster would still map to the blue cluster. Thus their feature map is highly non-convex and more difficult to learn. We refer to Figure 5 in \citet{asadulaev2022neural} for their performance on the same example. Compared with theirs, our pushforward dataset aligns better with the target dataset.
% conditional distribution $p(y|x)$ of target distribution.

\begin{figure}[h!]
  \centering
  \begin{subfigure}{1\textwidth}
    \centering
    \includegraphics[width=0.85\linewidth]{images/chess.png}
  \end{subfigure}
  \caption{OTDD neural map for 2D Gaussian mixture distributions.}
  \label{fig:chess}
\end{figure}

\subsection{McCann's interpolation between datasets}

Our OTDD map can be extended to generate McCann's interpolation between datasets. We propose an analog of McCann's interpolation \eqref{eq:mccan} in the dataset space. We define McCann's interpolation between datasets $P_0$ and $P_1$ as
\begin{align}
  P^M_t: = ((1-t) {\rm Id} + t \cT^* )\sharp P_0 , \quad t \in [0,1],
\end{align}
where $\cT^*$ is the optimal OTDD map from $P_0$ to $P_1$ and $t$ is the interpolation parameter. The superscript $M$ of $P_t^M$ means McCann. We use the same convex combination method in \S\ref{sec:comb} to obtain samples from $P^M_t$.
Assume $(x_0,y_0) \sim P_0,~ (x_1,y_1) = \cT^*(x_0,y_0)$ and $P_0, P_1$ contain 7, 3 classes respectively, i.e. $y_0 \in \{0,1\}^7, y_1 \in \{0,1\}^3$. Then the combination of features is $x_t = (1-t) x_0 + t x_1 $, and the combination of labels is
\begin{align}
  y_t = (1-t)
  \begin{bmatrix}
    y_0 \\ \mathbf{0}_{3}
  \end{bmatrix}
  + t
  \begin{bmatrix}
    \mathbf{0}_7 \\ y_1
  \end{bmatrix}.
\end{align}
Thus $(x_t, y_t)$ is a sample from $((1-t) {\rm Id} + t \cT^* )\sharp P_0 $. We visualize McCann's interpolation between two Gaussian mixture distributions in Figure~\ref{fig:mccan_2d}. This method can map the labeled data from one dataset to another, and interpolate between them. Thus we can use it to map abundant data from an external dataset to a scarce dataset for data augmentation. For example, in Figure~\ref{fig:mccan_2d_imb}, the target dataset only has 30 samples, but the source dataset has 60000 samples. We learn the OTDD neural map between them and compute their interpolation. We find that $P_1^M$ creates new data out of the domain of the original target distribution, which Mixup~\citep{zhang2018mixup} cannot achieve. Thus, the data from $P_t^M$ for $t$ close to 1.0 can enrich the target dataset, and can potentially be used in data augmentation for classification tasks.
% To deal with the convex combination of data from 

\begin{figure}[h!]
  \centering
  \begin{subfigure}{1\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{images/gmm2-3.png}
  \end{subfigure}
  \caption{McCann's interpolation for 2D labelled datasets. Each color represents a class. When $t \rightarrow 1.0$, the samples within blue classes become fewer and fewer, and finally disappear when $t=1.0$.}
  \label{fig:mccan_2d}
\end{figure}

\begin{figure}[ht!]
  \centering
  \begin{subfigure}{1\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{images/gmm2-3_imbalance.png}
  \end{subfigure}
  \caption{Data augmentation by mapping  an external dataset to a few-shot dataset.
    % by McCann's interpolation.
  }
  \label{fig:mccan_2d_imb}
\end{figure}

\subsection{Correlation study of *NIST experiments}
 A more concrete visualization of the correlation  between $\mathcal{W}^2(P_a, Q)$ and *NIST transfer learning test accuracy is shown in Figure \ref{fig:corr}. Among all datasets, USPS and KMNIST lack correlation. 
 We believe this is caused by (i) small variance in the distances from the pretraining datasets to the target dataset, implying a limited relative diversity of datasets on which to draw, and (ii) (in the case of USPS) a very simple task where baseline accuracy is already very high and hard to improve upon via transfer.
 
\begin{figure*}[ht!]
  \centering
  \begin{subfigure}{1\textwidth}
    \centering
    \includegraphics[width=1\linewidth]{images/corr.jpg}
  \end{subfigure}
  \caption{Pearson correlation between the (averaged) function $\cW^2(P_a, Q)$ and the test accuracy of the fine-tuned model.
    Most datasets present a negative correlation between $\cW^2(P_a, Q)$ and the accuracy.
    When the test dataset is USPS or KMNIST (rightmost two),
    all three training datasets are similarly distant to the test dataset;
    thus, the range of $\cW^2(P_a, Q)$ is not wide enough to show an obvious negative correlation.
    % the accuracy results
    % %   from different training datasets 
    % are rather similar as well.
    %   pre-training cannot improve on them. 
    This explains the nearly zero slope and relatively large $p$-value for those two datasets.
    %   accuracy is low for any training dataset $P_a$, and we observe a weak positive correlation.
    A similar pattern has been observed by \citet[Figure 5(a)]{yeaton2022hierarchical}.
  }
  \label{fig:corr}
\end{figure*}

\subsection{Fine-grained analysis over $\cW^2 (P_a,Q)$ in *NIST experiments}

In Table \ref{tab:w_stat}, we provide a more fine-grained analysis for different aspects of $\mathcal{W}(P_a,Q)$ and their effect on transfer accuracy. To do so, we provide the min, median, range, and standard deviation of $\mathcal{W}(P_a,Q)$ in the table below. In addition, as a proxy for the hardness / best possible gain from transfer learning, we show in the last column \textit{OTDD accuracy} minus \textit{few shot accuracy}, where \textit{OTDD accuracy} and \textit{few shot accuracy} are the mean accuracies in Rows 1 and 4, respectively, in Table \ref{tab:compare}. 

Based on these statistics, we make the following observations on the relation between $\mathcal{W}(P_a,Q)$ and transfer accuracy:

\begin{itemize}
\item The accuracy improvement is strongly driven by $\min_a \mathcal{W}(P_a,Q)$. EMNIST and MNIST have relatively small $\min_a \mathcal{W}(P_a,Q)$ and enjoy the largest improvement margins. On the other hand, FMNIST and KMNIST as $Q$ have the largest $\mathcal{W}(P_a,Q)$ to the other pre-training datasets, and show relatively smaller accuracy gains. In other words, the correlation between distance and accuracy is stronger in the part of the convex dataset polytope that is closest to the target dataset.
\item The strength of the correlation between $\mathcal{W}(P_a,Q)$ and accuracy seems to depend on the \textbf{range} and \textbf{standard deviation} of the former. On the one hand, \textbf{settings with low dynamic range in $\mathcal{W}(P_a,Q)$ (like USPS and KMNIST) make it harder to observe meaningful differences in accuracy}. On the other hand, this indicates that those datasets are roughly (or at least \textbf{more}) equidistant from all pretraining datasets, and therefore any convex combination of them will also be close to equidistant from the target, yielding no visible improvement.
\item Intrinsic task hardness matters. Consider USPS: all pretraining datasets, regardless of distance, seem to yield very similar accuracy on it, and it has the lowest accuracy gain (only $\sim$5\%) among the 5 tasks. But considering that the no-transfer (i.e., 5-shot) accuracy is already almost 81\%, it is clear that the benefit from transfer learning is ``a~priori'' limited, and therefore all pretraining datasets yield a similar minor improvement.
\end{itemize}

\begin{table}[h]
\centering
\caption{Statistics of $\cW(P_a,Q)$
 and transfer accuracy in *NIST experiments (\S \ref{sec:nist}).}
\begin{tabular}{|l|c|c|c|c|c|}
\hline
Test dataset & 
% min of $\mathcal{W}(P_a,Q)$ 
\begin{tabular}[c]{@{}c@{}} Min of \\ $\mathcal{W}(P_a,Q)$ \end{tabular}
& \begin{tabular}[c]{@{}c@{}} Median of \\
$\mathcal{W}(P_a,Q)$ \end{tabular} & \begin{tabular}[c]{@{}c@{}} Range of \\
$\mathcal{W}(P_a,Q)$ \end{tabular}  & \begin{tabular}[c]{@{}c@{}} Standard deviation \\
 of $\mathcal{W}(P_a,Q)$ \end{tabular} & \begin{tabular}[c]{@{}c@{}} Mean of accuracy \\
 improvement \end{tabular}  \\
\hline
EMNIST & 34.41 & 43.71 & 39.58 & 9.94 & 13.46 \\
MNIST & 39.13 & 49.04 & 44.17 & 11.35 & 20.94 \\
FMNIST & 44.19 & 54.75 & 39.11 & 10.64 & 10.62 \\
USPS & 42.04 & 48.32 & \textbf{23.49} & \textbf{6.13} & 5.28 \\
KMNIST & 47.65 & 53.92 & \textbf{24.83} & \textbf{6.19} & 10.88 \\
\hline
\end{tabular}
\label{tab:w_stat}
\end{table}



\subsection{Full results of VTAB experiments}
In Section \ref{sec:vtab}, we only showed the relative improvement of the test accuracy compared to non-pretraining. Here we will show the full test accuracy results. We keep the hyper-parameters consistent through all pre-training datasets. Table \ref{tab:vtab_full} clearly shows that the interpolation dataset with optimal weight assigned by our method can have a better performance than a na\"{\i}ve uniform weight. And with the same weight, our OTDD map will give a higher accuracy than Mixup because Mixup does not use the information from the reference dataset (see Figure \ref{fig:proj_2d}).

\paragraph{Poor sub-pooling performance} We show the sub-pooling baseline as a non-trivial method to combine datasets. However, it performs poorly, and we believe there are two main reasons for this. First, this baseline wastes relevant label data, by discarding the original labels of the pretraining dataset and replacing them with the imputed nearest-neighbor label from the target examples. Second, it only uses the neighbors of the pet dataset, leaving all other datapoints unused.

\begin{table}[H]
\caption{Test accuracy (mean $\pm$ std over 5 runs in percent) of 1000-shot learning on Oxford-IIIT Pet test dataset. 
% TL is short for transfer learning. 
Non-transfer learning skips the pre-training step.
}\label{tab:vtab_full}
\centering
{\renewcommand{\arraystretch}{1.1}%
\begin{tabular}{|cc|c|}
\hline
\multicolumn{1}{|c|}{\multirow{9}{*}{Transfer learning}} & OTDD map (optimal weight) 
  & \textbf{22.60 $\pm$ 1.01} \\ \cline{2-3} 
\multicolumn{1}{|c|}{}                    & OTDD map (uniform weight)           & 21.06 $\pm$ 0.45 \\ \cline{2-3} 
\multicolumn{1}{|c|}{}                    &   Mixup (optimal weight)      & 17.45  $\pm$ 2.2 \\ \cline{2-3} 
\multicolumn{1}{|c|}{}                    & Mixup (uniform weight)         & 15.4 $\pm$ 1.56 \\ \cline{2-3} 
\multicolumn{1}{|c|}{}                    & \textsc{Caltech101}  & 18.24 $\pm$ 3.42 \\ \cline{2-3} 
\multicolumn{1}{|c|}{}                    &  \textsc{DTD}   &11.46 $\pm$ 0.68 \\ \cline{2-3} 
\multicolumn{1}{|c|}{}                    & \textsc{Flowers102}    & 11.11 
 $\pm$ 1.92 \\  \cline{2-3} 
 \multicolumn{1}{|c|}{}      & \textsc{Pooling}    & 14.88 $\pm$
 0.57 \\  \cline{2-3} 
 \multicolumn{1}{|c|}{}       & \textsc{Sub-pooling}    & 14.88 $\pm$
 0.57 \\ \hline 
\multicolumn{2}{|c|}{Non-transfer learning}                           & 11.71 $\pm$ 1.65 \\ \hline
\end{tabular}
}
\end{table}



\bibliography{fan_236}

\end{document}
