\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:

%\usepackage{natbib} % has a nice set of citation styles and commands
%    \bibliographystyle{plainnat}
%    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


%%%%
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathrsfs}
\usepackage{graphicx}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsthm}
\usepackage{subcaption}
\usepackage{xcolor}
\usepackage[toc,page]{appendix}
\usepackage{enumitem}
\usepackage{nameref}
\usepackage{zref-xr,zref-user}


\usepackage{biblatex}
\addbibresource{turrisi_445.bib} %Import the bibliography file

\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{example}{Example}
\newtheorem{remark}{Remark}

\newcommand{\rf}[1]{{\color{blue} #1}}
\newcommand{\ros}[1]{{\color{red} #1}}
\newcommand{\rosanna}[1]{{\color{green} #1}}
\newcommand{\ar}[1]{{\color{orange} #1}}
\newcommand{\mass}[1]{{\color{brown} #1}}

\usepackage[textwidth=2.0cm, textsize=tiny]{todonotes} % for writting
\newcommand{\rt}[2][noinline]{\todo[color=red!20,#1]{{\bf Ros:} #2}}
\newcommand{\remi}[2][noinline]{\todo[color=blue!20,#1]{{\bf Remi:} #2}}
\newcommand{\gino}[2][noinline]{\todo[color=yellow!20,#1]{{\bf Gino:} #2}}
\newcommand{\mas}[2][noinline]{\todo[color=brown!20,#1]{{\bf Mas:} #2}}
\newcommand{\alain}[2][noinline]{\todo[color=orange!20,#1]{{\bf Alain:} #2}}
\newcommand{\leo}[2][noinline]{\todo[color=black!20,#1]{{\bf Leo:} #2}}

%\usepackage{caption} 
%\captionsetup[table]{skip=10pt}
\newcommand{\red}[1]{\textcolor{blue}{#1}}
\hypersetup{
     colorlinks = true,
     linkcolor = blue,
     anchorcolor = blue,
     citecolor = blue,
     filecolor = blue,
     urlcolor = blue
     }

\begin{document}
\onecolumn

\begin{center}
{\bf \huge Supplementary Material}
\end{center}

% for Multi-source Domain Adaptation via Weighted Joint Distributions Optimal Transport}


\setcounter{section}{0}
\setcounter{theorem}{0}
\setcounter{definition}{0}
\setcounter{lemma}{0}
\setcounter{table}{2}
%\setcounter{theorem}{0}

\vspace{.3truecm}
\appendix
{The supplementary material is organized as follows. In Section~\ref{app:A}
we provide the proofs of {\bf Lemma 1}, {\bf Lemma 2} and {\bf Theorem 1}. For the reader's convenience, the results are restated in this supplementary material. Section~\ref{app:B} recalls the MSDA-WJDOT algorithm and defines the projection onto the simplex implemented in the algorithm. Finally, in Section~\ref{app:C} we present additional numerical experiments.}

\section{Proofs}
\label{app:A}


\subsection{Proof of Lemma 1}

\begin{lemma}\label{lemma1}
For any hypothesis $f \in \mathcal{H}$, denote by $\varepsilon_{p_T}(f)$ and $\varepsilon_{p_S^{\boldsymbol{\alpha}}}(f)$ the expected losses of $f$ on the \textit{target} domain and on the weighted sum of the \textit{source} domains, respectively, with respect to a loss function $L$ bounded by $B$. We have
\begin{equation}\label{eq:bound_tv}
    \varepsilon_{p_T}(f) \leq \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f) +B \cdot D_{TV}\left(p_S^{\boldsymbol{\alpha}},p_T\right)
\end{equation}
where $p_S^{\boldsymbol{\alpha}}=\sum_{j=1}^J \alpha_j p_{S,j}$ is a convex combination of the \textit{source} distributions with weights $\boldsymbol{\alpha}\in\Delta^J$, and $D_{TV}$ is the total variation distance.
\end{lemma}
\begin{proof} We define the error of a hypothesis $f$ with respect to a loss function $L(\cdot,\cdot)$ and a joint probability distribution $p(x,y)$ as
$$
\varepsilon_{p}(f) = \int p(x,y) L(y,f(x)) dxdy
$$
then using simple arguments, we have
\begin{align}
    \varepsilon_{p_T}(f) &= \varepsilon_{p_T}(f) + \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f) - \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f)   \\ \nonumber
    &\leq\varepsilon_{p_S^{\boldsymbol{\alpha}}}(f)  + | \varepsilon_{p_T}(f) - \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f)| \\\nonumber
    &\leq \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f) + \int | p_S^{\boldsymbol{\alpha}}(x,y) - p_T(x,y)|\,|L(y,f(x))| dxdy \\\nonumber
    & \leq \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f) + B \int \big | p_S^{\boldsymbol{\alpha}}(x,y) - p_T(x,y) \big | dxdy
\end{align}
and using the definition of the total variation distance between distributions, we conclude the proof.
\end{proof}


\subsection{Proof of Theorem 1}
The proof of this theorem follows the same steps as the one proposed by~\cite{Courty2017}, and we reproduce it here for the sake of completeness.

%\mass{[The following definition seems to involve only marginal distributions. I am correct? Also the function $\phi$ should satisfy further properties (decreasing?) otherwise choose $\phi = 1$ and the bound trivially hold]}
\begin{definition}[{Probabilistic Transfer Lipschitzness -- PTL Property}] Let $p_S$ and $p_T$ be respectively the \textit{source} and \textit{target}  distributions. Let $\phi : \mathbb{R} \rightarrow [0,1]$. A labeling function $f : \mathcal{G} \rightarrow \mathbb{R}$ and a joint distribution $\pi\in\Pi(p_S,p_T)$ over $p_S$ and $p_T$ are $\phi$-Lipschitz transferable if for all $\lambda > 0$, we have
$$
{\rm Prob}_{(x_S,x_T)\sim \pi}\big[ |f(x_S) - f(x_T)| > \lambda D(x_S,x_T) \big] \le \phi(\lambda)
$$
with $D$ being a metric on $\mathcal{G}$.
\end{definition}
%As stated in \citep{Courty2017}, given a function $f$ and coupling
%$\pi$, the PTL 
This property provides a bound on the probability of finding a couple of source-target examples that are differently labeled in a $(1/\lambda)$-ball with respect to $\pi$ and the metric $D$.


\begin{definition}[Similarity measure]
Let $\mathcal{H}$ be a space of $M$-Lipschitz labelling functions. Assume 
%also that the input space is so 
that, for every $f \in \mathcal{H}$ and $x,x' \in {\cal G}$, $|f(x) - f(x^\prime)| \leq M$.
The similarity between $p_{S}^{\boldsymbol{\alpha}}$ and $p_T$ can be defined~\cite[Def.~5]{ben2010impossibility} as
\begin{equation}
\Lambda(p_S^{\boldsymbol{\alpha}},p_T)=\min_{f\in\mathcal{H}} \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f) + \varepsilon_{p_T}(f),\label{eq:Lambda_supp}
\end{equation}
% $$%\texstyle
% \Lambda(p^{\boldsymbol{\alpha}},p^T)=\min_{f\in\mathcal{H}} \varepsilon_{p^{\boldsymbol{\alpha}}}(f) + \varepsilon_{p^T}(f)
% $$ 
where the risk is measured w.r.t.\ a symmetric and $k$-Lipschitz loss function that satisfies the triangle inequality.
\label{defLambda_supp}
\end{definition}


\begin{lemma} 
Let $\mathcal{H}$ be the space described in Definition \ref{defLambda_supp} and assume that the function $f^*$ minimizing the Similarity measure in Eq. \ref{eq:Lambda_supp} satisfies the PTL property. Then, for any $f \in \mathcal{H}$, we have
%Let $f$ be any labelling function belonging to the continuous hypothesis space $\mathcal{H}$ and suppose that  $\forall f \in \mathcal{H}$  $f$ is a $M$-Lipschitz function. 
%We define $\Lambda(p^{\boldsymbol{\alpha}},p^T)=\min_{f\in\mathcal{H}} \varepsilon_{p^{\boldsymbol{\alpha}}}(f) + \varepsilon_{p^T}(f)$ a measure of similarity between the two distribution introduced in \cite[Def. 5]{ben2010impossibility}. We suppose that the optimal solution $f^\star$ in $\Lambda(p^{\boldsymbol{\alpha}},p^T)$  satisfies the Probabilistic Transfer Lipschitzness (PTL) property \cite{Courty2017} and recall that $p^{\boldsymbol{\alpha}}=\sum_s \alpha_s p_s$. If we also suppose that the loss function $L$ satisfies the triangle inequality and is $k$-Lipschitz then
% \begin{equation}\label{eq:gen_bound}
% \varepsilon_{p^T}(f) \leq W_D\left(\sum_s \alpha_s p_s, p^f\right)
% + \ W_D( p^f,p^T_\dagger) + \ W_D( p^T_\dagger, p^T) + 
% \varepsilon_{p^{\boldsymbol{\alpha}}}(f^\star) + \varepsilon_{p^T}(f^\star) + 2kM \phi(\lambda)
% \end{equation}
% where $p^T_\dagger$ is the projection (in the Wasserstein sense) of the true \textit{target}  joint distribution on the convex envelope defined by the \textit{source} distributions \mas{Do you mean on the convex hull of the \textit{source} distributions? Anyway I much prefer the second bound below}.  (\rf{RF: Or i prefer this one without the weird pr"projection"})
\begin{equation}\label{eq:gen_bound}
\varepsilon_{p_T}(f) \leq W_D\left(p_S^{\boldsymbol{\alpha}}, p_T^f\right)
+\Lambda(p_S^{\boldsymbol{\alpha}},p_T)
+ kM \phi(\lambda),
\end{equation}
where $ \phi(\lambda)$ is a constant depending on the PTL of $f^\star$.
\end{lemma}
\begin{proof}
We have that 
$$
\begin{aligned}
    \varepsilon_{p_T}(f) & \equiv \mathbb{E}_{(x,y)\sim p_T} \big[L(y,f(x))\big] \\
 &\leq \mathbb{E}_{(x,y)\sim p_T} \big[L(y,f^\star(x)) + L(f^\star(x),f(x))\big] \\
 &= \varepsilon_{p_T}(f^\star)+ \mathbb{E}_{(x,y)\sim p_T} \big[L(f^\star(x),f(x))\big]\\
 &= \varepsilon_{p_T}(f^\star)+ \mathbb{E}_{(x,y)\sim p_{T}^f} \big[L(f^\star(x),f(x))\big] \\
 & = \varepsilon_{p_T}(f^\star) + \varepsilon_{p_T^f}(f^\star) + \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f^\star)  
 - \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f^\star)  \\
 & \leq |\varepsilon_{p_T^f}(f^\star)  - \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f^\star) |  + \varepsilon_{p_S^{\boldsymbol{\alpha}}}(f^\star)  
 +\varepsilon_{p_T}(f^\star)
\end{aligned}
$$
where the second equality comes from the symmetry of the loss function and the third one is due to the fact that  $ \mathbb{E}_{(x,y)\sim p_T} L(f^\star(x),f(x)) =\mathbb{E}_{(x,y)\sim p_T^f} L(f^\star(x),f(x))=\mathbb{E}_{x\sim \mu_T} L(f^\star(x),f(x))$ since the label $y$ is not used in the expectation.
%.https://www.overleaf.com/project/5e171532c504fe000157d8ba

Now,  we analyze the first term in the {\rm r.h.s.} of the last inequality. Note  that samples drawn from  $p_T^f$ distribution can be expressed as  $(x_T,y_T^f)\sim p_T^f$ with $y_T^f=f(x_T)$.
%\mass{Below we start with $\pi^\star$ then we drop the $\star$. A typo?}
\begin{align}
|\varepsilon_{p_T^f}(f^\star)  {-}\varepsilon_{p_S^{\boldsymbol{\alpha}}}(f^\star)| &
=  \left|\int_{\mathcal{G} \times \mathbb{R}} L(y,f^\star(x)) (p_T^f(x,y) - p_S^{\boldsymbol{\alpha}}(x,y)) dxdy           \right| %\ros{p_{T}^{f} \text{ instead of } p_{T}?} 
\nonumber \\
 &= \left |\int_{\mathcal{G} \times \mathbb{R}} L(y,f^\star(x)) d(p_T^f - p_S^{\boldsymbol{\alpha}}) \right| %\ros{p_{T}^{f} \text{ instead of } p_{T}?}
 \nonumber \\
 & \leq \int_{(\mathcal{G} \times \mathbb{R})^2}\Big| L(y_T^f, f^\star(x_T)) - L(y_{\boldsymbol{\alpha}}, f^\star(x_{\boldsymbol{\alpha}}))\Big| 
d\pi^\star((x_{\boldsymbol{\alpha}},y_{\boldsymbol{\alpha}}),(x_T,y_T^f)) \label{eq:dualkanto} %\ros{f^{*} \text{ instead of } f?}
\\
%& = \int_{(\mathcal{G} \times \mathbb{R})^2} \Big[ L(y_T^f, f^\star(x_T)) - L(y_T^f, f^\star(x_{\boldsymbol{\alpha}}))%\ros{f^{*} \text{ instead of } f?} 
%\nonumber\\
%%
%&  \quad\quad +  L(y_T^f, f^\star(x_{\boldsymbol{\alpha}})) - L(y_{\boldsymbol{\alpha}}, f^\star(x_{\boldsymbol{\alpha}})) \Big] d\pi^\star((x_{\boldsymbol{\alpha}},y_{\boldsymbol{\alpha}}),(x_T,y_T^f)) \nonumber\\
%
& \leq  \int_{(\mathcal{G} \times \mathbb{R})^2} \Bigg[\Big| L(y_T^f, f^\star(x_T)) {-} L(y_T^f, f^\star(x_{\boldsymbol{\alpha}}))\Big|%\ros{f^{*} \text{ instead of } f?} 
\nonumber\\
&  \quad\quad + \Big| L(y_T^f, f^\star(x_{\boldsymbol{\alpha}})) {-} L(y_{\boldsymbol{\alpha}}, f^\star(x_{\boldsymbol{\alpha}})) \Big|\Bigg] d\pi^\star((x_{\boldsymbol{\alpha}},y_{\boldsymbol{\alpha}}),(x_T,y_T^f)) \nonumber\\
%
& \leq  \int_{(\mathcal{G} \times \mathbb{R})^2} \Bigg[  k \big|  f^\star(x_T) - f^\star(x_{\boldsymbol{\alpha}}) \big|  + \Big| L(y_T^f, f^\star(x_{\boldsymbol{\alpha}})) {-} L(y_{\boldsymbol{\alpha}}, f^\star(x_{\boldsymbol{\alpha}})) \Big|\Bigg] d\pi^\star((x_{\boldsymbol{\alpha}},y_{\boldsymbol{\alpha}}),(x_T,y_T^f)) \label{eq:theoklip}\\
%
& \leq    k M \phi(\lambda)  +\hspace{-.1truecm} \int_{(\mathcal{G} \times \mathbb{R})^2} \hspace{-.1truecm} \Bigg[ k\lambda D(x_T,x_{\boldsymbol{\alpha}}) {+}  \Big| L(y_T^f, f^\star(x_{\boldsymbol{\alpha}})) {-} L(y_{\boldsymbol{\alpha}}, f^\star(x_{\boldsymbol{\alpha}})) \Big|\Bigg] d\pi^\star((x_{\boldsymbol{\alpha}},y_{\boldsymbol{\alpha}}),(x_T,y_T^f)) \label{eq:theoptl}\\
%
& \leq    k M \phi(\lambda)  + \int_{(\mathcal{G} \times \mathbb{R})^2} \Bigg[\beta D(x_T,x_{\boldsymbol{\alpha}}) +   \Big| L(y_T^f, y_{\boldsymbol{\alpha}}) \Big|\Bigg] d\pi^\star((x_{\boldsymbol{\alpha}},y_{\boldsymbol{\alpha}}),(x_T,y_T^f)) \label{eq:theosymm}\\
%
&=   k M \phi(\lambda) + W_D(p_S^{\boldsymbol{\alpha}}, p_T^f).
\end{align}
The inequality in line~\eqref{eq:dualkanto} is due to the Kantorovich--Rubinstein theorem, stating that for any coupling $\pi \in \Pi(p_{S}^{\boldsymbol{\alpha}},p_T^f)$ the following inequality holds
$$
  \left |\int_{\mathcal{G} \times \mathbb{R}} L(y,f^\star(x)) d(p_T^f - p_S^{\boldsymbol{\alpha}}) \right| 
\leq   \left |\int_{(\mathcal{G} \times \mathbb{R})^2}| L(y_T^f, f^\star(x_T)) - L(y_{\boldsymbol{\alpha}}, f^\star(x_{\boldsymbol{\alpha}}))| 
d\pi((x_{\boldsymbol{\alpha}},y_{\boldsymbol{\alpha}}),(x_T,y_T^f))   \right |,
$$
followed by an application of the triangle inequality. Since the above inequality applies for
any coupling, it applies also for $\pi^\star$. Inequality~\eqref{eq:theoklip} is due to the assumption that the loss function is $k$-Lipschitz in its second argument. Inequality~\eqref{eq:theoptl} derives from the PTL property with probability $1-\phi(\lambda)$ of $f^\star$ and $\pi^\star$. In addition, taking into account that the difference between two samples with respect to $f^\star$ is bounded by $M$, we have the term $kM\phi(\lambda)$ that covers the regions where the PTL assumption does not hold. Inequality~\eqref{eq:theosymm} is obtained from the symmetry of $D(\cdot,\cdot)$, the triangle inequality on the loss and by posing $k\lambda=\beta$.
\end{proof}


{
First we need to prove the following Lemma.
\begin{lemma} \label{lem:wass} For any distributions $\hat p_{S,j} , p_{S,j}$ and $\boldsymbol{\alpha}\in\Delta^J$ in the simplex we have
\[
W_D\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \sum_{j=1}^J \alpha_j  p_{S,j}\right)\leq \sum_{j=1}^J \alpha_j W_D\left( \hat p_{S,j}, p_{S,j}\right).
\]
\end{lemma}

\begin{proof}
First we recall that the Wasserstein distance between two distributions is
\begin{equation}
    W_D(p,p')=\min_{\pi\in\Pi(p,p')} \int D(\mathbf{v},\mathbf{v}')\pi(\mathbf{v},\mathbf{v}')d\mathbf{v}d\mathbf{v}',
\end{equation}
where $\Pi(p,p')=\{\pi|\int \pi(\mathbf{v},\mathbf{v}')d\mathbf{v}'=p(\mathbf{v}), \int \pi(\mathbf{v},\mathbf{v}')d\mathbf{v}=p'(\mathbf{v}')\}$. Let $\pi_{S,j}^*$ be the optimal OT matrix between $\hat p_{S,j}$ and  $p_{S,j}$. It is easy to see that $\sum_{j=1}^J \alpha_j \pi_{S,j}^*$ respects the marginal constraints for $W_D\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \sum_{j=1}^J \alpha_j  p_{S,j}\right)$, i.e.,
$\sum_{j=1}^J \alpha_j \pi_{S,j}^* \in \Pi\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \sum_{j=1}^J \alpha_j  p_{S,j}\right)$. Hence, $\sum_{j=1}^J \alpha_j \pi_{S,j}^*$ is a feasible solution for the OT problem and, consequently, the cost of this feasible solution is greater than or equal to the optimal value  $W_D\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \sum_{j=1}^J \alpha_j  p_{S,j}\right)$. Since, by linearity of the integral, $ \int D(\mathbf{v},\mathbf{v}')\sum_{j=1}^J\alpha_j \pi_{S,j}^*(\mathbf{v},\mathbf{v}')d\mathbf{v}d\mathbf{v}'=\sum_{j=1}^J \alpha_j W_D\left( \hat p_{S,j}, p_{S,j}\right)$, we recover the Lemma above.
\end{proof}

We can now prove {\bf Theorem 1}, which we also restate for the convenience of the reader.
\begin{theorem}\label{thm:gen2}
%\ros{We already used $\delta$ for the Dirac function!} \rf{i think it is ok it is clear from context that this delta is a real}
Under the assumptions of Lemma 2, let $\hat p_{S,j}$ be the $j$-th source empirical distribution with $N_j$ samples and $\hat p_T$ the empirical target distribution with $N_T$ samples. Then, for all $\lambda>0$, with $\beta=\lambda k$ in the ground metric $D$, we have with probability $1-\eta$
%\rt{I think we should remind who's $\beta$}
\begin{equation}\label{eq:gen2}\scriptstyle
\begin{split}
    \varepsilon_{p_T}(f) \leq &W_D\left(\hat p_S^{\boldsymbol{\alpha}}, \hat p_T^f\right)+\sqrt{\frac{2}{c'}\log \frac{2}{\eta}}\left(\frac{1}{N_T}+\sum_{j=1}^J\frac{\alpha_j}{ N_j}\right)+ \Lambda(p_S^{\boldsymbol{\alpha}},p_T)%\ W_D( p^f,p^T_\dagger) + \ W_D( p^T_\dagger, p^T) + 
 + kM \phi(\lambda).
 \end{split}
\end{equation}
\end{theorem}
\begin{proof}
%Let $\pi_s$ be the OT matrix solution of $W_D(\hat p_s, p_s)$ satisfying the marginal constraints $\pi_s\in \Pi(\hat p_s, p_s)$. It is clear because of the linearity of the marginal constraints that $\sum_s  \alpha_s \pi_s \in \Pi(\sum_s  \alpha_s \hat p_s,\sum_s  \alpha_s p_s)$ which means that $\sum_s  \alpha_s \pi_s$ is a feasible point of the optimization problem of $W_D\left(\sum_s \alpha_s \hat p_s, \sum_s \alpha_s  p_s\right)$. Since the objective function is also linear it means that the OT loss for $\sum_s \pi_s$ is equal to $\sum_s \alpha_s W_D\left( \hat p_s, p_s\right)$ and will be greater or equal to $W_D\left(\sum_s \alpha_s \hat p_s, \sum_s \alpha_s  p_s\right)$.
%

%In order to prove Theorem \ref{thm:gen2} first we show that 
By the triangle inequality we have that
\begin{align*}
W_D\left(\sum_{j=1}^J \alpha_j p_{S,j}, p_T^f\right)&\leq W_D\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \hat p_T^f\right)+W_D(\hat p_T^f, p_T^f)+ W_D\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \sum_{j=1}^J \alpha_j  p_{S,j}\right) \\
&\leq W_D\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \hat p_T^f\right)+W_D(\hat p_T^f, p_T^f)+ \sum_{j=1}^J \alpha_j W_D\left( \hat p_{S,j}, p_{S,j}\right)
% &\leq W_D\left(\sum_s \alpha_s \hat p_s, \hat p^f\right)+\sqrt{\frac{2}{c'}\log\left(\frac{2}{\delta}\right)}\left(\frac{1}{N}+\sum_s\frac{\alpha_s}{ N_s}\right)\\
%&\leq \sum_s \alpha_s \left( W_D\left(\hat p_s, \hat p^f\right)+W_D(\hat p_s, p_s)+W_D(\hat p^f, p^f)\right)\\
%&\leq W_D(\hat p^f, p^f)+ \sum_s \alpha_s \left( W_D\left(\hat p_s, \hat p^f\right)+W_D(\hat p_s, p_s))\right)
\end{align*}
where the last inequality follows from Lemma \ref{lem:wass}.
Using the well known convergence property of the Wasserstein distance proven in \cite{bolley2007quantitative} we find the following bound with probability $1-\eta$ 
\begin{equation}\small
\varepsilon_{p_T}(f) \leq W_D\left(\sum_{j=1}^J \alpha_j \hat p_{S,j}, \hat p_T^f\right)+\sqrt{\frac{2}{c'}\log\left(\frac{2}{\eta}\right)}\left(\frac{1}{N_{T}}+\sum_{j=1}^J\frac{\alpha_j}{ N_j}\right)
+ %\ W_D( p^f,p^T_\dagger) + \ W_D( p^T_\dagger, p^T) + 
\Lambda(p_S^{\boldsymbol{\alpha}},p_T) + kM \phi(\lambda)
\end{equation}
with $c'$ corresponding to all \textit{source} and \textit{target}  distributions under similar conditions as in \cite{Courty2017}.
\end{proof}

}



\section{The algorithm}\label{app:B}
We recall here the algorithm we proposed to solve the MSDA-WJDOT problem (Algorithm \ref{alg:wjdot}). $P_{\Delta ^{J}}$ is the projection to the simplex $\Delta ^{J} = \{\pmb\alpha\in\mathbb{R}^{J}| \sum _{j=1}^{J}\alpha _{j}=1, \alpha _{j}\geq 0\}$ defined as 
\begin{equation}
    P_{\Delta ^{J}}(\pmb w) =\operatorname*{argmin}_{\pmb\alpha\in\Delta^{J}} \Vert \pmb{w} - \pmb\alpha \Vert.
\end{equation}

We implemented it using Algorithm~\ref{alg:projection}, first proposed in~\cite{held1974}.

\begin{algorithm}[h]
\caption{Optimization for MSDA-WJDOT\label{alg:wjdot}}
\begin{algorithmic}
\STATE Initialise $\pmb{\alpha}=\frac{1}{J}\mathbf{1}_J$ and $\pmb{\theta}$ parameters of $f_{\pmb{\theta}}$ and steps $\mu_{\pmb{\alpha}}$ and $\mu_{\pmb{\theta}}$.
\REPEAT %\STATE{<text>} 
\STATE $\pmb{\theta}\leftarrow \pmb{\theta}-\mu_{\pmb{\theta}}\nabla_{\pmb{\theta}} W_D\Big(\hat p_T^{f}, \sum _{j=1}^{J} \alpha _{j} \hat p_{S,j}\Big)$
\STATE $\pmb{\alpha}\leftarrow P_{\Delta^J}\Big(\pmb{\alpha}-\mu_{\pmb{\alpha}}\nabla_{\pmb{\alpha}} W_D(\hat p_T^{f}, \sum _{j=1}^{J} \alpha _{j}\hat p_{S,j}) \Big)$
\UNTIL{Convergence}
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[h]
\caption{Projection to the simplex \cite{held1974}\label{alg:projection}}
\begin{algorithmic}
\STATE Sort $\pmb{w}$ into $\pmb{u}$: $u_{1} \geq \cdots \geq u_{J}$.
\STATE Set $K:=\max_{1\leq k \leq J} \{k \mid (\sum _{j=1}^{k} u_{j}-1)/k < u_{k}\}.$
\STATE Set $\tau := (\sum _{j=1}^{K} u_{j} -1)/K.$
\STATE For $j=1,\ldots, J$ set $\alpha _{j}:=\max\{w_{j}-\tau, 0\}.$
\end{algorithmic}
\end{algorithm}


\section{Numerical experiments}\label{app:C}

\subsection{Simulated data}
\paragraph{Domain shift}
We generate a data set $(X_{0}, Y_{0})$ 
by drawing $X_{0}$ from a 3-dimensional Gaussian distribution with 3 cluster centers and standard deviation $\sigma=0.8$. 
We keep the same number of examples for each cluster. To simulate the $J$ \textit{sources}, we apply $J$ rotations to the 
input data $X_{0}$ around the $x$-axis. More precisely, we draw $J$ equispaced angles $\theta _{j}$ from $[0, \frac{3}{2}\pi]$
and we get $X_{j}=\{\textbf{x}_{j}^{i}\}$ as
\begin{equation}
{\textbf{x}_{j}^{i}}^{\top} = {\textbf{x}_{0}^{i}}^{\top} \cdot 
\begin{bmatrix}
1 & 0 & 0\\
0 & \cos(\theta _{j}) & -\sin(\theta _{j}) \\
0 & \sin(\theta _{j}) & \cos(\theta _{j})
\end{bmatrix}.
\end{equation}
To generate the \textit{target} domain $X_{T}$, we follow the same procedure by randomly choosing an angle $\theta _{T}\in[0, \frac{3}{2}\pi]$.  
We keep the label set fixed, i.e. $Y_{j}=Y_{T}=Y_{0}$. Note that in this case the embedding function $g$ is the identity function and, hence, $\mathcal{X}\equiv \mathcal{G}$.
%\ros{
In the following, we report all the experiments we carried out on the simulated data, in which we also investigate replacing the exact Wasserstein distance by the Bures--Wasserstein distance
 \begin{equation}
     BW(\mu_S,\mu_T)^2=\|\mathbf{m}_S-\mathbf{m}_T\|^2+ \text{Trace}\left(\Sigma_S+\Sigma_T-2\left(\Sigma_S^{1/2}\Sigma_T\Sigma_S^{1/2}\right)^{1/2}\right),
 \end{equation}
 where $\mathbf{m}_S$ and $\Sigma_S$ are respectively the first and second order moments of the distribution $\mu_S$ (and similarly for $\mathbf{m}_T,\Sigma_T$). The BW distance has the advantage of a complexity that is linear in the number of samples, so it scales better to large datasets.
 We label this method variant with $(B)$, while we refer to the exact OT as $(E)$.\\ 
 
In the following, we investigate the performance of MSDA-WJDOT when varying the number of \textit{sources} $J$, \textit{source} samples $N_{j}$, and \textit{target} samples $N_{T}$. We compare the proposed approach with other MSDA methods and with the \texttt{Baseline}, \texttt{Target}, \texttt{Bayes} classification.
\begin{itemize} 
\item \textit{Varying the number of \textit{sources}:} we keep the number of samples fixed in both \textit{sources} and \textit{target} datasets (s.t. $N_{j}=N_{T}$ $\forall j$) and we vary the number of \textit{sources} $J\in \{3, 5, 10, 20, 25, 30\}$. In Fig. \ref{fig:simu_exp1} we report the accuracy of the different methods.

\begin{figure}[h!]
\vspace{1cm}
\centering\includegraphics[width=.99\linewidth]{Figures/Exp1_boxplot_allmethods.pdf}
\caption{Methods' accuracy for varying the number of \textit{sources} $J$.}
\label{fig:simu_exp1}
\vspace{2cm}
\centering\includegraphics[width=.99\linewidth]{Figures/Exp5_boxplot_allmethods.pdf}
\caption{Methods' accuracy for varying the number of \textit{source} samples.}
\label{fig:simu_exp5}
\vspace{1cm}

\end{figure}

\begin{figure}[h!]

\centering\includegraphics[width=.99\linewidth]{Figures/Exp2_boxplot_allmethods.pdf}
\caption{Methods' accuracy for varying the number of \textit{target} samples}
\label{fig:simu_exp2}
\vspace{1cm}
\centering\includegraphics[width=.99\linewidth]{Figures/Exp4_boxplot_allmethods.pdf}
\caption{Methods' accuracy for varying the number of \textit{source} and \textit{target} samples.}
\label{fig:simu_exp4}
\vspace{1cm}
\end{figure}


\begin{figure}[!h]
\centering\includegraphics[scale=.35]{Figures/Exp4alphas_N60.pdf}
\caption{Recovered $\pmb{\alpha}$ with small sample size ($N_{j}=N_{T}=60$).}
\label{alpha_60examples}
\vspace{1cm}
\centering\includegraphics[scale=0.35]{Figures/Exp5alphas_N300.pdf}
\caption{Recovered $\pmb{\alpha}$ for $N_{j}=N_{T}=300$.}
\label{fig:simu_exp5_alpha}
\end{figure}


\item \textit{Varying the number of \textit{source} samples:}
we fix the number of \textit{sources} $J$ equal to 20 and the number of \textit{target} samples $N_{T}$ to 300. Figs.~\ref{fig:simu_exp5} and~\ref{fig:simu_exp5_alpha} show the methods' accuracy for varying the number of \textit{source} samples $N_{j}$ in $\{60, 180, 300\}$ and the recovered $\pmb{\alpha}$ weight for $N_{j}=300$, respectively.

\item \textit{Varying the number of the \textit{target}  samples:}
we fix $J=20$ and $N_{j}=300$, with $1 \leq j \leq J$. We vary the number of \textit{target}  samples $N_{T}$ in $\{60, 180, 300\}$ (Fig.~\ref{fig:simu_exp2}).



\item \textit{Varying the number of samples of all domains}:
we fix the number of \textit{sources} equal to 20. We vary the number of \textit{source} and \textit{target}  samples in $\{60, 180, 300\}$, keeping $N_{j}=N_{T}$ with $1 \leq j \leq J$.
We report the methods' accuracy in Fig.~\ref{fig:simu_exp4}.


\end{itemize}

In all experiments \texttt{MSDA-WJDOT} significantly outperforms \texttt{CJDOT}, \texttt{MJDOT}, \texttt{IWERM} and the \texttt{Baseline}.  Both \texttt{MSDA-WJDOT(E)} and \texttt{MSDA-WJDOT(B)} provide a better or at least comparable performance w.r.t.\ the \texttt{Target} method, in which the labels of the \textit{target} dataset are used. In Figs.~\ref{alpha_60examples} and~\ref{fig:simu_exp5_alpha} we show the recovered weights $\pmb\alpha$ for $N_{j}=N_{T}=60$ and $N_{j}=N_{T}=300$, respectively. In both cases, the $x$-axis reports different random \textit{target} angles in the $[0, \frac{3}{2}\pi]$ interval (ordered by increasing angles), whereas the $y$-axis represents the \textit{source} angles ordered such that $\theta _{j}\leq \theta _{j+1}$, $1\leq j \leq J-1$. As we can see, the weights are higher along the diagonal, meaning that \texttt{MSDA-WJDOT} always rewards the \textit{sources} with angle closest to $\theta _{T}$.

%\begin{figure}
%    \centering
%    ~\hfill
%    \includegraphics[width=7cm]{./Figures/lambda_bound5source.png }
%    \includegraphics[width=7cm]{./Figures/lambda_bound30source.png}\hfill~
%    \caption{Examples of upper-bound of $\Lambda$ when the function $f$ is the function learned by our approach (instead of the one minimizing $\Lambda$ in \eqref{eq:lambda}). The blue curve represents the $\alpha$-weighted source error for $10000$ random $\alpha$ (the $x$-axis), the orange one corresponds to the source error for the learned $\alpha$ and green line represents the target error. We can see that for both $5$ (Left) and $30$ sources (Right) the learned $alpha$ leads to lower source error even though $\alpha$ has been optimized for aligning joint distributions. }
%    \label{fig:lambda}
%\end{figure}


\begin{figure}
    \centering
    ~\hfill
    \includegraphics[width=7cm]{./Figures/hist_lambda_bound5source_2.png }
    \includegraphics[width=7cm]{./Figures/hist_lambda_bound30source_2.png}\hfill~
    \caption{Examples of source error and target error when the function $f$ is the function learned by our approach (instead of the one minimizing $\Lambda$ in \eqref{eq:Lambda_supp}). The blue curve represents a histogram of the $\alpha$-weighted source error for $10000$ random $\alpha$.
     The $x$-axis represents the value of the error and the  $y$-axis the count.  
    The green line corresponds to the source error for the learned $\alpha$, the red one gives the error for a uniform $\alpha$ and the black one represents the target error (the height of the lines has been arbitrarily set for the sake of clarity).
    We can see that for both $5$ (Left) and $30$ sources (Right) the learned $\alpha$ leads to lower source error even though $\alpha$ has been optimized for aligning joint distributions. }
    \label{fig:lambda}
\end{figure}



\subsection{Real data}
In this section, we introduce a new strategy for the validation, as an alternative to the one based on SSE proposed in Sec. 3.2. We propose to employ the accuracy of the learned classifier $f$ on the \textit{source} datasets, weighted by $\pmb{\alpha}$, i.e.
\begin{equation}\label{eq:acc}
\sum _{j=1}^{J} \alpha _{j} ACC_{S,j}(f),
\end{equation}
with $ACC_{S,j}(f) = \frac{\# \{f(x_{j}^{i}) =y_{j}^{i}\}}{N_{j}}$.
To refer to this approach, we denote as \texttt{MSDA-WJDOT}$^{acc}$, \texttt{CJDOT}$^{acc}$, \texttt{MJDOT}$^{acc}$ the MSDA-WJDOT and the two JDOT extensions respectively. Let us remark that \texttt{MSDA-WJDOT}$^{acc}$ is a way to reuse the weights $\pmb{\alpha}$ that provide the closest \textit{source} distributions which, hence, are supposed to give a better estimate of the performance of the current classifier.

\paragraph{Object recognition}

%\ros{
In Table \ref{tab:alpha_CO} we report the \textit{source} weights provided by MSDA-WJDOT. In all cases, $\pmb{\alpha}$ is a one-hot vector suggesting that only one \textit{source} is meaningfully related to the \textit{target} domain. This is in line with the results on single-source DA found in \cite{Courty2015} in which the \textit{source} domain providing the highest accuracy corresponds to the one selected by \texttt{MSDA-WJDOT}.


%\begin{table}[!ht]
%%\begin{center}\small
%\begin{tabular}{|c||c|c|c|c||c|}  
%\hline
%\textbf{Target} & \textit{Amazon} & \textit{dslr} & \textit{webcam} & %\textit{Caltech10} & \textit{WJDOT} \\
%\hline

% \textbf{Amazon}& - & 92.92 & 91.98 & 92.15 & 94.23 \\
% \hline
% \textbf{dslr} & 85.25 & - & 91.38 & 85.38 &  100.00 \\
%\hline
% \textbf{webcam} & 84.50 & 94.17 & - & 83.84 & 83.84\\
% \hline
% \textbf{Caltech10} & 87.16 & 84.93 & 83.71 &- & 85.93\\
%\hline
%\end{tabular}
%\end{center}
%%\caption{Accuracy reported in \cite{Courty2015} for SDA based on %\textbf{OT-GL} and WJDOT accuracy}
%\label{tab:SDAvsMDA}
%\end{table}


\begin{table}[!ht]
\begin{center}\small
\begin{tabular}{c||c|c|c|c}  
\hline
\textbf{Target} & \textit{Amazon} & \textit{dslr} & \textit{webcam} & \textit{Caltech10}\\
\hline
 \textbf{Amazon}& - & 0 & 0 & 1 \\
 \hline
 \textbf{dslr} & 0 &-&1&0\\
 \hline
\textbf{webcam} & 0 &1&-&0\\
\hline
 \textbf{Caltech10} & 1&0&0&-\\
\hline
\end{tabular}
\end{center}
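\caption{Source weights $\pmb{\alpha}$ provided by \texttt{MSDA-WJDOT} on the Caltech-Office dataset. Rows correspond to the \textit{target} domain, columns to the \textit{source} domains.}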
\label{tab:alpha_CO}
\end{table}
%}
%\rt{maybe it's useless this table} 

%\ros{
 Table~\ref{tab:CaltechOffice2} is a full version of Table 1 in the paper, in which we also show the accuracy obtained by employing the validation strategy introduced in Eq.~\eqref{eq:acc}. We can observe that \texttt{MSDA-WJDOT}$^{acc}$ provides good performance, comparable with both \texttt{MSDA-WJDOT} and the other MSDA methods, but \texttt{MSDA-WJDOT} still remains the state of the art.

\begin{table}[!h]
\begin{center}\small
\begin{tabular}{c||c|c|c|c|c}  
\hline
\textbf{Method} & \textbf{Amazon} & \textbf{dslr} & \textbf{webcam} & \textbf{Caltech10} & \textbf{AR}\\
\hline
\texttt{Baseline} & $93.13 \pm 0.07 $& $94.12 \pm 0.00$ & $89.33 \pm 1.63$ & $82.65 \pm 1.84$ & 6.75 \\
\hline
\texttt{IWERM} \cite{Sugiyama2007} & $93.30 \pm 0.75$ & $\pmb{100.00 \pm 0.00}$ &$89.33 \pm 1.16$ & $\pmb{91.19 \pm 2.57} $& 3.25\\ 
\texttt{CJDOT}$^{acc}$ \cite{Courty2017}&  $92.27 \pm 0.83$ & $97.06 \pm 2.94$  & $90.33 \pm 2.33$ & $86.19 \pm 0.09$ &  4.50 \\
\texttt{CJDOT} \cite{Courty2017}&  $93.74 \pm 1.57$ & $93.53\pm4.59$  & $90.33 \pm 2.13$ & $85.84 \pm 1.73$ & 4.50\\
\texttt{MJDOT}$^{acc}$ \cite{Courty2017} & $93.61 \pm 0.04$& $ 98.82\pm 2.35 $ & ${91.00 \pm 1.53 }$ & $ 85.22\pm 1.48$ & 3.75  \\
\texttt{MJDOT} \cite{Courty2017}& $94.12 \pm 1.57$& $97.65\pm  2.88$ & $90.27 \pm 2.48$ & $84.72\pm 1.73$ & 4.50  \\
\texttt{JCPOT}$^*$ \cite{Redko2019} & $79.23\pm 3.09 $& $81.77\pm 2.81 $&$93.93\pm 0.60$&$77.91 \pm 0.45 $& {7.25}\\
\texttt{WBT}$^*$ \cite{montesuma2021}& $59.86\pm2.48 $& $60.99\pm2.15 $&$64.13\pm 2.38$&$62.80 \pm 1.61 $&{9.50}\\
\texttt{WBT$_{reg}^*$} \cite{montesuma2021}& $92.74\pm 0.45 $& $95.87\pm 1.43 $&$ \pmb{96.57\pm 1.76}$&$ 85.01\pm 0.84 $&{5.00}\\
\hline
\texttt{MSDA-WJDOT}$^{acc}$ & $ 93.61 \pm 0.09$ & $\pmb{100.00\pm 0.00}$ & $ 86.00 \pm  2.91$ & $ 85.49 \pm 1.69$ & 4.25 \\
\texttt{MSDA-WJDOT} & $\pmb{94.23\pm 0.90}$ & $\pmb{100.00\pm 0.00}$ & $89.33\pm 2.91$ & $85.93\pm 2.07$ &  \textbf{2.75} \\
\hline
\texttt{Target}  & $95.77 \pm 0.31$& $88.35\pm 2.76$ & $99.87\pm 0.65$  & $89.75\pm 0.85$ & - \\
\texttt{Baseline+Target}  & $94.78 \pm 0.48$ &  $99.88\pm 0.82$ & $100.00\pm 0.00$ & $91.89\pm 0.69$ & -\\
\hline
\end{tabular}
\end{center}
\caption{Accuracy on the Caltech-Office dataset. The last column (AR) reports the average rank across target domains. Results of methods marked by $^*$ are from \cite{montesuma2021}.}
\label{tab:CaltechOffice2}
\end{table}
%}

\paragraph{Music-speech discrimination}
\begin{figure}[!h]
\centering\includegraphics[scale=0.5]{Figures/BLSTM.png}
\caption{BLSTM architecture. A similar architecture is used for the multi-task learning approach: we use the same embedding function $g$ and $J$ classification functions $f_{j}$.}
\label{fig:BLSTM}
\end{figure}

%\ros{
The model we adopted is shown in Fig.~\ref{fig:BLSTM}, where $g$ is a two-layer Bidirectional Long Short-Term Memory (BLSTM) network that feeds its last hidden state to a single feed-forward layer $f$. Weights are initialized with Xavier initialization. Training is performed with the Adam optimizer with momentum $0.9$ and $\epsilon = 10^{-8}$. The learning rate decays exponentially every epoch. We grid-search the initial learning rate and the decay rate.



In Table~\ref{tab:MSdiscrimination2} we show the MSDA performance in music-speech discrimination. In particular, for the MSDA-WJDOT and JDOT variants marked with the superscript $acc$, the validation strategy described in Eq.~\eqref{eq:acc} has been employed. Results show that, although this is a valid strategy, the early stopping based on SSE described in Sec.~4 always outperforms it. The Average Rank shows that MSDA-WJDOT is state of the art in music-speech discrimination. % with both validation strategies.



\begin{table}[!h]
\begin{center}\small
\begin{tabular}{l||c|c|c|c|c}  
\hline
\textbf{Method} & \textbf{F16} & \textbf{Buccaneer2} & \textbf{Factory2} & \textbf{Destroyerengine} & \textbf{AR}\\
\hline
\texttt{Baseline} & $69.67 \pm 8.78$ & $57.33 \pm 7.57$ & $83.33 \pm 9.13$ & $87.33\pm 6.72$ & 11.25  \\
\hline
\texttt{IWERM} \cite{Sugiyama2007} &$72.22\pm 3.93$ & $58.33 \pm 5.89$& $85.00 \pm 6.23$ & $81.64 \pm 3.33$ &  10.75\\
\texttt{IWERM}$_{mtl}$ \cite{Sugiyama2007} & $75.00\pm 0.00$ & $66.67 \pm 0.00$ & $\pmb{100.00 \pm 0.00}$ & $98.33 \pm 3.33$ &  5.50\\
\texttt{DCTN} \cite{Xu2018}& $66.67\pm 3.61$ & $68.75 \pm 3.61$ & $87.50 \pm 12.5$ & $94.44 \pm 7.86$ & 8.50 \\ 
\texttt{M}$^{\pmb{3}}$\texttt{SDA} \cite{Peng2018}& $70.00 \pm 4.08$ & $61.67 \pm 4.08$& $85.00 \pm 11.05$ & $83.33 \pm 0.00 $ & 10.25 \\
\texttt{CJDOT} \cite{Courty2017}& $ 59.50 \pm 13.95 $ & $ 50.00\pm 0.00$ & $ 83.33 \pm 0.00 $ & $91.67 \pm 0.00 $ &  11.50 \\
\texttt{CJDOT}$_{mtl}$ \cite{Courty2017}& $ 83.83 \pm 5.11 $ & $ 74.83 \pm 1.17$ & $ \pmb{100.00 \pm 0.00}$ & $ 95.74 \pm 16.92 $ & 4.00 \\
\texttt{CJDOT}$^{acc}_{mtl}$ \cite{Courty2017}& $79.83 \pm 4.74$ & $74.83 \pm 1.17$ & $99.67 \pm 1.63$ & $\pmb{100.00\pm 0.00}$ &  3.50 \\
\texttt{MJDOT} \cite{Courty2017}& $ 66.33  \pm 9.57 $ & $ 50.00\pm 0.00$ & $ 83.33 \pm 0.00 $ & $91.67 \pm 0.00 $&  11.50 \\
\texttt{MJDOT}$_{mtl}$ \cite{Courty2017}& $ 86.00 \pm 4.55 $ & $ 72.83\pm 5.73$ & $97.67 \pm 3.74$ & $97.74 \pm 8.28$ &  4.00 \\
\texttt{MJDOT}$^{acc}_{mtl}$ \cite{Courty2017} & $77.67 \pm 5.12$ & $69.00 \pm 4.72$ & $99.67 \pm 1.63$ & $99.83 \pm 1.17$ & 4.75 \\
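% NOTE(review): the three rows below report exactly the same accuracies as the
% JCPOT / WBT / WBT_reg rows of the Caltech-Office table (tab:CaltechOffice2),
% and differ from the Music-Speech values given for these methods in the
% disabled table tab:MSdiscrimination further down this file. Likely a
% copy-paste error -- verify the values, the bold entries and the AR column.
\texttt{JCPOT}$^*$ \cite{Redko2019} & $79.23\pm 3.09 $& $81.77\pm 2.81 $&$93.93\pm 0.60$&$77.91 \pm 0.45 $& {7.50}\\
\texttt{WBT}$^*$ \cite{montesuma2021} & $59.86\pm2.48 $& $60.99\pm2.15 $&$64.13\pm 2.38$&$62.80 \pm 1.61 $&{13.00}\\
\texttt{WBT$_{reg}^*$} \cite{montesuma2021} & $ \pmb{92.74\pm 0.45} $& $\pmb{95.87\pm 1.43} $&${96.57\pm 1.76}$&$ 85.01\pm 0.84 $&{4.25}\\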
\hline
\texttt{MSDA-WJDOT} & $83.33 \pm 0.00$ & $58.33 \pm 6.01$ & $87.00\pm 6.05$ & $89.00 \pm 4.84$& 8.00\\
\texttt{MSDA-WJDOT}$_{mtl}$& ${87.17 \pm 4.15} $& $74.83\pm 1.20 $& $99.67 \pm 1.63$ & $99.67 \pm 1.63 $& \textbf{2.75} \\
\texttt{MSDA-WJDOT}$^{acc}_{mtl}$ &$ 83.00 \pm 4.07$ & $75.00 \pm 0.00 $& $\pmb{100.00 \pm 0.00}$ & $98.83 \pm 3.34 $& 3.50\\
\texttt{MSDA-WJDOT}$^{acc}$ & $83.33 \pm 0.00$ & $58.33 \pm 6.01$ & $87.00\pm 6.05$ & $89.00 \pm 4.84$& 8.00\\
\hline
\texttt{Target} & $73.67 \pm 6.09$ & $69.17\pm 7.50$ & $77.33 \pm 4.73$  & $73.17\pm 9.90$ & - \\
\texttt{Baseline+Target} & $71.06\pm 9.31 $ & $67.62\pm 11.92$ & $85.33 \pm 11.85$ & $79.53 \pm 10.05$ & - \\
\hline
\end{tabular}
\end{center}
\caption{Accuracy on Music-Speech Dataset. Results of methods marked by $^*$ are from \cite{montesuma2021}.}
\label{tab:MSdiscrimination2}
\end{table}





\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%




%TABLE WITH REFERENCES
\iffalse
\begin{table}[!t]
\caption{Object recognition accuracy. The last column reports the average rank across target domain.\\
The marker ``$^*$'' besides a method indicate that the results are taken from \cite{montesuma2021}}
\label{tab:CaltechOffice}
\begin{center}
\resizebox{\linewidth}{!}{% 

\begin{tabular}{l|c|c|c|c|c}  
\hline
\textbf{Method} & \textbf{Amazon} & \textbf{dslr} & \textbf{webcam} & \textbf{Caltech10} & \textbf{AR} \\
\hline
\texttt{Baseline} & $93.13 \pm 0.07 $& $94.12 \pm 0.00$ & $89.33 \pm 1.63$ & $82.65 \pm 1.84$ &  5.25 \\
\hline
\texttt{IWERM} \cite{Sugiyama2007} & $93.30 \pm 0.75$ & $\pmb{100.00 \pm 0.00}$ &$89.33 \pm 1.16$ & $\pmb{91.19 \pm 2.57} $& 3.00 \\ 
\texttt{CJDOT} \cite{Courty2017}&  $93.71 \pm 1.57$ & $93.53\pm4.59$  & ${90.33 \pm 2.13}$ & $85.84 \pm 1.73$ &  {3.75}\\
\texttt{MJDOT} \cite{Courty2017}& $94.12 \pm 1.57$& $97.65\pm  2.88$ & $90.27 \pm 2.48$ & $84.72\pm 1.73$ & {3.50} \\
\texttt{JCPOT}$^*$ \cite{Redko2019}& $79.23\pm 3.09 $& $81.77\pm 2.81 $&$93.93\pm 0.60$&$77.91 \pm 0.45 $& {6.00}\\
\texttt{JCPOT-LP}$^*$\cite{Redko2019} & $83.45\pm 0.15 $& $81.51\pm 1.65 $&$ 91.35\pm 1.91$&$ 79.65\pm 0.54 $&{6.00}\\
\texttt{WBT} \cite{montesuma2021}& $59.86\pm2.48 $& $60.99\pm2.15 $&$64.13\pm 2.38$&$62.80 \pm 1.61 $&{8.25}\\
\texttt{WBT$_{reg}$}\cite{montesuma2021} & $92.74\pm 0.45 $& $95.87\pm 1.43 $&$ \pmb{96.57\pm 1.76}$&$ 85.01\pm 0.84 $&{3.50}\\
\hline
\texttt{MSDA-WJDOT} & $\pmb{94.23\pm 0.90}$ & $\pmb{100.00\pm 0.00}$ & $89.33\pm 2.91$ & $85.93\pm 2.07$ & \textbf{2.50} \\
\hline
\texttt{Target} & $95.77 \pm 0.31$& $88.35\pm 2.76$ & $99.87\pm 0.65$  & $89.75\pm 0.85$ & -  \\
\texttt{Baseline+Target} & $94.78 \pm 0.48$ &  $99.88\pm 0.82$ & $100.00\pm 0.00$ & $91.89\pm 0.69$ & -  \\
\hline
\end{tabular}}
\end{center}
\end{table}



%TABLE WITHOUT REF AND WITH JCPOT-LP
\begin{table}[!t]
    \caption{Object recognition accuracy. The last column reports the average rank across target domain. Result of methods marked by $^*$ are from \cite{montesuma2021}.
}
\label{tab:CaltechOffice}
\begin{center}
\resizebox{\linewidth}{!}{% 
\vspace{-.3truecm}
    \begin{tabular}{l|c|c|c|c|c}  
\hline
\textbf{Method} & \textbf{Amazon} & \textbf{dslr} & \textbf{webcam} & \textbf{Caltech10} & \textbf{AR} \\
\hline
\texttt{Baseline} & $93.13 \pm 0.07 $& $94.12 \pm 0.00$ & $89.33 \pm 1.63$ & $82.65 \pm 1.84$ &  5.25 \\
\hline
\texttt{IWERM} & $93.30 \pm 0.75$ & $\pmb{100.00 \pm 0.00}$ &$89.33 \pm 1.16$ & $\pmb{91.19 \pm 2.57} $& 3.00 \\ 
\texttt{CJDOT}&  $93.71 \pm 1.57$ & $93.53\pm4.59$  & ${90.33 \pm 2.13}$ & $85.84 \pm 1.73$ &  {3.75}\\
\texttt{MJDOT}& $94.12 \pm 1.57$& $97.65\pm  2.88$ & $90.27 \pm 2.48$ & $84.72\pm 1.73$ & {3.50} \\
\texttt{JCPOT}$^*$ & $79.23\pm 3.09 $& $81.77\pm 2.81 $&$93.93\pm 0.60$&$77.91 \pm 0.45 $& {6.00}\\
\texttt{JCPOT-LP}$^*$ & $83.45\pm 0.15 $& $81.51\pm 1.65 $&$ 91.35\pm 1.91$&$ 79.65\pm 0.54 $&{6.00}\\
\texttt{WBT} & $59.86\pm2.48 $& $60.99\pm2.15 $&$64.13\pm 2.38$&$62.80 \pm 1.61 $&{8.25}\\
\texttt{WBT$_{reg}$} & $92.74\pm 0.45 $& $95.87\pm 1.43 $&$ \pmb{96.57\pm 1.76}$&$ 85.01\pm 0.84 $&{3.50}\\
\hline
\texttt{MSDA-WJDOT} & $\pmb{94.23\pm 0.90}$ & $\pmb{100.00\pm 0.00}$ & $89.33\pm 2.91$ & $85.93\pm 2.07$ & \textbf{2.50} \\
\hline
\texttt{Target} & $95.77 \pm 0.31$& $88.35\pm 2.76$ & $99.87\pm 0.65$  & $89.75\pm 0.85$ & -  \\
\texttt{Baseline+Target} & $94.78 \pm 0.48$ &  $99.88\pm 0.82$ & $100.00\pm 0.00$ & $91.89\pm 0.69$ & -  \\
\hline
\end{tabular}}
\end{center}
\end{table}
\fi


%TABLE WITH REFERENCES
\iffalse
\begin{table}[!t]
\caption{Music-Speech discrimination accuracy and average rank across target domains. Result of methods marked by $^*$ are from \cite{montesuma2021}.
%\\$^*$Results from \cite{montesuma2021}
}
\label{tab:MSdiscrimination}

\begin{center}
\resizebox{\linewidth}{!}{% 
\begin{tabular}{l|c|c|c|c|c}  
\hline
\textbf{Method} & \textbf{F16} & \textbf{B2} & \textbf{F2} & \textbf{D} & \textbf{AR}\\
\hline
\texttt{Baseline} & $69.67 \pm 8.78$ & $57.33 \pm 7.57$ & $83.33 \pm 9.13$ & $87.33\pm 6.72$ & {8.25} \\
\hline
\texttt{IWERM} \cite{Sugiyama2007} &$72.22\pm 3.93$ & $58.33 \pm 5.89$& $85.00 \pm 6.23$ & $81.64 \pm 3.33$ &  {8.00} \\
\texttt{IWERM}$_{MTL}$ \cite{Sugiyama2007} & $75.00\pm 0.00$ & $66.67 \pm 0.00$ & \pmb{$ 100.00 \pm 0.00$}&  $98.33
\pm 3.33 $ & {3.50}\\
\texttt{DCTN} \cite{Xu2018}& $66.67\pm 3.61$ & $68.75 \pm 3.61$ & $87.50 \pm 12.5$ & $94.44 \pm 7.86$ & {6.00}\\ 
\texttt{M}$^{\pmb{3}}$\texttt{SDA} \cite{Peng2018}& $70.00 \pm 4.08$ & $61.67 \pm 4.08$& $85.00 \pm 11.05$ & $83.33 \pm 0.00 $ & {7.50}\\
\texttt{CJDOT} \cite{Courty2017}& $ 59.50 \pm 13.95 $ & $ 50.00\pm 0.00$ & $ 83.33 \pm 0.00 $ & $91.67 \pm 0.00 $ & {8.75} \\
\texttt{CJDOT}$_{MTL}$ \cite{Courty2017}& $ 83.83 \pm 5.11 $ & $ \pmb{74.83 \pm 1.17}$ & $ \pmb{100.00 \pm 0.00}$ & $ 95.74 \pm 16.92 $ & {2.50}\\
\texttt{MJDOT}\cite{Courty2017}& $ 66.33  \pm 9.57 $ & $ 50.00\pm 0.00$ & $ 83.33 \pm 0.00 $ & $91.67 \pm 0.00 $&  {8.50}\\
\texttt{MJDOT}$_{MTL}$\cite{Courty2017}& $ 86.00 \pm 4.55 $ & $ 72.83\pm 5.73$ & $97.67 \pm 3.74$ & $97.74 \pm 8.28$ & 2.50 \\
\texttt{JCPOT}$^*$\cite{Redko2019} & $51.92\pm 3.25$& $35.87\pm 0.41$&$ 51.95\pm1.75$&$ 48.47\pm 2.97$& {11.75}\\

\texttt{JCPOT-LP}$^*$ \cite{Redko2019}& $56.30\pm 0.37$& $ 36.40\pm0.39 $&$ 51.95\pm3.25 $&$ 52.92\pm 1.32$&{11.00}\\

\texttt{WBT} \cite{montesuma2021}& $25.30\pm6.02 $& $21.37\pm2.25 $&$22.70\pm 2.25$&$24.30 \pm 2.71 $&{12.75}\\
\texttt{WBT$_{reg}$}\cite{montesuma2021} & $84.40\pm 1.71 $& $70.60\pm 1.27 $&$ 90.17\pm 0.46$&$ 83.05\pm 0.97 $&{5.00}\\

\hline
\texttt{MSDA-WJDOT} & $83.33 \pm 0.00$ & $58.33 \pm 6.01$ & $87.00\pm 6.05$ & $89.00 \pm 4.84$&{6.25} \\
\texttt{MSDA-WJDOT}$_{MTL}$& $\pmb{87.17 \pm 4.15} $& $\pmb{74.83\pm 1.20} $& $99.67 \pm 1.63$ & $\pmb{99.67 \pm 1.63} $& \textbf{1.25} \\
\hline
\texttt{Target} & $ 73.67 \pm 6.09
$  & $69.17\pm 7.50$ & $77.33 \pm 4.73$  & $73.17\pm 9.90$ & - \\
\texttt{Baseline+Target} & $71.06\pm 9.31 $ & $67.62\pm 11.92$ & $85.33 \pm 11.85$ & $79.53 \pm 10.05$ & - \\
\hline
\end{tabular}}
\end{center}
\end{table}


%TABLE WITHOUT REF AND WITH JCPOT-LP
\begin{table}[!ht]
\caption{Music-Speech discrimination accuracy and average rank across target domains. Result of methods marked by $^*$ are from \cite{montesuma2021}.
}
\label{tab:MSdiscrimination}

\begin{center}
\resizebox{\linewidth}{!}{% 
\vspace{-.3truecm}
\begin{tabular}{l|c|c|c|c|c}  
\hline
\textbf{Method} & \textbf{F16} & \textbf{B2} & \textbf{F2} & \textbf{D} & \textbf{AR}\\
\hline
\texttt{Baseline} & $69.67 \pm 8.78$ & $57.33 \pm 7.57$ & $83.33 \pm 9.13$ & $87.33\pm 6.72$ & {8.25} \\
\hline
\texttt{IWERM}&$72.22\pm 3.93$ & $58.33 \pm 5.89$& $85.00 \pm 6.23$ & $81.64 \pm 3.33$ &  {8.00} \\
\texttt{IWERM}$_{MTL}$& $75.00\pm 0.00$ & $66.67 \pm 0.00$ & \pmb{$ 100.00 \pm 0.00$}&  $98.33
\pm 3.33 $ & {3.50}\\
\texttt{DCTN} & $66.67\pm 3.61$ & $68.75 \pm 3.61$ & $87.50 \pm 12.5$ & $94.44 \pm 7.86$ & {6.00}\\ 
\texttt{M}$^{\pmb{3}}$\texttt{SDA} & $70.00 \pm 4.08$ & $61.67 \pm 4.08$& $85.00 \pm 11.05$ & $83.33 \pm 0.00 $ & {7.50}\\
\texttt{CJDOT} & $ 59.50 \pm 13.95 $ & $ 50.00\pm 0.00$ & $ 83.33 \pm 0.00 $ & $91.67 \pm 0.00 $ & {8.75} \\
\texttt{CJDOT}$_{MTL}$ & $ 83.83 \pm 5.11 $ & $ \pmb{74.83 \pm 1.17}$ & $ \pmb{100.00 \pm 0.00}$ & $ 95.74 \pm 16.92 $ & {2.50}\\
\texttt{MJDOT}& $ 66.33  \pm 9.57 $ & $ 50.00\pm 0.00$ & $ 83.33 \pm 0.00 $ & $91.67 \pm 0.00 $&  {8.50}\\
\texttt{MJDOT}$_{MTL}$& $ 86.00 \pm 4.55 $ & $ 72.83\pm 5.73$ & $97.67 \pm 3.74$ & $97.74 \pm 8.28$ & 2.50 \\
\texttt{JCPOT}$^*$ & $51.92\pm 3.25$& $35.87\pm 0.41$&$ 51.95\pm1.75$&$ 48.47\pm 2.97$& {11.75}\\

\texttt{JCPOT-LP}$^*$ & $56.30\pm 0.37$& $ 36.40\pm0.39 $&$ 51.95\pm3.25 $&$ 52.92\pm 1.32$&{11.00}\\

\texttt{WBT}$^*$ & $25.30\pm6.02 $& $21.37\pm2.25 $&$22.70\pm 2.25$&$24.30 \pm 2.71 $&{12.75}\\
\texttt{WBT$_{reg}^{*}$}& $84.40\pm 1.71 $& $70.60\pm 1.27 $&$ 90.17\pm 0.46$&$ 83.05\pm 0.97 $&{5.00}\\

\hline
\texttt{MSDA-WJDOT} & $83.33 \pm 0.00$ & $58.33 \pm 6.01$ & $87.00\pm 6.05$ & $89.00 \pm 4.84$&{6.25} \\
\texttt{MSDA-WJDOT}$_{MTL}$& $\pmb{87.17 \pm 4.15} $& $\pmb{74.83\pm 1.20} $& $99.67 \pm 1.63$ & $\pmb{99.67 \pm 1.63} $& \textbf{1.25} \\
\hline
\texttt{Target} & $ 73.67 \pm 6.09
$  & $69.17\pm 7.50$ & $77.33 \pm 4.73$  & $73.17\pm 9.90$ & - \\
\texttt{Baseline+Target} & $71.06\pm 9.31 $ & $67.62\pm 11.92$ & $85.33 \pm 11.85$ & $79.53 \pm 10.05$ & - \\
\hline
\end{tabular}}
\end{center}
\end{table}
\fi

