% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
%\usepackage{algorithm}  
%\usepackage{algorithmic} 
\usepackage{algpseudocode}
\usepackage[vlined,ruled,linesnumbered]{algorithm2e}
\usepackage{appendix}
\usepackage{hyperref}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{graphicx}
\usepackage{color}
\usepackage{subfigure}
\usepackage{amssymb}
\usepackage{amsmath}  
\usepackage{amsthm}
\newtheorem{definition}{\textbf{Definition}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{bbm}
\usepackage{xspace}
\setlength{\textfloatsep}{4pt}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\alg}{CAT\xspace}

\title{Cross-Domain Adaptive Transfer Reinforcement \\ Learning Based on State-Action Correspondence}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Heng You}

\author[1,2]{Tianpei Yang \thanks{Correspondence to Tianpei Yang <tpyang@tju.edu.cn>, Jianye Hao <jianye.hao@tju.edu.cn>}} 

\author[1]{Yan Zheng}
\author[1]{Jianye Hao\textsuperscript{$*$}}
\author[2]{Matthew E. Taylor}

% Add affiliations after the authors

\affil[1]{College of Intelligence and Computing, Tianjin University, China}

\affil[2]{Department of Computing Science, University of Alberta and Alberta Machine Intelligence Institute, Canada}
  
  \begin{document}
\maketitle

\begin{abstract}
Despite the impressive success achieved in various domains, deep reinforcement learning (DRL) is still faced with the sample inefficiency problem. 
Transfer learning (TL), which leverages prior knowledge from different but related tasks to accelerate the target task learning, has emerged as a promising direction to improve RL efficiency. 
The majority of prior work considers TL across tasks with the same state-action spaces, while transferring across domains with different state-action spaces is relatively unexplored. 
Furthermore, such existing cross-domain transfer approaches only enable transfer from a single source policy, leaving open the important question of how to best transfer from multiple source policies. This paper proposes a novel framework called \emph{Cross-domain Adaptive Transfer} (\alg) to accelerate DRL. \alg learns the state-action correspondence from each source task to the target task and adaptively transfers knowledge from multiple source task policies to the target policy. \alg can be easily combined with existing DRL algorithms and experimental results show that \alg significantly accelerates learning and outperforms other cross-domain transfer methods on multiple continuous action control tasks. 
The code for this project is released on the project page: \href{https://github.com/TJU-DRL-LAB/transfer-and-multi-task-reinforcement-learning/tree/main/Single-agent Transfer RL/Cross-domain Transfer}{https://github.com/TJU-DRL-LAB/transfer-and-multi-task-reinforcement-learning}.
\end{abstract}

\section{Introduction}\label{sec:intro}
Deep reinforcement learning (DRL), which combines deep neural networks with RL algorithms \citep{DBLP:books/lib/SuttonB98}, has achieved impressive success in multiple domains like game playing \citep{DBLP:journals/nature/MnihKSRVBGRFOPB15,DBLP:journals/nature/SilverHMGSDSAPL16} and continuous control \citep{DBLP:journals/corr/LillicrapHPHETS15}. However, DRL still suffers from the sample inefficiency problem, requiring large amounts of interaction with the environment. Transfer learning (TL), as a technique to accelerate the learning process of RL by leveraging prior knowledge, has become a popular research direction for significantly reducing sample complexity \citep{DBLP:journals/jmlr/TaylorS09,DBLP:journals/corr/abs-2009-07888,ptf,maptf}. 

One major branch of transfer in RL focuses on leveraging external knowledge from pre-trained policies on source tasks, which we call \emph{policy transfer}.
%A promising technique of policy transfer works distills the knowledge from source policies by minimizing the cross-entropy (or KL-divergence) between the state-conditional action distributions of the source and target policy networks \citep{DBLP:journals/corr/ParisottoBS15,DBLP:journals/corr/abs-1803-03835,ptf,ptfabs,DBLP:conf/icml/TaoGCSM21}, or maximizing the probability that the source policy will visit trajectories generated by the target policy \citep{DBLP:conf/nips/TehBCQKHHP17}.
These approaches either distill knowledge from source policies by imitation learning \citep{DBLP:journals/corr/RusuCGDKPMKH15,DBLP:journals/corr/abs-1803-03835,DBLP:journals/corr/ParisottoBS15,ptf,ptfabs,DBLP:conf/icml/TaoGCSM21},
or reuse source policies for exploration based on the evaluation of source policies on the target environment \citep{DBLP:conf/atal/FernandezV06,DBLP:conf/aaai/LiZ18a}.
However, all these methods rely on the assumption that the source tasks share the same state-action space as the target task, so that the source policies can be directly imitated or reused. Earlier works transfer knowledge between tasks with different state-action spaces based on hand-coded or learned mappings \citep{DBLP:conf/atal/TaylorWS07,DBLP:conf/atal/TaylorKS08} in tabular settings, which cannot be applied to more complex tasks. 
%\MET{Why don't previous works that transfer policies between tasks with different state action spaces count (e.g., using hand-coded inter-task mappings to transfer a policy \url{https://irll.ca/files/publications/aamas07-taylor.pdf} or learning inter-task mappings for value function transfer \url{https://irll.ca/files/publications/aamas08-taylor.pdf})?}
In practice, tasks in real-world scenarios may exhibit many differences, not only in the dynamics and rewards but also in the mismatch between the state-action space. 
An ideal method should be capable of handling such a mismatch and realize more generalized transfer learning. 
%In addition to distilling knowledge from source policies, another idea of policy transfer focuses on reusing source policies directly, e.g., \citet{DBLP:conf/atal/FernandezV06,DBLP:conf/aaai/LiZ18a} selects a source policy to interact with the environment based on the expected performance of each source policy to assist the exploration of the target policy.
%As described above, all the methods mentioned are limited to settings where source tasks must share the same state-action space as the target task. However, most tasks in real-world scenarios have many differences from each other especially the mismatch between the state-action space, which makes the above methods unable to be applied to a more realistic setting.


Recently, a few approaches consider how to deal with the mismatch in the state-action space by mapping state spaces into a common feature space using a state encoder \citep{DBLP:conf/iclr/0004DLAL17,DBLP:conf/uai/WanG020}. However, each of these methods has limitations. \citet{DBLP:conf/iclr/0004DLAL17} require paired data from the two tasks, collected by pre-trained policies or human labeling, to train the encoder, which is a strong assumption and is usually expensive for real-world problems. 
MIKT \citep{DBLP:conf/uai/WanG020} only considers the relevance of the state embedding and the target state to train the encoder, which makes the state embeddings unable to reflect the correlation with the source state, and finally influences the transfer performance. %Besides, they do not consider the problem of the action space mismatch limited to their transfer methods, which may help design more diverse transfer methods and support more efficient knowledge transfer. 
Some works \citep{DBLP:journals/corr/abs-1909-02291,DBLP:conf/iclr/ZhangXEPW21} focus on learning both the mapping between the state spaces and action spaces for transfer. However, the former method can only deal with discrete action spaces. % limited to the design of their action embedding as a matrix, and the 
The latter method adopts zero-shot transfer through the mapping, which cannot achieve optimal performance on the target task. Furthermore, all of the above methods only consider learning a one-to-one mapping and transferring a single source policy. This paper instead tackles the more difficult case of learning to transfer from multiple tasks with different state-action spaces.

To this end, we propose a novel framework called \emph{Cross-domain Adaptive Transfer} (\alg), which adaptively transfers multiple source policies with different state-action spaces. Unlike previous works, \alg neither requires paired data to learn the state-action correspondence nor learns an insufficiently trained state correspondence. Instead, \alg learns the state-action correspondence from each source domain to the target domain through a state encoder, an action encoder, and a reverse state encoder, using the trajectories of the source policies. 
Since the source environments cannot be accessed to collect further information, we do not need reverse action encoders to map actions back to the source environments.
Besides, \alg learns the state embeddings which can satisfy the properties proposed in Section~\ref{sec:3.2} to achieve better transfer performance by proposing extra optimization objectives.
Further, \alg evaluates each source policy on the target task and learns how helpful each source policy is to the target policy, and then uses the performance as the measurement to determine when and which source policies should be transferred. In this way, \alg can adaptively transfer multiple cross-domain policies into the target policy. In summary, our contributions are as follows: 

\begin{itemize}
\item Our novel transfer framework, \alg, consists of three main components: an agent module, a self-adaptation module, and a correction module, to solve the problem of adaptive knowledge transfer from multiple source policies with different state-action spaces.% by combining knowledge adaptively extracted from source policy networks into the target policy. 
\item \alg learns more sufficiently trained state embeddings and action embeddings using the correction module and the agent module, which serve as the basis of the subsequent transfer process.
\item \alg combines knowledge from source policy networks with the target policy network using an adaptive weighting factor generated by the self-adaptation module.
\item \alg can be easily combined with existing DRL algorithms and experimental results show that \alg efficiently accelerates RL and outperforms other related transfer methods on continuous control tasks with different state-action spaces.
\end{itemize}
%\ytp{The mapping is a part of our framework, should not be separated as a single contribution. The contributions are, 1) definition, 2) firstly propose a novel framework that enables ... 3) containing three main components 4) experimental results}

\section{Background}
This section introduces notation and defines our problem setting.
We typically model RL problems with a Markov decision process (MDP), which can be described as a tuple $\mathcal{M} = \langle \mathcal{S}, \mathcal{A}, \mathcal{R}, \mathcal{T}, \gamma \rangle$, where $\mathcal{S}$ and $\mathcal{A}$ are the sets of states and actions, respectively; $\mathcal{T}: \mathcal{S} \times \mathcal{A} \times \mathcal{S} \mapsto [0,1]$ is the state transition probability function; $\mathcal{R}: \mathcal{S} \times \mathcal{A} \times \mathcal{S} \mapsto \mathbbm{R}$ is the reward function which gives returns on the agent's performance; and $\gamma$ is the discount factor for future rewards. A policy $\pi: \mathcal{S} \times \mathcal{A} \mapsto [0,1] $ is defined as a state-conditioned probability distribution over actions and the objective of the agent is to find an optimal policy $\pi^*$ maximizing the expected discounted return $R = \sum_{i=t}^T \gamma^{i-t}r_i$.

\textbf{Policy Gradient (PG) Algorithms.} Policy gradient methods are widely used to directly optimize the policy $\pi$ parameterized by $\theta$. Proximal policy optimization (PPO; \citealp{DBLP:journals/corr/SchulmanWDRK17}) is currently one of the most efficient PG methods.
%can avoid the large deviation of the results caused by the use of importance sampling. 
In each iteration, PPO tries to calculate a new policy $\pi_{\theta}$ and ensure that the difference between $\pi_{\theta}$ and the rollout policy $\pi_{\theta_{\textup{old}}}$ is not too large by adding a constraint during the training process. The following loss is minimized over multiple epochs:
\[
%\begin{equation}
  L_{\textup{PPO}}^{\theta} = -\mathbbm{E}_{\tau}\left[\min\left(r_t(\theta)\hat{A}_t,\textup{clip}(r_t(\theta),1-\varepsilon,1+\varepsilon)\hat{A}_t\right)\right]
%\end{equation}
\]
where $r_t(\theta) = \frac{\pi_{\theta}(a_t \mid s_t)}{\pi_{\theta_{\textup{old}}}(a_t \mid s_t)}$ is the ratio of the action probabilities under the current policy and the rollout policy, and $\hat{A}_t$ is the estimated advantage. The value network $V_{\psi}$ is updated with temporal difference learning: $L_{\textup{PPO}}^{\psi} = \mathbbm{E}_{\tau}[(V_{\psi}(s_t) - V_{t}^{\textup{targ}})^2]$. The overall PPO minimization objective is:
\begin{equation}
\label{eq:L_PPO}
  L_{\textup{PPO}}(\theta,\psi) = L_{\textup{PPO}}^{\theta} + L_{\textup{PPO}}^{\psi}
\end{equation}

\textbf{Problem Settings.}
Same-domain transfer learning considers a source MDP $\mathcal{M}_{\textup{source}}=\langle\mathcal{S}, \mathcal{A}, \mathcal{R}_{\textup{source}}, \mathcal{T}_{\textup{source}}, \gamma \rangle$ 
and a target MDP $\mathcal{M}_{\textup{target}} = \langle \mathcal{S}, \mathcal{A}, \mathcal{R}_{\textup{target}}, \mathcal{T}_{\textup{target}}, \gamma \rangle$,  
where the two MDPs have the same state and action spaces, but other properties such as $\mathcal{R}$ or $\mathcal{T}$ may be different. A standard objective is to accelerate learning the target task by leveraging $\mathcal{M}_{\textup{source}}$ (relative to learning from scratch).

\begin{figure*}[ht]
    \centering
  \includegraphics[width=\linewidth]{Method/CAT.pdf}
    \caption{An illustration of our Cross-domain Adaptive Transfer framework which contains three main components: (a) Self-adaptation Module. (b) Correction Module. (c) Agent Module. The state encoders $\phi_n$ are updated using $\{L_{\textup{PPO}}, L_{\textup{MI}}, L_{\textup{cyc}}, L_{\textup{corr}}\}$. The reverse state encoders $\phi_n^{-}$ are updated using $\{L_{\textup{cyc}}, L_{\textup{corr}}\}$. The action encoders $e_n$ are updated using $\{L_{\textup{corr}}\}$. Note that we only show the target policy network, and the value network is the same.}
    \label{fig:method}
\end{figure*}

In cross-domain transfer, the state and action spaces can be different: 
$\mathcal{M}_{\textup{source}}=\langle \mathcal{S}_{\textup{source}}, \mathcal{A}_{\textup{source}}, \mathcal{R}_{\textup{source}}, \mathcal{T}_{\textup{source}}, \gamma_{\textup{source}}\rangle$ and $\mathcal{M}_{\textup{target}}=\langle\mathcal{S}_{\textup{target}}, \mathcal{A}_{\textup{target}}, \mathcal{R}_{\textup{target}}, \mathcal{T}_{\textup{target}}, \gamma_{\textup{target}}\rangle$. However, current cross-domain TL methods are only able to successfully distill knowledge from a single source policy. This paper considers the problem of cross-domain transfer from multiple source tasks to a target task. We denote this as a set of source MDPs $\Pi_{\mathcal{M}} = \{\mathcal{M}_1, \mathcal{M}_2, \cdots, \mathcal{M}_n\}$ and a target MDP $\mathcal{M}_{\textup{target}}$, where $\mathcal{M}_i$ represents the $i$-th source MDP for convenience. We generally assume there are some high-level commonalities between the MDPs (e.g., quadruped, hexapod, and octopod robots may have qualitatively similar gaits). In this work, our objective is to adaptively transfer knowledge from $\Pi_{\mathcal{M}}$ to accelerate the learning process on the target task. 


\section{Methodology}\label{sec:method}

In this section, we first introduce our whole framework and each component. Then, we describe how to learn state and action embeddings and how to adaptively transfer multiple cross-domain source policies to the target task. Finally, we describe in detail how \alg is combined with a specific DRL algorithm, PPO \citep{DBLP:journals/corr/SchulmanWDRK17}.
%\subsection{Motivation}
%Previous works deal with the mismatch in the state-action space through various approaches. One major direction focuses on solving the discordance of states by mapping state spaces into a common feature space \citep{DBLP:conf/uai/WanG020,DBLP:conf/iclr/0004DLAL17,DBLP:conf/nips/XuWC0Y20}. However, this way usually does not impose sufficient constraints on the state representation or requires mapped data and does not address the action space mismatch, which impedes further knowledge transfer. Another direction of these methods focuses on simultaneously learning the mapping between the state-action space to reuse policies pre-trained on the source task \citep{DBLP:conf/iclr/ZhangXEPW21,DBLP:journals/corr/abs-1909-02291}. However, these methods are faced with the challenge of how to improve the asymptotic performance on the target task, since the source policy is usually directly reused or only trained on the target task subsequently. Furthermore, all the above methods are limited to the setting of a single source policy. How to adaptively draw out beneficial knowledge from multiple source policies which have \emph{different state-action space} according to the target policy learning is an unsolved problem in previous works.

%In this work, we firstly propose a novel \textbf{Multi-Source based Adaptive Transfer Framework (MSATF)} to accelerate the learning process of the target policy by leveraging multiple dissimilar but related source policies. We propose a novel method of learning state and action embeddings to deal with the mismatch between the state-action space, which has stronger constraints on the state representation to find the state in the source task that is most relevant to the current state. Besides, MSATF draws out suitable knowledge from source policy networks and evaluates how helpful multiple source policies are to target policy learning. In this way, MSATF can accurately obtain the most suitable guidance of each source policy and evaluate how helpful the guidance is to the current state to maximize the performance of the agent.

\subsection{Framework Overview}


%In this section, we introduce our framework which improves sample-effificiency of the target policy learning by adaptively transferring knowledge from multiple source policies with different state-action spaces. 
Figure~\ref{fig:method} illustrates the proposed Cross-domain Adaptive Transfer framework (\alg) which contains three components. 
The three components described are not completely novel. However, we integrate them into \alg to better solve the cross-domain transfer problem and our empirical ablation studies validate their effectiveness and importance (see Section~\ref{sec:4.2}).

\paragraph{Correction Module} 
Instead of only considering the relevance between state embedding and the target state in MIKT \citep{DBLP:conf/uai/WanG020}, we propose four properties that the learned state embedding should satisfy and build two extra optimization objectives to learn state and action embeddings in the correction module, described in Section~\ref{sec:3.2}. 
A newly proposed \emph{correction module}  (see Figure~\ref{fig:method}(b)) is used to learn state embeddings that can satisfy these properties
and to learn action embeddings that can better capture the semantics of actions of the source and target tasks, both of which are described in Section~\ref{sec:3.2}. The goal of the correction module is to learn embeddings to distill knowledge from multiple source policies into the target task.

\paragraph{Self-Adaptation Module}
Inspired by existing same-domain transfer methods (e.g., \citealp{DBLP:conf/atal/FernandezV06}), we want our method to decide when and which source policy is better to transfer by evaluating them on the target environment. The \emph{self-adaptation module} (see Figure~\ref{fig:method}(a)) evaluates the source policies (via the relevant embeddings) for a fixed number of steps in the target environment. The average performance lets us set weighting factors so that we can combine these different source policies. We explain this idea in the context of \alg in Section~\ref{sec:3.3}.


\paragraph{Agent Module}
Once the embeddings are trained (via the correction module) and the source task policies are weighted (via the self-adaptation module), the agent is now ready to learn from both environmental interaction and guidance from the transferred policies. The \emph{agent module} allows our agent to distill knowledge from source policies, select actions to execute in the target environment, and learn a high-performing policy. See Figure~\ref{fig:method}(c) and Section~\ref{sec:3.3}. 






\subsection{Learning State-Action Correspondence}\label{sec:3.2}
This section considers how to learn meaningful state and action correspondences. We first introduce a set of state embedding spaces parameterized by a set of encoder functions $\{\phi_1, \phi_2, \cdots, \phi_n\}$.
Each state embedding is defined as $\mathcal{S}_{\textup{emb}_i}:=\{\phi_i(s)| s\in\mathcal{S}_{\textup{target}},\phi_i(s)\in\mathcal{S}_{\textup{source}_i}\}$, which will be used to map useful knowledge from source policies into the target policy. 
%For each state embedding, the dimension of the input and the embedding space is the same as that of $\mathcal{M}_{\textup{target}}$ and its corresponding source MDP respectively.

In order to extract more useful knowledge, each state embedding $\mathcal{S}_{\textup{emb}_i}$ should satisfy the following four properties: 
(1) The embeddings should be task-aligned to maximize the cumulative discount rewards in the target MDP. 
(2) The input states and state embeddings should be highly correlated so that the agent can receive the most appropriate guidance from source policies in the current state. 
(3) The embeddings should preserve enough information about the source task so that $\phi_i(s)$ can be reconstructed to the target task as consistently as possible.
(4) In addition to the correspondence on the single state, $s_s$ and $s_t$, the state embedding should keep the correspondence between state sequences of the source and target tasks.

To achieve property (1), we use the policy gradient to update the state encoder parameters \citep{DBLP:conf/uai/WanG020,DBLP:journals/corr/abs-1909-02291}. Property (2) can be achieved by maximizing the mutual information between states and embeddings to achieve a high correlation as follows  \citep{DBLP:conf/uai/WanG020}:

\[
%\begin{equation}
    \begin{aligned}
\mathcal{I}(s;e) &= \mathcal{H}(s) - \mathcal{H}(s|\phi(s)) \\
    &= \mathcal{H}(s) + \mathbbm{E}_{s,e}[\log p(s|e)] \\
    &= \mathcal{H}(s) + \mathbbm{E}_{s,e}[\log q_{\omega}(s|e)] \\
    &\ + \mathbbm{E}_{e}\left[D_{\textup{KL}}(p(s|e)||q_{\omega}(s|e))\right]\\
    &\geq \mathcal{H}(s) + \mathbbm{E}_{s,e}[\log q_{\omega}(s|e)] \\
\end{aligned}
%\end{equation}
\]
where $ \mathcal{H} $ denotes the differential entropy. The above optimization goal is known as a variational information maximization algorithm and the variational distribution $ q_{\omega}(s|e) $ approximates the true conditional distribution $ p(s|e) $. So the final optimization goal can be written as:

\begin{equation}
\label{eq:L_MI}
L_{\textup{MI}}(\phi) = -\mathbbm{E}_{s\sim\rho_{s}}\Big[\log q_{\omega}\big(s \mid \phi(s)\big)\Big]
\end{equation}
where $\rho_{s}$ denotes the state distribution of the target policy. 

However, relying only on the above two properties does not guarantee good enough transfer performance of the state embeddings.
Property (3) is also used by
\citet{DBLP:conf/iclr/0004DLAL17}, while \citet{DBLP:journals/corr/abs-1909-02291} only partially satisfy property (4), since they keep the correspondence on single states rather than on state sequences. 
Instead, we argue that state embeddings that satisfy all four properties will achieve better transfer performance --- this is empirically verified in our experiments. 

To this end, in addition to using policy gradient and mutual information to train state embeddings, we propose the \textbf{correction module} (Figure~\ref{fig:method}(b)) to satisfy the remaining two properties. We introduce a set of reverse state embeddings parameterized by a set of decoder functions $\{\phi_1^{-}, \phi_2^{-}, \cdots, \phi_n^{-}\}$, and each reverse state embedding is defined as $\mathcal{S}_{\textup{emb}_i}^{-}:=\{\phi_{i}^{-}(s)| s\in\mathcal{S}_{\textup{source}_i},\phi_{i}^{-}(s)\in\mathcal{S}_{\textup{target}}\}$.

In our method, we use the reverse state embeddings in two ways to build two types of optimization objectives corresponding to properties (3) and (4), respectively. Firstly, a pair of meaningful mapping functions $\phi$ and $\phi^{-}$ should be as invertible as possible: $\phi^{-}\big(\phi(s_t)\big) \approx s_t, \phi\big(\phi^{-}(s_s)\big) \approx s_s$, so that the state embeddings $\mathcal{S}_\textup{emb}$ can preserve as much information about the source domain as possible  \citep{DBLP:conf/iclr/0004DLAL17}. Therefore, it is expected that state embeddings $\mathcal{S}_\textup{emb}$ can map from the embedding spaces back to their original state spaces.
%which encourages $\mathcal{S}_\textup{emb}$ to preserve the largest amount of domain-invariant information (Figure~\ref{fig:method}(b)). 
To satisfy property (3), we define the \emph{cycle-consistency loss} as follows:
\begin{equation}
\label{eq:L_cyc}
\begin{aligned}
L_{\textup{cyc}}(\phi,\phi^{-}) &= \mathbbm{E}_{s_{t}}\Big[||\phi^{-}\big(\phi(s_t)\big) - s_t||_2\Big] \\
&+ \mathbbm{E}_{s_{s}\sim\tau_s}\Big[||\phi\big(\phi^{-}(s_s)\big) - s_s||_2\Big]\\
\end{aligned}   
\end{equation}
where $\tau_s$ denotes the trajectories of the source policies. 
Secondly, we minimize the deviation between the mapped state sequence and the real state sequence to satisfy property (4). Specifically, trajectories $\langle s_{s_T}, a_{s_T}, s_{s_{T+1}} \rangle$ sampled from source buffers, which are collected during the training of source policies, are mapped to the target environment as $\langle \phi^{-}(s_{s_T}), e(a_{s_T}), \phi^{-}(s_{s_{T+1}})\rangle$ through the reverse state encoders and action encoders.
Then, given the initial state $s_{s_0}$ of each source trajectory, we can obtain a true trajectory $\langle \langle \phi^{-}(s_{s_0}), e(a_{s_0}), s_{t_1} \rangle, \dots, \langle s_{t_T}, e(a_{s_T}), s_{t_{T+1}} \rangle \rangle$, starting with the mapped initial state $\phi^{-}(s_{s_0})$, by interacting with the target environment using each mapped action $e(a_{s_T})$ at each following state. %Then, we get the next state $s_{t+1}$ according to the current state $s_t$ and action $a_t$ which is obtained by mapping $a_s$ to the target environment using action embeddings. 
%At this time, the two next states $s_{s+1}$ and $s_{t+1}$ will have a distance in the embedding space and the distance between the two state sequences should be used to update the state embedding for a stronger constraint.
%To satisfy property (4), we define the \emph{correction loss}.
% it is expected that action embeddings are optimized to minimize this distance to satisfy the property that the distance should be adjacent if the actions have similar effects on the environment.
Figure~\ref{fig:action embedding_a} shows how the deviation is computed for the first step. To satisfy property (4), the \emph{correction loss} calculates the total deviation over trajectories as follows:
\begin{equation}
\label{eq:L_corr}
L_{\textup{corr}}(\phi,\phi^{-},e) = \mathbbm{E}_{s_s\sim\tau_s,s_t\sim\tau_t}\Big[||\phi^{-}(s_{s_{T+1}}) - s_{t_{T+1}}||_2^2\Big]
\end{equation}



where $\tau_s$ and $\tau_t$ denote the sampled trajectory and the true trajectory, respectively.
Note that it is not necessary for the reverse state embeddings to satisfy properties (1) and (2) since they just need to make sure the successful reconstruction of state embeddings and keep the correspondence between state sequences.
%they are not used during interacting with the environment. 
Therefore, the reverse state encoders are updated only using the cycle-consistency loss and the correction loss in the correction module.

Next, we introduce how to learn meaningful action correspondence which can capture the semantics of actions in the correction module. We introduce a set of action embedding spaces parameterized by a set of encoder functions $\{e_1, e_2, \cdots, e_n\}$. We follow the main idea that the semantics of actions can be reflected by their effects on the environment, which can be measured by the state transition probability in RL \citep{DBLP:journals/corr/abs-1909-02291}.  
The action embeddings should satisfy the property that actions with similar effects on the environment are mapped close to each other. This is encouraged by minimizing the discrepancy between the effect of a source action and the effect of its mapped action on the environment, as captured by Equation~\ref{eq:L_corr}. Therefore, the action encoders are updated using the correction loss in the correction module.

%\begin{equation}
%L_{\textup{dis}} = \mathbbm{E}_{s_{s+1}\sim\tau_s,s_{t+1}\sim\tau_t}\Big[||s_{s+1} - \phi(s_{t+1})||_2^2\Big]
%\end{equation}

%After theoretical derivation and experimental verification, we find that the above equation and the \emph{correction loss} (Equation~\ref{eq:L_corr}) have the same meaning and effect. Therefore, we use the \emph{correction loss} to optimize the state and action embeddings simultaneously for the unification of the overall algorithm. See Figure~\ref{fig:method}(b) for all details.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.93\linewidth]{Method/action embedding_a.pdf}
    \caption{The first step of computing the distance of the obtained trajectory and the real trajectory.}
    \label{fig:action embedding_a}
\end{figure}

\subsection{Adaptive Policy Transfer}
\label{sec:3.3}
In this section, we describe how to transfer knowledge adaptively from multiple source policies with learned state-action correspondence through the correction module. The first issue is how to determine when and which source policy should be transferred to the target task. This is achieved through the \textbf{self-adaptation module}, which evaluates the source policies and generates the weights for transferring different source task policies (Figure~\ref{fig:method}(a)). 
%the weighting factor of the self-adaptation module and the sufficiently trained state embeddings learned in the correction module. 
%Inspired by the main insight of lateral connections between the source and target networks, we use the representations extracted from the layers of the source networks to augment that of the target networks. 
%In this way, the problem of action space mismatch can be solved since the knowledge transfer happens in the hidden layers. 
%However, the method mentioned above is limited to the setting of a single source policy. Furthermore, it will face a major problem of how to measure the weights between multiple source policies when extended to our problem setting.
%In order to solve the above-mentioned problem, we propose the self-adaptation module to adapt the weights over different source task policies over time (Figure~\ref{fig:method}(a)). 
Specifically, the self-adaptation module first evaluates each source policy's performance on the target environment and uses the total return $u_i$ on a fixed number of episodes as the weight of each source policy in the next iteration after passing through the softmax function:
\[
%\begin{equation}
w_i = \frac{\exp(u_i)}{\sum_{n=1}^{N}\exp(u_n)}.
%\end{equation}
\]
This approach is simple, yet it is the most intuitive and interpretable way to weigh each source policy. 
With the weighting factor generated by the self-adaptation module, the \textbf{agent module} (Figure~\ref{fig:method}(c)) makes decisions by adaptively drawing out suitable knowledge from multiple source policies and value networks, denoted as $\pi_{\theta'_i}$ and $V_{\psi'_i}$ respectively. In general, we assume the source and target policy and value networks to have the same number of hidden layers $N_{\pi}$ for ease of exposition. 
Specifically, the agent module takes the current state $s_t$ and feeds it to the set of encoders to produce state embeddings $\mathcal{S}_\textup{emb}$, which can be readily passed through the source networks to extract  $\{z^j_{\theta'_i}, z^j_{\psi'_i}, 1 \leq j \leq N_{\pi}, 1 \leq i \leq N\}$, representing the pre-activation outputs of the $j$-th hidden layers of the $i$-th source policy and value networks. To get the pre-activation representations $\{z^j_{\pi_{\theta}}, z^j_{V_{\psi}}\}$ in the target networks, we use two weighted linear combinations, one for the outputs from the source and target networks and the other for outputs from multiple source policy networks: 

\begin{equation}\label{eq:transfer}
\begin{aligned}
z_{\pi_{\theta}}^{j} &= p z_{\theta}^{j} + (1 - p) \sum_{i=1}^{N} w_{i}z_{\theta'_i}^{j}, \\
z_{V_{\psi}}^{j} &= p z_{\psi}^{j} + (1 - p)\sum_{i=1}^{N} w_{i}z_{\psi'_i}^{j},
\end{aligned}
\end{equation}
 
where $w_i$ is the weight of source policy $\pi_i$. 
$p \in [0,1]$ is a factor that increases over time
%between the outputs from the source and target networks, 
and controls the influence of source policies on the target policy---a higher value of $p$ means less influence. %$w_i$, generated from the self-adaptation module, controls the influence of each source policy $\pi_i$ on the target policy. 
Besides, a higher value of $w_i$ means the average performance of the corresponding source policy on the target task is higher. Such a source policy can provide more beneficial knowledge. Meanwhile, at the beginning of the training, the agent selects an action relying more on source policies to gain assistance. As the training continues, the agent should focus more on the target task to avoid negative transfer. 
In this way, CAT more fully combines knowledge from multiple source task policies to facilitate more efficient learning. %The way to update the target policy is the same as the based DRL algorithm, which we describe in detail in the following section. 

%A higher value of $p$ means the lesser influence. 
%We consider this to be a general approach in the field of knowledge transfer, where more information from source task policies is needed in the early phase of the training process and the independence of the agent becomes more important at the end of training. To achieve this, we use an additional coupling-loss which makes $p_{\theta}^{j}, p_{\psi}^{j}$ gradually approach 1 over time in \citet{DBLP:conf/uai/WanG020} as follows:
%\begin{equation}
%\label{eq:L_coupling}
%L_{\textup{coupling}} = -\frac{1}{N_\pi}\sum_{j=1}^{N_\pi} %\textup{log}\left(p_{\theta}^{j}\right) - %\frac{1}{N_\pi}\sum_{j=1}^{N_V} \textup{log}\left(p_{\psi}^{j}\right)
%\end{equation}
%The variation of $p_{\theta}^{j}, p_{\psi}^{j}$ over time has been discussed in \citet{DBLP:conf/uai/WanG020}. At the end of training, the value will approach 1 which represents that the agent is completely independent of the source policies. 
  
%Therefore, the self-adaptation can learn when and which source policy is relatively bad and avoid negative transfer by setting a low weight to it.


%After each iteration, the trajectories obtained during the training process of source policies are mapped to the target environment through reverse state embedding and action embedding, so that the deviation between the obtained trajectory and the real trajectory on the target environment is used to update the state embedding and action embedding. Besides, the cycle-consistence loss is used between state embedding and reverse state embedding to strengthen constraints. 
\subsection{CAT-PPO}\label{sec:3.4}
%Given a set of source policies $\Pi_{s} = \{\pi_1, \pi_2, \cdots, \pi_n\}$, the \alg agent first initializes a state embedding, action embedding, and reverse state embedding for each source policy to support the mapping process. 


This section details \alg-PPO (Algorithm~\ref{alg:CAT}), where we integrate PPO \citep{DBLP:journals/corr/SchulmanWDRK17} into our framework; other DRL algorithms could easily be incorporated instead. \alg-PPO first initializes all the network parameters needed in the learning process (Line 1). In each iteration, the correction module first samples trajectories from each source task buffer to train all encoders following Section~\ref{sec:3.2} (Lines 4--6). Then, the self-adaptation module evaluates each source policy and gets the corresponding weight through state and action embeddings (Line 8). Next, the agent module outputs actions by combining knowledge from the target policy network and source policy networks (see Section~\ref{sec:3.3}). These actions are executed in the environment to collect trajectories (Line 10). Finally, the agent module computes the RL loss (Equation~\ref{eq:L_PPO}) and the mutual information loss (Equation~\ref{eq:L_MI}) for the update (Lines 11--13). 
%During this process, the \alg-PPO agent maps the current state to the most relevant state in each source environment through state encoders to adaptively distill knowledge from source policy networks with the obtained weighting factor.
%Then, according to the obtained weight, the agent adaptively distills knowledge from each source policy into the target policy and interacts with the environment normally through the method of weighted fusion of the middle layers. 
\begin{algorithm}[h]
	\caption{\alg-PPO}\label{alg:CAT}
		\textbf{Initialize:} state encoder parameters $\phi_i$, reverse state encoder parameters $\phi_i^{-}$, action encoder parameters $ e_i $, target policy and value network parameters $\theta, \psi$,  source buffer $\mathcal{D}_i$
		
		\Repeat{reaching maximum training steps}{
		\textcolor{gray}{// Correction module}\\
		Sample a batch of trajectories from each $\mathcal{D}_i$\\
		Update each $\phi_i,\phi_i^{-}$ \Comment{see Eq.~\eqref{eq:L_cyc}}\\
		Update each $\phi_i,\phi_i^{-},e_i$ \Comment{see Eq.~\eqref{eq:L_corr} } \\
		\textcolor{gray}{// Self-adaptation module}\\
		Evaluate each source policy $\pi_i$ and calculate $w_i$\\
		\textcolor{gray}{// Agent module}\\
		Collect trajectories $\tau$ by combining the target policy network and source policy networks using $w_i$ \Comment{see Eq.~\eqref{eq:transfer}}\\
		\For{each batch $m \in \tau$}{
		 Update $\theta, \psi$ with $\nabla_{\theta, \psi}L_{\textup{PPO}}(\theta, \psi, \phi)$\\
		Update each $\phi_i$ with $\nabla_{\phi}L_{\textup{MI}}(\phi)+\nabla_{\phi}L_{\textup{PPO}}(\theta, \psi, \phi)$\\
		%\STATE Update each $\omega_i$ with corresponding $\nabla_{\phi}L_{\textup{MI}}(\phi, \omega)$
		}
	       }
\end{algorithm}


%After interacting with the environment,  trajectories $\{\tau_1, \tau_2, \dots, \tau_n \}$ are sampled from each of $i$ source buffers and then mapped into the target environment, so that the \emph{correction loss} (Equation~\ref{eq:L_corr}) can be computed to update the encoders (Lines 9-10). The \emph{cycle-consistency loss} (Equation~\ref{eq:L_cyc}) is used to update state encoders and reverse state encoders (Line 11). 

%Overall, we provide the technical details of \alg, as well as how to combine \alg with DRL algorithms, which we then evaluate empirically in the following section.

\section{Experiments}\label{sec:Experiments}

In this section, we conduct extensive experiments to verify the effectiveness of our proposed algorithm compared with previous cross-domain transfer methods. Further, we design several ablation studies to analyze the contribution of each proposed module to the transfer performance. 
We also test the influence of different transfer manners in \alg on the final performance to validate the choice in this paper, which is detailed in the Appendix.
Results are averaged over 5 different random seeds, each run for 2 million timesteps of environment interaction. 
Please see the Appendix for the network structure and parameter settings used in this paper.
%Based on experimental results, we decided to go with the current approach used in CAT. Please see Appendix for the experimental results and analysis. 


\textbf{Environments}: We use a series of environments provided by \citet{DBLP:conf/iclr/WangLBF18}, which have a similar physical structure to a centipede. In these environments, the \textbf{Centipede} agent consists of repetitive torso bodies, each of which has two legs, and needs to learn to run in a particular direction. 
The agent is rewarded for running speed and whether it runs within a valid range and penalized for energy cost and resistance obstruction from the ground. 
To make the experiments more convincing, we also consider the \textbf{CrippleCentipede} agent (denoted as \textbf{CpCentipede}), which has its two back legs disabled. In addition, we consider the standard \textbf{Ant-v2} task from the MuJoCo suite. Figure~\ref{fig:Environments} shows an illustration of all the different scenarios mentioned above. 
All the source policies are obtained by learning from scratch using the standard DRL method PPO.
Please see the Appendix for a more detailed description.
%Therefore, we decided to use these environments as benchmarks to evaluate the transfer performance of the proposed algorithm. 

\textbf{Baselines}: Because the cross-domain transfer methods mentioned in Section~\ref{sec:intro} suffer from various limitations and cannot tackle our more difficult setting, we consider the following three baselines: 
\begin{itemize}
    \item Standard DRL method PPO \citep{DBLP:journals/corr/SchulmanWDRK17}, which learns from scratch in the target task;
    \item MIKT \citep{DBLP:conf/uai/WanG020}, which realizes cross-domain knowledge transfer with a single source policy;
    \item MIKT-MULTI, which is an extended version of MIKT to the setting of multiple source policies. MIKT-MULTI uses a fixed weighting factor to extract knowledge from each source policy, which can be seen as a version of \alg without the self-adaptation module and correction module.
\end{itemize}
%a) the standard DRL method PPO \citep{DBLP:journals/corr/SchulmanWDRK17}, which learns from scratch in the target task; b) MIKT \citep{DBLP:conf/uai/WanG020} which realizes cross-domain knowledge transfer with a single source policy; c) MIKT-MULTI which is an extended version of MIKT to the setting of multiple source policies. MIKT-MULTI uses a fixed weighting factor to extract knowledge from each source policy, which can be seen as a version of \alg without the self-adaptation module and correction module.



\subsection{Experimental Results}

\begin{figure}[pth]
    \centering
    \subfigure[CentipedeFour]{
    \label{CentipedeFour}
    \includegraphics[width=0.145\textwidth]{Environments/CentipedeFour.pdf}
    }
    \subfigure[CentipedeSix]{
    \label{CentipedeSix}
    \includegraphics[width=0.145\textwidth]{Environments/CentipedeSix.pdf}
    }
     \subfigure[CentipedeEight]{
    \label{CentipedeEight}
    \includegraphics[width=0.145\textwidth]{Environments/CentipedeEight.pdf}
    }
    \subfigure[CpCentipedeSix]{
    \label{CpCentipedeSix}
    \includegraphics[width=0.145\textwidth]{Environments/CpCentipedeSix.pdf}
    }
    \subfigure[CpCentipedeEight]{
    \label{CpCentipedeEight}
    \includegraphics[width=0.149\textwidth]{Environments/CpCentipedeEight.pdf}
    }
    \subfigure[Ant]{
    \label{Ant}
    \includegraphics[width=0.145\textwidth]{Environments/Ant.pdf}
    }
    \caption{Our continuous control tasks on MuJoCo: Centipede-\{4,6,8\}, CpCentipede-\{6,8\} and Ant-v2.}
\label{fig:Environments}
\end{figure}

\begin{figure*}[pth]
    \centering
    \subfigure[Target Env: CentipedeEight]{
    \label{fig:4,6-8}
    \includegraphics[width=0.32\textwidth]{Experiments/final/4,6-8.pdf}
    }\hspace{-2mm}
    \subfigure[Target Env: CpCentipedeEight]{
    \label{fig:4,6-cp8}
    \includegraphics[width=0.32\textwidth]{Experiments/final/4,6-cp8.pdf}
    }\hspace{-2mm}
     \subfigure[Target Env: CentipedeSix]{
    \label{fig:4,8-6}
    \includegraphics[width=0.32\textwidth]{Experiments/final/4,8-6.pdf}}
    \subfigure[Target Env: CentipedeSix]{
    \label{fig:4,ant-6}
    \includegraphics[width=0.32\textwidth]{Experiments/final/4,ant-6.pdf}
    }\hspace{-2mm}
    \subfigure[Target Env: CentipedeEight]{
    \label{fig:6,ant-8}
    \includegraphics[width=0.32\textwidth]{Experiments/final/6,ant-8.pdf}
    }\hspace{-2mm}
    \subfigure[Target Env: CentipedeEight]{
    \label{fig:4,cp6-8}
    \includegraphics[width=0.32\textwidth]{Experiments/final/4,cp6-8.pdf}
    }
    \caption{Performance of our proposed algorithm (CAT) and other methods (PPO, MIKT, and MIKT-MULTI) on different combinations of continuous control tasks. We plot the number of timesteps of environment interaction on the x-axis and the average episodic returns on the y-axis (the curves and shadow areas represent the mean and standard deviation, respectively). ``$x,y$-$z$'' denotes transfer from $x$ and $y$ to $z$, while ``$x$-$z$'' denotes transfer from $x$ to $z$. }\label{fig:Results}
\end{figure*}

In our experiments, we design six different combinations of environments to extensively validate the efficiency of our proposed method. For example, ``4,6-8'' in Figure~\ref{fig:Results}(a) denotes transfer from \textbf{CentipedeFour} and \textbf{CentipedeSix}
to \textbf{CentipedeEight}. All the source policies are trained from scratch on source tasks.
We plot the average episodic returns on the y-axis (mean and standard deviation). 
Since MIKT can only transfer from a single source policy, in each plot we only show the MIKT result for the source policy that achieves the better transfer performance. 
For example, only ``6-8'' is plotted in Figure~\ref{fig:Results}(a) since the source policy from \textbf{CentipedeSix} provides better transfer performance.

Figure~\ref{fig:Results} shows the performance of \alg and the other three baselines in different combinations of environments. We can see that %CAT outperforms all baselines and achieves the highest average rewards with the fastest speed. T
the performance of PPO learning from scratch is the worst because of the sample inefficiency and the lack of the help of expert knowledge. Although MIKT achieves better performance than PPO, it is worse than CAT in terms of learning speed and final performance. %under the setting of a single source policy, which outperforms PPO both in earlier and final returns, but still faces the problem of insufficient performance growth in the late training stage. 
MIKT-MULTI performs even worse than MIKT in most cases. This indicates that using fixed (or manually adjusted) transfer weights among multiple source policies %without considering which source policy  
limits access to more beneficial knowledge or even causes negative transfer, which is exactly what our proposed method aims to solve. This phenomenon further validates the importance of our self-adaptation module and correction module. Finally, we can see that our method (\alg) significantly outperforms all baselines and achieves the highest average rewards with the fastest speed. This is because \alg learns better-trained state correspondences by satisfying all of our proposed properties simultaneously, which lays the foundation for our transfer framework. In addition, it effectively leverages the evaluation performance of the different source policies in the target environment as their weights, so that it can infer when and which source policy is more beneficial to achieve adaptive knowledge transfer. %The experiment results show that \alg is an effective method to solve the problem of cross-domain transfer with multiple source policies.

\textbf{Other domains and more than two source tasks}: In addition to \textbf{Centipede-x}, there are other series of environments, such as \textbf{InvPendulum-x}, \textbf{Reacher-x}, and \textbf{Snake-x}, where $x$ represents the number of joints. In each series of environments, the robots share an inherent structure that could be exploited for transfer learning. 
Among these environments, the \textbf{Centipede} robots have the most complex physical structures, and CAT can solve these tasks well.
Therefore, we have sufficient reasons to believe that CAT can achieve significant performance in other domains.

To verify this, we choose \textbf{Snake-x} as the additional environment, in which the agent has a structure similar to a snake. The goal of the agent is to move as fast as possible, but the average rewards eventually converge to around 400. We consider 400 as the solved score for SnakeSix and take the average number of timesteps required for convergence (M means one million steps) as the evaluation criterion.
We supplement the experiments with three source policies to verify that \alg can scale to more source tasks.
Besides, we also add one source policy \textbf{Ant} with a completely different physical structure to verify the ability of \alg to avoid negative transfer.

Table~\ref{tab:Supplementary experiments} shows the performance of \alg and learning from scratch in our additional experiments, where ``3,4,5-6'' represents \textbf{SnakeThree}, \textbf{SnakeFour}, and \textbf{SnakeFive}
transfer to \textbf{SnakeSix}. 
As the result shows, \alg can extract knowledge from more source policies. From experience, robots with more similar morphology can provide more knowledge, corresponding to robots with a similar number of joints in our experimental setting.
\alg can provide accurate weights of each source policy based on the self-adaptation module when using more than two source tasks. Therefore, we only use two source policies for simplicity in our main experiments.
Besides, \alg can also significantly improve learning efficiency even if there is one source policy (\textbf{Ant}) that is expected to cause negative transfer.
This is because \alg has two mechanisms to avoid negative transfer.
First, if a source policy that may cause negative transfer is added, it will get a particularly small weight, which will not affect the transfer performance.
Second, CAT uses two weighted linear combinations. At the beginning of the training, the agent selects an action relying more on source policies to gain assistance but focuses more on the target task as the training continues.
In this way, CAT can effectively avoid negative transfer.

\begin{figure*}[ht]
    \centering
    \subfigure[Target Env: CentipedeEight]{
    \label{fig:ablation:4,6-8}
    \includegraphics[width=0.32\textwidth]{Experiments/Ablation/4,6-8.pdf}
    }
    \hspace{-2mm}
    \subfigure[Target Env: CpCentipedeEight]{
    \label{fig:ablation:4,6-cp8}
    \includegraphics[width=0.32\textwidth]{Experiments/Ablation/4,6-cp8.pdf}
    }\hspace{-2mm}
     \subfigure[Target Env: CentipedeSix]{
    \label{fig:ablation:4,8-6}
    \includegraphics[width=0.32\textwidth]{Experiments/Ablation/4,8-6.pdf}
    }
    \subfigure[Target Env: CentipedeSix]{
    \label{fig:ablation:4,ant-6}
    \includegraphics[width=0.32\textwidth]{Experiments/Ablation/4,ant-6.pdf}
    }\hspace{-2mm}
    \subfigure[Target Env: CentipedeEight]{
    \label{fig:ablation:6,ant-8}
    \includegraphics[width=0.32\textwidth]{Experiments/Ablation/6,ant-8.pdf}
    }\hspace{-2mm}
    \subfigure[Target Env: CentipedeEight]{
    \label{fig:ablation:4,cp6-8}
    \includegraphics[width=0.32\textwidth]{Experiments/Ablation/4,cp6-8.pdf}
    }
    \caption{Ablation studies on the contribution of the mutual information loss and the correction module: \emph{CAT w/o MI} and \emph{CAT w/o corr}.}
    \label{fig:Ablation Results}
\end{figure*}


\begin{table}[ht]
    \centering
    \caption{Performance of CAT and PPO on Snake-6, where `M' denotes million training steps.}\label{tab:Supplementary experiments}
    \begin{tabular}{c|c|c}
      \toprule % from booktabs package
      \bfseries Method &  \bfseries Time to Threshold  &  \bfseries  Rewards  \\
      \midrule % from booktabs package
      PPO  & 1.34M ($\pm$0.10) & 449.44 ($\pm$15.01) \\
      CAT(3,4,5-6)  & 0.70M ($\pm$0.09) & 452.10 ($\pm$17.12) \\
      CAT(3,4,Ant-6)  & 0.80M ($\pm$0.07) & 458.70 ($\pm$17.55) \\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\subsection{Ablation Studies}

\label{sec:4.2}

%Instead of using $\{L_{\textup{PPO}}, L_{\textup{MI}}\}$ in MIKT to update the encoder, we additionally propose the correction module with $\{L_{\textup{cyc}}, L_{\textup{corr}}\}$ to learn state embeddings satisfying our proposed properties. 
To better illustrate the effectiveness of our proposed method, we analyze the contribution of the mutual information loss to verify the necessity of the correction module for the state embedding learning.
Besides, we remove the correction module to see whether the \alg agent can achieve good performance only by relying on the self-adaptation module and state embeddings that cannot satisfy properties (3) and (4). The ablation studies are designed as follows:
\begin{itemize}
    \item \emph{\alg w/o MI}: Update state encoders without $ L_{\textup{MI}}$.
    \item \emph{\alg w/o corr}: Update state encoders without $\quad\{L_{\textup{cyc}}, L_{\textup{corr}}\}$.
    \item \emph{\alg w/o corr and adapt}: Note that MIKT-MULTI can be seen as a version of CAT without the self-adaptation module and correction module, which we have shown in Figure~\ref{fig:Results}.
    %but still update action encoders with $ L_{\textup{corr}}$.
\end{itemize}

Figure~\ref{fig:Ablation Results} shows
the influence of these different parts on the
performance of \alg-PPO. Before analyzing the experimental results, we note that it is usually harder to learn in the early stages of training for \emph{MIKT w/o MI}, which removes the mutual information loss, as discussed earlier in \citet{DBLP:conf/uai/WanG020}. 
But we can see that \emph{\alg w/o MI} still has very impressive performance compared to \emph{\alg w/o corr} in most cases. 
This indicates it still achieves good transfer performance even without the mutual information loss, which confirms the effectiveness of our proposed properties and correction module. 
Besides, \emph{\alg w/o corr} has a significant improvement compared to MIKT-MULTI, which confirms the effectiveness of the self-adaptation module.
%The performance of \emph{\alg w/o corr} has not increased significantly compared to MIKT. 
Overall, \alg performs best among all methods.
This supports our view that the sufficiently trained state embeddings can indeed improve the transfer performance by satisfying the four properties at the same time.
%MIKT-MULTI can be seen as CAT without the self-adaptation module and correction module, the poor results in  Figure~\ref{fig:Results} shows the importance of the two modules. 
 %It is obvious that \alg is the most performant in all methods.

Table~\ref{tab:ablation} shows the average episodic returns on Centipede 4,6-8 when different modules, including the self-adaptation module, are removed.
Note that for simplicity we do not analyze the contribution of the self-adaptation module separately in Figure~\ref{fig:Ablation Results}; it can be verified by comparing the performance of \emph{\alg w/o corr} and MIKT-MULTI.
As the results show, \alg performs best among all methods.
All the above results confirm that each component in \alg is necessary and important for effective and efficient transfer in DRL.

\begin{table}[ht]
    \centering
    \caption{Contributions of different modules of CAT in Centipede4,6-8.}\label{tab:ablation}
    \begin{tabular}{p{5cm}|p{2.3cm}}
      \toprule % from booktabs package
      \bfseries Method &  \bfseries Average Return \\
      \midrule % from booktabs package
      PPO  & 1660.7 ($\pm$284.5)\\
      MIKT (6-8)  & 2940.0 ($\pm$357.0)\\
      CAT  & \textbf{4684.4 ($\pm$452.1)}\\
      CAT w/o MI  & 3972.1 ($\pm$312.0)\\
      CAT w/o corr  & 3097.8 ($\pm$289.8)\\
      CAT w/o self-adapt  & 3381.4 ($\pm$286.7)\\
      MIKT-MULTI (w/o corr and self-adapt)  & 2441.5 ($\pm$413.5)\\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\section{Related Work}
\textbf{Same-domain transfer and cross-domain transfer} \\
In same-domain transfer, one mainstream method to accelerate DRL is policy distillation, which is extended by \citet{DBLP:journals/corr/RusuCGDKPMKH15}.  \citet{DBLP:journals/corr/ParisottoBS15} mimics the behavior of source policies during the target policy learning process. However, this method highly relies on the task similarity, which restricts its generality. %it requires source policies from tasks with high similarity, which enables a positive transfer by imitating actions on source tasks. 
\citet{DBLP:journals/corr/abs-1803-03835} presents an auxiliary objective which distills knowledge from source policies by minimizing the cross-entropy loss between the source and target policy distributions over actions. However, this method uses an evolution strategy to adjust the hyperparameters which increases the computational complexity.  %\citet{DBLP:conf/nips/TehBCQKHHP17} learns a centroid policy $\pi_{\theta}$ based on distilling knowledge from multiple source policies. 
%\citet{DBLP:journals/corr/ParisottoBS15} and \citet{DBLP:journals/corr/abs-1803-03835} minimize the cross-entropy loss between the source and target policy distributions over actions under the conditions of the same state-action space, which is extended by \citet{DBLP:journals/corr/RusuCGDKPMKH15}. 
\citet{DBLP:conf/icml/TaoGCSM21} propose to combine multiple transfer manners, like policy distillation and value function reuse to facilitate more efficient DRL. %off-policy instance transfer used to collect samples that have high advantages and reduce the training time in case of low task similarity.
However, they assume that the reward function of the target task is known, which is difficult to achieve in our problem setting. Successor features and generalized policy improvement also reuse source policies (value functions) directly in the target task \citep{DBLP:conf/nips/BarretoDMHSSH17,DBLP:journals/corr/abs-1901-10964}. However, all these methods share the same limitation: they cannot be applied to tasks with different state-action spaces, a setting that is more practical in real-world scenarios. Recently, \citet{DBLP:conf/iclr/0004DLAL17} learn invariant state feature spaces and match the distributions of optimal trajectories in the source task to transfer skills between different agents. However, they need paired data to train embedding functions, which is very expensive in real-world problems. \citet{DBLP:conf/iclr/ZhangXEPW21} learn the mapping between the state-action spaces to reuse the source policy directly, which may not achieve optimal performance on the target task. %\citet{DBLP:conf/nips/XuWC0Y20} trains a single DRL agent to achieve expert-level performance in multiple different tasks by learning from task-specific teachers. However, it pads the state and action with appended zeros to obtain fixed lengths, which can not capture the semantics of the state and action. 
Our work is most relevant to Mutual Information Based Knowledge Transfer (MIKT) \citep{DBLP:conf/uai/WanG020}. Although MIKT is a successful approach for cross-domain transfer, it still faces the problem of insufficiently trained state embeddings and the limitation of being able to transfer only a single source policy.
\alg first proposes the properties that state embeddings should satisfy at the same time and then achieves adaptive knowledge transfer from multiple source policies with different state-action spaces. 

\textbf{Domain Randomization and Domain Adaptation in RL} \\
%Previous work in the field of sim-to-real is mainly divided into two categories, namely domain randomization and domain adaptation, where the former is randomization of features, i.e., transfer across observations, while the latter is to transfer the model learned from the simulator to real-world scenarios, i.e. transfer across dynamics. 
Domain randomization aims to learn a policy with generalization capability which is trained on multiple source domains, hoping to perform well in the target domain \citep{DBLP:conf/iros/TobinFRSZA17,DBLP:journals/corr/abs-1910-10537}. It focuses more on common features between domains by training on multiple source domains. However, this kind of method requires multiple source domains to be available for training, which is a strong assumption compared to the requirement that only pre-trained source policies are needed. Besides, domain randomization is very sensitive to changes in the number of domains, which greatly affects the complexity of training. Some domain adaptation works in RL use image-to-image translation to pair the pixel-based states in the source and target domain, but it has additional computational cost overhead for the image translation \citep{DBLP:conf/bmvc/PanYWL17,DBLP:conf/icml/GamrianG19}. Other works focus on learning a common state representation to solve the problems mentioned above \citep{DBLP:conf/aaai/XingNCZNK21,DBLP:conf/aaai/RoyK21}. However, works in this field do not have a clear benchmark for the difference between the two domains and most of them focus on the problem of observation adaptation. While the source domain $\mathcal{D}_{\textup{source}}$ and target domain $\mathcal{D}_{\textup{target}}$ have different state spaces $\mathcal{S}$ (visual observations), the action space $\mathcal{A}$ and other properties should remain the same or have some similarity. The main difficulty in our work is how to achieve knowledge transfer among totally different MDPs, which is a more difficult task to which these methods cannot be applied. 

\section{Conclusion and Future Work}
In this work, we propose a novel framework called Cross-domain Adaptive Transfer (\alg) which adaptively transfers knowledge from multiple cross-domain policies. \alg is composed of three main components: the agent module, the self-adaptation module, and the correction module. Using the agent module and correction module, we first propose four properties that the learned state-action correspondence should satisfy. Then we design the corresponding optimization objectives to learn state and action embeddings to deal with the mismatch in the state-action space of source and target tasks. The self-adaptation module learns to decide when and which source policy is better to transfer by evaluating them on the target environment. The average performance is used to derive the weighting factors so that we can combine these different source policies. The agent module allows our agent to distill knowledge from source policies, select actions to execute in the target environment and learn a high-performing policy. %In this way, \alg significantly accelerates the learning process of the target policy. 
Experimental results show that \alg significantly accelerates RL and outperforms other cross-domain transfer methods. In this paper, we use the average performance over a fixed number of episodes as the weight of each source policy in the next entire iteration. However, each source policy may only be helpful in a part of the state space. It is worthwhile to investigate which source policy performs better in which region to facilitate fine-grained transfer. Another direction is to learn a unified embedding space for all source domains and the target domain to improve the generalizability of the method. 
Besides, leveraging prior human knowledge~\citep{DBLP:conf/ijcai/ZhangHWTMDZ20} or synthesizing white-box knowledge~\citep{DBLP:journals/corr/abs-2205-13728} for better transfer learning is worth further study. 


\begin{acknowledgements}The work is supported by the National Natural Science Foundation of China (Grant Nos.: U1836214, 62106172), the new Generation of  Artificial Intelligence Science and Technology Major Project of Tianjin under grant: 19ZXZNGX00010 and the Science and Technology on Information Systems Engineering Laboratory (Grant No. WDZC20205250407). Part of this work has taken place in the Intelligent Robot Learning (IRL) Lab at the University of Alberta, which is supported in part by research grants from Alberta Innovates; the Alberta Machine Intelligence Institute (Amii); a Canada CIFAR AI Chair, Amii; Compute Canada; Mitacs; and NSERC.
    
\end{acknowledgements}


\bibliography{You_545}

% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}
\end{document}
